diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h index dc7e484a40a45..19fa7b8ae9d6f 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -287,6 +287,8 @@ class LoopVectorizationLegality { /// we can use in-order reductions. bool canVectorizeFPMath(bool EnableStrictReductions); + bool canVectorizeMultiCond() const; + /// Return true if we can vectorize this loop while folding its tail by /// masking. bool canFoldTailByMasking() const; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 43be72f0f34d4..4e76e11d80a72 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -43,6 +43,9 @@ AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false), cl::Hidden, cl::desc("Enable recognition of non-constant strided " "pointer induction variables.")); +static cl::opt<bool> EnableMultiCond("enable-multi-cond-vectorization", + cl::init(false), cl::Hidden, cl::desc("Enable vectorization of loops with multiple exit conditions (early exits).")); + namespace llvm { cl::opt<bool> HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden, @@ -1378,6 +1381,8 @@ bool LoopVectorizationLegality::isFixedOrderRecurrence( } bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const { + if (canVectorizeMultiCond() && BB != TheLoop->getHeader()) + return true; return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); } @@ -1514,6 +1519,35 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { return true; } +bool LoopVectorizationLegality::canVectorizeMultiCond() const { + if (!EnableMultiCond) + return false; + SmallVector<BasicBlock *, 4> Exiting; + TheLoop->getExitingBlocks(Exiting); + if (Exiting.size() != 2 || Exiting[0] != TheLoop->getHeader() || + Exiting[1] !=
TheLoop->getLoopLatch() || + any_of(*TheLoop->getHeader(), [](Instruction &I) { + return I.mayReadFromMemory() || I.mayHaveSideEffects(); + })) + return false; + CmpInst::Predicate Pred; + Value *A, *B; + if (!match( + TheLoop->getHeader()->getTerminator(), + m_Br(m_ICmp(Pred, m_Value(A), m_Value(B)), m_Value(), m_Value())) || + Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) + return false; + if (any_of(TheLoop->getBlocks(), [this](BasicBlock *BB) { + return any_of(*BB, [this](Instruction &I) { + return any_of(I.users(), [this](User *U) { + return !TheLoop->contains(cast(U)->getParent()); + }); + }); + })) + return false; + return true; +} + // Helper function to canVectorizeLoopNestCFG. bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, bool UseVPlanNativePath) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 027ee21527d22..7aff74194b703 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1362,9 +1362,11 @@ class LoopVectorizationCostModel { // If we might exit from anywhere but the latch, must run the exiting // iteration in scalar form. 
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { - LLVM_DEBUG( - dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n"); - return true; + if (!Legal->canVectorizeMultiCond()) { + LLVM_DEBUG( + dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n"); + return true; + } } if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) { LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: " @@ -2535,8 +2537,17 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LoopVectorPreHeader = OrigLoop->getLoopPreheader(); assert(LoopVectorPreHeader && "Invalid loop structure"); LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr - assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) && - "multiple exit loop without required epilogue?"); + if (Legal->canVectorizeMultiCond()) { + BasicBlock *Latch = OrigLoop->getLoopLatch(); + BasicBlock *TrueSucc = + cast(Latch->getTerminator())->getSuccessor(0); + BasicBlock *FalseSucc = + cast(Latch->getTerminator())->getSuccessor(1); + LoopExitBlock = OrigLoop->contains(TrueSucc) ? FalseSucc : TrueSucc; + } else { + assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) && + "multiple exit loop without required epilogue?"); + } LoopMiddleBlock = SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, @@ -2910,7 +2921,8 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, for (PHINode &PN : Exit->phis()) PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN); - if (Cost->requiresScalarEpilogue(VF.isVector())) { + if (Legal->canVectorizeMultiCond() || + Cost->requiresScalarEpilogue(VF.isVector())) { // No edge from the middle block to the unique exit block has been inserted // and there is nothing to fix from vector loop; phis should have incoming // from scalar loop only. 
@@ -3554,7 +3566,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { TheLoop->getExitingBlocks(Exiting); for (BasicBlock *E : Exiting) { auto *Cmp = dyn_cast(E->getTerminator()->getOperand(0)); - if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) + if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse() && + (TheLoop->getLoopLatch() == E || !Legal->canVectorizeMultiCond())) AddToWorklistIfAllowed(Cmp); } @@ -7643,12 +7656,15 @@ DenseMap LoopVectorizationPlanner::executePlan( BestVPlan.execute(&State); // 2.5 Collect reduction resume values. - auto *ExitVPBB = - cast(BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); - for (VPRecipeBase &R : *ExitVPBB) { - createAndCollectMergePhiForReduction( - dyn_cast(&R), State, OrigLoop, - State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs); + VPBasicBlock *ExitVPBB = nullptr; + if (BestVPlan.getVectorLoopRegion()->getSingleSuccessor()) { + ExitVPBB = cast( + BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); + for (VPRecipeBase &R : *ExitVPBB) { + createAndCollectMergePhiForReduction( + dyn_cast(&R), State, OrigLoop, + State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs); + } } // 2.6. Maintain Loop Hints @@ -7674,6 +7690,7 @@ DenseMap LoopVectorizationPlanner::executePlan( LoopVectorizeHints Hints(L, true, *ORE); Hints.setAlreadyVectorized(); } + TargetTransformInfo::UnrollingPreferences UP; TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue) @@ -7686,15 +7703,17 @@ DenseMap LoopVectorizationPlanner::executePlan( ILV.printDebugTracesAtEnd(); // 4. Adjust branch weight of the branch in the middle block. - auto *MiddleTerm = - cast(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator()); - if (MiddleTerm->isConditional() && - hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { - // Assume that `Count % VectorTripCount` is equally distributed. 
- unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue(); - assert(TripCount > 0 && "trip count should not be zero"); - const uint32_t Weights[] = {1, TripCount - 1}; - setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); + if (ExitVPBB) { + auto *MiddleTerm = + cast(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator()); + if (MiddleTerm->isConditional() && + hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { + // Assume that `Count % VectorTripCount` is equally distributed. + unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue(); + assert(TripCount > 0 && "trip count should not be zero"); + const uint32_t Weights[] = {1, TripCount - 1}; + setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); + } } return State.ExpandedSCEVs; @@ -8079,7 +8098,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { // If source is an exiting block, we know the exit edge is dynamically dead // in the vector loop, and thus we don't need to restrict the mask. Avoid // adding uses of an otherwise potentially dead instruction. - if (OrigLoop->isLoopExiting(Src)) + if (!Legal->canVectorizeMultiCond() && OrigLoop->isLoopExiting(Src)) return EdgeMaskCache[Edge] = SrcMask; VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition()); @@ -8729,6 +8748,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, static SetVector collectUsersInExitBlock( Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan, const MapVector &Inductions) { + if (!Plan.getVectorLoopRegion()->getSingleSuccessor()) + return {}; auto *MiddleVPBB = cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); // No edge from the middle block to the unique exit block has been inserted @@ -8814,6 +8835,8 @@ static void addLiveOutsForFirstOrderRecurrences( // TODO: Should be replaced by // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the // scalar region is modeled as well. 
+ if (!VectorRegion->getSingleSuccessor()) + return; auto *MiddleVPBB = cast(VectorRegion->getSingleSuccessor()); VPBasicBlock *ScalarPHVPBB = nullptr; if (MiddleVPBB->getNumSuccessors() == 2) { @@ -9100,6 +9123,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { "VPBasicBlock"); RecipeBuilder.fixHeaderPhis(); + if (Legal->canVectorizeMultiCond()) { + VPlanTransforms::convertToMultiCond(*Plan, *PSE.getSE(), OrigLoop, + RecipeBuilder); + } + SetVector ExitUsersToFix = collectUsersInExitBlock( OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars()); addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix); @@ -9231,8 +9259,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( using namespace VPlanPatternMatch; VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock(); - VPBasicBlock *MiddleVPBB = - cast(VectorLoopRegion->getSingleSuccessor()); for (VPRecipeBase &R : Header->phis()) { auto *PhiR = dyn_cast(&R); if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) @@ -9251,8 +9277,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( for (VPUser *U : Cur->users()) { auto *UserRecipe = cast(U); if (!UserRecipe->getParent()->getEnclosingLoopRegion()) { - assert(UserRecipe->getParent() == MiddleVPBB && - "U must be either in the loop region or the middle block."); continue; } Worklist.insert(UserRecipe); @@ -9357,6 +9381,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( } VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock(); Builder.setInsertPoint(&*LatchVPBB->begin()); + if (!VectorLoopRegion->getSingleSuccessor()) + return; + VPBasicBlock *MiddleVPBB = + cast(VectorLoopRegion->getSingleSuccessor()); VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi(); for (VPRecipeBase &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp 
b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 5e3a638809494..cd6a326ed70c7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -474,6 +474,14 @@ void VPIRBasicBlock::execute(VPTransformState *State) { // backedges. A backward successor is set when the branch is created. const auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors(); unsigned idx = PredVPSuccessors.front() == this ? 0 : 1; + if (TermBr->getSuccessor(idx) && + PredVPBlock == getPlan()->getVectorLoopRegion() && + PredVPBlock->getNumSuccessors()) { + // Update PRedBB and TermBr for BranchOnMultiCond in predecessor. + PredBB = TermBr->getSuccessor(1); + TermBr = cast(PredBB->getTerminator()); + idx = 0; + } assert(!TermBr->getSuccessor(idx) && "Trying to reset an existing successor block."); TermBr->setSuccessor(idx, IRBB); @@ -601,9 +609,11 @@ static bool hasConditionalTerminator(const VPBasicBlock *VPBB) { } const VPRecipeBase *R = &VPBB->back(); - bool IsCondBranch = isa(R) || - match(R, m_BranchOnCond(m_VPValue())) || - match(R, m_BranchOnCount(m_VPValue(), m_VPValue())); + bool IsCondBranch = + isa(R) || match(R, m_BranchOnCond(m_VPValue())) || + match(R, m_BranchOnCount(m_VPValue(), m_VPValue())) || + (isa(R) && cast(R)->getOpcode() == + VPInstruction::BranchMultipleConds); (void)IsCondBranch; if (VPBB->getNumSuccessors() >= 2 || @@ -908,8 +918,8 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block"); VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion); - VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph"); if (!RequiresScalarEpilogueCheck) { + VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph"); VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); return Plan; } @@ -923,10 +933,14 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy, // we unconditionally branch to the scalar preheader. Do nothing. // 3) Otherwise, construct a runtime check. 
BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock(); - auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock); - // The connection order corresponds to the operands of the conditional branch. - VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB); - VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); + if (IRExitBlock) { + auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock); + // The connection order corresponds to the operands of the conditional + // branch. + VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB); + VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph"); + VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); + } auto *ScalarLatchTerm = TheLoop->getLoopLatch()->getTerminator(); // Here we use the same DebugLoc as the scalar loop latch terminator instead @@ -1035,7 +1049,9 @@ void VPlan::execute(VPTransformState *State) { // VPlan execution rather than earlier during VPlan construction. BasicBlock *MiddleBB = State->CFG.ExitBB; VPBasicBlock *MiddleVPBB = - cast(getVectorLoopRegion()->getSingleSuccessor()); + getVectorLoopRegion()->getNumSuccessors() == 1 + ? cast(getVectorLoopRegion()->getSuccessors()[0]) + : cast(getVectorLoopRegion()->getSuccessors()[1]); // Find the VPBB for the scalar preheader, relying on the current structure // when creating the middle block and its successrs: if there's a single // predecessor, it must be the scalar preheader. Otherwise, the second @@ -1060,12 +1076,19 @@ void VPlan::execute(VPTransformState *State) { State->CFG.DTU.applyUpdates({{DominatorTree::Delete, MiddleBB, ScalarPh}}); // Generate code in the loop pre-header and body. 
- for (VPBlockBase *Block : vp_depth_first_shallow(Entry)) + ReversePostOrderTraversal> RPOT( + Entry); + + for (VPBlockBase *Block : RPOT) Block->execute(State); VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock(); BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB]; + if (!getVectorLoopRegion()->getSingleSuccessor()) + VectorLatchBB = + cast(VectorLatchBB->getTerminator())->getSuccessor(1); + // Fix the latch value of canonical, reduction and first-order recurrences // phis in the vector loop. VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock(); @@ -1092,7 +1115,10 @@ void VPlan::execute(VPTransformState *State) { // Move the last step to the end of the latch block. This ensures // consistent placement of all induction updates. Instruction *Inc = cast(Phi->getIncomingValue(1)); - Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode()); + if (VectorLatchBB->getTerminator() == &*VectorLatchBB->getFirstNonPHI()) + Inc->moveBefore(VectorLatchBB->getTerminator()); + else + Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode()); // Use the steps for the last part as backedge value for the induction. if (auto *IV = dyn_cast(&R)) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 68a62638b9d58..08c6a868e9bfc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1239,6 +1239,7 @@ class VPInstruction : public VPRecipeWithIRFlags, CanonicalIVIncrementForPart, BranchOnCount, BranchOnCond, + BranchMultipleConds, ComputeReductionResult, // Takes the VPValue to extract from as first operand and the lane or part // to extract as second operand, counting from the end starting with 1 for @@ -1249,6 +1250,7 @@ class VPInstruction : public VPRecipeWithIRFlags, // operand). Only generates scalar values (either for the first lane only or // for all lanes, depending on its uses). 
PtrAdd, + AnyOf, }; private: @@ -1370,6 +1372,7 @@ class VPInstruction : public VPRecipeWithIRFlags, case Instruction::AtomicRMW: case VPInstruction::BranchOnCond: case VPInstruction::BranchOnCount: + case VPInstruction::BranchMultipleConds: return false; default: return true; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 2948ecc580edc..193d735b676b4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -67,6 +67,8 @@ bool VPRecipeBase::mayWriteToMemory() const { default: return true; } + case VPExpandSCEVSC: + return getParent()->getPlan()->getTripCount() == getVPSingleValue(); case VPInterleaveSC: return cast(this)->getNumStoreOperands() > 0; case VPWidenStoreEVLSC: @@ -159,6 +161,8 @@ bool VPRecipeBase::mayHaveSideEffects() const { case VPPredInstPHISC: case VPScalarCastSC: return false; + case VPExpandSCEVSC: + return getParent()->getPlan()->getTripCount() == getVPSingleValue(); case VPInstructionSC: return mayWriteToMemory(); case VPWidenCallSC: { @@ -386,12 +390,14 @@ bool VPInstruction::canGenerateScalarForFirstLane() const { return true; switch (Opcode) { case Instruction::ICmp: + case Instruction::Select: case VPInstruction::BranchOnCond: case VPInstruction::BranchOnCount: case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::PtrAdd: case VPInstruction::ExplicitVectorLength: + case VPInstruction::AnyOf: return true; default: return false; @@ -434,9 +440,10 @@ Value *VPInstruction::generate(VPTransformState &State) { return Builder.CreateCmp(getPredicate(), A, B, Name); } case Instruction::Select: { - Value *Cond = State.get(getOperand(0)); - Value *Op1 = State.get(getOperand(1)); - Value *Op2 = State.get(getOperand(2)); + bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); + Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed); + Value *Op1 = 
State.get(getOperand(1), OnlyFirstLaneUsed); + Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed); return Builder.CreateSelect(Cond, Op1, Op2, Name); } case VPInstruction::ActiveLaneMask: { @@ -529,6 +536,34 @@ Value *VPInstruction::generate(VPTransformState &State) { CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]); return CondBr; } + case VPInstruction::BranchMultipleConds: { + assert(getNumOperands() == 2 && "Must have exactly 2 conditions"); + assert(getParent()->isExiting() && "Must be placed in exiting block"); + assert(getParent()->getParent()->getNumSuccessors() == 2 && + "Must have exactly 2 successors"); + + VPRegionBlock *ParentRegion = getParent()->getParent(); + VPBasicBlock *Header = ParentRegion->getEntryBasicBlock(); + Value *Cond1 = State.get(getOperand(0), /*IsScalar*/ true); + Value *Cond2 = State.get(getOperand(1), /*IsScalar*/ true); + BasicBlock *BB = Builder.GetInsertBlock(); + BasicBlock *BB2 = + BB->splitBasicBlock(BB->getTerminator(), BB->getName() + ".split"); + + Builder.SetInsertPoint(BB->getTerminator()); + BranchInst *CondBr1 = Builder.CreateCondBr(Cond1, BB, BB2); + + Builder.SetInsertPoint(BB2->getTerminator()); + BranchInst *CondBr2 = Builder.CreateCondBr(Cond2, BB2, nullptr); + CondBr2->setSuccessor(1, State.CFG.VPBB2IRBB[Header]); + CondBr1->setSuccessor(0, nullptr); + CondBr2->setSuccessor(0, nullptr); + BB->getTerminator()->eraseFromParent(); + BB2->getTerminator()->eraseFromParent(); + State.CFG.PrevBB = BB2; + return CondBr2; + } + case VPInstruction::BranchOnCount: { // First create the compare. 
Value *IV = State.get(getOperand(0), /*IsScalar*/ true); @@ -666,6 +701,10 @@ Value *VPInstruction::generate(VPTransformState &State) { } return NewPhi; } + case VPInstruction::AnyOf: { + Value *A = State.get(getOperand(0)); + return Builder.CreateOrReduce(A); + } default: llvm_unreachable("Unsupported opcode for instruction"); @@ -674,7 +713,8 @@ Value *VPInstruction::generate(VPTransformState &State) { bool VPInstruction::isVectorToScalar() const { return getOpcode() == VPInstruction::ExtractFromEnd || - getOpcode() == VPInstruction::ComputeReductionResult; + getOpcode() == VPInstruction::ComputeReductionResult || + getOpcode() == VPInstruction::AnyOf; } bool VPInstruction::isSingleScalar() const { @@ -736,6 +776,8 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { default: return false; case Instruction::ICmp: + case Instruction::Select: + case Instruction::Or: case VPInstruction::PtrAdd: // TODO: Cover additional opcodes. return vputils::onlyFirstLaneUsed(this); @@ -746,6 +788,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { case VPInstruction::BranchOnCount: case VPInstruction::BranchOnCond: case VPInstruction::ResumePhi: + case VPInstruction::BranchMultipleConds: return true; }; llvm_unreachable("switch should return"); @@ -764,6 +807,7 @@ bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const { return vputils::onlyFirstPartUsed(this); case VPInstruction::BranchOnCount: case VPInstruction::BranchOnCond: + case VPInstruction::BranchMultipleConds: case VPInstruction::CanonicalIVIncrementForPart: return true; }; @@ -810,6 +854,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::BranchOnCond: O << "branch-on-cond"; break; + case VPInstruction::BranchMultipleConds: + O << "branch-on-multi-cond"; + break; case VPInstruction::CalculateTripCountMinusVF: O << "TC > VF ? 
TC - VF : 0"; break; @@ -831,6 +878,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::PtrAdd: O << "ptradd"; break; + case VPInstruction::AnyOf: + O << "any-of"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } @@ -848,12 +898,13 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, void VPIRInstruction::execute(VPTransformState &State) { assert((isa(&I) || getNumOperands() == 0) && "Only PHINodes can have extra operands"); - if (getNumOperands() == 1) { - VPValue *ExitValue = getOperand(0); + for (const auto &[Idx, Op] : enumerate(operands())) { + VPValue *ExitValue = Op; auto Lane = vputils::isUniformAfterVectorization(ExitValue) ? VPLane::getFirstLane() : VPLane::getLastLaneForVF(State.VF); - auto *PredVPBB = cast(getParent()->getSinglePredecessor()); + VPBlockBase *Pred = getParent()->getPredecessors()[Idx]; + auto *PredVPBB = Pred->getExitingBasicBlock(); BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB]; // Set insertion point in PredBB in case an extract needs to be generated. // TODO: Model extracts explicitly. 
@@ -881,7 +932,7 @@ void VPIRInstruction::print(raw_ostream &O, const Twine &Indent, O << Indent << "IR " << I; if (getNumOperands() != 0) { - assert(getNumOperands() == 1 && "can have at most 1 operand"); + // Exit phis may now carry one incoming operand per predecessor, so any operand count is valid here. O << " (extra operand: "; printOperands(O, SlotTracker); O << ")"; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 379bfc0a4394b..145b8c418a37b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -515,6 +515,12 @@ void VPlanTransforms::removeDeadRecipes(VPlan &Plan) { ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT( Plan.getEntry()); + for (VPRecipeBase &R : make_early_inc_range( + reverse(*cast<VPBasicBlock>(Plan.getPreheader())))) { + if (isDeadRecipe(R)) + R.eraseFromParent(); + } + for (VPBasicBlock *VPBB : reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) { // The recipes in the block are processed in reverse order, to catch chains // of dead recipes.
@@ -1664,3 +1670,71 @@ void VPlanTransforms::createInterleaveGroups( } } } + +void VPlanTransforms::convertToMultiCond(VPlan &Plan, ScalarEvolution &SE, + Loop *OrigLoop, + VPRecipeBuilder &RecipeBuilder) { + auto *LatchVPBB = + cast(Plan.getVectorLoopRegion()->getExiting()); + VPBuilder Builder(LatchVPBB->getTerminator()); + auto *MiddleVPBB = + cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); + + VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); + VPBlockUtils::disconnectBlocks(LoopRegion, MiddleVPBB); + + const SCEV *BackedgeTakenCount = + SE.getExitCount(OrigLoop, OrigLoop->getLoopLatch()); + const SCEV *TripCount = SE.getTripCountFromExitCount( + BackedgeTakenCount, Plan.getCanonicalIV()->getScalarType(), OrigLoop); + VPValue *NewTC = vputils::getOrCreateVPValueForSCEVExpr(Plan, TripCount, SE); + Plan.getTripCount()->replaceAllUsesWith(NewTC); + Plan.resetTripCount(NewTC); + + VPValue *EarlyExitTaken = nullptr; + SmallVector ExitingBBs; + OrigLoop->getExitingBlocks(ExitingBBs); + for (BasicBlock *Exiting : ExitingBBs) { + auto *ExitingTerm = cast(Exiting->getTerminator()); + BasicBlock *TrueSucc = ExitingTerm->getSuccessor(0); + BasicBlock *FalseSucc = ExitingTerm->getSuccessor(1); + VPIRBasicBlock *VPExitBlock; + if (OrigLoop->getUniqueExitBlock()) + VPExitBlock = cast(MiddleVPBB->getSuccessors()[0]); + else + VPExitBlock = VPIRBasicBlock::fromBasicBlock( + !OrigLoop->contains(TrueSucc) ? 
TrueSucc : FalseSucc); + + for (VPRecipeBase &R : *VPExitBlock) { + auto *ExitIRI = cast(&R); + auto *ExitPhi = dyn_cast(&ExitIRI->getInstruction()); + if (!ExitPhi) + break; + Value *IncomingValue = ExitPhi->getIncomingValueForBlock(Exiting); + VPValue *V = RecipeBuilder.getVPValueOrAddLiveIn(IncomingValue); + ExitIRI->addOperand(V); + } + + if (Exiting == OrigLoop->getLoopLatch()) { + if (MiddleVPBB->getNumSuccessors() == 0) { + VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph"); + VPBlockUtils::connectBlocks(MiddleVPBB, VPExitBlock); + VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); + } + continue; + } + + VPValue *M = RecipeBuilder.getBlockInMask( + OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc); + auto *N = Builder.createNot(M); + EarlyExitTaken = Builder.createNaryOp(VPInstruction::AnyOf, {N}); + VPBlockUtils::connectBlocks(LoopRegion, VPExitBlock); + } + auto *Term = dyn_cast(LatchVPBB->getTerminator()); + auto *IsLatchExiting = Builder.createICmp( + CmpInst::ICMP_EQ, Term->getOperand(0), Term->getOperand(1)); + Builder.createNaryOp(VPInstruction::BranchMultipleConds, + {EarlyExitTaken, IsLatchExiting}); + Term->eraseFromParent(); + VPBlockUtils::connectBlocks(LoopRegion, MiddleVPBB); +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 3b792ee32dce6..dd043ae170651 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -121,6 +121,10 @@ struct VPlanTransforms { /// Remove dead recipes from \p Plan. 
static void removeDeadRecipes(VPlan &Plan); + + static void convertToMultiCond(VPlan &Plan, ScalarEvolution &SE, + Loop *OrigLoop, + VPRecipeBuilder &RecipeBuilder); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 99bc4c38a3c3c..da8d0f26ba40a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -244,14 +244,6 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { return false; } - VPBlockBase *MiddleBB = - IRBB->getPlan()->getVectorLoopRegion()->getSingleSuccessor(); - if (IRBB != IRBB->getPlan()->getPreheader() && - IRBB->getSinglePredecessor() != MiddleBB) { - errs() << "VPIRBasicBlock can only be used as pre-header or a successor of " - "middle-block at the moment!\n"; - return false; - } return true; } @@ -269,9 +261,9 @@ static bool hasDuplicates(const SmallVectorImpl &VPBlockVec) { bool VPlanVerifier::verifyBlock(const VPBlockBase *VPB) { auto *VPBB = dyn_cast(VPB); // Check block's condition bit. 
-  if (VPB->getNumSuccessors() > 1 ||
-      (VPBB && VPBB->getParent() && VPBB->isExiting() &&
-       !VPBB->getParent()->isReplicator())) {
+  if (VPBB && (VPB->getNumSuccessors() > 1 ||
+               (VPBB->getParent() && VPBB->isExiting() &&
+                !VPBB->getParent()->isReplicator()))) {
     if (!VPBB || !VPBB->getTerminator()) {
       errs() << "Block has multiple successors but doesn't "
                 "have a proper branch recipe!\n";
@@ -409,8 +401,10 @@ bool VPlanVerifier::verify(const VPlan &Plan) {
   }
 
   auto *LastInst = dyn_cast<VPInstruction>(std::prev(Exiting->end()));
-  if (!LastInst || (LastInst->getOpcode() != VPInstruction::BranchOnCount &&
-                    LastInst->getOpcode() != VPInstruction::BranchOnCond)) {
-    errs() << "VPlan vector loop exit must end with BranchOnCount or "
-              "BranchOnCond VPInstruction\n";
+  if (!LastInst ||
+      (LastInst->getOpcode() != VPInstruction::BranchOnCount &&
+       LastInst->getOpcode() != VPInstruction::BranchOnCond &&
+       LastInst->getOpcode() != VPInstruction::BranchMultipleConds)) {
+    errs() << "VPlan vector loop exit must end with BranchOnCount, "
+              "BranchOnCond, or BranchMultipleConds VPInstruction\n";
     return false;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll
new file mode 100644
index 0000000000000..e93bfda04b11a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-codegen.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-multi-cond-vectorization %s | FileCheck --check-prefix=MULTI %s
+; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-multi-cond-vectorization=false %s | FileCheck --check-prefix=DEFAULT %s
+
+define i64 @multi_exit_with_store(ptr %p, i64 %N) {
+; MULTI-LABEL: define i64 @multi_exit_with_store(
+; MULTI-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; MULTI-NEXT: [[ENTRY:.*]]:
+; MULTI-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label 
%[[VECTOR_PH:.*]] +; MULTI: [[VECTOR_PH]]: +; MULTI-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[N]], i64 0 +; MULTI-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer +; MULTI-NEXT: br label %[[VECTOR_BODY:.*]] +; MULTI: [[VECTOR_BODY]]: +; MULTI-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY_SPLIT:.*]] ] +; MULTI-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY_SPLIT]] ] +; MULTI-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; MULTI-NEXT: [[TMP1:%.*]] = icmp uge <8 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; MULTI-NEXT: [[TMP2:%.*]] = xor <8 x i1> [[TMP1]], +; MULTI-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP0]] +; MULTI-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0 +; MULTI-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[TMP8]], i32 4, <8 x i1> [[TMP2]]) +; MULTI-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; MULTI-NEXT: [[TMP5:%.*]] = xor <8 x i1> [[TMP2]], +; MULTI-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP5]]) +; MULTI-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 +; MULTI-NEXT: br i1 [[TMP6]], label %[[E1:.*]], label %[[VECTOR_BODY_SPLIT]] +; MULTI: [[VECTOR_BODY_SPLIT]]: +; MULTI-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], +; MULTI-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; MULTI: [[MIDDLE_BLOCK]]: +; MULTI-NEXT: br i1 true, label %[[E2:.*]], label %[[SCALAR_PH]] +; MULTI: [[SCALAR_PH]]: +; MULTI-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; MULTI-NEXT: br label %[[LOOP_HEADER:.*]] +; MULTI: [[LOOP_HEADER]]: +; MULTI-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; MULTI-NEXT: [[CMP1:%.*]] = icmp uge i64 [[I_07]], [[N]] +; 
MULTI-NEXT: br i1 [[CMP1]], label %[[E1]], label %[[LOOP_LATCH]] +; MULTI: [[LOOP_LATCH]]: +; MULTI-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[I_07]] +; MULTI-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4 +; MULTI-NEXT: [[INC]] = add nuw i64 [[I_07]], 1 +; MULTI-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], 128 +; MULTI-NEXT: br i1 [[CMP_NOT]], label %[[E2]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP0:![0-9]+]] +; MULTI: [[E1]]: +; MULTI-NEXT: ret i64 0 +; MULTI: [[E2]]: +; MULTI-NEXT: ret i64 1 +; +; DEFAULT-LABEL: define i64 @multi_exit_with_store( +; DEFAULT-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; DEFAULT-NEXT: [[ENTRY:.*]]: +; DEFAULT-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 127) +; DEFAULT-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[UMIN]], 1 +; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP4]], 8 +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; DEFAULT: [[VECTOR_PH]]: +; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 8 +; DEFAULT-NEXT: [[TMP5:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; DEFAULT-NEXT: [[TMP2:%.*]] = select i1 [[TMP5]], i64 8, i64 [[N_MOD_VF]] +; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP2]] +; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] +; DEFAULT: [[VECTOR_BODY]]: +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[TMP0]] +; DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; DEFAULT-NEXT: store <8 x i32> zeroinitializer, ptr [[TMP3]], align 4 +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; DEFAULT-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; DEFAULT: 
[[MIDDLE_BLOCK]]: +; DEFAULT-NEXT: br label %[[SCALAR_PH]] +; DEFAULT: [[SCALAR_PH]]: +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; DEFAULT-NEXT: br label %[[LOOP_HEADER:.*]] +; DEFAULT: [[LOOP_HEADER]]: +; DEFAULT-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; DEFAULT-NEXT: [[CMP1:%.*]] = icmp uge i64 [[I_07]], [[N]] +; DEFAULT-NEXT: br i1 [[CMP1]], label %[[E1:.*]], label %[[LOOP_LATCH]] +; DEFAULT: [[LOOP_LATCH]]: +; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[I_07]] +; DEFAULT-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4 +; DEFAULT-NEXT: [[INC]] = add nuw i64 [[I_07]], 1 +; DEFAULT-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], 128 +; DEFAULT-NEXT: br i1 [[CMP_NOT]], label %[[E2:.*]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; DEFAULT: [[E1]]: +; DEFAULT-NEXT: ret i64 0 +; DEFAULT: [[E2]]: +; DEFAULT-NEXT: ret i64 1 +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] + %c.1 = icmp uge i64 %iv, %N + br i1 %c.1, label %e1, label %loop.latch + +loop.latch: + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv + store i32 0, ptr %arrayidx + %inc = add nuw i64 %iv, 1 + %c.2 = icmp eq i64 %inc, 128 + br i1 %c.2, label %e2, label %loop.header + +e1: + ret i64 0 + +e2: + ret i64 1 + +} +;. +; MULTI: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; MULTI: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"} +; MULTI: [[META2]] = !{!"llvm.loop.isvectorized", i32 1} +;. +; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; DEFAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; DEFAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; DEFAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;. 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
index cd128979fc143..1c02f10753745 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll
@@ -5,18 +5,18 @@ define i64 @test_value_in_exit_compare_chain_used_outside(ptr %src, i64 %x, i64
 ; CHECK-LABEL: define i64 @test_value_in_exit_compare_chain_used_outside(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[X:%.*]], i64 range(i64 1, 32) [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[N]], -1
-; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[TMP0]]
-; CHECK-NEXT: [[UMIN2:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[X]])
-; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[UMIN2]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
-; CHECK: [[VECTOR_SCEVCHECK]]:
 ; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[N]], -1
 ; CHECK-NEXT: [[TMP4:%.*]] = freeze i64 [[TMP3]]
 ; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP4]], i64 [[X]])
-; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[UMIN]] to i1
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[UMIN]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[UMIN]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK: [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT: [[TMP32:%.*]] = add nsw i64 [[N]], -1
+; CHECK-NEXT: [[TMP33:%.*]] = freeze i64 [[TMP32]]
+; CHECK-NEXT: [[UMIN1:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP33]], i64 [[X]])
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[UMIN1]] to i1
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[UMIN1]], 1
 ; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
 ; CHECK-NEXT: br i1 [[TMP7]], label %[[SCALAR_PH]], label 
%[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll new file mode 100644 index 0000000000000..06672680ff715 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-vplan.ll @@ -0,0 +1,134 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-vectorize -mcpu=skylake-avx512 -mtriple=x86_64-apple-macosx -force-vector-interleave=1 -S -enable-multi-cond-vectorization -debug %s 2>&1 | FileCheck %s + +define i64 @multi_exiting_to_different_exits_with_store(ptr %p, i64 %N) { +; CHECK-LABEL: VPlan 'Final VPlan for VF={2,4,8},UF={1}' { +; CHECK-NEXT: Live-in vp<%0> = VF +; CHECK-NEXT: Live-in vp<%1> = VF * UF +; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: Live-in ir<128> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%7> +; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<%0> +; CHECK-NEXT: vp<%4> = SCALAR-STEPS vp<%3>, ir<1> +; CHECK-NEXT: WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N> +; CHECK-NEXT: EMIT vp<%5> = not ir<%c.1> +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<%4> +; CHECK-NEXT: vp<%6> = vector-pointer ir<%arrayidx> +; CHECK-NEXT: WIDEN store vp<%6>, ir<0>, vp<%5> +; CHECK-NEXT: EMIT vp<%7> = add nuw vp<%3>, vp<%1> +; CHECK-NEXT: EMIT vp<%8> = not vp<%5> +; CHECK-NEXT: EMIT vp<%9> = any-of vp<%8> + ; CHECK-NEXT: EMIT vp<%10> = icmp eq vp<%7>, vp<%2> +; CHECK-NEXT: EMIT branch-on-multi-cond vp<%9>, vp<%10> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): ir-bb, middle.block +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0>) +; CHECK-NEXT: No successors 
+; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%12> = icmp eq ir<128>, vp<%2> +; CHECK-NEXT: EMIT branch-on-cond vp<%12> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1>) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] + %c.1 = icmp uge i64 %iv, %N + br i1 %c.1, label %e1, label %loop.latch + +loop.latch: + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv + store i32 0, ptr %arrayidx + %inc = add nuw i64 %iv, 1 + %c.2 = icmp eq i64 %inc, 128 + br i1 %c.2, label %e2, label %loop.header + +e1: + %p1 = phi i64 [ 0, %loop.header ] + ret i64 %p1 + +e2: + %p2 = phi i64 [ 1, %loop.latch ] + ret i64 %p2 +} + +define i64 @multi_exiting_to_same_exit_with_store(ptr %p, i64 %N) { +; CHECK-LABEL: VPlan 'Final VPlan for VF={2,4,8},UF={1}' { +; CHECK-NEXT: Live-in vp<%0> = VF +; CHECK-NEXT: Live-in vp<%1> = VF * UF +; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: Live-in ir<128> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%7> +; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %inc, 0, ir<1>, vp<%0> +; CHECK-NEXT: vp<%4> = SCALAR-STEPS vp<%3>, ir<1> +; CHECK-NEXT: WIDEN ir<%c.1> = icmp uge ir<%iv>, ir<%N> +; CHECK-NEXT: EMIT vp<%5> = not ir<%c.1> +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%p>, vp<%4> +; CHECK-NEXT: vp<%6> = vector-pointer ir<%arrayidx> +; CHECK-NEXT: WIDEN store vp<%6>, ir<0>, vp<%5> +; CHECK-NEXT: EMIT vp<%7> = add nuw vp<%3>, vp<%1> +; CHECK-NEXT: EMIT vp<%8> = not vp<%5> +; CHECK-NEXT: EMIT vp<%9> = any-of vp<%8> + ; CHECK-NEXT: EMIT vp<%10> = 
icmp eq vp<%7>, vp<%2> +; CHECK-NEXT: EMIT branch-on-multi-cond vp<%9>, vp<%10> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): ir-bb, middle.block +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] (extra operand: ir<0>, ir<1>) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%12> = icmp eq ir<128>, vp<%2> +; CHECK-NEXT: EMIT branch-on-cond vp<%12> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] + %c.1 = icmp uge i64 %iv, %N + br i1 %c.1, label %e, label %loop.latch + +loop.latch: + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv + store i32 0, ptr %arrayidx + %inc = add nuw i64 %iv, 1 + %c.2 = icmp eq i64 %inc, 128 + br i1 %c.2, label %e, label %loop.header + +e: + %p1 = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] + ret i64 %p1 +}