@@ -2972,22 +2972,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
29722972 SplitBlock (LoopMiddleBlock, LoopMiddleBlock->getTerminator (), DT, LI,
29732973 nullptr , Twine (Prefix) + " scalar.ph" );
29742974
2975- auto *ScalarLatchTerm = OrigLoop->getLoopLatch ()->getTerminator ();
2976-
2977- // Set up the middle block terminator. Two cases:
2978- // 1) If we know that we must execute the scalar epilogue, emit an
2979- // unconditional branch.
2980- // 2) Otherwise, we must have a single unique exit block (due to how we
2981- // implement the multiple exit case). In this case, set up a conditional
2982- // branch from the middle block to the loop scalar preheader, and the
2983- // exit block. completeLoopSkeleton will update the condition to use an
2984- // iteration check, if required to decide whether to execute the remainder.
2985- BranchInst *BrInst =
2986- Cost->requiresScalarEpilogue (VF.isVector ())
2987- ? BranchInst::Create (LoopScalarPreHeader)
2988- : BranchInst::Create (LoopExitBlock, LoopScalarPreHeader,
2989- Builder.getTrue ());
2990- BrInst->setDebugLoc (ScalarLatchTerm->getDebugLoc ());
2975+ auto *BrInst = new UnreachableInst (LoopMiddleBlock->getContext ());
29912976 ReplaceInstWithInst (LoopMiddleBlock->getTerminator (), BrInst);
29922977
29932978 // Update dominator for loop exit. During skeleton creation, only the vector
@@ -3094,51 +3079,6 @@ void InnerLoopVectorizer::createInductionResumeValues(
30943079 }
30953080}
30963081
3097- BasicBlock *InnerLoopVectorizer::completeLoopSkeleton () { // Installs the middle-block exit check when one is needed; returns LoopVectorPreHeader.
3098- // The trip counts should be cached by now.
3099- Value *Count = getTripCount ();
3100- Value *VectorTripCount = getOrCreateVectorTripCount (LoopVectorPreHeader);
3101-
3102- auto *ScalarLatchTerm = OrigLoop->getLoopLatch ()->getTerminator ();
3103-
3104- // Add a check in the middle block to see if we have completed
3105- // all of the iterations in the first vector loop. Three cases:
3106- // 1) If we require a scalar epilogue, there is no conditional branch as
3107- // we unconditionally branch to the scalar preheader. Do nothing.
3108- // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3109- // Thus if tail is to be folded, we know we don't need to run the
3110- // remainder and we can use the previous value for the condition (true).
3111- // 3) Otherwise, construct a runtime check.
3112- if (!Cost->requiresScalarEpilogue (VF.isVector ()) &&
3113- !Cost->foldTailByMasking ()) {
3114- // Here we use the same DebugLoc as the scalar loop latch terminator instead
3115- // of the corresponding compare because they may have ended up with
3116- // different line numbers and we want to avoid awkward line stepping while
3117- // debugging, e.g. if the compare has a line number inside the loop.
3118- // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
3119- // operands. Perform simplification directly on VPlan once the branch is
3120- // modeled there.
3121- IRBuilder<> B (LoopMiddleBlock->getTerminator ());
3122- B.SetCurrentDebugLocation (ScalarLatchTerm->getDebugLoc ());
3123- Value *CmpN = B.CreateICmpEQ (Count, VectorTripCount, " cmp.n" ); // Case 3: compare the full trip count against the vector trip count.
3124- BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator ());
3125- BI.setCondition (CmpN);
3126- if (hasBranchWeightMD (*ScalarLatchTerm)) {
3127- // Assume that `Count % VectorTripCount` is equally distributed.
3128- unsigned TripCount = UF * VF.getKnownMinValue (); // Despite the name, this is the UF * VF step, not the loop's trip count.
3129- assert (TripCount > 0 && " trip count should not be zero" );
3130- const uint32_t Weights[] = {1 , TripCount - 1 };
3131- setBranchWeights (BI, Weights);
3132- }
3133- }
3134-
3135- #ifdef EXPENSIVE_CHECKS
3136- assert (DT->verify (DominatorTree::VerificationLevel::Fast));
3137- #endif
3138-
3139- return LoopVectorPreHeader;
3140- }
3141-
31423082std::pair<BasicBlock *, Value *>
31433083InnerLoopVectorizer::createVectorizedLoopSkeleton (
31443084 const SCEV2ValueTy &ExpandedSCEVs) {
@@ -3198,7 +3138,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
31983138 // Emit phis for the new starting index of the scalar loop.
31993139 createInductionResumeValues (ExpandedSCEVs);
32003140
3201- return {completeLoopSkeleton () , nullptr };
3141+ return {LoopVectorPreHeader , nullptr };
32023142}
32033143
32043144// Fix up external users of the induction variable. At this point, we are
@@ -3470,6 +3410,18 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
34703410 VF.getKnownMinValue () * UF);
34713411}
34723412
3413+ // Helper to reorder blocks so they match the original order even after the
3414+ // order of the predecessors changes. This is only used to avoid a number of
3415+ // test changes due to reordering of incoming blocks in phi nodes and should be
3416+ // removed soon, with the tests being updated. Blocks is permuted in place and must be non-empty.
3417+ static void reorderIncomingBlocks (SmallVectorImpl<BasicBlock *> &Blocks,
3418+ BasicBlock *LoopMiddleBlock) {
3419+ if (Blocks.front () == LoopMiddleBlock) // front() is read unconditionally, hence the non-empty requirement.
3420+ std::swap (Blocks.front (), Blocks.back ()); // Exchange the leading middle block with the last entry.
3421+ if (Blocks.size () == 3 ) // With exactly three entries, additionally exchange the first two.
3422+ std::swap (Blocks[0 ], Blocks[1 ]);
3423+ }
3424+
34733425void InnerLoopVectorizer::fixFixedOrderRecurrence (VPLiveOut *LO,
34743426 VPTransformState &State) {
34753427 // Extract the last vector element in the middle block. This will be the
@@ -3488,7 +3440,9 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence(VPLiveOut *LO,
34883440 Builder.SetInsertPoint (LoopScalarPreHeader, LoopScalarPreHeader->begin ());
34893441 auto *ScalarPreheaderPhi =
34903442 Builder.CreatePHI (ScalarHeaderPhi->getType (), 2 , " scalar.recur.init" );
3491- for (auto *BB : predecessors (LoopScalarPreHeader)) {
3443+ SmallVector<BasicBlock *> Blocks (predecessors (LoopScalarPreHeader));
3444+ reorderIncomingBlocks (Blocks, LoopMiddleBlock);
3445+ for (auto *BB : Blocks) {
34923446 auto *Incoming = BB == LoopMiddleBlock ? ResumeScalarFOR : InitScalarFOR;
34933447 ScalarPreheaderPhi->addIncoming (Incoming, BB);
34943448 }
@@ -7388,7 +7342,9 @@ static void createAndCollectMergePhiForReduction(
73887342 // If we are fixing reductions in the epilogue loop then we should already
73897343 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
73907344 // we carry over the incoming values correctly.
7391- for (auto *Incoming : predecessors (LoopScalarPreHeader)) {
7345+ SmallVector<BasicBlock *> Blocks (predecessors (LoopScalarPreHeader));
7346+ reorderIncomingBlocks (Blocks, LoopMiddleBlock);
7347+ for (auto *Incoming : Blocks) {
73927348 if (Incoming == LoopMiddleBlock)
73937349 BCBlockPhi->addIncoming (FinalValue, Incoming);
73947350 else if (ResumePhi && is_contained (ResumePhi->blocks (), Incoming))
@@ -7459,6 +7415,21 @@ LoopVectorizationPlanner::executePlan(
74597415 std::tie (State.CFG .PrevBB , CanonicalIVStartValue) =
74607416 ILV.createVectorizedLoopSkeleton (ExpandedSCEVs ? *ExpandedSCEVs
74617417 : State.ExpandedSCEVs );
7418+ #ifdef EXPENSIVE_CHECKS
7419+ assert (DT->verify (DominatorTree::VerificationLevel::Fast));
7420+ #endif
7421+
7422+ VPBasicBlock *MiddleVPBB =
7423+ cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion ()->getSingleSuccessor ());
7424+
7425+ using namespace llvm ::VPlanPatternMatch;
7426+ if (MiddleVPBB->begin () != MiddleVPBB->end () &&
7427+ match (&MiddleVPBB->back (), m_BranchOnCond (m_VPValue ()))) {
7428+ cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors ()[1 ])
7429+ ->resetBlock (OrigLoop->getLoopPreheader ());
7430+ } else
7431+ cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors ()[0 ])
7432+ ->resetBlock (OrigLoop->getLoopPreheader ());
74627433
74637434 // Only use noalias metadata when using memory checks guaranteeing no overlap
74647435 // across all iterations.
@@ -7539,6 +7510,18 @@ LoopVectorizationPlanner::executePlan(
75397510
75407511 ILV.printDebugTracesAtEnd ();
75417512
7513+ // Adjust branch weight of the branch in the middle block.
7514+ auto *MiddleTerm =
7515+ cast<BranchInst>(State.CFG .VPBB2IRBB [ExitVPBB]->getTerminator ());
7516+ if (MiddleTerm->isConditional () &&
7517+ hasBranchWeightMD (*OrigLoop->getLoopLatch ()->getTerminator ())) {
7518+ // Assume that `Count % VectorTripCount` is equally distributed.
7519+ unsigned TripCount = State.UF * State.VF .getKnownMinValue ();
7520+ assert (TripCount > 0 && " trip count should not be zero" );
7521+ const uint32_t Weights[] = {1 , TripCount - 1 };
7522+ setBranchWeights (*MiddleTerm, Weights);
7523+ }
7524+
75427525 return {State.ExpandedSCEVs , ReductionResumeValues};
75437526}
75447527
@@ -7595,7 +7578,7 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
75957578 // inductions in the epilogue loop are created before executing the plan for
75967579 // the epilogue loop.
75977580
7598- return {completeLoopSkeleton () , nullptr };
7581+ return {LoopVectorPreHeader , nullptr };
75997582}
76007583
76017584void EpilogueVectorizerMainLoop::printDebugTracesAtStart () {
@@ -7719,8 +7702,11 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
77197702 VecEpilogueIterationCountCheck,
77207703 VecEpilogueIterationCountCheck->getSinglePredecessor ());
77217704
7722- DT->changeImmediateDominator (LoopScalarPreHeader,
7723- EPI.EpilogueIterationCountCheck );
7705+ if (auto *N = DT->getNode (LoopScalarPreHeader))
7706+ DT->changeImmediateDominator (LoopScalarPreHeader,
7707+ EPI.EpilogueIterationCountCheck );
7708+ else
7709+ DT->addNewBlock (LoopScalarPreHeader, EPI.EpilogueIterationCountCheck );
77247710 if (!Cost->requiresScalarEpilogue (EPI.EpilogueVF .isVector ()))
77257711 // If there is an epilogue which must run, there's no edge from the
77267712 // middle block to exit blocks and thus no need to update the immediate
@@ -7784,7 +7770,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
77847770 {VecEpilogueIterationCountCheck,
77857771 EPI.VectorTripCount } /* AdditionalBypass */ );
77867772
7787- return {completeLoopSkeleton () , EPResumeVal};
7773+ return {LoopVectorPreHeader , EPResumeVal};
77887774}
77897775
77907776BasicBlock *
@@ -8534,9 +8520,25 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
85348520 // modified; a basic block for the vector pre-header, followed by a region for
85358521 // the vector loop, followed by the middle basic block. The skeleton vector
85368522 // loop region contains a header and latch basic blocks.
8523+
8524+ // Add a check in the middle block to see if we have completed
8525+ // all of the iterations in the first vector loop. Three cases:
8526+ // 1) If we require a scalar epilogue, there is no conditional branch as
8527+ // we unconditionally branch to the scalar preheader. Do nothing.
8528+ // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
8529+ // Thus if tail is to be folded, we know we don't need to run the
8530+ // remainder and we can use the previous value for the condition (true).
8531+ // 3) Otherwise, construct a runtime check.
8532+ bool RequiresScalarEpilogueCheck =
8533+ LoopVectorizationPlanner::getDecisionAndClampRange (
8534+ [this ](ElementCount VF) {
8535+ return !CM.requiresScalarEpilogue (VF.isVector ());
8536+ },
8537+ Range);
85378538 VPlanPtr Plan = VPlan::createInitialVPlan (
85388539 createTripCountSCEV (Legal->getWidestInductionType (), PSE, OrigLoop),
8539- *PSE.getSE (), OrigLoop->getLoopPreheader ());
8540+ *PSE.getSE (), RequiresScalarEpilogueCheck, CM.foldTailByMasking (),
8541+ OrigLoop);
85408542 VPBasicBlock *HeaderVPBB = new VPBasicBlock (" vector.body" );
85418543 VPBasicBlock *LatchVPBB = new VPBasicBlock (" vector.latch" );
85428544 VPBlockUtils::insertBlockAfter (LatchVPBB, HeaderVPBB);
@@ -8784,7 +8786,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
87848786 // Create new empty VPlan
87858787 auto Plan = VPlan::createInitialVPlan (
87868788 createTripCountSCEV (Legal->getWidestInductionType (), PSE, OrigLoop),
8787- *PSE.getSE (), OrigLoop-> getLoopPreheader () );
8789+ *PSE.getSE (), true , false , OrigLoop);
87888790
87898791 // Build hierarchical CFG
87908792 VPlanHCFGBuilder HCFGBuilder (OrigLoop, LI, *Plan);
@@ -8993,6 +8995,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
89938995 }
89948996 }
89958997 Builder.setInsertPoint (&*LatchVPBB->begin ());
8998+ VPBasicBlock *MiddleVPBB =
8999+ cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor ());
9000+ VPBasicBlock::iterator IP = MiddleVPBB->begin ();
89969001 for (VPRecipeBase &R :
89979002 Plan->getVectorLoopRegion ()->getEntryBasicBlock ()->phis ()) {
89989003 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
@@ -9101,8 +9106,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
91019106 // also modeled in VPlan.
91029107 auto *FinalReductionResult = new VPInstruction (
91039108 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9104- cast<VPBasicBlock>(VectorLoopRegion-> getSingleSuccessor ())
9105- -> appendRecipe (FinalReductionResult );
9109+ FinalReductionResult-> insertBefore (*MiddleVPBB, IP);
9110+ IP = std::next (FinalReductionResult-> getIterator () );
91069111 OrigExitingVPV->replaceUsesWithIf (
91079112 FinalReductionResult,
91089113 [](VPUser &User, unsigned ) { return isa<VPLiveOut>(&User); });
0 commit comments