@@ -2964,34 +2964,6 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
29642964 LoopScalarPreHeader =
29652965 SplitBlock (LoopMiddleBlock, LoopMiddleBlock->getTerminator (), DT, LI,
29662966 nullptr , Twine (Prefix) + " scalar.ph" );
2967-
2968- // Set up the middle block terminator. Two cases:
2969- // 1) If we know that we must execute the scalar epilogue, retain the existing
2970- // unconditional branch from the middle block to the scalar preheader. In that
2971- // case, there's no edge from the middle block to exit blocks and thus no
2972- // need to update the immediate dominator of the exit blocks.
2973- if (Cost->requiresScalarEpilogue (VF.isVector ())) {
2974- assert (
2975- LoopMiddleBlock->getSingleSuccessor () == LoopScalarPreHeader &&
2976- " middle block should have the scalar preheader as single successor" );
2977- return ;
2978- }
2979-
2980- // 2) Otherwise, we must have a single unique exit block (due to how we
2981- // implement the multiple exit case). In this case, set up a conditional
2982- // branch from the middle block to the loop scalar preheader, and the
2983- // exit block. completeLoopSkeleton will update the condition to use an
2984- // iteration check, if required to decide whether to execute the remainder.
2985- BranchInst *BrInst =
2986- BranchInst::Create (LoopExitBlock, LoopScalarPreHeader, Builder.getTrue ());
2987- auto *ScalarLatchTerm = OrigLoop->getLoopLatch ()->getTerminator ();
2988- BrInst->setDebugLoc (ScalarLatchTerm->getDebugLoc ());
2989- ReplaceInstWithInst (LoopMiddleBlock->getTerminator (), BrInst);
2990-
2991- // Update dominator for loop exit. During skeleton creation, only the vector
2992- // pre-header and the middle block are created. The vector loop is entirely
2993- // created during VPlan execution.
2994- DT->changeImmediateDominator (LoopExitBlock, LoopMiddleBlock);
29952967}
29962968
29972969PHINode *InnerLoopVectorizer::createInductionResumeValue (
@@ -3088,51 +3060,6 @@ void InnerLoopVectorizer::createInductionResumeValues(
30883060 }
30893061}
30903062
3091- BasicBlock *InnerLoopVectorizer::completeLoopSkeleton () {
3092- // Sets the middle-block branch condition (when a runtime check is needed) and returns LoopVectorPreHeader. The trip counts should be cached by now.
3093- Value *Count = getTripCount ();
3094- Value *VectorTripCount = getOrCreateVectorTripCount (LoopVectorPreHeader);
3095-
3096- auto *ScalarLatchTerm = OrigLoop->getLoopLatch ()->getTerminator ();
3097-
3098- // Add a check in the middle block to see if we have completed
3099- // all of the iterations in the first vector loop. Three cases:
3100- // 1) If we require a scalar epilogue, there is no conditional branch as
3101- // we unconditionally branch to the scalar preheader. Do nothing.
3102- // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3103- // Thus if the tail is folded by masking, we know we don't need to run the
3104- // remainder and we can keep the existing value of the condition (true).
3105- // 3) Otherwise, construct a runtime check (Count == VectorTripCount).
3106- if (!Cost->requiresScalarEpilogue (VF.isVector ()) &&
3107- !Cost->foldTailByMasking ()) {
3108- // Here we use the same DebugLoc as the scalar loop latch terminator instead
3109- // of the corresponding compare because they may have ended up with
3110- // different line numbers and we want to avoid awkward line stepping while
3111- // debugging. E.g. if the compare has got a line number inside the loop.
3112- // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
3113- // operands. Perform simplification directly on VPlan once the branch is
3114- // modeled there.
3115- IRBuilder<> B (LoopMiddleBlock->getTerminator ());
3116- B.SetCurrentDebugLocation (ScalarLatchTerm->getDebugLoc ());
3117- Value *CmpN = B.CreateICmpEQ (Count, VectorTripCount, " cmp.n" );
3118- BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator ());
3119- BI.setCondition (CmpN);
3120- if (hasBranchWeightMD (*ScalarLatchTerm)) {
3121- // Assume that `Count % VectorTripCount` is equally distributed. `TripCount` below is UF * VF.getKnownMinValue(), not the loop's trip count; the weights are 1 : TripCount - 1.
3122- unsigned TripCount = UF * VF.getKnownMinValue ();
3123- assert (TripCount > 0 && " trip count should not be zero" );
3124- const uint32_t Weights[] = {1 , TripCount - 1 };
3125- setBranchWeights (BI, Weights, /* IsExpected=*/ false );
3126- }
3127- }
3128-
3129- #ifdef EXPENSIVE_CHECKS
3130- assert (DT->verify (DominatorTree::VerificationLevel::Fast));
3131- #endif
3132-
3133- return LoopVectorPreHeader;
3134- }
3135-
31363063std::pair<BasicBlock *, Value *>
31373064InnerLoopVectorizer::createVectorizedLoopSkeleton (
31383065 const SCEV2ValueTy &ExpandedSCEVs) {
@@ -3155,17 +3082,18 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
31553082 | [ ]_| <-- vector loop (created during VPlan execution).
31563083 | |
31573084 | v
3158- \ -[ ] <--- middle-block.
3085+ \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to
3086+ | | successors created during VPlan execution)
31593087 \/ |
31603088 /\ v
3161- | ->[ ] <--- new preheader.
3089+ | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock) .
31623090 | |
31633091 (opt) v <-- edge from middle to exit iff epilogue is not required.
31643092 | [ ] \
31653093 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
31663094 \ |
31673095 \ v
3168- >[ ] <-- exit block(s).
3096+ >[ ] <-- exit block(s) (wrapped in VPIRBasicBlock).
31693097 ...
31703098 */
31713099
@@ -3192,7 +3120,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
31923120 // Emit phis for the new starting index of the scalar loop.
31933121 createInductionResumeValues (ExpandedSCEVs);
31943122
3195- return {completeLoopSkeleton () , nullptr };
3123+ return {LoopVectorPreHeader , nullptr };
31963124}
31973125
31983126// Fix up external users of the induction variable. At this point, we are
@@ -7477,6 +7405,9 @@ LoopVectorizationPlanner::executePlan(
74777405 std::tie (State.CFG .PrevBB , CanonicalIVStartValue) =
74787406 ILV.createVectorizedLoopSkeleton (ExpandedSCEVs ? *ExpandedSCEVs
74797407 : State.ExpandedSCEVs );
7408+ #ifdef EXPENSIVE_CHECKS
7409+ assert (DT->verify (DominatorTree::VerificationLevel::Fast));
7410+ #endif
74807411
74817412 // Only use noalias metadata when using memory checks guaranteeing no overlap
74827413 // across all iterations.
@@ -7557,6 +7488,18 @@ LoopVectorizationPlanner::executePlan(
75577488
75587489 ILV.printDebugTracesAtEnd ();
75597490
7491+ // 4. Adjust branch weight of the branch in the middle block.
7492+ auto *MiddleTerm =
7493+ cast<BranchInst>(State.CFG .VPBB2IRBB [ExitVPBB]->getTerminator ());
7494+ if (MiddleTerm->isConditional () &&
7495+ hasBranchWeightMD (*OrigLoop->getLoopLatch ()->getTerminator ())) {
7496+ // Assume that `Count % VectorTripCount` is equally distributed. `TripCount` below is State.UF * State.VF.getKnownMinValue(), not the loop's trip count.
7497+ unsigned TripCount = State.UF * State.VF .getKnownMinValue ();
7498+ assert (TripCount > 0 && " trip count should not be zero" );
7499+ const uint32_t Weights[] = {1 , TripCount - 1 };
7500+ setBranchWeights (*MiddleTerm, Weights, /* IsExpected=*/ false );
7501+ }
7502+
75607503 return {State.ExpandedSCEVs , ReductionResumeValues};
75617504}
75627505
@@ -7613,7 +7556,7 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
76137556 // inductions in the epilogue loop are created before executing the plan for
76147557 // the epilogue loop.
76157558
7616- return {completeLoopSkeleton () , nullptr };
7559+ return {LoopVectorPreHeader , nullptr };
76177560}
76187561
76197562void EpilogueVectorizerMainLoop::printDebugTracesAtStart () {
@@ -7802,7 +7745,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
78027745 {VecEpilogueIterationCountCheck,
78037746 EPI.VectorTripCount } /* AdditionalBypass */ );
78047747
7805- return {completeLoopSkeleton () , EPResumeVal};
7748+ return {LoopVectorPreHeader , EPResumeVal};
78067749}
78077750
78087751BasicBlock *
@@ -7847,7 +7790,6 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
78477790 setBranchWeights (BI, Weights, /* IsExpected=*/ false );
78487791 }
78497792 ReplaceInstWithInst (Insert->getTerminator (), &BI);
7850-
78517793 LoopBypassBlocks.push_back (Insert);
78527794 return Insert;
78537795}
@@ -8552,9 +8494,17 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
85528494 // modified; a basic block for the vector pre-header, followed by a region for
85538495 // the vector loop, followed by the middle basic block. The skeleton vector
85548496 // loop region contains a header and latch basic blocks.
8497+
8498+ bool RequiresScalarEpilogueCheck =
8499+ LoopVectorizationPlanner::getDecisionAndClampRange (
8500+ [this ](ElementCount VF) {
8501+ return !CM.requiresScalarEpilogue (VF.isVector ());
8502+ },
8503+ Range);
85558504 VPlanPtr Plan = VPlan::createInitialVPlan (
85568505 createTripCountSCEV (Legal->getWidestInductionType (), PSE, OrigLoop),
8557- *PSE.getSE (), OrigLoop->getLoopPreheader ());
8506+ *PSE.getSE (), RequiresScalarEpilogueCheck, CM.foldTailByMasking (),
8507+ OrigLoop);
85588508 VPBasicBlock *HeaderVPBB = new VPBasicBlock (" vector.body" );
85598509 VPBasicBlock *LatchVPBB = new VPBasicBlock (" vector.latch" );
85608510 VPBlockUtils::insertBlockAfter (LatchVPBB, HeaderVPBB);
@@ -8802,7 +8752,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
88028752 // Create new empty VPlan
88038753 auto Plan = VPlan::createInitialVPlan (
88048754 createTripCountSCEV (Legal->getWidestInductionType (), PSE, OrigLoop),
8805- *PSE.getSE (), OrigLoop-> getLoopPreheader () );
8755+ *PSE.getSE (), true , false , OrigLoop);
88068756
88078757 // Build hierarchical CFG
88088758 VPlanHCFGBuilder HCFGBuilder (OrigLoop, LI, *Plan);
@@ -10163,6 +10113,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1016310113 cast<VPHeaderPHIRecipe>(&R)->setStartValue (StartVal);
1016410114 }
1016510115
10116+ assert (DT->verify (DominatorTree::VerificationLevel::Fast) &&
10117+ " DT not preserved correctly" );
1016610118 LVP.executePlan (EPI.EpilogueVF , EPI.EpilogueUF , BestEpiPlan, EpilogILV,
1016710119 DT, true , &ExpandedSCEVs);
1016810120 ++LoopsEpilogueVectorized;
0 commit comments