@@ -2964,34 +2964,6 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2964
2964
LoopScalarPreHeader =
2965
2965
SplitBlock (LoopMiddleBlock, LoopMiddleBlock->getTerminator (), DT, LI,
2966
2966
nullptr , Twine (Prefix) + " scalar.ph" );
2967
-
2968
- // Set up the middle block terminator. Two cases:
2969
- // 1) If we know that we must execute the scalar epilogue, retain the existing
2970
- // unconditional branch from the middle block to the scalar preheader. In that
2971
- // case, there's no edge from the middle block to exit blocks and thus no
2972
- // need to update the immediate dominator of the exit blocks.
2973
- if (Cost->requiresScalarEpilogue (VF.isVector ())) {
2974
- assert (
2975
- LoopMiddleBlock->getSingleSuccessor () == LoopScalarPreHeader &&
2976
- " middle block should have the scalar preheader as single successor" );
2977
- return ;
2978
- }
2979
-
2980
- // 2) Otherwise, we must have a single unique exit block (due to how we
2981
- // implement the multiple exit case). In this case, set up a conditional
2982
- // branch from the middle block to the loop scalar preheader, and the
2983
- // exit block. completeLoopSkeleton will update the condition to use an
2984
- // iteration check, if required to decide whether to execute the remainder.
2985
- BranchInst *BrInst =
2986
- BranchInst::Create (LoopExitBlock, LoopScalarPreHeader, Builder.getTrue ());
2987
- auto *ScalarLatchTerm = OrigLoop->getLoopLatch ()->getTerminator ();
2988
- BrInst->setDebugLoc (ScalarLatchTerm->getDebugLoc ());
2989
- ReplaceInstWithInst (LoopMiddleBlock->getTerminator (), BrInst);
2990
-
2991
- // Update dominator for loop exit. During skeleton creation, only the vector
2992
- // pre-header and the middle block are created. The vector loop is entirely
2993
- // created during VPlan execution.
2994
- DT->changeImmediateDominator (LoopExitBlock, LoopMiddleBlock);
2995
2967
}
2996
2968
2997
2969
PHINode *InnerLoopVectorizer::createInductionResumeValue (
@@ -3088,51 +3060,6 @@ void InnerLoopVectorizer::createInductionResumeValues(
3088
3060
}
3089
3061
}
3090
3062
3091
- BasicBlock *InnerLoopVectorizer::completeLoopSkeleton () {
3092
- // The trip counts should be cached by now.
3093
- Value *Count = getTripCount ();
3094
- Value *VectorTripCount = getOrCreateVectorTripCount (LoopVectorPreHeader);
3095
-
3096
- auto *ScalarLatchTerm = OrigLoop->getLoopLatch ()->getTerminator ();
3097
-
3098
- // Add a check in the middle block to see if we have completed
3099
- // all of the iterations in the first vector loop. Three cases:
3100
- // 1) If we require a scalar epilogue, there is no conditional branch as
3101
- // we unconditionally branch to the scalar preheader. Do nothing.
3102
- // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3103
- // Thus if tail is to be folded, we know we don't need to run the
3104
- // remainder and we can use the previous value for the condition (true).
3105
- // 3) Otherwise, construct a runtime check.
3106
- if (!Cost->requiresScalarEpilogue (VF.isVector ()) &&
3107
- !Cost->foldTailByMasking ()) {
3108
- // Here we use the same DebugLoc as the scalar loop latch terminator instead
3109
- // of the corresponding compare because they may have ended up with
3110
- // different line numbers and we want to avoid awkward line stepping while
3111
- // debugging. Eg. if the compare has got a line number inside the loop.
3112
- // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
3113
- // operands. Perform simplification directly on VPlan once the branch is
3114
- // modeled there.
3115
- IRBuilder<> B (LoopMiddleBlock->getTerminator ());
3116
- B.SetCurrentDebugLocation (ScalarLatchTerm->getDebugLoc ());
3117
- Value *CmpN = B.CreateICmpEQ (Count, VectorTripCount, " cmp.n" );
3118
- BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator ());
3119
- BI.setCondition (CmpN);
3120
- if (hasBranchWeightMD (*ScalarLatchTerm)) {
3121
- // Assume that `Count % VectorTripCount` is equally distributed.
3122
- unsigned TripCount = UF * VF.getKnownMinValue ();
3123
- assert (TripCount > 0 && " trip count should not be zero" );
3124
- const uint32_t Weights[] = {1 , TripCount - 1 };
3125
- setBranchWeights (BI, Weights, /* IsExpected=*/ false );
3126
- }
3127
- }
3128
-
3129
- #ifdef EXPENSIVE_CHECKS
3130
- assert (DT->verify (DominatorTree::VerificationLevel::Fast));
3131
- #endif
3132
-
3133
- return LoopVectorPreHeader;
3134
- }
3135
-
3136
3063
std::pair<BasicBlock *, Value *>
3137
3064
InnerLoopVectorizer::createVectorizedLoopSkeleton (
3138
3065
const SCEV2ValueTy &ExpandedSCEVs) {
@@ -3155,17 +3082,18 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
3155
3082
| [ ]_| <-- vector loop (created during VPlan execution).
3156
3083
| |
3157
3084
| v
3158
- \ -[ ] <--- middle-block.
3085
+ \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to
3086
+ | | successors created during VPlan execution)
3159
3087
\/ |
3160
3088
/\ v
3161
- | ->[ ] <--- new preheader.
3089
+ | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock) .
3162
3090
| |
3163
3091
(opt) v <-- edge from middle to exit iff epilogue is not required.
3164
3092
| [ ] \
3165
3093
| [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
3166
3094
\ |
3167
3095
\ v
3168
- >[ ] <-- exit block(s).
3096
+ >[ ] <-- exit block(s) (wrapped in VPIRBasicBlock).
3169
3097
...
3170
3098
*/
3171
3099
@@ -3192,7 +3120,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
3192
3120
// Emit phis for the new starting index of the scalar loop.
3193
3121
createInductionResumeValues (ExpandedSCEVs);
3194
3122
3195
- return {completeLoopSkeleton () , nullptr };
3123
+ return {LoopVectorPreHeader , nullptr };
3196
3124
}
3197
3125
3198
3126
// Fix up external users of the induction variable. At this point, we are
@@ -7477,6 +7405,9 @@ LoopVectorizationPlanner::executePlan(
7477
7405
std::tie (State.CFG .PrevBB , CanonicalIVStartValue) =
7478
7406
ILV.createVectorizedLoopSkeleton (ExpandedSCEVs ? *ExpandedSCEVs
7479
7407
: State.ExpandedSCEVs );
7408
+ #ifdef EXPENSIVE_CHECKS
7409
+ assert (DT->verify (DominatorTree::VerificationLevel::Fast));
7410
+ #endif
7480
7411
7481
7412
// Only use noalias metadata when using memory checks guaranteeing no overlap
7482
7413
// across all iterations.
@@ -7557,6 +7488,18 @@ LoopVectorizationPlanner::executePlan(
7557
7488
7558
7489
ILV.printDebugTracesAtEnd ();
7559
7490
7491
+ // 4. Adjust branch weight of the branch in the middle block.
7492
+ auto *MiddleTerm =
7493
+ cast<BranchInst>(State.CFG .VPBB2IRBB [ExitVPBB]->getTerminator ());
7494
+ if (MiddleTerm->isConditional () &&
7495
+ hasBranchWeightMD (*OrigLoop->getLoopLatch ()->getTerminator ())) {
7496
+ // Assume that `Count % VectorTripCount` is equally distributed.
7497
+ unsigned TripCount = State.UF * State.VF .getKnownMinValue ();
7498
+ assert (TripCount > 0 && " trip count should not be zero" );
7499
+ const uint32_t Weights[] = {1 , TripCount - 1 };
7500
+ setBranchWeights (*MiddleTerm, Weights, /* IsExpected=*/ false );
7501
+ }
7502
+
7560
7503
return {State.ExpandedSCEVs , ReductionResumeValues};
7561
7504
}
7562
7505
@@ -7613,7 +7556,7 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7613
7556
// inductions in the epilogue loop are created before executing the plan for
7614
7557
// the epilogue loop.
7615
7558
7616
- return {completeLoopSkeleton () , nullptr };
7559
+ return {LoopVectorPreHeader , nullptr };
7617
7560
}
7618
7561
7619
7562
void EpilogueVectorizerMainLoop::printDebugTracesAtStart () {
@@ -7802,7 +7745,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
7802
7745
{VecEpilogueIterationCountCheck,
7803
7746
EPI.VectorTripCount } /* AdditionalBypass */ );
7804
7747
7805
- return {completeLoopSkeleton () , EPResumeVal};
7748
+ return {LoopVectorPreHeader , EPResumeVal};
7806
7749
}
7807
7750
7808
7751
BasicBlock *
@@ -7847,7 +7790,6 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7847
7790
setBranchWeights (BI, Weights, /* IsExpected=*/ false );
7848
7791
}
7849
7792
ReplaceInstWithInst (Insert->getTerminator (), &BI);
7850
-
7851
7793
LoopBypassBlocks.push_back (Insert);
7852
7794
return Insert;
7853
7795
}
@@ -8552,9 +8494,17 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8552
8494
// modified; a basic block for the vector pre-header, followed by a region for
8553
8495
// the vector loop, followed by the middle basic block. The skeleton vector
8554
8496
// loop region contains a header and a latch basic block.
8497
+
8498
+ bool RequiresScalarEpilogueCheck =
8499
+ LoopVectorizationPlanner::getDecisionAndClampRange (
8500
+ [this ](ElementCount VF) {
8501
+ return !CM.requiresScalarEpilogue (VF.isVector ());
8502
+ },
8503
+ Range);
8555
8504
VPlanPtr Plan = VPlan::createInitialVPlan (
8556
8505
createTripCountSCEV (Legal->getWidestInductionType (), PSE, OrigLoop),
8557
- *PSE.getSE (), OrigLoop->getLoopPreheader ());
8506
+ *PSE.getSE (), RequiresScalarEpilogueCheck, CM.foldTailByMasking (),
8507
+ OrigLoop);
8558
8508
VPBasicBlock *HeaderVPBB = new VPBasicBlock (" vector.body" );
8559
8509
VPBasicBlock *LatchVPBB = new VPBasicBlock (" vector.latch" );
8560
8510
VPBlockUtils::insertBlockAfter (LatchVPBB, HeaderVPBB);
@@ -8802,7 +8752,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8802
8752
// Create new empty VPlan
8803
8753
auto Plan = VPlan::createInitialVPlan (
8804
8754
createTripCountSCEV (Legal->getWidestInductionType (), PSE, OrigLoop),
8805
- *PSE.getSE (), OrigLoop-> getLoopPreheader () );
8755
+ *PSE.getSE (), true , false , OrigLoop);
8806
8756
8807
8757
// Build hierarchical CFG
8808
8758
VPlanHCFGBuilder HCFGBuilder (OrigLoop, LI, *Plan);
@@ -10163,6 +10113,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10163
10113
cast<VPHeaderPHIRecipe>(&R)->setStartValue (StartVal);
10164
10114
}
10165
10115
10116
+ assert (DT->verify (DominatorTree::VerificationLevel::Fast) &&
10117
+ " DT not preserved correctly" );
10166
10118
LVP.executePlan (EPI.EpilogueVF , EPI.EpilogueUF , BestEpiPlan, EpilogILV,
10167
10119
DT, true , &ExpandedSCEVs);
10168
10120
++LoopsEpilogueVectorized;
0 commit comments