@@ -2972,22 +2972,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2972
2972
SplitBlock (LoopMiddleBlock, LoopMiddleBlock->getTerminator (), DT, LI,
2973
2973
nullptr , Twine (Prefix) + " scalar.ph" );
2974
2974
2975
- auto *ScalarLatchTerm = OrigLoop->getLoopLatch ()->getTerminator ();
2976
-
2977
- // Set up the middle block terminator. Two cases:
2978
- // 1) If we know that we must execute the scalar epilogue, emit an
2979
- // unconditional branch.
2980
- // 2) Otherwise, we must have a single unique exit block (due to how we
2981
- // implement the multiple exit case). In this case, set up a conditional
2982
- // branch from the middle block to the loop scalar preheader, and the
2983
- // exit block. completeLoopSkeleton will update the condition to use an
2984
- // iteration check, if required to decide whether to execute the remainder.
2985
- BranchInst *BrInst =
2986
- Cost->requiresScalarEpilogue (VF.isVector ())
2987
- ? BranchInst::Create (LoopScalarPreHeader)
2988
- : BranchInst::Create (LoopExitBlock, LoopScalarPreHeader,
2989
- Builder.getTrue ());
2990
- BrInst->setDebugLoc (ScalarLatchTerm->getDebugLoc ());
2975
+ auto *BrInst = new UnreachableInst (LoopMiddleBlock->getContext ());
2991
2976
ReplaceInstWithInst (LoopMiddleBlock->getTerminator (), BrInst);
2992
2977
2993
2978
// Update dominator for loop exit. During skeleton creation, only the vector
@@ -3094,51 +3079,6 @@ void InnerLoopVectorizer::createInductionResumeValues(
3094
3079
}
3095
3080
}
3096
3081
3097
- BasicBlock *InnerLoopVectorizer::completeLoopSkeleton () {
3098
- // The trip counts should be cached by now.
3099
- Value *Count = getTripCount ();
3100
- Value *VectorTripCount = getOrCreateVectorTripCount (LoopVectorPreHeader);
3101
-
3102
- auto *ScalarLatchTerm = OrigLoop->getLoopLatch ()->getTerminator ();
3103
-
3104
- // Add a check in the middle block to see if we have completed
3105
- // all of the iterations in the first vector loop. Three cases:
3106
- // 1) If we require a scalar epilogue, there is no conditional branch as
3107
- // we unconditionally branch to the scalar preheader. Do nothing.
3108
- // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3109
- // Thus if tail is to be folded, we know we don't need to run the
3110
- // remainder and we can use the previous value for the condition (true).
3111
- // 3) Otherwise, construct a runtime check.
3112
- if (!Cost->requiresScalarEpilogue (VF.isVector ()) &&
3113
- !Cost->foldTailByMasking ()) {
3114
- // Here we use the same DebugLoc as the scalar loop latch terminator instead
3115
- // of the corresponding compare because they may have ended up with
3116
- // different line numbers and we want to avoid awkward line stepping while
3117
- // debugging. Eg. if the compare has got a line number inside the loop.
3118
- // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
3119
- // operands. Perform simplification directly on VPlan once the branch is
3120
- // modeled there.
3121
- IRBuilder<> B (LoopMiddleBlock->getTerminator ());
3122
- B.SetCurrentDebugLocation (ScalarLatchTerm->getDebugLoc ());
3123
- Value *CmpN = B.CreateICmpEQ (Count, VectorTripCount, " cmp.n" );
3124
- BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator ());
3125
- BI.setCondition (CmpN);
3126
- if (hasBranchWeightMD (*ScalarLatchTerm)) {
3127
- // Assume that `Count % VectorTripCount` is equally distributed.
3128
- unsigned TripCount = UF * VF.getKnownMinValue ();
3129
- assert (TripCount > 0 && " trip count should not be zero" );
3130
- const uint32_t Weights[] = {1 , TripCount - 1 };
3131
- setBranchWeights (BI, Weights);
3132
- }
3133
- }
3134
-
3135
- #ifdef EXPENSIVE_CHECKS
3136
- assert (DT->verify (DominatorTree::VerificationLevel::Fast));
3137
- #endif
3138
-
3139
- return LoopVectorPreHeader;
3140
- }
3141
-
3142
3082
std::pair<BasicBlock *, Value *>
3143
3083
InnerLoopVectorizer::createVectorizedLoopSkeleton (
3144
3084
const SCEV2ValueTy &ExpandedSCEVs) {
@@ -3198,7 +3138,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
3198
3138
// Emit phis for the new starting index of the scalar loop.
3199
3139
createInductionResumeValues (ExpandedSCEVs);
3200
3140
3201
- return {completeLoopSkeleton () , nullptr };
3141
+ return {LoopVectorPreHeader , nullptr };
3202
3142
}
3203
3143
3204
3144
// Fix up external users of the induction variable. At this point, we are
@@ -3470,6 +3410,18 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3470
3410
VF.getKnownMinValue () * UF);
3471
3411
}
3472
3412
3413
+ // Helper to reorder blocks so they match the original order even after the
3414
+ // order of the predecessors changes. This is only used to avoid a number of
3415
+ // test changes due to reordering of incoming blocks in phi nodes and should be
3416
+ // removed soon, with the tests being updated.
3417
+ static void reorderIncomingBlocks (SmallVectorImpl<BasicBlock *> &Blocks,
3418
+ BasicBlock *LoopMiddleBlock) {
3419
+ if (Blocks.front () == LoopMiddleBlock)
3420
+ std::swap (Blocks.front (), Blocks.back ());
3421
+ if (Blocks.size () == 3 )
3422
+ std::swap (Blocks[0 ], Blocks[1 ]);
3423
+ }
3424
+
3473
3425
void InnerLoopVectorizer::fixFixedOrderRecurrence (VPLiveOut *LO,
3474
3426
VPTransformState &State) {
3475
3427
// Extract the last vector element in the middle block. This will be the
@@ -3488,7 +3440,9 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence(VPLiveOut *LO,
3488
3440
Builder.SetInsertPoint (LoopScalarPreHeader, LoopScalarPreHeader->begin ());
3489
3441
auto *ScalarPreheaderPhi =
3490
3442
Builder.CreatePHI (ScalarHeaderPhi->getType (), 2 , " scalar.recur.init" );
3491
- for (auto *BB : predecessors (LoopScalarPreHeader)) {
3443
+ SmallVector<BasicBlock *> Blocks (predecessors (LoopScalarPreHeader));
3444
+ reorderIncomingBlocks (Blocks, LoopMiddleBlock);
3445
+ for (auto *BB : Blocks) {
3492
3446
auto *Incoming = BB == LoopMiddleBlock ? ResumeScalarFOR : InitScalarFOR;
3493
3447
ScalarPreheaderPhi->addIncoming (Incoming, BB);
3494
3448
}
@@ -7388,7 +7342,9 @@ static void createAndCollectMergePhiForReduction(
7388
7342
// If we are fixing reductions in the epilogue loop then we should already
7389
7343
// have created a bc.merge.rdx Phi after the main vector body. Ensure that
7390
7344
// we carry over the incoming values correctly.
7391
- for (auto *Incoming : predecessors (LoopScalarPreHeader)) {
7345
+ SmallVector<BasicBlock *> Blocks (predecessors (LoopScalarPreHeader));
7346
+ reorderIncomingBlocks (Blocks, LoopMiddleBlock);
7347
+ for (auto *Incoming : Blocks) {
7392
7348
if (Incoming == LoopMiddleBlock)
7393
7349
BCBlockPhi->addIncoming (FinalValue, Incoming);
7394
7350
else if (ResumePhi && is_contained (ResumePhi->blocks (), Incoming))
@@ -7459,6 +7415,21 @@ LoopVectorizationPlanner::executePlan(
7459
7415
std::tie (State.CFG .PrevBB , CanonicalIVStartValue) =
7460
7416
ILV.createVectorizedLoopSkeleton (ExpandedSCEVs ? *ExpandedSCEVs
7461
7417
: State.ExpandedSCEVs );
7418
+ #ifdef EXPENSIVE_CHECKS
7419
+ assert (DT->verify (DominatorTree::VerificationLevel::Fast));
7420
+ #endif
7421
+
7422
+ VPBasicBlock *MiddleVPBB =
7423
+ cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion ()->getSingleSuccessor ());
7424
+
7425
+ using namespace llvm ::VPlanPatternMatch;
7426
+ if (MiddleVPBB->begin () != MiddleVPBB->end () &&
7427
+ match (&MiddleVPBB->back (), m_BranchOnCond (m_VPValue ()))) {
7428
+ cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors ()[1 ])
7429
+ ->resetBlock (OrigLoop->getLoopPreheader ());
7430
+ } else
7431
+ cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors ()[0 ])
7432
+ ->resetBlock (OrigLoop->getLoopPreheader ());
7462
7433
7463
7434
// Only use noalias metadata when using memory checks guaranteeing no overlap
7464
7435
// across all iterations.
@@ -7539,6 +7510,18 @@ LoopVectorizationPlanner::executePlan(
7539
7510
7540
7511
ILV.printDebugTracesAtEnd ();
7541
7512
7513
+ // Adjust branch weight of the branch in the middle block.
7514
+ auto *MiddleTerm =
7515
+ cast<BranchInst>(State.CFG .VPBB2IRBB [ExitVPBB]->getTerminator ());
7516
+ if (MiddleTerm->isConditional () &&
7517
+ hasBranchWeightMD (*OrigLoop->getLoopLatch ()->getTerminator ())) {
7518
+ // Assume that `Count % VectorTripCount` is equally distributed.
7519
+ unsigned TripCount = State.UF * State.VF .getKnownMinValue ();
7520
+ assert (TripCount > 0 && " trip count should not be zero" );
7521
+ const uint32_t Weights[] = {1 , TripCount - 1 };
7522
+ setBranchWeights (*MiddleTerm, Weights);
7523
+ }
7524
+
7542
7525
return {State.ExpandedSCEVs , ReductionResumeValues};
7543
7526
}
7544
7527
@@ -7595,7 +7578,7 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7595
7578
// inductions in the epilogue loop are created before executing the plan for
7596
7579
// the epilogue loop.
7597
7580
7598
- return {completeLoopSkeleton () , nullptr };
7581
+ return {LoopVectorPreHeader , nullptr };
7599
7582
}
7600
7583
7601
7584
void EpilogueVectorizerMainLoop::printDebugTracesAtStart () {
@@ -7719,8 +7702,11 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
7719
7702
VecEpilogueIterationCountCheck,
7720
7703
VecEpilogueIterationCountCheck->getSinglePredecessor ());
7721
7704
7722
- DT->changeImmediateDominator (LoopScalarPreHeader,
7723
- EPI.EpilogueIterationCountCheck );
7705
+ if (auto *N = DT->getNode (LoopScalarPreHeader))
7706
+ DT->changeImmediateDominator (LoopScalarPreHeader,
7707
+ EPI.EpilogueIterationCountCheck );
7708
+ else
7709
+ DT->addNewBlock (LoopScalarPreHeader, EPI.EpilogueIterationCountCheck );
7724
7710
if (!Cost->requiresScalarEpilogue (EPI.EpilogueVF .isVector ()))
7725
7711
// If there is an epilogue which must run, there's no edge from the
7726
7712
// middle block to exit blocks and thus no need to update the immediate
@@ -7784,7 +7770,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
7784
7770
{VecEpilogueIterationCountCheck,
7785
7771
EPI.VectorTripCount } /* AdditionalBypass */ );
7786
7772
7787
- return {completeLoopSkeleton () , EPResumeVal};
7773
+ return {LoopVectorPreHeader , EPResumeVal};
7788
7774
}
7789
7775
7790
7776
BasicBlock *
@@ -8534,9 +8520,25 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8534
8520
// modified; a basic block for the vector pre-header, followed by a region for
8535
8521
// the vector loop, followed by the middle basic block. The skeleton vector
8536
8522
// loop region contains a header and latch basic blocks.
8523
+
8524
+ // Add a check in the middle block to see if we have completed
8525
+ // all of the iterations in the first vector loop. Three cases:
8526
+ // 1) If we require a scalar epilogue, there is no conditional branch as
8527
+ // we unconditionally branch to the scalar preheader. Do nothing.
8528
+ // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
8529
+ // Thus if tail is to be folded, we know we don't need to run the
8530
+ // remainder and we can use the previous value for the condition (true).
8531
+ // 3) Otherwise, construct a runtime check.
8532
+ bool RequiresScalarEpilogueCheck =
8533
+ LoopVectorizationPlanner::getDecisionAndClampRange (
8534
+ [this ](ElementCount VF) {
8535
+ return !CM.requiresScalarEpilogue (VF.isVector ());
8536
+ },
8537
+ Range);
8537
8538
VPlanPtr Plan = VPlan::createInitialVPlan (
8538
8539
createTripCountSCEV (Legal->getWidestInductionType (), PSE, OrigLoop),
8539
- *PSE.getSE (), OrigLoop->getLoopPreheader ());
8540
+ *PSE.getSE (), RequiresScalarEpilogueCheck, CM.foldTailByMasking (),
8541
+ OrigLoop);
8540
8542
VPBasicBlock *HeaderVPBB = new VPBasicBlock (" vector.body" );
8541
8543
VPBasicBlock *LatchVPBB = new VPBasicBlock (" vector.latch" );
8542
8544
VPBlockUtils::insertBlockAfter (LatchVPBB, HeaderVPBB);
@@ -8784,7 +8786,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8784
8786
// Create new empty VPlan
8785
8787
auto Plan = VPlan::createInitialVPlan (
8786
8788
createTripCountSCEV (Legal->getWidestInductionType (), PSE, OrigLoop),
8787
- *PSE.getSE (), OrigLoop-> getLoopPreheader () );
8789
+ *PSE.getSE (), true , false , OrigLoop);
8788
8790
8789
8791
// Build hierarchical CFG
8790
8792
VPlanHCFGBuilder HCFGBuilder (OrigLoop, LI, *Plan);
@@ -8993,6 +8995,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
8993
8995
}
8994
8996
}
8995
8997
Builder.setInsertPoint (&*LatchVPBB->begin ());
8998
+ VPBasicBlock *MiddleVPBB =
8999
+ cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor ());
9000
+ VPBasicBlock::iterator IP = MiddleVPBB->begin ();
8996
9001
for (VPRecipeBase &R :
8997
9002
Plan->getVectorLoopRegion ()->getEntryBasicBlock ()->phis ()) {
8998
9003
VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
@@ -9101,8 +9106,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
9101
9106
// also modeled in VPlan.
9102
9107
auto *FinalReductionResult = new VPInstruction (
9103
9108
VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9104
- cast<VPBasicBlock>(VectorLoopRegion-> getSingleSuccessor ())
9105
- -> appendRecipe (FinalReductionResult );
9109
+ FinalReductionResult-> insertBefore (*MiddleVPBB, IP);
9110
+ IP = std::next (FinalReductionResult-> getIterator () );
9106
9111
OrigExitingVPV->replaceUsesWithIf (
9107
9112
FinalReductionResult,
9108
9113
[](VPUser &User, unsigned ) { return isa<VPLiveOut>(&User); });
0 commit comments