From 2f441bb3ae998745e03326cc2e59ea7b54439ec4 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 23 Jul 2024 15:25:01 +0100 Subject: [PATCH 01/11] [VPlan] Introduce explicit ExtractFromEnd recipes for live-outs. Introduce explicit ExtractFromEnd recipes to extract the final values for live-outs instead of implicitly extracting in VPLiveOut::fixPhi. This is a follow-up to the recent changes of modeling extracts for recurrences and consolidates live-out extract creation for fixed-order recurrences at a single place: addLiveOutsForFirstOrderRecurrences. It is also in preparation of replacing VPLiveOut with VPIRInstructions wrapping the original scalar phis. --- .../Transforms/Vectorize/LoopVectorize.cpp | 130 ++++++++++++++++-- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 8 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 94 ------------- .../RISCV/vplan-vp-intrinsics-reduction.ll | 9 +- ...-order-recurrence-sink-replicate-region.ll | 3 +- .../instruction-only-used-outside-of-loop.ll | 20 ++- .../pr55167-fold-tail-live-out.ll | 2 +- .../LoopVectorize/select-cmp-multiuse.ll | 4 +- .../LoopVectorize/vplan-printing.ll | 11 +- 9 files changed, 144 insertions(+), 137 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 09ca859f52680..61b29295d07b5 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8452,7 +8452,104 @@ static void addUsersInExitBlock( return P && Inductions.contains(P); }))) continue; - Plan.addLiveOut(&ExitPhi, V); + + auto MiddleVPBB = + cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); + VPBuilder B(MiddleVPBB); + if (auto *Terminator = MiddleVPBB->getTerminator()) { + auto *Condition = dyn_cast(Terminator->getOperand(0)); + assert((!Condition || Condition->getParent() == MiddleVPBB) && + "Condition expected in MiddleVPBB"); + B.setInsertPoint(Condition ? 
Condition : Terminator); + } + + VPValue *Ext; + if (auto *FOR = dyn_cast_or_null( + V->getDefiningRecipe())) { + // This is the second phase of vectorizing first-order recurrences. An + // overview of the transformation is described below. Suppose we have the + // following loop with some use after the loop of the last a[i-1], + // + // for (int i = 0; i < n; ++i) { + // t = a[i - 1]; + // b[i] = a[i] - t; + // } + // use t; + // + // There is a first-order recurrence on "a". For this loop, the shorthand + // scalar IR looks like: + // + // scalar.ph: + // s_init = a[-1] + // br scalar.body + // + // scalar.body: + // i = phi [0, scalar.ph], [i+1, scalar.body] + // s1 = phi [s_init, scalar.ph], [s2, scalar.body] + // s2 = a[i] + // b[i] = s2 - s1 + // br cond, scalar.body, exit.block + // + // exit.block: + // use = lcssa.phi [s1, scalar.body] + // + // In this example, s1 is a recurrence because it's value depends on the + // previous iteration. In the first phase of vectorization, we created a + // vector phi v1 for s1. We now complete the vectorization and produce the + // shorthand vector IR shown below (for VF = 4, UF = 1). 
+ // + // vector.ph: + // v_init = vector(..., ..., ..., a[-1]) + // br vector.body + // + // vector.body + // i = phi [0, vector.ph], [i+4, vector.body] + // v1 = phi [v_init, vector.ph], [v2, vector.body] + // v2 = a[i, i+1, i+2, i+3]; + // v3 = vector(v1(3), v2(0, 1, 2)) + // b[i, i+1, i+2, i+3] = v2 - v3 + // br cond, vector.body, middle.block + // + // middle.block: + // s_penultimate = v2(2) = v3(3) + // s_resume = v2(3) + // br cond, scalar.ph, exit.block + // + // scalar.ph: + // s_init' = phi [s_resume, middle.block], [s_init, otherwise] + // br scalar.body + // + // scalar.body: + // i = phi [0, scalar.ph], [i+1, scalar.body] + // s1 = phi [s_init', scalar.ph], [s2, scalar.body] + // s2 = a[i] + // b[i] = s2 - s1 + // br cond, scalar.body, exit.block + // + // exit.block: + // lo = lcssa.phi [s1, scalar.body], [s.penultimate, middle.block] + // + // After execution completes the vector loop, we extract the next value of + // the recurrence (x) to use as the initial value in the scalar loop. This + // is modeled by ExtractFromEnd. + // + // Extract the penultimate value of the recurrence and update VPLiveOut + // users of the recurrence splice. Note that the extract of the final + // value used to resume in the scalar loop is created earlier during VPlan + // construction. + Ext = + B.createNaryOp(VPInstruction::ExtractFromEnd, + {FOR->getBackedgeValue(), + Plan.getOrAddLiveIn(ConstantInt::get( + IntegerType::get(ExitBB->getContext(), 32), 2))}, + {}, "vector.recur.extract.for.phi"); + } else { + Ext = B.createNaryOp( + VPInstruction::ExtractFromEnd, + {V, Plan.getOrAddLiveIn(ConstantInt::get( + IntegerType::get(ExitBB->getContext(), 32), 1))}); + } + Plan.addLiveOut(&ExitPhi, Ext); } } @@ -8660,6 +8757,14 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { // After here, VPBB should not be used. 
VPBB = nullptr; + assert(isa(Plan->getVectorLoopRegion()) && + !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && + "entry block must be set to a VPRegionBlock having a non-empty entry " + "VPBasicBlock"); + RecipeBuilder.fixHeaderPhis(); + + addLiveOutsForFirstOrderRecurrences(*Plan); + if (CM.requiresScalarEpilogue(Range)) { // No edge from the middle block to the unique exit block has been inserted // and there is nothing to fix from vector loop; phis should have incoming @@ -8668,13 +8773,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { addUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars()); - assert(isa(Plan->getVectorLoopRegion()) && - !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && - "entry block must be set to a VPRegionBlock having a non-empty entry " - "VPBasicBlock"); - RecipeBuilder.fixHeaderPhis(); - - addLiveOutsForFirstOrderRecurrences(*Plan); // --------------------------------------------------------------------------- // Transform initial VPlan: Apply previously taken decisions, in order, to @@ -8884,10 +8982,12 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( for (unsigned I = 0; I != Worklist.size(); ++I) { VPSingleDefRecipe *Cur = Worklist[I]; for (VPUser *U : Cur->users()) { - auto *UserRecipe = dyn_cast(U); - if (!UserRecipe) { - assert(isa(U) && - "U must either be a VPSingleDef or VPLiveOut"); + auto *UserRecipe = cast(U); + if (!UserRecipe->getParent()->getParent()) { + assert(cast(U) && + cast(U)->getOpcode() == + VPInstruction::ExtractFromEnd && + "U must be an ExtractFromEnd VPInstruction"); continue; } Worklist.insert(UserRecipe); @@ -9105,8 +9205,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL); FinalReductionResult->insertBefore(*MiddleVPBB, IP); OrigExitingVPV->replaceUsesWithIf( - FinalReductionResult, - [](VPUser &User, unsigned) { return isa(&User); }); 
+ FinalReductionResult, [](VPUser &User, unsigned) { + auto *R = dyn_cast(&User); + return R && R->getOpcode() == VPInstruction::ExtractFromEnd; + }); } VPlanTransforms::clearReductionWrapFlags(*Plan); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 2d6d67a55c17d..798d178e5a963 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -194,9 +194,6 @@ bool VPRecipeBase::mayHaveSideEffects() const { void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) { VPValue *ExitValue = getOperand(0); - auto Lane = vputils::isUniformAfterVectorization(ExitValue) - ? VPLane::getFirstLane() - : VPLane::getLastLaneForVF(State.VF); VPBasicBlock *MiddleVPBB = cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe(); @@ -207,10 +204,7 @@ void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) { ? MiddleVPBB : ExitingVPBB; BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB]; - // Set insertion point in PredBB in case an extract needs to be generated. - // TODO: Model extracts explicitly. 
- State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt()); - Value *V = State.get(ExitValue, VPIteration(State.UF - 1, Lane)); + Value *V = State.get(ExitValue, VPIteration(0, 0)); if (Phi->getBasicBlockIndex(PredBB) != -1) Phi->setIncomingValueForBlock(PredBB, V); else diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index c91fd0f118e31..967939c4854c6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -826,20 +826,6 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan, if (auto *FOR = dyn_cast(&R)) RecurrencePhis.push_back(FOR); - VPBasicBlock *MiddleVPBB = - cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); - VPBuilder MiddleBuilder; - // Set insert point so new recipes are inserted before terminator and - // condition, if there is either the former or both. - if (auto *Term = - dyn_cast_or_null(MiddleVPBB->getTerminator())) { - if (auto *Cmp = dyn_cast(Term->getOperand(0))) - MiddleBuilder.setInsertPoint(Cmp); - else - MiddleBuilder.setInsertPoint(Term); - } else - MiddleBuilder.setInsertPoint(MiddleVPBB); - for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) { SmallPtrSet SeenPhis; VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe(); @@ -872,86 +858,6 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan, // Set the first operand of RecurSplice to FOR again, after replacing // all users. RecurSplice->setOperand(0, FOR); - - // This is the second phase of vectorizing first-order recurrences. An - // overview of the transformation is described below. Suppose we have the - // following loop with some use after the loop of the last a[i-1], - // - // for (int i = 0; i < n; ++i) { - // t = a[i - 1]; - // b[i] = a[i] - t; - // } - // use t; - // - // There is a first-order recurrence on "a". 
For this loop, the shorthand - // scalar IR looks like: - // - // scalar.ph: - // s_init = a[-1] - // br scalar.body - // - // scalar.body: - // i = phi [0, scalar.ph], [i+1, scalar.body] - // s1 = phi [s_init, scalar.ph], [s2, scalar.body] - // s2 = a[i] - // b[i] = s2 - s1 - // br cond, scalar.body, exit.block - // - // exit.block: - // use = lcssa.phi [s1, scalar.body] - // - // In this example, s1 is a recurrence because it's value depends on the - // previous iteration. In the first phase of vectorization, we created a - // vector phi v1 for s1. We now complete the vectorization and produce the - // shorthand vector IR shown below (for VF = 4, UF = 1). - // - // vector.ph: - // v_init = vector(..., ..., ..., a[-1]) - // br vector.body - // - // vector.body - // i = phi [0, vector.ph], [i+4, vector.body] - // v1 = phi [v_init, vector.ph], [v2, vector.body] - // v2 = a[i, i+1, i+2, i+3]; - // v3 = vector(v1(3), v2(0, 1, 2)) - // b[i, i+1, i+2, i+3] = v2 - v3 - // br cond, vector.body, middle.block - // - // middle.block: - // s_penultimate = v2(2) = v3(3) - // s_resume = v2(3) - // br cond, scalar.ph, exit.block - // - // scalar.ph: - // s_init' = phi [s_resume, middle.block], [s_init, otherwise] - // br scalar.body - // - // scalar.body: - // i = phi [0, scalar.ph], [i+1, scalar.body] - // s1 = phi [s_init', scalar.ph], [s2, scalar.body] - // s2 = a[i] - // b[i] = s2 - s1 - // br cond, scalar.body, exit.block - // - // exit.block: - // lo = lcssa.phi [s1, scalar.body], [s.penultimate, middle.block] - // - // After execution completes the vector loop, we extract the next value of - // the recurrence (x) to use as the initial value in the scalar loop. This - // is modeled by ExtractFromEnd. - Type *IntTy = Plan.getCanonicalIV()->getScalarType(); - - // Extract the penultimate value of the recurrence and update VPLiveOut - // users of the recurrence splice. 
Note that the extract of the final value - // used to resume in the scalar loop is created earlier during VPlan - // construction. - auto *Penultimate = cast(MiddleBuilder.createNaryOp( - VPInstruction::ExtractFromEnd, - {FOR->getBackedgeValue(), - Plan.getOrAddLiveIn(ConstantInt::get(IntTy, 2))}, - {}, "vector.recur.extract.for.phi")); - RecurSplice->replaceUsesWithIf( - Penultimate, [](VPUser &U, unsigned) { return isa(&U); }); } return true; } diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index 16db6cf828af8..f14ffe854a3a6 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -55,6 +55,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-EMPTY: ; IF-EVL-INLOOP-NEXT: middle.block: ; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]> +; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1> ; IF-EVL-INLOOP-NEXT: EMIT branch-on-cond ir ; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb, scalar.ph ; IF-EVL-INLOOP-EMPTY: @@ -64,7 +65,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: scalar.ph: ; IF-EVL-INLOOP-NEXT: No successors ; IF-EVL-INLOOP-EMPTY: -; IF-EVL-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]> +; IF-EVL-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]> ; IF-EVL-INLOOP-NEXT: } ; @@ -93,6 +94,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-OUTLOOP-EMPTY: ; NO-VP-OUTLOOP-NEXT: middle.block: ; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]> +; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1> ; NO-VP-OUTLOOP-NEXT: EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]> ; NO-VP-OUTLOOP-NEXT: EMIT branch-on-cond 
vp<[[BOC]]> ; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb, scalar.ph @@ -103,7 +105,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-OUTLOOP-NEXT: scalar.ph: ; NO-VP-OUTLOOP-NEXT: No successors ; NO-VP-OUTLOOP-EMPTY: -; NO-VP-OUTLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]> +; NO-VP-OUTLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]> ; NO-VP-OUTLOOP-NEXT: } ; @@ -132,6 +134,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-INLOOP-EMPTY: ; NO-VP-INLOOP-NEXT: middle.block: ; NO-VP-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]> +; NO-VP-INLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1> ; NO-VP-INLOOP-NEXT: EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]> ; NO-VP-INLOOP-NEXT: EMIT branch-on-cond vp<[[BOC]]> ; NO-VP-INLOOP-NEXT: Successor(s): ir-bb, scalar.ph @@ -142,7 +145,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-INLOOP-NEXT: scalar.ph: ; NO-VP-INLOOP-NEXT: No successors ; NO-VP-INLOOP-EMPTY: -; NO-VP-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]> +; NO-VP-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]> ; NO-VP-INLOOP-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll index 06fbeafba31c0..9e49cf6b42c6b 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -220,6 +220,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-NEXT: middle.block: ; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%and.red>, vp<[[SEL]]> ; CHECK-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-from-end ir<%recur.next>, ir<1> +; CHECK-NEXT: EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1> ; CHECK-NEXT: EMIT 
branch-on-cond ir ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph ; CHECK-EMPTY: @@ -230,8 +231,8 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0> ; CHECK-NEXT: No successors ; CHECK-EMPTY: -; CHECK-NEXT: Live-out i32 %res = vp<[[RED_RES]]> ; CHECK-NEXT: Live-out i32 %recur = vp<[[RESUME_1_P]]> +; CHECK-NEXT: Live-out i32 %res = vp<[[RED_EX]]> ; CHECK-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll index 5f5cd78dc2d30..fcc6b7376d408 100644 --- a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll @@ -34,7 +34,7 @@ define i32 @one_direct_branch(ptr %src) { ; CHECK-NEXT: [[PHI_XOR:%.*]] = phi i32 [ [[XOR]], [[LOOP]] ] ; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 ; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[PHI_XOR]], [[LOOP_LATCH]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[XOR_LCSSA]] @@ -205,16 +205,14 @@ define i32 @optimizable_trunc_used_outside() { ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32 -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], 3 -; CHECK-NEXT: [[INDEX_NEXT]] = 
add nuw i64 [[OFFSET_IDX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3 ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -226,7 +224,7 @@ define i32 @optimizable_trunc_used_outside() { ; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 ; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: exit: -; CHECK-NEXT: [[IV_TRUNC_LCSSA:%.*]] = phi i32 [ [[IV_TRUNC]], [[LOOP]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[IV_TRUNC_LCSSA:%.*]] = phi i32 [ [[IV_TRUNC]], [[LOOP]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[IV_TRUNC_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll index 72f8cf22cafa7..b79525bc3e440 100644 --- a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll +++ b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll @@ -34,8 +34,8 @@ define i32 @test(i32 %a, i1 %c.1, i1 %c.2 ) #0 { ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 176 ; CHECK-NEXT: br i1 [[TMP8]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[PREDPHI5]], i32 1 ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PREDPHI7]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[PREDPHI5]], i32 1 ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 182, [[MIDDLE_BLOCK]] ], [ 6, [[BB:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll b/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll index 9eb90099214e1..b88e597e6bc8e 100644 --- a/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll +++ b/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll @@ -916,13 +916,13 @@ define i32 @multi_user_cmp_branch_use_and_outside_bb_use(ptr readonly %a, i64 no ; CHECK-VF4-IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4-IC1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-VF4-IC1: middle.block: -; CHECK-VF4-IC1-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 ; CHECK-VF4-IC1-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) ; CHECK-VF4-IC1-NEXT: [[TMP10:%.*]] = freeze i1 [[TMP9]] ; CHECK-VF4-IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP10]], i1 false, i1 true ; CHECK-VF4-IC1-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) ; CHECK-VF4-IC1-NEXT: [[TMP12:%.*]] = freeze i1 [[TMP11]] ; CHECK-VF4-IC1-NEXT: [[RDX_SELECT2:%.*]] = select i1 [[TMP12]], i1 true, i1 false +; CHECK-VF4-IC1-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 ; CHECK-VF4-IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-VF4-IC1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-VF4-IC1: scalar.ph: @@ -986,7 +986,6 @@ define i32 
@multi_user_cmp_branch_use_and_outside_bb_use(ptr readonly %a, i64 no ; CHECK-VF4-IC2-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4-IC2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-VF4-IC2: middle.block: -; CHECK-VF4-IC2-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3 ; CHECK-VF4-IC2-NEXT: [[BIN_RDX:%.*]] = or <4 x i1> [[TMP13]], [[TMP12]] ; CHECK-VF4-IC2-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX]]) ; CHECK-VF4-IC2-NEXT: [[TMP17:%.*]] = freeze i1 [[TMP16]] @@ -995,6 +994,7 @@ define i32 @multi_user_cmp_branch_use_and_outside_bb_use(ptr readonly %a, i64 no ; CHECK-VF4-IC2-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX5]]) ; CHECK-VF4-IC2-NEXT: [[TMP19:%.*]] = freeze i1 [[TMP18]] ; CHECK-VF4-IC2-NEXT: [[RDX_SELECT6:%.*]] = select i1 [[TMP19]], i1 true, i1 false +; CHECK-VF4-IC2-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3 ; CHECK-VF4-IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-VF4-IC2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-VF4-IC2: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 3a664de748d2d..f18ed825a6b88 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -154,6 +154,7 @@ define float @print_reduction(i64 %n, ptr noalias %y) { ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: ; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%red>, ir<%red.next> +; CHECK-NEXT: EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1> ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VEC_TC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph @@ -164,7 +165,7 @@ define float @print_reduction(i64 %n, ptr noalias %y) 
{ ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: No successors ; CHECK-EMPTY: -; CHECK-NEXT: Live-out float %red.next.lcssa = vp<[[RED_RES]]> +; CHECK-NEXT: Live-out float %red.next.lcssa = vp<[[RED_EX]]> ; CHECK-NEXT: } ; entry: @@ -435,6 +436,7 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) { ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: ; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%sum.07>, ir<[[MULADD]]> +; CHECK-NEXT: EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1> ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VEC_TC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph @@ -445,7 +447,7 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: No successors ; CHECK-EMPTY: -; CHECK-NEXT: Live-out float %muladd.lcssa = vp<[[RED_RES]]> +; CHECK-NEXT: Live-out float %muladd.lcssa = vp<[[RED_EX]]> ; CHECK-NEXT:} entry: @@ -654,6 +656,7 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) { ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<[[EXIT:%.+]]> = extract-from-end ir<%add>, ir<1> ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1000>, vp<[[VEC_TC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph @@ -664,7 +667,7 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) { ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: No successors ; CHECK-EMPTY: -; CHECK-NEXT: Live-out i32 %lcssa = ir<%add> +; CHECK-NEXT: Live-out i32 %lcssa = vp<[[EXIT]]> ; CHECK-NEXT: } ; entry: @@ -1036,8 +1039,8 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) { ; CHECK-NEXT: EMIT vp<[[RESUME_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<22> ; CHECK-NEXT: No successors ; CHECK-EMPTY: -; CHECK-NEXT: Live-out i16 %for.1.lcssa = vp<[[FOR_RESULT]]> ; CHECK-NEXT: Live-out i16 %for.1 = vp<[[RESUME_P]]> +; CHECK-NEXT: Live-out i16 %for.1.lcssa = 
vp<[[FOR_RESULT]]> ; CHECK-NEXT: } ; entry: From 540b9c5de195002470389e47dcfbe01c5e22a540 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 26 Jul 2024 12:25:29 +0100 Subject: [PATCH 02/11] !fixup fix formatting --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f3cc705962357..89ea7bc563673 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8773,7 +8773,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { addUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars()); - // --------------------------------------------------------------------------- // Transform initial VPlan: Apply previously taken decisions, in order, to // bring the VPlan to its final state. From 92e99e5d6e1494020792f66f994a50e82b7a8063 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 5 Aug 2024 12:08:32 +0100 Subject: [PATCH 03/11] !fixup address latest comments, thanks! 
--- .../Transforms/Vectorize/LoopVectorize.cpp | 242 ++++++++++-------- .../Transforms/Vectorize/VPlanPatternMatch.h | 5 + .../first-order-recurrence-chains.ll | 26 +- .../first-order-recurrence-complex.ll | 4 +- 4 files changed, 152 insertions(+), 125 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 1b632116067e1..7c4df5d596b65 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8459,6 +8459,16 @@ static void addUsersInExitBlock( if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) return; + auto MiddleVPBB = + cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); + VPBuilder B(MiddleVPBB); + if (auto *Terminator = MiddleVPBB->getTerminator()) { + auto *Condition = dyn_cast(Terminator->getOperand(0)); + assert((!Condition || Condition->getParent() == MiddleVPBB) && + "Condition expected in MiddleVPBB"); + B.setInsertPoint(Condition ? Condition : Terminator); + } + // Introduce VPUsers modeling the exit values. for (PHINode &ExitPhi : ExitBB->phis()) { Value *IncomingValue = @@ -8470,7 +8480,8 @@ static void addUsersInExitBlock( // live-outs. if ((isa(V) && !cast(V)->getTruncInst()) || - isa(V) || + isa( + V) || (isa(IncomingValue) && any_of(IncomingValue->users(), [&Inductions](User *U) { auto *P = dyn_cast(U); @@ -8478,102 +8489,10 @@ static void addUsersInExitBlock( }))) continue; - auto MiddleVPBB = - cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); - VPBuilder B(MiddleVPBB); - if (auto *Terminator = MiddleVPBB->getTerminator()) { - auto *Condition = dyn_cast(Terminator->getOperand(0)); - assert((!Condition || Condition->getParent() == MiddleVPBB) && - "Condition expected in MiddleVPBB"); - B.setInsertPoint(Condition ? Condition : Terminator); - } - - VPValue *Ext; - if (auto *FOR = dyn_cast_or_null( - V->getDefiningRecipe())) { - // This is the second phase of vectorizing first-order recurrences. 
An - // overview of the transformation is described below. Suppose we have the - // following loop with some use after the loop of the last a[i-1], - // - // for (int i = 0; i < n; ++i) { - // t = a[i - 1]; - // b[i] = a[i] - t; - // } - // use t; - // - // There is a first-order recurrence on "a". For this loop, the shorthand - // scalar IR looks like: - // - // scalar.ph: - // s_init = a[-1] - // br scalar.body - // - // scalar.body: - // i = phi [0, scalar.ph], [i+1, scalar.body] - // s1 = phi [s_init, scalar.ph], [s2, scalar.body] - // s2 = a[i] - // b[i] = s2 - s1 - // br cond, scalar.body, exit.block - // - // exit.block: - // use = lcssa.phi [s1, scalar.body] - // - // In this example, s1 is a recurrence because it's value depends on the - // previous iteration. In the first phase of vectorization, we created a - // vector phi v1 for s1. We now complete the vectorization and produce the - // shorthand vector IR shown below (for VF = 4, UF = 1). - // - // vector.ph: - // v_init = vector(..., ..., ..., a[-1]) - // br vector.body - // - // vector.body - // i = phi [0, vector.ph], [i+4, vector.body] - // v1 = phi [v_init, vector.ph], [v2, vector.body] - // v2 = a[i, i+1, i+2, i+3]; - // v3 = vector(v1(3), v2(0, 1, 2)) - // b[i, i+1, i+2, i+3] = v2 - v3 - // br cond, vector.body, middle.block - // - // middle.block: - // s_penultimate = v2(2) = v3(3) - // s_resume = v2(3) - // br cond, scalar.ph, exit.block - // - // scalar.ph: - // s_init' = phi [s_resume, middle.block], [s_init, otherwise] - // br scalar.body - // - // scalar.body: - // i = phi [0, scalar.ph], [i+1, scalar.body] - // s1 = phi [s_init', scalar.ph], [s2, scalar.body] - // s2 = a[i] - // b[i] = s2 - s1 - // br cond, scalar.body, exit.block - // - // exit.block: - // lo = lcssa.phi [s1, scalar.body], [s.penultimate, middle.block] - // - // After execution completes the vector loop, we extract the next value of - // the recurrence (x) to use as the initial value in the scalar loop. 
This - // is modeled by ExtractFromEnd. - // - // Extract the penultimate value of the recurrence and update VPLiveOut - // users of the recurrence splice. Note that the extract of the final - // value used to resume in the scalar loop is created earlier during VPlan - // construction. - Ext = - B.createNaryOp(VPInstruction::ExtractFromEnd, - {FOR->getBackedgeValue(), - Plan.getOrAddLiveIn(ConstantInt::get( - IntegerType::get(ExitBB->getContext(), 32), 2))}, - {}, "vector.recur.extract.for.phi"); - } else { - Ext = B.createNaryOp( - VPInstruction::ExtractFromEnd, - {V, Plan.getOrAddLiveIn(ConstantInt::get( - IntegerType::get(ExitBB->getContext(), 32), 1))}); - } + VPValue *Ext = B.createNaryOp( + VPInstruction::ExtractFromEnd, + {V, Plan.getOrAddLiveIn(ConstantInt::get( + IntegerType::get(ExitBB->getContext(), 32), 1))}); Plan.addLiveOut(&ExitPhi, Ext); } } @@ -8582,7 +8501,8 @@ static void addUsersInExitBlock( /// if middle block branches to scalar preheader, by introducing ExtractFromEnd /// and ResumePhi recipes in each, respectively, and a VPLiveOut which uses the /// latter and corresponds to the scalar header. -static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) { +static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan, Loop *OrigLoop, + bool RequiresScalarEpilogue) { VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); // Start by finding out if middle block branches to scalar preheader, which is @@ -8614,12 +8534,82 @@ static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) { } VPValue *OneVPV = Plan.getOrAddLiveIn( ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1)); + VPValue *TwoVPV = Plan.getOrAddLiveIn( + ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 2)); for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) { auto *FOR = dyn_cast(&HeaderPhi); if (!FOR) continue; + // This is the second phase of vectorizing first-order recurrences. An + // overview of the transformation is described below. 
Suppose we have the + // following loop with some use after the loop of the last a[i-1], + // + // for (int i = 0; i < n; ++i) { + // t = a[i - 1]; + // b[i] = a[i] - t; + // } + // use t; + // + // There is a first-order recurrence on "a". For this loop, the shorthand + // scalar IR looks like: + // + // scalar.ph: + // s_init = a[-1] + // br scalar.body + // + // scalar.body: + // i = phi [0, scalar.ph], [i+1, scalar.body] + // s1 = phi [s_init, scalar.ph], [s2, scalar.body] + // s2 = a[i] + // b[i] = s2 - s1 + // br cond, scalar.body, exit.block + // + // exit.block: + // use = lcssa.phi [s1, scalar.body] + // + // In this example, s1 is a recurrence because it's value depends on the + // previous iteration. In the first phase of vectorization, we created a + // vector phi v1 for s1. We now complete the vectorization and produce the + // shorthand vector IR shown below (for VF = 4, UF = 1). + // + // vector.ph: + // v_init = vector(..., ..., ..., a[-1]) + // br vector.body + // + // vector.body + // i = phi [0, vector.ph], [i+4, vector.body] + // v1 = phi [v_init, vector.ph], [v2, vector.body] + // v2 = a[i, i+1, i+2, i+3]; + // v3 = vector(v1(3), v2(0, 1, 2)) + // b[i, i+1, i+2, i+3] = v2 - v3 + // br cond, vector.body, middle.block + // + // middle.block: + // s_penultimate = v2(2) = v3(3) + // s_resume = v2(3) + // br cond, scalar.ph, exit.block + // + // scalar.ph: + // s_init' = phi [s_resume, middle.block], [s_init, otherwise] + // br scalar.body + // + // scalar.body: + // i = phi [0, scalar.ph], [i+1, scalar.body] + // s1 = phi [s_init', scalar.ph], [s2, scalar.body] + // s2 = a[i] + // b[i] = s2 - s1 + // br cond, scalar.body, exit.block + // + // exit.block: + // lo = lcssa.phi [s1, scalar.body], [s.penultimate, middle.block] + // + // After execution completes the vector loop, we extract the next value of + // the recurrence (x) to use as the initial value in the scalar loop. This + // is modeled by ExtractFromEnd. 
+ // + // Extract the resume value and create a new VPLiveOut for it. auto *Resume = MiddleBuilder.createNaryOp(VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), OneVPV}, @@ -8627,7 +8617,38 @@ static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) { auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp( VPInstruction::ResumePhi, {Resume, FOR->getStartValue()}, {}, "scalar.recur.init"); - Plan.addLiveOut(cast(FOR->getUnderlyingInstr()), ResumePhiRecipe); + auto *FORPhi = cast(FOR->getUnderlyingInstr()); + Plan.addLiveOut(FORPhi, ResumePhiRecipe); + + // Now create VPLiveOuts for users in the exit block. + // Extract the penultimate value of the recurrence and add VPLiveOut + // users of the recurrence splice. Note that the extract of the final + // value used to resume in the scalar loop is created earlier during VPlan + // construction. + + // No edge from the middle block to the unique exit block has been inserted + // and there is nothing to fix from vector loop; phis should have incoming + // from scalar loop only. + if (RequiresScalarEpilogue) + continue; + BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); + BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); + // Only handle single-exit loops with unique exit blocks for now. 
+ if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) + continue; + + for (User *U : FORPhi->users()) { + auto *UI = cast(U); + if (UI->getParent() != ExitBB) { + assert(OrigLoop->contains(UI->getParent()) && + "FOR used outside loop and exit block"); + continue; + } + VPValue *Ext = MiddleBuilder.createNaryOp( + VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {}, + "vector.recur.extract.for.phi"); + Plan.addLiveOut(cast(UI), Ext); + } } } @@ -8788,9 +8809,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { "VPBasicBlock"); RecipeBuilder.fixHeaderPhis(); - addLiveOutsForFirstOrderRecurrences(*Plan); + bool RequiresScalarEpilogue = CM.requiresScalarEpilogue(Range); + addLiveOutsForFirstOrderRecurrences(*Plan, OrigLoop, RequiresScalarEpilogue); - if (CM.requiresScalarEpilogue(Range)) { + if (RequiresScalarEpilogue) { // No edge from the middle block to the unique exit block has been inserted // and there is nothing to fix from vector loop; phis should have incoming // from scalar loop only. @@ -8949,6 +8971,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { // iteration. The final value is selected by the final ComputeReductionResult. 
void LoopVectorizationPlanner::adjustRecipesForReductions( VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { + using namespace VPlanPatternMatch; VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock(); // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores @@ -9007,10 +9030,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( VPSingleDefRecipe *Cur = Worklist[I]; for (VPUser *U : Cur->users()) { auto *UserRecipe = cast(U); - if (!UserRecipe->getParent()->getParent()) { - assert(cast(U) && - cast(U)->getOpcode() == - VPInstruction::ExtractFromEnd && + if (!UserRecipe->getParent()->getEnclosingLoopRegion()) { + assert(match(U, m_Binary( + m_VPValue(), m_VPValue())) && "U must be an ExtractFromEnd VPInstruction"); continue; } @@ -9228,11 +9250,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( auto *FinalReductionResult = new VPInstruction( VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL); FinalReductionResult->insertBefore(*MiddleVPBB, IP); - OrigExitingVPV->replaceUsesWithIf( - FinalReductionResult, [](VPUser &User, unsigned) { - auto *R = dyn_cast(&User); - return R && R->getOpcode() == VPInstruction::ExtractFromEnd; - }); + OrigExitingVPV->replaceUsesWithIf(FinalReductionResult, [](VPUser &User, + unsigned) { + return match(&User, m_Binary(m_VPValue(), + m_VPValue())); + }); } VPlanTransforms::clearReductionWrapFlags(*Plan); diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 9cd7712624bac..5f86f2c969651 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -29,6 +29,11 @@ template bool match(Val *V, const Pattern &P) { return const_cast(P).match(V); } +template bool match(VPUser *U, const Pattern &P) { + auto *R = dyn_cast(U); + return R && match(R, P); +} + template struct 
class_match { template bool match(ITy *V) { return isa(V); } }; diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll index 94f35ad453670..1e34e1d0d517d 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll @@ -19,8 +19,8 @@ define i16 @test_chained_first_order_recurrences_1(ptr %ptr) { ; CHECK-NEXT: br i1 [[TMP8]], label %middle.block, label %vector.body ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 ; entry: @@ -62,8 +62,8 @@ define i16 @test_chained_first_order_recurrences_2(ptr %ptr) { ; CHECK-NEXT: br i1 [[TMP8]], label %middle.block, label %vector.body, !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 ; entry: @@ -108,10 +108,10 @@ define i16 @test_chained_first_order_recurrences_3(ptr %ptr) { ; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 
x i16> [[WIDE_LOAD]], i32 3 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 ; entry: @@ -220,10 +220,10 @@ define i16 @test_chained_first_order_recurrences_3_reordered_1(ptr %ptr) { ; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 ; entry: @@ -271,10 +271,10 @@ define i16 @test_chained_first_order_recurrences_3_reordered_2(ptr %ptr) { ; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = 
extractelement <4 x i16> [[TMP5]], i32 3 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 ; entry: @@ -322,10 +322,10 @@ define i16 @test_chained_first_order_recurrences_3_for2_no_other_uses(ptr %ptr) ; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 ; entry: @@ -372,10 +372,10 @@ define i16 @test_chained_first_order_recurrences_3_for1_for2_no_other_uses(ptr % ; CHECK-NEXT: br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 -; CHECK-NEXT: 
[[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 ; entry: @@ -421,8 +421,8 @@ define double @test_chained_first_order_recurrence_sink_users_1(ptr %ptr) { ; CHECK-NEXT: br i1 [[TMP9]], label %middle.block, label %vector.body, !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x double> [[TMP4]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x double> [[TMP4]], i32 3 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x double> [[TMP4]], i32 2 ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll index 371c58e8eb9cc..3a846dd4072d3 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll @@ -1125,9 +1125,9 @@ define void @test_for_sink_instruction_after_same_incoming_2(ptr %ptr) { ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3 ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ] +; CHECK-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ] ; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 2.000000e+01, [[ENTRY]] ] -; CHECK-NEXT: [[SCALAR_RECUR_INIT3:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ 1.000000e+01, [[ENTRY]] ] +; CHECK-NEXT: [[SCALAR_RECUR_INIT3:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ 1.000000e+01, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ] From 72a0c28dabc7008e703162c1ee431c1db64ee07a Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 12 Aug 2024 15:21:41 +0100 Subject: [PATCH 04/11] !fixup address latest comments, thanks! --- .../Transforms/Vectorize/LoopVectorize.cpp | 64 +++++++++---------- 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 96f3de6befd42..73a019f7b56dc 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8532,18 +8532,20 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, } // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the -// original exit block. +// original exit block fed by a reduction or VPValue that's not a +// VPWidenIntOrFpInductionRecipe or VPFirstOrderRecurrencePHIRecipe. static void addUsersInExitBlock( Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan, const MapVector &Inductions) { - BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); - BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); - // Only handle single-exit loops with unique exit blocks for now. 
- if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) - return; - auto MiddleVPBB = cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); + + // No edge from the middle block to the unique exit block has been inserted + // and there is nothing to fix from vector loop; phis should have incoming + // from scalar loop only. + if (MiddleVPBB->getNumSuccessors() != 2) + return; + // TODO: set B to MiddleVPBB->getFirstNonPhi(), taking care of affected tests. VPBuilder B(MiddleVPBB); if (auto *Terminator = MiddleVPBB->getTerminator()) { auto *Condition = dyn_cast(Terminator->getOperand(0)); @@ -8553,12 +8555,17 @@ static void addUsersInExitBlock( } // Introduce VPUsers modeling the exit values. + BasicBlock *ExitBB = + cast(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock(); + BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); for (PHINode &ExitPhi : ExitBB->phis()) { Value *IncomingValue = ExitPhi.getIncomingValueForBlock(ExitingBB); VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan); // Exit values for inductions are computed and updated outside of VPlan and // independent of induction recipes. + // Exit values for first-order recurrences are added separately in + // addLiveOutsForFirstOrderRecurrences. // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update // live-outs. if ((isa(V) && @@ -8580,12 +8587,15 @@ static void addUsersInExitBlock( } } -/// Feed a resume value for every FOR from the vector loop to the scalar loop, -/// if middle block branches to scalar preheader, by introducing ExtractFromEnd -/// and ResumePhi recipes in each, respectively, and a VPLiveOut which uses the -/// latter and corresponds to the scalar header. -static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan, Loop *OrigLoop, - bool RequiresScalarEpilogue) { +/// Handle live-outs for first-order recurrences, both in the scalar preheader +/// and the original exit block: +/// 1. 
Feed a resume value for every FOR from the vector loop to the scalar +/// loop, if middle block branches to scalar preheader, by introducing +/// ExtractFromEnd and ResumePhi recipes in each, respectively, and a +/// VPLiveOut which uses the latter and corresponds to the scalar header. +/// 2. Feed the penultimate value of recurrences to their LCSSA phi users in +/// the original exit block using a VPLiveOut. +static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) { VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); // Start by finding out if middle block branches to scalar preheader, which is @@ -8712,21 +8722,14 @@ static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan, Loop *OrigLoop, // No edge from the middle block to the unique exit block has been inserted // and there is nothing to fix from vector loop; phis should have incoming // from scalar loop only. - if (RequiresScalarEpilogue) - continue; - BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); - BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); - // Only handle single-exit loops with unique exit blocks for now. 
- if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) + if (MiddleVPBB->getNumSuccessors() != 2) continue; - + BasicBlock *ExitBB = + cast(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock(); for (User *U : FORPhi->users()) { auto *UI = cast(U); - if (UI->getParent() != ExitBB) { - assert(OrigLoop->contains(UI->getParent()) && - "FOR used outside loop and exit block"); + if (UI->getParent() != ExitBB) continue; - } VPValue *Ext = MiddleBuilder.createNaryOp( VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {}, "vector.recur.extract.for.phi"); @@ -8892,16 +8895,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { "VPBasicBlock"); RecipeBuilder.fixHeaderPhis(); - bool RequiresScalarEpilogue = CM.requiresScalarEpilogue(Range); - addLiveOutsForFirstOrderRecurrences(*Plan, OrigLoop, RequiresScalarEpilogue); - - if (RequiresScalarEpilogue) { - // No edge from the middle block to the unique exit block has been inserted - // and there is nothing to fix from vector loop; phis should have incoming - // from scalar loop only. - } else - addUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan, - Legal->getInductionVars()); + addLiveOutsForFirstOrderRecurrences(*Plan); + addUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan, + Legal->getInductionVars()); // --------------------------------------------------------------------------- // Transform initial VPlan: Apply previously taken decisions, in order, to From 3520ce696a8f6ea6c95cff5c1c86098a25e41331 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 12 Aug 2024 15:58:56 +0100 Subject: [PATCH 05/11] !fixup address remaining comments, thanks! 
--- .../Transforms/Vectorize/LoopVectorize.cpp | 37 ++++++++----------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 8312ff0e89410..4f01e2b49c871 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8628,6 +8628,7 @@ static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) { VPBuilder MiddleBuilder(MiddleVPBB); // Reset insert point so new recipes are inserted before terminator and // condition, if there is either the former or both. + // TODO: set MiddleBuilder to MiddleVPBB->getFirstNonPhi(). if (auto *Terminator = MiddleVPBB->getTerminator()) { auto *Condition = dyn_cast(Terminator->getOperand(0)); assert((!Condition || Condition->getParent() == MiddleVPBB) && @@ -8644,9 +8645,10 @@ static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) { if (!FOR) continue; - // This is the second phase of vectorizing first-order recurrences. An - // overview of the transformation is described below. Suppose we have the - // following loop with some use after the loop of the last a[i-1], + // This is the second phase of vectorizing first-order recurrences, creating + // extract for users outside the loop. An overview of the transformation is + // described below. 
Suppose we have the following loop with some use after + // the loop of the last a[i-1], // // for (int i = 0; i < n; ++i) { // t = a[i - 1]; @@ -8658,12 +8660,12 @@ static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) { // scalar IR looks like: // // scalar.ph: - // s_init = a[-1] + // s.init = a[-1] // br scalar.body // // scalar.body: // i = phi [0, scalar.ph], [i+1, scalar.body] - // s1 = phi [s_init, scalar.ph], [s2, scalar.body] + // s1 = phi [s.init, scalar.ph], [s2, scalar.body] // s2 = a[i] // b[i] = s2 - s1 // br cond, scalar.body, exit.block @@ -8673,8 +8675,8 @@ static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) { // // In this example, s1 is a recurrence because it's value depends on the // previous iteration. In the first phase of vectorization, we created a - // vector phi v1 for s1. We now complete the vectorization and produce the - // shorthand vector IR shown below (for VF = 4, UF = 1). + // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts + // for users in the scalar preheader and exit block. 
// // vector.ph: // v_init = vector(..., ..., ..., a[-1]) @@ -8683,23 +8685,23 @@ static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) { // vector.body // i = phi [0, vector.ph], [i+4, vector.body] // v1 = phi [v_init, vector.ph], [v2, vector.body] - // v2 = a[i, i+1, i+2, i+3]; - // v3 = vector(v1(3), v2(0, 1, 2)) + // v2 = a[i] + // b[i] = v2 - v1 // b[i, i+1, i+2, i+3] = v2 - v3 // br cond, vector.body, middle.block // // middle.block: - // s_penultimate = v2(2) = v3(3) - // s_resume = v2(3) + // s.penultimate = v2(2) + // s.resume = v2(3) // br cond, scalar.ph, exit.block // // scalar.ph: - // s_init' = phi [s_resume, middle.block], [s_init, otherwise] + // s.init' = phi [s.resume, middle.block], [s.init, otherwise] // br scalar.body // // scalar.body: // i = phi [0, scalar.ph], [i+1, scalar.body] - // s1 = phi [s_init', scalar.ph], [s2, scalar.body] + // s1 = phi [s.init', scalar.ph], [s2, scalar.body] // s2 = a[i] // b[i] = s2 - s1 // br cond, scalar.body, exit.block @@ -8707,11 +8709,6 @@ static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) { // exit.block: // lo = lcssa.phi [s1, scalar.body], [s.penultimate, middle.block] // - // After execution completes the vector loop, we extract the next value of - // the recurrence (x) to use as the initial value in the scalar loop. This - // is modeled by ExtractFromEnd. - // - // Extract the resume value and create a new VPLiveOut for it. auto *Resume = MiddleBuilder.createNaryOp(VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), OneVPV}, @@ -8724,9 +8721,7 @@ static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) { // Now create VPLiveOuts for users in the exit block. // Extract the penultimate value of the recurrence and add VPLiveOut - // users of the recurrence splice. Note that the extract of the final - // value used to resume in the scalar loop is created earlier during VPlan - // construction. + // users of the recurrence splice. 
// No edge from the middle block to the unique exit block has been inserted // and there is nothing to fix from vector loop; phis should have incoming From 31fbb4cb9a28fea101f758ff26182bca8415ffdc Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 20 Aug 2024 15:50:40 +0100 Subject: [PATCH 06/11] Collect exitingvaluestofix --- .../Transforms/Vectorize/LoopVectorize.cpp | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 86e50a7f91437..ad46dd87a12b1 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8527,9 +8527,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); } -// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the -// original exit block. -static void addUsersInExitBlock( +static SetVector> collectUsersInExitBlock( Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan, const MapVector &Inductions) { auto MiddleVPBB = @@ -8538,9 +8536,8 @@ static void addUsersInExitBlock( // and there is nothing to fix from vector loop; phis should have incoming // from scalar loop only. if (MiddleVPBB->getNumSuccessors() != 2) - return; - - // Introduce VPUsers modeling the exit values. + return {}; + SetVector> ExitingValuesToFix; BasicBlock *ExitBB = cast(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock(); BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); @@ -8561,8 +8558,18 @@ static void addUsersInExitBlock( return P && Inductions.contains(P); }))) continue; - Plan.addLiveOut(&ExitPhi, V); + ExitingValuesToFix.insert({&ExitPhi, V}); } + return ExitingValuesToFix; +} + +// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the +// original exit block. 
+static void addUsersInExitBlock( + VPlan &Plan, + SetVector> &ExitingValuesToFix) { + for (const auto &[ExitPhi, V] : ExitingValuesToFix) + Plan.addLiveOut(ExitPhi, V); } /// Feed a resume value for every FOR from the vector loop to the scalar loop, @@ -8769,8 +8776,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { // After here, VPBB should not be used. VPBB = nullptr; - addUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan, - Legal->getInductionVars()); + SetVector> ExitingValuesToFix = + collectUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan, + Legal->getInductionVars()); + addUsersInExitBlock(*Plan, ExitingValuesToFix); assert(isa(Plan->getVectorLoopRegion()) && !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && From ecde6a0e4d19f4a879c08f164900ed1a935f7423 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 20 Aug 2024 16:07:48 +0100 Subject: [PATCH 07/11] !fixup address comments, update. --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 63ac9fcd8d175..ce4f6490a653d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8527,7 +8527,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); } -static SetVector> collectUsersInExitBlock( +static MapVector collectUsersInExitBlock( Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan, const MapVector &Inductions) { auto MiddleVPBB = @@ -8537,7 +8537,7 @@ static SetVector> collectUsersInExitBlock( // from scalar loop only. 
if (MiddleVPBB->getNumSuccessors() != 2) return {}; - SetVector> ExitingValuesToFix; + MapVector ExitingValuesToFix; BasicBlock *ExitBB = cast(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock(); BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); @@ -8569,7 +8569,7 @@ static SetVector> collectUsersInExitBlock( // VPWidenIntOrFpInductionRecipe or VPFirstOrderRecurrencePHIRecipe. static void addUsersInExitBlock( VPlan &Plan, - SetVector> &ExitingValuesToFix) { + MapVector &ExitingValuesToFix) { if (ExitingValuesToFix.empty()) return; @@ -8607,7 +8607,7 @@ static void addUsersInExitBlock( /// VPLiveOut which uses the latter and corresponds to the scalar header. /// 2. Feed the penultimate value of recurrences to their LCSSA phi users in /// the original exit block using a VPLiveOut. -static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) { +static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan, MapVector &ExitingValuesToFix) { VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); // Start by finding out if middle block branches to scalar preheader, which is @@ -8729,7 +8729,7 @@ static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) { // No edge from the middle block to the unique exit block has been inserted // and there is nothing to fix from vector loop; phis should have incoming // from scalar loop only. 
- if (MiddleVPBB->getNumSuccessors() != 2) + if (ExitingValuesToFix.empty()) continue; BasicBlock *ExitBB = cast(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock(); @@ -8741,6 +8741,7 @@ static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) { VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {}, "vector.recur.extract.for.phi"); Plan.addLiveOut(cast(UI), Ext); + ExitingValuesToFix.erase(cast(UI)); } } } @@ -8902,12 +8903,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { "VPBasicBlock"); RecipeBuilder.fixHeaderPhis(); - SetVector> ExitingValuesToFix = + MapVector ExitingValuesToFix = collectUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars()); - - addLiveOutsForFirstOrderRecurrences(*Plan); + addLiveOutsForFirstOrderRecurrences(*Plan, ExitingValuesToFix); addUsersInExitBlock(*Plan, ExitingValuesToFix); // --------------------------------------------------------------------------- From 655ca95c9a61d37a27e88f3ef8b7a415a9957111 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 20 Aug 2024 15:50:40 +0100 Subject: [PATCH 08/11] Collect exitingvaluestofix --- .../Transforms/Vectorize/LoopVectorize.cpp | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 86e50a7f91437..987c3b8e5be23 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8527,9 +8527,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); } -// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the -// original exit block. 
-static void addUsersInExitBlock( +static MapVector collectUsersInExitBlock( Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan, const MapVector &Inductions) { auto MiddleVPBB = @@ -8538,9 +8536,8 @@ static void addUsersInExitBlock( // and there is nothing to fix from vector loop; phis should have incoming // from scalar loop only. if (MiddleVPBB->getNumSuccessors() != 2) - return; - - // Introduce VPUsers modeling the exit values. + return {}; + MapVector ExitingValuesToFix; BasicBlock *ExitBB = cast(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock(); BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); @@ -8561,8 +8558,18 @@ static void addUsersInExitBlock( return P && Inductions.contains(P); }))) continue; - Plan.addLiveOut(&ExitPhi, V); + ExitingValuesToFix.insert({&ExitPhi, V}); } + return ExitingValuesToFix; +} + +// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the +// original exit block. +static void +addUsersInExitBlock(VPlan &Plan, + MapVector &ExitingValuesToFix) { + for (const auto &[ExitPhi, V] : ExitingValuesToFix) + Plan.addLiveOut(ExitPhi, V); } /// Feed a resume value for every FOR from the vector loop to the scalar loop, @@ -8769,8 +8776,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { // After here, VPBB should not be used. 
VPBB = nullptr; - addUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan, - Legal->getInductionVars()); + MapVector ExitingValuesToFix = collectUsersInExitBlock( + OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars()); + addUsersInExitBlock(*Plan, ExitingValuesToFix); assert(isa(Plan->getVectorLoopRegion()) && !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && From a8786e49acb3e5f7bd0736ff47688ea652a293ab Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 20 Aug 2024 19:38:35 +0100 Subject: [PATCH 09/11] !fixup fix formatting --- .../Transforms/Vectorize/LoopVectorize.cpp | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 3488e8a39f9c4..fdf835cf53f94 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8527,9 +8527,10 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); } - -// Collect (ExitPhi, ExitingValue) pairs phis in the original exit block that are modeled in VPlan. -// Some exiting values are not modeled explicitly yet and won't be included. Those are un-truncated VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction increments. +// Collect (ExitPhi, ExitingValue) pairs phis in the original exit block that +// are modeled in VPlan. Some exiting values are not modeled explicitly yet and +// won't be included. Those are un-truncated VPWidenIntOrFpInductionRecipe, +// VPWidenPointerInductionRecipe and induction increments. static MapVector collectUsersInExitBlock( Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan, const MapVector &Inductions) { @@ -8554,8 +8555,7 @@ static MapVector collectUsersInExitBlock( // live-outs. 
if ((isa(V) && !cast(V)->getTruncInst()) || - isa( - V) || + isa(V) || (isa(IncomingValue) && any_of(IncomingValue->users(), [&Inductions](User *U) { auto *P = dyn_cast(U); @@ -8567,10 +8567,11 @@ static MapVector collectUsersInExitBlock( return ExitingValuesToFix; } -// Add exit values to \p Plan. Extracts and VPLiveOuts are added for each entry in \p ExitingValuesToFix. -static void addUsersInExitBlock( - VPlan &Plan, - MapVector &ExitingValuesToFix) { +// Add exit values to \p Plan. Extracts and VPLiveOuts are added for each entry +// in \p ExitingValuesToFix. +static void +addUsersInExitBlock(VPlan &Plan, + MapVector &ExitingValuesToFix) { if (ExitingValuesToFix.empty()) return; @@ -8605,7 +8606,8 @@ static void addUsersInExitBlock( /// VPLiveOut which uses the latter and corresponds to the scalar header. /// 2. Feed the penultimate value of recurrences to their LCSSA phi users in /// the original exit block using a VPLiveOut. -static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan, MapVector &ExitingValuesToFix) { +static void addLiveOutsForFirstOrderRecurrences( + VPlan &Plan, MapVector &ExitingValuesToFix) { VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); // Start by finding out if middle block branches to scalar preheader, which is @@ -8905,9 +8907,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { "VPBasicBlock"); RecipeBuilder.fixHeaderPhis(); - MapVector ExitingValuesToFix = - collectUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan, - Legal->getInductionVars()); + MapVector ExitingValuesToFix = collectUsersInExitBlock( + OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars()); addLiveOutsForFirstOrderRecurrences(*Plan, ExitingValuesToFix); addUsersInExitBlock(*Plan, ExitingValuesToFix); From d53e2ba40d19c220abdf7f838b1055d48d57bbbd Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 20 Aug 2024 20:38:33 +0100 Subject: [PATCH 10/11] !fixup test updates --- 
.../Transforms/LoopVectorize/first-order-recurrence-complex.ll | 2 +- .../LoopVectorize/instruction-only-used-outside-of-loop.ll | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll index 3a846dd4072d3..eda92aae095dd 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll @@ -1127,7 +1127,7 @@ define void @test_for_sink_instruction_after_same_incoming_2(ptr %ptr) { ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ] ; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 2.000000e+01, [[ENTRY]] ] -; CHECK-NEXT: [[SCALAR_RECUR_INIT3:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ 1.000000e+01, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[SCALAR_RECUR_INIT3:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ 1.000000e+01, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll index fcc6b7376d408..553fc374e0fdf 100644 --- a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll @@ -198,6 +198,8 @@ exit: } ; Test case for PR54370. +; TODO: Should either compute the final value of the truncated IV independent +; of loop or scalarize the vector IV. 
define i32 @optimizable_trunc_used_outside() { ; CHECK-LABEL: @optimizable_trunc_used_outside( ; CHECK-NEXT: entry: From 4264c755a699055f6dfca9dd02bde6562005adc9 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 21 Aug 2024 07:37:07 +0100 Subject: [PATCH 11/11] !fixup address comments, thanks! --- .../Transforms/Vectorize/LoopVectorize.cpp | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index fdf835cf53f94..364166b3ab538 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8616,16 +8616,25 @@ static void addLiveOutsForFirstOrderRecurrences( // TODO: Should be replaced by // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the // scalar region is modeled as well. - VPBasicBlock *ScalarPHVPBB = nullptr; auto *MiddleVPBB = cast(VectorRegion->getSingleSuccessor()); - for (VPBlockBase *Succ : MiddleVPBB->getSuccessors()) { - if (isa(Succ)) - continue; - assert(!ScalarPHVPBB && "Two candidates for ScalarPHVPBB?"); - ScalarPHVPBB = cast(Succ); + BasicBlock *ExitBB = nullptr; + VPBasicBlock *ScalarPHVPBB = nullptr; + if (MiddleVPBB->getNumSuccessors() == 2) { + // Order is strict: first is the exit block, second is the scalar preheader. 
+ ExitBB = + cast(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock(); + ScalarPHVPBB = cast(MiddleVPBB->getSuccessors()[1]); + } else if (ExitingValuesToFix.empty()) { + ScalarPHVPBB = cast(MiddleVPBB->getSingleSuccessor()); + } else { + ExitBB = cast(MiddleVPBB->getSingleSuccessor()) + ->getIRBasicBlock(); } - if (!ScalarPHVPBB) + if (!ScalarPHVPBB) { + assert(ExitingValuesToFix.empty() && + "missed inserting extracts for exiting values"); return; + } VPBuilder ScalarPHBuilder(ScalarPHVPBB); VPBuilder MiddleBuilder(MiddleVPBB); @@ -8690,7 +8699,8 @@ static void addLiveOutsForFirstOrderRecurrences( // v1 = phi [v_init, vector.ph], [v2, vector.body] // v2 = a[i, i+1, i+2, i+3] // b[i] = v2 - v1 - // b[i, i+1, i+2, i+3] = v2 - v3 + // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2)) + // b[i, i+1, i+2, i+3] = v2 - v1 // br cond, vector.body, middle.block // // middle.block: @@ -8733,10 +8743,6 @@ static void addLiveOutsForFirstOrderRecurrences( // from scalar loop only. if (ExitingValuesToFix.empty()) continue; - // If there are multiple successors of the middle block, their order is - // fixed; the first successor must be the original exit block. - BasicBlock *ExitBB = - cast(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock(); for (User *U : FORPhi->users()) { auto *UI = cast(U); if (UI->getParent() != ExitBB)