From 2f441bb3ae998745e03326cc2e59ea7b54439ec4 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 23 Jul 2024 15:25:01 +0100 Subject: [PATCH 1/7] [VPlan] Introduce explicit ExtractFromEnd recipes for live-outs. Introduce explicit ExtractFromEnd recipes to extract the final values for live-outs instead of implicitly extracting in VPLiveOut::fixPhi. This is a follow-up to the recent changes of modeling extracts for recurrences and consolidates live-out extract creation for fixed-order recurrences at a single place: addLiveOutsForFirstOrderRecurrences. It is also in preparation of replacing VPLiveOut with VPIRInstructions wrapping the original scalar phis. --- .../Transforms/Vectorize/LoopVectorize.cpp | 130 ++++++++++++++++-- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 8 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 94 ------------- .../RISCV/vplan-vp-intrinsics-reduction.ll | 9 +- ...-order-recurrence-sink-replicate-region.ll | 3 +- .../instruction-only-used-outside-of-loop.ll | 20 ++- .../pr55167-fold-tail-live-out.ll | 2 +- .../LoopVectorize/select-cmp-multiuse.ll | 4 +- .../LoopVectorize/vplan-printing.ll | 11 +- 9 files changed, 144 insertions(+), 137 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 09ca859f52680..61b29295d07b5 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8452,7 +8452,104 @@ static void addUsersInExitBlock( return P && Inductions.contains(P); }))) continue; - Plan.addLiveOut(&ExitPhi, V); + + auto MiddleVPBB = + cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); + VPBuilder B(MiddleVPBB); + if (auto *Terminator = MiddleVPBB->getTerminator()) { + auto *Condition = dyn_cast(Terminator->getOperand(0)); + assert((!Condition || Condition->getParent() == MiddleVPBB) && + "Condition expected in MiddleVPBB"); + B.setInsertPoint(Condition ? 
Condition : Terminator); + } + + VPValue *Ext; + if (auto *FOR = dyn_cast_or_null( + V->getDefiningRecipe())) { + // This is the second phase of vectorizing first-order recurrences. An + // overview of the transformation is described below. Suppose we have the + // following loop with some use after the loop of the last a[i-1], + // + // for (int i = 0; i < n; ++i) { + // t = a[i - 1]; + // b[i] = a[i] - t; + // } + // use t; + // + // There is a first-order recurrence on "a". For this loop, the shorthand + // scalar IR looks like: + // + // scalar.ph: + // s_init = a[-1] + // br scalar.body + // + // scalar.body: + // i = phi [0, scalar.ph], [i+1, scalar.body] + // s1 = phi [s_init, scalar.ph], [s2, scalar.body] + // s2 = a[i] + // b[i] = s2 - s1 + // br cond, scalar.body, exit.block + // + // exit.block: + // use = lcssa.phi [s1, scalar.body] + // + // In this example, s1 is a recurrence because it's value depends on the + // previous iteration. In the first phase of vectorization, we created a + // vector phi v1 for s1. We now complete the vectorization and produce the + // shorthand vector IR shown below (for VF = 4, UF = 1). 
+ // + // vector.ph: + // v_init = vector(..., ..., ..., a[-1]) + // br vector.body + // + // vector.body + // i = phi [0, vector.ph], [i+4, vector.body] + // v1 = phi [v_init, vector.ph], [v2, vector.body] + // v2 = a[i, i+1, i+2, i+3]; + // v3 = vector(v1(3), v2(0, 1, 2)) + // b[i, i+1, i+2, i+3] = v2 - v3 + // br cond, vector.body, middle.block + // + // middle.block: + // s_penultimate = v2(2) = v3(3) + // s_resume = v2(3) + // br cond, scalar.ph, exit.block + // + // scalar.ph: + // s_init' = phi [s_resume, middle.block], [s_init, otherwise] + // br scalar.body + // + // scalar.body: + // i = phi [0, scalar.ph], [i+1, scalar.body] + // s1 = phi [s_init', scalar.ph], [s2, scalar.body] + // s2 = a[i] + // b[i] = s2 - s1 + // br cond, scalar.body, exit.block + // + // exit.block: + // lo = lcssa.phi [s1, scalar.body], [s.penultimate, middle.block] + // + // After execution completes the vector loop, we extract the next value of + // the recurrence (x) to use as the initial value in the scalar loop. This + // is modeled by ExtractFromEnd. + // + // Extract the penultimate value of the recurrence and update VPLiveOut + // users of the recurrence splice. Note that the extract of the final + // value used to resume in the scalar loop is created earlier during VPlan + // construction. + Ext = + B.createNaryOp(VPInstruction::ExtractFromEnd, + {FOR->getBackedgeValue(), + Plan.getOrAddLiveIn(ConstantInt::get( + IntegerType::get(ExitBB->getContext(), 32), 2))}, + {}, "vector.recur.extract.for.phi"); + } else { + Ext = B.createNaryOp( + VPInstruction::ExtractFromEnd, + {V, Plan.getOrAddLiveIn(ConstantInt::get( + IntegerType::get(ExitBB->getContext(), 32), 1))}); + } + Plan.addLiveOut(&ExitPhi, Ext); } } @@ -8660,6 +8757,14 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { // After here, VPBB should not be used. 
VPBB = nullptr; + assert(isa(Plan->getVectorLoopRegion()) && + !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && + "entry block must be set to a VPRegionBlock having a non-empty entry " + "VPBasicBlock"); + RecipeBuilder.fixHeaderPhis(); + + addLiveOutsForFirstOrderRecurrences(*Plan); + if (CM.requiresScalarEpilogue(Range)) { // No edge from the middle block to the unique exit block has been inserted // and there is nothing to fix from vector loop; phis should have incoming @@ -8668,13 +8773,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { addUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars()); - assert(isa(Plan->getVectorLoopRegion()) && - !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && - "entry block must be set to a VPRegionBlock having a non-empty entry " - "VPBasicBlock"); - RecipeBuilder.fixHeaderPhis(); - - addLiveOutsForFirstOrderRecurrences(*Plan); // --------------------------------------------------------------------------- // Transform initial VPlan: Apply previously taken decisions, in order, to @@ -8884,10 +8982,12 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( for (unsigned I = 0; I != Worklist.size(); ++I) { VPSingleDefRecipe *Cur = Worklist[I]; for (VPUser *U : Cur->users()) { - auto *UserRecipe = dyn_cast(U); - if (!UserRecipe) { - assert(isa(U) && - "U must either be a VPSingleDef or VPLiveOut"); + auto *UserRecipe = cast(U); + if (!UserRecipe->getParent()->getParent()) { + assert(cast(U) && + cast(U)->getOpcode() == + VPInstruction::ExtractFromEnd && + "U must be an ExtractFromEnd VPInstruction"); continue; } Worklist.insert(UserRecipe); @@ -9105,8 +9205,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL); FinalReductionResult->insertBefore(*MiddleVPBB, IP); OrigExitingVPV->replaceUsesWithIf( - FinalReductionResult, - [](VPUser &User, unsigned) { return isa(&User); }); 
+ FinalReductionResult, [](VPUser &User, unsigned) { + auto *R = dyn_cast(&User); + return R && R->getOpcode() == VPInstruction::ExtractFromEnd; + }); } VPlanTransforms::clearReductionWrapFlags(*Plan); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 2d6d67a55c17d..798d178e5a963 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -194,9 +194,6 @@ bool VPRecipeBase::mayHaveSideEffects() const { void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) { VPValue *ExitValue = getOperand(0); - auto Lane = vputils::isUniformAfterVectorization(ExitValue) - ? VPLane::getFirstLane() - : VPLane::getLastLaneForVF(State.VF); VPBasicBlock *MiddleVPBB = cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe(); @@ -207,10 +204,7 @@ void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) { ? MiddleVPBB : ExitingVPBB; BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB]; - // Set insertion point in PredBB in case an extract needs to be generated. - // TODO: Model extracts explicitly. 
- State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt()); - Value *V = State.get(ExitValue, VPIteration(State.UF - 1, Lane)); + Value *V = State.get(ExitValue, VPIteration(0, 0)); if (Phi->getBasicBlockIndex(PredBB) != -1) Phi->setIncomingValueForBlock(PredBB, V); else diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index c91fd0f118e31..967939c4854c6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -826,20 +826,6 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan, if (auto *FOR = dyn_cast(&R)) RecurrencePhis.push_back(FOR); - VPBasicBlock *MiddleVPBB = - cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); - VPBuilder MiddleBuilder; - // Set insert point so new recipes are inserted before terminator and - // condition, if there is either the former or both. - if (auto *Term = - dyn_cast_or_null(MiddleVPBB->getTerminator())) { - if (auto *Cmp = dyn_cast(Term->getOperand(0))) - MiddleBuilder.setInsertPoint(Cmp); - else - MiddleBuilder.setInsertPoint(Term); - } else - MiddleBuilder.setInsertPoint(MiddleVPBB); - for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) { SmallPtrSet SeenPhis; VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe(); @@ -872,86 +858,6 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan, // Set the first operand of RecurSplice to FOR again, after replacing // all users. RecurSplice->setOperand(0, FOR); - - // This is the second phase of vectorizing first-order recurrences. An - // overview of the transformation is described below. Suppose we have the - // following loop with some use after the loop of the last a[i-1], - // - // for (int i = 0; i < n; ++i) { - // t = a[i - 1]; - // b[i] = a[i] - t; - // } - // use t; - // - // There is a first-order recurrence on "a". 
For this loop, the shorthand - // scalar IR looks like: - // - // scalar.ph: - // s_init = a[-1] - // br scalar.body - // - // scalar.body: - // i = phi [0, scalar.ph], [i+1, scalar.body] - // s1 = phi [s_init, scalar.ph], [s2, scalar.body] - // s2 = a[i] - // b[i] = s2 - s1 - // br cond, scalar.body, exit.block - // - // exit.block: - // use = lcssa.phi [s1, scalar.body] - // - // In this example, s1 is a recurrence because it's value depends on the - // previous iteration. In the first phase of vectorization, we created a - // vector phi v1 for s1. We now complete the vectorization and produce the - // shorthand vector IR shown below (for VF = 4, UF = 1). - // - // vector.ph: - // v_init = vector(..., ..., ..., a[-1]) - // br vector.body - // - // vector.body - // i = phi [0, vector.ph], [i+4, vector.body] - // v1 = phi [v_init, vector.ph], [v2, vector.body] - // v2 = a[i, i+1, i+2, i+3]; - // v3 = vector(v1(3), v2(0, 1, 2)) - // b[i, i+1, i+2, i+3] = v2 - v3 - // br cond, vector.body, middle.block - // - // middle.block: - // s_penultimate = v2(2) = v3(3) - // s_resume = v2(3) - // br cond, scalar.ph, exit.block - // - // scalar.ph: - // s_init' = phi [s_resume, middle.block], [s_init, otherwise] - // br scalar.body - // - // scalar.body: - // i = phi [0, scalar.ph], [i+1, scalar.body] - // s1 = phi [s_init', scalar.ph], [s2, scalar.body] - // s2 = a[i] - // b[i] = s2 - s1 - // br cond, scalar.body, exit.block - // - // exit.block: - // lo = lcssa.phi [s1, scalar.body], [s.penultimate, middle.block] - // - // After execution completes the vector loop, we extract the next value of - // the recurrence (x) to use as the initial value in the scalar loop. This - // is modeled by ExtractFromEnd. - Type *IntTy = Plan.getCanonicalIV()->getScalarType(); - - // Extract the penultimate value of the recurrence and update VPLiveOut - // users of the recurrence splice. 
Note that the extract of the final value - // used to resume in the scalar loop is created earlier during VPlan - // construction. - auto *Penultimate = cast(MiddleBuilder.createNaryOp( - VPInstruction::ExtractFromEnd, - {FOR->getBackedgeValue(), - Plan.getOrAddLiveIn(ConstantInt::get(IntTy, 2))}, - {}, "vector.recur.extract.for.phi")); - RecurSplice->replaceUsesWithIf( - Penultimate, [](VPUser &U, unsigned) { return isa(&U); }); } return true; } diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index 16db6cf828af8..f14ffe854a3a6 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -55,6 +55,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-EMPTY: ; IF-EVL-INLOOP-NEXT: middle.block: ; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]> +; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1> ; IF-EVL-INLOOP-NEXT: EMIT branch-on-cond ir ; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb, scalar.ph ; IF-EVL-INLOOP-EMPTY: @@ -64,7 +65,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: scalar.ph: ; IF-EVL-INLOOP-NEXT: No successors ; IF-EVL-INLOOP-EMPTY: -; IF-EVL-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]> +; IF-EVL-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]> ; IF-EVL-INLOOP-NEXT: } ; @@ -93,6 +94,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-OUTLOOP-EMPTY: ; NO-VP-OUTLOOP-NEXT: middle.block: ; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]> +; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1> ; NO-VP-OUTLOOP-NEXT: EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]> ; NO-VP-OUTLOOP-NEXT: EMIT branch-on-cond 
vp<[[BOC]]> ; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb, scalar.ph @@ -103,7 +105,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-OUTLOOP-NEXT: scalar.ph: ; NO-VP-OUTLOOP-NEXT: No successors ; NO-VP-OUTLOOP-EMPTY: -; NO-VP-OUTLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]> +; NO-VP-OUTLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]> ; NO-VP-OUTLOOP-NEXT: } ; @@ -132,6 +134,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-INLOOP-EMPTY: ; NO-VP-INLOOP-NEXT: middle.block: ; NO-VP-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]> +; NO-VP-INLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1> ; NO-VP-INLOOP-NEXT: EMIT vp<[[BOC:%.+]]> = icmp eq ir<%n>, vp<[[VTC]]> ; NO-VP-INLOOP-NEXT: EMIT branch-on-cond vp<[[BOC]]> ; NO-VP-INLOOP-NEXT: Successor(s): ir-bb, scalar.ph @@ -142,7 +145,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-INLOOP-NEXT: scalar.ph: ; NO-VP-INLOOP-NEXT: No successors ; NO-VP-INLOOP-EMPTY: -; NO-VP-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX]]> +; NO-VP-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]> ; NO-VP-INLOOP-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll index 06fbeafba31c0..9e49cf6b42c6b 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -220,6 +220,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-NEXT: middle.block: ; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%and.red>, vp<[[SEL]]> ; CHECK-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-from-end ir<%recur.next>, ir<1> +; CHECK-NEXT: EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1> ; CHECK-NEXT: EMIT 
branch-on-cond ir ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph ; CHECK-EMPTY: @@ -230,8 +231,8 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0> ; CHECK-NEXT: No successors ; CHECK-EMPTY: -; CHECK-NEXT: Live-out i32 %res = vp<[[RED_RES]]> ; CHECK-NEXT: Live-out i32 %recur = vp<[[RESUME_1_P]]> +; CHECK-NEXT: Live-out i32 %res = vp<[[RED_EX]]> ; CHECK-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll index 5f5cd78dc2d30..fcc6b7376d408 100644 --- a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll @@ -34,7 +34,7 @@ define i32 @one_direct_branch(ptr %src) { ; CHECK-NEXT: [[PHI_XOR:%.*]] = phi i32 [ [[XOR]], [[LOOP]] ] ; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 ; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[PHI_XOR]], [[LOOP_LATCH]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[XOR_LCSSA]] @@ -205,16 +205,14 @@ define i32 @optimizable_trunc_used_outside() { ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32 -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], 3 -; CHECK-NEXT: [[INDEX_NEXT]] = 
add nuw i64 [[OFFSET_IDX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3 ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -226,7 +224,7 @@ define i32 @optimizable_trunc_used_outside() { ; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 ; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: exit: -; CHECK-NEXT: [[IV_TRUNC_LCSSA:%.*]] = phi i32 [ [[IV_TRUNC]], [[LOOP]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[IV_TRUNC_LCSSA:%.*]] = phi i32 [ [[IV_TRUNC]], [[LOOP]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[IV_TRUNC_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll index 72f8cf22cafa7..b79525bc3e440 100644 --- a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll +++ b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll @@ -34,8 +34,8 @@ define i32 @test(i32 %a, i1 %c.1, i1 %c.2 ) #0 { ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 176 ; CHECK-NEXT: br i1 [[TMP8]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[PREDPHI5]], i32 1 ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PREDPHI7]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[PREDPHI5]], i32 1 ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 182, [[MIDDLE_BLOCK]] ], [ 6, [[BB:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll b/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll index 9eb90099214e1..b88e597e6bc8e 100644 --- a/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll +++ b/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll @@ -916,13 +916,13 @@ define i32 @multi_user_cmp_branch_use_and_outside_bb_use(ptr readonly %a, i64 no ; CHECK-VF4-IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4-IC1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-VF4-IC1: middle.block: -; CHECK-VF4-IC1-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 ; CHECK-VF4-IC1-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) ; CHECK-VF4-IC1-NEXT: [[TMP10:%.*]] = freeze i1 [[TMP9]] ; CHECK-VF4-IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP10]], i1 false, i1 true ; CHECK-VF4-IC1-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) ; CHECK-VF4-IC1-NEXT: [[TMP12:%.*]] = freeze i1 [[TMP11]] ; CHECK-VF4-IC1-NEXT: [[RDX_SELECT2:%.*]] = select i1 [[TMP12]], i1 true, i1 false +; CHECK-VF4-IC1-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 ; CHECK-VF4-IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-VF4-IC1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-VF4-IC1: scalar.ph: @@ -986,7 +986,6 @@ define i32 
@multi_user_cmp_branch_use_and_outside_bb_use(ptr readonly %a, i64 no ; CHECK-VF4-IC2-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4-IC2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-VF4-IC2: middle.block: -; CHECK-VF4-IC2-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3 ; CHECK-VF4-IC2-NEXT: [[BIN_RDX:%.*]] = or <4 x i1> [[TMP13]], [[TMP12]] ; CHECK-VF4-IC2-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX]]) ; CHECK-VF4-IC2-NEXT: [[TMP17:%.*]] = freeze i1 [[TMP16]] @@ -995,6 +994,7 @@ define i32 @multi_user_cmp_branch_use_and_outside_bb_use(ptr readonly %a, i64 no ; CHECK-VF4-IC2-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[BIN_RDX5]]) ; CHECK-VF4-IC2-NEXT: [[TMP19:%.*]] = freeze i1 [[TMP18]] ; CHECK-VF4-IC2-NEXT: [[RDX_SELECT6:%.*]] = select i1 [[TMP19]], i1 true, i1 false +; CHECK-VF4-IC2-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3 ; CHECK-VF4-IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-VF4-IC2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-VF4-IC2: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 3a664de748d2d..f18ed825a6b88 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -154,6 +154,7 @@ define float @print_reduction(i64 %n, ptr noalias %y) { ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: ; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%red>, ir<%red.next> +; CHECK-NEXT: EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1> ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VEC_TC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph @@ -164,7 +165,7 @@ define float @print_reduction(i64 %n, ptr noalias %y) 
{ ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: No successors ; CHECK-EMPTY: -; CHECK-NEXT: Live-out float %red.next.lcssa = vp<[[RED_RES]]> +; CHECK-NEXT: Live-out float %red.next.lcssa = vp<[[RED_EX]]> ; CHECK-NEXT: } ; entry: @@ -435,6 +436,7 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) { ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: ; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%sum.07>, ir<[[MULADD]]> +; CHECK-NEXT: EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1> ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<%n>, vp<[[VEC_TC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph @@ -445,7 +447,7 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: No successors ; CHECK-EMPTY: -; CHECK-NEXT: Live-out float %muladd.lcssa = vp<[[RED_RES]]> +; CHECK-NEXT: Live-out float %muladd.lcssa = vp<[[RED_EX]]> ; CHECK-NEXT:} entry: @@ -654,6 +656,7 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) { ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<[[EXIT:%.+]]> = extract-from-end ir<%add>, ir<1> ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1000>, vp<[[VEC_TC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph @@ -664,7 +667,7 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) { ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: No successors ; CHECK-EMPTY: -; CHECK-NEXT: Live-out i32 %lcssa = ir<%add> +; CHECK-NEXT: Live-out i32 %lcssa = vp<[[EXIT]]> ; CHECK-NEXT: } ; entry: @@ -1036,8 +1039,8 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) { ; CHECK-NEXT: EMIT vp<[[RESUME_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<22> ; CHECK-NEXT: No successors ; CHECK-EMPTY: -; CHECK-NEXT: Live-out i16 %for.1.lcssa = vp<[[FOR_RESULT]]> ; CHECK-NEXT: Live-out i16 %for.1 = vp<[[RESUME_P]]> +; CHECK-NEXT: Live-out i16 %for.1.lcssa = 
vp<[[FOR_RESULT]]> ; CHECK-NEXT: } ; entry: From 540b9c5de195002470389e47dcfbe01c5e22a540 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 26 Jul 2024 12:25:29 +0100 Subject: [PATCH 2/7] !fixup fix formatting --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f3cc705962357..89ea7bc563673 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8773,7 +8773,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { addUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars()); - // --------------------------------------------------------------------------- // Transform initial VPlan: Apply previously taken decisions, in order, to // bring the VPlan to its final state. From 0284675e468ef130fcf8d45b6715d2c6d3829706 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 22 Jul 2024 10:47:54 +0100 Subject: [PATCH 3/7] [VPlan] Add VPIRInstruction, use for exit block live-outs. Add a new VPIRInstruction recipe to wrap existing IR instructions not to be modified during execution, except for PHIs. For PHIs, a single VPValue operand is allowed, and it is used to add a new incoming value for the single predecessor VPBB. Except for PHIs, VPIRInstructions cannot have any operands. Depends on https://github.com/llvm/llvm-project/pull/100658.
--- .../Transforms/Vectorize/LoopVectorize.cpp | 239 +++++++++--------- llvm/lib/Transforms/Vectorize/VPlan.cpp | 14 +- llvm/lib/Transforms/Vectorize/VPlan.h | 40 +++ .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 29 +++ llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 + .../RISCV/riscv-vector-reverse.ll | 4 + .../RISCV/vplan-vp-intrinsics-reduction.ll | 9 +- ...-order-recurrence-sink-replicate-region.ll | 2 +- llvm/test/Transforms/LoopVectorize/pr36983.ll | 2 +- llvm/test/Transforms/LoopVectorize/pr45259.ll | 5 +- .../vplan-printing-before-execute.ll | 2 + .../LoopVectorize/vplan-printing.ll | 13 +- 12 files changed, 227 insertions(+), 133 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 89ea7bc563673..fabddbbce139a 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8434,122 +8434,133 @@ static void addUsersInExitBlock( if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) return; - // Introduce VPUsers modeling the exit values. - for (PHINode &ExitPhi : ExitBB->phis()) { - Value *IncomingValue = - ExitPhi.getIncomingValueForBlock(ExitingBB); - VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan); - // Exit values for inductions are computed and updated outside of VPlan and - // independent of induction recipes. - // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update - // live-outs. 
- if ((isa(V) && - !cast(V)->getTruncInst()) || - isa(V) || - (isa(IncomingValue) && - any_of(IncomingValue->users(), [&Inductions](User *U) { - auto *P = dyn_cast(U); - return P && Inductions.contains(P); - }))) + auto *MiddleBlock = Plan.getVectorLoopRegion()->getSingleSuccessor(); + for (VPIRBasicBlock *VPBB : VPBlockUtils::blocksOnly( + vp_depth_first_shallow(MiddleBlock))) { + if (VPBB->getIRBasicBlock() != ExitBB) continue; - auto MiddleVPBB = - cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); - VPBuilder B(MiddleVPBB); - if (auto *Terminator = MiddleVPBB->getTerminator()) { - auto *Condition = dyn_cast(Terminator->getOperand(0)); - assert((!Condition || Condition->getParent() == MiddleVPBB) && - "Condition expected in MiddleVPBB"); - B.setInsertPoint(Condition ? Condition : Terminator); - } + for (auto &R : *VPBB) { + auto *IR = dyn_cast(&R); + if (!IR) + continue; + auto *ExitPhi = dyn_cast(&IR->getInstruction()); + if (!ExitPhi) + break; + Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB); + VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan); + // Exit values for inductions are computed and updated outside of VPlan + // and independent of induction recipes. + // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update + // live-outs. + if ((isa(V) && + !cast(V)->getTruncInst()) || + isa(V) || + (isa(IncomingValue) && + any_of(IncomingValue->users(), [&Inductions](User *U) { + auto *P = dyn_cast(U); + return P && Inductions.contains(P); + }))) + continue; - VPValue *Ext; - if (auto *FOR = dyn_cast_or_null( - V->getDefiningRecipe())) { - // This is the second phase of vectorizing first-order recurrences. An - // overview of the transformation is described below. Suppose we have the - // following loop with some use after the loop of the last a[i-1], - // - // for (int i = 0; i < n; ++i) { - // t = a[i - 1]; - // b[i] = a[i] - t; - // } - // use t; - // - // There is a first-order recurrence on "a". 
For this loop, the shorthand - // scalar IR looks like: - // - // scalar.ph: - // s_init = a[-1] - // br scalar.body - // - // scalar.body: - // i = phi [0, scalar.ph], [i+1, scalar.body] - // s1 = phi [s_init, scalar.ph], [s2, scalar.body] - // s2 = a[i] - // b[i] = s2 - s1 - // br cond, scalar.body, exit.block - // - // exit.block: - // use = lcssa.phi [s1, scalar.body] - // - // In this example, s1 is a recurrence because it's value depends on the - // previous iteration. In the first phase of vectorization, we created a - // vector phi v1 for s1. We now complete the vectorization and produce the - // shorthand vector IR shown below (for VF = 4, UF = 1). - // - // vector.ph: - // v_init = vector(..., ..., ..., a[-1]) - // br vector.body - // - // vector.body - // i = phi [0, vector.ph], [i+4, vector.body] - // v1 = phi [v_init, vector.ph], [v2, vector.body] - // v2 = a[i, i+1, i+2, i+3]; - // v3 = vector(v1(3), v2(0, 1, 2)) - // b[i, i+1, i+2, i+3] = v2 - v3 - // br cond, vector.body, middle.block - // - // middle.block: - // s_penultimate = v2(2) = v3(3) - // s_resume = v2(3) - // br cond, scalar.ph, exit.block - // - // scalar.ph: - // s_init' = phi [s_resume, middle.block], [s_init, otherwise] - // br scalar.body - // - // scalar.body: - // i = phi [0, scalar.ph], [i+1, scalar.body] - // s1 = phi [s_init', scalar.ph], [s2, scalar.body] - // s2 = a[i] - // b[i] = s2 - s1 - // br cond, scalar.body, exit.block - // - // exit.block: - // lo = lcssa.phi [s1, scalar.body], [s.penultimate, middle.block] - // - // After execution completes the vector loop, we extract the next value of - // the recurrence (x) to use as the initial value in the scalar loop. This - // is modeled by ExtractFromEnd. - // - // Extract the penultimate value of the recurrence and update VPLiveOut - // users of the recurrence splice. Note that the extract of the final - // value used to resume in the scalar loop is created earlier during VPlan - // construction. 
- Ext = - B.createNaryOp(VPInstruction::ExtractFromEnd, - {FOR->getBackedgeValue(), - Plan.getOrAddLiveIn(ConstantInt::get( - IntegerType::get(ExitBB->getContext(), 32), 2))}, - {}, "vector.recur.extract.for.phi"); - } else { - Ext = B.createNaryOp( - VPInstruction::ExtractFromEnd, - {V, Plan.getOrAddLiveIn(ConstantInt::get( - IntegerType::get(ExitBB->getContext(), 32), 1))}); + auto MiddleVPBB = + cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); + VPBuilder B(MiddleVPBB); + if (auto *Terminator = MiddleVPBB->getTerminator()) { + auto *Condition = dyn_cast(Terminator->getOperand(0)); + assert((!Condition || Condition->getParent() == MiddleVPBB) && + "Condition expected in MiddleVPBB"); + B.setInsertPoint(Condition ? Condition : Terminator); + } + + VPValue *Ext; + if (auto *FOR = dyn_cast_or_null( + V->getDefiningRecipe())) { + // This is the second phase of vectorizing first-order recurrences. An + // overview of the transformation is described below. Suppose we have + // the following loop with some use after the loop of the last a[i-1], + // + // for (int i = 0; i < n; ++i) { + // t = a[i - 1]; + // b[i] = a[i] - t; + // } + // use t; + // + // There is a first-order recurrence on "a". For this loop, the + // shorthand scalar IR looks like: + // + // scalar.ph: + // s_init = a[-1] + // br scalar.body + // + // scalar.body: + // i = phi [0, scalar.ph], [i+1, scalar.body] + // s1 = phi [s_init, scalar.ph], [s2, scalar.body] + // s2 = a[i] + // b[i] = s2 - s1 + // br cond, scalar.body, exit.block + // + // exit.block: + // use = lcssa.phi [s1, scalar.body] + // + // In this example, s1 is a recurrence because its value depends on the + // previous iteration. In the first phase of vectorization, we created a + // vector phi v1 for s1. We now complete the vectorization and produce + // the shorthand vector IR shown below (for VF = 4, UF = 1).
+ // + // vector.ph: + // v_init = vector(..., ..., ..., a[-1]) + // br vector.body + // + // vector.body + // i = phi [0, vector.ph], [i+4, vector.body] + // v1 = phi [v_init, vector.ph], [v2, vector.body] + // v2 = a[i, i+1, i+2, i+3]; + // v3 = vector(v1(3), v2(0, 1, 2)) + // b[i, i+1, i+2, i+3] = v2 - v3 + // br cond, vector.body, middle.block + // + // middle.block: + // s_penultimate = v2(2) = v3(3) + // s_resume = v2(3) + // br cond, scalar.ph, exit.block + // + // scalar.ph: + // s_init' = phi [s_resume, middle.block], [s_init, otherwise] + // br scalar.body + // + // scalar.body: + // i = phi [0, scalar.ph], [i+1, scalar.body] + // s1 = phi [s_init', scalar.ph], [s2, scalar.body] + // s2 = a[i] + // b[i] = s2 - s1 + // br cond, scalar.body, exit.block + // + // exit.block: + // lo = lcssa.phi [s1, scalar.body], [s_penultimate, middle.block] + // + // After execution completes the vector loop, we extract the next value + // of the recurrence (x) to use as the initial value in the scalar loop. + // This is modeled by ExtractFromEnd. + // + // Extract the penultimate value of the recurrence and update VPLiveOut + // users of the recurrence splice. Note that the extract of the final + // value used to resume in the scalar loop is created earlier during + // VPlan construction. + Ext = B.createNaryOp( + VPInstruction::ExtractFromEnd, + {FOR->getBackedgeValue(), + Plan.getOrAddLiveIn(ConstantInt::get( + IntegerType::get(ExitBB->getContext(), 32), 2))}, + {}, "vector.recur.extract.for.phi"); + } else { + Ext = B.createNaryOp( + VPInstruction::ExtractFromEnd, + {V, Plan.getOrAddLiveIn(ConstantInt::get( + IntegerType::get(ExitBB->getContext(), 32), 1))}); + } + IR->addOperand(Ext); } - Plan.addLiveOut(&ExitPhi, Ext); } } @@ -10156,7 +10167,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { // directly in VPlan.
EpilogILV.setTripCount(MainILV.getTripCount()); for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) { - auto *ExpandR = cast(&R); + auto *ExpandR = dyn_cast(&R); + if (!ExpandR) + continue; auto *ExpandedVal = BestEpiPlan.getOrAddLiveIn( ExpandedSCEVs.find(ExpandR->getSCEV())->second); ExpandR->replaceAllUsesWith(ExpandedVal); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 58de6256900f0..b16527386521b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -855,10 +855,18 @@ VPlan::~VPlan() { delete BackedgeTakenCount; } +static VPIRBasicBlock *createVPIRBasicBlockFor(BasicBlock *BB) { + auto *VPIRBB = new VPIRBasicBlock(BB); + for (Instruction &I : + make_range(BB->begin(), BB->getTerminator()->getIterator())) + VPIRBB->appendRecipe(new VPIRInstruction(I)); + return VPIRBB; +} + VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE, bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop) { - VPIRBasicBlock *Entry = new VPIRBasicBlock(TheLoop->getLoopPreheader()); + VPIRBasicBlock *Entry = createVPIRBasicBlockFor(TheLoop->getLoopPreheader()); VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph"); auto Plan = std::make_unique(Entry, VecPreheader); Plan->TripCount = @@ -890,7 +898,7 @@ VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE, // we unconditionally branch to the scalar preheader. Do nothing. // 3) Otherwise, construct a runtime check. BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock(); - auto *VPExitBlock = new VPIRBasicBlock(IRExitBlock); + auto *VPExitBlock = createVPIRBasicBlockFor(IRExitBlock); // The connection order corresponds to the operands of the conditional branch. 
VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB); VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH); @@ -957,7 +965,7 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, /// predecessor, which is rewired to the new VPIRBasicBlock. All successors of /// VPBB, if any, are rewired to the new VPIRBasicBlock. static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) { - VPIRBasicBlock *IRMiddleVPBB = new VPIRBasicBlock(IRBB); + VPIRBasicBlock *IRMiddleVPBB = createVPIRBasicBlockFor(IRBB); for (auto &R : make_early_inc_range(*VPBB)) R.moveBefore(*IRMiddleVPBB, IRMiddleVPBB->end()); VPBlockBase *PredVPBB = VPBB->getSinglePredecessor(); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index c9da5e5d38a6b..2100734eead1b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -932,6 +932,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPScalarCastSC: return true; case VPRecipeBase::VPInterleaveSC: + case VPRecipeBase::VPIRInstructionSC: case VPRecipeBase::VPBranchOnMaskSC: case VPRecipeBase::VPWidenLoadEVLSC: case VPRecipeBase::VPWidenLoadSC: @@ -1399,6 +1400,45 @@ class VPInstruction : public VPRecipeWithIRFlags { bool isSingleScalar() const; }; +/// A recipe to wrap an original IR instruction not to be modified during +/// execution, except for PHIs. For PHIs, a single VPValue operand is allowed, +/// and it is used to add a new incoming value for the single predecessor VPBB. +/// Except for PHIs, VPIRInstructions cannot have any operands.
+class VPIRInstruction : public VPRecipeBase { + Instruction &I; + +public: + VPIRInstruction(Instruction &I) + : VPRecipeBase(VPDef::VPIRInstructionSC, ArrayRef()), I(I) {} + + ~VPIRInstruction() override = default; + + VP_CLASSOF_IMPL(VPDef::VPIRInstructionSC) + + VPIRInstruction *clone() override { + auto *R = new VPIRInstruction(I); + for (auto *Op : operands()) + R->addOperand(Op); + return R; + } + + void execute(VPTransformState &State) override; + + Instruction &getInstruction() { return I; } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + bool usesScalars(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } +}; + /// VPWidenRecipe is a recipe for producing a widened instruction using the /// opcode and operands of the recipe. This recipe covers most of the /// traditional vectorization cases where each recipe transforms into a diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 798d178e5a963..780eeff5a2b7b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -856,6 +856,35 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, } #endif +void VPIRInstruction::execute(VPTransformState &State) { + assert(isa(getParent()) && + "VPIRInstructions can only be placed in VPIRBasicBlocks"); + + if (getNumOperands() == 1 && isa(&I)) { + VPValue *ExitValue = getOperand(0); + auto Lane = vputils::isUniformAfterVectorization(ExitValue) + ? VPLane::getFirstLane() + : VPLane::getLastLaneForVF(State.VF); + auto *PredVPBB = cast(getParent()->getSinglePredecessor()); + BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB]; + // Set insertion point in PredBB in case an extract needs to be generated. 
+ // TODO: Model extracts explicitly. + State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt()); + Value *V = State.get(ExitValue, VPIteration(State.UF - 1, Lane)); + auto *Phi = cast(&I); + Phi->addIncoming(V, PredBB); + } + + State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator())); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPIRInstruction::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "IR " << I; +} +#endif + void VPWidenCallRecipe::execute(VPTransformState &State) { assert(State.VF.isVector() && "not widening"); Function *CalledScalarFn = getCalledScalarFunction(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 452c977106a77..33c221aa946f1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -339,6 +339,7 @@ class VPDef { VPBranchOnMaskSC, VPDerivedIVSC, VPExpandSCEVSC, + VPIRInstructionSC, VPInstructionSC, VPInterleaveSC, VPReductionEVLSC, diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index 3a14842580425..97809e50d784a 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -60,6 +60,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: vp<%2> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %0 = zext i32 %n to i64 ; CHECK-NEXT: EMIT vp<%2> = EXPAND SCEV (zext i32 %n to i64) ; CHECK-NEXT: No successors ; CHECK-EMPTY: @@ -144,6 +145,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: vp<%2> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %0 = zext i32 %n to i64 ; CHECK-NEXT: EMIT vp<%2> = EXPAND SCEV (zext i32 %n to 
i64) ; CHECK-NEXT: No successors ; CHECK-EMPTY: @@ -265,6 +267,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: vp<%2> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %0 = zext i32 %n to i64 ; CHECK-NEXT: EMIT vp<%2> = EXPAND SCEV (zext i32 %n to i64) ; CHECK-NEXT: No successors ; CHECK-EMPTY: @@ -349,6 +352,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: vp<%2> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %0 = zext i32 %n to i64 ; CHECK-NEXT: EMIT vp<%2> = EXPAND SCEV (zext i32 %n to i64) ; CHECK-NEXT: No successors ; CHECK-EMPTY: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index f14ffe854a3a6..803a5e6ecb733 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -60,12 +60,11 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb, scalar.ph ; IF-EVL-INLOOP-EMPTY: ; IF-EVL-INLOOP-NEXT: ir-bb: +; IF-EVL-INLOOP-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] ; IF-EVL-INLOOP-NEXT: No successors ; IF-EVL-INLOOP-EMPTY: ; IF-EVL-INLOOP-NEXT: scalar.ph: ; IF-EVL-INLOOP-NEXT: No successors -; IF-EVL-INLOOP-EMPTY: -; IF-EVL-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]> ; IF-EVL-INLOOP-NEXT: } ; @@ -100,12 +99,11 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb, scalar.ph ; NO-VP-OUTLOOP-EMPTY: ; NO-VP-OUTLOOP-NEXT: ir-bb: +; NO-VP-OUTLOOP-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] ; NO-VP-OUTLOOP-NEXT: No successors ; NO-VP-OUTLOOP-EMPTY: ; NO-VP-OUTLOOP-NEXT: scalar.ph: ; NO-VP-OUTLOOP-NEXT: No successors -; NO-VP-OUTLOOP-EMPTY: -; NO-VP-OUTLOOP-NEXT: Live-out i32 %add.lcssa = 
vp<[[RDX_EX]]> ; NO-VP-OUTLOOP-NEXT: } ; @@ -140,12 +138,11 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-INLOOP-NEXT: Successor(s): ir-bb, scalar.ph ; NO-VP-INLOOP-EMPTY: ; NO-VP-INLOOP-NEXT: ir-bb: +; NO-VP-INLOOP-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] ; NO-VP-INLOOP-NEXT: No successors ; NO-VP-INLOOP-EMPTY: ; NO-VP-INLOOP-NEXT: scalar.ph: ; NO-VP-INLOOP-NEXT: No successors -; NO-VP-INLOOP-EMPTY: -; NO-VP-INLOOP-NEXT: Live-out i32 %add.lcssa = vp<[[RDX_EX]]> ; NO-VP-INLOOP-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll index 9e49cf6b42c6b..993b4b2a948e0 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -225,6 +225,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb +; CHECK-NEXT: IR %res = phi i32 [ %and.red.next, %loop ] ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph @@ -232,7 +233,6 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: Live-out i32 %recur = vp<[[RESUME_1_P]]> -; CHECK-NEXT: Live-out i32 %res = vp<[[RED_EX]]> ; CHECK-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/pr36983.ll b/llvm/test/Transforms/LoopVectorize/pr36983.ll index f4da4f355eb0d..7e38d60b6f581 100644 --- a/llvm/test/Transforms/LoopVectorize/pr36983.ll +++ b/llvm/test/Transforms/LoopVectorize/pr36983.ll @@ -4,7 +4,7 @@ ; CHECK-LABEL: bb1.bb3_crit_edge: ; CHECK: %_tmp133.lcssa1 = phi i16 [ %_tmp133, %bb2 ], [ %vector.recur.extract.for.phi, %middle.block ] -; CHECK: %_tmp133.lcssa = phi i16 [ %_tmp133, %bb2 ], [ 
%vector.recur.extract.for.phi, %middle.block ] +; CHECK: %_tmp133.lcssa = phi i16 [ %_tmp133, %bb2 ], [ %vector.recur.extract.for.phi1, %middle.block ] define void @f1() { bb2.lr.ph: diff --git a/llvm/test/Transforms/LoopVectorize/pr45259.ll b/llvm/test/Transforms/LoopVectorize/pr45259.ll index dcc8f3f2f9d8f..008971697775e 100644 --- a/llvm/test/Transforms/LoopVectorize/pr45259.ll +++ b/llvm/test/Transforms/LoopVectorize/pr45259.ll @@ -14,11 +14,12 @@ define i8 @widget(ptr %arr, i8 %t9) { ; CHECK-NEXT: br i1 [[C]], label [[FOR_PREHEADER:%.*]], label [[BB6]] ; CHECK: for.preheader: ; CHECK-NEXT: [[T1_0_LCSSA:%.*]] = phi ptr [ [[T1_0]], [[BB6]] ] -; CHECK-NEXT: [[T1_0_LCSSA2:%.*]] = ptrtoint ptr [[T1_0_LCSSA]] to i64 ; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[ARR1]] to i32 ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 0, [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[T1_0_LCSSA2]] to i32 +; CHECK-NEXT: [[T1_0_LCSSA3:%.*]] = ptrtoint ptr [[T1_0_LCSSA]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[T1_0_LCSSA3]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[T1_0_LCSSA2:%.*]] = ptrtoint ptr [[T1_0_LCSSA]] to i64 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP3]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll index e4984f52ee6ff..431d14be45857 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll @@ -14,6 +14,7 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) { ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %and = and i64 %N, 15 ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i4 (trunc i64 %N to i4) to i64) ; CHECK-NEXT: No successors ; 
CHECK-EMPTY: @@ -55,6 +56,7 @@ define void @test_tc_less_than_16(ptr %A, i64 %N) { ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %and = and i64 %N, 15 ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i4 (trunc i64 %N to i4) to i64) ; CHECK-NEXT: No successors ; CHECK-EMPTY: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index f18ed825a6b88..9b7051630347e 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -160,12 +160,11 @@ define float @print_reduction(i64 %n, ptr noalias %y) { ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb +; CHECK-NEXT: IR %red.next.lcssa = phi float [ %red.next, %for.body ] ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: Live-out float %red.next.lcssa = vp<[[RED_EX]]> ; CHECK-NEXT: } ; entry: @@ -442,12 +441,11 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb +; CHECK-NEXT: IR %muladd.lcssa = phi float [ %muladd, %for.body ] ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: Live-out float %muladd.lcssa = vp<[[RED_EX]]> ; CHECK-NEXT:} entry: @@ -574,6 +572,8 @@ define void @print_expand_scev(i64 %y, ptr %ptr) { ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %div = udiv i64 %y, 492802768830814060 +; CHECK-NEXT: IR %inc = add i64 %div, 1 ; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (1 + ((15 + (%y /u 492802768830814060)) /u (1 + (%y /u 492802768830814060)))) ; CHECK-NEXT: EMIT vp<[[EXP_SCEV:%.+]]> = EXPAND SCEV (1 + (%y /u 492802768830814060)) ; CHECK-NEXT: No successors @@ -662,12 +662,11 @@ define i32 
@print_exit_value(ptr %ptr, i32 %off) { ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb +; CHECK-NEXT: IR %lcssa = phi i32 [ %add, %loop ] ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: No successors -; CHECK-EMPTY: -; CHECK-NEXT: Live-out i32 %lcssa = vp<[[EXIT]]> ; CHECK-NEXT: } ; entry: @@ -1033,6 +1032,7 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) { ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb +; CHECK-NEXT: IR %for.1.lcssa = phi i16 [ %for.1, %loop ] ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph @@ -1040,7 +1040,6 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: Live-out i16 %for.1 = vp<[[RESUME_P]]> -; CHECK-NEXT: Live-out i16 %for.1.lcssa = vp<[[FOR_RESULT]]> ; CHECK-NEXT: } ; entry: From de8674195cdc97847b6e262c1d1f38be039b9545 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 25 Aug 2024 14:54:42 +0100 Subject: [PATCH 4/7] !fixup address comments, thanks! --- .../Transforms/Vectorize/LoopVectorize.cpp | 80 ++++++++----------- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 10 ++- ...-order-recurrence-sink-replicate-region.ll | 1 - .../LoopVectorize/vplan-printing.ll | 1 - 4 files changed, 41 insertions(+), 51 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b51eea223d802..e0f2046a263d5 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8592,7 +8592,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, // are modeled in VPlan. Some exiting values are not modeled explicitly yet and // won't be included. Those are un-truncated VPWidenIntOrFpInductionRecipe, // VPWidenPointerInductionRecipe and induction increments. 
-static MapVector collectUsersInExitBlock( +static MapVector collectUsersInExitBlock( Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan, const MapVector &Inductions) { auto MiddleVPBB = @@ -8602,13 +8602,17 @@ static MapVector collectUsersInExitBlock( // from scalar loop only. if (MiddleVPBB->getNumSuccessors() != 2) return {}; - MapVector ExitingValuesToFix; - BasicBlock *ExitBB = - cast(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock(); + MapVector ExitingValuesToFix; + VPBasicBlock *ExitVPBB = cast(MiddleVPBB->getSuccessors()[0]); BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); - for (PHINode &ExitPhi : ExitBB->phis()) { - Value *IncomingValue = - ExitPhi.getIncomingValueForBlock(ExitingBB); + for (VPRecipeBase &R : *ExitVPBB) { + auto *IR = dyn_cast(&R); + if (!IR) + continue; + auto *ExitPhi = dyn_cast(&IR->getInstruction()); + if (!ExitPhi) + break; + Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB); VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue, Plan); // Exit values for inductions are computed and updated outside of VPlan and // independent of induction recipes. @@ -8623,16 +8627,15 @@ static MapVector collectUsersInExitBlock( return P && Inductions.contains(P); }))) continue; - ExitingValuesToFix.insert({&ExitPhi, V}); + ExitingValuesToFix.insert({IR, V}); } return ExitingValuesToFix; } // Add exit values to \p Plan. Extracts and VPLiveOuts are added for each entry // in \p ExitingValuesToFix. -static void -addUsersInExitBlock(VPlan &Plan, - MapVector &ExitingValuesToFix) { +static void addUsersInExitBlock( + VPlan &Plan, MapVector &ExitingValuesToFix) { if (ExitingValuesToFix.empty()) return; @@ -8650,12 +8653,12 @@ addUsersInExitBlock(VPlan &Plan, } // Introduce VPUsers modeling the exit values. 
- for (const auto &[ExitPhi, V] : ExitingValuesToFix) { + for (const auto &[IR, V] : ExitingValuesToFix) { VPValue *Ext = B.createNaryOp( VPInstruction::ExtractFromEnd, {V, Plan.getOrAddLiveIn(ConstantInt::get( IntegerType::get(ExitBB->getContext(), 32), 1))}); - Plan.addLiveOut(ExitPhi, Ext); + IR->addOperand(Ext); } } @@ -8668,7 +8671,7 @@ addUsersInExitBlock(VPlan &Plan, /// 2. Feed the penultimate value of recurrences to their LCSSA phi users in /// the original exit block using a VPLiveOut. static void addLiveOutsForFirstOrderRecurrences( - VPlan &Plan, MapVector &ExitingValuesToFix) { + VPlan &Plan, MapVector &ExitingValuesToFix) { VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); // Start by finding out if middle block branches to scalar preheader, which is @@ -8802,17 +8805,14 @@ static void addLiveOutsForFirstOrderRecurrences( // No edge from the middle block to the unique exit block has been inserted // and there is nothing to fix from vector loop; phis should have incoming // from scalar loop only. 
- if (ExitingValuesToFix.empty()) - continue; - for (User *U : FORPhi->users()) { - auto *UI = cast(U); - if (UI->getParent() != ExitBB) + for (const auto &[IR, V] : ExitingValuesToFix) { + if (V != FOR) continue; VPValue *Ext = MiddleBuilder.createNaryOp( VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {}, "vector.recur.extract.for.phi"); - Plan.addLiveOut(cast(UI), Ext); - ExitingValuesToFix.erase(cast(UI)); + IR->addOperand(Ext); + ExitingValuesToFix.erase(IR); } } } @@ -8974,20 +8974,13 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { "VPBasicBlock"); RecipeBuilder.fixHeaderPhis(); - MapVector ExitingValuesToFix = collectUsersInExitBlock( - OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars()); + MapVector ExitingValuesToFix = + collectUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan, + Legal->getInductionVars()); addLiveOutsForFirstOrderRecurrences(*Plan, ExitingValuesToFix); addUsersInExitBlock(*Plan, ExitingValuesToFix); - if (CM.requiresScalarEpilogue(Range)) { - // No edge from the middle block to the unique exit block has been inserted - // and there is nothing to fix from vector loop; phis should have incoming - // from scalar loop only. - } else - addUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan, - Legal->getInductionVars()); - // --------------------------------------------------------------------------- // Transform initial VPlan: Apply previously taken decisions, in order, to // bring the VPlan to its final state. 
@@ -9198,13 +9191,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( VPSingleDefRecipe *Cur = Worklist[I]; for (VPUser *U : Cur->users()) { auto *UserRecipe = cast(U); - if (!UserRecipe->getParent()->getParent()) { - assert(cast(U) && - cast(U)->getOpcode() == - VPInstruction::ExtractFromEnd && -/* if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {*/ - /*assert(match(U, m_Binary(*/ - /*m_VPValue(), m_VPValue())) &&*/ + if (!UserRecipe->getParent()->getEnclosingLoopRegion()) { + assert(match(U, m_Binary( + m_VPValue(), m_VPValue())) && "U must be an ExtractFromEnd VPInstruction"); continue; } @@ -9422,16 +9411,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( auto *FinalReductionResult = new VPInstruction( VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL); FinalReductionResult->insertBefore(*MiddleVPBB, IP); - OrigExitingVPV->replaceUsesWithIf( - FinalReductionResult, [](VPUser &User, unsigned) { - auto *R = dyn_cast(&User); - return R && R->getOpcode() == VPInstruction::ExtractFromEnd; - }); -/* OrigExitingVPV->replaceUsesWithIf(FinalReductionResult, [](VPUser &User,*/ - /*unsigned) {*/ - /*return match(&User, m_Binary(m_VPValue(),*/ - /*m_VPValue()));*/ - /*});*/ + OrigExitingVPV->replaceUsesWithIf(FinalReductionResult, [](VPUser &User, + unsigned) { + return match(&User, m_Binary(m_VPValue(), + m_VPValue())); + }); } VPlanTransforms::clearReductionWrapFlags(*Plan); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 62d408e786a4f..1be9a1af94e24 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -860,7 +860,9 @@ void VPIRInstruction::execute(VPTransformState &State) { assert(isa(getParent()) && "VPIRInstructions can only be placed in VPIRBasicBlocks"); - if (getNumOperands() == 1 && isa(&I)) { + assert((isa(&I) || getNumOperands() == 0) && + "Only PHINodes can have extra operands"); + 
if (getNumOperands() == 1) { VPValue *ExitValue = getOperand(0); auto Lane = vputils::isUniformAfterVectorization(ExitValue) ? VPLane::getFirstLane() @@ -882,6 +884,12 @@ void VPIRInstruction::execute(VPTransformState &State) { void VPIRInstruction::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "IR " << I; + + if (getNumOperands() != 0) { + O << " (extra operands: "; + printOperands(O, SlotTracker); + O << ")"; + } } #endif diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll index f1fb2f7f58e77..993b4b2a948e0 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -233,7 +233,6 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: Live-out i32 %recur = vp<[[RESUME_1_P]]> -; CHECK-NEXT: Live-out i32 %res = vp<[[RED_EX]]> ; CHECK-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 22f3c23b47e60..9b7051630347e 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -1040,7 +1040,6 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: Live-out i16 %for.1 = vp<[[RESUME_P]]> -; CHECK-NEXT: Live-out i16 %for.1.lcssa = vp<[[FOR_RESULT]]> ; CHECK-NEXT: } ; entry: From aab14636f2146153c064a3597e9f60d5627669b9 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 13 Sep 2024 21:56:14 +0100 Subject: [PATCH 5/7] !fixup address latest comments, thanks! 
--- .../Transforms/Vectorize/LoopVectorize.cpp | 29 ++++++++++--------- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 8 ++--- .../Transforms/Vectorize/VPlanVerifier.cpp | 11 +++++++ .../RISCV/vplan-vp-intrinsics-reduction.ll | 6 ++-- ...-order-recurrence-sink-replicate-region.ll | 2 +- .../LoopVectorize/vplan-printing.ll | 8 ++--- 6 files changed, 38 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 6232a7b1c16df..57e4da6562695 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8638,10 +8638,11 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); } -// Collect (ExitPhi, ExitingValue) pairs phis in the original exit block that -// are modeled in VPlan. Some exiting values are not modeled explicitly yet and -// won't be included. Those are un-truncated VPWidenIntOrFpInductionRecipe, -// VPWidenPointerInductionRecipe and induction increments. +// Collect (VPIRInstruction, ExitingValue) pairs for phis in the original exit +// block that are modeled in VPlan. Some exiting values are not modeled +// explicitly yet and won't be included. Those are un-truncated +// VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction +// increments. 
static MapVector collectUsersInExitBlock( Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan, const MapVector &Inductions) { @@ -8656,10 +8657,10 @@ static MapVector collectUsersInExitBlock( VPBasicBlock *ExitVPBB = cast(MiddleVPBB->getSuccessors()[0]); BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); for (VPRecipeBase &R : *ExitVPBB) { - auto *IR = dyn_cast(&R); - if (!IR) + auto *ExitIRI = dyn_cast(&R); + if (!ExitIRI) continue; - auto *ExitPhi = dyn_cast(&IR->getInstruction()); + auto *ExitPhi = dyn_cast(&ExitIRI->getInstruction()); if (!ExitPhi) break; Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB); @@ -8678,7 +8679,7 @@ static MapVector collectUsersInExitBlock( return P && Inductions.contains(P); }))) continue; - ExitingValuesToFix.insert({IR, V}); + ExitingValuesToFix.insert({ExitIRI, V}); } return ExitingValuesToFix; } @@ -8697,10 +8698,10 @@ static void addUsersInExitBlock( VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); // Introduce VPUsers modeling the exit values. - for (const auto &[IR, V] : ExitingValuesToFix) { + for (const auto &[ExitIRI, V] : ExitingValuesToFix) { // Pass live-in values used by exit phis directly through to the live-out. if (V->isLiveIn()) { - IR->addOperand(V); + ExitIRI->addOperand(V); continue; } @@ -8708,7 +8709,7 @@ static void addUsersInExitBlock( VPInstruction::ExtractFromEnd, {V, Plan.getOrAddLiveIn(ConstantInt::get( IntegerType::get(ExitBB->getContext(), 32), 1))}); - IR->addOperand(Ext); + ExitIRI->addOperand(Ext); } } @@ -8846,14 +8847,14 @@ static void addLiveOutsForFirstOrderRecurrences( // No edge from the middle block to the unique exit block has been inserted // and there is nothing to fix from vector loop; phis should have incoming // from scalar loop only. 
- for (const auto &[IR, V] : ExitingValuesToFix) { + for (const auto &[ExitIRI, V] : ExitingValuesToFix) { if (V != FOR) continue; VPValue *Ext = MiddleBuilder.createNaryOp( VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {}, "vector.recur.extract.for.phi"); - IR->addOperand(Ext); - ExitingValuesToFix.erase(IR); + ExitIRI->addOperand(Ext); + ExitingValuesToFix.erase(ExitIRI); } } } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 8a3242e164865..9068ccf519c55 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -868,9 +868,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, #endif void VPIRInstruction::execute(VPTransformState &State) { - assert(isa(getParent()) && - "VPIRInstructions can only be placed in VPIRBasicBlocks"); - assert((isa(&I) || getNumOperands() == 0) && "Only PHINodes can have extra operands"); if (getNumOperands() == 1) { @@ -888,6 +885,8 @@ void VPIRInstruction::execute(VPTransformState &State) { Phi->addIncoming(V, PredBB); } + // Advance the insert point after the wrapped IR instruction. This allows + // interleaving VPIRInstructions and other recipes. 
State.Builder.SetInsertPoint(I.getParent(), std::next(I.getIterator())); } @@ -897,7 +896,8 @@ void VPIRInstruction::print(raw_ostream &O, const Twine &Indent, O << Indent << "IR " << I; if (getNumOperands() != 0) { - O << " (extra operands: "; + assert(getNumOperands() == 1 && "can have at most 1 operand"); + O << " (extra operand: "; printOperands(O, SlotTracker); O << ")"; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index dfddb5b45f623..ee6f035f1f494 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -126,6 +126,17 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { RecipeNumbering[&R] = Cnt++; for (const VPRecipeBase &R : *VPBB) { + if (auto *IRI = dyn_cast(&R)) { + if (!isa(IRI->getParent())) { + errs() << "VPIRInstructions "; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + IRI->dump(); + errs() << " "; +#endif + errs() << "not in a VPIRBasicBlock!\n"; + return false; + } + } for (const VPValue *V : R.definedValues()) { for (const VPUser *U : V->users()) { auto *UI = dyn_cast(U); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index 803a5e6ecb733..11405a1c91158 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -60,7 +60,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb, scalar.ph ; IF-EVL-INLOOP-EMPTY: ; IF-EVL-INLOOP-NEXT: ir-bb: -; IF-EVL-INLOOP-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] +; IF-EVL-INLOOP-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[RDX_EX]]>) ; IF-EVL-INLOOP-NEXT: No successors ; IF-EVL-INLOOP-EMPTY: ; IF-EVL-INLOOP-NEXT: scalar.ph: @@ -99,7 +99,7 @@ define i32 
@reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-OUTLOOP-NEXT: Successor(s): ir-bb, scalar.ph ; NO-VP-OUTLOOP-EMPTY: ; NO-VP-OUTLOOP-NEXT: ir-bb: -; NO-VP-OUTLOOP-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] +; NO-VP-OUTLOOP-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[RDX_EX]]>) ; NO-VP-OUTLOOP-NEXT: No successors ; NO-VP-OUTLOOP-EMPTY: ; NO-VP-OUTLOOP-NEXT: scalar.ph: @@ -138,7 +138,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-INLOOP-NEXT: Successor(s): ir-bb, scalar.ph ; NO-VP-INLOOP-EMPTY: ; NO-VP-INLOOP-NEXT: ir-bb: -; NO-VP-INLOOP-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] +; NO-VP-INLOOP-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[RDX_EX]]>) ; NO-VP-INLOOP-NEXT: No successors ; NO-VP-INLOOP-EMPTY: ; NO-VP-INLOOP-NEXT: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll index b0533997e30a0..45545feffd325 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -227,7 +227,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb -; CHECK-NEXT: IR %res = phi i32 [ %and.red.next, %loop ] +; CHECK-NEXT: IR %res = phi i32 [ %and.red.next, %loop ] (extra operand: vp<[[RED_EX]]>) ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index b0dc09e79f864..26974c2307065 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -161,7 +161,7 @@ define float @print_reduction(i64 %n, ptr noalias %y) { ; CHECK-NEXT: 
Successor(s): ir-bb, scalar.ph ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb -; CHECK-NEXT: IR %red.next.lcssa = phi float [ %red.next, %for.body ] +; CHECK-NEXT: IR %red.next.lcssa = phi float [ %red.next, %for.body ] (extra operand: vp<[[RED_EX]]>) ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph @@ -443,7 +443,7 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb -; CHECK-NEXT: IR %muladd.lcssa = phi float [ %muladd, %for.body ] +; CHECK-NEXT: IR %muladd.lcssa = phi float [ %muladd, %for.body ] (extra operand: vp<[[RED_EX]]>) ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph @@ -666,7 +666,7 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) { ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb -; CHECK-NEXT: IR %lcssa = phi i32 [ %add, %loop ] +; CHECK-NEXT: IR %lcssa = phi i32 [ %add, %loop ] (extra operand: vp<[[EXIT]]>) ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph @@ -1036,7 +1036,7 @@ define i16 @print_first_order_recurrence_and_result(ptr %ptr) { ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb -; CHECK-NEXT: IR %for.1.lcssa = phi i16 [ %for.1, %loop ] +; CHECK-NEXT: IR %for.1.lcssa = phi i16 [ %for.1, %loop ] (extra operand: vp<[[FOR_RESULT]]>) ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph From 3dc696bad75a3b741fdfa24e2d48533b05177c82 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 14 Sep 2024 20:41:09 +0100 Subject: [PATCH 6/7] !fixup address latest comments, thanks! 
--- .../Transforms/Vectorize/LoopVectorize.cpp | 63 ++++++++----------- .../Transforms/Vectorize/VPlanVerifier.cpp | 14 ++--- 2 files changed, 32 insertions(+), 45 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 2fea600a29882..feac26b607762 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8630,12 +8630,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); } -// Collect (VPIRInstruction, ExitingValue) pairs for phis in the original exit -// block that are modeled in VPlan. Some exiting values are not modeled -// explicitly yet and won't be included. Those are un-truncated -// VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction -// increments. -static MapVector collectUsersInExitBlock( +// Collect VPIRInstructions for phis in the original exit block that are modeled in VPlan and add the exiting VPValue as operand. Some exiting values are not modeled explicitly yet and won't be included. Those are un-truncated VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction increments. +static SetVector collectUsersInExitBlock( Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan, const MapVector &Inductions) { auto *MiddleVPBB = @@ -8645,7 +8641,7 @@ static MapVector collectUsersInExitBlock( // from scalar loop only. 
if (MiddleVPBB->getNumSuccessors() != 2) return {}; - MapVector ExitingValuesToFix; + SetVector ExitUsersToFix; VPBasicBlock *ExitVPBB = cast(MiddleVPBB->getSuccessors()[0]); BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); for (VPRecipeBase &R : *ExitVPBB) { @@ -8671,16 +8667,16 @@ static MapVector collectUsersInExitBlock( return P && Inductions.contains(P); }))) continue; - ExitingValuesToFix.insert({ExitIRI, V}); + ExitUsersToFix.insert(ExitIRI); + ExitIRI->addOperand(V); } - return ExitingValuesToFix; + return ExitUsersToFix; } -// Add exit values to \p Plan. Extracts and VPLiveOuts are added for each entry -// in \p ExitingValuesToFix. +// Add exit values to \p Plan. Extracts are added for each entry in \p ExitUsersToFix if needed and their operands are updated. static void addUsersInExitBlock( - VPlan &Plan, MapVector &ExitingValuesToFix) { - if (ExitingValuesToFix.empty()) + VPlan &Plan, const SetVector &ExitUsersToFix) { + if (ExitUsersToFix.empty()) return; auto *MiddleVPBB = @@ -8689,19 +8685,18 @@ static void addUsersInExitBlock( cast(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock(); VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); - // Introduce VPUsers modeling the exit values. - for (const auto &[ExitIRI, V] : ExitingValuesToFix) { + // Introduce extract for exiting values and update the VPIRInstructions modeling the corresponding LCSSA phis. + for (VPIRInstruction *ExitIRI : ExitUsersToFix) { + VPValue *V = ExitIRI->getOperand(0); // Pass live-in values used by exit phis directly through to the live-out. - if (V->isLiveIn()) { - ExitIRI->addOperand(V); + if (V->isLiveIn()) continue; - } VPValue *Ext = B.createNaryOp( VPInstruction::ExtractFromEnd, {V, Plan.getOrAddLiveIn(ConstantInt::get( IntegerType::get(ExitBB->getContext(), 32), 1))}); - ExitIRI->addOperand(Ext); + ExitIRI->setOperand(0, Ext); } } @@ -8714,7 +8709,7 @@ static void addUsersInExitBlock( /// 2. 
Feed the penultimate value of recurrences to their LCSSA phi users in /// the original exit block using a VPLiveOut. static void addLiveOutsForFirstOrderRecurrences( - VPlan &Plan, MapVector &ExitingValuesToFix) { + VPlan &Plan, SetVector &ExitUsersToFix) { VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); // Start by finding out if middle block branches to scalar preheader, which is @@ -8731,14 +8726,14 @@ static void addLiveOutsForFirstOrderRecurrences( ExitBB = cast(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock(); ScalarPHVPBB = cast(MiddleVPBB->getSuccessors()[1]); - } else if (ExitingValuesToFix.empty()) { + } else if (ExitUsersToFix.empty()) { ScalarPHVPBB = cast(MiddleVPBB->getSingleSuccessor()); } else { ExitBB = cast(MiddleVPBB->getSingleSuccessor()) ->getIRBasicBlock(); } if (!ScalarPHVPBB) { - assert(ExitingValuesToFix.empty() && + assert(ExitUsersToFix.empty() && "missed inserting extracts for exiting values"); return; } @@ -8832,21 +8827,16 @@ static void addLiveOutsForFirstOrderRecurrences( auto *FORPhi = cast(FOR->getUnderlyingInstr()); Plan.addLiveOut(FORPhi, ResumePhiRecipe); - // Now create VPLiveOuts for users in the exit block. - // Extract the penultimate value of the recurrence and add VPLiveOut - // users of the recurrence splice. - - // No edge from the middle block to the unique exit block has been inserted - // and there is nothing to fix from vector loop; phis should have incoming - // from scalar loop only. - for (const auto &[ExitIRI, V] : ExitingValuesToFix) { - if (V != FOR) + // Now update VPIRInstructions modeling LCSSA phis in the exit block. + // Extract the penultimate value of the recurrence and use it as operand for the VPIRInstruction modeling the phi. 
+ for (VPIRInstruction *ExitIRI : ExitUsersToFix) { + if (ExitIRI->getOperand(0) != FOR) continue; VPValue *Ext = MiddleBuilder.createNaryOp( VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {}, "vector.recur.extract.for.phi"); - ExitIRI->addOperand(Ext); - ExitingValuesToFix.erase(ExitIRI); + ExitIRI->setOperand(0, Ext); + ExitUsersToFix.remove(ExitIRI); } } } @@ -9008,12 +8998,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { "VPBasicBlock"); RecipeBuilder.fixHeaderPhis(); - MapVector ExitingValuesToFix = + SetVector ExitUsersToFix = collectUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars()); - - addLiveOutsForFirstOrderRecurrences(*Plan, ExitingValuesToFix); - addUsersInExitBlock(*Plan, ExitingValuesToFix); + addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix); + addUsersInExitBlock(*Plan, ExitUsersToFix); // --------------------------------------------------------------------------- // Transform initial VPlan: Apply previously taken decisions, in order, to diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index ee6f035f1f494..0870671e67190 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -126,16 +126,14 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { RecipeNumbering[&R] = Cnt++; for (const VPRecipeBase &R : *VPBB) { - if (auto *IRI = dyn_cast(&R)) { - if (!isa(IRI->getParent())) { - errs() << "VPIRInstructions "; + if (isa(&R) ^ isa(VPBB)) { + errs() << "VPIRInstructions "; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - IRI->dump(); - errs() << " "; + R.dump(); + errs() << " "; #endif - errs() << "not in a VPIRBasicBlock!\n"; - return false; - } + errs() << "not in a VPIRBasicBlock!\n"; + return false; } for (const VPValue *V : R.definedValues()) { for (const VPUser *U : V->users()) { From f66d27e5ae2c39d657c180555116e0427c24ef1a 
Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 14 Sep 2024 20:45:18 +0100 Subject: [PATCH 7/7] !fixup formatting and switch case ordering. --- .../Transforms/Vectorize/LoopVectorize.cpp | 25 ++++++++++++------- llvm/lib/Transforms/Vectorize/VPlan.h | 2 +- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index feac26b607762..e7ed6ad5d4746 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8630,7 +8630,11 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); } -// Collect VPIRInstructions for phis in the original exit block that are modeled in VPlan and add the exiting VPValue as operand. Some exiting values are not modeled explicitly yet and won't be included. Those are un-truncated VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction increments. +// Collect VPIRInstructions for phis in the original exit block that are modeled +// in VPlan and add the exiting VPValue as operand. Some exiting values are not +// modeled explicitly yet and won't be included. Those are un-truncated +// VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction +// increments. static SetVector collectUsersInExitBlock( Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan, const MapVector &Inductions) { @@ -8673,9 +8677,11 @@ static SetVector collectUsersInExitBlock( return ExitUsersToFix; } -// Add exit values to \p Plan. Extracts are added for each entry in \p ExitUsersToFix if needed and their operands are updated. -static void addUsersInExitBlock( - VPlan &Plan, const SetVector &ExitUsersToFix) { +// Add exit values to \p Plan. Extracts are added for each entry in \p +// ExitUsersToFix if needed and their operands are updated.
+static void +addUsersInExitBlock(VPlan &Plan, + const SetVector &ExitUsersToFix) { if (ExitUsersToFix.empty()) return; @@ -8685,7 +8691,8 @@ static void addUsersInExitBlock( cast(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock(); VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); - // Introduce extract for exiting values and update the VPIRInstructions modeling the corresponding LCSSA phis. + // Introduce extract for exiting values and update the VPIRInstructions + // modeling the corresponding LCSSA phis. for (VPIRInstruction *ExitIRI : ExitUsersToFix) { VPValue *V = ExitIRI->getOperand(0); // Pass live-in values used by exit phis directly through to the live-out. @@ -8828,7 +8835,8 @@ static void addLiveOutsForFirstOrderRecurrences( Plan.addLiveOut(FORPhi, ResumePhiRecipe); // Now update VPIRInstructions modeling LCSSA phis in the exit block. - // Extract the penultimate value of the recurrence and use it as operand for the VPIRInstruction modeling the phi. + // Extract the penultimate value of the recurrence and use it as operand for + // the VPIRInstruction modeling the phi. 
for (VPIRInstruction *ExitIRI : ExitUsersToFix) { if (ExitIRI->getOperand(0) != FOR) continue; @@ -8998,9 +9006,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { "VPBasicBlock"); RecipeBuilder.fixHeaderPhis(); - SetVector ExitUsersToFix = - collectUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan, - Legal->getInductionVars()); + SetVector ExitUsersToFix = collectUsersInExitBlock( + OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars()); addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix); addUsersInExitBlock(*Plan, ExitUsersToFix); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 3f51d36736788..cff3b7514857c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -936,9 +936,9 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPReductionPHISC: case VPRecipeBase::VPScalarCastSC: return true; + case VPRecipeBase::VPBranchOnMaskSC: case VPRecipeBase::VPInterleaveSC: case VPRecipeBase::VPIRInstructionSC: - case VPRecipeBase::VPBranchOnMaskSC: case VPRecipeBase::VPWidenLoadEVLSC: case VPRecipeBase::VPWidenLoadSC: case VPRecipeBase::VPWidenStoreEVLSC: