diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index dec7a87ba9c50..ad3d3ba161fbb 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9051,7 +9051,6 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan, static SetVector collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan) { - auto *MiddleVPBB = Plan.getMiddleBlock(); SetVector ExitUsersToFix; for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) { for (VPRecipeBase &R : *ExitVPBB) { @@ -9061,33 +9060,33 @@ collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder, auto *ExitPhi = dyn_cast(&ExitIRI->getInstruction()); if (!ExitPhi) break; - for (VPBlockBase *PredVPBB : ExitVPBB->getPredecessors()) { - BasicBlock *ExitingBB = OrigLoop->getLoopLatch(); - if (PredVPBB != MiddleVPBB) { - SmallVector ExitingBlocks; - OrigLoop->getExitingBlocks(ExitingBlocks); - assert(ExitingBlocks.size() == 2 && "only support 2 exiting blocks"); - ExitingBB = ExitingBB == ExitingBlocks[0] ? 
ExitingBlocks[1] - : ExitingBlocks[0]; - } - Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB); - VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue); - ExitUsersToFix.insert(ExitIRI); - ExitIRI->addOperand(V); + if (ExitVPBB->getSinglePredecessor() != Plan.getMiddleBlock()) { + assert(ExitIRI->getNumOperands() == + ExitVPBB->getPredecessors().size() && + "early-exit must update exit values on construction"); + continue; } + BasicBlock *ExitingBB = OrigLoop->getLoopLatch(); + Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB); + VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue); + ExitIRI->addOperand(V); + if (V->isLiveIn()) + continue; + assert(V->getDefiningRecipe()->getParent()->getEnclosingLoopRegion() && + "Only recipes defined inside a region should need fixing."); + ExitUsersToFix.insert(ExitIRI); } } return ExitUsersToFix; } // Add exit values to \p Plan. Extracts are added for each entry in \p -// ExitUsersToFix if needed and their operands are updated. Returns true if all -// exit users can be handled, otherwise return false. -static bool +// ExitUsersToFix if needed and their operands are updated. +static void addUsersInExitBlocks(VPlan &Plan, const SetVector &ExitUsersToFix) { if (ExitUsersToFix.empty()) - return true; + return; auto *MiddleVPBB = Plan.getMiddleBlock(); VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); @@ -9096,25 +9095,12 @@ addUsersInExitBlocks(VPlan &Plan, // Introduce extract for exiting values and update the VPIRInstructions // modeling the corresponding LCSSA phis. for (VPIRInstruction *ExitIRI : ExitUsersToFix) { - for (const auto &[Idx, Op] : enumerate(ExitIRI->operands())) { - // Pass live-in values used by exit phis directly through to their users - // in the exit block. - if (Op->isLiveIn()) - continue; - - // Currently only live-ins can be used by exit values from blocks not - // exiting via the vector latch through to the middle block. 
- if (ExitIRI->getParent()->getSinglePredecessor() != MiddleVPBB) - return false; - - LLVMContext &Ctx = ExitIRI->getInstruction().getContext(); - VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd, - {Op, Plan.getOrAddLiveIn(ConstantInt::get( - IntegerType::get(Ctx, 32), 1))}); - ExitIRI->setOperand(Idx, Ext); - } + assert(ExitIRI->getNumOperands() == 1 && + ExitIRI->getParent()->getSinglePredecessor() == MiddleVPBB && + "exit values from early exits must be fixed when branch to " + "early-exit is added"); + ExitIRI->extractLastLaneOfOperand(B); } - return true; } /// Handle users in the exit block for first order reductions in the original @@ -9410,20 +9396,21 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { if (auto *UncountableExitingBlock = Legal->getUncountableEarlyExitingBlock()) { - VPlanTransforms::handleUncountableEarlyExit( - *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder); + if (!VPlanTransforms::handleUncountableEarlyExit( + *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, + RecipeBuilder)) { + reportVectorizationFailure( + "Some exit values in loop with uncountable exit not supported yet", + "UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop); + return nullptr; + } } DenseMap IVEndValues; addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues); SetVector ExitUsersToFix = collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan); addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix); - if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) { - reportVectorizationFailure( - "Some exit values in loop with uncountable exit not supported yet", - "UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop); - return nullptr; - } + addUsersInExitBlocks(*Plan, ExitUsersToFix); // --------------------------------------------------------------------------- // Transform initial VPlan: Apply previously taken decisions, in order, to diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h 
b/llvm/lib/Transforms/Vectorize/VPlan.h index 16c64f32ab634..fcfc172b82897 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -60,6 +60,7 @@ class RecurrenceDescriptor; class SCEV; class Type; class VPBasicBlock; +class VPBuilder; class VPRegionBlock; class VPlan; class VPReplicateRecipe; @@ -1428,6 +1429,11 @@ class VPIRInstruction : public VPRecipeBase { "Op must be an operand of the recipe"); return true; } + + /// Update the recipe's single operand to the last lane of the operand using \p + /// Builder. Must only be used for single operand VPIRInstructions wrapping a + /// PHINode. + void extractLastLaneOfOperand(VPBuilder &Builder); }; /// VPWidenRecipe is a recipe for producing a widened instruction using the diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 7b5d0d70933fd..1430f2d27c2b4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -11,6 +11,7 @@ /// //===----------------------------------------------------------------------===// +#include "LoopVectorizationPlanner.h" #include "VPlan.h" #include "VPlanAnalysis.h" #include "VPlanPatternMatch.h" @@ -937,6 +938,22 @@ InstructionCost VPIRInstruction::computeCost(ElementCount VF, return 0; } +void VPIRInstruction::extractLastLaneOfOperand(VPBuilder &Builder) { + assert(isa(getInstruction()) && + "can only add exiting operands to phi nodes"); + assert(getNumOperands() == 1 && "must have a single operand"); + VPValue *Exiting = getOperand(0); + if (!Exiting->isLiveIn()) { + LLVMContext &Ctx = getInstruction().getContext(); + auto &Plan = *getParent()->getPlan(); + Exiting = Builder.createNaryOp( + VPInstruction::ExtractFromEnd, + {Exiting, + Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::get(Ctx, 32), 1))}); + } + setOperand(0, Exiting); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPIRInstruction::print(raw_ostream &O, 
const Twine &Indent, VPSlotTracker &SlotTracker) const { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 9febd612c644e..714250a56ff57 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2062,7 +2062,7 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) { } } -void VPlanTransforms::handleUncountableEarlyExit( +bool VPlanTransforms::handleUncountableEarlyExit( VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop, BasicBlock *UncountableExitingBlock, VPRecipeBuilder &RecipeBuilder) { VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); @@ -2103,7 +2103,32 @@ void VPlanTransforms::handleUncountableEarlyExit( VPBlockUtils::connectBlocks(NewMiddle, VPEarlyExitBlock); NewMiddle->swapSuccessors(); + // Update the exit phis in the early exit block. VPBuilder MiddleBuilder(NewMiddle); + for (VPRecipeBase &R : *VPEarlyExitBlock) { + auto *ExitIRI = cast(&R); + auto *ExitPhi = dyn_cast(&ExitIRI->getInstruction()); + if (!ExitPhi) + break; + + VPValue *IncomingFromEarlyExit = RecipeBuilder.getVPValueOrAddLiveIn( + ExitPhi->getIncomingValueForBlock(UncountableExitingBlock)); + // The incoming value from the early exit must be a live-in for now. + if (!IncomingFromEarlyExit->isLiveIn()) + return false; + + if (OrigLoop->getUniqueExitBlock()) { + // If there's a unique exit block, VPEarlyExitBlock has 2 predecessors + // (MiddleVPBB and NewMiddle). Add the incoming value from MiddleVPBB + // which is coming from the original latch. + VPValue *IncomingFromLatch = RecipeBuilder.getVPValueOrAddLiveIn( + ExitPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch())); + ExitIRI->addOperand(IncomingFromLatch); + ExitIRI->extractLastLaneOfOperand(MiddleBuilder); + } + // Add the incoming value from the early exit. 
+ ExitIRI->addOperand(IncomingFromEarlyExit); + } MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken}); // Replace the condition controlling the non-early exit from the vector loop @@ -2119,4 +2144,5 @@ void VPlanTransforms::handleUncountableEarlyExit( Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken}); Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken); LatchExitingBranch->eraseFromParent(); + return true; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index a751b8b5e8dc5..b31fef5d62456 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -130,7 +130,7 @@ struct VPlanTransforms { /// exit conditions /// * splitting the original middle block to branch to the early exit block /// if taken. - static void handleUncountableEarlyExit(VPlan &Plan, ScalarEvolution &SE, + static bool handleUncountableEarlyExit(VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop, BasicBlock *UncountableExitingBlock, VPRecipeBuilder &RecipeBuilder); diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll index 6e542bd873b8c..56d0871feacd3 100644 --- a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll @@ -217,21 +217,50 @@ define i64 @same_exit_block_pre_inc_use2() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ 
[[VEC_IND_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true) +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3 +; CHECK-NEXT: br i1 [[TMP7]], label [[LOOP_END:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], 
align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[LOOP]] ], [ [[INDEX]], [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[LOOP1]] ], [ [[INDEX]], [[LOOP_INC]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 67, [[MIDDLE_SPLIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -548,7 +577,7 @@ define i64 @diff_exit_block_pre_inc_use2() { ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.split: ; CHECK-NEXT: br i1 [[TMP7]], label [[LOOP_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: @@ -568,7 +597,7 @@ define i64 @diff_exit_block_pre_inc_use2() { ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: loop.early.exit: ; CHECK-NEXT: 
[[RETVAL1:%.*]] = phi i64 [ 67, [[LOOP1]] ], [ 67, [[MIDDLE_SPLIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL1]] @@ -1029,4 +1058,6 @@ attributes #0 = { "vector-function-abi-variant"="_ZGVsNxv_foo(foo_vec)" } ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} ;.