diff --git a/llvm/docs/Vectorizers.rst b/llvm/docs/Vectorizers.rst index f134a6df94a69..d19d03050436a 100644 --- a/llvm/docs/Vectorizers.rst +++ b/llvm/docs/Vectorizers.rst @@ -405,9 +405,11 @@ Early Exit Vectorization When vectorizing a loop with a single early exit, the loop blocks following the early exit are predicated and the vector loop will always exit via the latch. If the early exit has been taken, the vector loop's successor block -(``middle.split`` below) branches to the early exit block. Otherwise -``middle.block`` selects between the exit block from the latch or the scalar -remainder loop. +(``middle.split`` below) branches to the early exit block via an intermediate +block (``vector.early.exit`` below). This intermediate block is responsible for +calculating any exit values of loop-defined variables that are used in the +early exit block. Otherwise, ``middle.block`` selects between the exit block +from the latch or the scalar remainder loop. .. image:: vplan-early-exit.png diff --git a/llvm/docs/vplan-early-exit.dot b/llvm/docs/vplan-early-exit.dot index 63490b0cdb2e4..980fc6939b6d4 100644 --- a/llvm/docs/vplan-early-exit.dot +++ b/llvm/docs/vplan-early-exit.dot @@ -19,23 +19,27 @@ compound=true "middle.split" ] N4 -> N5 [ label=""] - N4 -> N6 [ label=""] + N4 -> N7 [ label=""] N5 [label = - "early.exit" + "vector.early.exit" ] + N5 -> N6 [ label=""] N6 [label = - "middle.block" + "early.exit" ] - N6 -> N9 [ label=""] - N6 -> N7 [ label=""] N7 [label = - "scalar.ph" + "middle.block" ] + N7 -> N10 [ label=""] N7 -> N8 [ label=""] N8 [label = - "loop.header" + "scalar.ph" ] + N8 -> N9 [ label=""] N9 [label = + "loop.header" + ] + N10 [label = "latch.exit" ] } diff --git a/llvm/docs/vplan-early-exit.png b/llvm/docs/vplan-early-exit.png index 3cd293bcdbcc8..249b90c69e2b8 100644 Binary files a/llvm/docs/vplan-early-exit.png and b/llvm/docs/vplan-early-exit.png differ diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 57b7358049bce..73964d0fbfc40 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9396,14 +9396,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { if (auto *UncountableExitingBlock = Legal->getUncountableEarlyExitingBlock()) { - if (!VPlanTransforms::handleUncountableEarlyExit( - *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, - RecipeBuilder)) { - reportVectorizationFailure( - "Some exit values in loop with uncountable exit not supported yet", - "UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop); - return nullptr; - } + VPlanTransforms::handleUncountableEarlyExit( + *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder); } DenseMap IVEndValues; addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 35da93ee3b407..3c381307c237c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -500,8 +500,15 @@ void VPBasicBlock::execute(VPTransformState *State) { UnreachableInst *Terminator = State->Builder.CreateUnreachable(); // Register NewBB in its loop. In innermost loops its the same for all // BB's. - if (State->CurrentParentLoop) - State->CurrentParentLoop->addBasicBlockToLoop(NewBB, *State->LI); + Loop *ParentLoop = State->CurrentParentLoop; + // If this block has a sole successor that is an exit block then it needs + // adding to the same parent loop as the exit block. + VPBlockBase *SuccVPBB = getSingleSuccessor(); + if (SuccVPBB && State->Plan->isExitBlock(SuccVPBB)) + ParentLoop = State->LI->getLoopFor( + cast(SuccVPBB)->getIRBasicBlock()); + if (ParentLoop) + ParentLoop->addBasicBlockToLoop(NewBB, *State->LI); State->Builder.SetInsertPoint(Terminator); State->CFG.PrevBB = NewBB; @@ -949,6 +956,10 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, } } +bool VPlan::isExitBlock(VPBlockBase *VPBB) { + return isa(VPBB) && VPBB->getNumSuccessors() == 0; +} + /// Generate the code inside the preheader and body of the vectorized loop. /// Assumes a single pre-header basic-block was created for this. Introduce /// additional basic-blocks as needed, and fill them all. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index a1ff684b2b801..459222234bc37 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1223,6 +1223,9 @@ class VPInstruction : public VPRecipeWithIRFlags, // Returns a scalar boolean value, which is true if any lane of its (only // boolean) vector operand is true. AnyOf, + // Extracts the first active lane of a vector, where the first operand is + // the predicate, and the second operand is the vector to extract. + ExtractFirstActive, }; private: @@ -3967,6 +3970,9 @@ class VPlan { /// of VPBlockShallowTraversalWrapper. auto getExitBlocks(); + /// Returns true if \p VPBB is an exit block. + bool isExitBlock(VPBlockBase *VPBB); + /// The trip count of the original loop. VPValue *getTripCount() const { assert(TripCount && "trip count needs to be set before accessing it"); diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 27357ff04b5f2..71fb6d42116cf 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -78,6 +78,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::AnyOf: return SetResultTyFromOp(); + case VPInstruction::ExtractFirstActive: case VPInstruction::ExtractFromEnd: { Type *BaseTy = inferScalarType(R->getOperand(0)); if (auto *VecTy = dyn_cast(BaseTy)) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 2679ed6b26b5d..81031b9401ca0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -697,7 +697,13 @@ Value *VPInstruction::generate(VPTransformState &State) { Value *A = State.get(getOperand(0)); return Builder.CreateOrReduce(A); } - + case VPInstruction::ExtractFirstActive: { + Value *Vec = State.get(getOperand(0)); + Value *Mask = State.get(getOperand(1)); + Value *Ctz = Builder.CreateCountTrailingZeroElems( + Builder.getInt64Ty(), Mask, true, "first.active.lane"); + return Builder.CreateExtractElement(Vec, Ctz, "early.exit.value"); + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -705,6 +711,7 @@ Value *VPInstruction::generate(VPTransformState &State) { bool VPInstruction::isVectorToScalar() const { return getOpcode() == VPInstruction::ExtractFromEnd || + getOpcode() == VPInstruction::ExtractFirstActive || getOpcode() == VPInstruction::ComputeReductionResult || getOpcode() == VPInstruction::AnyOf; } @@ -769,6 +776,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::ExtractFromEnd: + case VPInstruction::ExtractFirstActive: case VPInstruction::FirstOrderRecurrenceSplice: case VPInstruction::LogicalAnd: case VPInstruction::Not: @@ -888,6 +896,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::AnyOf: O << "any-of"; break; + case VPInstruction::ExtractFirstActive: + O << "extract-first-active"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 714250a56ff57..5b2ef68fff056 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2062,7 +2062,7 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) { } } -bool VPlanTransforms::handleUncountableEarlyExit( +void VPlanTransforms::handleUncountableEarlyExit( VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop, BasicBlock *UncountableExitingBlock, VPRecipeBuilder &RecipeBuilder) { VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); @@ -2099,12 +2099,17 @@ bool VPlanTransforms::handleUncountableEarlyExit( Builder.createNaryOp(VPInstruction::AnyOf, {EarlyExitTakenCond}); VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split"); + VPBasicBlock *VectorEarlyExitVPBB = + Plan.createVPBasicBlock("vector.early.exit"); VPBlockUtils::insertOnEdge(LoopRegion, MiddleVPBB, NewMiddle); - VPBlockUtils::connectBlocks(NewMiddle, VPEarlyExitBlock); + VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB); NewMiddle->swapSuccessors(); + VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, VPEarlyExitBlock); + // Update the exit phis in the early exit block. VPBuilder MiddleBuilder(NewMiddle); + VPBuilder EarlyExitB(VectorEarlyExitVPBB); for (VPRecipeBase &R : *VPEarlyExitBlock) { auto *ExitIRI = cast(&R); auto *ExitPhi = dyn_cast(&ExitIRI->getInstruction()); @@ -2113,9 +2118,6 @@ bool VPlanTransforms::handleUncountableEarlyExit( VPValue *IncomingFromEarlyExit = RecipeBuilder.getVPValueOrAddLiveIn( ExitPhi->getIncomingValueForBlock(UncountableExitingBlock)); - // The incoming value from the early exit must be a live-in for now. - if (!IncomingFromEarlyExit->isLiveIn()) - return false; if (OrigLoop->getUniqueExitBlock()) { // If there's a unique exit block, VPEarlyExitBlock has 2 predecessors @@ -2127,6 +2129,10 @@ bool VPlanTransforms::handleUncountableEarlyExit( ExitIRI->extractLastLaneOfOperand(MiddleBuilder); } // Add the incoming value from the early exit. + if (!IncomingFromEarlyExit->isLiveIn()) + IncomingFromEarlyExit = + EarlyExitB.createNaryOp(VPInstruction::ExtractFirstActive, + {IncomingFromEarlyExit, EarlyExitTakenCond}); ExitIRI->addOperand(IncomingFromEarlyExit); } MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken}); @@ -2144,5 +2150,4 @@ bool VPlanTransforms::handleUncountableEarlyExit( Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken}); Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken); LatchExitingBranch->eraseFromParent(); - return true; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index b31fef5d62456..a751b8b5e8dc5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -130,7 +130,7 @@ struct VPlanTransforms { /// exit conditions /// * splitting the original middle block to branch to the early exit block /// if taken. - static bool handleUncountableEarlyExit(VPlan &Plan, ScalarEvolution &SE, + static void handleUncountableEarlyExit(VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop, BasicBlock *UncountableExitingBlock, VPRecipeBuilder &RecipeBuilder); diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 0f151c897d938..96156de444f88 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -209,7 +209,9 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { auto *UI = cast(U); // TODO: check dominance of incoming values for phis properly. if (!UI || - isa(UI)) + isa(UI) || + (isa(UI) && + isa(cast(UI)->getInstruction()))) continue; // If the user is in the same block, check it comes after R in the diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll index ca3a5722affe9..9d21ea0ab6de3 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll @@ -13,21 +13,70 @@ define i64 @same_exit_block_pre_inc_use1() #1 { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 4 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 3, [[N_VEC]] +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv16i64() +; CHECK-NEXT: [[TMP8:%.*]] = mul [[TMP7]], splat (i64 1) +; CHECK-NEXT: [[INDUCTION:%.*]] = add splat (i64 3), [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP5]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP14]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], [[TMP5]] +; CHECK-NEXT: [[TMP16:%.*]] = xor [[TMP15]], splat (i1 true) +; CHECK-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP16]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP19:%.*]] = or i1 [[TMP17]], [[TMP18]] +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP17]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP16]], i1 true) +; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -66,19 +115,48 @@ define i64 @same_exit_block_pre_inc_use4() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i64], align 8 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <2 x i64> [[VEC_IND]], [[WIDE_LOAD]] +; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true) +; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP4]], i1 true) +; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <2 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i64, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i64 [[INDEX]], [[LD1]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -114,20 +192,50 @@ define i64 @loop_contains_safe_call() #1 { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 4 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[P1]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP4:%.*]] = fcmp fast ult <4 x float> [[TMP3]], splat (float 3.000000e+00) +; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP6]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true) +; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load float, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[SQRT:%.*]] = tail call fast float @llvm.sqrt.f32(float [[LD1]]) ; CHECK-NEXT: [[CMP:%.*]] = fcmp fast ult float [[SQRT]], 3.000000e+00 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END]] ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -164,20 +272,50 @@ define i64 @loop_contains_safe_div() #1 { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 4 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = udiv <2 x i32> [[WIDE_LOAD]], splat (i32 20000) +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i32> [[TMP3]], splat (i32 1) +; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP6]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP5]], i1 true) +; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <2 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[LD1]], 20000 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[DIV]], 1 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END]] ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -216,21 +354,54 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 4 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 1) +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P2]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true) +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[WIDE_LOAD2]], i32 3 +; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LD1]], 1 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END]] ; CHECK: loop.inc: ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[P2]], i64 [[INDEX]] ; CHECK-NEXT: [[LD2:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[LD2]], [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[LD2]], [[LOOP_INC]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -305,9 +476,11 @@ define i32 @diff_exit_block_needs_scev_check(i32 %end) { ; CHECK-NEXT: [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]]) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP15]], [[TMP16]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.split: -; CHECK-NEXT: br i1 [[TMP15]], label [[FOUND:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK-NEXT: br i1 [[TMP15]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: br label [[FOUND:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -329,7 +502,7 @@ define i32 @diff_exit_block_needs_scev_check(i32 %end) { ; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[IND_NEXT]] to i32 ; CHECK-NEXT: [[GEP_IND_NEXT]] = add i64 [[GEP_IND]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[CONV]], [[END_CLAMPED]] -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: found: ; CHECK-NEXT: ret i32 1 ; CHECK: exit: @@ -420,5 +593,15 @@ attributes #1 = { "target-features"="+sve" vscale_range(1,16) } ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll index 6d365b9d77e80..de455c81d363e 100644 --- a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll +++ b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll @@ -49,7 +49,7 @@ define i64 @same_exit_block_pre_inc_use1() { ; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1' ; CHECK: LV: Found an early exit loop with symbolic max backedge taken count: 63 ; CHECK-NEXT: LV: We can vectorize this loop! -; CHECK: LV: Not vectorizing: Some exit values in loop with uncountable exit not supported yet. +; CHECK-NOT: LV: Not vectorizing entry: %p1 = alloca [1024 x i8] %p2 = alloca [1024 x i8] @@ -141,7 +141,7 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align( ; CHECK-LABEL: LV: Checking a loop in 'loop_contains_load_after_early_exit' ; CHECK: LV: Found an early exit loop with symbolic max backedge taken count: 63 ; CHECK-NEXT: LV: We can vectorize this loop! -; CHECK: LV: Not vectorizing: Some exit values in loop with uncountable exit not supported yet. +; CHECK-NOT: LV: Not vectorizing entry: %p1 = alloca [1024 x i8] call void @init_mem(ptr %p1, i64 1024) diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll index 67a64a90c1179..4bcf8e0180d63 100644 --- a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll @@ -32,7 +32,9 @@ define i64 @same_exit_block_phi_of_consts() { ; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.split: -; CHECK-NEXT: br i1 [[TMP7]], label [[LOOP_END:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -51,7 +53,7 @@ define i64 @same_exit_block_phi_of_consts() { ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 0, [[LOOP]] ], [ 1, [[LOOP_INC]] ], [ 1, [[MIDDLE_BLOCK]] ], [ 0, [[MIDDLE_SPLIT]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 0, [[LOOP]] ], [ 1, [[LOOP_INC]] ], [ 1, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -109,7 +111,9 @@ define i64 @diff_exit_block_phi_of_consts() { ; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.split: -; CHECK-NEXT: br i1 [[TMP7]], label [[LOOP_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -206,7 +210,9 @@ define i32 @diff_exit_block_needs_scev_check(i32 %end) { ; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP15]], [[TMP16]] ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.split: -; CHECK-NEXT: br i1 [[TMP15]], label [[FOUND:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK-NEXT: br i1 [[TMP15]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: br label [[FOUND:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -291,7 +297,9 @@ define i32 @diff_blocks_invariant_early_exit_cond(ptr %s) { ; CHECK-NEXT: [[TMP3:%.*]] = or i1 [[TMP1]], [[TMP2]] ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.split: -; CHECK-NEXT: br i1 [[TMP1]], label [[EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK-NEXT: br i1 [[TMP1]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: br label [[EARLY_EXIT:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll index f03c0f544c294..1bfe054057089 100644 --- a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll @@ -10,21 +10,53 @@ define i64 @same_exit_block_pre_inc_use1() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true) +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -62,22 +94,55 @@ define i64 @same_exit_block_pre_inc1_use_inv_cond(i1 %cond) { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[COND]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP6]], splat (i1 true) +; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP7]], i1 true) +; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] ; CHECK-NEXT: [[CMP4:%.*]] = select i1 [[COND]], i1 [[CMP3]], i1 false -; CHECK-NEXT: br i1 [[CMP4]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP4]], label [[LOOP_INC]], label [[LOOP_END]] ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -115,21 +180,53 @@ define i64 @same_exit_block_pre_inc_use1_gep_two_indices() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P1]], i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P2]], i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true) +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P1]], i64 0, i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P2]], i64 0, i64 [[INDEX]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -166,21 +263,53 @@ define i64 @same_exit_block_pre_inc_use1_alloca_diff_type() { ; CHECK-NEXT: [[P2:%.*]] = alloca [40 x i32], align 4 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true) +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -238,10 +367,12 @@ define i64 @same_exit_block_pre_inc_use2() { ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.split: ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3 -; CHECK-NEXT: br i1 [[TMP7]], label [[LOOP_END:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -258,9 +389,9 @@ define i64 @same_exit_block_pre_inc_use2() { ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[LOOP]] ], [ [[INDEX]], [[LOOP_INC]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 67, [[MIDDLE_SPLIT]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[LOOP]] ], [ [[INDEX]], [[LOOP_INC]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 67, [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -297,21 +428,54 @@ define i64 @same_exit_block_pre_inc_use3() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true) +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3 +; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ [[INDEX]], [[LOOP]] ] +; CHECK-NEXT: [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ [[INDEX]], [[LOOP]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[INDEX_LCSSA]] ; entry: @@ -349,19 +513,48 @@ define i64 @same_exit_block_pre_inc_use4() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i64], align 8 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i64> [[VEC_IND]], [[WIDE_LOAD]] +; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true) +; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true) +; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i64, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i64 [[INDEX]], [[LD1]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -396,21 +589,60 @@ define i64 @same_exit_block_post_inc_use() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP3]], 1 +; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true) +; CHECK-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]]) +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP16:%.*]] = or i1 [[TMP14]], [[TMP15]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP14]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) +; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[INDEX_NEXT]], [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[INDEX_NEXT]], [[LOOP_INC]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -447,21 +679,65 @@ define i64 @same_exit_block_post_inc_use2() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP3]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i64> poison, i64 [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[TMP10]], i32 2 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP11]], i32 3 +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP17:%.*]] = xor <4 x i1> [[TMP16]], splat (i1 true) +; CHECK-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]]) +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP20:%.*]] = or i1 [[TMP18]], [[TMP19]] +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3 +; CHECK-NEXT: br i1 [[TMP18]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 true) +; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[TMP15]], i64 [[FIRST_ACTIVE_LANE]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] ; CHECK: loop.inc: ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP]] ], [ [[INDEX]], [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP]] ], [ [[INDEX]], [[LOOP_INC]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -498,24 +774,56 @@ define i64 @diff_exit_block_pre_inc_use1() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true) +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]] +; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT]] ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: loop.early.exit: -; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ] +; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL1]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[RETVAL2]] ; entry: @@ -577,9 +885,11 @@ define i64 @diff_exit_block_pre_inc_use2() { ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.split: -; CHECK-NEXT: br i1 [[TMP7]], label [[LOOP_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3 ; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]] @@ -597,9 +907,9 @@ define i64 @diff_exit_block_pre_inc_use2() { ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: loop.early.exit: -; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ 67, [[LOOP]] ], [ 67, [[MIDDLE_SPLIT]] ] +; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ 67, [[LOOP]] ], [ 67, [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL1]] ; CHECK: loop.end: ; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] @@ -643,24 +953,57 @@ define i64 @diff_exit_block_pre_inc_use3() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX2]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD3]] +; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX2]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true) +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]] +; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3 +; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT]] ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: loop.early.exit: -; CHECK-NEXT: [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ] +; CHECK-NEXT: [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[INDEX_LCSSA]] ; CHECK: loop.end: -; CHECK-NEXT: [[INDEX_LCSSA1:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ] +; CHECK-NEXT: [[INDEX_LCSSA1:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[INDEX_LCSSA1]] ; entry: @@ -699,24 +1042,63 @@ define i64 @diff_exit_block_post_inc_use1() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP3]], 1 +; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true) +; CHECK-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]]) +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP16:%.*]] = or i1 [[TMP14]], [[TMP15]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP14]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) +; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]] +; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT]] ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP27:![0-9]+]] ; CHECK: loop.early.exit: -; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ] +; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL1]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP_INC]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[RETVAL2]] ; entry: @@ -757,24 +1139,68 @@ define i64 @diff_exit_block_post_inc_use2() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP3]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i64> [[TMP8]], i64 [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i64> [[TMP9]], i64 [[TMP6]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i64> [[TMP10]], i64 [[TMP7]], i32 3 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP17:%.*]] = xor <4 x i1> [[TMP16]], splat (i1 true) +; CHECK-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]]) +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP20:%.*]] = or i1 [[TMP18]], [[TMP19]] +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP18]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 true) +; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[TMP11]], i64 [[FIRST_ACTIVE_LANE]] +; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3 +; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT]] ; CHECK: loop.inc: ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP29:![0-9]+]] ; CHECK: loop.early.exit: -; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP]] ] +; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL1]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[RETVAL2]] ; entry: @@ -815,20 +1241,50 @@ define i64 @loop_contains_safe_call() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[P1]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP4:%.*]] = fcmp fast ult <4 x float> [[TMP3]], splat (float 3.000000e+00) +; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP6]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true) +; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load float, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[SQRT:%.*]] = tail call fast float @llvm.sqrt.f32(float [[LD1]]) ; CHECK-NEXT: [[CMP:%.*]] = fcmp fast ult float [[SQRT]], 3.000000e+00 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END]] ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP31:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -864,20 +1320,50 @@ define i64 @loop_contains_safe_div() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = udiv <4 x i32> [[WIDE_LOAD]], splat (i32 20000) +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[TMP3]], splat (i32 1) +; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP6]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true) +; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[LD1]], 20000 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[DIV]], 1 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END]] ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP33:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -912,21 +1398,54 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 1) +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P2]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true) +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[WIDE_LOAD2]], i32 3 +; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LD1]], 1 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END]] ; CHECK: loop.inc: ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[P2]], i64 [[INDEX]] ; CHECK-NEXT: [[LD2:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP35:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[LD2]], [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[LD2]], [[LOOP_INC]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -961,21 +1480,57 @@ define i64 @same_exit_block_pre_inc_use1_reverse() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX1]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 -3 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 -3 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP6]], align 1 +; CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD2]], <4 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i8> [[REVERSE]], [[REVERSE3]] +; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i1> [[TMP7]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP8]]) +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 1020 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) +; CHECK-NEXT: [[TMP11:%.*]] = or i1 [[TMP9]], [[TMP10]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP9]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP8]], i1 true) +; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 false, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 1023, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], -1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_END]], label [[LOOP]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_END]], label [[LOOP]], !llvm.loop [[LOOP37:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 1024, [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 1024, [[LOOP_INC]] ], [ 1024, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -1009,21 +1564,53 @@ define i64 @same_exit_block_pre_inc_use1_deref_ptrs(ptr dereferenceable(1024) %p ; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_deref_ptrs( ; CHECK-SAME: ptr dereferenceable(1024) [[P1:%.*]], ptr dereferenceable(1024) [[P2:%.*]]) { ; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true) +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: vector.early.exit: +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = extractelement <4 x i64> [[VEC_IND]], i64 [[FIRST_ACTIVE_LANE]] +; CHECK-NEXT: br label [[LOOP_END:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP39:![0-9]+]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -1060,4 +1647,38 @@ attributes #0 = { "vector-function-abi-variant"="_ZGVsNxv_foo(foo_vec)" } ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} +; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} +; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]} +; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]} +; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]} +; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META2]], [[META1]]} +; CHECK: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]} +; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META2]], [[META1]]} +; CHECK: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]], [[META2]]} +; CHECK: [[LOOP23]] = distinct !{[[LOOP23]], [[META2]], [[META1]]} +; CHECK: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]} +; CHECK: [[LOOP25]] = distinct !{[[LOOP25]], [[META2]], [[META1]]} +; CHECK: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]} +; CHECK: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]} +; CHECK: [[LOOP28]] = distinct !{[[LOOP28]], [[META1]], [[META2]]} +; CHECK: [[LOOP29]] = distinct !{[[LOOP29]], [[META2]], [[META1]]} +; CHECK: [[LOOP30]] = distinct !{[[LOOP30]], [[META1]], [[META2]]} +; CHECK: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]} +; CHECK: [[LOOP32]] = distinct !{[[LOOP32]], [[META1]], [[META2]]} +; CHECK: [[LOOP33]] = distinct !{[[LOOP33]], [[META2]], [[META1]]} +; CHECK: [[LOOP34]] = distinct !{[[LOOP34]], [[META1]], [[META2]]} +; CHECK: [[LOOP35]] = distinct !{[[LOOP35]], [[META2]], [[META1]]} +; CHECK: [[LOOP36]] = distinct !{[[LOOP36]], [[META1]], [[META2]]} +; CHECK: [[LOOP37]] = distinct !{[[LOOP37]], [[META2]], [[META1]]} +; CHECK: [[LOOP38]] = distinct !{[[LOOP38]], [[META1]], [[META2]]} +; CHECK: [[LOOP39]] = distinct !{[[LOOP39]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll b/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll index b427b43cdb133..0520b3c77b5b6 100644 --- a/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll +++ b/llvm/test/Transforms/LoopVectorize/uncountable-early-exit-vplan.ll @@ -39,7 +39,7 @@ define i64 @multi_exiting_to_different_exits_live_in_exit_values() { ; CHECK-EMPTY: ; CHECK-NEXT: middle.split: ; CHECK-NEXT: EMIT branch-on-cond vp<[[EA_TAKEN]]> -; CHECK-NEXT: Successor(s): ir-bb, middle.block +; CHECK-NEXT: Successor(s): vector.early.exit, middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: ; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]> @@ -58,8 +58,11 @@ define i64 @multi_exiting_to_different_exits_live_in_exit_values() { ; CHECK-NEXT: IR %p2 = phi i64 [ 1, %loop.latch ] (extra operand: ir<1> from middle.block) ; CHECK-NEXT: No successors ; CHECK-EMPTY: +; CHECK-NEXT: vector.early.exit: +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0> from middle.split) +; CHECK-NEXT: IR %p1 = phi i64 [ 0, %loop.header ] (extra operand: ir<0> from vector.early.exit) ; CHECK-NEXT: No successors ; CHECK-NEXT: } entry: @@ -122,7 +125,7 @@ define i64 @multi_exiting_to_same_exit_live_in_exit_values() { ; CHECK-EMPTY: ; CHECK-NEXT: middle.split: ; CHECK-NEXT: EMIT branch-on-cond vp<[[EA_TAKEN]]> -; CHECK-NEXT: Successor(s): ir-bb, middle.block +; CHECK-NEXT: Successor(s): vector.early.exit, middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: ; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]> @@ -137,8 +140,11 @@ define i64 @multi_exiting_to_same_exit_live_in_exit_values() { ; CHECK-NEXT: IR %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] (extra operand: vp<[[RESUME]]> from scalar.ph) ; CHECK: No successors ; CHECK-EMPTY: +; CHECK-NEXT: vector.early.exit: +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %p = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] (extra operands: ir<1> from middle.block, ir<0> from middle.split) +; CHECK-NEXT: IR %p = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] (extra operands: ir<1> from middle.block, ir<0> from vector.early.exit) ; CHECK-NEXT: No successors ; CHECK-NEXT: } @@ -198,7 +204,7 @@ define i64 @multi_exiting_to_same_exit_live_in_exit_values_2() { ; CHECK-EMPTY: ; CHECK-NEXT: middle.split: ; CHECK-NEXT: EMIT branch-on-cond vp<[[EA_TAKEN]]> -; CHECK-NEXT: Successor(s): ir-bb, middle.block +; CHECK-NEXT: Successor(s): vector.early.exit, middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: ; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq ir<128>, vp<[[VTC]]> @@ -213,8 +219,11 @@ define i64 @multi_exiting_to_same_exit_live_in_exit_values_2() { ; CHECK-NEXT: IR %iv = phi i64 [ %inc, %loop.latch ], [ 0, %entry ] (extra operand: vp<[[RESUME]]> from scalar.ph) ; CHECK: No successors ; CHECK-EMPTY: +; CHECK-NEXT: vector.early.exit: +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %p = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] (extra operands: ir<1> from middle.block, ir<0> from middle.split) +; CHECK-NEXT: IR %p = phi i64 [ 0, %loop.header ], [ 1, %loop.latch ] (extra operands: ir<1> from middle.block, ir<0> from vector.early.exit) ; CHECK-NEXT: No successors ; CHECK-NEXT: }