Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
b7b23a9
[LV] Use ExtractLane(LastActiveLane, V) live outs when tail-folding.
fhahn Oct 2, 2025
9985387
Merge remote-tracking branch 'origin/main' into lv-tf-external-users
fhahn Oct 13, 2025
6b0c9d2
!fixup add LastActiveLane opcode, lowered via FirstActiveLane.
fhahn Oct 13, 2025
2a45f94
Merge remote-tracking branch 'origin/main' into lv-tf-external-users
fhahn Oct 15, 2025
7d1ab3c
!fixup verify LastActiveLane.
fhahn Oct 15, 2025
8ca0426
Merge remote-tracking branch 'origin/main' into lv-tf-external-users
fhahn Oct 15, 2025
96c827b
[VPlan] Mark VPlan argument in isHeaderMask as const (NFC).
fhahn Oct 15, 2025
72f3796
!fixup clean up verifier changes
fhahn Oct 15, 2025
f3fdac7
Merge remote-tracking branch 'origin/main' into lv-tf-external-users
fhahn Oct 16, 2025
af82079
!fixup add note about operands needing to be prefix-masks.
fhahn Oct 16, 2025
cc7e913
!fixup fix formatting
fhahn Oct 16, 2025
2d3012f
Merge remote-tracking branch 'origin/main' into lv-tf-external-users
fhahn Oct 23, 2025
4fa7442
!fixup tighten verifier check for broadcast of EVL
fhahn Oct 23, 2025
ae81e08
Merge remote-tracking branch 'origin/main' into lv-tf-external-users
fhahn Nov 11, 2025
f10fa37
!fixup add additional pattern matching helpers.
fhahn Nov 11, 2025
2758f96
Merge remote-tracking branch 'origin/main' into lv-tf-external-users
fhahn Nov 11, 2025
73b3197
Merge remote-tracking branch 'origin/main' into lv-tf-external-users
fhahn Nov 12, 2025
d8e77ca
!fixup address comments, thanks
fhahn Nov 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 0 additions & 18 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2083,24 +2083,6 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const {
for (const auto &Reduction : getReductionVars())
ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());

// TODO: handle non-reduction outside users when tail is folded by masking.
for (auto *AE : AllowedExit) {
// Check that all users of allowed exit values are inside the loop or
// are the live-out of a reduction.
if (ReductionLiveOuts.count(AE))
continue;
for (User *U : AE->users()) {
Instruction *UI = cast<Instruction>(U);
if (TheLoop->contains(UI))
continue;
LLVM_DEBUG(
dbgs()
<< "LV: Cannot fold tail by masking, loop has an outside user for "
<< *UI << "\n");
return false;
}
}

for (const auto &Entry : getInductionVars()) {
PHINode *OrigPhi = Entry.first;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the PR handle the IV liveout user as well?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This patch still leaves inductions untouched for now

for (User *U : OrigPhi->users()) {
Expand Down
6 changes: 5 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8798,7 +8798,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
if (FinalReductionResult == U || Parent->getParent())
continue;
U->replaceUsesOfWith(OrigExitingVPV, FinalReductionResult);
if (match(U, m_ExtractLastElement(m_VPValue())))
if (match(U,
m_CombineOr(m_VPInstruction<VPInstruction::ExtractLastElement>(
m_VPValue()),
m_VPInstruction<VPInstruction::ExtractLane>(
m_VPValue(), m_VPValue()))))
cast<VPInstruction>(U)->replaceAllUsesWith(FinalReductionResult);
}

Expand Down
51 changes: 46 additions & 5 deletions llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,6 @@ class VPPredicator {
/// possibly inserting new recipes at \p Dst (using Builder's insertion point)
VPValue *createEdgeMask(VPBasicBlock *Src, VPBasicBlock *Dst);

/// Returns the *entry* mask for \p VPBB.
VPValue *getBlockInMask(VPBasicBlock *VPBB) const {
return BlockMaskCache.lookup(VPBB);
}

/// Record \p Mask as the *entry* mask of \p VPBB, which is expected to not
/// already have a mask.
void setBlockInMask(VPBasicBlock *VPBB, VPValue *Mask) {
Expand All @@ -68,6 +63,11 @@ class VPPredicator {
}

public:
/// Returns the *entry* mask for \p VPBB.
VPValue *getBlockInMask(VPBasicBlock *VPBB) const {
return BlockMaskCache.lookup(VPBB);
}

/// Returns the precomputed predicate of the edge from \p Src to \p Dst.
VPValue *getEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst) const {
return EdgeMaskCache.lookup({Src, Dst});
Expand Down Expand Up @@ -300,5 +300,46 @@ VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan, bool FoldTail) {

PrevVPBB = VPBB;
}

// If we folded the tail and introduced a header mask, any extract of the
// last element must be updated to extract from the last active lane of the
// header mask instead (i.e., the lane corresponding to the last active
// iteration).
if (FoldTail) {
assert(Plan.getExitBlocks().size() == 1 &&
"only a single-exit block is supported currently");
VPBasicBlock *EB = Plan.getExitBlocks().front();
assert(EB->getSinglePredecessor() == Plan.getMiddleBlock() &&
"the exit block must have middle block as single predecessor");

VPValue *LastActiveLane = nullptr;
VPBuilder B(Plan.getMiddleBlock()->getTerminator());
for (auto &P : EB->phis()) {
auto *ExitIRI = cast<VPIRPhi>(&P);
VPValue *Inc = ExitIRI->getIncomingValue(0);
VPValue *Op;
if (!match(Inc, m_VPInstruction<VPInstruction::ExtractLastElement>(
m_VPValue(Op))))
continue;

if (!LastActiveLane) {
// Compute the index of the last active lane by getting the first lane
// where the header mask is false (first inactive lane), then
// subtracting 1. This gives us the last lane where the mask was true.
VPValue *HeaderMask = Predicator.getBlockInMask(
Plan.getVectorLoopRegion()->getEntryBasicBlock());
VPValue *NotHeaderMask = B.createNot(HeaderMask);
VPValue *FirstInactiveLane =
B.createNaryOp(VPInstruction::FirstActiveLane, {NotHeaderMask});
VPValue *One = Plan.getOrAddLiveIn(
ConstantInt::get(Type::getInt64Ty(Plan.getContext()), 1));
LastActiveLane =
B.createNaryOp(Instruction::Sub, {FirstInactiveLane, One});
}
auto *Ext =
B.createNaryOp(VPInstruction::ExtractLane, {LastActiveLane, Op});
Inc->replaceAllUsesWith(Ext);
}
}
return Predicator.getBlockMaskCache();
}
7 changes: 6 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -638,7 +638,12 @@ Value *VPInstruction::generate(VPTransformState &State) {
llvm_unreachable("should be handled by VPPhi::execute");
}
case Instruction::Select: {
bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
bool OnlyFirstLaneUsed =
vputils::onlyFirstLaneUsed(this) ||
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this change worth submitting separately? Not sure if it has any effect outside this PR.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I couldn't find any cases independent of the PR, as only now we have selects with to vectorToScalar ops.

(isa<VPInstruction>(getOperand(1)) &&
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this restricted to just VPInstruction types? Can isVectorToScalar only be called for VPInstructions?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, isVectorToScalar is a property of VPInstructions only at the moment

cast<VPInstruction>(getOperand(1))->isVectorToScalar() &&
isa<VPInstruction>(getOperand(2)) &&
cast<VPInstruction>(getOperand(2))->isVectorToScalar());
Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed);
Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
Expand Down
29 changes: 29 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1941,6 +1941,35 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
// Set the first operand of RecurSplice to FOR again, after replacing
// all users.
RecurSplice->setOperand(0, FOR);

// Check for users extracting the second-to-last active lane of the FOR. If
// only a single lane is active in the current iteration, we need to select
// the last element of the value from the previous iteration, directly from
// the FOR phi.
for (VPUser *U : RecurSplice->users()) {
if (!match(U, m_VPInstruction<VPInstruction::ExtractLane>(
m_Sub(m_VPInstruction<VPInstruction::FirstActiveLane>(
m_VPValue()),
m_SpecificInt(1)),
m_Specific(RecurSplice))))
continue;

VPBuilder B(cast<VPInstruction>(U));
VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0);
Type *I64Ty = Type::getInt64Ty(Plan.getContext());
VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 0));
VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 1));
VPValue *PenultimateIndex =
B.createNaryOp(Instruction::Sub, {LastActiveLane, One});
VPValue *PenultimateLastIter =
B.createNaryOp(VPInstruction::ExtractLane,
{PenultimateIndex, FOR->getBackedgeValue()});
VPValue *LastPrevIter =
B.createNaryOp(VPInstruction::ExtractLastElement, {FOR});
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
B.createNaryOp(VPInstruction::ExtractLastElement, {FOR});
B.createNaryOp(VPInstruction::ExtractLastElement, FOR);

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done thanks

VPValue *Cmp = B.createICmp(CmpInst::ICMP_EQ, LastActiveLane, Zero);
VPValue *Sel = B.createSelect(Cmp, LastPrevIter, PenultimateLastIter);
cast<VPInstruction>(U)->replaceAllUsesWith(Sel);
}
}
return true;
}
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,12 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
m_VPValue(Op0)))) {
addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
if (isa<VPFirstOrderRecurrencePHIRecipe>(Op0)) {
assert(match(&R, m_ExtractLastElement(m_VPValue())) &&
"can only extract last element of FOR");
continue;
}

if (Plan.hasScalarVFOnly()) {
auto *I = cast<VPInstruction>(&R);
// Extracting from end with VF = 1 implies retrieving the last or
Expand Down
68 changes: 34 additions & 34 deletions llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
Original file line number Diff line number Diff line change
Expand Up @@ -69,61 +69,61 @@ exit:
define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) {
; CHECK-LABEL: define i8 @dead_live_out_due_to_scalar_epilogue_required(
; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 2
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP1]], i32 6)
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 252, [[TMP2]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 1005
; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 1005
; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 4
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 252, [[TMP4]]
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 [[N_MOD_VF]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 252, [[TMP6]]
; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[N_VEC]], 4
; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
; CHECK-NEXT: [[TMP11:%.*]] = mul <vscale x 4 x i32> [[TMP9]], splat (i32 4)
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP11]]
; CHECK-NEXT: [[TMP14:%.*]] = mul i32 4, [[TMP4]]
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP14]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
; CHECK-NEXT: [[TMP1:%.*]] = mul <vscale x 16 x i32> [[TMP0]], splat (i32 4)
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 16 x i32> zeroinitializer, [[TMP1]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i32> [[VEC_IND]] to <vscale x 4 x i64>
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[DST]], <vscale x 4 x i64> [[TMP15]]
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i8.nxv4p0(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x ptr> [[TMP16]], i32 1, <vscale x 4 x i1> splat (i1 true)), !alias.scope [[META3:![0-9]+]], !noalias [[META6:![0-9]+]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i32 [ 252, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true)
; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP2]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT2]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = mul i32 4, [[TMP2]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = icmp uge <vscale x 16 x i32> [[TMP0]], [[BROADCAST_SPLAT3]]
; CHECK-NEXT: [[TMP9:%.*]] = sext <vscale x 16 x i32> [[VEC_IND]] to <vscale x 16 x i64>
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SRC]], <vscale x 16 x i64> [[TMP9]]
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP6]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP2]]), !alias.scope [[META3:![0-9]+]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], <vscale x 16 x i64> [[TMP9]]
; CHECK-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x ptr> align 1 [[TMP7]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP2]]), !alias.scope [[META6:![0-9]+]], !noalias [[META3]]
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP2]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[SCALAR_PH]]
; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP5]], i1 true)
; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], 1
; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 16
; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP13]], 0
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 16 x i8> [[WIDE_MASKED_GATHER]], i64 [[TMP11]]
; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[IV]] to i64
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IDXPROM]]
; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IDXPROM]]
; CHECK-NEXT: store i8 0, ptr [[GEP_DST]], align 1
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 4
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[IV]], 1001
; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[R:%.*]] = phi i8 [ [[L]], %[[LOOP]] ]
; CHECK-NEXT: [[R:%.*]] = phi i8 [ [[L]], %[[LOOP]] ], [ [[TMP15]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i8 [[R]]
;
entry:
Expand Down
Loading