Skip to content

Commit 2535516

Browse files
committed
[VPlan] Try to hoist Previous (and operands), if sinking fails for FORs.
In some cases, Previous (and its operands) can be hoisted. This allows supporting additional cases where sinking all users of the FOR fails, e.g. due to having to sink recipes with side-effects. This fixes a crash where we fail to create a scalar VPlan for a first-order recurrence, but can create a vector VPlan, because the trunc instruction of an IV which generates the previous value of the recurrence has been optimized to a truncated induction recipe, thus hoisting it to the beginning. Fixes #106523.
1 parent 3829fd7 commit 2535516

File tree

5 files changed

+294
-19
lines changed

5 files changed

+294
-19
lines changed

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 88 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -771,6 +771,92 @@ sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
771771
return true;
772772
}
773773

774+
/// Try to hoist \p Previous and its operands before all users of \p FOR.
/// Recipes that need to move are collected by walking the operands of
/// \p Previous; they are then moved, in dominance order, to a hoist point that
/// starts at the first non-phi recipe of FOR's block. \p VPDT is used for all
/// dominance queries.
/// \returns true if every collected recipe could be hoisted (or nothing needed
/// hoisting), false if hoisting is not possible; in the latter case no recipe
/// has been moved.
775+
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
776+
VPRecipeBase *Previous,
777+
VPDominatorTree &VPDT) {
778+
using namespace llvm::VPlanPatternMatch;
// Bail out early if Previous itself has side effects or reads from memory.
779+
if (Previous->mayHaveSideEffects() || Previous->mayReadFromMemory())
780+
return false;
781+
782+
// Collect recipes that need hoisting.
783+
SmallVector<VPRecipeBase *> WorkList;
784+
SmallPtrSet<VPRecipeBase *, 8> Seen;
785+
VPBasicBlock *HoistBlock = FOR->getParent();
786+
auto HoistPoint = HoistBlock->getFirstNonPhi();
// Returns false if HoistCandidate makes hoisting impossible. Otherwise
// returns true, after either queuing the candidate in WorkList or deciding
// that it does not need to move.
787+
auto TryToPushHoistCandidate = [&](VPRecipeBase *HoistCandidate) {
788+
// If we reach FOR, it means the original Previous depends on some other
789+
// recurrence that in turn depends on FOR. If that is the case, we would
790+
// also need to hoist recipes involving the other FOR, which may break
791+
// dependencies.
792+
if (HoistCandidate == FOR)
793+
return false;
794+
795+
// Hoist candidate outside any region, no need to hoist.
796+
if (!HoistCandidate->getParent()->getParent())
797+
return true;
798+
799+
// Hoist candidate is a header phi or already visited, no need to hoist.
800+
if (isa<VPHeaderPHIRecipe>(HoistCandidate) ||
801+
!Seen.insert(HoistCandidate).second)
802+
return true;
803+
804+
// If we reached a recipe that dominates all users of FOR, we don't need to
805+
// hoist the recipe.
806+
if (all_of(FOR->users(), [&VPDT, HoistCandidate](VPUser *U) {
807+
return VPDT.properlyDominates(HoistCandidate, cast<VPRecipeBase>(U));
808+
})) {
809+
if (VPDT.properlyDominates(&*HoistPoint, HoistCandidate)) {
810+
// This HoistCandidate dominates all users of FOR and is closer to them
811+
// than the previous HoistPoint.
812+
HoistPoint = std::next(HoistCandidate->getIterator());
813+
HoistBlock = HoistCandidate->getParent();
814+
}
815+
return true;
816+
}
817+
818+
// Don't move candidates with side effects, as we do not yet analyze recipes
819+
// between candidate and hoist destination.
820+
if (HoistCandidate->mayHaveSideEffects())
821+
return false;
822+
823+
WorkList.push_back(HoistCandidate);
824+
return true;
825+
};
826+
827+
// Recursively try to hoist Previous and its operands before all users of FOR.
828+
// Update HoistPoint to the closest recipe that dominates all users of FOR.
829+
if (!TryToPushHoistCandidate(Previous))
830+
return false;
831+
832+
// WorkList may grow while it is being traversed, so iterate by index and
// re-read its size on every iteration.
for (unsigned I = 0; I != WorkList.size(); ++I) {
833+
VPRecipeBase *Current = WorkList[I];
834+
assert(Current->getNumDefinedValues() == 1 &&
835+
"only recipes with a single defined value expected");
836+
837+
for (VPValue *Op : Current->operands())
838+
if (auto *R = Op->getDefiningRecipe())
839+
if (!TryToPushHoistCandidate(R))
840+
return false;
841+
}
842+
843+
// Keep recipes to hoist ordered by dominance so earlier instructions are
844+
// processed first.
845+
sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
846+
return VPDT.properlyDominates(A, B);
847+
});
848+
849+
for (VPRecipeBase *HoistCandidate : WorkList) {
// A candidate that already sits at the hoist point stays in place; only
// advance the hoist point past it.
850+
if (HoistPoint == HoistCandidate->getIterator()) {
851+
HoistPoint = std::next(HoistCandidate->getIterator());
852+
continue;
853+
}
854+
HoistCandidate->moveBefore(*HoistBlock, HoistPoint);
855+
}
856+
857+
return true;
858+
}
859+
774860
bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
775861
VPBuilder &LoopBuilder) {
776862
VPDominatorTree VPDT;
@@ -795,7 +881,8 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
795881
}
796882

797883
if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT))
798-
return false;
884+
if (!hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
885+
return false;
799886

800887
// Introduce a recipe to combine the incoming and previous values of a
801888
// fixed-order recurrence.

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,13 @@ struct VPlanTransforms {
3636
GetIntOrFpInductionDescriptor,
3737
ScalarEvolution &SE, const TargetLibraryInfo &TLI);
3838

39-
/// Sink users of fixed-order recurrences after the recipe defining their
40-
/// previous value. Then introduce FirstOrderRecurrenceSplice VPInstructions
41-
/// to combine the value from the recurrence phis and previous values. The
42-
/// current implementation assumes all users can be sunk after the previous
43-
/// value, which is enforced by earlier legality checks.
39+
/// Try to move users of fixed-order recurrences after the recipe defining
40+
/// their previous value, either by sinking them or hoisting the recipe
41+
/// defining their previous value (and its operands). Then introduce
42+
/// FirstOrderRecurrenceSplice VPInstructions to combine the value from the
43+
/// recurrence phis and previous values. The current implementation assumes
44+
/// all users can be sunk after the previous value, which is enforced by
45+
/// earlier legality checks.
4446
/// \returns true if all users of fixed-order recurrences could be re-arranged
4547
/// as needed or false if it is not possible. In the latter case, \p Plan is
4648
/// not valid.

llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,3 +278,67 @@ exit:
278278
store double %.lcssa, ptr %C
279279
ret i64 %.in.lcssa
280280
}
281+
282+
; Test for https://github.com/llvm/llvm-project/issues/106523.
283+
define void @for_iv_trunc_optimized(ptr %dst) {
284+
; CHECK-LABEL: @for_iv_trunc_optimized(
285+
; CHECK-NEXT: bb:
286+
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
287+
; CHECK: vector.ph:
288+
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
289+
; CHECK: vector.body:
290+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
291+
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 1>, [[VECTOR_PH]] ], [ [[STEP_ADD:%.*]], [[VECTOR_BODY]] ]
292+
; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
293+
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 1, i32 2, i32 3, i32 4>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
294+
; CHECK-NEXT: [[STEP_ADD]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
295+
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
296+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VEC_IND]], <4 x i32> [[STEP_ADD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
297+
; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i32> [[TMP0]], zeroinitializer
298+
; CHECK-NEXT: [[TMP3]] = or <4 x i32> [[TMP1]], zeroinitializer
299+
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
300+
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3
301+
; CHECK-NEXT: store i32 [[TMP6]], ptr [[DST:%.*]], align 4
302+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
303+
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
304+
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 336
305+
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
306+
; CHECK: middle.block:
307+
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 3
308+
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
309+
; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
310+
; CHECK: scalar.ph:
311+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 337, [[MIDDLE_BLOCK]] ], [ 1, [[BB:%.*]] ]
312+
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 1, [[BB]] ]
313+
; CHECK-NEXT: [[SCALAR_RECUR_INIT4:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT3]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ]
314+
; CHECK-NEXT: br label [[LOOP:%.*]]
315+
; CHECK: loop:
316+
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[ADD:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
317+
; CHECK-NEXT: [[FOR_1:%.*]] = phi i32 [ [[TRUNC:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
318+
; CHECK-NEXT: [[FOR_2:%.*]] = phi i32 [ [[OR:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT4]], [[SCALAR_PH]] ]
319+
; CHECK-NEXT: [[OR]] = or i32 [[FOR_1]], 0
320+
; CHECK-NEXT: [[ADD]] = add i64 [[IV]], 1
321+
; CHECK-NEXT: store i32 [[FOR_2]], ptr [[DST]], align 4
322+
; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i64 [[IV]], 337
323+
; CHECK-NEXT: [[TRUNC]] = trunc i64 [[IV]] to i32
324+
; CHECK-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
325+
; CHECK: exit:
326+
; CHECK-NEXT: ret void
327+
;
328+
bb:
329+
br label %loop
330+
331+
loop:
332+
%iv = phi i64 [ %add, %loop ], [ 1, %bb ]
333+
%for.1 = phi i32 [ %trunc, %loop ], [ 1, %bb ]
334+
%for.2 = phi i32 [ %or, %loop ], [ 0, %bb ]
335+
%or = or i32 %for.1, 0
336+
%add = add i64 %iv, 1
337+
store i32 %for.2, ptr %dst, align 4
338+
%icmp = icmp ult i64 %iv, 337
339+
%trunc = trunc i64 %iv to i32
340+
br i1 %icmp, label %loop, label %exit
341+
342+
exit:
343+
ret void
344+
}

llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll

Lines changed: 97 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -147,14 +147,57 @@ exit:
147147
}
148148

149149
; This test has two FORs (for.x and for.y) where incoming value from the previous
150-
; iteration (for.x.prev) of one FOR (for.y) depends on another FOR (for.x). Due to
151-
; this dependency all uses of the former FOR (for.y) should be sunk after
152-
; incoming value from the previous iteration (for.x.prev) of te latter FOR (for.y).
153-
; That means side-effecting user (store i64 %for.y.i64, ptr %gep) of the latter
154-
; FOR (for.y) should be moved which is not currently supported.
150+
; iteration (for.x.prev) of one FOR (for.y) depends on another FOR (for.x).
151+
; Sinking would require moving a recipe with side effects (store). Instead,
152+
; for.x.prev can be hoisted.
155153
define i32 @test_chained_first_order_recurrences_4(ptr %base, i64 %x) {
156154
; CHECK-LABEL: 'test_chained_first_order_recurrences_4'
157-
; CHECK: No VPlans built.
155+
; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
156+
; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
157+
; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
158+
; CHECK-NEXT: Live-in ir<4098> = original trip-count
159+
; CHECK-EMPTY:
160+
; CHECK-NEXT: vector.ph:
161+
; CHECK-NEXT: WIDEN ir<%for.x.next> = mul ir<%x>, ir<2>
162+
; CHECK-NEXT: Successor(s): vector loop
163+
; CHECK-EMPTY:
164+
; CHECK-NEXT: <x1> vector loop: {
165+
; CHECK-NEXT: vector.body:
166+
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
167+
; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%for.x> = phi ir<0>, ir<%for.x.next>
168+
; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%for.y> = phi ir<0>, ir<%for.x.prev>
169+
; CHECK-NEXT: vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
170+
; CHECK-NEXT: EMIT vp<[[SPLICE_X:%.]]> = first-order splice ir<%for.x>, ir<%for.x.next>
171+
; CHECK-NEXT: WIDEN-CAST ir<%for.x.prev> = trunc vp<[[SPLICE_X]]> to i32
172+
; CHECK-NEXT: EMIT vp<[[SPLICE_Y:%.+]]> = first-order splice ir<%for.y>, ir<%for.x.prev>
173+
; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%base>, vp<[[SCALAR_STEPS]]>
174+
; CHECK-NEXT: WIDEN-CAST ir<%for.y.i64> = sext vp<[[SPLICE_Y]]> to i64
175+
; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep>
176+
; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%for.y.i64>
177+
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
178+
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
179+
; CHECK-NEXT: No successors
180+
; CHECK-NEXT: }
181+
; CHECK-NEXT: Successor(s): middle.block
182+
; CHECK-EMPTY:
183+
; CHECK-NEXT: middle.block:
184+
; CHECK-NEXT: EMIT vp<[[EXT_X:%.+]]> = extract-from-end ir<%for.x.next>, ir<1>
185+
; CHECK-NEXT: EMIT vp<[[EXT_Y:%.+]]> = extract-from-end ir<%for.x.prev>, ir<1>
186+
; CHECK-NEXT: EMIT vp<[[MIDDLE_C:%.+]]> = icmp eq ir<4098>, vp<[[VTC]]>
187+
; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_C]]>
188+
; CHECK-NEXT: Successor(s): ir-bb<ret>, scalar.ph
189+
; CHECK-EMPTY:
190+
; CHECK-NEXT: ir-bb<ret>:
191+
; CHECK-NEXT: No successors
192+
; CHECK-EMPTY:
193+
; CHECK-NEXT: scalar.ph:
194+
; CHECK-NEXT: EMIT vp<[[RESUME_X:%.+]]> = resume-phi vp<[[EXT_X]]>, ir<0>
195+
; CHECK-NEXT: EMIT vp<[[RESUME_Y:%.+]]> = resume-phi vp<[[EXT_Y]]>, ir<0>
196+
; CHECK-NEXT: No successors
197+
; CHECK-EMPTY:
198+
; CHECK-NEXT: Live-out i64 %for.x = vp<[[RESUME_X]]>
199+
; CHECK-NEXT: Live-out i32 %for.y = vp<[[RESUME_Y]]>
200+
; CHECK-NEXT: }
158201
;
159202
entry:
160203
br label %loop
@@ -178,7 +221,54 @@ ret:
178221

179222
define i32 @test_chained_first_order_recurrences_5_hoist_to_load(ptr %base) {
180223
; CHECK-LABEL: 'test_chained_first_order_recurrences_5_hoist_to_load'
181-
; CHECK: No VPlans built.
224+
; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
225+
; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
226+
; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
227+
; CHECK-NEXT: Live-in ir<4098> = original trip-count
228+
; CHECK-EMPTY:
229+
; CHECK-NEXT: vector.ph:
230+
; CHECK-NEXT: Successor(s): vector loop
231+
; CHECK-EMPTY:
232+
; CHECK-NEXT: <x1> vector loop: {
233+
; CHECK-NEXT: vector.body:
234+
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
235+
; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%for.x> = phi ir<0>, ir<%for.x.next>
236+
; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%for.y> = phi ir<0>, ir<%for.x.prev>
237+
; CHECK-NEXT: vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
238+
; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%base>, vp<[[SCALAR_STEPS]]>
239+
; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep>
240+
; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]>
241+
; CHECK-NEXT: WIDEN ir<%for.x.next> = mul ir<%l>, ir<2>
242+
; CHECK-NEXT: EMIT vp<[[SPLICE_X:%.]]> = first-order splice ir<%for.x>, ir<%for.x.next>
243+
; CHECK-NEXT: WIDEN-CAST ir<%for.x.prev> = trunc vp<[[SPLICE_X]]> to i32
244+
; CHECK-NEXT: EMIT vp<[[SPLICE_Y:%.+]]> = first-order splice ir<%for.y>, ir<%for.x.prev>
245+
; CHECK-NEXT: WIDEN-CAST ir<%for.y.i64> = sext vp<[[SPLICE_Y]]> to i64
246+
; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep>
247+
; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%for.y.i64>
248+
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
249+
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
250+
; CHECK-NEXT: No successors
251+
; CHECK-NEXT: }
252+
; CHECK-NEXT: Successor(s): middle.block
253+
; CHECK-EMPTY:
254+
; CHECK-NEXT: middle.block:
255+
; CHECK-NEXT: EMIT vp<[[EXT_X:%.+]]> = extract-from-end ir<%for.x.next>, ir<1>
256+
; CHECK-NEXT: EMIT vp<[[EXT_Y:%.+]]> = extract-from-end ir<%for.x.prev>, ir<1>
257+
; CHECK-NEXT: EMIT vp<[[MIDDLE_C:%.+]]> = icmp eq ir<4098>, vp<[[VTC]]>
258+
; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_C]]>
259+
; CHECK-NEXT: Successor(s): ir-bb<ret>, scalar.ph
260+
; CHECK-EMPTY:
261+
; CHECK-NEXT: ir-bb<ret>:
262+
; CHECK-NEXT: No successors
263+
; CHECK-EMPTY:
264+
; CHECK-NEXT: scalar.ph:
265+
; CHECK-NEXT: EMIT vp<[[RESUME_X:%.+]]> = resume-phi vp<[[EXT_X]]>, ir<0>
266+
; CHECK-NEXT: EMIT vp<[[RESUME_Y:%.+]]> = resume-phi vp<[[EXT_Y]]>, ir<0>
267+
; CHECK-NEXT: No successors
268+
; CHECK-EMPTY:
269+
; CHECK-NEXT: Live-out i64 %for.x = vp<[[RESUME_X]]>
270+
; CHECK-NEXT: Live-out i32 %for.y = vp<[[RESUME_Y]]>
271+
; CHECK-NEXT: }
182272
;
183273
entry:
184274
br label %loop

llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll

Lines changed: 38 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -385,19 +385,51 @@ exit:
385385
define void @hoist_previous_value_and_operands(ptr %dst, i64 %mask) {
386386
; CHECK-LABEL: @hoist_previous_value_and_operands(
387387
; CHECK-NEXT: bb:
388+
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
389+
; CHECK: vector.ph:
390+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[MASK:%.*]], i64 0
391+
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
392+
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
393+
; CHECK: vector.body:
394+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
395+
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1, i64 2, i64 3, i64 4>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
396+
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 1>, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
397+
; CHECK-NEXT: [[VECTOR_RECUR1:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
398+
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
399+
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
400+
; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
401+
; CHECK-NEXT: [[TMP2]] = trunc <4 x i64> [[TMP1]] to <4 x i32>
402+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
403+
; CHECK-NEXT: [[TMP4]] = or <4 x i32> [[TMP3]], zeroinitializer
404+
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR1]], <4 x i32> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
405+
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP0]]
406+
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
407+
; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4
408+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
409+
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
410+
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 336
411+
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
412+
; CHECK: middle.block:
413+
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
414+
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
415+
; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
416+
; CHECK: scalar.ph:
417+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 337, [[MIDDLE_BLOCK]] ], [ 1, [[BB:%.*]] ]
418+
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 1, [[BB]] ]
419+
; CHECK-NEXT: [[SCALAR_RECUR_INIT3:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ]
388420
; CHECK-NEXT: br label [[LOOP:%.*]]
389421
; CHECK: loop:
390-
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[ADD:%.*]], [[LOOP]] ], [ 1, [[BB:%.*]] ]
391-
; CHECK-NEXT: [[FOR_1:%.*]] = phi i32 [ [[TRUNC:%.*]], [[LOOP]] ], [ 1, [[BB]] ]
392-
; CHECK-NEXT: [[FOR_2:%.*]] = phi i32 [ [[OR:%.*]], [[LOOP]] ], [ 0, [[BB]] ]
422+
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[ADD:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
423+
; CHECK-NEXT: [[FOR_1:%.*]] = phi i32 [ [[TRUNC:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
424+
; CHECK-NEXT: [[FOR_2:%.*]] = phi i32 [ [[OR:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT3]], [[SCALAR_PH]] ]
393425
; CHECK-NEXT: [[OR]] = or i32 [[FOR_1]], 0
394426
; CHECK-NEXT: [[ADD]] = add i64 [[IV]], 1
395-
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[IV]]
427+
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV]]
396428
; CHECK-NEXT: store i32 [[FOR_2]], ptr [[GEP]], align 4
397429
; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i64 [[IV]], 337
398-
; CHECK-NEXT: [[A:%.*]] = and i64 [[IV]], [[MASK:%.*]]
430+
; CHECK-NEXT: [[A:%.*]] = and i64 [[IV]], [[MASK]]
399431
; CHECK-NEXT: [[TRUNC]] = trunc i64 [[A]] to i32
400-
; CHECK-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT:%.*]]
432+
; CHECK-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
401433
; CHECK: exit:
402434
; CHECK-NEXT: ret void
403435
;

0 commit comments

Comments
 (0)