[LoopPeel] Support min/max intrinsics in loop peeling #93162
Conversation
@llvm/pr-subscribers-llvm-transforms

Author: Sergey Kachkov (skachkov-sc)

Changes

Motivating example: https://godbolt.org/z/qfxz8rKac

This example shows very different loop peeling behaviour for similar code, depending on whether icmp+select was canonicalized to a min/max intrinsic or not. This patch adds processing of min/max intrinsics in LoopPeel, in a similar way to what is already done for conditional statements: for min/max(IterVal, BoundVal), we peel the iterations where IterVal < BoundVal if IterVal is monotonically increasing, and the iterations where IterVal > BoundVal if IterVal is monotonically decreasing (strict comparison predicates are used to minimize the number of peeled iterations). Compile-time results on the llvm test-suite (including SPEC benchmarks) show 3 tests affected by this:
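The godbolt listing itself is not reproduced in this thread; a minimal C++ sketch of the kind of loop the patch targets (function names here are hypothetical) could look like the following, where the clamp becomes a no-op after the first two iterations, so peeling them lets the intrinsic fold away in the main loop:

// Hypothetical example (not taken from the godbolt link): clamping the
// induction variable against a small loop-invariant bound. The icmp+select
// form of std::min on unsigned values is typically canonicalized by
// InstCombine to the llvm.umin intrinsic, which is the case this patch handles.
#include <algorithm>
#include <cstdint>

void consume(uint32_t v);

void clamp_loop(uint32_t n) {
  for (uint32_t i = 0; i < n; ++i)
    consume(std::min(i, uint32_t(2))); // min(i, 2) == 2 once i >= 2
}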
Full diff: https://github.com/llvm/llvm-project/pull/93162.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index f76fa3bb6c611..a6d37b9bdf11d 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -449,10 +449,59 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
DesiredPeelCount = std::max(DesiredPeelCount, NewPeelCount);
};
+ auto ComputePeelCountMinMax = [&](IntrinsicInst *II) {
+ bool IsSigned;
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::smax:
+ case Intrinsic::smin:
+ IsSigned = true;
+ break;
+ case Intrinsic::umax:
+ case Intrinsic::umin:
+ IsSigned = false;
+ break;
+ default:
+ return;
+ }
+ Value *LHS = II->getOperand(0), *RHS = II->getOperand(1);
+ const SCEV *BoundSCEV, *IterSCEV;
+ if (L.isLoopInvariant(LHS)) {
+ BoundSCEV = SE.getSCEV(LHS);
+ IterSCEV = SE.getSCEV(RHS);
+ } else if (L.isLoopInvariant(RHS)) {
+ BoundSCEV = SE.getSCEV(RHS);
+ IterSCEV = SE.getSCEV(LHS);
+ } else
+ return;
+ const auto *AddRec = dyn_cast<SCEVAddRecExpr>(IterSCEV);
+ // For simplicity, we support only affine recurrences.
+ if (!AddRec || !AddRec->isAffine() || AddRec->getLoop() != &L)
+ return;
+ const SCEV *Step = AddRec->getStepRecurrence(SE);
+ // To minimize number of peeled iterations, we use strict relational
+ // predicates here.
+ ICmpInst::Predicate Pred;
+ if (SE.isKnownPositive(Step))
+ Pred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
+ else if (SE.isKnownNegative(Step))
+ Pred = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
+ else
+ return;
+ const SCEV *IterVal = AddRec->evaluateAtIteration(
+ SE.getConstant(AddRec->getType(), DesiredPeelCount), SE);
+ while (DesiredPeelCount < MaxPeelCount &&
+ SE.isKnownPredicate(Pred, IterVal, BoundSCEV)) {
+ IterVal = SE.getAddExpr(IterVal, Step);
+ ++DesiredPeelCount;
+ }
+ };
+
for (BasicBlock *BB : L.blocks()) {
for (Instruction &I : *BB) {
if (SelectInst *SI = dyn_cast<SelectInst>(&I))
ComputePeelCount(SI->getCondition(), 0);
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I))
+ ComputePeelCountMinMax(II);
}
auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
diff --git a/llvm/test/Transforms/LoopUnroll/peel-loop-min-max-intrinsics.ll b/llvm/test/Transforms/LoopUnroll/peel-loop-min-max-intrinsics.ll
new file mode 100644
index 0000000000000..1a3f60b482793
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/peel-loop-min-max-intrinsics.ll
@@ -0,0 +1,288 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -S -passes=loop-unroll -unroll-peel-max-count=2 | FileCheck %s
+
+declare void @foo(i32)
+
+declare i32 @llvm.smin.i32(i32, i32)
+declare i32 @llvm.smax.i32(i32, i32)
+declare i32 @llvm.umin.i32(i32, i32)
+declare i32 @llvm.umax.i32(i32, i32)
+
+define void @test1(i32 %N) {
+; CHECK-LABEL: define void @test1(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP5_NOT:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP5_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: br label [[FOR_BODY_PEEL_BEGIN:%.*]]
+; CHECK: for.body.peel.begin:
+; CHECK-NEXT: br label [[FOR_BODY_PEEL:%.*]]
+; CHECK: for.body.peel:
+; CHECK-NEXT: [[COND_PEEL:%.*]] = tail call i32 @llvm.umin.i32(i32 0, i32 2)
+; CHECK-NEXT: tail call void @foo(i32 [[COND_PEEL]])
+; CHECK-NEXT: [[INC_PEEL:%.*]] = add nuw i32 0, 1
+; CHECK-NEXT: [[EXITCOND_NOT_PEEL:%.*]] = icmp eq i32 [[INC_PEEL]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT_PEEL]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY_PEEL_NEXT:%.*]]
+; CHECK: for.body.peel.next:
+; CHECK-NEXT: br label [[FOR_BODY_PEEL2:%.*]]
+; CHECK: for.body.peel2:
+; CHECK-NEXT: [[COND_PEEL3:%.*]] = tail call i32 @llvm.umin.i32(i32 [[INC_PEEL]], i32 2)
+; CHECK-NEXT: tail call void @foo(i32 [[COND_PEEL3]])
+; CHECK-NEXT: [[INC_PEEL4:%.*]] = add nuw i32 [[INC_PEEL]], 1
+; CHECK-NEXT: [[EXITCOND_NOT_PEEL5:%.*]] = icmp eq i32 [[INC_PEEL4]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT_PEEL5]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY_PEEL_NEXT1:%.*]]
+; CHECK: for.body.peel.next1:
+; CHECK-NEXT: br label [[FOR_BODY_PEEL_NEXT6:%.*]]
+; CHECK: for.body.peel.next6:
+; CHECK-NEXT: br label [[FOR_BODY_PREHEADER_PEEL_NEWPH:%.*]]
+; CHECK: for.body.preheader.peel.newph:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_06:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[INC_PEEL4]], [[FOR_BODY_PREHEADER_PEEL_NEWPH]] ]
+; CHECK-NEXT: tail call void @foo(i32 2)
+; CHECK-NEXT: [[INC]] = add nuw i32 [[I_06]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: for.cond.cleanup.loopexit.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp5.not = icmp eq i32 %N, 0
+ br i1 %cmp5.not, label %for.cond.cleanup, label %for.body
+
+for.body:
+ %i.06 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %cond = tail call i32 @llvm.umin.i32(i32 %i.06, i32 2)
+ tail call void @foo(i32 %cond)
+ %inc = add nuw i32 %i.06, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
+define void @test2(i32 %N) {
+; CHECK-LABEL: define void @test2(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP5_NOT:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP5_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: br label [[FOR_BODY_PEEL_BEGIN:%.*]]
+; CHECK: for.body.peel.begin:
+; CHECK-NEXT: br label [[FOR_BODY_PEEL:%.*]]
+; CHECK: for.body.peel:
+; CHECK-NEXT: [[COND_PEEL:%.*]] = tail call i32 @llvm.umax.i32(i32 0, i32 2)
+; CHECK-NEXT: tail call void @foo(i32 [[COND_PEEL]])
+; CHECK-NEXT: [[INC_PEEL:%.*]] = add nuw i32 0, 1
+; CHECK-NEXT: [[EXITCOND_NOT_PEEL:%.*]] = icmp eq i32 [[INC_PEEL]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT_PEEL]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY_PEEL_NEXT:%.*]]
+; CHECK: for.body.peel.next:
+; CHECK-NEXT: br label [[FOR_BODY_PEEL2:%.*]]
+; CHECK: for.body.peel2:
+; CHECK-NEXT: [[COND_PEEL3:%.*]] = tail call i32 @llvm.umax.i32(i32 [[INC_PEEL]], i32 2)
+; CHECK-NEXT: tail call void @foo(i32 [[COND_PEEL3]])
+; CHECK-NEXT: [[INC_PEEL4:%.*]] = add nuw i32 [[INC_PEEL]], 1
+; CHECK-NEXT: [[EXITCOND_NOT_PEEL5:%.*]] = icmp eq i32 [[INC_PEEL4]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT_PEEL5]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY_PEEL_NEXT1:%.*]]
+; CHECK: for.body.peel.next1:
+; CHECK-NEXT: br label [[FOR_BODY_PEEL_NEXT6:%.*]]
+; CHECK: for.body.peel.next6:
+; CHECK-NEXT: br label [[FOR_BODY_PREHEADER_PEEL_NEWPH:%.*]]
+; CHECK: for.body.preheader.peel.newph:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_06:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[INC_PEEL4]], [[FOR_BODY_PREHEADER_PEEL_NEWPH]] ]
+; CHECK-NEXT: tail call void @foo(i32 [[I_06]])
+; CHECK-NEXT: [[INC]] = add nuw i32 [[I_06]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK: for.cond.cleanup.loopexit.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp5.not = icmp eq i32 %N, 0
+ br i1 %cmp5.not, label %for.cond.cleanup, label %for.body
+
+for.body:
+ %i.06 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %cond = tail call i32 @llvm.umax.i32(i32 %i.06, i32 2)
+ tail call void @foo(i32 %cond)
+ %inc = add nuw i32 %i.06, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
+define void @test3(i32 %N) {
+; CHECK-LABEL: define void @test3(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP5:%.*]] = icmp slt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP5]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: br label [[FOR_BODY_PEEL_BEGIN:%.*]]
+; CHECK: for.body.peel.begin:
+; CHECK-NEXT: br label [[FOR_BODY_PEEL:%.*]]
+; CHECK: for.body.peel:
+; CHECK-NEXT: [[COND_PEEL:%.*]] = tail call i32 @llvm.smax.i32(i32 0, i32 -2)
+; CHECK-NEXT: tail call void @foo(i32 [[COND_PEEL]])
+; CHECK-NEXT: [[DEC_PEEL:%.*]] = add nsw i32 0, -1
+; CHECK-NEXT: [[CMP_PEEL:%.*]] = icmp sgt i32 [[DEC_PEEL]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_PEEL]], label [[FOR_BODY_PEEL_NEXT:%.*]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK: for.body.peel.next:
+; CHECK-NEXT: br label [[FOR_BODY_PEEL2:%.*]]
+; CHECK: for.body.peel2:
+; CHECK-NEXT: [[COND_PEEL3:%.*]] = tail call i32 @llvm.smax.i32(i32 [[DEC_PEEL]], i32 -2)
+; CHECK-NEXT: tail call void @foo(i32 [[COND_PEEL3]])
+; CHECK-NEXT: [[DEC_PEEL4:%.*]] = add nsw i32 [[DEC_PEEL]], -1
+; CHECK-NEXT: [[CMP_PEEL5:%.*]] = icmp sgt i32 [[DEC_PEEL4]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_PEEL5]], label [[FOR_BODY_PEEL_NEXT1:%.*]], label [[FOR_COND_CLEANUP_LOOPEXIT]]
+; CHECK: for.body.peel.next1:
+; CHECK-NEXT: br label [[FOR_BODY_PEEL_NEXT6:%.*]]
+; CHECK: for.body.peel.next6:
+; CHECK-NEXT: br label [[FOR_BODY_PREHEADER_PEEL_NEWPH:%.*]]
+; CHECK: for.body.preheader.peel.newph:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_06:%.*]] = phi i32 [ [[DEC:%.*]], [[FOR_BODY]] ], [ [[DEC_PEEL4]], [[FOR_BODY_PREHEADER_PEEL_NEWPH]] ]
+; CHECK-NEXT: tail call void @foo(i32 -2)
+; CHECK-NEXT: [[DEC]] = add nsw i32 [[I_06]], -1
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[DEC]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT:%.*]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: for.cond.cleanup.loopexit.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp5 = icmp slt i32 %N, 0
+ br i1 %cmp5, label %for.body, label %for.cond.cleanup
+
+for.body:
+ %i.06 = phi i32 [ %dec, %for.body ], [ 0, %entry ]
+ %cond = tail call i32 @llvm.smax.i32(i32 %i.06, i32 -2)
+ tail call void @foo(i32 %cond)
+ %dec = add nsw i32 %i.06, -1
+ %cmp = icmp sgt i32 %dec, %N
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+define void @test4(i32 %N) {
+; CHECK-LABEL: define void @test4(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP5:%.*]] = icmp slt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP5]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: br label [[FOR_BODY_PEEL_BEGIN:%.*]]
+; CHECK: for.body.peel.begin:
+; CHECK-NEXT: br label [[FOR_BODY_PEEL:%.*]]
+; CHECK: for.body.peel:
+; CHECK-NEXT: [[COND_PEEL:%.*]] = tail call i32 @llvm.smin.i32(i32 0, i32 -2)
+; CHECK-NEXT: tail call void @foo(i32 noundef signext [[COND_PEEL]])
+; CHECK-NEXT: [[DEC_PEEL:%.*]] = add nsw i32 0, -1
+; CHECK-NEXT: [[CMP_PEEL:%.*]] = icmp sgt i32 [[DEC_PEEL]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_PEEL]], label [[FOR_BODY_PEEL_NEXT:%.*]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK: for.body.peel.next:
+; CHECK-NEXT: br label [[FOR_BODY_PEEL2:%.*]]
+; CHECK: for.body.peel2:
+; CHECK-NEXT: [[COND_PEEL3:%.*]] = tail call i32 @llvm.smin.i32(i32 [[DEC_PEEL]], i32 -2)
+; CHECK-NEXT: tail call void @foo(i32 noundef signext [[COND_PEEL3]])
+; CHECK-NEXT: [[DEC_PEEL4:%.*]] = add nsw i32 [[DEC_PEEL]], -1
+; CHECK-NEXT: [[CMP_PEEL5:%.*]] = icmp sgt i32 [[DEC_PEEL4]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_PEEL5]], label [[FOR_BODY_PEEL_NEXT1:%.*]], label [[FOR_COND_CLEANUP_LOOPEXIT]]
+; CHECK: for.body.peel.next1:
+; CHECK-NEXT: br label [[FOR_BODY_PEEL_NEXT6:%.*]]
+; CHECK: for.body.peel.next6:
+; CHECK-NEXT: br label [[FOR_BODY_PREHEADER_PEEL_NEWPH:%.*]]
+; CHECK: for.body.preheader.peel.newph:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_06:%.*]] = phi i32 [ [[DEC:%.*]], [[FOR_BODY]] ], [ [[DEC_PEEL4]], [[FOR_BODY_PREHEADER_PEEL_NEWPH]] ]
+; CHECK-NEXT: tail call void @foo(i32 noundef signext [[I_06]])
+; CHECK-NEXT: [[DEC]] = add nsw i32 [[I_06]], -1
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[DEC]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: for.cond.cleanup.loopexit.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp5 = icmp slt i32 %N, 0
+ br i1 %cmp5, label %for.body, label %for.cond.cleanup
+
+for.body:
+ %i.06 = phi i32 [ %dec, %for.body ], [ 0, %entry ]
+ %cond = tail call i32 @llvm.smin.i32(i32 %i.06, i32 -2)
+ tail call void @foo(i32 noundef signext %cond)
+ %dec = add nsw i32 %i.06, -1
+ %cmp = icmp sgt i32 %dec, %N
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
+define void @test_negative(i32 %End, i32 %Step) {
+; CHECK-LABEL: define void @test_negative(
+; CHECK-SAME: i32 [[END:%.*]], i32 [[STEP:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP_NOT5:%.*]] = icmp eq i32 [[END]], 0
+; CHECK-NEXT: br i1 [[CMP_NOT5]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_06:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[COND:%.*]] = tail call i32 @llvm.smin.i32(i32 [[I_06]], i32 2)
+; CHECK-NEXT: tail call void @foo(i32 [[COND]])
+; CHECK-NEXT: [[ADD]] = add nsw i32 [[I_06]], [[STEP]]
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[ADD]], [[END]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp.not5 = icmp eq i32 %End, 0
+ br i1 %cmp.not5, label %for.cond.cleanup, label %for.body
+
+for.body:
+ %i.06 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+ %cond = tail call i32 @llvm.smin.i32(i32 %i.06, i32 2)
+ tail call void @foo(i32 %cond)
+ %add = add nsw i32 %i.06, %Step
+ %cmp.not = icmp eq i32 %add, %End
+ br i1 %cmp.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.peeled.count", i32 2}
+; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]}
+;.
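To make the new ComputePeelCountMinMax logic concrete: for test1 above, the induction variable is the affine recurrence {0,+,1} and the bound is 2, so the strict ult predicate holds for iterations 0 and 1 and the desired peel count becomes 2. Below is a standalone model of that counting loop (plain C++ with 64-bit integers instead of SCEV expressions; all names are illustrative and not part of the patch):

// Standalone model of the peel-count loop in ComputePeelCountMinMax, assuming
// a signed affine recurrence {Start,+,Step} compared against a loop-invariant
// Bound. Not LLVM API; just the counting logic.
#include <cstdint>
#include <iostream>

unsigned desiredPeelCount(int64_t Start, int64_t Step, int64_t Bound,
                          unsigned MaxPeelCount) {
  if (Step == 0)
    return 0; // step of unknown/zero sign: give up, as the patch does
  unsigned Count = 0;
  int64_t IterVal = Start; // recurrence value at iteration `Count`
  // Keep peeling while the strict predicate (< for a positive step,
  // > for a negative step) still holds, up to the peeling limit.
  while (Count < MaxPeelCount &&
         (Step > 0 ? IterVal < Bound : IterVal > Bound)) {
    IterVal += Step;
    ++Count;
  }
  return Count;
}

int main() {
  std::cout << desiredPeelCount(0, 1, 2, 4) << "\n";   // test1 (umin vs. 2): prints 2
  std::cout << desiredPeelCount(0, -1, -2, 4) << "\n"; // test3 (smax vs. -2): prints 2
}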
The idea here looks reasonable, but I wonder if we can reuse the existing code more? Extract the part that works on a decomposed icmp into a separate function and then call it with the icmp for the select expansion of the min/max?
The amount of shared code will be quite small (because for min/max intrinsics we don't need to handle the special equality-comparison case), but we can try to split out the part that finds the number of iterations for which the predicate is still known.
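For illustration, the shared helper being discussed could take roughly this shape (a hypothetical fragment, not the actual patch code; it assumes the surrounding SE and MaxPeelCount from countToEliminateCompares). Both the branch/select path and the min/max path would reduce their case to a (Pred, IterVal, BoundSCEV, Step) tuple and advance the peel count with the same loop:

// Hypothetical shared helper (names are illustrative): count how many further
// iterations the strict predicate is still known to hold for.
auto PeelWhilePredicateIsKnown =
    [&](unsigned &PeelCount, const SCEV *&IterVal, const SCEV *BoundSCEV,
        const SCEV *Step, ICmpInst::Predicate Pred) {
      while (PeelCount < MaxPeelCount &&
             SE.isKnownPredicate(Pred, IterVal, BoundSCEV)) {
        IterVal = SE.getAddExpr(IterVal, Step);
        ++PeelCount;
      }
    };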
Nice idea, LGTM. Please wait for review from nikic@ as well
Force-pushed from c42f9cd to 72c0869.
The arm_mult_q15.ll test is failing. Maybe we need to guard against vector min/max?
Suggested change:
- define void @test1(i32 %N) {
+ define void @test_umin(i32 %N) {
Test names could be a bit more meaningful...
Renamed tests as suggested
Suggested change:
- declare void @foo(i32)
- declare i32 @llvm.smin.i32(i32, i32)
- declare i32 @llvm.smax.i32(i32, i32)
- declare i32 @llvm.umin.i32(i32, i32)
- declare i32 @llvm.umax.i32(i32, i32)
- declare void @bar(i8)
- declare i8 @llvm.umin.i8(i8, i8)
+ declare void @foo(i32)
+ declare void @bar(i8)
Intrinsic declarations are no longer needed.
done
Suggested change:
- ; RUN: opt < %s -S -passes=loop-unroll -unroll-peel-max-count=2 | FileCheck %s
+ ; RUN: opt < %s -S -passes=loop-unroll -unroll-peel-max-count=3 | FileCheck %s
I think that would show that peeling actually stops after the right number of iterations?
Increased the limit; also added a special test (test_max_count_threshold) to check that if we would need to peel more than max-peel-count iterations, we don't peel the loop at all (because such a transform would not help to eliminate the min/max intrinsic in the loop).
Force-pushed from 72c0869 to 6944e4f.
Compare
Yep, I didn't have the ARM backend enabled in my local build... Added a check to process only IntegerType min/max.
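The exact guard isn't shown in this thread; presumably it is an early bail-out at the top of ComputePeelCountMinMax along these lines (hypothetical sketch):

// Hypothetical sketch of the guard: only scalar integer min/max is handled,
// so vector intrinsics (e.g. from the ARM MVE tests) are skipped.
if (!II->getType()->isIntegerTy())
  return;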
Force-pushed from 6944e4f to 9f81074.
Compare
LGTM
Force-pushed from 9f81074 to 46263ea.
Compare