[LV] Optimise latch exit induction users for some early exit loops #128880
Conversation
This is the first of two PRs that attempt to improve the IR generated in the exit blocks of vectorised loops with uncountable early exits. In this PR I am improving the generated code for users of induction variables in early exit loops that have a unique exit block, when exiting via the latch. I intend to follow this up very soon with another patch to optimise the code for induction users in the vector.early.exit block.
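To make the scenario concrete, here is a minimal C++ sketch (not taken from this patch or its tests; the function name and constants are illustrative) of the kind of loop this targets: an uncountable early exit plus a counted latch exit, both feeding a single exit block that uses the induction variable.

```c++
#include <cstdint>

// Hypothetical example: search two byte buffers for the first mismatch.
// The break is an uncountable early exit; the loop condition is the
// counted latch exit. Both paths reach the single exit block, which
// reads the induction variable 'i'.
int64_t find_mismatch(const uint8_t *p1, const uint8_t *p2) {
  int64_t i = 3;
  for (; i != 67; ++i) {
    if (p1[i] != p2[i])
      break; // uncountable early exit
  }
  // When the loop leaves via the latch, this patch lets the vectoriser
  // feed the known end value into the exit phi instead of extracting a
  // lane from a widened induction vector.
  return i;
}
```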
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms

Author: David Sherwood (david-arm)

Changes

This is the first of two PRs that attempt to improve the IR generated in the exit blocks of vectorised loops with uncountable early exits. In this PR I am improving the generated code for users of induction variables in early exit loops that have a unique exit block, when exiting via the latch. I intend to follow this up very soon with another patch to optimise the code for induction users in the vector.early.exit block.

Patch is 20.00 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/128880.diff

2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8e773272300b1..bf643b0a51631 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -730,67 +730,74 @@ static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV) {
return IsWideIVInc() ? WideIV : nullptr;
}
-void VPlanTransforms::optimizeInductionExitUsers(
- VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues) {
+static VPValue *
+optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo,
+ VPBlockBase *PredVPBB, VPValue *Op,
+ DenseMap<VPValue *, VPValue *> &EndValues) {
using namespace VPlanPatternMatch;
- SmallVector<VPIRBasicBlock *> ExitVPBBs(Plan.getExitBlocks());
- if (ExitVPBBs.size() != 1)
- return;
- VPIRBasicBlock *ExitVPBB = ExitVPBBs[0];
- VPBlockBase *PredVPBB = ExitVPBB->getSinglePredecessor();
- if (!PredVPBB)
- return;
- assert(PredVPBB == Plan.getMiddleBlock() &&
- "predecessor must be the middle block");
-
- VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
- VPBuilder B(Plan.getMiddleBlock()->getTerminator());
- for (VPRecipeBase &R : *ExitVPBB) {
- auto *ExitIRI = cast<VPIRInstruction>(&R);
- if (!isa<PHINode>(ExitIRI->getInstruction()))
- break;
+ VPValue *Incoming;
+ if (!match(Op, m_VPInstruction<VPInstruction::ExtractFromEnd>(
+ m_VPValue(Incoming), m_SpecificInt(1))))
+ return nullptr;
- VPValue *Incoming;
- if (!match(ExitIRI->getOperand(0),
- m_VPInstruction<VPInstruction::ExtractFromEnd>(
- m_VPValue(Incoming), m_SpecificInt(1))))
- continue;
+ auto *WideIV = getOptimizableIVOf(Incoming);
+ if (!WideIV)
+ return nullptr;
- auto *WideIV = getOptimizableIVOf(Incoming);
- if (!WideIV)
- continue;
- VPValue *EndValue = EndValues.lookup(WideIV);
- assert(EndValue && "end value must have been pre-computed");
+ VPValue *EndValue = EndValues.lookup(WideIV);
+ assert(EndValue && "end value must have been pre-computed");
+
+ // This only happens if Incoming is the increment of an induction recipe.
+ if (Incoming != WideIV)
+ return EndValue;
+
+ // Otherwise subtract the step from the EndValue.
+ VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
+ VPValue *Step = WideIV->getStepValue();
+ Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
+ VPValue *Escape = nullptr;
+ if (ScalarTy->isIntegerTy()) {
+ Escape =
+ B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
+ } else if (ScalarTy->isPointerTy()) {
+ auto *Zero = Plan.getOrAddLiveIn(
+ ConstantInt::get(Step->getLiveInIRValue()->getType(), 0));
+ Escape =
+ B.createPtrAdd(EndValue, B.createNaryOp(Instruction::Sub, {Zero, Step}),
+ {}, "ind.escape");
+ } else if (ScalarTy->isFloatingPointTy()) {
+ const auto &ID = WideIV->getInductionDescriptor();
+ Escape = B.createNaryOp(
+ ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
+ ? Instruction::FSub
+ : Instruction::FAdd,
+ {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
+ } else {
+ llvm_unreachable("all possible induction types must be handled");
+ }
+ return Escape;
+}
- if (Incoming != WideIV) {
- ExitIRI->setOperand(0, EndValue);
- continue;
- }
+void VPlanTransforms::optimizeInductionExitUsers(
+ VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues) {
+ VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
+ VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
+ for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
+ for (VPRecipeBase &R : *ExitVPBB) {
+ auto *ExitIRI = cast<VPIRInstruction>(&R);
+ if (!isa<PHINode>(ExitIRI->getInstruction()))
+ break;
- VPValue *Escape = nullptr;
- VPValue *Step = WideIV->getStepValue();
- Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
- if (ScalarTy->isIntegerTy()) {
- Escape =
- B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
- } else if (ScalarTy->isPointerTy()) {
- auto *Zero = Plan.getOrAddLiveIn(
- ConstantInt::get(Step->getLiveInIRValue()->getType(), 0));
- Escape = B.createPtrAdd(EndValue,
- B.createNaryOp(Instruction::Sub, {Zero, Step}),
- {}, "ind.escape");
- } else if (ScalarTy->isFloatingPointTy()) {
- const auto &ID = WideIV->getInductionDescriptor();
- Escape = B.createNaryOp(
- ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
- ? Instruction::FSub
- : Instruction::FAdd,
- {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
- } else {
- llvm_unreachable("all possible induction types must be handled");
+ for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
+ if (PredVPBB == MiddleVPBB)
+ if (VPValue *Escape = optimizeLatchExitInductionUser(
+ Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx),
+ EndValues))
+ ExitIRI->setOperand(Idx, Escape);
+ // TODO: Optimize early exit induction users in follow-on patch.
+ }
}
- ExitIRI->setOperand(0, Escape);
}
}
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
index e24c6090b704b..616dd3c2caa4a 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
@@ -351,7 +351,6 @@ define i64 @same_exit_block_pre_inc_use2() {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]]
@@ -365,11 +364,9 @@ define i64 @same_exit_block_pre_inc_use2() {
; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true)
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.split:
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3
; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
@@ -391,7 +388,7 @@ define i64 @same_exit_block_pre_inc_use2() {
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[LOOP]] ], [ [[INDEX]], [[LOOP_INC]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 67, [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[LOOP]] ], [ [[INDEX]], [[LOOP_INC]] ], [ 66, [[MIDDLE_BLOCK]] ], [ 67, [[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
@@ -451,7 +448,6 @@ define i64 @same_exit_block_pre_inc_use3() {
; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: middle.split:
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3
; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
@@ -475,7 +471,7 @@ define i64 @same_exit_block_pre_inc_use3() {
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: loop.end:
-; CHECK-NEXT: [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ [[INDEX]], [[LOOP]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ [[INDEX]], [[LOOP]] ], [ 66, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: ret i64 [[INDEX_LCSSA]]
;
entry:
@@ -597,9 +593,6 @@ define i64 @same_exit_block_post_inc_use() {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 2
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 3
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
@@ -607,10 +600,6 @@ define i64 @same_exit_block_post_inc_use() {
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP0]], 1
-; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP1]], 1
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP2]], 1
-; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP3]], 1
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
; CHECK-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true)
; CHECK-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]])
@@ -642,7 +631,7 @@ define i64 @same_exit_block_post_inc_use() {
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[INDEX_NEXT]], [[LOOP_INC]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[INDEX_NEXT]], [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
@@ -684,7 +673,6 @@ define i64 @same_exit_block_post_inc_use2() {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 1
@@ -709,11 +697,9 @@ define i64 @same_exit_block_post_inc_use2() {
; CHECK-NEXT: [[TMP17:%.*]] = xor <4 x i1> [[TMP16]], splat (i1 true)
; CHECK-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]])
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP20:%.*]] = or i1 [[TMP18]], [[TMP19]]
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: middle.split:
-; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3
; CHECK-NEXT: br i1 [[TMP18]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
@@ -737,7 +723,7 @@ define i64 @same_exit_block_post_inc_use2() {
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP]] ], [ [[INDEX]], [[LOOP_INC]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP]] ], [ [[INDEX]], [[LOOP_INC]] ], [ 66, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: ret i64 [[RETVAL]]
;
entry:
@@ -869,7 +855,6 @@ define i64 @diff_exit_block_pre_inc_use2() {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]]
@@ -883,13 +868,11 @@ define i64 @diff_exit_block_pre_inc_use2() {
; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true)
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3
; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
; CHECK: vector.early.exit:
; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]]
@@ -912,7 +895,7 @@ define i64 @diff_exit_block_pre_inc_use2() {
; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ 67, [[LOOP]] ], [ 67, [[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: ret i64 [[RETVAL1]]
; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ 66, [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RETVAL2]]
;
entry:
@@ -978,7 +961,6 @@ define i64 @diff_exit_block_pre_inc_use3() {
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP7]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3
; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
@@ -1003,7 +985,7 @@ define i64 @diff_exit_block_pre_inc_use3() {
; CHECK-NEXT: [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: ret i64 [[INDEX_LCSSA]]
; CHECK: loop.end:
-; CHECK-NEXT: [[INDEX_LCSSA1:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[INDEX_LCSSA1:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ 66, [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[INDEX_LCSSA1]]
;
entry:
@@ -1050,9 +1032,6 @@ define i64 @diff_exit_block_post_inc_use1() {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 2
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 3
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
@@ -1060,10 +1039,6 @@ define i64 @diff_exit_block_post_inc_use1() {
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP0]], 1
-; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP1]], 1
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP2]], 1
-; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP3]], 1
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
; CHECK-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true)
; CHECK-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]])
@@ -1098,7 +1073,7 @@ define i64 @diff_exit_block_post_inc_use1() {
; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: ret i64 [[RETVAL1]]
; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP_INC]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RETVAL2]]
;
entry:
@@ -1144,7 +1119,6 @@ define i64 @diff_exit_block_post_inc_use2() {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 4, i64 5, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 1
@@ -1169,13 +1143,11 @@ define i64 @diff_exit_block_post_inc_use2() {
; CHECK-NEXT: [[TMP17:%.*]] = xor <4 x i1> [[TMP16]], splat (i1 true)
; CHECK-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]])
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT: [[TMP20:%.*]] = or i1 [[TMP18]], [[TMP19]]
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; CHECK: middle.split:
; CHECK-NEXT: br i1 [[TMP18]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3
; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]]
; CHECK: vector.early.exit:
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 true)
@@ -1200,7 +1172,7 @@ define i64 @diff_exit_block_post_inc_use2() {
; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
; CHECK-NEXT: ret i64 [[RETVAL1]]
; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ 66, [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[RETVAL2]]
;
en...
[truncated]
        continue;
      VPValue *EndValue = EndValues.lookup(WideIV);
      assert(EndValue && "end value must have been pre-computed");
  VPValue *EndValue = EndValues.lookup(WideIV);
I see the end values are populated in addScalarResumePhis(). Am I right in thinking this map goes from IV -> exit IV value from the scalar loop? Could this be non-constant? (In the changed tests it is always a constant.)
Yes, in theory this could be non-constant, although since the tests are all testing users of the induction variable used to control the latch exit, they end up being constant. This is because the only early exit loops we can currently vectorise are effectively ones with known trip counts, since the trip counts are used to prove that the loads always fall within the bounds of the object, such as an alloca. I'll see if I can add a test for a user of something that isn't the trip count.
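For illustration only (this is not one of the tests in the patch), the sort of loop that is currently vectorisable looks roughly like this: the buffers are fixed-size locals, so the known trip count is what proves the speculative vector loads stay in bounds.

```c++
#include <cstdint>
#include <cstring>

// Hedged sketch: fixed-size locals (allocas) plus a known trip count mean
// every vector load performed before the early-exit check is provably
// within the bounds of the objects.
int64_t first_diff(const uint8_t *src) {
  uint8_t a[64], b[64];
  std::memcpy(a, src, 64);
  std::memcpy(b, src + 64, 64);
  for (int64_t i = 0; i != 64; ++i) // known trip count
    if (a[i] != b[i])
      return i;                     // uncountable early exit
  return -1;                        // counted latch exit
}
```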
IIUC the patch moves the existing code to compute the end value into a helper and uses it for all exits from the latch. It would be good to make that clear in the patch description.
Gentle ping. :)
LGTM from my side, thanks for addressing my comments! Please wait to see if @fhahn has any further comments before landing.
Rebased and ran "make check-all" downstream. Looks good.
LGTM, thanks!
This is the second of two PRs that attempt to improve the IR generated in the exit blocks of vectorised loops with uncountable early exits. It follows on from PR llvm#128880. In this PR I am improving the generated code for users of induction variables in early exit blocks. This required adding a new VPInstruction called FirstActiveLane, which calculates the index of the first active predicate in the mask operand. I have added a new function optimizeEarlyExitInductionUser that is called from optimizeInductionExitUsers when handling users in early exit blocks.
This is the first of two PRs that attempt to improve the IR generated in the exit blocks of vectorised loops with uncountable early exits. In this PR I am improving the generated code for users of induction variables in early exit loops that have a unique exit block, when exiting via the latch.

I have moved some of the code for calculating the exit values in latch exit blocks from optimizeInductionExitUsers into a new function optimizeLatchExitInductionUser.

I intend to follow this up very soon with another patch to optimise the code for induction users in the vector.early.exit block.
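As a back-of-the-envelope check of the constants appearing in the updated tests, here is a small standalone sketch (assumed values, not code from the patch) of the integer escape computation performed by optimizeLatchExitInductionUser: the end value is Start + VectorTripCount * Step, and pre-increment users of the latch exit get that value minus one step.

```c++
#include <cstdint>
#include <cstdio>

int main() {
  // Assumed values matching the updated tests: induction starts at 3 with
  // step 1, and the vector loop runs for 64 iterations, so the
  // pre-computed end value is 3 + 64 = 67.
  int64_t Start = 3, Step = 1, VectorTripCount = 64;
  int64_t EndValue = Start + VectorTripCount * Step; // post-inc latch-exit value: 67
  int64_t Escape = EndValue - Step;                  // pre-inc latch-exit value: 66
  std::printf("post-inc: %lld, pre-inc: %lld\n", (long long)EndValue,
              (long long)Escape);
  return 0;
}
```

These are the 67 and 66 values that now appear directly in the middle.block incomings of the loop.end phis in the test diff above.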