diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index e75a1de548f7d..a0140f64eb643 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -693,6 +693,8 @@ class VPLiveOut : public VPUser { return true; } + bool onlyFirstLaneUsed(const VPValue *Op) const override; + PHINode *getPhi() const { return Phi; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 5eb99ffd1e10e..5cb10471771d1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -186,6 +186,13 @@ bool VPRecipeBase::mayHaveSideEffects() const { } } +bool VPLiveOut::onlyFirstLaneUsed(const VPValue *Op) const { + assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); + + return vputils::isUniformAfterVectorization(getOperand(0)) || + isa(Op); +} + void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) { auto Lane = VPLane::getLastLaneForVF(State.VF); VPValue *ExitValue = getOperand(0); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll index 8b64d7a083662..26b46ad7b7289 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll @@ -27,50 +27,56 @@ define ptr @test(ptr %start.1, ptr %start.2, ptr %end) { ; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START_1]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2 -; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2 -; CHECK-NEXT: [[TMP15:%.*]] = mul i64 8, [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP13]], 0 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP16]], i64 0 +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv2i64() +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[OFFSET_IDX]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = call @llvm.experimental.stepvector.nxv2i64() -; CHECK-NEXT: [[TMP18:%.*]] = add [[DOTSPLAT]], [[TMP17]] -; CHECK-NEXT: [[VECTOR_GEP:%.*]] = mul [[TMP18]], shufflevector ( insertelement ( poison, i64 8, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[VECTOR_GEP]] -; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP13]], 1 -; CHECK-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement poison, i64 [[TMP20]], i64 0 +; CHECK-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = mul [[TMP13]], shufflevector ( insertelement ( poison, i64 8, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP15:%.*]] = add [[DOTSPLAT]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 8 +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement poison, i64 [[TMP19]], i64 0 ; CHECK-NEXT: [[DOTSPLAT6:%.*]] = shufflevector [[DOTSPLATINSERT5]], poison, zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = call @llvm.experimental.stepvector.nxv2i64() -; CHECK-NEXT: [[TMP22:%.*]] = add [[DOTSPLAT6]], [[TMP21]] -; CHECK-NEXT: [[VECTOR_GEP7:%.*]] = mul [[TMP22]], shufflevector ( insertelement ( poison, i64 8, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[VECTOR_GEP7]] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 2 -; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP26]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 8 -; CHECK-NEXT: [[TMP29:%.*]] = add i64 [[OFFSET_IDX]], [[TMP28]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[START_2]], i64 [[TMP24]] -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[START_2]], i64 [[TMP29]] -; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i64, ptr [[TMP30]], i32 0 -; CHECK-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP34:%.*]] = mul i64 [[TMP33]], 2 -; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i64, ptr [[TMP30]], i64 [[TMP34]] -; CHECK-NEXT: store zeroinitializer, ptr [[TMP32]], align 8 +; CHECK-NEXT: [[TMP20:%.*]] = add [[DOTSPLAT6]], [[TMP12]] +; CHECK-NEXT: [[TMP21:%.*]] = mul [[TMP20]], shufflevector ( insertelement ( poison, i64 8, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP22:%.*]] = add [[DOTSPLAT]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[TMP19]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 8 +; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = add i64 [[TMP19]], 1 +; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 8 +; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[OFFSET_IDX]], [[TMP27]] +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START_1]], i64 [[TMP16]] +; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[START_1]], i64 [[TMP17]] +; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[START_1]], i64 [[TMP25]] +; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[START_1]], i64 [[TMP28]] +; CHECK-NEXT: [[OFFSET_IDX10:%.*]] = mul i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP29:%.*]] = add i64 [[OFFSET_IDX10]], 0 +; CHECK-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP31:%.*]] = mul i64 [[TMP30]], 2 +; CHECK-NEXT: [[TMP32:%.*]] = add i64 [[TMP31]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 8 +; CHECK-NEXT: [[TMP34:%.*]] = add i64 [[OFFSET_IDX10]], [[TMP33]] +; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i8, ptr [[START_2]], i64 [[TMP29]] +; CHECK-NEXT: [[NEXT_GEP12:%.*]] = getelementptr i8, ptr [[START_2]], i64 [[TMP34]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i64, ptr [[NEXT_GEP11]], i32 0 +; CHECK-NEXT: [[TMP36:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP37:%.*]] = mul i64 [[TMP36]], 2 +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i64, ptr [[NEXT_GEP11]], i64 [[TMP37]] ; CHECK-NEXT: store zeroinitializer, ptr [[TMP35]], align 8 +; CHECK-NEXT: store zeroinitializer, ptr [[TMP38]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] -; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP39]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: [[CMO:%.*]] = sub i64 [[N_VEC]], 1 -; CHECK-NEXT: [[TMP37:%.*]] = mul i64 [[CMO]], 8 -; CHECK-NEXT: [[IND_ESCAPE:%.*]] = getelementptr i8, ptr [[START_1]], i64 [[TMP37]] +; CHECK-NEXT: [[TMP40:%.*]] = mul i64 [[CMO]], 8 +; CHECK-NEXT: [[IND_ESCAPE:%.*]] = getelementptr i8, ptr [[START_1]], i64 [[TMP40]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START_1]], [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/vplan-optimize-ptr-induction.ll b/llvm/test/Transforms/LoopVectorize/vplan-optimize-ptr-induction.ll new file mode 100644 index 0000000000000..25d6c64a5fce6 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/vplan-optimize-ptr-induction.ll @@ -0,0 +1,98 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -scalable-vectorization=on -force-target-supports-scalable-vectors -passes=loop-vectorize < %s -S | FileCheck %s +define ptr @foo(ptr %y, float %alpha, i32 %N) { +; CHECK-LABEL: define ptr @foo( +; CHECK-SAME: ptr [[Y:%.*]], float [[ALPHA:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP3]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP0]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[Y]], i64 [[N_VEC]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, float [[ALPHA]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[TMP6:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv1i64() +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP6]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = add zeroinitializer, [[TMP3]] +; CHECK-NEXT: [[VECTOR_GEP:%.*]] = mul [[TMP8]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = add [[DOTSPLAT]], [[VECTOR_GEP]] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[Y]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = fadd fast [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store [[TMP13]], ptr [[TMP12]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP6]], [[TMP2]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: [[CMO:%.*]] = sub i64 [[N_VEC]], 1 +; CHECK-NEXT: [[IND_ESCAPE:%.*]] = getelementptr i8, ptr [[Y]], i64 [[CMO]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[Y]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]: +; CHECK-NEXT: [[END_0_LCSSA:%.*]] = phi ptr [ [[END_0:%.*]], %[[FOR_BODY]] ], [ [[IND_ESCAPE]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: [[RESULT:%.*]] = phi ptr [ [[Y]], %[[ENTRY]] ], [ [[END_0_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ] +; CHECK-NEXT: ret ptr [[RESULT]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[END_0]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP15]], [[ALPHA]] +; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[END_0]], i64 1 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; +entry: + %cmp3 = icmp sgt i32 %N, 0 + br i1 %cmp3, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext nneg i32 %N to i64 + br label %for.body + +for.cond.cleanup: + %result = phi ptr [ %y, %entry ], [ %end.0, %for.body ] + ret ptr %result + +for.body: + %end.0 = phi ptr [ %y, %for.body.preheader ], [ %incdec.ptr, %for.body ] + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %y, i64 %indvars.iv + %0 = load float, ptr %arrayidx, align 4 + %add = fadd fast float %0, %alpha + store float %add, ptr %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + %incdec.ptr = getelementptr inbounds i8, ptr %end.0, i64 1 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;.