From a84fd4681cdbc7d20d2d8817d46d4951e60b8cea Mon Sep 17 00:00:00 2001 From: wengliqin Date: Fri, 13 Sep 2024 18:53:01 +0800 Subject: [PATCH 1/6] [LV][EVL] Support icmp/fcmp instruction with EVL-vectorization --- .../Transforms/Vectorize/SLPVectorizer.cpp | 2 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 28 +++++ .../Transforms/Vectorize/VPlanTransforms.cpp | 3 +- ...rize-force-tail-with-evl-cond-reduction.ll | 2 +- ...ze-force-tail-with-evl-masked-loadstore.ll | 30 ++--- ...-force-tail-with-evl-reverse-load-store.ll | 1 + .../RISCV/vplan-vp-cmp-intrinsics.ll | 118 ++++++++++++++++++ 7 files changed, 166 insertions(+), 18 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 268546fe99e13..c5a2c92e54879 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -11716,7 +11716,7 @@ InstructionCost BoUpSLP::getSpillCost() const { if (auto *FPMO = dyn_cast(II)) FMF = FPMO->getFastMathFlags(); IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys, - FMF); + FMF, II); InstructionCost IntrCost = TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput); InstructionCost CallCost = TTI->getCallInstrCost( diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index de7023167df89..752cbfda93c96 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1457,6 +1457,34 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, void VPWidenEVLRecipe::execute(VPTransformState &State) { unsigned Opcode = getOpcode(); // TODO: Support other opcodes + if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) { + Value *Op1 = State.get(getOperand(0), 0); + Value *Op2 = State.get(getOperand(1), 0); + auto &Ctx = State.Builder.getContext(); + Value *Pred = MetadataAsValue::get( + Ctx, MDString::get(Ctx, CmpInst::getPredicateName(getPredicate()))); + + IRBuilderBase &BuilderIR = State.Builder; + VectorBuilder Builder(BuilderIR); + Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue()); + Builder.setMask(Mask).setEVL(State.get(getEVL(), /*NeedsScalar=*/true)); + + VectorType *DataType = VectorType::get(Type::getInt1Ty(Ctx), State.VF); + + Value *VPInst = Builder.createVectorInstruction(Opcode, DataType, + {Op1, Op2, Pred}, "vp.op"); + // if (isa(VPInst)) + // setFlags(cast(VPInst)); + if (VPInst) { + if (auto *VecOp = dyn_cast(VPInst)) + VecOp->copyIRFlags(getUnderlyingInstr()); + } + State.set(this, VPInst, 0); + State.addMetadata(VPInst, + dyn_cast_or_null(getUnderlyingValue())); + return; + } + if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode)) llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute"); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 355781f955052..43c5ff7d4ea7c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1476,7 +1476,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { .Case([&](VPWidenRecipe *W) -> VPRecipeBase * { unsigned Opcode = W->getOpcode(); if (!Instruction::isBinaryOp(Opcode) && - !Instruction::isUnaryOp(Opcode)) + !Instruction::isUnaryOp(Opcode) && + Opcode != Instruction::ICmp && Opcode != 
Instruction::FCmp) return nullptr; return new VPWidenEVLRecipe(*W, EVL); }) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll index fc12dd54f88df..f2ca2f8749525 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll @@ -282,7 +282,7 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]] ; IF-EVL-INLOOP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 ; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) -; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = icmp sgt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = call @llvm.vp.icmp.nxv4i32( [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer), metadata !"sgt", shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) ; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = select [[TMP16]], [[TMP19]], zeroinitializer ; IF-EVL-INLOOP-NEXT: [[TMP21:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, [[VP_OP_LOAD]], [[TMP20]], i32 [[TMP12]]) ; IF-EVL-INLOOP-NEXT: [[TMP22]] = add i32 [[TMP21]], [[VEC_PHI]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll index 99da5058fbf92..c915ffd9a80d9 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll @@ -45,18 +45,18 @@ define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) { ; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP11]] ; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = icmp ne [[VP_OP_LOAD]], zeroinitializer -; IF-EVL-NEXT: [[TMP18:%.*]] = select [[TMP14]], [[TMP17]], zeroinitializer -; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD3:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], [[TMP18]], i32 [[TMP10]]) -; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[VP_OP_LOAD]], [[VP_OP_LOAD3]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP]], ptr align 4 [[TMP20]], [[TMP18]], i32 [[TMP10]]) -; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP10]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.icmp.nxv4i32( [[VP_OP_LOAD]], zeroinitializer, metadata !"ne", shufflevector ( insertelement ( poison, i1 true, i64 0), poison, 
zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP17:%.*]] = select [[TMP14]], [[VP_OP]], zeroinitializer +; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[TMP18]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD3:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP19]], [[TMP17]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[VP_OP4:%.*]] = call @llvm.vp.add.nxv4i32( [[VP_OP_LOAD]], [[VP_OP_LOAD3]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP4]], ptr align 4 [[TMP19]], [[TMP17]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: @@ -65,13 +65,13 @@ define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) { ; IF-EVL: for.body: ; IF-EVL-NEXT: [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I_011]] -; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP1:%.*]] = icmp ne i32 [[TMP23]], 0 +; IF-EVL-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[CMP1:%.*]] = icmp ne i32 [[TMP22]], 0 ; IF-EVL-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; IF-EVL: if.then: ; IF-EVL-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_011]] -; IF-EVL-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4 -; IF-EVL-NEXT: [[ADD:%.*]] = add i32 [[TMP23]], [[TMP24]] +; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4 +; IF-EVL-NEXT: [[ADD:%.*]] = add i32 [[TMP22]], [[TMP23]] ; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX3]], align 4 ; IF-EVL-NEXT: br label [[FOR_INC]] ; IF-EVL: for.inc: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll index 9a001f36da7d4..8eff8b51c0345 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll @@ -166,6 +166,7 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; IF-EVL-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] + ; IF-EVL: middle.block: ; IF-EVL-NEXT: br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll new file mode 100644 index 
0000000000000..4557a3762595f --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll @@ -0,0 +1,118 @@ +; REQUIRES: asserts + +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ +; RUN: -force-tail-folding-style=data-with-evl \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s + +define void @vp_icmp(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { +; IF-EVL: VPlan 'Final VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { +; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF +; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; IF-EVL-NEXT: Live-in ir<%N> = original trip-count + +; IF-EVL: vector.ph: +; IF-EVL-NEXT: Successor(s): vector loop + +; IF-EVL: vector loop: { +; IF-EVL-NEXT: vector.body: +; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEX:%[0-9]+]]> +; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> +; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> +; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1> +; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> +; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> +; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> +; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> +; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> +; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]> +; IF-EVL-NEXT: WIDEN ir<[[ICMP:%.+]]> = vp.icmp sgt ir<[[LD1]]>, ir<[[LD2]]>, vp<[[EVL]]> +; IF-EVL-NEXT: WIDEN-CAST ir<[[ZEXT:%.+]]> = zext ir<[[ICMP]]> to i32 +; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> +; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]> +; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[ZEXT]]>, vp<[[EVL]]> +; IF-EVL-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 +; IF-EVL-NEXT: EMIT vp<[[IV_NEX]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> +; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]> +; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> +; IF-EVL-NEXT: No successors +; IF-EVL-NEXT: } + +entry: + %cmp12 = icmp sgt i64 %N, 0 + br i1 %cmp12, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv + %1 = load i32, ptr %arrayidx3, align 4 + %cmp4 = icmp sgt i32 %0, %1 + %conv5 = zext i1 %cmp4 to i32 + %arrayidx7 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv + store i32 %conv5, ptr %arrayidx7, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define void @vp_fcmp(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { +; IF-EVL: VPlan 'Final VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { +; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF +; IF-EVL-NEXT: 
Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; IF-EVL-NEXT: Live-in ir<%N> = original trip-count + +; IF-EVL: vector.ph: +; IF-EVL-NEXT: Successor(s): vector loop + +; IF-EVL: vector loop: { +; IF-EVL-NEXT: vector.body: +; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEX:%[0-9]+]]> +; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> +; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> +; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1> +; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> +; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> +; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> +; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> +; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> +; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]> +; IF-EVL-NEXT: WIDEN ir<[[FCMP:%.+]]> = vp.fcmp ogt ir<[[LD1]]>, ir<[[LD2]]>, vp<[[EVL]]> +; IF-EVL-NEXT: WIDEN-CAST ir<[[UITOFP:%.+]]> = uitofp ir<[[FCMP]]> to float +; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> +; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]> +; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[UITOFP]]>, vp<[[EVL]]> +; IF-EVL-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 +; IF-EVL-NEXT: EMIT vp<[[IV_NEX]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> +; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]> +; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> +; IF-EVL-NEXT: No successors +; IF-EVL-NEXT: } + +entry: + %cmp13 = icmp sgt i64 %N, 0 + br i1 %cmp13, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, ptr %b, i64 %indvars.iv + %0 = load float, ptr %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds float, ptr %c, i64 %indvars.iv + %1 = load float, ptr %arrayidx3, align 4 + %cmp4 = fcmp ogt float %0, %1 + %conv6 = uitofp i1 %cmp4 to float + %arrayidx8 = getelementptr inbounds float, ptr %a, i64 %indvars.iv + store float %conv6, ptr %arrayidx8, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} \ No newline at end of file From be51c9f57f0e8bf2f0324db41c6ad0e9f89d799c Mon Sep 17 00:00:00 2001 From: "Liqin.Weng" Date: Wed, 16 Oct 2024 18:03:55 +0800 Subject: [PATCH 2/6] rebase and fix the comments --- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 16 ++---- ...rize-force-tail-with-evl-cond-reduction.ll | 2 +- .../RISCV/vplan-vp-cmp-intrinsics.ll | 56 +++++++++---------- .../RISCV/vplan-vp-select-intrinsics.ll | 2 +- 4 files changed, 35 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 752cbfda93c96..ecedd504335ed 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1466,19 +1466,15 @@ void VPWidenEVLRecipe::execute(VPTransformState &State) { IRBuilderBase &BuilderIR = State.Builder; VectorBuilder Builder(BuilderIR); + Value *Mask = BuilderIR.CreateVectorSplat(State.VF, 
BuilderIR.getTrue()); Builder.setMask(Mask).setEVL(State.get(getEVL(), /*NeedsScalar=*/true)); - - VectorType *DataType = VectorType::get(Type::getInt1Ty(Ctx), State.VF); - - Value *VPInst = Builder.createVectorInstruction(Opcode, DataType, + VectorType *RetType = VectorType::get(Type::getInt1Ty(Ctx), State.VF); + Value *VPInst = Builder.createVectorInstruction(Opcode, RetType, {Op1, Op2, Pred}, "vp.op"); - // if (isa(VPInst)) - // setFlags(cast(VPInst)); - if (VPInst) { - if (auto *VecOp = dyn_cast(VPInst)) - VecOp->copyIRFlags(getUnderlyingInstr()); - } + if (auto *VecOp = dyn_cast(VPInst)) + VecOp->copyIRFlags(getUnderlyingInstr()); + State.set(this, VPInst, 0); State.addMetadata(VPInst, dyn_cast_or_null(getUnderlyingValue())); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll index f2ca2f8749525..1166d4cdc76b3 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll @@ -69,7 +69,7 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]] ; IF-EVL-INLOOP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 ; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) -; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = icmp sgt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = call @llvm.vp.icmp.nxv4i32( %vp.op.load, shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer), metadata !"sgt", shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %9) ; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = call @llvm.vp.select.nxv4i32( [[TMP19]], [[VP_OP_LOAD]], zeroinitializer, i32 [[TMP12]]) ; IF-EVL-INLOOP-NEXT: [[TMP21:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) ; IF-EVL-INLOOP-NEXT: [[TMP22]] = add i32 [[TMP21]], [[VEC_PHI]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll index 4557a3762595f..67de3460069b9 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll @@ -40,25 +40,24 @@ define void @vp_icmp(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; IF-EVL-NEXT: } entry: - %cmp12 = icmp sgt i64 %N, 0 - br i1 %cmp12, label %for.body, label %for.cond.cleanup + br label %loop -for.cond.cleanup: ; preds = %for.body, %entry - ret void - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] - %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv +loop: + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv %0 = load i32, ptr %arrayidx, align 4 - %arrayidx3 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv - %1 = load i32, ptr %arrayidx3, align 4 + %gep = getelementptr inbounds i32, ptr %c, i64 %iv + %1 = load i32, ptr %gep, align 
4 %cmp4 = icmp sgt i32 %0, %1 %conv5 = zext i1 %cmp4 to i32 - %arrayidx7 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv + %arrayidx7 = getelementptr inbounds i32, ptr %a, i64 %iv store i32 %conv5, ptr %arrayidx7, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond.not = icmp eq i64 %indvars.iv.next, %N - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void } define void @vp_fcmp(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { @@ -96,23 +95,22 @@ define void @vp_fcmp(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; IF-EVL-NEXT: } entry: - %cmp13 = icmp sgt i64 %N, 0 - br i1 %cmp13, label %for.body, label %for.cond.cleanup + br label %loop -for.cond.cleanup: - ret void - -for.body: - %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] - %arrayidx = getelementptr inbounds float, ptr %b, i64 %indvars.iv +loop: + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, ptr %b, i64 %iv %0 = load float, ptr %arrayidx, align 4 - %arrayidx3 = getelementptr inbounds float, ptr %c, i64 %indvars.iv - %1 = load float, ptr %arrayidx3, align 4 + %gep = getelementptr inbounds float, ptr %c, i64 %iv + %1 = load float, ptr %gep, align 4 %cmp4 = fcmp ogt float %0, %1 %conv6 = uitofp i1 %cmp4 to float - %arrayidx8 = getelementptr inbounds float, ptr %a, i64 %indvars.iv + %arrayidx8 = getelementptr inbounds float, ptr %a, i64 %iv store float %conv6, ptr %arrayidx8, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond.not = icmp eq i64 %indvars.iv.next, %N - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body -} \ No newline at end of file + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll index 6d6cfb5e9d18e..81ca2486588e5 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll @@ -27,7 +27,7 @@ ; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> ; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]> - ; IF-EVL-NEXT: WIDEN ir<[[CMP:%.+]]> = icmp sgt ir<[[LD1]]>, ir<[[LD2]]> + ; IF-EVL-NEXT: WIDEN ir<[[CMP:%.+]]> = vp.icmp sgt ir<[[LD1]]>, ir<[[LD2]]> ; IF-EVL-NEXT: WIDEN ir<[[SUB:%.+]]> = vp.sub ir<0>, ir<[[LD2]]>, vp<[[EVL]]> ; IF-EVL-NEXT: WIDEN-INTRINSIC vp<[[SELECT:%.+]]> = call llvm.vp.select(ir<[[CMP]]>, ir<%1>, ir<%2>, vp<[[EVL]]>) ; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = vp.add vp<[[SELECT]]>, ir<[[LD1]]>, vp<[[EVL]]> From b13d40b1cfbb4b9424f16240fab0d392383e9401 Mon Sep 17 00:00:00 2001 From: "Liqin.Weng" Date: Fri, 25 Oct 2024 17:48:37 +0800 Subject: [PATCH 3/6] Rebase && fix the comments --- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 62 +++++++++---------- .../Transforms/Vectorize/VPlanTransforms.cpp | 4 +- ...-force-tail-with-evl-reverse-load-store.ll | 2 +- .../RISCV/vplan-vp-cmp-intrinsics.ll | 12 ++-- 4 files changed, 38 insertions(+), 42 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp 
b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index ecedd504335ed..6cc306e6644e6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1456,10 +1456,9 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, void VPWidenEVLRecipe::execute(VPTransformState &State) { unsigned Opcode = getOpcode(); - // TODO: Support other opcodes if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) { - Value *Op1 = State.get(getOperand(0), 0); - Value *Op2 = State.get(getOperand(1), 0); + Value *Op1 = State.get(getOperand(0)); + Value *Op2 = State.get(getOperand(1)); auto &Ctx = State.Builder.getContext(); Value *Pred = MetadataAsValue::get( Ctx, MDString::get(Ctx, CmpInst::getPredicateName(getPredicate()))); @@ -1472,46 +1471,45 @@ void VPWidenEVLRecipe::execute(VPTransformState &State) { VectorType *RetType = VectorType::get(Type::getInt1Ty(Ctx), State.VF); Value *VPInst = Builder.createVectorInstruction(Opcode, RetType, {Op1, Op2, Pred}, "vp.op"); - if (auto *VecOp = dyn_cast(VPInst)) - VecOp->copyIRFlags(getUnderlyingInstr()); + if (isa(VPInst)) + setFlags(cast(VPInst)); - State.set(this, VPInst, 0); + State.set(this, VPInst); State.addMetadata(VPInst, dyn_cast_or_null(getUnderlyingValue())); return; } - if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode)) - llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute"); + if (Instruction::isBinaryOp(Opcode) || Instruction::isUnaryOp(Opcode)) { + State.setDebugLocFrom(getDebugLoc()); - State.setDebugLocFrom(getDebugLoc()); - - assert(State.get(getOperand(0))->getType()->isVectorTy() && - "VPWidenEVLRecipe should not be used for scalars"); + assert(State.get(getOperand(0))->getType()->isVectorTy() && + "VPWidenEVLRecipe should not be used for scalars"); - VPValue *EVL = getEVL(); - Value *EVLArg = State.get(EVL, /*NeedsScalar=*/true); - IRBuilderBase &BuilderIR = State.Builder; - VectorBuilder Builder(BuilderIR); - Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue()); + VPValue *EVL = getEVL(); + Value *EVLArg = State.get(EVL, /*NeedsScalar=*/true); + IRBuilderBase &BuilderIR = State.Builder; + VectorBuilder Builder(BuilderIR); + Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue()); - SmallVector Ops; - for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) { - VPValue *VPOp = getOperand(I); - Ops.push_back(State.get(VPOp)); - } + SmallVector Ops; + for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) { + VPValue *VPOp = getOperand(I); + Ops.push_back(State.get(VPOp)); + } - Builder.setMask(Mask).setEVL(EVLArg); - Value *VPInst = - Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, "vp.op"); - // Currently vp-intrinsics only accept FMF flags. - // TODO: Enable other flags when support is added. - if (isa(VPInst)) - setFlags(cast(VPInst)); + Builder.setMask(Mask).setEVL(EVLArg); + Value *VPInst = Builder.createVectorInstruction(Opcode, Ops[0]->getType(), + Ops, "vp.op"); + // Currently vp-intrinsics only accept FMF flags. + // TODO: Enable other flags when support is added. 
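// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: how the icmp/fcmp path above
// maps onto VectorBuilder. emitVPICmp and its parameters are hypothetical;
// only the VectorBuilder/IRBuilder calls mirror what the recipe does.
//
//   #include "llvm/IR/IRBuilder.h"
//   #include "llvm/IR/VectorBuilder.h"
//   using namespace llvm;
//
//   static Value *emitVPICmp(IRBuilderBase &IRB, ElementCount VF, Value *Op1,
//                            Value *Op2, Value *EVLArg, CmpInst::Predicate P) {
//     LLVMContext &Ctx = IRB.getContext();
//     // llvm.vp.icmp/llvm.vp.fcmp take the predicate as a metadata string,
//     // e.g. !"sgt" or !"ogt".
//     Value *Pred = MetadataAsValue::get(
//         Ctx, MDString::get(Ctx, CmpInst::getPredicateName(P)));
//     VectorBuilder VB(IRB);
//     // All-true mask: under EVL tail folding, masking is carried by EVL.
//     VB.setMask(IRB.CreateVectorSplat(VF, IRB.getTrue())).setEVL(EVLArg);
//     // A VP compare returns a mask vector of i1 elements.
//     Type *RetTy = VectorType::get(Type::getInt1Ty(Ctx), VF);
//     return VB.createVectorInstruction(Instruction::ICmp, RetTy,
//                                       {Op1, Op2, Pred}, "vp.op");
//   }
// ---------------------------------------------------------------------------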
+    if (isa<FPMathOperator>(VPInst))
+      setFlags(cast<Instruction>(VPInst));
 
-  State.set(this, VPInst);
-  State.addMetadata(VPInst,
-                    dyn_cast_or_null<Instruction>(getUnderlyingValue()));
+    State.set(this, VPInst);
+    State.addMetadata(VPInst,
+                      dyn_cast_or_null<Instruction>(getUnderlyingValue()));
+  }
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 43c5ff7d4ea7c..f402b48af0f17 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1475,9 +1475,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
         })
         .Case<VPWidenRecipe>([&](VPWidenRecipe *W) -> VPRecipeBase * {
           unsigned Opcode = W->getOpcode();
-          if (!Instruction::isBinaryOp(Opcode) &&
-              !Instruction::isUnaryOp(Opcode) &&
-              Opcode != Instruction::ICmp && Opcode != Instruction::FCmp)
+          if (Opcode == Instruction::Freeze)
             return nullptr;
           return new VPWidenEVLRecipe(*W, EVL);
         })
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index 8eff8b51c0345..c291128b1e50c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -143,7 +143,7 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
 ; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[TMP7]]
 ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, zeroinitializer), i32 [[TMP5]])
-; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 100, i64 0), <vscale x 4 x i32> poison, zeroinitializer)
+; IF-EVL-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 100, i64 0), <vscale x 4 x i32> poison, zeroinitializer), metadata !"slt", <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, zeroinitializer), i32 [[TMP5]])
 ; IF-EVL-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> zeroinitializer
 ; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP11]]
 ; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP4]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll
index 67de3460069b9..312bb75a17622 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll
@@ -17,7 +17,7 @@ define void @vp_icmp(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; IF-EVL: vector loop: {
 ; IF-EVL-NEXT: vector.body:
 ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
-; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEX:%[0-9]+]]>
+; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
 ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
 ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>
@@ -33,8 +33,8 @@ define void @vp_icmp(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
 ; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[ZEXT]]>, vp<[[EVL]]>
 ; IF-EVL-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
-; IF-EVL-NEXT: EMIT vp<[[IV_NEX]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
-; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
+; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
+; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
 ; IF-EVL-NEXT: No successors
 ; IF-EVL-NEXT: }
@@ -72,7 +72,7 @@ define void @vp_fcmp(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; IF-EVL: vector loop: {
 ; IF-EVL-NEXT: vector.body:
 ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
-; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEX:%[0-9]+]]>
+; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]>
 ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
 ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>
@@ -88,8 +88,8 @@ define void @vp_fcmp(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
 ; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[UITOFP]]>, vp<[[EVL]]>
 ; IF-EVL-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
-; IF-EVL-NEXT: EMIT vp<[[IV_NEX]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
-; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
+; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
+; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
 ; IF-EVL-NEXT: No successors
 ; IF-EVL-NEXT: }

From 3264b283cbddb526d9866647e4f54b506f086e45 Mon Sep 17 00:00:00 2001
From: LiqinWeng
Date: Wed, 30 Oct 2024 18:20:24 +0800
Subject: [PATCH 4/6] [VPlan] Use VPWidenIntrinsicRecipe to support binary and
 unary operations with EVL-vectorization

---
 llvm/include/llvm/IR/VectorBuilder.h          |   4 +-
 llvm/lib/IR/VectorBuilder.cpp                 |   9 +-
 llvm/lib/Transforms/Utils/LoopUtils.cpp       |   4 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |  79 ++--------
 .../Transforms/Vectorize/VPlanAnalysis.cpp    |   5 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 141 +++++++-----------
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  12 +-
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |   1 -
 .../Transforms/Vectorize/VPlanVerifier.cpp    |   4 -
 .../RISCV/vplan-vp-intrinsics.ll              |   2 +-
 .../RISCV/vplan-vp-select-intrinsics.ll       |   6 +-
 11 files changed, 94 insertions(+), 173 deletions(-)

diff --git a/llvm/include/llvm/IR/VectorBuilder.h b/llvm/include/llvm/IR/VectorBuilder.h
index b0277c2b52595..830163984e37b 100644
--- a/llvm/include/llvm/IR/VectorBuilder.h
+++ b/llvm/include/llvm/IR/VectorBuilder.h
@@ -99,11 +99,11 @@ class VectorBuilder {
                                  const Twine &Name = Twine());
 
   /// Emit a VP reduction intrinsic call for recurrence kind.
-  /// \param RdxID The intrinsic ID of llvm.vector.reduce.*
+  /// \param ID The intrinsic ID of the call intrinsic
   /// \param ValTy The type of operand which the reduction operation is
   ///        performed.
   /// \param VecOpArray The operand list.
- Value *createSimpleReduction(Intrinsic::ID RdxID, Type *ValTy, + Value *createSimpleIntrinsic(Intrinsic::ID ID, Type *ValTy, ArrayRef VecOpArray, const Twine &Name = Twine()); }; diff --git a/llvm/lib/IR/VectorBuilder.cpp b/llvm/lib/IR/VectorBuilder.cpp index 737f49b1334d7..d629a2fb6af7b 100644 --- a/llvm/lib/IR/VectorBuilder.cpp +++ b/llvm/lib/IR/VectorBuilder.cpp @@ -60,13 +60,12 @@ Value *VectorBuilder::createVectorInstruction(unsigned Opcode, Type *ReturnTy, return createVectorInstructionImpl(VPID, ReturnTy, InstOpArray, Name); } -Value *VectorBuilder::createSimpleReduction(Intrinsic::ID RdxID, - Type *ValTy, +Value *VectorBuilder::createSimpleIntrinsic(Intrinsic::ID ID, Type *ValTy, ArrayRef InstOpArray, const Twine &Name) { - auto VPID = VPIntrinsic::getForIntrinsic(RdxID); - assert(VPReductionIntrinsic::isVPReduction(VPID) && - "No VPIntrinsic for this reduction"); + auto VPID = VPIntrinsic::getForIntrinsic(ID); + assert(VPIntrinsic::isVPIntrinsic(VPID) && + "No VPIntrinsic for this Intrinsic"); return createVectorInstructionImpl(VPID, ValTy, InstOpArray, Name); } diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 70047273c3b9a..2dac2d43f7f3a 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1300,7 +1300,7 @@ Value *llvm::createSimpleReduction(VectorBuilder &VBuilder, Value *Src, Type *SrcEltTy = SrcTy->getElementType(); Value *Iden = getRecurrenceIdentity(Kind, SrcEltTy, Desc.getFastMathFlags()); Value *Ops[] = {Iden, Src}; - return VBuilder.createSimpleReduction(Id, SrcTy, Ops); + return VBuilder.createSimpleIntrinsic(Id, SrcTy, Ops); } Value *llvm::createReduction(IRBuilderBase &B, @@ -1343,7 +1343,7 @@ Value *llvm::createOrderedReduction(VectorBuilder &VBuilder, Intrinsic::ID Id = getReductionIntrinsicID(RecurKind::FAdd); auto *SrcTy = cast(Src->getType()); Value *Ops[] = {Start, Src}; - return VBuilder.createSimpleReduction(Id, SrcTy, Ops); + return VBuilder.createSimpleIntrinsic(Id, SrcTy, Ops); } void llvm::propagateIRFlags(Value *I, ArrayRef VL, Value *OpValue, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 0e0c64f6df9cb..4300c3fb02c87 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -912,7 +912,6 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPWidenGEPSC: case VPRecipeBase::VPWidenIntrinsicSC: case VPRecipeBase::VPWidenSC: - case VPRecipeBase::VPWidenEVLSC: case VPRecipeBase::VPWidenSelectSC: case VPRecipeBase::VPBlendSC: case VPRecipeBase::VPPredInstPHISC: @@ -1107,7 +1106,6 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe { static inline bool classof(const VPRecipeBase *R) { return R->getVPDefID() == VPRecipeBase::VPInstructionSC || R->getVPDefID() == VPRecipeBase::VPWidenSC || - R->getVPDefID() == VPRecipeBase::VPWidenEVLSC || R->getVPDefID() == VPRecipeBase::VPWidenGEPSC || R->getVPDefID() == VPRecipeBase::VPWidenCastSC || R->getVPDefID() == VPRecipeBase::VPReplicateSC || @@ -1474,16 +1472,11 @@ class VPIRInstruction : public VPRecipeBase { class VPWidenRecipe : public VPRecipeWithIRFlags { unsigned Opcode; -protected: - template - VPWidenRecipe(unsigned VPDefOpcode, Instruction &I, - iterator_range Operands) - : VPRecipeWithIRFlags(VPDefOpcode, Operands, I), Opcode(I.getOpcode()) {} - public: template VPWidenRecipe(Instruction &I, iterator_range Operands) - : VPWidenRecipe(VPDef::VPWidenSC, I, Operands) {} + : 
VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), + Opcode(I.getOpcode()) {} ~VPWidenRecipe() override = default; @@ -1493,15 +1486,7 @@ class VPWidenRecipe : public VPRecipeWithIRFlags { return R; } - static inline bool classof(const VPRecipeBase *R) { - return R->getVPDefID() == VPRecipeBase::VPWidenSC || - R->getVPDefID() == VPRecipeBase::VPWidenEVLSC; - } - - static inline bool classof(const VPUser *U) { - auto *R = dyn_cast(U); - return R && classof(R); - } + VP_CLASSOF_IMPL(VPDef::VPWidenSC) /// Produce a widened instruction using the opcode and operands of the recipe, /// processing State.VF elements. @@ -1520,54 +1505,6 @@ class VPWidenRecipe : public VPRecipeWithIRFlags { #endif }; -/// A recipe for widening operations with vector-predication intrinsics with -/// explicit vector length (EVL). -class VPWidenEVLRecipe : public VPWidenRecipe { - using VPRecipeWithIRFlags::transferFlags; - -public: - template - VPWidenEVLRecipe(Instruction &I, iterator_range Operands, VPValue &EVL) - : VPWidenRecipe(VPDef::VPWidenEVLSC, I, Operands) { - addOperand(&EVL); - } - VPWidenEVLRecipe(VPWidenRecipe &W, VPValue &EVL) - : VPWidenEVLRecipe(*W.getUnderlyingInstr(), W.operands(), EVL) { - transferFlags(W); - } - - ~VPWidenEVLRecipe() override = default; - - VPWidenRecipe *clone() override final { - llvm_unreachable("VPWidenEVLRecipe cannot be cloned"); - return nullptr; - } - - VP_CLASSOF_IMPL(VPDef::VPWidenEVLSC); - - VPValue *getEVL() { return getOperand(getNumOperands() - 1); } - const VPValue *getEVL() const { return getOperand(getNumOperands() - 1); } - - /// Produce a vp-intrinsic using the opcode and operands of the recipe, - /// processing EVL elements. - void execute(VPTransformState &State) override final; - - /// Returns true if the recipe only uses the first lane of operand \p Op. - bool onlyFirstLaneUsed(const VPValue *Op) const override { - assert(is_contained(operands(), Op) && - "Op must be an operand of the recipe"); - // EVL in that recipe is always the last operand, thus any use before means - // the VPValue should be vectorized. - return getEVL() == Op; - } - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override final; -#endif -}; - /// VPWidenCastRecipe is a recipe to create vector cast instructions. class VPWidenCastRecipe : public VPRecipeWithIRFlags { /// Cast instruction opcode. 
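// ---------------------------------------------------------------------------
// Editor's note, not part of the patch: with VPWidenEVLRecipe deleted above,
// the EVL transform instead wraps a widened instruction in the generic
// VPWidenIntrinsicRecipe, keyed by the VP intrinsic matching its opcode.
// Condensed from the VPlanTransforms.cpp hunk later in this same patch:
//
//   SmallVector<VPValue *> Ops(W->operands());
//   Ops.push_back(&EVL);
//   Intrinsic::ID VPID = VPIntrinsic::getForOpcode(W->getOpcode());
//   auto *I = cast<Instruction>(W->getUnderlyingInstr());
//   return new VPWidenIntrinsicRecipe(*I, VPID,
//                                     make_range(Ops.begin(), Ops.end()),
//                                     I->getType(), I->getDebugLoc());
// ---------------------------------------------------------------------------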
@@ -1686,6 +1623,16 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags {
         MayWriteToMemory(CI.mayWriteToMemory()),
         MayHaveSideEffects(CI.mayHaveSideEffects()) {}
 
+  template <typename IterT>
+  VPWidenIntrinsicRecipe(Instruction &I, Intrinsic::ID VectorIntrinsicID,
+                         iterator_range<IterT> Operands, Type *Ty,
+                         DebugLoc DL = {})
+      : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, Operands, I),
+        VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty),
+        MayReadFromMemory(I.mayReadFromMemory()),
+        MayWriteToMemory(I.mayWriteToMemory()),
+        MayHaveSideEffects(I.mayHaveSideEffects()) {}
+
   VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID,
                          ArrayRef<VPValue *> CallArguments, Type *Ty,
                          DebugLoc DL = {})
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 8b8ab6be99b0d..f50a1286316a0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -267,9 +267,8 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
               [this](const VPRecipeBase *R) {
                 return inferScalarType(R->getOperand(0));
               })
-          .Case(
+          .Case(
               [this](const auto *R) { return inferScalarTypeForRecipe(R); })
          .Case([](const VPWidenIntrinsicRecipe *R) {
            return R->getResultType();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 6cc306e6644e6..e9b3ce171b3b5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -99,7 +99,6 @@ bool VPRecipeBase::mayWriteToMemory() const {
   case VPWidenLoadSC:
   case VPWidenPHISC:
   case VPWidenSC:
-  case VPWidenEVLSC:
   case VPWidenSelectSC: {
     const Instruction *I =
         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
@@ -143,7 +142,6 @@ bool VPRecipeBase::mayReadFromMemory() const {
   case VPWidenIntOrFpInductionSC:
   case VPWidenPHISC:
   case VPWidenSC:
-  case VPWidenEVLSC:
   case VPWidenSelectSC: {
     const Instruction *I =
         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
@@ -184,7 +182,6 @@ bool VPRecipeBase::mayHaveSideEffects() const {
   case VPWidenPHISC:
   case VPWidenPointerInductionSC:
   case VPWidenSC:
-  case VPWidenEVLSC:
   case VPWidenSelectSC: {
     const Instruction *I =
         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
@@ -994,24 +991,53 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
     Args.push_back(Arg);
   }
 
-  // Use vector version of the intrinsic.
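// ---------------------------------------------------------------------------
// Editor's note, not part of the patch: with the VP path added in this hunk,
// a widened binary op in an EVL-folded loop is emitted through VectorBuilder
// as a vp intrinsic rather than a plain vector call. Illustrative IR,
// assuming i32 elements and VF = vscale x 4:
//
//   %vp.call = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(
//       <vscale x 4 x i32> %x, <vscale x 4 x i32> %y,
//       <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement
//           (<vscale x 4 x i1> poison, i1 true, i64 0),
//           <vscale x 4 x i1> poison, zeroinitializer),
//       i32 %evl)
// ---------------------------------------------------------------------------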
- Module *M = State.Builder.GetInsertBlock()->getModule(); - Function *VectorF = - Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl); - assert(VectorF && "Can't retrieve vector intrinsic."); + if (VPIntrinsic::isVPIntrinsic(VectorIntrinsicID) && + VectorIntrinsicID != Intrinsic::vp_select) { + VectorBuilder VBuilder(State.Builder); + Value *Mask = + State.Builder.CreateVectorSplat(State.VF, State.Builder.getTrue()); + VBuilder.setMask(Mask).setEVL(Args.back()); + // Remove EVL from Args + Args.pop_back(); + + if (VectorIntrinsicID == Intrinsic::vp_icmp || + VectorIntrinsicID == Intrinsic::vp_fcmp) { + auto &Ctx = State.Builder.getContext(); + Value *Pred = MetadataAsValue::get( + Ctx, MDString::get(Ctx, CmpInst::getPredicateName(getPredicate()))); + Args.push_back(Pred); + } - auto *CI = cast_or_null(getUnderlyingValue()); - SmallVector OpBundles; - if (CI) - CI->getOperandBundlesAsDefs(OpBundles); + Value *VPInst = VBuilder.createSimpleIntrinsic( + VectorIntrinsicID, TysForDecl[0], Args, "vp.call"); - CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles); + if (isa(VPInst)) + setFlags(cast(VPInst)); - setFlags(V); + if (!VPInst->getType()->isVoidTy()) + State.set(this, VPInst); + State.addMetadata(VPInst, + dyn_cast_or_null(getUnderlyingValue())); + } else { + // Use vector version of the intrinsic. + Module *M = State.Builder.GetInsertBlock()->getModule(); + Function *VectorF = + Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl); + assert(VectorF && "Can't retrieve vector intrinsic."); - if (!V->getType()->isVoidTy()) - State.set(this, V); - State.addMetadata(V, CI); + auto *CI = cast_or_null(getUnderlyingValue()); + SmallVector OpBundles; + if (CI) + CI->getOperandBundlesAsDefs(OpBundles); + + CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles); + + setFlags(V); + + if (!V->getType()->isVoidTy()) + State.set(this, V); + State.addMetadata(V, CI); + } } InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, @@ -1043,6 +1069,20 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, ParamTys.push_back( ToVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF)); + // TODO: Implment in cost model + if (std::optional FOp = + VPIntrinsic::getFunctionalOpcodeForVP(VectorIntrinsicID)) { + if (FOp == Instruction::FNeg) { + // Instruction *CtxI = + dyn_cast_or_null(getUnderlyingValue()); + Type *VectorTy = ToVectorTy(getResultType(), VF); + return Ctx.TTI.getArithmeticInstrCost( + FOp.value(), VectorTy, CostKind, + {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, + {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}); + } + } + // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst. FastMathFlags FMF = hasFastMathFlags() ? 
getFastMathFlags() : FastMathFlags(); IntrinsicCostAttributes CostAttrs( @@ -1454,64 +1494,6 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, } } -void VPWidenEVLRecipe::execute(VPTransformState &State) { - unsigned Opcode = getOpcode(); - if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) { - Value *Op1 = State.get(getOperand(0)); - Value *Op2 = State.get(getOperand(1)); - auto &Ctx = State.Builder.getContext(); - Value *Pred = MetadataAsValue::get( - Ctx, MDString::get(Ctx, CmpInst::getPredicateName(getPredicate()))); - - IRBuilderBase &BuilderIR = State.Builder; - VectorBuilder Builder(BuilderIR); - - Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue()); - Builder.setMask(Mask).setEVL(State.get(getEVL(), /*NeedsScalar=*/true)); - VectorType *RetType = VectorType::get(Type::getInt1Ty(Ctx), State.VF); - Value *VPInst = Builder.createVectorInstruction(Opcode, RetType, - {Op1, Op2, Pred}, "vp.op"); - if (isa(VPInst)) - setFlags(cast(VPInst)); - - State.set(this, VPInst); - State.addMetadata(VPInst, - dyn_cast_or_null(getUnderlyingValue())); - return; - } - - if (Instruction::isBinaryOp(Opcode) || Instruction::isUnaryOp(Opcode)) { - State.setDebugLocFrom(getDebugLoc()); - - assert(State.get(getOperand(0))->getType()->isVectorTy() && - "VPWidenEVLRecipe should not be used for scalars"); - - VPValue *EVL = getEVL(); - Value *EVLArg = State.get(EVL, /*NeedsScalar=*/true); - IRBuilderBase &BuilderIR = State.Builder; - VectorBuilder Builder(BuilderIR); - Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue()); - - SmallVector Ops; - for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) { - VPValue *VPOp = getOperand(I); - Ops.push_back(State.get(VPOp)); - } - - Builder.setMask(Mask).setEVL(EVLArg); - Value *VPInst = Builder.createVectorInstruction(Opcode, Ops[0]->getType(), - Ops, "vp.op"); - // Currently vp-intrinsics only accept FMF flags. - // TODO: Enable other flags when support is added. - if (isa(VPInst)) - setFlags(cast(VPInst)); - - State.set(this, VPInst); - State.addMetadata(VPInst, - dyn_cast_or_null(getUnderlyingValue())); - } -} - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { @@ -1521,15 +1503,6 @@ void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, printFlags(O); printOperands(O, SlotTracker); } - -void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "WIDEN "; - printAsOperand(O, SlotTracker); - O << " = vp." 
<< Instruction::getOpcodeName(getOpcode()); - printFlags(O); - printOperands(O, SlotTracker); -} #endif void VPWidenCastRecipe::execute(VPTransformState &State) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index f402b48af0f17..cf963d8000efa 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1475,9 +1475,17 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { }) .Case([&](VPWidenRecipe *W) -> VPRecipeBase * { unsigned Opcode = W->getOpcode(); - if (Opcode == Instruction::Freeze) + // TODO: Support other opcodes + if (!Instruction::isBinaryOp(Opcode) && + !Instruction::isUnaryOp(Opcode)) return nullptr; - return new VPWidenEVLRecipe(*W, EVL); + auto *I = cast(W->getUnderlyingInstr()); + SmallVector Ops(W->operands()); + Ops.push_back(&EVL); + Intrinsic::ID VPID = VPIntrinsic::getForOpcode(W->getOpcode()); + return new VPWidenIntrinsicRecipe( + *I, VPID, make_range(Ops.begin(), Ops.end()), I->getType(), + I->getDebugLoc()); }) .Case([&](VPReductionRecipe *Red) { VPValue *NewMask = GetNewMask(Red->getCondOp()); diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 89b3ed72b8eb6..9e1f5b3cf6130 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -357,7 +357,6 @@ class VPDef { VPWidenStoreEVLSC, VPWidenStoreSC, VPWidenSC, - VPWidenEVLSC, VPWidenSelectSC, VPBlendSC, VPHistogramSC, diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 7ea5ee341cc54..9036dbc9a4c36 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -148,10 +148,6 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { .Case([&](const VPWidenLoadEVLRecipe *L) { return VerifyEVLUse(*L, 1); }) - .Case([&](const VPWidenEVLRecipe *W) { - return VerifyEVLUse( - *W, Instruction::isUnaryOp(W->getOpcode()) ? 
1 : 2); - }) .Case([&](const VPReductionEVLRecipe *R) { return VerifyEVLUse(*R, 2); }) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll index 706b6f8882984..f1f7dbca0b10b 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll @@ -32,7 +32,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> ; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]> -; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = vp.add nsw ir<[[LD2]]>, ir<[[LD1]]>, vp<[[EVL]]> +; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[ADD:%.+]]> = call nsw llvm.vp.add(ir<[[LD2]]>, ir<[[LD1]]>, vp<[[EVL]]>) ; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]> ; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]> diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll index 81ca2486588e5..f174f9d007ef5 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll @@ -27,10 +27,10 @@ ; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> ; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]> - ; IF-EVL-NEXT: WIDEN ir<[[CMP:%.+]]> = vp.icmp sgt ir<[[LD1]]>, ir<[[LD2]]> - ; IF-EVL-NEXT: WIDEN ir<[[SUB:%.+]]> = vp.sub ir<0>, ir<[[LD2]]>, vp<[[EVL]]> + ; IF-EVL-NEXT: WIDEN ir<[[CMP:%.+]]> = icmp sgt ir<[[LD1]]>, ir<[[LD2]]> + ; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[SUB:%.+]]> = call llvm.vp.sub(ir<0>, ir<[[LD2]]>, vp<[[EVL]]>) ; IF-EVL-NEXT: WIDEN-INTRINSIC vp<[[SELECT:%.+]]> = call llvm.vp.select(ir<[[CMP]]>, ir<%1>, ir<%2>, vp<[[EVL]]>) - ; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = vp.add vp<[[SELECT]]>, ir<[[LD1]]>, vp<[[EVL]]> + ; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[ADD:%.+]]> = call llvm.vp.add(vp<[[SELECT]]>, ir<[[LD1]]>, vp<[[EVL]]>) ; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR3:%.+]]> = vector-pointer ir<[[GEP3]]> ; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]> From 74a432114af40ffe38b0681792ba4fa12bb3f5dc Mon Sep 17 00:00:00 2001 From: LiqinWeng Date: Wed, 30 Oct 2024 19:19:23 +0800 Subject: [PATCH 5/6] Remove some unused code && implement the vp_fneg cost in TTI --- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 2 ++ .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 22 ------------------- 2 files changed, 2 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 395baa5f1aab9..5443d95a54bf4 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1106,6 +1106,8 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, case Intrinsic::vp_udiv: case Intrinsic::vp_urem: case Intrinsic::vp_xor: + // TODO: add new patch for it. + case Intrinsic::vp_fneg: // vp float arithmetic ops. 
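// ---------------------------------------------------------------------------
// Editor's note, not part of the patch: these vp_* cases fall through to a
// shared handler in RISCVTTIImpl::getIntrinsicInstrCost (context not shown
// in this hunk) that, roughly, costs a VP op via its functional opcode:
//
//   if (std::optional<unsigned> FOp =
//           VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID()))
//     return getArithmeticInstrCost(*FOp, ICA.getReturnType(), CostKind);
//
// so listing vp_fneg here is enough to cost it like a plain fneg.
// ---------------------------------------------------------------------------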
 case Intrinsic::vp_fadd:
 case Intrinsic::vp_fsub:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index e9b3ce171b3b5..78eaa6c01c3d1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1000,14 +1000,6 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
     // Remove EVL from Args
     Args.pop_back();
 
-    if (VectorIntrinsicID == Intrinsic::vp_icmp ||
-        VectorIntrinsicID == Intrinsic::vp_fcmp) {
-      auto &Ctx = State.Builder.getContext();
-      Value *Pred = MetadataAsValue::get(
-          Ctx, MDString::get(Ctx, CmpInst::getPredicateName(getPredicate())));
-      Args.push_back(Pred);
-    }
-
     Value *VPInst = VBuilder.createSimpleIntrinsic(
         VectorIntrinsicID, TysForDecl[0], Args, "vp.call");
 
@@ -1069,20 +1061,6 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
     ParamTys.push_back(
         ToVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));
 
-  // TODO: Implement in the cost model.
-  if (std::optional<unsigned> FOp =
-          VPIntrinsic::getFunctionalOpcodeForVP(VectorIntrinsicID)) {
-    if (FOp == Instruction::FNeg) {
-      // Instruction *CtxI =
-      dyn_cast_or_null<Instruction>(getUnderlyingValue());
-      Type *VectorTy = ToVectorTy(getResultType(), VF);
-      return Ctx.TTI.getArithmeticInstrCost(
-          FOp.value(), VectorTy, CostKind,
-          {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
-          {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
-    }
-  }
-
   // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
   FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
   IntrinsicCostAttributes CostAttrs(

From f219d2c7d797eb2c5a40ea149352efeba6a4f40c Mon Sep 17 00:00:00 2001
From: "Liqin.Weng"
Date: Wed, 30 Oct 2024 19:33:45 +0800
Subject: [PATCH 6/6] [LV] Use VPWidenIntrinsicRecipe for vp_icmp/vp_fcmp

---
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 19 +++++++++++++++++++
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  4 +---
 .../RISCV/vplan-vp-cmp-intrinsics.ll          |  4 ++--
 .../RISCV/vplan-vp-select-intrinsics.ll       |  2 +-
 4 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 78eaa6c01c3d1..ac7140219b5a4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1000,6 +1000,13 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
     // Remove EVL from Args
     Args.pop_back();
 
+    if (VPCmpIntrinsic::isVPCmp(VectorIntrinsicID)) {
+      auto &Ctx = State.Builder.getContext();
+      Value *Pred = MetadataAsValue::get(
+          Ctx, MDString::get(Ctx, CmpInst::getPredicateName(getPredicate())));
+      Args.push_back(Pred);
+    }
+
     Value *VPInst = VBuilder.createSimpleIntrinsic(
         VectorIntrinsicID, TysForDecl[0], Args, "vp.call");
 
@@ -1061,6 +1068,18 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
     ParamTys.push_back(
         ToVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));
 
+  if (std::optional<unsigned> FOp =
+          VPIntrinsic::getFunctionalOpcodeForVP(VectorIntrinsicID)) {
+    if (VPCmpIntrinsic::isVPCmp(VectorIntrinsicID)) {
+      Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
+      Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
+      return Ctx.TTI.getCmpSelInstrCost(FOp.value(), VectorTy, nullptr,
+                                        getPredicate(), CostKind,
+                                        {TTI::OK_AnyValue, TTI::OP_None},
+                                        {TTI::OK_AnyValue, TTI::OP_None}, CtxI);
+    }
+  }
+
   // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
   FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
   IntrinsicCostAttributes CostAttrs(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index cf963d8000efa..d9006044c00e5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1475,9 +1475,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
         })
         .Case<VPWidenRecipe>([&](VPWidenRecipe *W) -> VPRecipeBase * {
           unsigned Opcode = W->getOpcode();
-          // TODO: Support other opcodes
-          if (!Instruction::isBinaryOp(Opcode) &&
-              !Instruction::isUnaryOp(Opcode))
+          if (Opcode == Instruction::Freeze)
             return nullptr;
           auto *I = cast<Instruction>(W->getUnderlyingInstr());
           SmallVector<VPValue *> Ops(W->operands());
           Ops.push_back(&EVL);
           Intrinsic::ID VPID = VPIntrinsic::getForOpcode(W->getOpcode());
           return new VPWidenIntrinsicRecipe(
               *I, VPID, make_range(Ops.begin(), Ops.end()), I->getType(),
               I->getDebugLoc());
         })
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll
index 312bb75a17622..dc96d7921356a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll
@@ -27,7 +27,7 @@ define void @vp_icmp(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
 ; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
-; IF-EVL-NEXT: WIDEN ir<[[ICMP:%.+]]> = vp.icmp sgt ir<[[LD1]]>, ir<[[LD2]]>, vp<[[EVL]]>
+; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[ICMP:%.+]]> = call sgt llvm.vp.icmp(ir<[[LD1]]>, ir<[[LD2]]>, vp<[[EVL]]>)
 ; IF-EVL-NEXT: WIDEN-CAST ir<[[ZEXT:%.+]]> = zext ir<[[ICMP]]> to i32
 ; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
@@ -82,7 +82,7 @@ define void @vp_fcmp(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
 ; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
 ; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
-; IF-EVL-NEXT: WIDEN ir<[[FCMP:%.+]]> = vp.fcmp ogt ir<[[LD1]]>, ir<[[LD2]]>, vp<[[EVL]]>
+; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[FCMP:%.+]]> = call ogt llvm.vp.fcmp(ir<[[LD1]]>, ir<[[LD2]]>, vp<[[EVL]]>)
 ; IF-EVL-NEXT: WIDEN-CAST ir<[[UITOFP:%.+]]> = uitofp ir<[[FCMP]]> to float
 ; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
 ; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
index f174f9d007ef5..92a9cb10f63e2 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
@@ -27,7 +27,7 @@
  ; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
  ; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
  ; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
- ; IF-EVL-NEXT: WIDEN ir<[[CMP:%.+]]> = icmp sgt ir<[[LD1]]>, ir<[[LD2]]>
+ ; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[CMP:%.+]]> = call sgt llvm.vp.icmp(ir<[[LD1]]>, ir<[[LD2]]>, vp<[[EVL]]>)
  ; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[SUB:%.+]]> = call llvm.vp.sub(ir<0>, ir<[[LD2]]>, vp<[[EVL]]>)
  ; IF-EVL-NEXT: WIDEN-INTRINSIC vp<[[SELECT:%.+]]> = call llvm.vp.select(ir<[[CMP]]>, ir<%1>, ir<%2>, vp<[[EVL]]>)
  ; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[ADD:%.+]]> = call llvm.vp.add(vp<[[SELECT]]>, ir<[[LD1]]>, vp<[[EVL]]>)
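
For a quick end-to-end check of the series, the new test's own RUN line can be invoked directly (a debug build with asserts is assumed, per its REQUIRES: asserts line):

  opt -passes=loop-vectorize -debug-only=loop-vectorize \
      -force-tail-folding-style=data-with-evl \
      -prefer-predicate-over-epilogue=predicate-dont-vectorize \
      -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output \
      llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll 2>&1 \
    | FileCheck --check-prefix=IF-EVL llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll

With the final patch applied, the dumped VPlans should show the compares as WIDEN-INTRINSIC calls to llvm.vp.icmp/llvm.vp.fcmp, matching the CHECK lines above.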