diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d6af8a1435d07..cc6187e8a95bf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7391,62 +7391,6 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
     }
   }
 
-  // The legacy cost model has special logic to compute the cost of in-loop
-  // reductions, which may be smaller than the sum of all instructions involved
-  // in the reduction.
-  // TODO: Switch to costing based on VPlan once the logic has been ported.
-  for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
-    if (ForceTargetInstructionCost.getNumOccurrences())
-      continue;
-
-    if (!CM.isInLoopReduction(RedPhi))
-      continue;
-
-    const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
-    SetVector<Instruction *> ChainOpsAndOperands(llvm::from_range, ChainOps);
-    auto IsZExtOrSExt = [](const unsigned Opcode) -> bool {
-      return Opcode == Instruction::ZExt || Opcode == Instruction::SExt;
-    };
-    // Also include the operands of instructions in the chain, as the cost-model
-    // may mark extends as free.
-    //
-    // For ARM, some of the instruction can folded into the reducion
-    // instruction. So we need to mark all folded instructions free.
-    // For example: We can fold reduce(mul(ext(A), ext(B))) into one
-    // instruction.
-    for (auto *ChainOp : ChainOps) {
-      for (Value *Op : ChainOp->operands()) {
-        if (auto *I = dyn_cast<Instruction>(Op)) {
-          ChainOpsAndOperands.insert(I);
-          if (I->getOpcode() == Instruction::Mul) {
-            auto *Ext0 = dyn_cast<CastInst>(I->getOperand(0));
-            auto *Ext1 = dyn_cast<CastInst>(I->getOperand(1));
-            if (Ext0 && IsZExtOrSExt(Ext0->getOpcode()) && Ext1 &&
-                Ext0->getOpcode() == Ext1->getOpcode()) {
-              ChainOpsAndOperands.insert(Ext0);
-              ChainOpsAndOperands.insert(Ext1);
-            }
-          }
-        }
-      }
-    }
-
-    // Pre-compute the cost for I, if it has a reduction pattern cost.
-    for (Instruction *I : ChainOpsAndOperands) {
-      auto ReductionCost =
-          CM.getReductionPatternCost(I, VF, toVectorTy(I->getType(), VF));
-      if (!ReductionCost)
-        continue;
-
-      assert(!CostCtx.SkipCostComputation.contains(I) &&
-             "reduction op visited multiple times");
-      CostCtx.SkipCostComputation.insert(I);
-      LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
-                        << ":\n in-loop reduction " << *I << "\n");
-      Cost += *ReductionCost;
-    }
-  }
-
   // Pre-compute the costs for branches except for the backedge, as the number
   // of replicate regions in a VPlan may not directly match the number of
   // branches, which would lead to different decisions.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 45e403eaf9e50..d05cbe799fa1a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2669,6 +2669,10 @@ class VPExtendedReductionRecipe : public VPReductionRecipe {
            "VPExtendedRecipe + VPReductionRecipe before execution.");
   };
 
+  /// Return the cost of VPExtendedReductionRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2768,6 +2772,10 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe {
            "VPWidenRecipe + VPReductionRecipe before execution");
   }
 
+  /// Return the cost of VPMulAccumulateReductionRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index fc1ee89e81c75..bf0453fda9f36 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -782,19 +782,25 @@ Value *VPInstruction::generate(VPTransformState &State) {
 InstructionCost VPInstruction::computeCost(ElementCount VF,
                                            VPCostContext &Ctx) const {
   if (Instruction::isBinaryOp(getOpcode())) {
+
+    Type *ResTy = Ctx.Types.inferScalarType(this);
+    if (!vputils::onlyFirstLaneUsed(this))
+      ResTy = toVectorTy(ResTy, VF);
+
     if (!getUnderlyingValue()) {
-      // TODO: Compute cost for VPInstructions without underlying values once
-      // the legacy cost model has been retired.
-      return 0;
+      switch (getOpcode()) {
+      case Instruction::FMul:
+        return Ctx.TTI.getArithmeticInstrCost(getOpcode(), ResTy, Ctx.CostKind);
+      default:
+        // TODO: Compute cost for VPInstructions without underlying values once
+        // the legacy cost model has been retired.
+        return 0;
+      }
     }
 
     assert(!doesGeneratePerAllLanes() &&
            "Should only generate a vector value or single scalar, not scalars "
            "for all lanes.");
-    Type *ResTy = Ctx.Types.inferScalarType(this);
-    if (!vputils::onlyFirstLaneUsed(this))
-      ResTy = toVectorTy(ResTy, VF);
-
     return Ctx.TTI.getArithmeticInstrCost(getOpcode(), ResTy, Ctx.CostKind);
   }
 
@@ -2527,6 +2533,8 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
   auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
   unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
   FastMathFlags FMFs = getFastMathFlags();
+  std::optional<FastMathFlags> OptionalFMF =
+      ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt;
 
   // TODO: Support any-of reductions.
   assert(
@@ -2534,17 +2542,38 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
        ForceTargetInstructionCost.getNumOccurrences() > 0) &&
       "Any-of reduction not implemented in VPlan-based cost model currently.");
 
-  // Cost = Reduction cost + BinOp cost
-  InstructionCost Cost =
-      Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, Ctx.CostKind);
+  // Note that TTI should model the cost of moving the result to the scalar
+  // register and the BinOp cost in getReductionCost().
   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
     Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
-    return Cost +
-           Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
+    return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind);
   }
 
-  return Cost + Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, FMFs,
-                                                   Ctx.CostKind);
+  return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF,
+                                            Ctx.CostKind);
+}
+
+InstructionCost
+VPExtendedReductionRecipe::computeCost(ElementCount VF,
+                                       VPCostContext &Ctx) const {
+  unsigned Opcode = RecurrenceDescriptor::getOpcode(getRecurrenceKind());
+  Type *RedTy = Ctx.Types.inferScalarType(this);
+  auto *SrcVecTy =
+      cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(getVecOp()), VF));
+  assert(RedTy->isIntegerTy() &&
+         "ExtendedReduction only support integer type currently.");
+  return Ctx.TTI.getExtendedReductionCost(Opcode, isZExt(), RedTy, SrcVecTy,
+                                          std::nullopt, Ctx.CostKind);
+}
+
+InstructionCost
+VPMulAccumulateReductionRecipe::computeCost(ElementCount VF,
+                                            VPCostContext &Ctx) const {
+  Type *RedTy = Ctx.Types.inferScalarType(this);
+  auto *SrcVecTy =
+      cast<VectorType>(toVectorTy(Ctx.Types.inferScalarType(getVecOp0()), VF));
+  return Ctx.TTI.getMulAccReductionCost(isZExt(), RedTy, SrcVecTy,
+                                        Ctx.CostKind);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
index 0efdf077dca66..4208773e94734 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
@@ -12,14 +12,14 @@ target triple="aarch64-unknown-linux-gnu"
 ; CHECK-VSCALE2-LABEL: LV: Checking a loop in 'fadd_strict32'
 ; CHECK-VSCALE2: Cost of 4 for VF vscale x 2:
-; CHECK-VSCALE2: in-loop reduction %add = fadd float %0, %sum.07
+; CHECK-VSCALE2: REDUCE ir<%add> = ir<%sum.07> + reduce.fadd (ir<%0>)
 ; CHECK-VSCALE2: Cost of 8 for VF vscale x 4:
-; CHECK-VSCALE2: in-loop reduction %add = fadd float %0, %sum.07
+; CHECK-VSCALE2: REDUCE ir<%add> = ir<%sum.07> + reduce.fadd (ir<%0>)
 
 ; CHECK-VSCALE1-LABEL: LV: Checking a loop in 'fadd_strict32'
 ; CHECK-VSCALE1: Cost of 2 for VF vscale x 2:
-; CHECK-VSCALE1: in-loop reduction %add = fadd float %0, %sum.07
+; CHECK-VSCALE1: REDUCE ir<%add> = ir<%sum.07> + reduce.fadd (ir<%0>)
 ; CHECK-VSCALE1: Cost of 4 for VF vscale x 4:
-; CHECK-VSCALE1: in-loop reduction %add = fadd float %0, %sum.07
+; CHECK-VSCALE1: REDUCE ir<%add> = ir<%sum.07> + reduce.fadd (ir<%0>)
 
 define float @fadd_strict32(ptr noalias nocapture readonly %a, i64 %n) #0 {
 entry:
@@ -42,10 +42,10 @@ for.end:
 ; CHECK-VSCALE2-LABEL: LV: Checking a loop in 'fadd_strict64'
 ; CHECK-VSCALE2: Cost of 4 for VF vscale x 2:
-; CHECK-VSCALE2: in-loop reduction %add = fadd double %0, %sum.07
+; CHECK-VSCALE2: REDUCE ir<%add> = ir<%sum.07> + reduce.fadd (ir<%0>)
 
 ; CHECK-VSCALE1-LABEL: LV: Checking a loop in 'fadd_strict64'
 ; CHECK-VSCALE1: Cost of 2 for VF vscale x 2:
-; CHECK-VSCALE1: in-loop reduction %add = fadd double %0, %sum.07
+; CHECK-VSCALE1: REDUCE ir<%add> = ir<%sum.07> + reduce.fadd (ir<%0>)
 
 define double @fadd_strict64(ptr noalias nocapture readonly %a, i64 %n) #0 {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
index d021306b89aab..fa2daa0e4423f 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
@@ -800,11 +800,11 @@ define i32 @mla_i32_i32(ptr nocapture readonly %x, ptr nocapture readonly %y, i3
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP0]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[Y1:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD1]]
 ; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT: [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]]
@@ -961,11 +961,11 @@ define signext i16 @mla_i16_i16(ptr nocapture readonly %x, ptr nocapture readonl
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP0]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
-; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i16> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[Y1:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP7]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i16> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD1]]
 ; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> [[TMP2]], <8 x i16> zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP3]])
 ; CHECK-NEXT: [[TMP5]] = add i16 [[TMP4]], [[VEC_PHI]]
@@ -1067,11 +1067,11 @@ define zeroext i8 @mla_i8_i8(ptr nocapture readonly %x, ptr nocapture readonly %
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i8 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-NEXT: [[TMP2:%.*]] = mul <16 x i8> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[Y1:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP7]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = mul <16 x i8> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD1]]
 ; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> [[TMP2]], <16 x i8> zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[TMP3]])
 ; CHECK-NEXT: [[TMP5]] = add i8 [[TMP4]], [[VEC_PHI]]
@@ -1181,11 +1181,11 @@ define i64 @red_mla_ext_s16_u16_s64(ptr noalias nocapture readonly %A, ptr noali
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP0]], align 1
-; CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i16>, ptr [[TMP2]], align 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[B1:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i16>, ptr [[TMP11]], align 2
 ; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[WIDE_LOAD1]] to <4 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], [[TMP1]]
 ; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i32> [[TMP4]] to <4 x i64>
@@ -1204,10 +1204,10 @@ define i64 @red_mla_ext_s16_u16_s64(ptr noalias nocapture readonly %A, ptr noali
 ; CHECK: for.body:
 ; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[S_010:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[I_011]]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[I_011]]
 ; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX]], align 1
 ; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP9]] to i32
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[I_011]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[B1]], i32 [[I_011]]
 ; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX1]], align 2
 ; CHECK-NEXT: [[CONV2:%.*]] = zext i16 [[TMP10]] to i32
 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
@@ -1266,12 +1266,12 @@ define i32 @red_mla_u8_s8_u32(ptr noalias nocapture readonly %A, ptr noalias noc
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr [[TMP0]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison)
-; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[WIDE_MASKED_LOAD]] to <4 x i32>
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr [[TMP2]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison)
-; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[WIDE_MASKED_LOAD1]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[WIDE_MASKED_LOAD1]] to <4 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[B1:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr [[TMP9]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison)
+; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[WIDE_MASKED_LOAD2]] to <4 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], [[TMP1]]
 ; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
@@ -1408,8 +1408,8 @@ define i32 @mla_i8_i32_multiuse(ptr nocapture readonly %x, ptr nocapture readonl
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i32> [[TMP1]], [[TMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i32> [[TMP7]], [[TMP7]]
 ; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP2]], <16 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP3]])
 ; CHECK-NEXT: [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]]
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
index 17e3bb3cce7eb..d497f0c22dbbc 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
@@ -424,62 +424,62 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0
 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
 ; CHECK: pred.load.if:
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i64 0
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i64 0
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B1:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> poison, i32 [[TMP12]], i64 0
 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]]
 ; CHECK: pred.load.continue:
-; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_LOAD_IF]] ]
 ; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP13]], [[PRED_LOAD_IF]] ]
 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i64 1
 ; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
 ; CHECK: pred.load.if3:
 ; CHECK-NEXT: [[TMP11:%.*]] = or disjoint i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP13]], i64 1
 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP11]]
 ; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4
 ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP16]], i64 1
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[B1]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP18]], align 4
+; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP22]], i64 1
 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]]
 ; CHECK: pred.load.continue4:
-; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ]
 ; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], [[PRED_LOAD_IF3]] ]
+; CHECK-NEXT: [[TMP24:%.*]] = phi <4 x i32> [ [[TMP14]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP23]], [[PRED_LOAD_IF3]] ]
 ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP0]], i64 2
 ; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
 ; CHECK: pred.load.if5:
 ; CHECK-NEXT: [[TMP21:%.*]] = or disjoint i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
-; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP23]], i64 2
 ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP21]]
 ; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
 ; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP26]], i64 2
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B1]], i64 [[TMP21]]
+; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP28]], align 4
+; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x i32> [[TMP24]], i32 [[TMP32]], i64 2
 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]]
 ; CHECK: pred.load.continue6:
-; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ]
 ; CHECK-NEXT: [[TMP29:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP27]], [[PRED_LOAD_IF5]] ]
+; CHECK-NEXT: [[TMP34:%.*]] = phi <4 x i32> [ [[TMP24]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP33]], [[PRED_LOAD_IF5]] ]
 ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP0]], i64 3
 ; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8]]
 ; CHECK: pred.load.if7:
 ; CHECK-NEXT: [[TMP31:%.*]] = or disjoint i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP31]]
-; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4
-; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP33]], i64 3
 ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP31]]
 ; CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4
 ; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[TMP36]], i64 3
+; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[B1]], i64 [[TMP31]]
+; CHECK-NEXT: [[TMP48:%.*]] = load i32, ptr [[TMP38]], align 4
+; CHECK-NEXT: [[TMP49:%.*]] = insertelement <4 x i32> [[TMP34]], i32 [[TMP48]], i64 3
 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]]
 ; CHECK: pred.load.continue8:
-; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP34]], [[PRED_LOAD_IF7]] ]
 ; CHECK-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP37]], [[PRED_LOAD_IF7]] ]
+; CHECK-NEXT: [[TMP50:%.*]] = phi <4 x i32> [ [[TMP34]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP49]], [[PRED_LOAD_IF7]] ]
 ; CHECK-NEXT: [[TMP41:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_IND1]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP41]])
 ; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP42]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP40:%.*]] = mul nsw <4 x i32> [[TMP39]], [[TMP38]]
+; CHECK-NEXT: [[TMP40:%.*]] = mul nsw <4 x i32> [[TMP50]], [[TMP39]]
 ; CHECK-NEXT: [[TMP44:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP40]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP44]])
 ; CHECK-NEXT: [[TMP46]] = add i32 [[TMP45]], [[TMP43]]
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
index 9ca7a84b3ea1c..e762c9ff81322 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
@@ -221,13 +221,13 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B1:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4
 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND]])
 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD1]]
 ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
 ; CHECK-NEXT: [[TMP6]] = add i32 [[TMP5]], [[TMP4]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -329,11 +329,11 @@ define i32 @start_at_non_zero(ptr nocapture %in, ptr nocapture %coeff, ptr nocap
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 120, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[IN:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[COEFF:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COEFF1:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD1]]
 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
 ; CHECK-NEXT: [[TMP4]] = add i32 [[TMP3]], [[VEC_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
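
For illustration, a minimal standalone C++ sketch (not part of the patch, and not using any LLVM API) of the costing decision the new recipes encode: a reduce(mul(ext(A), ext(B))) chain is priced as a single fused mul-accumulate reduction instead of as the sum of the extend, multiply, and reduce costs. All names and cost numbers below are hypothetical placeholders for values a target's TTI would normally provide.

#include <cstdint>
#include <iostream>

// Hypothetical per-operation costs for a <16 x i8> -> i32 reduction chain.
// These numbers are illustrative only; real values come from the target's TTI.
struct PartCosts {
  uint64_t Extend0 = 1; // zext <16 x i8> to <16 x i32>
  uint64_t Extend1 = 1; // zext <16 x i8> to <16 x i32>
  uint64_t Mul = 2;     // mul <16 x i32>
  uint64_t Reduce = 4;  // vector.reduce.add plus scalar accumulate
};

// Hypothetical fused cost, as a target with a mul-accumulate reduction
// instruction (e.g. a dot-product style operation) might report it.
constexpr uint64_t FusedMulAccCost = 1;

int main() {
  PartCosts Parts;
  uint64_t SumOfParts =
      Parts.Extend0 + Parts.Extend1 + Parts.Mul + Parts.Reduce;

  // The recipe-based model asks for one fused cost instead of summing the
  // extend, multiply, and reduce costs individually.
  uint64_t RecipeCost = FusedMulAccCost;

  std::cout << "sum of parts: " << SumOfParts << "\n";
  std::cout << "fused mul-acc reduction: " << RecipeCost << "\n";
  return 0;
}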