From 34d5f25a026e6bdb337a1ba8e1a2cf7a8a4291d5 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Tue, 17 Dec 2024 17:07:45 +0000 Subject: [PATCH 1/8] [LoopVectorizer] Add support for chaining partial reductions --- .../AArch64/AArch64TargetTransformInfo.h | 2 +- .../Transforms/Vectorize/LoopVectorize.cpp | 55 +- .../Transforms/Vectorize/VPRecipeBuilder.h | 4 +- llvm/lib/Transforms/Vectorize/VPlan.h | 5 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 14 +- .../AArch64/partial-reduction-chained.ll | 568 ++++++++++++++++++ 6 files changed, 623 insertions(+), 25 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/partial-reduction-chained.ll diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 8e7e590c173ff..c6cebcca67935 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -368,7 +368,7 @@ class AArch64TTIImpl : public BasicTTIImplBase { InstructionCost Invalid = InstructionCost::getInvalid(); InstructionCost Cost(TTI::TCC_Basic); - if (Opcode != Instruction::Add) + if (Opcode != Instruction::Add && Opcode != Instruction::Sub) return Invalid; if (InputTypeA != InputTypeB) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 99f6a8860f0f4..79be3e15594c4 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8790,12 +8790,12 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I, /// are valid so recipes can be formed later. void VPRecipeBuilder::collectScaledReductions(VFRange &Range) { // Find all possible partial reductions. 
- SmallVector, 1> + SmallVector> PartialReductionChains; - for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) - if (std::optional> Pair = - getScaledReduction(Phi, RdxDesc, Range)) - PartialReductionChains.push_back(*Pair); + for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) { + if (auto SR = getScaledReduction(Phi, RdxDesc.getLoopExitInstr(), Range)) + PartialReductionChains.append(*SR); + } // A partial reduction is invalid if any of its extends are used by // something that isn't another partial reduction. This is because the @@ -8823,26 +8823,42 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) { } } -std::optional> -VPRecipeBuilder::getScaledReduction(PHINode *PHI, - const RecurrenceDescriptor &Rdx, +std::optional>> +VPRecipeBuilder::getScaledReduction(Instruction *PHI, + Instruction *RdxExitInstr, VFRange &Range) { + + if(!CM.TheLoop->contains(RdxExitInstr)) + return std::nullopt; + // TODO: Allow scaling reductions when predicating. The select at // the end of the loop chooses between the phi value and most recent // reduction result, both of which have different VFs to the active lane // mask when scaling. 
- if (CM.blockNeedsPredicationForAnyReason(Rdx.getLoopExitInstr()->getParent())) + if (CM.blockNeedsPredicationForAnyReason(RdxExitInstr->getParent())) return std::nullopt; - auto *Update = dyn_cast(Rdx.getLoopExitInstr()); + auto *Update = dyn_cast(RdxExitInstr); if (!Update) return std::nullopt; Value *Op = Update->getOperand(0); Value *PhiOp = Update->getOperand(1); - if (Op == PHI) { - Op = Update->getOperand(1); - PhiOp = Update->getOperand(0); + if (Op == PHI) + std::swap(Op, PhiOp); + + SmallVector> Chains; + + if (auto *OpInst = dyn_cast(Op)) { + if(auto SR0 = getScaledReduction(PHI, OpInst, Range)) { + Chains.append(*SR0); + PHI = SR0->rbegin()->first.Reduction; + + Op = Update->getOperand(0); + PhiOp = Update->getOperand(1); + if (Op == PHI) + std::swap(Op, PhiOp); + } } if (PhiOp != PHI) return std::nullopt; @@ -8860,12 +8876,16 @@ VPRecipeBuilder::getScaledReduction(PHINode *PHI, Instruction *ExtA = cast(BinOp->getOperand(0)); Instruction *ExtB = cast(BinOp->getOperand(1)); + // Check that the extends extend from the same type. 
+ if (A->getType() != B->getType()) + return std::nullopt; + TTI::PartialReductionExtendKind OpAExtend = TargetTransformInfo::getPartialReductionExtendKind(ExtA); TTI::PartialReductionExtendKind OpBExtend = TargetTransformInfo::getPartialReductionExtendKind(ExtB); - PartialReductionChain Chain(Rdx.getLoopExitInstr(), ExtA, ExtB, BinOp); + PartialReductionChain Chain(RdxExitInstr, ExtA, ExtB, BinOp); unsigned TargetScaleFactor = PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor( @@ -8880,9 +8900,9 @@ VPRecipeBuilder::getScaledReduction(PHINode *PHI, return Cost.isValid(); }, Range)) - return std::make_pair(Chain, TargetScaleFactor); + Chains.push_back(std::make_pair(Chain, TargetScaleFactor)); - return std::nullopt; + return Chains; } VPRecipeBase * @@ -8979,7 +8999,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, VPValue *BinOp = Operands[0]; VPValue *Phi = Operands[1]; - if (isa(BinOp->getDefiningRecipe())) + VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe(); + if (isa(BinOpRecipe) || isa(BinOpRecipe)) std::swap(BinOp, Phi); return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Phi, diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index cf653e2d3e658..6be9a716cacbf 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -144,8 +144,8 @@ class VPRecipeBuilder { /// Returns null if no scaled reduction was found, otherwise a pair with a /// struct containing reduction information and the scaling factor between the /// number of elements in the input and output. 
- std::optional> - getScaledReduction(PHINode *PHI, const RecurrenceDescriptor &Rdx, + std::optional>> + getScaledReduction(Instruction *PHI, Instruction *RdxExitInstr, VFRange &Range); public: diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 87f87bf143719..b2d3d3944c1a5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2453,13 +2453,14 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe { : VPSingleDefRecipe(VPDef::VPPartialReductionSC, ArrayRef({Op0, Op1}), ReductionInst), Opcode(Opcode) { - assert(isa(getOperand(1)->getDefiningRecipe()) && + auto *DefiningRecipe = getOperand(1)->getDefiningRecipe(); + assert((isa(DefiningRecipe) || isa(DefiningRecipe)) && "Unexpected operand order for partial reduction recipe"); } ~VPPartialReductionRecipe() override = default; VPPartialReductionRecipe *clone() override { - return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1)); + return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1), getUnderlyingInstr()); } VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 979a8e0768a99..4fb32c14398e5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -317,13 +317,21 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) { State.setDebugLocFrom(getDebugLoc()); auto &Builder = State.Builder; - assert(getOpcode() == Instruction::Add && - "Unhandled partial reduction opcode"); - Value *BinOpVal = State.get(getOperand(0)); Value *PhiVal = State.get(getOperand(1)); assert(PhiVal && BinOpVal && "Phi and Mul must be set"); + auto Opcode = getOpcode(); + + // Currently we don't have a partial_reduce_sub intrinsic, + // so mimic the behaviour by negating the second operand + if(Opcode == Instruction::Sub) { + BinOpVal = 
Builder.CreateSub(Constant::getNullValue(BinOpVal->getType()), BinOpVal); + Opcode = Instruction::Add; + } + + assert(Opcode == Instruction::Add && "Unhandled partial reduction opcode"); + Type *RetTy = PhiVal->getType(); CallInst *V = Builder.CreateIntrinsic( diff --git a/llvm/test/CodeGen/AArch64/partial-reduction-chained.ll b/llvm/test/CodeGen/AArch64/partial-reduction-chained.ll new file mode 100644 index 0000000000000..4272e2f755249 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/partial-reduction-chained.ll @@ -0,0 +1,568 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -o - %s --passes=loop-vectorize -force-vector-width=16 -force-vector-interleave=1 -scalable-vectorization=on | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read) vscale_range(1,16) +define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { +; CHECK-LABEL: define i32 @chained_partial_reduce_add_sub( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], 
[[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP13]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP14]], align 1 +; CHECK-NEXT: [[TMP25:%.*]] = sext [[WIDE_LOAD4]] to +; CHECK-NEXT: [[TMP18:%.*]] = sext [[WIDE_LOAD3]] to +; CHECK-NEXT: [[TMP27:%.*]] = sext [[WIDE_LOAD6]] to +; CHECK-NEXT: [[TMP29:%.*]] = mul nsw [[TMP25]], [[TMP18]] +; CHECK-NEXT: [[PARTIAL_REDUCE7:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI1]], [[TMP29]]) +; CHECK-NEXT: [[TMP31:%.*]] = mul nsw [[TMP25]], [[TMP27]] +; CHECK-NEXT: [[TMP33:%.*]] = sub zeroinitializer, [[TMP31]] +; CHECK-NEXT: [[PARTIAL_REDUCE9]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE7]], [[TMP33]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], 
!llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE9]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY]] ] +; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[B_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 +; CHECK-NEXT: [[C_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 +; CHECK-NEXT: [[D_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 +; CHECK-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 +; CHECK-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 +; CHECK-NEXT: [[D_EXT:%.*]] = sext i8 [[D_VAL]] to i32 +; CHECK-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]] +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AC]] +; CHECK-NEXT: [[MUL_DB:%.*]] = mul nsw i32 [[B_EXT]], [[D_EXT]] +; CHECK-NEXT: [[SUB]] = sub i32 [[ADD]], [[MUL_DB]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 
[[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]], !loop [[META4:![0-9]+]] +; +entry: + %cmp28.not = icmp ult i32 %N, 2 + %div27 = lshr i32 %N, 1 + %wide.trip.count = zext nneg i32 %div27 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %res.0.lcssa = phi i32 [ %sub, %for.body ] + ret i32 %res.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %res = phi i32 [ 0, %entry ], [ %sub, %for.body ] + %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv + %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv + %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv + %a.val = load i8, ptr %a.ptr, align 1 + %b.val = load i8, ptr %b.ptr, align 1 + %c.val = load i8, ptr %c.ptr, align 1 + %a.ext = sext i8 %a.val to i32 + %b.ext = sext i8 %b.val to i32 + %c.ext = sext i8 %c.val to i32 + %mul.ab = mul nsw i32 %a.ext, %b.ext + %add = add nsw i32 %res, %mul.ab + %mul.ac = mul nsw i32 %a.ext, %c.ext + %sub = sub i32 %add, %mul.ac + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1 +} + +define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { +; CHECK-LABEL: define i32 @chained_partial_reduce_add_add( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 
[[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP13]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP14]], align 1 +; CHECK-NEXT: [[TMP25:%.*]] = sext [[WIDE_LOAD4]] to +; CHECK-NEXT: [[TMP18:%.*]] = sext [[WIDE_LOAD3]] to +; CHECK-NEXT: [[TMP27:%.*]] = sext [[WIDE_LOAD6]] to +; CHECK-NEXT: [[TMP29:%.*]] = mul nsw [[TMP25]], [[TMP18]] +; CHECK-NEXT: [[PARTIAL_REDUCE7:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI1]], [[TMP29]]) +; CHECK-NEXT: [[TMP31:%.*]] = mul nsw [[TMP25]], [[TMP27]] +; CHECK-NEXT: [[PARTIAL_REDUCE9]] = 
call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE7]], [[TMP31]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE9]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY]] ] +; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[B_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 +; CHECK-NEXT: [[C_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 +; CHECK-NEXT: [[D_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 +; CHECK-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 +; CHECK-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 +; CHECK-NEXT: [[D_EXT:%.*]] = sext i8 [[D_VAL]] to i32 +; CHECK-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]] +; 
CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AC]] +; CHECK-NEXT: [[MUL_DB:%.*]] = mul nsw i32 [[B_EXT]], [[D_EXT]] +; CHECK-NEXT: [[SUB]] = add i32 [[ADD]], [[MUL_DB]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]], !loop [[META4]] +; +entry: + %cmp28.not = icmp ult i32 %N, 2 + %div27 = lshr i32 %N, 1 + %wide.trip.count = zext nneg i32 %div27 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %res.0.lcssa = phi i32 [ %add.2, %for.body ] + ret i32 %res.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %res = phi i32 [ 0, %entry ], [ %add.2, %for.body ] + %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv + %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv + %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv + %a.val = load i8, ptr %a.ptr, align 1 + %b.val = load i8, ptr %b.ptr, align 1 + %c.val = load i8, ptr %c.ptr, align 1 + %a.ext = sext i8 %a.val to i32 + %b.ext = sext i8 %b.val to i32 + %c.ext = sext i8 %c.val to i32 + %mul.ab = mul nsw i32 %a.ext, %b.ext + %add = add nsw i32 %res, %mul.ab + %mul.ac = mul nsw i32 %a.ext, %c.ext + %add.2 = add i32 %add, %mul.ac + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1 +} + +define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { +; CHECK-LABEL: define i32 @chained_partial_reduce_sub_add( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; 
CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP13]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP14]], align 1 +; CHECK-NEXT: [[TMP25:%.*]] = sext [[WIDE_LOAD4]] to +; CHECK-NEXT: [[TMP18:%.*]] = sext [[WIDE_LOAD3]] to +; CHECK-NEXT: [[TMP27:%.*]] = sext [[WIDE_LOAD6]] to +; CHECK-NEXT: 
[[TMP29:%.*]] = mul nsw [[TMP25]], [[TMP18]] +; CHECK-NEXT: [[TMP31:%.*]] = sub zeroinitializer, [[TMP29]] +; CHECK-NEXT: [[PARTIAL_REDUCE7:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI1]], [[TMP31]]) +; CHECK-NEXT: [[TMP33:%.*]] = mul nsw [[TMP25]], [[TMP27]] +; CHECK-NEXT: [[PARTIAL_REDUCE9]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE7]], [[TMP33]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE9]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY1:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY1]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ] +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY1]] ] +; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[B_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 +; CHECK-NEXT: 
[[C_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 +; CHECK-NEXT: [[D_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 +; CHECK-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 +; CHECK-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 +; CHECK-NEXT: [[D_EXT:%.*]] = sext i8 [[D_VAL]] to i32 +; CHECK-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]] +; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[RES]], [[MUL_AC]] +; CHECK-NEXT: [[MUL_DB:%.*]] = mul nsw i32 [[B_EXT]], [[D_EXT]] +; CHECK-NEXT: [[SUB]] = add i32 [[SUB1]], [[MUL_DB]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP9:![0-9]+]], !loop [[META4]] +; +entry: + %cmp28.not = icmp ult i32 %N, 2 + %div27 = lshr i32 %N, 1 + %wide.trip.count = zext nneg i32 %div27 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %res.0.lcssa = phi i32 [ %add, %for.body ] + ret i32 %res.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %res = phi i32 [ 0, %entry ], [ %add, %for.body ] + + %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv + %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv + %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv + %a.val = load i8, ptr %a.ptr, align 1 + %b.val = load i8, ptr %b.ptr, align 1 + %c.val = load i8, ptr %c.ptr, align 1 + + %a.ext = sext i8 %a.val to i32 + %b.ext = sext i8 %b.val to i32 + %c.ext = sext i8 %c.val to i32 + %mul.ab = mul nsw i32 %a.ext, %b.ext + %sub = sub nsw i32 %res, %mul.ab + %mul.ac = mul nsw i32 %a.ext, %c.ext + %add = add i32 %sub, %mul.ac + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label 
%for.cond.cleanup, label %for.body, !loop !1 +} + +define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { +; CHECK-LABEL: define i32 @chained_partial_reduce_sub_sub( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = 
load , ptr [[TMP13]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP14]], align 1 +; CHECK-NEXT: [[TMP25:%.*]] = sext [[WIDE_LOAD4]] to +; CHECK-NEXT: [[TMP18:%.*]] = sext [[WIDE_LOAD3]] to +; CHECK-NEXT: [[TMP27:%.*]] = sext [[WIDE_LOAD6]] to +; CHECK-NEXT: [[TMP29:%.*]] = mul nsw [[TMP25]], [[TMP18]] +; CHECK-NEXT: [[TMP31:%.*]] = sub zeroinitializer, [[TMP29]] +; CHECK-NEXT: [[PARTIAL_REDUCE7:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI1]], [[TMP31]]) +; CHECK-NEXT: [[TMP33:%.*]] = mul nsw [[TMP25]], [[TMP27]] +; CHECK-NEXT: [[TMP35:%.*]] = sub zeroinitializer, [[TMP33]] +; CHECK-NEXT: [[PARTIAL_REDUCE9]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE7]], [[TMP35]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE9]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY1:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY1]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ] +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ 
[[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY1]] ] +; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[B_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 +; CHECK-NEXT: [[C_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 +; CHECK-NEXT: [[D_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 +; CHECK-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 +; CHECK-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 +; CHECK-NEXT: [[D_EXT:%.*]] = sext i8 [[D_VAL]] to i32 +; CHECK-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]] +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AC]] +; CHECK-NEXT: [[MUL_DB:%.*]] = mul nsw i32 [[B_EXT]], [[D_EXT]] +; CHECK-NEXT: [[ADD]] = sub i32 [[SUB]], [[MUL_DB]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP11:![0-9]+]], !loop [[META4]] +; +entry: + %cmp28.not = icmp ult i32 %N, 2 + %div27 = lshr i32 %N, 1 + %wide.trip.count = zext nneg i32 %div27 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %res.0.lcssa = phi i32 [ %sub.2, %for.body ] + ret i32 %res.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %res = phi i32 [ 0, %entry ], [ %sub.2, %for.body ] + + %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv + %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv + %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv + %a.val = load i8, ptr %a.ptr, align 1 + %b.val = load i8, ptr %b.ptr, align 1 + %c.val = load i8, ptr 
%c.ptr, align 1 + + %a.ext = sext i8 %a.val to i32 + %b.ext = sext i8 %b.val to i32 + %c.ext = sext i8 %c.val to i32 + + %mul.ab = mul nsw i32 %a.ext, %b.ext + %sub = sub nsw i32 %res, %mul.ab + %mul.ac = mul nsw i32 %a.ext, %c.ext + %sub.2 = sub i32 %sub, %mul.ac + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1 +} + +define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { +; CHECK-LABEL: define i32 @chained_partial_reduce_sub_add_sub( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; 
CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP14]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP15]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP16]], align 1 +; CHECK-NEXT: [[TMP19:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-NEXT: [[TMP20:%.*]] = sext [[WIDE_LOAD3]] to +; CHECK-NEXT: [[TMP21:%.*]] = sext [[WIDE_LOAD4]] to +; CHECK-NEXT: [[TMP22:%.*]] = mul nsw [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = sub zeroinitializer, [[TMP22]] +; CHECK-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP23]]) +; CHECK-NEXT: [[TMP24:%.*]] = mul nsw [[TMP19]], [[TMP21]] +; CHECK-NEXT: [[PARTIAL_REDUCE4:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE]], [[TMP24]]) +; CHECK-NEXT: [[TMP25:%.*]] = mul nsw [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP26:%.*]] = sub zeroinitializer, [[TMP25]] +; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE4]], [[TMP26]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE5]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label 
[[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP28]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ] +; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[C_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 +; CHECK-NEXT: [[D_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 +; CHECK-NEXT: [[E_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 +; CHECK-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 +; CHECK-NEXT: [[D_EXT:%.*]] = sext i8 [[D_VAL]] to i32 +; CHECK-NEXT: [[E_EXT:%.*]] = sext i8 [[E_VAL]] to i32 +; CHECK-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[C_EXT]], [[D_EXT]] +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AC]] +; CHECK-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[C_EXT]], [[E_EXT]] +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AB]] +; CHECK-NEXT: [[MUL_DB:%.*]] = mul nsw i32 [[D_EXT]], [[E_EXT]] +; CHECK-NEXT: [[SUB_2]] = sub i32 [[ADD]], [[MUL_DB]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]], !loop [[META4]] +; 
+entry: + %cmp28.not = icmp ult i32 %N, 2 + %div27 = lshr i32 %N, 1 + %wide.trip.count = zext nneg i32 %div27 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %res.0.lcssa = phi i32 [ %sub.2, %for.body ] + ret i32 %res.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %res = phi i32 [ 0, %entry ], [ %sub.2, %for.body ] + + %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv + %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv + %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv + %a.val = load i8, ptr %a.ptr, align 1 + %b.val = load i8, ptr %b.ptr, align 1 + %c.val = load i8, ptr %c.ptr, align 1 + + %a.ext = sext i8 %a.val to i32 + %b.ext = sext i8 %b.val to i32 + %c.ext = sext i8 %c.val to i32 + + %mul.ab = mul nsw i32 %a.ext, %b.ext + %sub = sub nsw i32 %res, %mul.ab + %mul.ac = mul nsw i32 %a.ext, %c.ext + %add = add nsw i32 %sub, %mul.ac + %mul.bc = mul nsw i32 %b.ext, %c.ext + %sub.2 = sub i32 %add, %mul.bc + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1 +} + +attributes #0 = { mustprogress noinline nounwind vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-x3" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+ete,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+spe,+ssbs,+sve,+sve2,+sve2-bitperm,+trbe,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a" } + + +!0 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} +!1 = distinct !{!0} +;. 
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[META4]] = distinct !{[[META5:![0-9]+]]} +; CHECK: [[META5]] = !{!"llvm.loop.vectorize.scalable.enable", i1 true} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +;. From abe57fbe05839e527ba4c08114ef9cf32598a5b2 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Wed, 15 Jan 2025 13:49:02 +0000 Subject: [PATCH 2/8] Format --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 10 +++++----- llvm/lib/Transforms/Vectorize/VPlan.h | 6 ++++-- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 5 +++-- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 79be3e15594c4..a18cf3c9bec2b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8824,11 +8824,10 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) { } std::optional>> -VPRecipeBuilder::getScaledReduction(Instruction *PHI, - Instruction *RdxExitInstr, +VPRecipeBuilder::getScaledReduction(Instruction *PHI, Instruction *RdxExitInstr, VFRange &Range) { - if(!CM.TheLoop->contains(RdxExitInstr)) + if (!CM.TheLoop->contains(RdxExitInstr)) return std::nullopt; // TODO: Allow scaling reductions 
when predicating. The select at @@ -8850,7 +8849,7 @@ VPRecipeBuilder::getScaledReduction(Instruction *PHI, SmallVector> Chains; if (auto *OpInst = dyn_cast(Op)) { - if(auto SR0 = getScaledReduction(PHI, OpInst, Range)) { + if (auto SR0 = getScaledReduction(PHI, OpInst, Range)) { Chains.append(*SR0); PHI = SR0->rbegin()->first.Reduction; @@ -9000,7 +8999,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, VPValue *BinOp = Operands[0]; VPValue *Phi = Operands[1]; VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe(); - if (isa(BinOpRecipe) || isa(BinOpRecipe)) + if (isa(BinOpRecipe) || + isa(BinOpRecipe)) std::swap(BinOp, Phi); return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Phi, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index b2d3d3944c1a5..9e09bfc62105c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2454,13 +2454,15 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe { ArrayRef({Op0, Op1}), ReductionInst), Opcode(Opcode) { auto *DefiningRecipe = getOperand(1)->getDefiningRecipe(); - assert((isa(DefiningRecipe) || isa(DefiningRecipe)) && + assert((isa(DefiningRecipe) || + isa(DefiningRecipe)) && "Unexpected operand order for partial reduction recipe"); } ~VPPartialReductionRecipe() override = default; VPPartialReductionRecipe *clone() override { - return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1), getUnderlyingInstr()); + return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1), + getUnderlyingInstr()); } VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 4fb32c14398e5..668c317033fe7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -325,8 +325,9 @@ void VPPartialReductionRecipe::execute(VPTransformState 
&State) { // Currently we don't have a partial_reduce_sub intrinsic, // so mimic the behaviour by negating the second operand - if(Opcode == Instruction::Sub) { - BinOpVal = Builder.CreateSub(Constant::getNullValue(BinOpVal->getType()), BinOpVal); + if (Opcode == Instruction::Sub) { + BinOpVal = Builder.CreateSub(Constant::getNullValue(BinOpVal->getType()), + BinOpVal); Opcode = Instruction::Add; } From 6c4a6b0146a6562a7280e72b74958d1d77b3d858 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Mon, 20 Jan 2025 14:34:03 +0000 Subject: [PATCH 3/8] Remove partial.reduce.sub support --- .../Target/AArch64/AArch64TargetTransformInfo.h | 2 +- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 15 +++------------ 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index c6cebcca67935..8e7e590c173ff 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -368,7 +368,7 @@ class AArch64TTIImpl : public BasicTTIImplBase { InstructionCost Invalid = InstructionCost::getInvalid(); InstructionCost Cost(TTI::TCC_Basic); - if (Opcode != Instruction::Add && Opcode != Instruction::Sub) + if (Opcode != Instruction::Add) return Invalid; if (InputTypeA != InputTypeB) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 668c317033fe7..979a8e0768a99 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -317,22 +317,13 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) { State.setDebugLocFrom(getDebugLoc()); auto &Builder = State.Builder; + assert(getOpcode() == Instruction::Add && + "Unhandled partial reduction opcode"); + Value *BinOpVal = State.get(getOperand(0)); Value *PhiVal = State.get(getOperand(1)); assert(PhiVal && BinOpVal && "Phi and Mul must be set"); - auto 
Opcode = getOpcode(); - - // Currently we don't have a partial_reduce_sub intrinsic, - // so mimic the behaviour by negating the second operand - if (Opcode == Instruction::Sub) { - BinOpVal = Builder.CreateSub(Constant::getNullValue(BinOpVal->getType()), - BinOpVal); - Opcode = Instruction::Add; - } - - assert(Opcode == Instruction::Add && "Unhandled partial reduction opcode"); - Type *RetTy = PhiVal->getType(); CallInst *V = Builder.CreateIntrinsic( From 2e18209632c18e544d17e28db7d2efc2a043f329 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Mon, 20 Jan 2025 14:34:47 +0000 Subject: [PATCH 4/8] No longer set the underlying instruction in VPPartialReductionRecipe::clone --- llvm/lib/Transforms/Vectorize/VPlan.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 9e09bfc62105c..1550630fb819a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2461,8 +2461,7 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe { ~VPPartialReductionRecipe() override = default; VPPartialReductionRecipe *clone() override { - return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1), - getUnderlyingInstr()); + return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1)); } VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC) From f79f577f92cec30887fd1e1de574b8cd9dcb1396 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Mon, 20 Jan 2025 14:35:07 +0000 Subject: [PATCH 5/8] Address comments and update test --- .../Transforms/Vectorize/LoopVectorize.cpp | 7 +- .../AArch64/partial-reduction-chained.ll | 568 ------ .../AArch64/partial-reduce-chained.ll | 1557 +++++++++++++++++ 3 files changed, 1560 insertions(+), 572 deletions(-) delete mode 100644 llvm/test/CodeGen/AArch64/partial-reduction-chained.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll diff --git 
a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index a18cf3c9bec2b..d913954c51c00 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8848,6 +8848,9 @@ VPRecipeBuilder::getScaledReduction(Instruction *PHI, Instruction *RdxExitInstr, SmallVector> Chains; + // Try and get a scaled reduction from the first non-phi operand. + // If one is found, we use the discovered reduction instruction in + // place of the accumulator for costing. if (auto *OpInst = dyn_cast(Op)) { if (auto SR0 = getScaledReduction(PHI, OpInst, Range)) { Chains.append(*SR0); @@ -8875,10 +8878,6 @@ VPRecipeBuilder::getScaledReduction(Instruction *PHI, Instruction *RdxExitInstr, Instruction *ExtA = cast(BinOp->getOperand(0)); Instruction *ExtB = cast(BinOp->getOperand(1)); - // Check that the extends extend from the same type. - if (A->getType() != B->getType()) - return std::nullopt; - TTI::PartialReductionExtendKind OpAExtend = TargetTransformInfo::getPartialReductionExtendKind(ExtA); TTI::PartialReductionExtendKind OpBExtend = diff --git a/llvm/test/CodeGen/AArch64/partial-reduction-chained.ll b/llvm/test/CodeGen/AArch64/partial-reduction-chained.ll deleted file mode 100644 index 4272e2f755249..0000000000000 --- a/llvm/test/CodeGen/AArch64/partial-reduction-chained.ll +++ /dev/null @@ -1,568 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -o - %s --passes=loop-vectorize -force-vector-width=16 -force-vector-interleave=1 -scalable-vectorization=on | FileCheck %s - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64-none-unknown-elf" - -; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read) vscale_range(1,16) -define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { -; CHECK-LABEL: define i32 
@chained_partial_reduce_add_sub( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 -; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 -; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP11]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP13]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 -; CHECK-NEXT: 
[[WIDE_LOAD6:%.*]] = load , ptr [[TMP14]], align 1 -; CHECK-NEXT: [[TMP25:%.*]] = sext [[WIDE_LOAD4]] to -; CHECK-NEXT: [[TMP18:%.*]] = sext [[WIDE_LOAD3]] to -; CHECK-NEXT: [[TMP27:%.*]] = sext [[WIDE_LOAD6]] to -; CHECK-NEXT: [[TMP29:%.*]] = mul nsw [[TMP25]], [[TMP18]] -; CHECK-NEXT: [[PARTIAL_REDUCE7:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI1]], [[TMP29]]) -; CHECK-NEXT: [[TMP31:%.*]] = mul nsw [[TMP25]], [[TMP27]] -; CHECK-NEXT: [[TMP33:%.*]] = sub zeroinitializer, [[TMP31]] -; CHECK-NEXT: [[PARTIAL_REDUCE9]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE7]], [[TMP33]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE9]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY]] ] -; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[B_PTR:%.*]] = 
getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[B_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 -; CHECK-NEXT: [[C_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 -; CHECK-NEXT: [[D_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 -; CHECK-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 -; CHECK-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 -; CHECK-NEXT: [[D_EXT:%.*]] = sext i8 [[D_VAL]] to i32 -; CHECK-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]] -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AC]] -; CHECK-NEXT: [[MUL_DB:%.*]] = mul nsw i32 [[B_EXT]], [[D_EXT]] -; CHECK-NEXT: [[SUB]] = sub i32 [[ADD]], [[MUL_DB]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]], !loop [[META4:![0-9]+]] -; -entry: - %cmp28.not = icmp ult i32 %N, 2 - %div27 = lshr i32 %N, 1 - %wide.trip.count = zext nneg i32 %div27 to i64 - br label %for.body - -for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry - %res.0.lcssa = phi i32 [ %sub, %for.body ] - ret i32 %res.0.lcssa - -for.body: ; preds = %for.body.preheader, %for.body - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %res = phi i32 [ 0, %entry ], [ %sub, %for.body ] - %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv - %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv - %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv - %a.val = load i8, ptr %a.ptr, align 1 - %b.val = load i8, ptr %b.ptr, align 1 - %c.val = load i8, ptr %c.ptr, align 1 - %a.ext = sext i8 %a.val to i32 - %b.ext = sext i8 %b.val to i32 - %c.ext = sext i8 %c.val to i32 - %mul.ab = mul nsw i32 %a.ext, %b.ext - %add = add nsw i32 %res, %mul.ab 
- %mul.ac = mul nsw i32 %a.ext, %c.ext - %sub = sub i32 %add, %mul.ac - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1 -} - -define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { -; CHECK-LABEL: define i32 @chained_partial_reduce_add_add( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 -; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 -; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = 
getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP11]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP13]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP14]], align 1 -; CHECK-NEXT: [[TMP25:%.*]] = sext [[WIDE_LOAD4]] to -; CHECK-NEXT: [[TMP18:%.*]] = sext [[WIDE_LOAD3]] to -; CHECK-NEXT: [[TMP27:%.*]] = sext [[WIDE_LOAD6]] to -; CHECK-NEXT: [[TMP29:%.*]] = mul nsw [[TMP25]], [[TMP18]] -; CHECK-NEXT: [[PARTIAL_REDUCE7:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI1]], [[TMP29]]) -; CHECK-NEXT: [[TMP31:%.*]] = mul nsw [[TMP25]], [[TMP27]] -; CHECK-NEXT: [[PARTIAL_REDUCE9]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE7]], [[TMP31]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE9]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 
[[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY]] ] -; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[B_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 -; CHECK-NEXT: [[C_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 -; CHECK-NEXT: [[D_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 -; CHECK-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 -; CHECK-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 -; CHECK-NEXT: [[D_EXT:%.*]] = sext i8 [[D_VAL]] to i32 -; CHECK-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]] -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AC]] -; CHECK-NEXT: [[MUL_DB:%.*]] = mul nsw i32 [[B_EXT]], [[D_EXT]] -; CHECK-NEXT: [[SUB]] = add i32 [[ADD]], [[MUL_DB]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]], !loop [[META4]] -; -entry: - %cmp28.not = icmp ult i32 %N, 2 - %div27 = lshr i32 %N, 1 - %wide.trip.count = zext nneg i32 %div27 to i64 - br label %for.body - -for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry - %res.0.lcssa = phi i32 [ %add.2, %for.body ] - ret i32 %res.0.lcssa - -for.body: ; preds = %for.body.preheader, %for.body - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %res = phi i32 [ 0, %entry ], [ %add.2, %for.body ] - %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv - %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv - %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 
%indvars.iv - %a.val = load i8, ptr %a.ptr, align 1 - %b.val = load i8, ptr %b.ptr, align 1 - %c.val = load i8, ptr %c.ptr, align 1 - %a.ext = sext i8 %a.val to i32 - %b.ext = sext i8 %b.val to i32 - %c.ext = sext i8 %c.val to i32 - %mul.ab = mul nsw i32 %a.ext, %b.ext - %add = add nsw i32 %res, %mul.ab - %mul.ac = mul nsw i32 %a.ext, %c.ext - %add.2 = add i32 %add, %mul.ac - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1 -} - -define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { -; CHECK-LABEL: define i32 @chained_partial_reduce_sub_add( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 -; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 -; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP11]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP13]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP14]], align 1 -; CHECK-NEXT: [[TMP25:%.*]] = sext [[WIDE_LOAD4]] to -; CHECK-NEXT: [[TMP18:%.*]] = sext [[WIDE_LOAD3]] to -; CHECK-NEXT: [[TMP27:%.*]] = sext [[WIDE_LOAD6]] to -; CHECK-NEXT: [[TMP29:%.*]] = mul nsw [[TMP25]], [[TMP18]] -; CHECK-NEXT: [[TMP31:%.*]] = sub zeroinitializer, [[TMP29]] -; CHECK-NEXT: [[PARTIAL_REDUCE7:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI1]], [[TMP31]]) -; CHECK-NEXT: [[TMP33:%.*]] = mul nsw [[TMP25]], [[TMP27]] -; CHECK-NEXT: [[PARTIAL_REDUCE9]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE7]], [[TMP33]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE9]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 
[ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: br label [[FOR_BODY1:%.*]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY1]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ] -; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY1]] ] -; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[B_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 -; CHECK-NEXT: [[C_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 -; CHECK-NEXT: [[D_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 -; CHECK-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 -; CHECK-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 -; CHECK-NEXT: [[D_EXT:%.*]] = sext i8 [[D_VAL]] to i32 -; CHECK-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]] -; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[RES]], [[MUL_AC]] -; CHECK-NEXT: [[MUL_DB:%.*]] = mul nsw i32 [[B_EXT]], [[D_EXT]] -; CHECK-NEXT: [[SUB]] = add i32 [[SUB1]], [[MUL_DB]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP9:![0-9]+]], !loop [[META4]] -; -entry: - %cmp28.not = icmp ult i32 %N, 2 - %div27 = lshr i32 %N, 1 - %wide.trip.count = zext nneg i32 %div27 to i64 - br label %for.body - -for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry - %res.0.lcssa = phi i32 [ %add, %for.body ] - ret i32 %res.0.lcssa - -for.body: ; preds = 
%for.body.preheader, %for.body - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %res = phi i32 [ 0, %entry ], [ %add, %for.body ] - - %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv - %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv - %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv - %a.val = load i8, ptr %a.ptr, align 1 - %b.val = load i8, ptr %b.ptr, align 1 - %c.val = load i8, ptr %c.ptr, align 1 - - %a.ext = sext i8 %a.val to i32 - %b.ext = sext i8 %b.val to i32 - %c.ext = sext i8 %c.val to i32 - %mul.ab = mul nsw i32 %a.ext, %b.ext - %sub = sub nsw i32 %res, %mul.ab - %mul.ac = mul nsw i32 %a.ext, %c.ext - %add = add i32 %sub, %mul.ac - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1 -} - -define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { -; CHECK-LABEL: define i32 @chained_partial_reduce_sub_sub( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 -; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 -; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = mul i64 
[[TMP4]], 16 -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP11]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP13]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP14]], align 1 -; CHECK-NEXT: [[TMP25:%.*]] = sext [[WIDE_LOAD4]] to -; CHECK-NEXT: [[TMP18:%.*]] = sext [[WIDE_LOAD3]] to -; CHECK-NEXT: [[TMP27:%.*]] = sext [[WIDE_LOAD6]] to -; CHECK-NEXT: [[TMP29:%.*]] = mul nsw [[TMP25]], [[TMP18]] -; CHECK-NEXT: [[TMP31:%.*]] = sub zeroinitializer, [[TMP29]] -; CHECK-NEXT: [[PARTIAL_REDUCE7:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI1]], [[TMP31]]) -; CHECK-NEXT: [[TMP33:%.*]] = mul nsw [[TMP25]], [[TMP27]] -; CHECK-NEXT: [[TMP35:%.*]] = sub zeroinitializer, [[TMP33]] -; CHECK-NEXT: [[PARTIAL_REDUCE9]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE7]], [[TMP35]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: 
[[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE9]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: br label [[FOR_BODY1:%.*]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY1]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ] -; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY1]] ] -; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[B_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 -; CHECK-NEXT: [[C_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 -; CHECK-NEXT: [[D_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 -; CHECK-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 -; CHECK-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 -; CHECK-NEXT: [[D_EXT:%.*]] = sext i8 [[D_VAL]] to i32 -; CHECK-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]] -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AC]] -; CHECK-NEXT: [[MUL_DB:%.*]] = mul nsw i32 [[B_EXT]], [[D_EXT]] -; CHECK-NEXT: [[ADD]] = sub i32 [[SUB]], [[MUL_DB]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], 
label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP11:![0-9]+]], !loop [[META4]] -; -entry: - %cmp28.not = icmp ult i32 %N, 2 - %div27 = lshr i32 %N, 1 - %wide.trip.count = zext nneg i32 %div27 to i64 - br label %for.body - -for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry - %res.0.lcssa = phi i32 [ %sub.2, %for.body ] - ret i32 %res.0.lcssa - -for.body: ; preds = %for.body.preheader, %for.body - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %res = phi i32 [ 0, %entry ], [ %sub.2, %for.body ] - - %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv - %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv - %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv - %a.val = load i8, ptr %a.ptr, align 1 - %b.val = load i8, ptr %b.ptr, align 1 - %c.val = load i8, ptr %c.ptr, align 1 - - %a.ext = sext i8 %a.val to i32 - %b.ext = sext i8 %b.val to i32 - %c.ext = sext i8 %c.val to i32 - - %mul.ab = mul nsw i32 %a.ext, %b.ext - %sub = sub nsw i32 %res, %mul.ab - %mul.ac = mul nsw i32 %a.ext, %c.ext - %sub.2 = sub i32 %sub, %mul.ac - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1 -} - -define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { -; CHECK-LABEL: define i32 @chained_partial_reduce_sub_add_sub( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 -; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 -; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label 
[[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP14]], align 1 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP15]], align 1 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP16]], align 1 -; CHECK-NEXT: [[TMP19:%.*]] = sext [[WIDE_LOAD2]] to -; CHECK-NEXT: [[TMP20:%.*]] = sext [[WIDE_LOAD3]] to -; CHECK-NEXT: [[TMP21:%.*]] = sext [[WIDE_LOAD4]] to -; CHECK-NEXT: [[TMP22:%.*]] = mul nsw [[TMP19]], [[TMP20]] -; CHECK-NEXT: [[TMP23:%.*]] = sub zeroinitializer, [[TMP22]] -; CHECK-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP23]]) -; CHECK-NEXT: [[TMP24:%.*]] = mul nsw [[TMP19]], [[TMP21]] -; CHECK-NEXT: [[PARTIAL_REDUCE4:%.*]] = call 
@llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE]], [[TMP24]]) -; CHECK-NEXT: [[TMP25:%.*]] = mul nsw [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP26:%.*]] = sub zeroinitializer, [[TMP25]] -; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE4]], [[TMP26]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE5]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP28]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ] -; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[C_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 -; CHECK-NEXT: [[D_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 -; CHECK-NEXT: [[E_VAL:%.*]] = load i8, 
ptr [[C_PTR]], align 1 -; CHECK-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 -; CHECK-NEXT: [[D_EXT:%.*]] = sext i8 [[D_VAL]] to i32 -; CHECK-NEXT: [[E_EXT:%.*]] = sext i8 [[E_VAL]] to i32 -; CHECK-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[C_EXT]], [[D_EXT]] -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AC]] -; CHECK-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[C_EXT]], [[E_EXT]] -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AB]] -; CHECK-NEXT: [[MUL_DB:%.*]] = mul nsw i32 [[D_EXT]], [[E_EXT]] -; CHECK-NEXT: [[SUB_2]] = sub i32 [[ADD]], [[MUL_DB]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]], !loop [[META4]] -; -entry: - %cmp28.not = icmp ult i32 %N, 2 - %div27 = lshr i32 %N, 1 - %wide.trip.count = zext nneg i32 %div27 to i64 - br label %for.body - -for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry - %res.0.lcssa = phi i32 [ %sub.2, %for.body ] - ret i32 %res.0.lcssa - -for.body: ; preds = %for.body.preheader, %for.body - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %res = phi i32 [ 0, %entry ], [ %sub.2, %for.body ] - - %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv - %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv - %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv - %a.val = load i8, ptr %a.ptr, align 1 - %b.val = load i8, ptr %b.ptr, align 1 - %c.val = load i8, ptr %c.ptr, align 1 - - %a.ext = sext i8 %a.val to i32 - %b.ext = sext i8 %b.val to i32 - %c.ext = sext i8 %c.val to i32 - - %mul.ab = mul nsw i32 %a.ext, %b.ext - %sub = sub nsw i32 %res, %mul.ab - %mul.ac = mul nsw i32 %a.ext, %c.ext - %add = add nsw i32 %sub, %mul.ac - %mul.bc = mul nsw i32 %b.ext, %c.ext - %sub.2 = sub i32 %add, %mul.bc - %indvars.iv.next = add nuw nsw i64 
%indvars.iv, 1 - %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1 -} - -attributes #0 = { mustprogress noinline nounwind vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-x3" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+ete,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+spe,+ssbs,+sve,+sve2,+sve2-bitperm,+trbe,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a" } - - -!0 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} -!1 = distinct !{!0} -;. -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} -; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} -; CHECK: [[META4]] = distinct !{[[META5:![0-9]+]]} -; CHECK: [[META5]] = !{!"llvm.loop.vectorize.scalable.enable", i1 true} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} -; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} -; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} -; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} -; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} -; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} -;. 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll new file mode 100644 index 0000000000000..94614f4a39631 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll @@ -0,0 +1,1557 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt --mattr=+neon,+dotprod -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-NEON +; RUN: opt --mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-SVE +; RUN: opt --mattr=+sve -vectorizer-maximize-bandwidth -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-SVE-MAXBW + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read) vscale_range(1,16) +define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { +; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_add_sub( +; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEON-NEXT: entry: +; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEON: vector.ph: +; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], 
[[N_MOD_VF]] +; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEON: vector.body: +; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 +; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEON-NEXT: [[TMP11:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP10]] +; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEON-NEXT: [[TMP13]] = sub <16 x i32> [[TMP11]], [[TMP12]] +; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEON: middle.block: +; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 
@llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]]) +; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-NEON: scalar.ph: +; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEON-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NEON: for.cond.cleanup: +; CHECK-NEON-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-NEON-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK-NEON: for.body: +; CHECK-NEON-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEON-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY]] ] +; CHECK-NEON-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEON-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-NEON-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 +; CHECK-NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 +; CHECK-NEON-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 +; CHECK-NEON-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 +; CHECK-NEON-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 +; CHECK-NEON-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 +; CHECK-NEON-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] +; CHECK-NEON-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AB]] +; CHECK-NEON-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] +; CHECK-NEON-NEXT: [[SUB]] = sub i32 [[ADD]], [[MUL_AC]] +; CHECK-NEON-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEON-NEXT: 
[[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEON-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]], !loop [[META4:![0-9]+]] +; +; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_add_sub( +; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SVE-NEXT: entry: +; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE: vector.ph: +; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE: vector.body: +; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr 
[[TMP7]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 +; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] +; CHECK-SVE-NEXT: [[TMP17:%.*]] = add [[VEC_PHI]], [[TMP16]] +; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-NEXT: [[TMP19]] = sub [[TMP17]], [[TMP18]] +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-SVE: middle.block: +; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) +; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-SVE: scalar.ph: +; CHECK-SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-SVE-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-SVE: for.cond.cleanup: +; CHECK-SVE-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-SVE-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK-SVE: for.body: +; CHECK-SVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-SVE-NEXT: [[RES:%.*]] 
= phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-SVE-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-SVE-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 +; CHECK-SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 +; CHECK-SVE-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 +; CHECK-SVE-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 +; CHECK-SVE-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 +; CHECK-SVE-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 +; CHECK-SVE-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] +; CHECK-SVE-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AB]] +; CHECK-SVE-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] +; CHECK-SVE-NEXT: [[SUB]] = sub i32 [[ADD]], [[MUL_AC]] +; CHECK-SVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-SVE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]], !loop [[META4:![0-9]+]] +; +; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_add_sub( +; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SVE-MAXBW-NEXT: entry: +; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-MAXBW-NEXT: br i1 
[[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE-MAXBW: vector.ph: +; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-MAXBW: vector.body: +; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] +; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = add 
[[VEC_PHI]], [[TMP16]] +; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP19]] = sub [[TMP17]], [[TMP18]] +; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-SVE-MAXBW: middle.block: +; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP19]]) +; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-SVE-MAXBW: scalar.ph: +; CHECK-SVE-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-SVE-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-SVE-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-SVE-MAXBW: for.cond.cleanup: +; CHECK-SVE-MAXBW-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-SVE-MAXBW-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK-SVE-MAXBW: for.body: +; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-SVE-MAXBW-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-SVE-MAXBW-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-SVE-MAXBW-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 +; CHECK-SVE-MAXBW-NEXT: 
[[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 +; CHECK-SVE-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 +; CHECK-SVE-MAXBW-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 +; CHECK-SVE-MAXBW-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] +; CHECK-SVE-MAXBW-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AB]] +; CHECK-SVE-MAXBW-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] +; CHECK-SVE-MAXBW-NEXT: [[SUB]] = sub i32 [[ADD]], [[MUL_AC]] +; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-SVE-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]], !loop [[META4:![0-9]+]] +; +entry: + %cmp28.not = icmp ult i32 %N, 2 + %div27 = lshr i32 %N, 1 + %wide.trip.count = zext nneg i32 %div27 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %res.0.lcssa = phi i32 [ %sub, %for.body ] + ret i32 %res.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %res = phi i32 [ 0, %entry ], [ %sub, %for.body ] + %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv + %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv + %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv + %a.val = load i8, ptr %a.ptr, align 1 + %b.val = load i8, ptr %b.ptr, align 1 + %c.val = load i8, ptr %c.ptr, align 1 + %a.ext = sext i8 %a.val to i32 + %b.ext = sext i8 %b.val to i32 + %c.ext = sext i8 %c.val to i32 + %mul.ab = mul nsw i32 %a.ext, %b.ext + %add = add nsw i32 %res, %mul.ab + %mul.ac = mul nsw i32 %a.ext, %c.ext + %sub = sub i32 %add, %mul.ac + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br 
i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1 +} + +define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { +; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_add_add( +; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEON-NEXT: entry: +; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEON: vector.ph: +; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEON: vector.body: +; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x 
i8>, ptr [[TMP6]], align 1 +; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) +; CHECK-NEON-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]]) +; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEON-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEON: middle.block: +; CHECK-NEON-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]]) +; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-NEON: scalar.ph: +; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEON-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NEON: for.cond.cleanup: +; CHECK-NEON-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD_2:%.*]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] +; CHECK-NEON-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK-NEON: for.body: +; CHECK-NEON-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEON-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] 
], [ [[ADD_2]], [[FOR_BODY]] ] +; CHECK-NEON-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEON-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-NEON-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 +; CHECK-NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 +; CHECK-NEON-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 +; CHECK-NEON-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 +; CHECK-NEON-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 +; CHECK-NEON-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 +; CHECK-NEON-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] +; CHECK-NEON-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AB]] +; CHECK-NEON-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] +; CHECK-NEON-NEXT: [[ADD_2]] = add i32 [[ADD]], [[MUL_AC]] +; CHECK-NEON-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEON-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]], !loop [[META4]] +; +; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_add_add( +; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: entry: +; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE: vector.ph: +; 
CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE: vector.body: +; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1 +; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1 +; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1 +; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> +; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32> +; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32> +; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]] +; CHECK-SVE-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]] +; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]] +; CHECK-SVE-NEXT: [[TMP19]] = add <vscale x 4 x i32> [[TMP17]], [[TMP18]] +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 
[[INDEX_NEXT]], [[N_VEC]] +; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-SVE: middle.block: +; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]]) +; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-SVE: scalar.ph: +; CHECK-SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-SVE-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-SVE: for.cond.cleanup: +; CHECK-SVE-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD_2:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-SVE-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK-SVE: for.body: +; CHECK-SVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-SVE-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD_2]], [[FOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-SVE-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-SVE-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 +; CHECK-SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 +; CHECK-SVE-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 +; CHECK-SVE-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 +; CHECK-SVE-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 +; CHECK-SVE-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 +; CHECK-SVE-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] +; CHECK-SVE-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AB]] +; CHECK-SVE-NEXT: [[MUL_AC:%.*]] = mul nsw i32 
[[A_EXT]], [[C_EXT]] +; CHECK-SVE-NEXT: [[ADD_2]] = add i32 [[ADD]], [[MUL_AC]] +; CHECK-SVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-SVE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]], !loop [[META4]] +; +; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_add_add( +; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-SVE-MAXBW-NEXT: entry: +; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE-MAXBW: vector.ph: +; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-MAXBW: vector.body: +; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] 
= getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> +; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32> +; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> +; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP16]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP17]]) +; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-SVE-MAXBW: middle.block: +; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE3]]) +; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-SVE-MAXBW: scalar.ph: +; 
CHECK-SVE-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-SVE-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-SVE-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-SVE-MAXBW: for.cond.cleanup: +; CHECK-SVE-MAXBW-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD_2:%.*]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; CHECK-SVE-MAXBW-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK-SVE-MAXBW: for.body: +; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD_2]], [[FOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-SVE-MAXBW-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-SVE-MAXBW-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-SVE-MAXBW-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 +; CHECK-SVE-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 +; CHECK-SVE-MAXBW-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 +; CHECK-SVE-MAXBW-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] +; CHECK-SVE-MAXBW-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AB]] +; CHECK-SVE-MAXBW-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] +; CHECK-SVE-MAXBW-NEXT: [[ADD_2]] = add i32 [[ADD]], [[MUL_AC]] +; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-SVE-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label 
[[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]], !loop [[META4]] +; +entry: + %cmp28.not = icmp ult i32 %N, 2 + %div27 = lshr i32 %N, 1 + %wide.trip.count = zext nneg i32 %div27 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %res.0.lcssa = phi i32 [ %add.2, %for.body ] + ret i32 %res.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %res = phi i32 [ 0, %entry ], [ %add.2, %for.body ] + %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv + %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv + %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv + %a.val = load i8, ptr %a.ptr, align 1 + %b.val = load i8, ptr %b.ptr, align 1 + %c.val = load i8, ptr %c.ptr, align 1 + %a.ext = sext i8 %a.val to i32 + %b.ext = sext i8 %b.val to i32 + %c.ext = sext i8 %c.val to i32 + %mul.ab = mul nsw i32 %a.ext, %b.ext + %add = add nsw i32 %res, %mul.ab + %mul.ac = mul nsw i32 %a.ext, %c.ext + %add.2 = add i32 %add, %mul.ac + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1 +} + +define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { +; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_sub_add( +; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEON-NEXT: entry: +; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEON: vector.ph: +; CHECK-NEON-NEXT: 
[[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEON: vector.body: +; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 +; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub <16 x i32> [[VEC_PHI]], [[TMP10]] +; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEON-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[TMP12]] +; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEON: middle.block: +; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]]) +; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-NEON: scalar.ph: +; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEON-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NEON: for.cond.cleanup: +; CHECK-NEON-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-NEON-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK-NEON: for.body: +; CHECK-NEON-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEON-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NEON-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEON-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-NEON-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 +; CHECK-NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 +; CHECK-NEON-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 +; CHECK-NEON-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 +; CHECK-NEON-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 +; CHECK-NEON-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 +; CHECK-NEON-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] +; CHECK-NEON-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]] +; CHECK-NEON-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] +; CHECK-NEON-NEXT: [[ADD]] = add i32 [[SUB]], 
[[MUL_AC]] +; CHECK-NEON-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEON-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]], !loop [[META4]] +; +; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_sub_add( +; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: entry: +; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE: vector.ph: +; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE: vector.body: +; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr 
[[C]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1 +; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1 +; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP12]], align 1 +; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> +; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32> +; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32> +; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP14]] +; CHECK-SVE-NEXT: [[TMP17:%.*]] = sub <vscale x 4 x i32> [[VEC_PHI]], [[TMP16]] +; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP13]], [[TMP15]] +; CHECK-SVE-NEXT: [[TMP19]] = add <vscale x 4 x i32> [[TMP17]], [[TMP18]] +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-SVE: middle.block: +; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]]) +; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-SVE: scalar.ph: +; CHECK-SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-SVE-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-SVE: for.cond.cleanup: +; CHECK-SVE-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-SVE-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK-SVE: for.body: +; CHECK-SVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], 
[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-SVE-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-SVE-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-SVE-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 +; CHECK-SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 +; CHECK-SVE-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 +; CHECK-SVE-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 +; CHECK-SVE-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 +; CHECK-SVE-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 +; CHECK-SVE-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] +; CHECK-SVE-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]] +; CHECK-SVE-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] +; CHECK-SVE-NEXT: [[ADD]] = add i32 [[SUB]], [[MUL_AC]] +; CHECK-SVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-SVE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]], !loop [[META4]] +; +; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_sub_add( +; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-SVE-MAXBW-NEXT: entry: +; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult 
i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE-MAXBW: vector.ph: +; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-MAXBW: vector.body: +; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> +; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32> +; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> +; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32>
[[TMP13]], [[TMP14]] +; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub <vscale x 8 x i32> [[VEC_PHI]], [[TMP16]] +; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP19]] = add <vscale x 8 x i32> [[TMP17]], [[TMP18]] +; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-SVE-MAXBW: middle.block: +; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP19]]) +; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-SVE-MAXBW: scalar.ph: +; CHECK-SVE-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-SVE-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-SVE-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-SVE-MAXBW: for.cond.cleanup: +; CHECK-SVE-MAXBW-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-SVE-MAXBW-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK-SVE-MAXBW: for.body: +; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-SVE-MAXBW-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-SVE-MAXBW-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-SVE-MAXBW-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[B_VAL:%.*]] = 
load i8, ptr [[B_PTR]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 +; CHECK-SVE-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 +; CHECK-SVE-MAXBW-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 +; CHECK-SVE-MAXBW-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] +; CHECK-SVE-MAXBW-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]] +; CHECK-SVE-MAXBW-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] +; CHECK-SVE-MAXBW-NEXT: [[ADD]] = add i32 [[SUB]], [[MUL_AC]] +; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-SVE-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]], !loop [[META4]] +; +entry: + %cmp28.not = icmp ult i32 %N, 2 + %div27 = lshr i32 %N, 1 + %wide.trip.count = zext nneg i32 %div27 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %res.0.lcssa = phi i32 [ %add, %for.body ] + ret i32 %res.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %res = phi i32 [ 0, %entry ], [ %add, %for.body ] + + %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv + %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv + %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv + %a.val = load i8, ptr %a.ptr, align 1 + %b.val = load i8, ptr %b.ptr, align 1 + %c.val = load i8, ptr %c.ptr, align 1 + + %a.ext = sext i8 %a.val to i32 + %b.ext = sext i8 %b.val to i32 + %c.ext = sext i8 %c.val to i32 + %mul.ab = mul nsw i32 %a.ext, %b.ext + %sub = sub nsw i32 %res, %mul.ab + %mul.ac = mul nsw i32 %a.ext, %c.ext + %add = add i32 %sub, %mul.ac + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = 
icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1 +} + +define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { +; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_sub_sub( +; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEON-NEXT: entry: +; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEON: vector.ph: +; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEON: vector.body: +; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0 +; 
CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 +; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub <16 x i32> [[VEC_PHI]], [[TMP10]] +; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEON-NEXT: [[TMP13]] = sub <16 x i32> [[TMP11]], [[TMP12]] +; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEON: middle.block: +; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]]) +; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-NEON: scalar.ph: +; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEON-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NEON: for.cond.cleanup: +; CHECK-NEON-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-NEON-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK-NEON: for.body: +; CHECK-NEON-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEON-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ] +; CHECK-NEON-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] +; 
CHECK-NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEON-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-NEON-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 +; CHECK-NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 +; CHECK-NEON-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 +; CHECK-NEON-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 +; CHECK-NEON-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 +; CHECK-NEON-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 +; CHECK-NEON-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] +; CHECK-NEON-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]] +; CHECK-NEON-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] +; CHECK-NEON-NEXT: [[SUB_2]] = sub i32 [[SUB]], [[MUL_AC]] +; CHECK-NEON-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEON-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]], !loop [[META4]] +; +; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_sub_sub( +; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: entry: +; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE: vector.ph: +; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-SVE-NEXT: 
[[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE: vector.body: +; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 +; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] +; CHECK-SVE-NEXT: [[TMP17:%.*]] = sub [[VEC_PHI]], [[TMP16]] +; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-NEXT: [[TMP19]] = sub [[TMP17]], [[TMP18]] +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP10:![0-9]+]] +; CHECK-SVE: middle.block: +; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) +; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-SVE: scalar.ph: +; CHECK-SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-SVE-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-SVE: for.cond.cleanup: +; CHECK-SVE-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-SVE-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK-SVE: for.body: +; CHECK-SVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-SVE-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-SVE-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-SVE-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 +; CHECK-SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 +; CHECK-SVE-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 +; CHECK-SVE-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 +; CHECK-SVE-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 +; CHECK-SVE-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 +; CHECK-SVE-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] +; CHECK-SVE-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]] +; CHECK-SVE-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] +; CHECK-SVE-NEXT: [[SUB_2]] = sub i32 [[SUB]], [[MUL_AC]] +; CHECK-SVE-NEXT: [[INDVARS_IV_NEXT]] = add 
nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-SVE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]], !loop [[META4]] +; +; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_sub_sub( +; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-SVE-MAXBW-NEXT: entry: +; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE-MAXBW: vector.ph: +; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-MAXBW: vector.body: +; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr 
[[B]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] +; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub [[VEC_PHI]], [[TMP16]] +; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP19]] = sub [[TMP17]], [[TMP18]] +; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-SVE-MAXBW: middle.block: +; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP19]]) +; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-SVE-MAXBW: scalar.ph: +; CHECK-SVE-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-SVE-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-SVE-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-SVE-MAXBW: for.cond.cleanup: 
+; CHECK-SVE-MAXBW-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-SVE-MAXBW-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK-SVE-MAXBW: for.body: +; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-SVE-MAXBW-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-SVE-MAXBW-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-SVE-MAXBW-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 +; CHECK-SVE-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 +; CHECK-SVE-MAXBW-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 +; CHECK-SVE-MAXBW-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] +; CHECK-SVE-MAXBW-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]] +; CHECK-SVE-MAXBW-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] +; CHECK-SVE-MAXBW-NEXT: [[SUB_2]] = sub i32 [[SUB]], [[MUL_AC]] +; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-SVE-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]], !loop [[META4]] +; +entry: + %cmp28.not = icmp ult i32 %N, 2 + %div27 = lshr i32 %N, 1 + %wide.trip.count = zext nneg i32 %div27 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + 
%res.0.lcssa = phi i32 [ %sub.2, %for.body ] + ret i32 %res.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %res = phi i32 [ 0, %entry ], [ %sub.2, %for.body ] + + %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv + %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv + %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv + %a.val = load i8, ptr %a.ptr, align 1 + %b.val = load i8, ptr %b.ptr, align 1 + %c.val = load i8, ptr %c.ptr, align 1 + + %a.ext = sext i8 %a.val to i32 + %b.ext = sext i8 %b.val to i32 + %c.ext = sext i8 %c.val to i32 + + %mul.ab = mul nsw i32 %a.ext, %b.ext + %sub = sub nsw i32 %res, %mul.ab + %mul.ac = mul nsw i32 %a.ext, %c.ext + %sub.2 = sub i32 %sub, %mul.ac + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1 +} + +define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { +; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_add_add_add( +; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEON-NEXT: entry: +; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEON: vector.ph: +; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEON: vector.body: +; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], 
[[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 +; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) +; CHECK-NEON-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]]) +; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP12]]) +; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; 
CHECK-NEON-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEON: middle.block: +; CHECK-NEON-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE4]]) +; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-NEON: scalar.ph: +; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEON-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NEON: for.cond.cleanup: +; CHECK-NEON-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-NEON-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK-NEON: for.body: +; CHECK-NEON-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEON-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ] +; CHECK-NEON-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEON-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-NEON-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 +; CHECK-NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 +; CHECK-NEON-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 +; CHECK-NEON-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 +; CHECK-NEON-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 +; CHECK-NEON-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 +; CHECK-NEON-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] +; CHECK-NEON-NEXT: 
[[SUB:%.*]] = add nsw i32 [[RES]], [[MUL_AB]] +; CHECK-NEON-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] +; CHECK-NEON-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AC]] +; CHECK-NEON-NEXT: [[MUL_BC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]] +; CHECK-NEON-NEXT: [[SUB_2]] = add i32 [[ADD]], [[MUL_BC]] +; CHECK-NEON-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEON-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]], !loop [[META4]] +; +; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_add_add_add( +; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: entry: +; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE: vector.ph: +; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE: vector.body: +; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], 
[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 +; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] +; CHECK-SVE-NEXT: [[TMP17:%.*]] = add [[VEC_PHI]], [[TMP16]] +; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-NEXT: [[TMP19:%.*]] = add [[TMP17]], [[TMP18]] +; CHECK-SVE-NEXT: [[TMP20:%.*]] = mul nsw [[TMP14]], [[TMP15]] +; CHECK-SVE-NEXT: [[TMP21]] = add [[TMP19]], [[TMP20]] +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-SVE-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-SVE-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-SVE: middle.block: +; CHECK-SVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP21]]) +; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-SVE: scalar.ph: +; CHECK-SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], 
[ 0, [[ENTRY:%.*]] ] +; CHECK-SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-SVE-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-SVE: for.cond.cleanup: +; CHECK-SVE-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] +; CHECK-SVE-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK-SVE: for.body: +; CHECK-SVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-SVE-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-SVE-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-SVE-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 +; CHECK-SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 +; CHECK-SVE-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 +; CHECK-SVE-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 +; CHECK-SVE-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 +; CHECK-SVE-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 +; CHECK-SVE-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] +; CHECK-SVE-NEXT: [[SUB:%.*]] = add nsw i32 [[RES]], [[MUL_AB]] +; CHECK-SVE-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] +; CHECK-SVE-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AC]] +; CHECK-SVE-NEXT: [[MUL_BC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]] +; CHECK-SVE-NEXT: [[SUB_2]] = add i32 [[ADD]], [[MUL_BC]] +; CHECK-SVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-SVE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]], !loop [[META4]] +; +; 
CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_add_add_add( +; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-SVE-MAXBW-NEXT: entry: +; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE-MAXBW: vector.ph: +; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 +; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-MAXBW: vector.body: +; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = 
load , ptr [[TMP10]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP16]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE]], [[TMP17]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP14]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE4]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE3]], [[TMP18]]) +; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-SVE-MAXBW: middle.block: +; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE4]]) +; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-SVE-MAXBW: scalar.ph: +; CHECK-SVE-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-SVE-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 
[[TMP20]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-SVE-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-SVE-MAXBW: for.cond.cleanup: +; CHECK-SVE-MAXBW-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] +; CHECK-SVE-MAXBW-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK-SVE-MAXBW: for.body: +; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-SVE-MAXBW-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-SVE-MAXBW-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-SVE-MAXBW-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 +; CHECK-SVE-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 +; CHECK-SVE-MAXBW-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 +; CHECK-SVE-MAXBW-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] +; CHECK-SVE-MAXBW-NEXT: [[SUB:%.*]] = add nsw i32 [[RES]], [[MUL_AB]] +; CHECK-SVE-MAXBW-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] +; CHECK-SVE-MAXBW-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AC]] +; CHECK-SVE-MAXBW-NEXT: [[MUL_BC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]] +; CHECK-SVE-MAXBW-NEXT: [[SUB_2]] = add i32 [[ADD]], [[MUL_BC]] +; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-SVE-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], 
label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]], !loop [[META4]] +; +entry: + %cmp28.not = icmp ult i32 %N, 2 + %div27 = lshr i32 %N, 1 + %wide.trip.count = zext nneg i32 %div27 to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %res.0.lcssa = phi i32 [ %sub.2, %for.body ] + ret i32 %res.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %res = phi i32 [ 0, %entry ], [ %sub.2, %for.body ] + + %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv + %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv + %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv + %a.val = load i8, ptr %a.ptr, align 1 + %b.val = load i8, ptr %b.ptr, align 1 + %c.val = load i8, ptr %c.ptr, align 1 + + %a.ext = sext i8 %a.val to i32 + %b.ext = sext i8 %b.val to i32 + %c.ext = sext i8 %c.val to i32 + + %mul.ab = mul nsw i32 %a.ext, %b.ext + %sub = add nsw i32 %res, %mul.ab + %mul.ac = mul nsw i32 %a.ext, %c.ext + %add = add nsw i32 %sub, %mul.ac + %mul.bc = mul nsw i32 %b.ext, %c.ext + %sub.2 = add i32 %add, %mul.bc + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1 +} + +define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { +; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_sub_add_sub( +; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEON-NEXT: entry: +; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label 
[[VECTOR_PH:%.*]] +; CHECK-NEON: vector.ph: +; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEON: vector.body: +; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP0]] +; CHECK-NEON-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEON-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP3]], i32 0 +; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 +; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub <16 x i32> [[VEC_PHI]], [[TMP10]] +; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEON-NEXT: [[TMP13:%.*]] = add <16 x i32> [[TMP11]], [[TMP12]] +; CHECK-NEON-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEON-NEXT: [[TMP15]] = sub <16 x i32> [[TMP13]], [[TMP14]] +; 
CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEON-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEON: middle.block: +; CHECK-NEON-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP15]]) +; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-NEON: scalar.ph: +; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEON-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NEON: for.cond.cleanup: +; CHECK-NEON-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; CHECK-NEON-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK-NEON: for.body: +; CHECK-NEON-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEON-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ] +; CHECK-NEON-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEON-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-NEON-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 +; CHECK-NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 +; CHECK-NEON-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 +; CHECK-NEON-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 +; CHECK-NEON-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 +; CHECK-NEON-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 +; CHECK-NEON-NEXT: [[MUL_AB:%.*]] = mul 
nsw i32 [[A_EXT]], [[B_EXT]] +; CHECK-NEON-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]] +; CHECK-NEON-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] +; CHECK-NEON-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AC]] +; CHECK-NEON-NEXT: [[MUL_BC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]] +; CHECK-NEON-NEXT: [[SUB_2]] = sub i32 [[ADD]], [[MUL_BC]] +; CHECK-NEON-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEON-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]], !loop [[META4]] +; +; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_sub_add_sub( +; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: entry: +; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE: vector.ph: +; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE: vector.body: +; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ 
zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 +; CHECK-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-SVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] +; CHECK-SVE-NEXT: [[TMP17:%.*]] = sub [[VEC_PHI]], [[TMP16]] +; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-NEXT: [[TMP19:%.*]] = add [[TMP17]], [[TMP18]] +; CHECK-SVE-NEXT: [[TMP20:%.*]] = mul nsw [[TMP14]], [[TMP15]] +; CHECK-SVE-NEXT: [[TMP21]] = sub [[TMP19]], [[TMP20]] +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-SVE-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-SVE-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-SVE: middle.block: +; CHECK-SVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP21]]) +; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-SVE: scalar.ph: +; CHECK-SVE-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-SVE-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-SVE: for.cond.cleanup: +; CHECK-SVE-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] +; CHECK-SVE-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK-SVE: for.body: +; CHECK-SVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-SVE-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-SVE-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-SVE-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 +; CHECK-SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 +; CHECK-SVE-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 +; CHECK-SVE-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 +; CHECK-SVE-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 +; CHECK-SVE-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 +; CHECK-SVE-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] +; CHECK-SVE-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]] +; CHECK-SVE-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] +; CHECK-SVE-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AC]] +; CHECK-SVE-NEXT: [[MUL_BC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]] +; CHECK-SVE-NEXT: [[SUB_2]] = sub i32 [[ADD]], [[MUL_BC]] +; CHECK-SVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-SVE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label 
[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]], !loop [[META4]] +; +; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_sub_add_sub( +; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-SVE-MAXBW-NEXT: entry: +; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 +; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 +; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-SVE-MAXBW: vector.ph: +; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SVE-MAXBW: vector.body: +; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] +; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], 
i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] +; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub [[VEC_PHI]], [[TMP16]] +; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = add [[TMP17]], [[TMP18]] +; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = mul nsw [[TMP14]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP21]] = sub [[TMP19]], [[TMP20]] +; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-SVE-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-SVE-MAXBW: middle.block: +; CHECK-SVE-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP21]]) +; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-SVE-MAXBW: scalar.ph: +; CHECK-SVE-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-SVE-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-SVE-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-SVE-MAXBW: for.cond.cleanup: +; CHECK-SVE-MAXBW-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ 
[[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] +; CHECK-SVE-MAXBW-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK-SVE-MAXBW: for.body: +; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-SVE-MAXBW-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-SVE-MAXBW-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-SVE-MAXBW-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 +; CHECK-SVE-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 +; CHECK-SVE-MAXBW-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 +; CHECK-SVE-MAXBW-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] +; CHECK-SVE-MAXBW-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]] +; CHECK-SVE-MAXBW-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] +; CHECK-SVE-MAXBW-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AC]] +; CHECK-SVE-MAXBW-NEXT: [[MUL_BC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]] +; CHECK-SVE-MAXBW-NEXT: [[SUB_2]] = sub i32 [[ADD]], [[MUL_BC]] +; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-SVE-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-SVE-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]], !loop [[META4]] +; +entry: + %cmp28.not = icmp ult i32 %N, 2 + %div27 = lshr i32 %N, 1 + %wide.trip.count = zext nneg i32 %div27 to i64 + br label 
%for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %res.0.lcssa = phi i32 [ %sub.2, %for.body ] + ret i32 %res.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %res = phi i32 [ 0, %entry ], [ %sub.2, %for.body ] + + %a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv + %b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv + %c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv + %a.val = load i8, ptr %a.ptr, align 1 + %b.val = load i8, ptr %b.ptr, align 1 + %c.val = load i8, ptr %c.ptr, align 1 + + %a.ext = sext i8 %a.val to i32 + %b.ext = sext i8 %b.val to i32 + %c.ext = sext i8 %c.val to i32 + + %mul.ab = mul nsw i32 %a.ext, %b.ext + %sub = sub nsw i32 %res, %mul.ab + %mul.ac = mul nsw i32 %a.ext, %c.ext + %add = add nsw i32 %sub, %mul.ac + %mul.bc = mul nsw i32 %b.ext, %c.ext + %sub.2 = sub i32 %add, %mul.bc + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1 +} + +attributes #0 = { vscale_range(1,16) } + + +!0 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} +!1 = distinct !{!0} +;. 
+; CHECK-NEON: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-NEON: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-NEON: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-NEON: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-NEON: [[META4]] = distinct !{[[META5:![0-9]+]]} +; CHECK-NEON: [[META5]] = !{!"llvm.loop.vectorize.scalable.enable", i1 true} +; CHECK-NEON: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-NEON: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK-NEON: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-NEON: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK-NEON: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK-NEON: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK-NEON: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK-NEON: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK-NEON: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} +; CHECK-NEON: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} +;. 
+; CHECK-SVE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-SVE: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-SVE: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-SVE: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-SVE: [[META4]] = distinct !{[[META5:![0-9]+]]} +; CHECK-SVE: [[META5]] = !{!"llvm.loop.vectorize.scalable.enable", i1 true} +; CHECK-SVE: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-SVE: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK-SVE: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-SVE: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK-SVE: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK-SVE: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK-SVE: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK-SVE: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK-SVE: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} +; CHECK-SVE: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} +;. 
+; CHECK-SVE-MAXBW: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-SVE-MAXBW: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-SVE-MAXBW: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-SVE-MAXBW: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-SVE-MAXBW: [[META4]] = distinct !{[[META5:![0-9]+]]} +; CHECK-SVE-MAXBW: [[META5]] = !{!"llvm.loop.vectorize.scalable.enable", i1 true} +; CHECK-SVE-MAXBW: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-SVE-MAXBW: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK-SVE-MAXBW: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-SVE-MAXBW: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK-SVE-MAXBW: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK-SVE-MAXBW: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK-SVE-MAXBW: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK-SVE-MAXBW: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK-SVE-MAXBW: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} +; CHECK-SVE-MAXBW: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} +;. 
From a03afa755dfaaf1d7e66369f334cf6ef929e5620 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Wed, 22 Jan 2025 14:30:12 +0000 Subject: [PATCH 6/8] Update test --- .../AArch64/partial-reduce-chained.ll | 534 +----------------- 1 file changed, 1 insertion(+), 533 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll index 94614f4a39631..bedf8b6b3a9b5 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll @@ -47,32 +47,6 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]]) ; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK-NEON: scalar.ph: -; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-NEON-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-NEON: for.cond.cleanup: -; CHECK-NEON-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] -; CHECK-NEON-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK-NEON: for.body: -; CHECK-NEON-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEON-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY]] ] -; CHECK-NEON-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEON-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] 
-; CHECK-NEON-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 -; CHECK-NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 -; CHECK-NEON-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 -; CHECK-NEON-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 -; CHECK-NEON-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 -; CHECK-NEON-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 -; CHECK-NEON-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] -; CHECK-NEON-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AB]] -; CHECK-NEON-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] -; CHECK-NEON-NEXT: [[SUB]] = sub i32 [[ADD]], [[MUL_AC]] -; CHECK-NEON-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEON-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]], !loop [[META4:![0-9]+]] ; ; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_add_sub( ; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { @@ -119,32 +93,6 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK-SVE: scalar.ph: -; CHECK-SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-SVE-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-SVE: for.cond.cleanup: -; CHECK-SVE-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] -; CHECK-SVE-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK-SVE: for.body: -; CHECK-SVE-NEXT: 
[[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-SVE-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY]] ] -; CHECK-SVE-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-SVE-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-SVE-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 -; CHECK-SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 -; CHECK-SVE-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 -; CHECK-SVE-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 -; CHECK-SVE-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 -; CHECK-SVE-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 -; CHECK-SVE-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] -; CHECK-SVE-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AB]] -; CHECK-SVE-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] -; CHECK-SVE-NEXT: [[SUB]] = sub i32 [[ADD]], [[MUL_AC]] -; CHECK-SVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-SVE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]], !loop [[META4:![0-9]+]] ; ; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_add_sub( ; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { @@ -191,32 +139,6 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP19]]) ; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label 
[[SCALAR_PH]] -; CHECK-SVE-MAXBW: scalar.ph: -; CHECK-SVE-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-SVE-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-SVE-MAXBW-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-SVE-MAXBW: for.cond.cleanup: -; CHECK-SVE-MAXBW-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] -; CHECK-SVE-MAXBW-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK-SVE-MAXBW: for.body: -; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB]], [[FOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-SVE-MAXBW-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-SVE-MAXBW-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-SVE-MAXBW-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 -; CHECK-SVE-MAXBW-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 -; CHECK-SVE-MAXBW-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 -; CHECK-SVE-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 -; CHECK-SVE-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 -; CHECK-SVE-MAXBW-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 -; CHECK-SVE-MAXBW-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] -; CHECK-SVE-MAXBW-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AB]] -; CHECK-SVE-MAXBW-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] -; CHECK-SVE-MAXBW-NEXT: [[SUB]] = sub i32 [[ADD]], [[MUL_AC]] -; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-SVE-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; 
CHECK-SVE-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]], !loop [[META4:![0-9]+]] ; entry: %cmp28.not = icmp ult i32 %N, 2 @@ -289,32 +211,6 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]]) ; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK-NEON: scalar.ph: -; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-NEON-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-NEON: for.cond.cleanup: -; CHECK-NEON-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD_2:%.*]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] -; CHECK-NEON-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK-NEON: for.body: -; CHECK-NEON-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEON-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD_2]], [[FOR_BODY]] ] -; CHECK-NEON-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEON-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-NEON-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 -; CHECK-NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 -; CHECK-NEON-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 -; CHECK-NEON-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 -; CHECK-NEON-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 -; CHECK-NEON-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 -; CHECK-NEON-NEXT: 
[[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] -; CHECK-NEON-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AB]] -; CHECK-NEON-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] -; CHECK-NEON-NEXT: [[ADD_2]] = add i32 [[ADD]], [[MUL_AC]] -; CHECK-NEON-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEON-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]], !loop [[META4]] ; ; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_add_add( ; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { @@ -361,32 +257,6 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK-SVE: scalar.ph: -; CHECK-SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-SVE-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-SVE: for.cond.cleanup: -; CHECK-SVE-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD_2:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] -; CHECK-SVE-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK-SVE: for.body: -; CHECK-SVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-SVE-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD_2]], [[FOR_BODY]] ] -; CHECK-SVE-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-SVE-NEXT: 
[[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-SVE-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 -; CHECK-SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 -; CHECK-SVE-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 -; CHECK-SVE-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 -; CHECK-SVE-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 -; CHECK-SVE-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 -; CHECK-SVE-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] -; CHECK-SVE-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AB]] -; CHECK-SVE-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] -; CHECK-SVE-NEXT: [[ADD_2]] = add i32 [[ADD]], [[MUL_AC]] -; CHECK-SVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-SVE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]], !loop [[META4]] ; ; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_add_add( ; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { @@ -433,32 +303,6 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE3]]) ; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK-SVE-MAXBW: scalar.ph: -; CHECK-SVE-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-SVE-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-SVE-MAXBW-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-SVE-MAXBW: for.cond.cleanup: -; CHECK-SVE-MAXBW-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD_2:%.*]], [[FOR_BODY]] ], 
[ [[TMP19]], [[MIDDLE_BLOCK]] ] -; CHECK-SVE-MAXBW-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK-SVE-MAXBW: for.body: -; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD_2]], [[FOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-SVE-MAXBW-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-SVE-MAXBW-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-SVE-MAXBW-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 -; CHECK-SVE-MAXBW-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 -; CHECK-SVE-MAXBW-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 -; CHECK-SVE-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 -; CHECK-SVE-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 -; CHECK-SVE-MAXBW-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 -; CHECK-SVE-MAXBW-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] -; CHECK-SVE-MAXBW-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[MUL_AB]] -; CHECK-SVE-MAXBW-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] -; CHECK-SVE-MAXBW-NEXT: [[ADD_2]] = add i32 [[ADD]], [[MUL_AC]] -; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-SVE-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-SVE-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]], !loop [[META4]] ; entry: %cmp28.not = icmp ult i32 %N, 2 @@ -531,32 +375,6 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]]) ; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], 
[[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK-NEON: scalar.ph: -; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-NEON-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-NEON: for.cond.cleanup: -; CHECK-NEON-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] -; CHECK-NEON-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK-NEON: for.body: -; CHECK-NEON-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEON-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ] -; CHECK-NEON-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEON-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-NEON-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 -; CHECK-NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 -; CHECK-NEON-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 -; CHECK-NEON-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 -; CHECK-NEON-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 -; CHECK-NEON-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 -; CHECK-NEON-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] -; CHECK-NEON-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]] -; CHECK-NEON-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] -; CHECK-NEON-NEXT: [[ADD]] = add i32 [[SUB]], [[MUL_AC]] -; CHECK-NEON-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEON-NEXT: br i1 [[EXITCOND_NOT]], label 
[[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]], !loop [[META4]] ; ; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_sub_add( ; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { @@ -603,32 +421,6 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK-SVE: scalar.ph: -; CHECK-SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-SVE-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-SVE: for.cond.cleanup: -; CHECK-SVE-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] -; CHECK-SVE-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK-SVE: for.body: -; CHECK-SVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-SVE-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ] -; CHECK-SVE-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-SVE-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-SVE-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 -; CHECK-SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 -; CHECK-SVE-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 -; CHECK-SVE-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 -; CHECK-SVE-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 -; CHECK-SVE-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 -; 
CHECK-SVE-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] -; CHECK-SVE-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]] -; CHECK-SVE-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] -; CHECK-SVE-NEXT: [[ADD]] = add i32 [[SUB]], [[MUL_AC]] -; CHECK-SVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-SVE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]], !loop [[META4]] ; ; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_sub_add( ; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { @@ -675,32 +467,6 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP19]]) ; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK-SVE-MAXBW: scalar.ph: -; CHECK-SVE-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-SVE-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-SVE-MAXBW-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-SVE-MAXBW: for.cond.cleanup: -; CHECK-SVE-MAXBW-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] -; CHECK-SVE-MAXBW-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK-SVE-MAXBW: for.body: -; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] -; 
CHECK-SVE-MAXBW-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-SVE-MAXBW-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-SVE-MAXBW-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 -; CHECK-SVE-MAXBW-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 -; CHECK-SVE-MAXBW-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 -; CHECK-SVE-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 -; CHECK-SVE-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 -; CHECK-SVE-MAXBW-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 -; CHECK-SVE-MAXBW-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] -; CHECK-SVE-MAXBW-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]] -; CHECK-SVE-MAXBW-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] -; CHECK-SVE-MAXBW-NEXT: [[ADD]] = add i32 [[SUB]], [[MUL_AC]] -; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-SVE-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-SVE-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]], !loop [[META4]] ; entry: %cmp28.not = icmp ult i32 %N, 2 @@ -775,32 +541,6 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]]) ; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK-NEON: scalar.ph: -; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-NEON-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-NEON: for.cond.cleanup: -; CHECK-NEON-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ 
[[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] -; CHECK-NEON-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK-NEON: for.body: -; CHECK-NEON-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEON-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ] -; CHECK-NEON-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEON-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-NEON-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 -; CHECK-NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 -; CHECK-NEON-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 -; CHECK-NEON-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 -; CHECK-NEON-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 -; CHECK-NEON-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 -; CHECK-NEON-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] -; CHECK-NEON-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]] -; CHECK-NEON-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] -; CHECK-NEON-NEXT: [[SUB_2]] = sub i32 [[SUB]], [[MUL_AC]] -; CHECK-NEON-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEON-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]], !loop [[META4]] ; ; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_sub_sub( ; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { @@ -847,32 +587,6 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] 
= icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK-SVE: scalar.ph: -; CHECK-SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-SVE-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-SVE: for.cond.cleanup: -; CHECK-SVE-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] -; CHECK-SVE-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK-SVE: for.body: -; CHECK-SVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-SVE-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ] -; CHECK-SVE-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-SVE-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-SVE-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 -; CHECK-SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 -; CHECK-SVE-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 -; CHECK-SVE-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 -; CHECK-SVE-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 -; CHECK-SVE-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 -; CHECK-SVE-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] -; CHECK-SVE-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]] -; CHECK-SVE-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] -; CHECK-SVE-NEXT: [[SUB_2]] = sub i32 [[SUB]], [[MUL_AC]] -; CHECK-SVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-SVE-NEXT: br i1 [[EXITCOND_NOT]], 
label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]], !loop [[META4]] ; ; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_sub_sub( ; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { @@ -919,32 +633,6 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP19]]) ; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK-SVE-MAXBW: scalar.ph: -; CHECK-SVE-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-SVE-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-SVE-MAXBW-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-SVE-MAXBW: for.cond.cleanup: -; CHECK-SVE-MAXBW-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] -; CHECK-SVE-MAXBW-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK-SVE-MAXBW: for.body: -; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-SVE-MAXBW-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-SVE-MAXBW-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-SVE-MAXBW-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 -; CHECK-SVE-MAXBW-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 -; CHECK-SVE-MAXBW-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 -; CHECK-SVE-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] 
to i32 -; CHECK-SVE-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 -; CHECK-SVE-MAXBW-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 -; CHECK-SVE-MAXBW-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] -; CHECK-SVE-MAXBW-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]] -; CHECK-SVE-MAXBW-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] -; CHECK-SVE-MAXBW-NEXT: [[SUB_2]] = sub i32 [[SUB]], [[MUL_AC]] -; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-SVE-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-SVE-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]], !loop [[META4]] ; entry: %cmp28.not = icmp ult i32 %N, 2 @@ -1022,34 +710,6 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-NEON-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE4]]) ; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK-NEON: scalar.ph: -; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-NEON-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-NEON: for.cond.cleanup: -; CHECK-NEON-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] -; CHECK-NEON-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK-NEON: for.body: -; CHECK-NEON-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEON-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ] -; CHECK-NEON-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] -; 
CHECK-NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEON-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-NEON-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 -; CHECK-NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 -; CHECK-NEON-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 -; CHECK-NEON-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 -; CHECK-NEON-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 -; CHECK-NEON-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 -; CHECK-NEON-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] -; CHECK-NEON-NEXT: [[SUB:%.*]] = add nsw i32 [[RES]], [[MUL_AB]] -; CHECK-NEON-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] -; CHECK-NEON-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AC]] -; CHECK-NEON-NEXT: [[MUL_BC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]] -; CHECK-NEON-NEXT: [[SUB_2]] = add i32 [[ADD]], [[MUL_BC]] -; CHECK-NEON-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEON-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]], !loop [[META4]] ; ; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_add_add_add( ; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { @@ -1098,34 +758,6 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP21]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK-SVE: scalar.ph: -; CHECK-SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], 
[[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-SVE-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-SVE: for.cond.cleanup: -; CHECK-SVE-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] -; CHECK-SVE-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK-SVE: for.body: -; CHECK-SVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-SVE-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ] -; CHECK-SVE-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-SVE-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-SVE-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 -; CHECK-SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 -; CHECK-SVE-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 -; CHECK-SVE-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 -; CHECK-SVE-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 -; CHECK-SVE-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 -; CHECK-SVE-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] -; CHECK-SVE-NEXT: [[SUB:%.*]] = add nsw i32 [[RES]], [[MUL_AB]] -; CHECK-SVE-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] -; CHECK-SVE-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AC]] -; CHECK-SVE-NEXT: [[MUL_BC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]] -; CHECK-SVE-NEXT: [[SUB_2]] = add i32 [[ADD]], [[MUL_BC]] -; CHECK-SVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-SVE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]], !loop [[META4]] ; ; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_add_add_add( ; 
CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { @@ -1171,37 +803,9 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-SVE-MAXBW: middle.block: -; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE4]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE4]]) ; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK-SVE-MAXBW: scalar.ph: -; CHECK-SVE-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-SVE-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-SVE-MAXBW-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-SVE-MAXBW: for.cond.cleanup: -; CHECK-SVE-MAXBW-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] -; CHECK-SVE-MAXBW-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK-SVE-MAXBW: for.body: -; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-SVE-MAXBW-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-SVE-MAXBW-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-SVE-MAXBW-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 -; 
CHECK-SVE-MAXBW-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 -; CHECK-SVE-MAXBW-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 -; CHECK-SVE-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 -; CHECK-SVE-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 -; CHECK-SVE-MAXBW-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 -; CHECK-SVE-MAXBW-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] -; CHECK-SVE-MAXBW-NEXT: [[SUB:%.*]] = add nsw i32 [[RES]], [[MUL_AB]] -; CHECK-SVE-MAXBW-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] -; CHECK-SVE-MAXBW-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AC]] -; CHECK-SVE-MAXBW-NEXT: [[MUL_BC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]] -; CHECK-SVE-MAXBW-NEXT: [[SUB_2]] = add i32 [[ADD]], [[MUL_BC]] -; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-SVE-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-SVE-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]], !loop [[META4]] ; entry: %cmp28.not = icmp ult i32 %N, 2 @@ -1281,34 +885,6 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-NEON-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP15]]) ; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK-NEON: scalar.ph: -; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-NEON-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-NEON: for.cond.cleanup: -; CHECK-NEON-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] -; CHECK-NEON-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK-NEON: 
for.body: -; CHECK-NEON-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEON-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ] -; CHECK-NEON-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEON-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEON-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-NEON-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 -; CHECK-NEON-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 -; CHECK-NEON-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 -; CHECK-NEON-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 -; CHECK-NEON-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 -; CHECK-NEON-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 -; CHECK-NEON-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] -; CHECK-NEON-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]] -; CHECK-NEON-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] -; CHECK-NEON-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AC]] -; CHECK-NEON-NEXT: [[MUL_BC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]] -; CHECK-NEON-NEXT: [[SUB_2]] = sub i32 [[ADD]], [[MUL_BC]] -; CHECK-NEON-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEON-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEON-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]], !loop [[META4]] ; ; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_sub_add_sub( ; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { @@ -1357,34 +933,6 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP21]]) ; 
CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK-SVE: scalar.ph: -; CHECK-SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-SVE-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-SVE: for.cond.cleanup: -; CHECK-SVE-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] -; CHECK-SVE-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK-SVE: for.body: -; CHECK-SVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-SVE-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ] -; CHECK-SVE-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-SVE-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-SVE-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-SVE-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 -; CHECK-SVE-NEXT: [[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 -; CHECK-SVE-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 -; CHECK-SVE-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 -; CHECK-SVE-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 -; CHECK-SVE-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 -; CHECK-SVE-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] -; CHECK-SVE-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]] -; CHECK-SVE-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] -; CHECK-SVE-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AC]] -; CHECK-SVE-NEXT: [[MUL_BC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]] -; CHECK-SVE-NEXT: [[SUB_2]] = sub i32 [[ADD]], [[MUL_BC]] -; CHECK-SVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw 
nsw i64 [[INDVARS_IV]], 1 -; CHECK-SVE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-SVE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]], !loop [[META4]] ; ; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_sub_add_sub( ; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { @@ -1433,34 +981,6 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP21]]) ; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK-SVE-MAXBW: scalar.ph: -; CHECK-SVE-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-SVE-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-SVE-MAXBW-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-SVE-MAXBW: for.cond.cleanup: -; CHECK-SVE-MAXBW-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[SUB_2:%.*]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] -; CHECK-SVE-MAXBW-NEXT: ret i32 [[RES_0_LCSSA]] -; CHECK-SVE-MAXBW: for.body: -; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[RES:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB_2]], [[FOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[A_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-SVE-MAXBW-NEXT: [[B_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-SVE-MAXBW-NEXT: [[C_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-SVE-MAXBW-NEXT: [[A_VAL:%.*]] = load i8, ptr [[A_PTR]], align 1 -; CHECK-SVE-MAXBW-NEXT: 
[[B_VAL:%.*]] = load i8, ptr [[B_PTR]], align 1 -; CHECK-SVE-MAXBW-NEXT: [[C_VAL:%.*]] = load i8, ptr [[C_PTR]], align 1 -; CHECK-SVE-MAXBW-NEXT: [[A_EXT:%.*]] = sext i8 [[A_VAL]] to i32 -; CHECK-SVE-MAXBW-NEXT: [[B_EXT:%.*]] = sext i8 [[B_VAL]] to i32 -; CHECK-SVE-MAXBW-NEXT: [[C_EXT:%.*]] = sext i8 [[C_VAL]] to i32 -; CHECK-SVE-MAXBW-NEXT: [[MUL_AB:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] -; CHECK-SVE-MAXBW-NEXT: [[SUB:%.*]] = sub nsw i32 [[RES]], [[MUL_AB]] -; CHECK-SVE-MAXBW-NEXT: [[MUL_AC:%.*]] = mul nsw i32 [[A_EXT]], [[C_EXT]] -; CHECK-SVE-MAXBW-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB]], [[MUL_AC]] -; CHECK-SVE-MAXBW-NEXT: [[MUL_BC:%.*]] = mul nsw i32 [[B_EXT]], [[C_EXT]] -; CHECK-SVE-MAXBW-NEXT: [[SUB_2]] = sub i32 [[ADD]], [[MUL_BC]] -; CHECK-SVE-MAXBW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-SVE-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-SVE-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]], !loop [[META4]] ; entry: %cmp28.not = icmp ult i32 %N, 2 @@ -1503,55 +1023,3 @@ attributes #0 = { vscale_range(1,16) } !0 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} !1 = distinct !{!0} -;. 
-; CHECK-NEON: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} -; CHECK-NEON: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK-NEON: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK-NEON: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} -; CHECK-NEON: [[META4]] = distinct !{[[META5:![0-9]+]]} -; CHECK-NEON: [[META5]] = !{!"llvm.loop.vectorize.scalable.enable", i1 true} -; CHECK-NEON: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} -; CHECK-NEON: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} -; CHECK-NEON: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} -; CHECK-NEON: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} -; CHECK-NEON: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} -; CHECK-NEON: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} -; CHECK-NEON: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} -; CHECK-NEON: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} -; CHECK-NEON: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} -; CHECK-NEON: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} -;. 
-; CHECK-SVE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} -; CHECK-SVE: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK-SVE: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK-SVE: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} -; CHECK-SVE: [[META4]] = distinct !{[[META5:![0-9]+]]} -; CHECK-SVE: [[META5]] = !{!"llvm.loop.vectorize.scalable.enable", i1 true} -; CHECK-SVE: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} -; CHECK-SVE: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} -; CHECK-SVE: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} -; CHECK-SVE: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} -; CHECK-SVE: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} -; CHECK-SVE: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} -; CHECK-SVE: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} -; CHECK-SVE: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} -; CHECK-SVE: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} -; CHECK-SVE: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} -;. 
-; CHECK-SVE-MAXBW: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} -; CHECK-SVE-MAXBW: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK-SVE-MAXBW: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK-SVE-MAXBW: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} -; CHECK-SVE-MAXBW: [[META4]] = distinct !{[[META5:![0-9]+]]} -; CHECK-SVE-MAXBW: [[META5]] = !{!"llvm.loop.vectorize.scalable.enable", i1 true} -; CHECK-SVE-MAXBW: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} -; CHECK-SVE-MAXBW: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} -; CHECK-SVE-MAXBW: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} -; CHECK-SVE-MAXBW: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} -; CHECK-SVE-MAXBW: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} -; CHECK-SVE-MAXBW: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} -; CHECK-SVE-MAXBW: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} -; CHECK-SVE-MAXBW: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} -; CHECK-SVE-MAXBW: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} -; CHECK-SVE-MAXBW: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} -;. 
From 33939d8942b1d6de1afe0534e404eb48d442bfb8 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Thu, 23 Jan 2025 14:31:05 +0000 Subject: [PATCH 7/8] Rename variables --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 6 +++--- llvm/lib/Transforms/Vectorize/VPlan.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d913954c51c00..6f99a05ce9b69 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8996,13 +8996,13 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, "Unexpected number of operands for partial reduction"); VPValue *BinOp = Operands[0]; - VPValue *Phi = Operands[1]; + VPValue *Accumulator = Operands[1]; VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe(); if (isa(BinOpRecipe) || isa(BinOpRecipe)) - std::swap(BinOp, Phi); + std::swap(BinOp, Accumulator); - return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Phi, + return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Accumulator, Reduction); } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 1550630fb819a..5865e8b5dee64 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2453,9 +2453,9 @@ class VPPartialReductionRecipe : public VPSingleDefRecipe { : VPSingleDefRecipe(VPDef::VPPartialReductionSC, ArrayRef({Op0, Op1}), ReductionInst), Opcode(Opcode) { - auto *DefiningRecipe = getOperand(1)->getDefiningRecipe(); - assert((isa(DefiningRecipe) || - isa(DefiningRecipe)) && + auto *AccumulatorRecipe = getOperand(1)->getDefiningRecipe(); + assert((isa(AccumulatorRecipe) || + isa(AccumulatorRecipe)) && "Unexpected operand order for partial reduction recipe"); } ~VPPartialReductionRecipe() override = default; From 48aeb02dadcab8b3e0123eb74cd7fe5ffa269db6 Mon Sep 17 00:00:00 2001 From: 
Nick Guy Date: Thu, 23 Jan 2025 15:05:56 +0000 Subject: [PATCH 8/8] Format --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 6f99a05ce9b69..585785a181a84 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9002,8 +9002,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, isa(BinOpRecipe)) std::swap(BinOp, Accumulator); - return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, Accumulator, - Reduction); + return new VPPartialReductionRecipe(Reduction->getOpcode(), BinOp, + Accumulator, Reduction); } void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,