diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 325056c781a53..a1529e49d5dba 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -4683,7 +4683,9 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost( InstructionCost Invalid = InstructionCost::getInvalid(); InstructionCost Cost(TTI::TCC_Basic); - if (Opcode != Instruction::Add) + // Sub opcodes currently only occur in chained cases. + // Independent partial reduction subtractions are still costed as an add + if (Opcode != Instruction::Add && Opcode != Instruction::Sub) return Invalid; if (InputTypeA != InputTypeB) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 610e4904a80ad..39f7ed971e55b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8800,6 +8800,10 @@ bool VPRecipeBuilder::getScaledReductions( return false; using namespace llvm::PatternMatch; + // Use the side-effect of match to replace BinOp only if the pattern is + // matched, we don't care at this point whether it actually matched. + match(BinOp, m_Neg(m_BinOp(BinOp))); + Value *A, *B; if (!match(BinOp->getOperand(0), m_ZExtOrSExt(m_Value(A))) || !match(BinOp->getOperand(1), m_ZExtOrSExt(m_Value(B)))) @@ -8932,6 +8936,19 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, std::swap(BinOp, Accumulator); unsigned ReductionOpcode = Reduction->getOpcode(); + if (ReductionOpcode == Instruction::Sub) { + VPBasicBlock *ParentBlock = Builder.getInsertBlock(); + assert(ParentBlock && "Builder must have an insert block."); + + auto *const Zero = ConstantInt::get(Reduction->getType(), 0); + SmallVector Ops; + Ops.push_back(Plan.getOrAddLiveIn(Zero)); + Ops.push_back(BinOp); + BinOp = new VPWidenRecipe(*Reduction, make_range(Ops.begin(), Ops.end())); + ParentBlock->appendRecipe(BinOp->getDefiningRecipe()); + ReductionOpcode = Instruction::Add; + } + if (CM.blockNeedsPredicationForAnyReason(Reduction->getParent())) { assert((ReductionOpcode == Instruction::Add || ReductionOpcode == Instruction::Sub) && diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 618c8aef38408..c3c0a92e5a943 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -27,6 +27,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/IR/VectorBuilder.h" @@ -284,13 +285,18 @@ InstructionCost VPPartialReductionRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { std::optional Opcode = std::nullopt; - VPRecipeBase *BinOpR = getOperand(0)->getDefiningRecipe(); + VPValue *BinOp = getOperand(0); // If the partial reduction is predicated, a select will be operand 0 rather // than the binary op using namespace llvm::VPlanPatternMatch; if (match(getOperand(0), m_Select(m_VPValue(), m_VPValue(), m_VPValue()))) - BinOpR = BinOpR->getOperand(1)->getDefiningRecipe(); + BinOp = BinOp->getDefiningRecipe()->getOperand(1); + + // If BinOp is a negation, use the side effect of match to assign the actual + // binary operation to BinOp + match(BinOp, m_Binary(m_SpecificInt(0), m_VPValue(BinOp))); + VPRecipeBase *BinOpR = BinOp->getDefiningRecipe(); if (auto *WidenR = dyn_cast(BinOpR)) Opcode = std::make_optional(WidenR->getOpcode()); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll index bedf8b6b3a9b5..4e4a5c82c298a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; RUN: opt --mattr=+neon,+dotprod -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-NEON ; RUN: opt --mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-SVE -; RUN: opt --mattr=+sve -vectorizer-maximize-bandwidth -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-SVE-MAXBW +; RUN: opt --mattr=+sve -vectorizer-maximize-bandwidth -passes=loop-vectorize -force-vector-width=8 -scalable-vectorization=preferred -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-SVE-MAXBW target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-none-unknown-elf" @@ -22,7 +22,7 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NEON: vector.body: ; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]] ; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]] @@ -37,14 +37,15 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-NEON-NEXT: [[TMP11:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP10]] +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]]) ; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] -; CHECK-NEON-NEXT: [[TMP13]] = sub <16 x i32> [[TMP11]], [[TMP12]] +; CHECK-NEON-NEXT: [[TMP13:%.*]] = sub <16 x i32> zeroinitializer, [[TMP12]] +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-NEON: middle.block: -; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]]) +; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]]) ; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; @@ -114,7 +115,7 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE-MAXBW: vector.body: ; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 ; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] ; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] @@ -129,14 +130,15 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] -; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = add [[VEC_PHI]], [[TMP16]] -; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] -; CHECK-SVE-MAXBW-NEXT: [[TMP19]] = sub [[TMP17]], [[TMP18]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP16]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = sub zeroinitializer, [[TMP17]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP18]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-SVE-MAXBW: middle.block: -; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP19]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE3]]) ; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; @@ -350,7 +352,7 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NEON: vector.body: ; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]] ; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]] @@ -365,14 +367,15 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub <16 x i32> [[VEC_PHI]], [[TMP10]] +; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP10]] +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) ; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] -; CHECK-NEON-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[TMP12]] +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-NEON: middle.block: -; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]]) +; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]]) ; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; @@ -442,7 +445,7 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE-MAXBW: vector.body: ; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 ; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] ; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] @@ -457,14 +460,15 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] -; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub [[VEC_PHI]], [[TMP16]] +; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw zeroinitializer, [[TMP16]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP17]]) ; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] -; CHECK-SVE-MAXBW-NEXT: [[TMP19]] = add [[TMP17]], [[TMP18]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP18]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-SVE-MAXBW: middle.block: -; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP19]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE3]]) ; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; @@ -516,7 +520,7 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NEON: vector.body: ; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]] ; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]] @@ -531,14 +535,16 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub <16 x i32> [[VEC_PHI]], [[TMP10]] +; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP10]] +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) ; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] -; CHECK-NEON-NEXT: [[TMP13]] = sub <16 x i32> [[TMP11]], [[TMP12]] +; CHECK-NEON-NEXT: [[TMP13:%.*]] = sub <16 x i32> zeroinitializer, [[TMP12]] +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-NEON: middle.block: -; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]]) +; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]]) ; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; @@ -608,7 +614,7 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE-MAXBW: vector.body: ; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 ; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] ; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] @@ -623,14 +629,16 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] -; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub [[VEC_PHI]], [[TMP16]] +; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw zeroinitializer, [[TMP16]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP17]]) ; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] -; CHECK-SVE-MAXBW-NEXT: [[TMP19]] = sub [[TMP17]], [[TMP18]] +; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = sub zeroinitializer, [[TMP18]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP19]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-SVE-MAXBW: middle.block: -; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP19]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE3]]) ; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; @@ -766,44 +774,44 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 ; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 ; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 ; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-SVE-MAXBW: vector.ph: ; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 ; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] ; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 +; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 ; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE-MAXBW: vector.body: ; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 ; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] ; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] ; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[TMP6]] ; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i32 0 -; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 -; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 -; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 1 -; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to -; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to -; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to -; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP16]]) -; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP13]], [[TMP15]] -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE]], [[TMP17]]) -; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP14]], [[TMP15]] -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE4]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE3]], [[TMP18]]) +; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP16]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP17]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP14]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE4]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE3]], [[TMP18]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-SVE-MAXBW: middle.block: -; CHECK-SVE-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE4]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE4]]) ; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; @@ -858,7 +866,7 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NEON: vector.body: ; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP0]] ; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP0]] @@ -873,16 +881,18 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub <16 x i32> [[VEC_PHI]], [[TMP10]] +; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP10]] +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) ; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]] -; CHECK-NEON-NEXT: [[TMP13:%.*]] = add <16 x i32> [[TMP11]], [[TMP12]] +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]]) ; CHECK-NEON-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP9]] -; CHECK-NEON-NEXT: [[TMP15]] = sub <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEON-NEXT: [[TMP15:%.*]] = sub <16 x i32> zeroinitializer, [[TMP14]] +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP15]]) ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-NEON: middle.block: -; CHECK-NEON-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP15]]) +; CHECK-NEON-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE4]]) ; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; @@ -954,7 +964,7 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE-MAXBW: vector.body: ; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 ; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[TMP6]] ; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP6]] @@ -969,16 +979,18 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD2]] to ; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP14]] -; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub [[VEC_PHI]], [[TMP16]] +; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw zeroinitializer, [[TMP16]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP17]]) ; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw [[TMP13]], [[TMP15]] -; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = add [[TMP17]], [[TMP18]] -; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = mul nsw [[TMP14]], [[TMP15]] -; CHECK-SVE-MAXBW-NEXT: [[TMP21]] = sub [[TMP19]], [[TMP20]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE]], [[TMP18]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = mul nsw [[TMP14]], [[TMP15]] +; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE4]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[PARTIAL_REDUCE3]], [[TMP20]]) ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-SVE-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-SVE-MAXBW: middle.block: -; CHECK-SVE-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP21]]) +; CHECK-SVE-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE4]]) ; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll new file mode 100644 index 0000000000000..37da523ed7337 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll @@ -0,0 +1,163 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1 +; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -vectorizer-maximize-bandwidth -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define i32 @dotp(ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define i32 @dotp( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD1]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul [[TMP12]], [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = sub zeroinitializer, [[TMP13]] +; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add [[VEC_PHI]], [[TMP14]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP15]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @dotp( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP7]], i64 [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD2]] to +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP14]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP14]], i64 [[TMP17]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP15]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP18]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext [[WIDE_LOAD3]] to +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext [[WIDE_LOAD4]] to +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul [[TMP19]], [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul [[TMP20]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = sub zeroinitializer, [[TMP21]] +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sub zeroinitializer, [[TMP22]] +; CHECK-INTERLEAVED-NEXT: [[TMP25]] = add [[VEC_PHI]], [[TMP23]] +; CHECK-INTERLEAVED-NEXT: [[TMP26]] = add [[VEC_PHI1]], [[TMP24]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[TMP26]], [[TMP25]] +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +; CHECK-MAXBW-LABEL: define i32 @dotp( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[ENTRY]] ], [ [[PARTIAL_REDUCE:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[IV]], 0 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD1]] to +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = mul [[TMP12]], [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = sub zeroinitializer, [[TMP13]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI]], [[TMP14]]) +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %sub = sub i32 0, %mul + %add = add i32 %accum, %sub + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +!7 = distinct !{!7, !8, !9, !10} +!8 = !{!"llvm.loop.mustprogress"} +!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} +!10 = !{!"llvm.loop.vectorize.enable", i1 true} +attributes #0 = { vscale_range(1,16) "target-features"="+sve" }