@@ -1371,18 +1371,6 @@ class BoUpSLP {
     return MinBWs.at(VectorizableTree.front().get()).second;
   }
 
-  /// Returns reduction bitwidth and signedness, if it does not match the
-  /// original requested size.
-  std::optional<std::pair<unsigned, bool>> getReductionBitWidthAndSign() const {
-    if (ReductionBitWidth == 0 ||
-        ReductionBitWidth ==
-            DL->getTypeSizeInBits(
-                VectorizableTree.front()->Scalars.front()->getType()))
-      return std::nullopt;
-    return std::make_pair(ReductionBitWidth,
-                          MinBWs.at(VectorizableTree.front().get()).second);
-  }
-
   /// Builds external uses of the vectorized scalars, i.e. the list of
   /// vectorized scalars to be extracted, their lanes and their scalar users. \p
   /// ExternallyUsedValues contains additional list of external uses to handle
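
Note on the hunk above: getReductionBitWidthAndSign() reported the demoted
reduction bit width and its signedness only when a demotion had actually been
recorded; callers treated std::nullopt as "keep the original scalar type". A
minimal standalone model of that contract (plain C++, not the LLVM API; all
names here are illustrative):

#include <optional>
#include <utility>

// Yield (bitwidth, is-signed) only when the recorded reduction width is
// meaningful and differs from the scalar type's original width.
std::optional<std::pair<unsigned, bool>>
reductionBitWidthAndSign(unsigned RecordedWidth, unsigned OrigTypeWidth,
                         bool IsSigned) {
  if (RecordedWidth == 0 || RecordedWidth == OrigTypeWidth)
    return std::nullopt; // no demotion took place
  return std::make_pair(RecordedWidth, IsSigned);
}
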
@@ -17899,37 +17887,24 @@ void BoUpSLP::computeMinimumValueSizes() {
   // Add reduction ops sizes, if any.
   if (UserIgnoreList &&
       isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
-    // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
-    // x i1> to in)).
-    if (all_of(*UserIgnoreList,
-               [](Value *V) {
-                 return cast<Instruction>(V)->getOpcode() == Instruction::Add;
-               }) &&
-        VectorizableTree.front()->State == TreeEntry::Vectorize &&
-        VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
-        cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
-            Builder.getInt1Ty()) {
-      ReductionBitWidth = 1;
-    } else {
-      for (Value *V : *UserIgnoreList) {
-        unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
-        TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
-        unsigned BitWidth1 = NumTypeBits - NumSignBits;
-        if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
-          ++BitWidth1;
-        unsigned BitWidth2 = BitWidth1;
-        if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
-          APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
-          BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
-        }
-        ReductionBitWidth =
-            std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
+    for (Value *V : *UserIgnoreList) {
+      auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
+      auto NumTypeBits = DL->getTypeSizeInBits(V->getType());
+      unsigned BitWidth1 = NumTypeBits - NumSignBits;
+      if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
+        ++BitWidth1;
+      unsigned BitWidth2 = BitWidth1;
+      if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
+        auto Mask = DB->getDemandedBits(cast<Instruction>(V));
+        BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
       }
-      if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
-        ReductionBitWidth = 8;
-
-      ReductionBitWidth = bit_ceil(ReductionBitWidth);
+      ReductionBitWidth =
+          std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
     }
+    if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
+      ReductionBitWidth = 8;
+
+    ReductionBitWidth = bit_ceil(ReductionBitWidth);
   }
   bool IsTopRoot = NodeIdx == 0;
   while (NodeIdx < VectorizableTree.size() &&
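
The loop restored above derives two upper bounds per reduced value: BitWidth1
from sign-bit analysis (plus one spare bit when the value may be negative) and
BitWidth2 from demanded-bits analysis, the latter skipped for integer min/max
recurrences, which need the full compare width. The running maximum over all
reduction values is then clamped to at least i8 and rounded up to a power of
two. A self-contained C++20 sketch of just that arithmetic, with the analysis
results replaced by plain parameters:

#include <algorithm>
#include <bit>

// Per-value bound, mirroring one iteration of the restored loop.
unsigned valueBitWidth(unsigned NumTypeBits, unsigned NumSignBits,
                       bool KnownNonNegative, unsigned DemandedTopBit,
                       bool IsMinMaxRecurrence) {
  unsigned BitWidth1 = NumTypeBits - NumSignBits;
  if (!KnownNonNegative)
    ++BitWidth1; // keep room for the sign bit
  unsigned BitWidth2 = IsMinMaxRecurrence ? BitWidth1 : DemandedTopBit;
  return std::min(BitWidth1, BitWidth2);
}

// Post-loop clamping. E.g. i32 values with 28 known sign bits that may be
// negative give 32 - 28 + 1 = 5 bits, which the clamp widens to a legal i8.
unsigned finalReductionWidth(unsigned MaxValueWidth) {
  if (MaxValueWidth < 8 && MaxValueWidth > 1)
    MaxValueWidth = 8;
  return std::bit_ceil(MaxValueWidth);
}
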
@@ -19785,8 +19760,8 @@ class HorizontalReduction {
 
       // Estimate cost.
       InstructionCost TreeCost = V.getTreeCost(VL);
-      InstructionCost ReductionCost = getReductionCost(
-          TTI, VL, IsCmpSelMinMax, RdxFMF, V.getReductionBitWidthAndSign());
+      InstructionCost ReductionCost =
+          getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
       InstructionCost Cost = TreeCost + ReductionCost;
       LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                         << " for reduction\n");
@@ -19891,12 +19866,10 @@ class HorizontalReduction {
                 createStrideMask(I, ScalarTyNumElements, VL.size());
             Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
             ReducedSubTree = Builder.CreateInsertElement(
-                ReducedSubTree,
-                emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
+                ReducedSubTree, emitReduction(Lane, Builder, TTI), I);
           }
         } else {
-          ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
-                                         RdxRootInst->getType());
+          ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI);
         }
         if (ReducedSubTree->getType() != VL.front()->getType()) {
           assert(ReducedSubTree->getType() != VL.front()->getType() &&
@@ -20077,13 +20050,12 @@ class HorizontalReduction {
 
 private:
   /// Calculate the cost of a reduction.
-  InstructionCost getReductionCost(
-      TargetTransformInfo *TTI, ArrayRef<Value *> ReducedVals,
-      bool IsCmpSelMinMax, FastMathFlags FMF,
-      const std::optional<std::pair<unsigned, bool>> BitwidthAndSign) {
+  InstructionCost getReductionCost(TargetTransformInfo *TTI,
+                                   ArrayRef<Value *> ReducedVals,
+                                   bool IsCmpSelMinMax, unsigned ReduxWidth,
+                                   FastMathFlags FMF) {
     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     Type *ScalarTy = ReducedVals.front()->getType();
-    unsigned ReduxWidth = ReducedVals.size();
     FixedVectorType *VectorTy = getWidenedType(ScalarTy, ReduxWidth);
     InstructionCost VectorCost = 0, ScalarCost;
     // If all of the reduced values are constant, the vector cost is 0, since
@@ -20142,22 +20114,8 @@ class HorizontalReduction {
             VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
             /*Extract*/ false, TTI::TCK_RecipThroughput);
       } else {
-        auto [Bitwidth, IsSigned] =
-            BitwidthAndSign.value_or(std::make_pair(0u, false));
-        if (RdxKind == RecurKind::Add && Bitwidth == 1) {
-          // Represent vector_reduce_add(ZExt(<n x i1>)) to
-          // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
-          auto *IntTy = IntegerType::get(ScalarTy->getContext(), ReduxWidth);
-          IntrinsicCostAttributes ICA(Intrinsic::ctpop, IntTy, {IntTy}, FMF);
-          VectorCost =
-              TTI->getCastInstrCost(Instruction::BitCast, IntTy,
-                                    getWidenedType(ScalarTy, ReduxWidth),
-                                    TTI::CastContextHint::None, CostKind) +
-              TTI->getIntrinsicInstrCost(ICA, CostKind);
-        } else {
-          VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
-                                                       FMF, CostKind);
-        }
+        VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF,
+                                                     CostKind);
       }
     }
     ScalarCost = EvaluateScalarCost([&]() {
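
The branch deleted above priced vector_reduce_add(ZExt(<n x i1>)) as one
bitcast of the mask to an n-bit integer plus one ctpop call, rather than as a
generic arithmetic reduction. The identity that makes that lowering sound is
easy to check in isolation (standalone C++ sketch; the <n x i1> mask is
modeled as the low NumLanes bits of a uint32_t):

#include <bit>
#include <cassert>
#include <cstdint>

// Summing the zero-extended lanes of an i1 mask equals the popcount of the
// mask packed into an integer.
unsigned reduceAddOfMask(uint32_t Mask, unsigned NumLanes) {
  assert(NumLanes >= 1 && NumLanes <= 32 && "model uses a 32-bit carrier");
  unsigned Sum = 0;
  for (unsigned I = 0; I < NumLanes; ++I)
    Sum += (Mask >> I) & 1u; // zext i1 -> i32, then reduce with add
  uint32_t Low = NumLanes == 32 ? Mask : (Mask & ((1u << NumLanes) - 1));
  assert(Sum == static_cast<unsigned>(std::popcount(Low)));
  return Sum;
}
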
@@ -20194,22 +20152,11 @@ class HorizontalReduction {
 
   /// Emit a horizontal reduction of the vectorized value.
   Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
-                       const TargetTransformInfo *TTI, Type *DestTy) {
+                       const TargetTransformInfo *TTI) {
     assert(VectorizedValue && "Need to have a vectorized tree node");
     assert(RdxKind != RecurKind::FMulAdd &&
            "A call to the llvm.fmuladd intrinsic is not handled yet");
 
-    auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
-    if (FTy->getScalarType() == Builder.getInt1Ty() &&
-        RdxKind == RecurKind::Add &&
-        DestTy->getScalarType() != FTy->getScalarType()) {
-      // Convert vector_reduce_add(ZExt(<n x i1>)) to
-      // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
-      Value *V = Builder.CreateBitCast(
-          VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
-      ++NumVectorInstructions;
-      return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
-    }
     ++NumVectorInstructions;
     return createSimpleReduction(Builder, VectorizedValue, RdxKind);
   }
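
The same rewrite had an emission-side twin, deleted from emitReduction in the
hunk above. Pulled out of the class, the removed fast path amounts to the
sketch below (the IRBuilder calls are the ones visible in the removed lines;
the free-function framing and its name are made up here, and after the revert
the generic createSimpleReduction path is always taken instead):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;

// Pack an <n x i1> mask into an i<n> and count its set bits. The i<n> result
// still has to be zero-extended or truncated to the reduction's destination
// type by the caller (the "ZExtOrTrunc" half of the rewrite).
static Value *emitMaskAddReduction(Value *Mask, IRBuilderBase &Builder) {
  auto *FTy = cast<FixedVectorType>(Mask->getType());
  Value *Bits =
      Builder.CreateBitCast(Mask, Builder.getIntNTy(FTy->getNumElements()));
  return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, Bits);
}
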