@@ -2911,7 +2911,8 @@ class BoUpSLP {
     }
     if (Last->State != TreeEntry::NeedToGather) {
       for (Value *V : VL) {
-        assert(!getTreeEntry(V) && "Scalar already in tree!");
+        [[maybe_unused]] const TreeEntry *TE = getTreeEntry(V);
+        assert((!TE || TE == Last) && "Scalar already in tree!");
         ScalarToTreeEntry[V] = Last;
       }
       // Update the scheduler bundle to point to this TreeEntry.
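
The relaxed assertion tolerates a scalar that is already mapped, as long as it maps to this same entry; binding the lookup to a [[maybe_unused]] local keeps release builds (where assert() compiles away) free of unused-variable warnings. A minimal standalone sketch of the idiom, with std::map and int standing in for ScalarToTreeEntry and TreeEntry:

#include <cassert>
#include <map>
#include <string>

// Re-inserting a key is fine as long as it resolves to the same entry;
// mapping it to a different entry is still a bug. Names here are
// illustrative stand-ins, not LLVM's.
void mapScalar(std::map<std::string, int> &ScalarToEntry,
               const std::string &Scalar, int EntryId) {
  [[maybe_unused]] auto It = ScalarToEntry.find(Scalar);
  assert((It == ScalarToEntry.end() || It->second == EntryId) &&
         "Scalar already in tree!");
  ScalarToEntry[Scalar] = EntryId; // idempotent for duplicate scalars
}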
@@ -2924,7 +2925,8 @@ class BoUpSLP {
       for (Value *V : VL) {
         if (doesNotNeedToBeScheduled(V))
           continue;
-        assert(BundleMember && "Unexpected end of bundle.");
+        if (!BundleMember)
+          continue;
         BundleMember->TE = Last;
         BundleMember = BundleMember->NextInBundle;
       }
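
With duplicates filtered out before scheduling (see the buildTree_rec hunks below), the bundle list can legitimately be shorter than VL, so the walk above skips missing members instead of asserting. A simplified sketch of the defensive traversal, with a hand-rolled node type in place of the real ScheduleData:

// Assign the owning entry to however many bundle members exist; stop
// silently when the (possibly shorter) bundle runs out.
struct BundleNode {
  int OwningEntry = -1;
  BundleNode *NextInBundle = nullptr;
};

void retagBundle(BundleNode *Member, int EntryId, unsigned NumScalars) {
  for (unsigned I = 0; I < NumScalars && Member;
       ++I, Member = Member->NextInBundle)
    Member->OwningEntry = EntryId;
}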
@@ -5583,9 +5585,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
 
   SmallVector<int> ReuseShuffleIndicies;
   SmallVector<Value *> UniqueValues;
-  auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues,
-                                &UserTreeIdx,
-                                this](const InstructionsState &S) {
+  SmallVector<Value *> NonUniqueValueVL;
+  auto TryToFindDuplicates = [&](const InstructionsState &S,
+                                 bool DoNotFail = false) {
     // Check that every instruction appears once in this bundle.
     DenseMap<Value *, unsigned> UniquePositions(VL.size());
     for (Value *V : VL) {
@@ -5612,6 +5614,26 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                             !isConstant(V);
                    })) ||
           !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
+        SmallVector<Value *> IgnoredVals;
+        if (UserIgnoreList)
+          IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
+        if (DoNotFail && UniquePositions.size() > 1 &&
+            NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
+            all_of(UniqueValues, [=](Value *V) {
+              return isa<ExtractElementInst>(V) ||
+                     areAllUsersVectorized(cast<Instruction>(V), IgnoredVals);
+            })) {
+          unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
+          if (PWSz == VL.size()) {
+            ReuseShuffleIndicies.clear();
+          } else {
+            NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
+            NonUniqueValueVL.append(PWSz - UniqueValues.size(),
+                                    UniqueValues.back());
+            VL = NonUniqueValueVL;
+          }
+          return true;
+        }
         LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
         newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
         return false;
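
The new DoNotFail path recovers a bundle with repeated scalars when every unique scalar is otherwise fully vectorized: it pads the unique values with copies of the last one up to the next power of two, so the bundle keeps a vectorizable width. A standalone sketch of just the padding arithmetic; powerOf2Ceil is a portable stand-in for llvm::PowerOf2Ceil:

#include <cstddef>
#include <vector>

static size_t powerOf2Ceil(size_t N) {
  size_t P = 1;
  while (P < N)
    P <<= 1;
  return P;
}

// Repeat the last unique element until the count is a power of two,
// mirroring the NonUniqueValueVL.assign/append pair above.
std::vector<int> padToPowerOf2(const std::vector<int> &Unique) {
  std::vector<int> Padded(Unique);
  if (!Padded.empty()) {
    int Last = Padded.back(); // copy first: resize may reallocate
    Padded.resize(powerOf2Ceil(Padded.size()), Last);
  }
  return Padded;
}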
@@ -5857,7 +5879,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   }
 
   // Check that every instruction appears once in this bundle.
-  if (!TryToFindDuplicates(S))
+  if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
     return;
 
   // Perform specific checks for each particular instruction kind.
@@ -5877,7 +5899,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
 
   BlockScheduling &BS = *BSRef;
 
-  std::optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
+  std::optional<ScheduleData *> Bundle =
+      BS.tryScheduleBundle(UniqueValues, this, S);
 #ifdef EXPENSIVE_CHECKS
   // Make sure we didn't break any internal invariants
   BS.verify();
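
Scheduling now runs on UniqueValues rather than the (possibly padded) VL: the scheduler keeps a single node per instruction, and feeding it repeats would try to link the same node into the bundle twice. A simplified sketch of why the bundle build must see each instruction once, using hand-rolled types rather than the real ScheduleData API:

#include <unordered_set>
#include <vector>

struct Node {
  int Id;
  Node *NextInBundle = nullptr;
};

// Chain each node into the bundle at most once; a repeat would make the
// singly linked list point a node at itself and corrupt the walk.
Node *buildBundle(std::vector<Node> &Nodes) {
  Node *Head = nullptr, *Tail = nullptr;
  std::unordered_set<int> Seen;
  for (Node &N : Nodes) {
    if (!Seen.insert(N.Id).second)
      continue;
    (Tail ? Tail->NextInBundle : Head) = &N;
    Tail = &N;
  }
  return Head;
}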
@@ -7537,7 +7560,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   Instruction *VL0 = E->getMainOp();
   unsigned ShuffleOrOp =
       E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
-  const unsigned Sz = VL.size();
+  SetVector<Value *> UniqueValues(VL.begin(), VL.end());
+  const unsigned Sz = UniqueValues.size();
   auto GetCostDiff =
       [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
           function_ref<InstructionCost(InstructionCost)> VectorCost) {
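
From here on, the cost model deduplicates VL up front: SetVector keeps the order of first occurrence while dropping repeats, so each distinct scalar is costed exactly once and Sz reflects the unique count. A plain-C++ analogue of that filtering (works for hashable keys such as pointers):

#include <unordered_set>
#include <vector>

// Preserve first-occurrence order while filtering repeats, the same
// guarantee llvm::SetVector provides above.
template <typename T>
std::vector<T> uniqueInOrder(const std::vector<T> &VL) {
  std::unordered_set<T> Seen;
  std::vector<T> Unique;
  for (const T &V : VL)
    if (Seen.insert(V).second)
      Unique.push_back(V);
  return Unique;
}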
@@ -7644,7 +7668,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     // Count reused scalars.
     InstructionCost ScalarCost = 0;
     SmallPtrSet<const TreeEntry *, 4> CountedOps;
-    for (Value *V : VL) {
+    for (Value *V : UniqueValues) {
       auto *PHI = dyn_cast<PHINode>(V);
       if (!PHI)
         continue;
@@ -7665,8 +7689,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   }
   case Instruction::ExtractValue:
   case Instruction::ExtractElement: {
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *I = cast<Instruction>(VL[Idx]);
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *I = cast<Instruction>(UniqueValues[Idx]);
       VectorType *SrcVecTy;
       if (ShuffleOrOp == Instruction::ExtractElement) {
         auto *EE = cast<ExtractElementInst>(I);
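
Note the capture change from [=] to [&] that recurs in the hunks below: a by-value capture would copy the whole SetVector into every cost lambda, while a by-reference capture indexes the single local instance, which is safe because the lambdas never outlive getEntryCost. The idea in miniature:

#include <cstdio>
#include <vector>

void captureExample() {
  std::vector<int> UniqueValues{10, 20, 30};
  // [&] indexes the one local container; [=] would have copied it into
  // the closure object each time the lambda is constructed.
  auto GetScalar = [&](unsigned Idx) { return UniqueValues[Idx]; };
  std::printf("%d\n", GetScalar(2)); // prints 30
}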
@@ -7844,9 +7868,10 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
       }
     }
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *VI =
-          VL0->getOpcode() == Opcode ? cast<Instruction>(VL[Idx]) : nullptr;
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *VI = VL0->getOpcode() == Opcode
+                     ? cast<Instruction>(UniqueValues[Idx])
+                     : nullptr;
       return TTI->getCastInstrCost(Opcode, ScalarTy, SrcScalarTy,
                                    TTI::getCastContextHint(VI), CostKind, VI);
     };
@@ -7891,7 +7916,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                                        ? CmpInst::BAD_FCMP_PREDICATE
                                        : CmpInst::BAD_ICMP_PREDICATE;
     auto GetScalarCost = [&](unsigned Idx) {
-      auto *VI = cast<Instruction>(VL[Idx]);
+      auto *VI = cast<Instruction>(UniqueValues[Idx]);
       CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
                                            ? CmpInst::BAD_FCMP_PREDICATE
                                            : CmpInst::BAD_ICMP_PREDICATE;
@@ -7951,8 +7976,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   case Instruction::And:
   case Instruction::Or:
   case Instruction::Xor: {
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *VI = cast<Instruction>(VL[Idx]);
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *VI = cast<Instruction>(UniqueValues[Idx]);
       unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
       TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
       TTI::OperandValueInfo Op2Info =
@@ -7975,14 +8000,14 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     return CommonCost + GetGEPCostDiff(VL, VL0);
   }
   case Instruction::Load: {
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *VI = cast<LoadInst>(VL[Idx]);
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *VI = cast<LoadInst>(UniqueValues[Idx]);
       return TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(),
                                   VI->getPointerAddressSpace(), CostKind,
                                   TTI::OperandValueInfo(), VI);
     };
     auto *LI0 = cast<LoadInst>(VL0);
-    auto GetVectorCost = [=](InstructionCost CommonCost) {
+    auto GetVectorCost = [&](InstructionCost CommonCost) {
       InstructionCost VecLdCost;
       if (E->State == TreeEntry::Vectorize) {
         VecLdCost = TTI->getMemoryOpCost(
@@ -7993,7 +8018,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                 E->State == TreeEntry::PossibleStridedVectorize) &&
                "Unknown EntryState");
         Align CommonAlignment = LI0->getAlign();
-        for (Value *V : VL)
+        for (Value *V : UniqueValues)
           CommonAlignment =
               std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
         VecLdCost = TTI->getGatherScatterOpCost(
@@ -8045,8 +8070,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
            GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
   }
   case Instruction::Call: {
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *CI = cast<CallInst>(VL[Idx]);
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *CI = cast<CallInst>(UniqueValues[Idx]);
       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
       if (ID != Intrinsic::not_intrinsic) {
         IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
@@ -8087,8 +8112,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       }
       return false;
     };
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *VI = cast<Instruction>(VL[Idx]);
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *VI = cast<Instruction>(UniqueValues[Idx]);
       assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
       (void)E;
       return TTI->getInstructionCost(VI, CostKind);
@@ -8607,6 +8632,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
   SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
   SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
   SmallVector<APInt> DemandedElts;
+  SmallDenseSet<Value *, 4> UsedInserts;
   for (ExternalUser &EU : ExternalUses) {
     // We only add extract cost once for the same scalar.
     if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
@@ -8627,6 +8653,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
     // to detect it as a final shuffled/identity match.
     if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
       if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
+        if (!UsedInserts.insert(VU).second)
+          continue;
         std::optional<unsigned> InsertIdx = getInsertIndex(VU);
         if (InsertIdx) {
           const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
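
UsedInserts guards the per-user work: several external uses can name the same insertelement instruction, and insert(...).second is true only the first time, so the shuffle/identity bookkeeping runs once per user. The same guard reappears in the vectorizeTree hunks below, keeping codegen to one extract/insert sequence per user. The pattern in miniature, with ints standing in for InsertElementInst pointers:

#include <cstdio>
#include <unordered_set>
#include <vector>

// Process each user id once even when many external scalars share it.
void costOncePerUser(const std::vector<int> &UserIds) {
  std::unordered_set<int> UsedInserts;
  for (int Id : UserIds) {
    if (!UsedInserts.insert(Id).second)
      continue; // this insertelement was already accounted for
    std::printf("costing user %d\n", Id);
  }
}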
@@ -11008,6 +11036,7 @@ Value *BoUpSLP::vectorizeTree(
   // Maps extract Scalar to the corresponding extractelement instruction in the
   // basic block. Only one extractelement per block should be emitted.
   DenseMap<Value *, DenseMap<BasicBlock *, Instruction *>> ScalarToEEs;
+  SmallDenseSet<Value *, 4> UsedInserts;
   // Extract all of the elements with the external uses.
   for (const auto &ExternalUse : ExternalUses) {
     Value *Scalar = ExternalUse.Scalar;
@@ -11106,6 +11135,8 @@ Value *BoUpSLP::vectorizeTree(
       // Skip if the scalar is another vector op or Vec is not an instruction.
       if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
         if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
+          if (!UsedInserts.insert(VU).second)
+            continue;
           std::optional<unsigned> InsertIdx = getInsertIndex(VU);
           if (InsertIdx) {
             // Need to use original vector, if the root is truncated.