@@ -1253,7 +1253,7 @@ class BoUpSLP {
     NonScheduledFirst.clear();
     EntryToLastInstruction.clear();
     ExternalUses.clear();
-    ExternalUsesAsGEPs.clear();
+    ExternalUsesAsOriginalScalar.clear();
     for (auto &Iter : BlocksSchedules) {
       BlockScheduling *BS = Iter.second.get();
       BS->clear();
@@ -3468,7 +3468,7 @@ class BoUpSLP {
 
   /// A list of GEPs which can be replaced by scalar GEPs instead of
   /// extractelement instructions.
-  SmallPtrSet<Value *, 4> ExternalUsesAsGEPs;
+  SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
 
   /// Values used only by @llvm.assume calls.
   SmallPtrSet<const Value *, 32> EphValues;
@@ -10663,6 +10663,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
   SmallDenseSet<Value *, 4> UsedInserts;
   DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
   std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
+  DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
   for (ExternalUser &EU : ExternalUses) {
     // We only add extract cost once for the same scalar.
     if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
@@ -10771,52 +10772,90 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
         }
       }
     }
-    // Leave the GEPs as is, they are free in most cases and better to keep them
-    // as GEPs.
+
     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-    if (auto *GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
+    // If we plan to rewrite the tree in a smaller type, we will need to sign
+    // extend the extracted value back to the original type. Here, we account
+    // for the extract and the added cost of the sign extend if needed.
+    InstructionCost ExtraCost = TTI::TCC_Free;
+    auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
+    const TreeEntry *Entry = getTreeEntry(EU.Scalar);
+    auto It = MinBWs.find(Entry);
+    if (It != MinBWs.end()) {
+      auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
+      unsigned Extend =
+          It->second.second ? Instruction::SExt : Instruction::ZExt;
+      VecTy = getWidenedType(MinTy, BundleWidth);
+      ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
+                                                VecTy, EU.Lane);
+    } else {
+      ExtraCost = TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
+                                          CostKind, EU.Lane);
+    }
+    // Leave the scalar instructions as is if they are cheaper than extracts.
+    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
+        Entry->getOpcode() == Instruction::Load) {
       if (!ValueToExtUses) {
         ValueToExtUses.emplace();
         for_each(enumerate(ExternalUses), [&](const auto &P) {
+          // Ignore phis in loops.
+          if (auto *Phi = dyn_cast_if_present<PHINode>(P.value().User)) {
+            auto *I = cast<Instruction>(P.value().Scalar);
+            const Loop *L = LI->getLoopFor(Phi->getParent());
+            if (L && (Phi->getParent() == I->getParent() ||
+                      L == LI->getLoopFor(I->getParent())))
+              return;
+          }
+
           ValueToExtUses->try_emplace(P.value().Scalar, P.index());
         });
       }
-      // Can use original GEP, if no operands vectorized or they are marked as
-      // externally used already.
-      bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) {
-        if (!getTreeEntry(V))
-          return true;
-        auto It = ValueToExtUses->find(V);
-        if (It != ValueToExtUses->end()) {
-          // Replace all uses to avoid compiler crash.
-          ExternalUses[It->second].User = nullptr;
+      // Can use original instruction, if no operands vectorized or they are
+      // marked as externally used already.
+      auto *Inst = cast<Instruction>(EU.Scalar);
+      bool CanBeUsedAsScalar = all_of(Inst->operands(), [&](Value *V) {
+        if (!getTreeEntry(V)) {
+          // Some extractelements might not be vectorized, but
+          // transformed into shuffle and removed from the function,
+          // consider it here.
+          if (auto *EE = dyn_cast<ExtractElementInst>(V))
+            return !EE->hasOneUse() || !MustGather.contains(EE);
           return true;
         }
-        return false;
+        return ValueToExtUses->contains(V);
       });
-      if (CanBeUsedAsGEP) {
-        ExtractCost += TTI->getInstructionCost(GEP, CostKind);
-        ExternalUsesAsGEPs.insert(EU.Scalar);
-        continue;
+      if (CanBeUsedAsScalar) {
+        InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
+        bool KeepScalar = ScalarCost <= ExtraCost;
+        if (KeepScalar && ScalarCost != TTI::TCC_Free &&
+            ExtraCost - ScalarCost <= TTI::TCC_Basic) {
+          unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
+            return ValueToExtUses->contains(V);
+          });
+          auto It = ExtractsCount.find(Entry);
+          if (It != ExtractsCount.end())
+            ScalarUsesCount -= It->getSecond().size();
+          // Keep original scalar if number of externally used instructions in
+          // the same entry is not a power of 2. It may help to do some extra
+          // vectorization for now.
+          KeepScalar = ScalarUsesCount <= 1 || !isPowerOf2_32(ScalarUsesCount);
+        }
+        if (KeepScalar) {
+          ExternalUsesAsOriginalScalar.insert(EU.Scalar);
+          for_each(Inst->operands(), [&](Value *V) {
+            auto It = ValueToExtUses->find(V);
+            if (It != ValueToExtUses->end()) {
+              // Replace all uses to avoid compiler crash.
+              ExternalUses[It->second].User = nullptr;
+            }
+          });
+          ExtraCost = ScalarCost;
+          ExtractsCount[Entry].insert(Inst);
+        }
       }
     }
 
-    // If we plan to rewrite the tree in a smaller type, we will need to sign
-    // extend the extracted value back to the original type. Here, we account
-    // for the extract and the added cost of the sign extend if needed.
-    auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
-    auto It = MinBWs.find(getTreeEntry(EU.Scalar));
-    if (It != MinBWs.end()) {
-      auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
-      unsigned Extend =
-          It->second.second ? Instruction::SExt : Instruction::ZExt;
-      VecTy = getWidenedType(MinTy, BundleWidth);
-      ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
-                                                   VecTy, EU.Lane);
-    } else {
-      ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
-                                             CostKind, EU.Lane);
-    }
+    ExtractCost += ExtraCost;
   }
   // Add reduced value cost, if resized.
   if (!VectorizedVals.empty()) {
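
Note: the hunk above now prices each external use twice: once as an extractelement (ExtraCost, including any sign/zero extend for minimized bitwidths) and once as a rematerialized scalar (ScalarCost), keeping whichever is cheaper. Below is a minimal, self-contained sketch of that decision; shouldKeepOriginalScalar and the plain integer costs are hypothetical stand-ins for the patch's InstructionCost machinery, not part of the diff.

#include <cstdint>

// Sketch of the keep-original-scalar heuristic from the hunk above.
// ScalarCost:      cost of recomputing the scalar instruction.
// ExtraCost:       cost of extracting the lane from the vector.
// ScalarUsesCount: externally used scalars left in the same tree entry.
static bool shouldKeepOriginalScalar(int64_t ScalarCost, int64_t ExtraCost,
                                     unsigned ScalarUsesCount) {
  // Recomputing the scalar is no more expensive than extracting it.
  bool KeepScalar = ScalarCost <= ExtraCost;
  // Borderline case: the extract costs at most TCC_Basic (1) more. Prefer
  // the extract when a power-of-two number of the entry's scalars is used
  // externally, since that shape may still enable extra vectorization.
  if (KeepScalar && ScalarCost != 0 /*TCC_Free*/ &&
      ExtraCost - ScalarCost <= 1 /*TCC_Basic*/) {
    bool IsPowerOf2 = ScalarUsesCount != 0 &&
                      (ScalarUsesCount & (ScalarUsesCount - 1)) == 0;
    KeepScalar = ScalarUsesCount <= 1 || !IsPowerOf2;
  }
  return KeepScalar;
}

For example, with ScalarCost = 1 and ExtraCost = 2, an entry with 3 externally used scalars keeps the original instruction, while one with 4 takes the extract.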
@@ -14067,8 +14106,7 @@ Value *BoUpSLP::vectorizeTree(
   DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
   // Maps extract Scalar to the corresponding extractelement instruction in the
   // basic block. Only one extractelement per block should be emitted.
-  DenseMap<Value *,
-           DenseMap<BasicBlock *, std::pair<Instruction *, Instruction *>>>
+  DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
       ScalarToEEs;
   SmallDenseSet<Value *, 4> UsedInserts;
   DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
@@ -14098,30 +14136,41 @@ Value *BoUpSLP::vectorizeTree(
       if (Scalar->getType() != Vec->getType()) {
         Value *Ex = nullptr;
         Value *ExV = nullptr;
-        auto *GEP = dyn_cast<GetElementPtrInst>(Scalar);
-        bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP);
+        auto *Inst = dyn_cast<Instruction>(Scalar);
+        bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
         auto It = ScalarToEEs.find(Scalar);
         if (It != ScalarToEEs.end()) {
           // No need to emit many extracts, just move the only one in the
           // current block.
-          auto EEIt = It->second.find(Builder.GetInsertBlock());
+          auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
+                                                  : Builder.GetInsertBlock());
           if (EEIt != It->second.end()) {
-            Instruction *I = EEIt->second.first;
-            if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
+            Value *PrevV = EEIt->second.first;
+            if (auto *I = dyn_cast<Instruction>(PrevV);
+                I && !ReplaceInst &&
+                Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
                 Builder.GetInsertPoint()->comesBefore(I)) {
               I->moveBefore(*Builder.GetInsertPoint()->getParent(),
                             Builder.GetInsertPoint());
-              if (auto *CI = EEIt->second.second)
+              if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
                 CI->moveAfter(I);
             }
-            Ex = I;
+            Ex = PrevV;
             ExV = EEIt->second.second ? EEIt->second.second : Ex;
           }
         }
         if (!Ex) {
           // "Reuse" the existing extract to improve final codegen.
-          if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
-              ES && isa<Instruction>(Vec)) {
+          if (ReplaceInst) {
+            // Leave the instruction as is, if it is cheaper than extracts
+            // and all operands are scalar.
+            auto *CloneInst = Inst->clone();
+            CloneInst->insertBefore(Inst);
+            if (Inst->hasName())
+              CloneInst->takeName(Inst);
+            Ex = CloneInst;
+          } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
+                     ES && isa<Instruction>(Vec)) {
             Value *V = ES->getVectorOperand();
             auto *IVec = cast<Instruction>(Vec);
             if (const TreeEntry *ETE = getTreeEntry(V))
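
Note: when such a scalar is later materialized in vectorizeTree (the ReplaceInst branch above), no extractelement is emitted; the original instruction is cloned next to itself and the clone serves the external uses. A minimal sketch of that sequence, assuming an Instruction previously recorded in ExternalUsesAsOriginalScalar; the helper name is hypothetical.

#include "llvm/IR/Instruction.h"

// Sketch: re-materialize a scalar that is cheaper to keep than to extract.
// Mirrors the clone/insert/takeName steps of the ReplaceInst branch above.
static llvm::Instruction *keepAsScalar(llvm::Instruction *Inst) {
  llvm::Instruction *CloneInst = Inst->clone(); // same opcode and operands
  CloneInst->insertBefore(Inst);                // keep it next to the original
  if (Inst->hasName())
    CloneInst->takeName(Inst);                  // preserve the value name
  return CloneInst; // used in place of an extractelement by external users
}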
@@ -14132,18 +14181,6 @@ Value *BoUpSLP::vectorizeTree(
               Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
             else
               Ex = Builder.CreateExtractElement(Vec, Lane);
-          } else if (ReplaceGEP) {
-            // Leave the GEPs as is, they are free in most cases and better to
-            // keep them as GEPs.
-            auto *CloneGEP = GEP->clone();
-            if (isa<Instruction>(Vec))
-              CloneGEP->insertBefore(*Builder.GetInsertBlock(),
-                                     Builder.GetInsertPoint());
-            else
-              CloneGEP->insertBefore(GEP);
-            if (GEP->hasName())
-              CloneGEP->takeName(GEP);
-            Ex = CloneGEP;
           } else if (auto *VecTy =
                          dyn_cast<FixedVectorType>(Scalar->getType())) {
             assert(SLPReVec && "FixedVectorType is not expected.");
@@ -14164,14 +14201,15 @@ Value *BoUpSLP::vectorizeTree(
           if (Scalar->getType() != Ex->getType())
             ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
                                         MinBWs.find(E)->second.second);
-          if (auto *I = dyn_cast<Instruction>(Ex))
-            ScalarToEEs[Scalar].try_emplace(
-                Builder.GetInsertBlock(),
-                std::make_pair(I, cast<Instruction>(ExV)));
+          auto *I = dyn_cast<Instruction>(Ex);
+          ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
+                                            : &F->getEntryBlock(),
+                                          std::make_pair(Ex, ExV));
         }
         // The then branch of the previous if may produce constants, since 0
         // operand might be a constant.
-        if (auto *ExI = dyn_cast<Instruction>(Ex)) {
+        if (auto *ExI = dyn_cast<Instruction>(Ex);
+            ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
           GatherShuffleExtractSeq.insert(ExI);
           CSEBlocks.insert(ExI->getParent());
         }
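
Note: with the cache's value type widened to Value * (earlier hunk), the caching key also changes here: an extract that is an actual instruction is cached under its parent block, while one that folded to a constant (or a kept scalar) falls back to the function's entry block. A condensed, hypothetical illustration of that keying rule, not part of the patch:

#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"

// Sketch: choose the ScalarToEEs cache key for a (possibly folded) extract.
static llvm::BasicBlock *cacheKeyFor(llvm::Value *Ex, llvm::Function &F) {
  if (auto *I = llvm::dyn_cast<llvm::Instruction>(Ex))
    return I->getParent();   // real instruction: key by its defining block
  return &F.getEntryBlock(); // constant-folded: one function-wide slot
}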
@@ -14192,9 +14230,10 @@ Value *BoUpSLP::vectorizeTree(
       continue;
     assert((ExternallyUsedValues.count(Scalar) ||
             Scalar->hasNUsesOrMore(UsesLimit) ||
+            ExternalUsesAsOriginalScalar.contains(Scalar) ||
             any_of(Scalar->users(),
                    [&](llvm::User *U) {
-                     if (ExternalUsesAsGEPs.contains(U))
+                     if (ExternalUsesAsOriginalScalar.contains(U))
                        return true;
                      TreeEntry *UseEntry = getTreeEntry(U);
                      return UseEntry &&