Skip to content

Commit b10ecfa

Browse files
[SLP]Represent externally used values as original scalars, if profitable.
Currently SLP vectorizer tries to keep only GEPs as scalar, if they are vectorized but used externally. Same approach can be used for all scalar values. This patch tries to keep original scalars if all its operands remain scalar or externally used, the cost of the original scalar is lower than the cost of the extractelement instruction, or if the number of externally used scalars in the same entry is power of 2. Last criterion allows better revectorization for multiply used scalars. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: #100904
1 parent 34514ce commit b10ecfa

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+665
-542
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 103 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1253,7 +1253,7 @@ class BoUpSLP {
12531253
NonScheduledFirst.clear();
12541254
EntryToLastInstruction.clear();
12551255
ExternalUses.clear();
1256-
ExternalUsesAsGEPs.clear();
1256+
ExternalUsesAsOriginalScalar.clear();
12571257
for (auto &Iter : BlocksSchedules) {
12581258
BlockScheduling *BS = Iter.second.get();
12591259
BS->clear();
@@ -3468,7 +3468,7 @@ class BoUpSLP {
34683468

34693469
/// A list of GEPs which can be replaced by scalar GEPs instead of
34703470
/// extractelement instructions.
3471-
SmallPtrSet<Value *, 4> ExternalUsesAsGEPs;
3471+
SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
34723472

34733473
/// Values used only by @llvm.assume calls.
34743474
SmallPtrSet<const Value *, 32> EphValues;
@@ -10663,6 +10663,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1066310663
SmallDenseSet<Value *, 4> UsedInserts;
1066410664
DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
1066510665
std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
10666+
DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
1066610667
for (ExternalUser &EU : ExternalUses) {
1066710668
// We only add extract cost once for the same scalar.
1066810669
if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
@@ -10771,52 +10772,90 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1077110772
}
1077210773
}
1077310774
}
10774-
// Leave the GEPs as is, they are free in most cases and better to keep them
10775-
// as GEPs.
10775+
1077610776
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10777-
if (auto *GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
10777+
// If we plan to rewrite the tree in a smaller type, we will need to sign
10778+
// extend the extracted value back to the original type. Here, we account
10779+
// for the extract and the added cost of the sign extend if needed.
10780+
InstructionCost ExtraCost = TTI::TCC_Free;
10781+
auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
10782+
const TreeEntry *Entry = getTreeEntry(EU.Scalar);
10783+
auto It = MinBWs.find(Entry);
10784+
if (It != MinBWs.end()) {
10785+
auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
10786+
unsigned Extend =
10787+
It->second.second ? Instruction::SExt : Instruction::ZExt;
10788+
VecTy = getWidenedType(MinTy, BundleWidth);
10789+
ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
10790+
VecTy, EU.Lane);
10791+
} else {
10792+
ExtraCost = TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
10793+
CostKind, EU.Lane);
10794+
}
10795+
// Leave the scalar instructions as is if they are cheaper than extracts.
10796+
if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
10797+
Entry->getOpcode() == Instruction::Load) {
1077810798
if (!ValueToExtUses) {
1077910799
ValueToExtUses.emplace();
1078010800
for_each(enumerate(ExternalUses), [&](const auto &P) {
10801+
// Ignore phis in loops.
10802+
if (auto *Phi = dyn_cast_if_present<PHINode>(P.value().User)) {
10803+
auto *I = cast<Instruction>(P.value().Scalar);
10804+
const Loop *L = LI->getLoopFor(Phi->getParent());
10805+
if (L && (Phi->getParent() == I->getParent() ||
10806+
L == LI->getLoopFor(I->getParent())))
10807+
return;
10808+
}
10809+
1078110810
ValueToExtUses->try_emplace(P.value().Scalar, P.index());
1078210811
});
1078310812
}
10784-
// Can use original GEP, if no operands vectorized or they are marked as
10785-
// externally used already.
10786-
bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) {
10787-
if (!getTreeEntry(V))
10788-
return true;
10789-
auto It = ValueToExtUses->find(V);
10790-
if (It != ValueToExtUses->end()) {
10791-
// Replace all uses to avoid compiler crash.
10792-
ExternalUses[It->second].User = nullptr;
10813+
// Can use original instruction, if no operands vectorized or they are
10814+
// marked as externally used already.
10815+
auto *Inst = cast<Instruction>(EU.Scalar);
10816+
bool CanBeUsedAsScalar = all_of(Inst->operands(), [&](Value *V) {
10817+
if (!getTreeEntry(V)) {
10818+
// Some extractelements might be not vectorized, but
10819+
// transformed into shuffle and removed from the function,
10820+
// consider it here.
10821+
if (auto *EE = dyn_cast<ExtractElementInst>(V))
10822+
return !EE->hasOneUse() || !MustGather.contains(EE);
1079310823
return true;
1079410824
}
10795-
return false;
10825+
return ValueToExtUses->contains(V);
1079610826
});
10797-
if (CanBeUsedAsGEP) {
10798-
ExtractCost += TTI->getInstructionCost(GEP, CostKind);
10799-
ExternalUsesAsGEPs.insert(EU.Scalar);
10800-
continue;
10827+
if (CanBeUsedAsScalar) {
10828+
InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
10829+
bool KeepScalar = ScalarCost <= ExtraCost;
10830+
if (KeepScalar && ScalarCost != TTI::TCC_Free &&
10831+
ExtraCost - ScalarCost <= TTI::TCC_Basic) {
10832+
unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
10833+
return ValueToExtUses->contains(V);
10834+
});
10835+
auto It = ExtractsCount.find(Entry);
10836+
if (It != ExtractsCount.end())
10837+
ScalarUsesCount -= It->getSecond().size();
10838+
// Keep the original scalar if the number of externally used instructions in
10839+
// the same entry is not a power of 2. It may help to do some extra
10840+
// vectorization for now.
10841+
KeepScalar = ScalarUsesCount <= 1 || !isPowerOf2_32(ScalarUsesCount);
10842+
}
10843+
if (KeepScalar) {
10844+
ExternalUsesAsOriginalScalar.insert(EU.Scalar);
10845+
for_each(Inst->operands(), [&](Value *V) {
10846+
auto It = ValueToExtUses->find(V);
10847+
if (It != ValueToExtUses->end()) {
10848+
// Replace all uses to avoid compiler crash.
10849+
ExternalUses[It->second].User = nullptr;
10850+
}
10851+
});
10852+
ExtraCost = ScalarCost;
10853+
ExtractsCount[Entry].insert(Inst);
10854+
}
1080110855
}
1080210856
}
1080310857

10804-
// If we plan to rewrite the tree in a smaller type, we will need to sign
10805-
// extend the extracted value back to the original type. Here, we account
10806-
// for the extract and the added cost of the sign extend if needed.
10807-
auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
10808-
auto It = MinBWs.find(getTreeEntry(EU.Scalar));
10809-
if (It != MinBWs.end()) {
10810-
auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
10811-
unsigned Extend =
10812-
It->second.second ? Instruction::SExt : Instruction::ZExt;
10813-
VecTy = getWidenedType(MinTy, BundleWidth);
10814-
ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
10815-
VecTy, EU.Lane);
10816-
} else {
10817-
ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
10818-
CostKind, EU.Lane);
10819-
}
10858+
ExtractCost += ExtraCost;
1082010859
}
1082110860
// Add reduced value cost, if resized.
1082210861
if (!VectorizedVals.empty()) {
@@ -14067,8 +14106,7 @@ Value *BoUpSLP::vectorizeTree(
1406714106
DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
1406814107
// Maps extract Scalar to the corresponding extractelement instruction in the
1406914108
// basic block. Only one extractelement per block should be emitted.
14070-
DenseMap<Value *,
14071-
DenseMap<BasicBlock *, std::pair<Instruction *, Instruction *>>>
14109+
DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
1407214110
ScalarToEEs;
1407314111
SmallDenseSet<Value *, 4> UsedInserts;
1407414112
DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
@@ -14098,30 +14136,41 @@ Value *BoUpSLP::vectorizeTree(
1409814136
if (Scalar->getType() != Vec->getType()) {
1409914137
Value *Ex = nullptr;
1410014138
Value *ExV = nullptr;
14101-
auto *GEP = dyn_cast<GetElementPtrInst>(Scalar);
14102-
bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP);
14139+
auto *Inst = dyn_cast<Instruction>(Scalar);
14140+
bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
1410314141
auto It = ScalarToEEs.find(Scalar);
1410414142
if (It != ScalarToEEs.end()) {
1410514143
// No need to emit many extracts, just move the only one in the
1410614144
// current block.
14107-
auto EEIt = It->second.find(Builder.GetInsertBlock());
14145+
auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
14146+
: Builder.GetInsertBlock());
1410814147
if (EEIt != It->second.end()) {
14109-
Instruction *I = EEIt->second.first;
14110-
if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
14148+
Value *PrevV = EEIt->second.first;
14149+
if (auto *I = dyn_cast<Instruction>(PrevV);
14150+
I && !ReplaceInst &&
14151+
Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
1411114152
Builder.GetInsertPoint()->comesBefore(I)) {
1411214153
I->moveBefore(*Builder.GetInsertPoint()->getParent(),
1411314154
Builder.GetInsertPoint());
14114-
if (auto *CI = EEIt->second.second)
14155+
if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
1411514156
CI->moveAfter(I);
1411614157
}
14117-
Ex = I;
14158+
Ex = PrevV;
1411814159
ExV = EEIt->second.second ? EEIt->second.second : Ex;
1411914160
}
1412014161
}
1412114162
if (!Ex) {
1412214163
// "Reuse" the existing extract to improve final codegen.
14123-
if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
14124-
ES && isa<Instruction>(Vec)) {
14164+
if (ReplaceInst) {
14165+
// Leave the instruction as is, if it is cheaper than extracting and all
14166+
// operands are scalar.
14167+
auto *CloneInst = Inst->clone();
14168+
CloneInst->insertBefore(Inst);
14169+
if (Inst->hasName())
14170+
CloneInst->takeName(Inst);
14171+
Ex = CloneInst;
14172+
} else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
14173+
ES && isa<Instruction>(Vec)) {
1412514174
Value *V = ES->getVectorOperand();
1412614175
auto *IVec = cast<Instruction>(Vec);
1412714176
if (const TreeEntry *ETE = getTreeEntry(V))
@@ -14132,18 +14181,6 @@ Value *BoUpSLP::vectorizeTree(
1413214181
Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
1413314182
else
1413414183
Ex = Builder.CreateExtractElement(Vec, Lane);
14135-
} else if (ReplaceGEP) {
14136-
// Leave the GEPs as is, they are free in most cases and better to
14137-
// keep them as GEPs.
14138-
auto *CloneGEP = GEP->clone();
14139-
if (isa<Instruction>(Vec))
14140-
CloneGEP->insertBefore(*Builder.GetInsertBlock(),
14141-
Builder.GetInsertPoint());
14142-
else
14143-
CloneGEP->insertBefore(GEP);
14144-
if (GEP->hasName())
14145-
CloneGEP->takeName(GEP);
14146-
Ex = CloneGEP;
1414714184
} else if (auto *VecTy =
1414814185
dyn_cast<FixedVectorType>(Scalar->getType())) {
1414914186
assert(SLPReVec && "FixedVectorType is not expected.");
@@ -14164,14 +14201,15 @@ Value *BoUpSLP::vectorizeTree(
1416414201
if (Scalar->getType() != Ex->getType())
1416514202
ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
1416614203
MinBWs.find(E)->second.second);
14167-
if (auto *I = dyn_cast<Instruction>(Ex))
14168-
ScalarToEEs[Scalar].try_emplace(
14169-
Builder.GetInsertBlock(),
14170-
std::make_pair(I, cast<Instruction>(ExV)));
14204+
auto *I = dyn_cast<Instruction>(Ex);
14205+
ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
14206+
: &F->getEntryBlock(),
14207+
std::make_pair(Ex, ExV));
1417114208
}
1417214209
// The then branch of the previous if may produce constants, since 0
1417314210
// operand might be a constant.
14174-
if (auto *ExI = dyn_cast<Instruction>(Ex)) {
14211+
if (auto *ExI = dyn_cast<Instruction>(Ex);
14212+
ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
1417514213
GatherShuffleExtractSeq.insert(ExI);
1417614214
CSEBlocks.insert(ExI->getParent());
1417714215
}
@@ -14192,9 +14230,10 @@ Value *BoUpSLP::vectorizeTree(
1419214230
continue;
1419314231
assert((ExternallyUsedValues.count(Scalar) ||
1419414232
Scalar->hasNUsesOrMore(UsesLimit) ||
14233+
ExternalUsesAsOriginalScalar.contains(Scalar) ||
1419514234
any_of(Scalar->users(),
1419614235
[&](llvm::User *U) {
14197-
if (ExternalUsesAsGEPs.contains(U))
14236+
if (ExternalUsesAsOriginalScalar.contains(U))
1419814237
return true;
1419914238
TreeEntry *UseEntry = getTreeEntry(U);
1420014239
return UseEntry &&

llvm/test/Transforms/SLPVectorizer/AArch64/external-non-inst-use.ll

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@
44
define i16 @foo(ptr %p1, ptr %p2) {
55
; CHECK-LABEL: @foo(
66
; CHECK-NEXT: entry:
7-
; CHECK-NEXT: store i32 0, ptr [[P1:%.*]], align 1
8-
; CHECK-NEXT: [[CONST_MAT:%.*]] = or i32 0, 0
7+
; CHECK-NEXT: [[CONST:%.*]] = bitcast i32 0 to i32
8+
; CHECK-NEXT: store i32 [[CONST]], ptr [[P1:%.*]], align 1
9+
; CHECK-NEXT: [[CONST_MAT:%.*]] = or i32 [[CONST]], 0
910
; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[P2:%.*]], align 1
1011
; CHECK-NEXT: ret i16 0
1112
;

llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1236,20 +1236,20 @@ define void @crash_no_tracked_instructions(ptr %arg, ptr %arg.2, ptr %arg.3, i1
12361236
; CHECK: bb22:
12371237
; CHECK-NEXT: [[T23:%.*]] = fmul float [[T20]], 9.900000e+01
12381238
; CHECK-NEXT: [[T25:%.*]] = getelementptr inbounds float, ptr [[T19]], i64 2
1239+
; CHECK-NEXT: [[T26:%.*]] = fmul float [[T23]], 1.000000e+01
12391240
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[T23]], i32 0
12401241
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
12411242
; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], <float 9.900000e+01, float 1.000000e+01>
1242-
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
1243-
; CHECK-NEXT: store float [[TMP4]], ptr [[T25]], align 4
1243+
; CHECK-NEXT: store float [[T26]], ptr [[T25]], align 4
12441244
; CHECK-NEXT: [[T27:%.*]] = load float, ptr [[ARG_2:%.*]], align 8
1245-
; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP3]], <float 2.000000e+01, float 2.000000e+01>
1245+
; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP3]], <float 2.000000e+01, float 2.000000e+01>
12461246
; CHECK-NEXT: br label [[BB30]]
12471247
; CHECK: bb30:
1248-
; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x float> [ [[TMP5]], [[BB22]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
1248+
; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x float> [ [[TMP4]], [[BB22]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
12491249
; CHECK-NEXT: br label [[BB36:%.*]]
12501250
; CHECK: bb36:
1251-
; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], <float 3.000000e+00, float 3.000000e+00>
1252-
; CHECK-NEXT: store <2 x float> [[TMP7]], ptr [[ARG_3]], align 4
1251+
; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x float> [[TMP5]], <float 3.000000e+00, float 3.000000e+00>
1252+
; CHECK-NEXT: store <2 x float> [[TMP6]], ptr [[ARG_3]], align 4
12531253
; CHECK-NEXT: br label [[BB41:%.*]]
12541254
; CHECK: bb41:
12551255
; CHECK-NEXT: ret void

0 commit comments

Comments
 (0)