
Commit 466217e

[SLP]Fix graph traversal in getSpillCost
getSpillCost relies on def-use order when it performs the analysis of spills for vectorized instructions that are live over calls. The patch fixes it to check dependencies based on TreeEntries and to perform the analysis on the actual vectorized types.

Reviewers: RKSimon, preames

Reviewed By: preames

Pull Request: #124984
Parent: 28d7880 · Commit: 466217e
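For intuition, here is a minimal standalone sketch (plain C++ with invented names and instruction positions, deliberately not LLVM's types) of the traversal the patch moves to: vectorized bundles are ordered by the position of their last instruction, liveness is tracked per bundle through its operand bundles (the TreeEntry dependencies) rather than through scalar def-use edges, and each non-vectorized call between adjacent bundles charges a cost for every live value.

#include <algorithm>
#include <cstdio>
#include <set>
#include <vector>

struct Bundle {
  int LastInstPos;            // position of the bundle's last instruction
  std::vector<Bundle *> Ops;  // vectorized operand bundles (the TreeEntry deps)
};

int spillCost(std::vector<Bundle *> Bundles,
              const std::set<int> &ScalarCallPositions, int CostPerLiveValue) {
  // Visit later bundles first, mirroring the bottom-up, dominance-ordered
  // walk in getSpillCost.
  std::sort(Bundles.begin(), Bundles.end(),
            [](const Bundle *A, const Bundle *B) {
              return A->LastInstPos > B->LastInstPos;
            });
  std::set<const Bundle *> Live;
  int Cost = 0;
  const Bundle *Prev = nullptr;
  for (const Bundle *B : Bundles) {
    if (Prev) {
      // The previously visited bundle has been "emitted"; its operand
      // bundles become live values that must survive until it executes.
      Live.erase(Prev);
      for (const Bundle *Op : Prev->Ops)
        Live.insert(Op);
      // Count calls between the two bundles. Vectorized calls are assumed
      // to be excluded from ScalarCallPositions, as the patch now skips
      // calls that are themselves part of the vectorized tree.
      int NumCalls = 0;
      for (int Pos = B->LastInstPos + 1; Pos < Prev->LastInstPos; ++Pos)
        if (ScalarCallPositions.count(Pos))
          ++NumCalls;
      Cost += NumCalls * static_cast<int>(Live.size()) * CostPerLiveValue;
    }
    Prev = B;
  }
  return Cost;
}

int main() {
  Bundle Loads{/*LastInstPos=*/2, /*Ops=*/{}};
  Bundle Stores{/*LastInstPos=*/10, /*Ops=*/{&Loads}};
  // One scalar call sits at position 5, between the load bundle and the
  // store bundle, so the loaded vector stays live across it.
  std::printf("spill cost = %d\n",
              spillCost({&Loads, &Stores}, /*ScalarCallPositions=*/{5},
                        /*CostPerLiveValue=*/4));
  return 0;
}

The real implementation orders entries by dominance via getLastInstructionInBundle and asks TTI->getCostOfKeepingLiveOverCall for the live entries' vector types; the sketch collapses both to plain integers.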

3 files changed (+417, -635 lines)

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 50 additions & 45 deletions
@@ -1395,7 +1395,7 @@ class BoUpSLP {
 
   /// \returns the cost incurred by unwanted spills and fills, caused by
   /// holding live values over call sites.
-  InstructionCost getSpillCost() const;
+  InstructionCost getSpillCost();
 
   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
@@ -2958,7 +2958,7 @@ class BoUpSLP {
   }
 
   /// Check if the value is vectorized in the tree.
-  bool isVectorized(Value *V) const {
+  bool isVectorized(const Value *V) const {
     assert(V && "V cannot be nullptr.");
     return ScalarToTreeEntries.contains(V);
   }
@@ -12160,78 +12160,80 @@ bool BoUpSLP::isTreeNotExtendable() const {
   return Res;
 }
 
-InstructionCost BoUpSLP::getSpillCost() const {
+InstructionCost BoUpSLP::getSpillCost() {
   // Walk from the bottom of the tree to the top, tracking which values are
   // live. When we see a call instruction that is not part of our tree,
   // query TTI to see if there is a cost to keeping values live over it
   // (for example, if spills and fills are required).
-  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
   InstructionCost Cost = 0;
 
-  SmallPtrSet<Instruction *, 4> LiveValues;
-  Instruction *PrevInst = nullptr;
+  SmallPtrSet<const TreeEntry *, 4> LiveEntries;
+  const TreeEntry *Prev = nullptr;
 
   // The entries in VectorizableTree are not necessarily ordered by their
   // position in basic blocks. Collect them and order them by dominance so later
   // instructions are guaranteed to be visited first. For instructions in
   // different basic blocks, we only scan to the beginning of the block, so
   // their order does not matter, as long as all instructions in a basic block
   // are grouped together. Using dominance ensures a deterministic order.
-  SmallVector<Instruction *, 16> OrderedScalars;
+  SmallVector<TreeEntry *, 16> OrderedEntries;
   for (const auto &TEPtr : VectorizableTree) {
-    if (TEPtr->State != TreeEntry::Vectorize)
+    if (TEPtr->isGather())
      continue;
-    Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
-    if (!Inst)
-      continue;
-    OrderedScalars.push_back(Inst);
-  }
-  llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
-    auto *NodeA = DT->getNode(A->getParent());
-    auto *NodeB = DT->getNode(B->getParent());
+    OrderedEntries.push_back(TEPtr.get());
+  }
+  llvm::stable_sort(OrderedEntries, [&](const TreeEntry *TA,
+                                        const TreeEntry *TB) {
+    Instruction &A = getLastInstructionInBundle(TA);
+    Instruction &B = getLastInstructionInBundle(TB);
+    auto *NodeA = DT->getNode(A.getParent());
+    auto *NodeB = DT->getNode(B.getParent());
     assert(NodeA && "Should only process reachable instructions");
     assert(NodeB && "Should only process reachable instructions");
     assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
            "Different nodes should have different DFS numbers");
     if (NodeA != NodeB)
       return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
-    return B->comesBefore(A);
+    return B.comesBefore(&A);
   });
 
-  for (Instruction *Inst : OrderedScalars) {
-    if (!PrevInst) {
-      PrevInst = Inst;
+  for (const TreeEntry *TE : OrderedEntries) {
+    if (!Prev) {
+      Prev = TE;
       continue;
     }
 
-    // Update LiveValues.
-    LiveValues.erase(PrevInst);
-    for (auto &J : PrevInst->operands()) {
-      if (isa<Instruction>(&*J) && isVectorized(&*J))
-        LiveValues.insert(cast<Instruction>(&*J));
+    LiveEntries.erase(Prev);
+    for (unsigned I : seq<unsigned>(Prev->getNumOperands())) {
+      const TreeEntry *Op = getVectorizedOperand(Prev, I);
+      if (!Op)
+        continue;
+      assert(!Op->isGather() && "Expected vectorized operand.");
+      LiveEntries.insert(Op);
    }
 
    LLVM_DEBUG({
-      dbgs() << "SLP: #LV: " << LiveValues.size();
-      for (auto *X : LiveValues)
-        dbgs() << " " << X->getName();
+      dbgs() << "SLP: #LV: " << LiveEntries.size();
+      for (auto *X : LiveEntries)
+        X->dump();
      dbgs() << ", Looking at ";
-      Inst->dump();
+      TE->dump();
    });
 
    // Now find the sequence of instructions between PrevInst and Inst.
    unsigned NumCalls = 0;
-    BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
-                                 PrevInstIt =
-                                     PrevInst->getIterator().getReverse();
+    const Instruction *PrevInst = &getLastInstructionInBundle(Prev);
+    BasicBlock::const_reverse_iterator
+        InstIt = ++getLastInstructionInBundle(TE).getIterator().getReverse(),
+        PrevInstIt = PrevInst->getIterator().getReverse();
    while (InstIt != PrevInstIt) {
      if (PrevInstIt == PrevInst->getParent()->rend()) {
-        PrevInstIt = Inst->getParent()->rbegin();
+        PrevInstIt = getLastInstructionInBundle(TE).getParent()->rbegin();
        continue;
      }
 
-      auto NoCallIntrinsic = [this](Instruction *I) {
-        auto *II = dyn_cast<IntrinsicInst>(I);
+      auto NoCallIntrinsic = [this](const Instruction *I) {
+        const auto *II = dyn_cast<IntrinsicInst>(I);
        if (!II)
          return false;
        if (II->isAssumeLikeIntrinsic())
@@ -12252,25 +12254,28 @@ InstructionCost BoUpSLP::getSpillCost() const {
      };
 
      // Debug information does not impact spill cost.
-      if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
-          &*PrevInstIt != PrevInst)
+      // Vectorized calls, represented as vector intrinsics, do not impact
+      // spill cost.
+      if (const auto *CB = dyn_cast<CallBase>(&*PrevInstIt);
+          CB && !NoCallIntrinsic(CB) && !isVectorized(CB))
        NumCalls++;
 
      ++PrevInstIt;
    }
 
    if (NumCalls) {
-      SmallVector<Type *, 4> V;
-      for (auto *II : LiveValues) {
-        auto *ScalarTy = II->getType();
-        if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
-          ScalarTy = VectorTy->getElementType();
-        V.push_back(getWidenedType(ScalarTy, BundleWidth));
+      SmallVector<Type *, 4> EntriesTypes;
+      for (const TreeEntry *TE : LiveEntries) {
+        auto *ScalarTy = TE->getMainOp()->getType();
+        auto It = MinBWs.find(TE);
+        if (It != MinBWs.end())
+          ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
+        EntriesTypes.push_back(getWidenedType(ScalarTy, TE->getVectorFactor()));
      }
-      Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
+      Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(EntriesTypes);
    }
 
-    PrevInst = Inst;
+    Prev = TE;
  }
 
  return Cost;
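One consequence of switching to TreeEntries is visible in the NumCalls block above: the old code widened every live scalar's type by the root bundle width, while the new code queries TTI with each live entry's own vector factor and its MinBWs-narrowed element type. A small self-contained sketch with invented numbers (none taken from the patch or its tests) to show the difference:

#include <cstdio>

int main() {
  unsigned RootBundleWidth = 4; // VectorizableTree.front()->Scalars.size()
  unsigned EntryVF = 2;         // TE->getVectorFactor() of the live entry
  unsigned ScalarBits = 64;     // element type of the live entry's scalars
  unsigned MinBits = 32;        // MinBWs[TE].first, if the entry was narrowed

  // Before the patch: every live value was widened by the root bundle
  // width, regardless of the live entry's own shape or minimized bitwidth.
  std::printf("old type queried for spills: <%u x i%u>\n", RootBundleWidth,
              ScalarBits);
  // After the patch: the entry's own vector factor and narrowed element type.
  std::printf("new type queried for spills: <%u x i%u>\n", EntryVF, MinBits);
  return 0;
}

Querying a type like <2 x i32> instead of <4 x i64> can change whether vectorization looks profitable around calls, plausibly the kind of cost shift reflected in the updated AArch64 test below.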

llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll

Lines changed: 15 additions & 18 deletions
@@ -684,27 +684,27 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM5]]
 ; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[STRIDE]], 1
 ; CHECK-NEXT:    [[IDXPROM11:%.*]] = sext i32 [[MUL]] to i64
-; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM11]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4
-; CHECK-NEXT:    [[ADD14:%.*]] = or disjoint i32 [[MUL]], 1
+; CHECK-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM11]]
+; CHECK-NEXT:    [[ADD14:%.*]] = add nsw i32 [[MUL]], 2
 ; CHECK-NEXT:    [[IDXPROM15:%.*]] = sext i32 [[ADD14]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM15]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX16]], align 4
 ; CHECK-NEXT:    [[MUL21:%.*]] = mul nsw i32 [[STRIDE]], 3
 ; CHECK-NEXT:    [[IDXPROM23:%.*]] = sext i32 [[MUL21]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM23]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX24]], align 4
 ; CHECK-NEXT:    [[ADD26:%.*]] = add nsw i32 [[MUL21]], 1
 ; CHECK-NEXT:    [[IDXPROM27:%.*]] = sext i32 [[ADD26]] to i64
-; CHECK-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM27]]
+; CHECK-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM27]]
 ; CHECK-NEXT:    [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i8, ptr [[Y:%.*]], i64 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM5]]
-; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM11]]
+; CHECK-NEXT:    [[ARRAYIDX49:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM11]]
+; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM15]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT:    [[ARRAYIDX52:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM15]]
 ; CHECK-NEXT:    [[ARRAYIDX60:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM23]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX60]], align 4
-; CHECK-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM27]]
+; CHECK-NEXT:    [[ARRAYIDX65:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM27]]
 ; CHECK-NEXT:    [[ARRAYIDX72:%.*]] = getelementptr inbounds nuw i8, ptr [[Z:%.*]], i64 4
 ; CHECK-NEXT:    [[MUL73:%.*]] = mul nsw i32 [[TMP3]], [[TMP0]]
 ; CHECK-NEXT:    [[ARRAYIDX76:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 24
@@ -715,25 +715,22 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP8]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP9]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[ARRAYIDX84:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 28
 ; CHECK-NEXT:    [[MUL81:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]]
-; CHECK-NEXT:    [[ARRAYIDX82:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 32
-; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX16]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX52]], align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = mul nsw <2 x i32> [[TMP14]], [[TMP13]]
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[MUL87:%.*]] = mul nsw i32 [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[ARRAYIDX88:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 44
-; CHECK-NEXT:    [[ARRAYIDX92:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 36
 ; CHECK-NEXT:    [[TMP17:%.*]] = load <2 x i32>, ptr [[ARRAYIDX28]], align 4
 ; CHECK-NEXT:    [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX64]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i32>, ptr [[ARRAYIDX49]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = load <2 x i32>, ptr [[ARRAYIDX65]], align 4
 ; CHECK-NEXT:    store i32 [[MUL73]], ptr [[Z]], align 4
 ; CHECK-NEXT:    store <4 x i32> [[TMP12]], ptr [[ARRAYIDX72]], align 4
-; CHECK-NEXT:    store i32 [[MUL81]], ptr [[ARRAYIDX82]], align 4
-; CHECK-NEXT:    store <2 x i32> [[TMP16]], ptr [[ARRAYIDX76]], align 4
+; CHECK-NEXT:    store i32 [[MUL81]], ptr [[ARRAYIDX76]], align 4
 ; CHECK-NEXT:    store i32 [[MUL87]], ptr [[ARRAYIDX88]], align 4
-; CHECK-NEXT:    [[TMP19:%.*]] = mul nsw <2 x i32> [[TMP18]], [[TMP17]]
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x i32> [[TMP19]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    store <2 x i32> [[TMP20]], ptr [[ARRAYIDX92]], align 4
+; CHECK-NEXT:    [[TMP20:%.*]] = mul nsw <2 x i32> [[TMP15]], [[TMP17]]
+; CHECK-NEXT:    [[TMP21:%.*]] = mul nsw <2 x i32> [[TMP16]], [[TMP18]]
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> [[TMP21]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    store <4 x i32> [[TMP19]], ptr [[ARRAYIDX84]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
