Skip to content

Commit 6f7160e

Browse files
committed
[SLP] Attempt to vectorize long stores, if short ones failed.
We can try to vectorize long store sequences, if short ones were unsuccessful because of non-profitable vectorization. It should not increase compile time significantly (stores are sorted already; complexity is n x log n), but it vectorizes extra code.

Metric: size..text

| Program                                                               | results    | results0   | diff  |
|-----------------------------------------------------------------------|------------|------------|-------|
| test-suite :: External/SPEC/CINT2006/400.perlbench/400.perlbench.test | 1088012.00 | 1088236.00 |  0.0% |
| test-suite :: SingleSource/UnitTests/matrix-types-spec.test           |  480396.00 |  480476.00 |  0.0% |
| test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test   |  664613.00 |  664661.00 |  0.0% |
| test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test  |  664613.00 |  664661.00 |  0.0% |
| test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test| 2041105.00 | 2040961.00 | -0.0% |
| test-suite :: MultiSource/Applications/JM/lencod/lencod.test          |  836563.00 |  836387.00 | -0.0% |
| test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test         | 1035100.00 | 1032140.00 | -0.3% |

In all benchmarks, extra code gets vectorized.

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: #88563
1 parent 693a458 commit 6f7160e

File tree

2 files changed

+80
-72
lines changed

2 files changed

+80
-72
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 69 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -15164,10 +15164,6 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
1516415164
BoUpSLP::ValueSet VectorizedStores;
1516515165
bool Changed = false;
1516615166

15167-
// Stores the pair of stores (first_store, last_store) in a range, that were
15168-
// already tried to be vectorized. Allows to skip the store ranges that were
15169-
// already tried to be vectorized but the attempts were unsuccessful.
15170-
DenseSet<std::pair<Value *, Value *>> TriedSequences;
1517115167
struct StoreDistCompare {
1517215168
bool operator()(const std::pair<unsigned, int> &Op1,
1517315169
const std::pair<unsigned, int> &Op2) const {
@@ -15209,8 +15205,10 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
1520915205
Type *ValueTy = StoreTy;
1521015206
if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
1521115207
ValueTy = Trunc->getSrcTy();
15212-
unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
15213-
R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy, ValueTy));
15208+
unsigned MinVF = std::max<unsigned>(
15209+
2, PowerOf2Ceil(TTI->getStoreMinimumVF(
15210+
R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
15211+
ValueTy)));
1521415212

1521515213
if (MaxVF < MinVF) {
1521615214
LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
@@ -15236,40 +15234,74 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
1523615234
VF = Size > MaxVF ? NonPowerOf2VF : Size;
1523715235
Size *= 2;
1523815236
});
15239-
unsigned StartIdx = 0;
15240-
for (unsigned Size : CandidateVFs) {
15241-
for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
15242-
ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
15243-
assert(
15244-
all_of(
15245-
Slice,
15246-
[&](Value *V) {
15247-
return cast<StoreInst>(V)->getValueOperand()->getType() ==
15248-
cast<StoreInst>(Slice.front())
15249-
->getValueOperand()
15250-
->getType();
15251-
}) &&
15252-
"Expected all operands of same type.");
15253-
if (!VectorizedStores.count(Slice.front()) &&
15254-
!VectorizedStores.count(Slice.back()) &&
15255-
TriedSequences.insert(std::make_pair(Slice.front(), Slice.back()))
15256-
.second &&
15257-
vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
15258-
// Mark the vectorized stores so that we don't vectorize them again.
15259-
VectorizedStores.insert(Slice.begin(), Slice.end());
15260-
Changed = true;
15261-
// If we vectorized initial block, no need to try to vectorize it
15262-
// again.
15263-
if (Cnt == StartIdx)
15264-
StartIdx += Size;
15265-
Cnt += Size;
15266-
continue;
15237+
unsigned End = Operands.size();
15238+
unsigned Repeat = 0;
15239+
constexpr unsigned MaxAttempts = 2;
15240+
SmallBitVector Range(Operands.size());
15241+
while (true) {
15242+
++Repeat;
15243+
for (unsigned Size : CandidateVFs) {
15244+
int StartIdx = Range.find_first_unset();
15245+
while (StartIdx != -1) {
15246+
int EndIdx = Range.find_next(StartIdx);
15247+
unsigned Sz = EndIdx == -1 ? End : EndIdx;
15248+
for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
15249+
ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
15250+
assert(all_of(Slice,
15251+
[&](Value *V) {
15252+
return cast<StoreInst>(V)
15253+
->getValueOperand()
15254+
->getType() ==
15255+
cast<StoreInst>(Slice.front())
15256+
->getValueOperand()
15257+
->getType();
15258+
}) &&
15259+
"Expected all operands of same type.");
15260+
if (vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
15261+
// Mark the vectorized stores so that we don't vectorize them
15262+
// again.
15263+
VectorizedStores.insert(Slice.begin(), Slice.end());
15264+
// Mark the vectorized stores so that we don't vectorize them
15265+
// again.
15266+
Changed = true;
15267+
// If we vectorized initial block, no need to try to vectorize
15268+
// it again.
15269+
Range.set(Cnt, Cnt + Size);
15270+
if (Cnt < StartIdx + MinVF)
15271+
Range.set(StartIdx, Cnt);
15272+
if (Cnt > EndIdx - Size - MinVF) {
15273+
Range.set(Cnt + Size, EndIdx);
15274+
End = Cnt;
15275+
}
15276+
Cnt += Size;
15277+
continue;
15278+
}
15279+
++Cnt;
15280+
}
15281+
if (Sz >= End)
15282+
break;
15283+
StartIdx = Range.find_next_unset(EndIdx);
1526715284
}
15268-
++Cnt;
1526915285
}
15270-
// Check if the whole array was vectorized already - exit.
15271-
if (StartIdx >= Operands.size())
15286+
// All values vectorize - exit.
15287+
if (Range.all())
15288+
break;
15289+
// Check if tried all attempts or no need for the last attempts at all.
15290+
if (Repeat >= MaxAttempts)
15291+
break;
15292+
constexpr unsigned MaxVFScale = 4;
15293+
constexpr unsigned StoresLimit = 16;
15294+
const unsigned MaxTotalNum = std::min(
15295+
std::max<unsigned>(StoresLimit, MaxVFScale * MaxVF),
15296+
bit_floor(static_cast<unsigned>(Range.find_last_unset() -
15297+
Range.find_first_unset() + 1)));
15298+
if (MaxVF >= MaxTotalNum)
1527215299
break;
15300+
// Last attempt to vectorize max number of elements, if all previous
15301+
// attempts were unsuccessful because of the cost issues.
15302+
CandidateVFs.clear();
15303+
for (unsigned Size = MaxTotalNum; Size > MaxVF; Size /= 2)
15304+
CandidateVFs.push_back(Size);
1527315305
}
1527415306
}
1527515307
};

llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll

Lines changed: 11 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -100,41 +100,17 @@ define void @store_i8(ptr nocapture %0, i32 %1, i32 %2) {
100100
define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) {
101101
; SSE-LABEL: @store_i64(
102102
; SSE-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
103-
; SSE-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
104-
; SSE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]]
105-
; SSE-NEXT: [[TMP7:%.*]] = lshr i64 [[TMP6]], 15
106-
; SSE-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
107-
; SSE-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255
108-
; SSE-NEXT: [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295
109-
; SSE-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255
110-
; SSE-NEXT: store i64 [[TMP11]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
111-
; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
112-
; SSE-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
113-
; SSE-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]]
114-
; SSE-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 15
115-
; SSE-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
116-
; SSE-NEXT: [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255
117-
; SSE-NEXT: [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295
118-
; SSE-NEXT: [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255
119-
; SSE-NEXT: store i64 [[TMP19]], ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
120-
; SSE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
121-
; SSE-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
122-
; SSE-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]]
123-
; SSE-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 15
124-
; SSE-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
125-
; SSE-NEXT: [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255
126-
; SSE-NEXT: [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295
127-
; SSE-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255
128-
; SSE-NEXT: store i64 [[TMP27]], ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
129-
; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
130-
; SSE-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
131-
; SSE-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]]
132-
; SSE-NEXT: [[TMP31:%.*]] = lshr i64 [[TMP30]], 15
133-
; SSE-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32
134-
; SSE-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255
135-
; SSE-NEXT: [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295
136-
; SSE-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255
137-
; SSE-NEXT: store i64 [[TMP35]], ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
103+
; SSE-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
104+
; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
105+
; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer
106+
; SSE-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]]
107+
; SSE-NEXT: [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], <i64 15, i64 15, i64 15, i64 15>
108+
; SSE-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
109+
; SSE-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], <i32 255, i32 255, i32 255, i32 255>
110+
; SSE-NEXT: [[TMP12:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
111+
; SSE-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> <i32 255, i32 255, i32 255, i32 255>
112+
; SSE-NEXT: [[TMP14:%.*]] = zext <4 x i32> [[TMP13]] to <4 x i64>
113+
; SSE-NEXT: store <4 x i64> [[TMP14]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
138114
; SSE-NEXT: ret void
139115
;
140116
; AVX-LABEL: @store_i64(

0 commit comments

Comments
 (0)