[SLP]Attempt to vectorize long stores, if short one failed. #88563
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes

We can try to vectorize long store sequences if short ones were unsuccessful because the short vectorization was not profitable. It should not increase compile time significantly (the stores are already sorted, so the complexity is O(n log n)), but it vectorizes extra code.

Metric: size..text

Program                                                                  size..text
                                                                         results     results0    diff
test-suite :: External/SPEC/CINT2006/400.perlbench/400.perlbench.test    1088012.00  1088236.00   0.0%
test-suite :: SingleSource/UnitTests/matrix-types-spec.test               480396.00   480476.00   0.0%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test      664613.00   664661.00    0.0%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test     664613.00   664661.00    0.0%
test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test   2041105.00  2040961.00  -0.0%
test-suite :: MultiSource/Applications/JM/lencod/lencod.test             836563.00   836387.00   -0.0%
test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test            1035100.00  1032140.00  -0.3%

In all benchmarks extra code gets vectorized.

Full diff: https://github.com/llvm/llvm-project/pull/88563.diff

2 Files Affected:

- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
- llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index df891371fdf758..4f70490f5171cc 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -15111,39 +15111,60 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
Size /= 2;
});
unsigned StartIdx = 0;
- for (unsigned Size : CandidateVFs) {
- for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
- ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
- assert(
- all_of(
- Slice,
- [&](Value *V) {
- return cast<StoreInst>(V)->getValueOperand()->getType() ==
- cast<StoreInst>(Slice.front())
- ->getValueOperand()
- ->getType();
- }) &&
- "Expected all operands of same type.");
- if (!VectorizedStores.count(Slice.front()) &&
- !VectorizedStores.count(Slice.back()) &&
- TriedSequences.insert(std::make_pair(Slice.front(), Slice.back()))
- .second &&
- vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
- // Mark the vectorized stores so that we don't vectorize them again.
- VectorizedStores.insert(Slice.begin(), Slice.end());
- Changed = true;
- // If we vectorized initial block, no need to try to vectorize it
- // again.
- if (Cnt == StartIdx)
- StartIdx += Size;
- Cnt += Size;
- continue;
+ unsigned Repeat = 0;
+ constexpr unsigned MaxAttempts = 2;
+ while (true) {
+ ++Repeat;
+ for (unsigned Size : CandidateVFs) {
+ for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
+ ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
+ assert(
+ all_of(
+ Slice,
+ [&](Value *V) {
+ return cast<StoreInst>(V)->getValueOperand()->getType() ==
+ cast<StoreInst>(Slice.front())
+ ->getValueOperand()
+ ->getType();
+ }) &&
+ "Expected all operands of same type.");
+ if (!VectorizedStores.count(Slice.front()) &&
+ !VectorizedStores.count(Slice.back()) &&
+ TriedSequences
+ .insert(std::make_pair(Slice.front(), Slice.back()))
+ .second &&
+ vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
+ // Mark the vectorized stores so that we don't vectorize them
+ // again.
+ VectorizedStores.insert(Slice.begin(), Slice.end());
+ Changed = true;
+ // If we vectorized initial block, no need to try to vectorize
+ // it again.
+ if (Cnt == StartIdx)
+ StartIdx += Size;
+ Cnt += Size;
+ continue;
+ }
+ ++Cnt;
+ }
+ // Check if the whole array was vectorized already - exit.
+ if (StartIdx >= Operands.size()) {
+ Repeat = MaxAttempts;
+ break;
}
- ++Cnt;
}
- // Check if the whole array was vectorized already - exit.
- if (StartIdx >= Operands.size())
+ // Check if tried all attempts or no need for the last attempts at all.
+ if (Repeat >= MaxAttempts)
break;
+ const unsigned MaxTotalNum = bit_floor(Operands.size() - StartIdx);
+ if (MaxVF >= MaxTotalNum)
+ break;
+ // Last attempt to vectorize max number of elements, if all previous
+ // attempts were unsuccessful because of the cost issues.
+ CandidateVFs.clear();
+ for (unsigned Size = MaxTotalNum; Size > MaxVF; Size /= 2) {
+ CandidateVFs.push_back(Size);
+ }
}
}
};
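To make the second-attempt arithmetic in the hunk above concrete: when the first pass leaves stores unvectorized, the candidate VFs are rebuilt from the largest power of two that fits in the remaining run, stopping above MaxVF. Below is a minimal standalone sketch using std::bit_floor (the C++20 analogue of llvm::bit_floor) and made-up numbers; it is an illustration of the arithmetic, not part of the patch.

#include <bit>     // std::bit_floor
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical values, chosen only to illustrate the arithmetic.
  unsigned NumOperands = 23; // stores in the sorted run
  unsigned StartIdx = 0;     // nothing was vectorized on the first pass
  unsigned MaxVF = 4;        // widest VF the first pass tried

  // Largest power of two not exceeding the remaining run length: 16.
  unsigned MaxTotalNum = std::bit_floor(NumOperands - StartIdx);

  // The retry only happens when longer chains are actually available.
  if (MaxVF < MaxTotalNum) {
    std::vector<unsigned> CandidateVFs;
    for (unsigned Size = MaxTotalNum; Size > MaxVF; Size /= 2)
      CandidateVFs.push_back(Size); // yields {16, 8}
    for (unsigned Size : CandidateVFs)
      std::printf("second attempt with VF %u\n", Size);
  }
  return 0;
}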
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
index 75505f632a43f3..3deab0975ce764 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
@@ -100,41 +100,17 @@ define void @store_i8(ptr nocapture %0, i32 %1, i32 %2) {
define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) {
; SSE-LABEL: @store_i64(
; SSE-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
-; SSE-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
-; SSE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]]
-; SSE-NEXT: [[TMP7:%.*]] = lshr i64 [[TMP6]], 15
-; SSE-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
-; SSE-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255
-; SSE-NEXT: [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295
-; SSE-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255
-; SSE-NEXT: store i64 [[TMP11]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
-; SSE-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]]
-; SSE-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 15
-; SSE-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
-; SSE-NEXT: [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255
-; SSE-NEXT: [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295
-; SSE-NEXT: [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255
-; SSE-NEXT: store i64 [[TMP19]], ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
-; SSE-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]]
-; SSE-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 15
-; SSE-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
-; SSE-NEXT: [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255
-; SSE-NEXT: [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295
-; SSE-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255
-; SSE-NEXT: store i64 [[TMP27]], ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
-; SSE-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]]
-; SSE-NEXT: [[TMP31:%.*]] = lshr i64 [[TMP30]], 15
-; SSE-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32
-; SSE-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255
-; SSE-NEXT: [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295
-; SSE-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255
-; SSE-NEXT: store i64 [[TMP35]], ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
+; SSE-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
+; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer
+; SSE-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]]
+; SSE-NEXT: [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], <i64 15, i64 15, i64 15, i64 15>
+; SSE-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
+; SSE-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], <i32 255, i32 255, i32 255, i32 255>
+; SSE-NEXT: [[TMP12:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
+; SSE-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> <i32 255, i32 255, i32 255, i32 255>
+; SSE-NEXT: [[TMP14:%.*]] = zext <4 x i32> [[TMP13]] to <4 x i64>
+; SSE-NEXT: store <4 x i64> [[TMP14]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
; SSE-NEXT: ret void
;
; AVX-LABEL: @store_i64(
Ping!
LGTM
I've reverted this change due to large compile-time regressions (https://llvm-compile-time-tracker.com/compare.php?from=c7657cf7d1ee57f9cb9133164536591a1842b43c&to=7d4e8c1f3bbfe976f4871c9cf953f76d771b0eda&stat=instructions:u). The regressions go up to 10% on individual files (https://llvm-compile-time-tracker.com/compare.php?from=c7657cf7d1ee57f9cb9133164536591a1842b43c&to=7d4e8c1f3bbfe976f4871c9cf953f76d771b0eda&stat=instructions%3Au&details=on). However, the 10% regressions are in LTO builds on small files, so an easier target to investigate is probably constants.c from pairlocalalign, which regresses by 5% in a normal O3 build.
I've reverted the new version again -- it looks like the pairlocalalign regressions have been fixed, but all the other ones are still there (https://llvm-compile-time-tracker.com/compare.php?from=693a458287d019c5c6a66fe3019d099df2978cdb&to=6f7160eedb2db02f37d4ffd52fff7b0cf88b3fdc&stat=instructions:u). For example, libclamav_nsis_LZMADecode.c from clamscan has a 5-6% regression, transform8x8.c from lencod as well, and Ppmd7.c from 7zip has 7%.
I've added your LLVM fork to the compile-time tracker, in case you want to test changes before landing them upstream.
I tried it locally and did not see significant compile-time regressions; I will double-check tomorrow.
We can try to vectorize long store sequences if short ones were
unsuccessful because the short vectorization was not profitable. It
should not increase compile time significantly (the stores are already
sorted, so the complexity is O(n log n)), but it vectorizes extra code.
Metric: size..text
Program size..text
results results0 diff
test-suite :: External/SPEC/CINT2006/400.perlbench/400.perlbench.test 1088012.00 1088236.00 0.0%
test-suite :: SingleSource/UnitTests/matrix-types-spec.test 480396.00 480476.00 0.0%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 664613.00 664661.00 0.0%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 664613.00 664661.00 0.0%
test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 2041105.00 2040961.00 -0.0%
test-suite :: MultiSource/Applications/JM/lencod/lencod.test 836563.00 836387.00 -0.0%
test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 1035100.00 1032140.00 -0.3%
In all benchmarks extra code gets vectorized.

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: #88563
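For readers skimming the thread, here is a simplified, self-contained sketch of the control flow the patch adds to vectorizeStores. The cost model is replaced by a toy predicate (only chains of 8 or more stores "profit"), so this shows only the shape of the two-attempt loop, not the real implementation.

#include <bit>
#include <cstdio>
#include <vector>

// Toy stand-in for SLP's cost-model-driven helper: pretend only chains
// of 8 or more stores are profitable, so the first (short-VF) pass fails.
static bool vectorizeStoreChain(unsigned Cnt, unsigned Size) {
  (void)Cnt;
  return Size >= 8;
}

int main() {
  unsigned NumStores = 16, MaxVF = 4, StartIdx = 0;
  std::vector<unsigned> CandidateVFs = {4, 2}; // first pass: short VFs
  unsigned Repeat = 0;
  constexpr unsigned MaxAttempts = 2;
  while (true) {
    ++Repeat;
    for (unsigned Size : CandidateVFs) {
      for (unsigned Cnt = StartIdx; Cnt + Size <= NumStores;) {
        if (vectorizeStoreChain(Cnt, Size)) {
          std::printf("vectorized %u stores at index %u\n", Size, Cnt);
          if (Cnt == StartIdx)
            StartIdx += Size; // skip the block we just vectorized
          Cnt += Size;
        } else {
          ++Cnt;
        }
      }
    }
    if (StartIdx >= NumStores || Repeat >= MaxAttempts)
      break;
    // Second attempt: rebuild the candidate VFs from the longest
    // power-of-two run that is still unvectorized.
    unsigned MaxTotalNum = std::bit_floor(NumStores - StartIdx);
    if (MaxVF >= MaxTotalNum)
      break;
    CandidateVFs.clear();
    for (unsigned Size = MaxTotalNum; Size > MaxVF; Size /= 2)
      CandidateVFs.push_back(Size); // here: {16, 8}
  }
  return 0;
}

With these toy numbers the first pass vectorizes nothing, the second pass succeeds with a single 16-wide chain, and the loop exits because the whole run is covered.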