Skip to content

Commit 197f4a9

Browse files
[SLP]Remove ExtraArgs from reductions.
There is no need to handle extra arguments during the reductions anymore: the compiler can now handle all reduced values and reduction operands correctly, even if they are from different basic blocks. This simplifies the analysis, reduces compiler size, and improves overall vectorization. Metric: size..text test-suite :: SingleSource/Benchmarks/Misc-C++/stepanov_container.test 16668.00 17148.00 2.9% test-suite :: External/SPEC/CINT2006/483.xalancbmk/483.xalancbmk.test 2389675.00 2418683.00 1.2% test-suite :: MultiSource/Benchmarks/ASCI_Purple/SMG2000/smg2000.test 253517.00 253645.00 0.1% test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 309678.00 309806.00 0.0% test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test 389203.00 389363.00 0.0% test-suite :: MultiSource/Benchmarks/MiBench/consumer-jpeg/consumer-jpeg.test 111120.00 111152.00 0.0% test-suite :: MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4.test 1039103.00 1039215.00 0.0% test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test 1155883.00 1155963.00 0.0% test-suite :: MicroBenchmarks/LoopVectorization/LoopInterleavingBenchmarks.test 276646.00 276662.00 0.0% test-suite :: MultiSource/Applications/JM/lencod/lencod.test 848691.00 848739.00 0.0% test-suite :: External/SPEC/CFP2006/453.povray/453.povray.test 1138604.00 1138636.00 0.0% test-suite :: External/SPEC/CINT2006/445.gobmk/445.gobmk.test 910201.00 910217.00 0.0% test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12385484.00 12385628.00 0.0% test-suite :: External/SPEC/CINT2017speed/602.gcc_s/602.gcc_s.test 9667580.00 9667676.00 0.0% test-suite :: External/SPEC/CINT2017rate/502.gcc_r/502.gcc_r.test 9667580.00 9667676.00 0.0% test-suite :: External/SPEC/CINT2017rate/523.xalancbmk_r/523.xalancbmk_r.test 2856182.00 2856198.00 0.0% test-suite :: External/SPEC/CINT2017speed/623.xalancbmk_s/623.xalancbmk_s.test 2856182.00 2856198.00 0.0% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 773224.00 773192.00 -0.0% 
test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 1035148.00 1035084.00 -0.0% test-suite :: External/SPEC/CINT2017speed/631.deepsjeng_s/631.deepsjeng_s.test 98126.00 98094.00 -0.0% test-suite :: External/SPEC/CINT2017rate/531.deepsjeng_r/531.deepsjeng_r.test 97966.00 97934.00 -0.0% test-suite :: MultiSource/Benchmarks/MallocBench/gs/gs.test 167391.00 167215.00 -0.1% test-suite :: MultiSource/Applications/ALAC/encode/alacconvert-encode.test 56685.00 56605.00 -0.1% test-suite :: MultiSource/Applications/ALAC/decode/alacconvert-decode.test 56685.00 56605.00 -0.1% test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/GCC-C-execute-20050826-2.test 1302.00 1294.00 -0.6% Misc-C++/stepanov_container - better code due to cost fixes. 483.xalancbmk - better code due to cost fixes. ASCI_Purple/SMG2000 - better code due to cost fixes. Benchmarks/Bullet - better vector code because of the cost. JM/ldecod - extra code remains scalar, extra reduction vectorized. consumer-jpeg - extra code remains scalar because of the cost. tramp3d-v4 - better vectorization because of cost fixes. 511.povray_r - better vectorization because of cost fixes. LoopInterleavingBenchmarks - extra reductions are vectorized. JM/lencod - small changes in vector code because of extract cost fixes. 453.povray - small changes in vector code because of extract cost fixes. 445.gobmk - extra small reduction vectorized. 526.blender_r - extra reduced scalars, better small reduction, small changes in the vectorization because of the fixes for the extract cost. 602.gcc_s 502.gcc_r - small changes in reduction vectorization because of the fixes in the extract cost. 631.deepsjeng_s 623.xalancbmk_s - small changes in reduction vectorization because of the fixes in the extract cost. 
MallocBench/gs - extra code remains scalar because of the extract cost. alacconvert-encode - extra code remains scalar because of the extract cost. alacconvert-decode - extra code remains scalar because of the extract cost. GCC-C-execute-20050826-2 - extra reduction gets vectorized. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: #99923
1 parent 0dd1128 commit 197f4a9

File tree

6 files changed

+79
-151
lines changed

6 files changed

+79
-151
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 29 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -16661,8 +16661,6 @@ class HorizontalReduction {
1666116661
SmallVector<SmallVector<Value *>> ReducedVals;
1666216662
/// Maps reduced value to the corresponding reduction operation.
1666316663
DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
16664-
// Use map vector to make stable output.
16665-
MapVector<Instruction *, Value *> ExtraArgs;
1666616664
WeakTrackingVH ReductionRoot;
1666716665
/// The type of reduction operation.
1666816666
RecurKind RdxKind;
@@ -16995,30 +16993,26 @@ class HorizontalReduction {
1699516993
// gather all the reduced values, sorting them by their value id.
1699616994
BasicBlock *BB = Root->getParent();
1699716995
bool IsCmpSelMinMax = isCmpSelMinMax(Root);
16998-
SmallVector<Instruction *> Worklist(1, Root);
16996+
SmallVector<std::pair<Instruction *, unsigned>> Worklist(
16997+
1, std::make_pair(Root, 0));
1699916998
// Checks if the operands of the \p TreeN instruction are also reduction
1700016999
// operations or should be treated as reduced values or an extra argument,
1700117000
// which is not part of the reduction.
1700217001
auto CheckOperands = [&](Instruction *TreeN,
17003-
SmallVectorImpl<Value *> &ExtraArgs,
1700417002
SmallVectorImpl<Value *> &PossibleReducedVals,
17005-
SmallVectorImpl<Instruction *> &ReductionOps) {
17003+
SmallVectorImpl<Instruction *> &ReductionOps,
17004+
unsigned Level) {
1700617005
for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
1700717006
getNumberOfOperands(TreeN)))) {
1700817007
Value *EdgeVal = getRdxOperand(TreeN, I);
1700917008
ReducedValsToOps[EdgeVal].push_back(TreeN);
1701017009
auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
17011-
// Edge has wrong parent - mark as an extra argument.
17012-
if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) &&
17013-
!hasSameParent(EdgeInst, BB)) {
17014-
ExtraArgs.push_back(EdgeVal);
17015-
continue;
17016-
}
1701717010
// If the edge is not an instruction, or it is different from the main
1701817011
// reduction opcode or has too many uses - possible reduced value.
1701917012
// Also, do not try to reduce const values, if the operation is not
1702017013
// foldable.
17021-
if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
17014+
if (!EdgeInst || Level > RecursionMaxDepth ||
17015+
getRdxKind(EdgeInst) != RdxKind ||
1702217016
IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
1702317017
!hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
1702417018
!isVectorizable(RdxKind, EdgeInst) ||
@@ -17042,6 +17036,7 @@ class HorizontalReduction {
1704217036
SmallSet<size_t, 2> LoadKeyUsed;
1704317037

1704417038
auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
17039+
Key = hash_combine(hash_value(LI->getParent()), Key);
1704517040
Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
1704617041
if (LoadKeyUsed.contains(Key)) {
1704717042
auto LIt = LoadsMap.find(Ptr);
@@ -17072,40 +17067,23 @@ class HorizontalReduction {
1707217067
};
1707317068

1707417069
while (!Worklist.empty()) {
17075-
Instruction *TreeN = Worklist.pop_back_val();
17076-
SmallVector<Value *> Args;
17070+
auto [TreeN, Level] = Worklist.pop_back_val();
1707717071
SmallVector<Value *> PossibleRedVals;
1707817072
SmallVector<Instruction *> PossibleReductionOps;
17079-
CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
17080-
// If too many extra args - mark the instruction itself as a reduction
17081-
// value, not a reduction operation.
17082-
if (Args.size() < 2) {
17083-
addReductionOps(TreeN);
17084-
// Add extra args.
17085-
if (!Args.empty()) {
17086-
assert(Args.size() == 1 && "Expected only single argument.");
17087-
ExtraArgs[TreeN] = Args.front();
17088-
}
17089-
// Add reduction values. The values are sorted for better vectorization
17090-
// results.
17091-
for (Value *V : PossibleRedVals) {
17092-
size_t Key, Idx;
17093-
std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
17094-
/*AllowAlternate=*/false);
17095-
++PossibleReducedVals[Key][Idx]
17096-
.insert(std::make_pair(V, 0))
17097-
.first->second;
17098-
}
17099-
Worklist.append(PossibleReductionOps.rbegin(),
17100-
PossibleReductionOps.rend());
17101-
} else {
17073+
CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
17074+
addReductionOps(TreeN);
17075+
// Add reduction values. The values are sorted for better vectorization
17076+
// results.
17077+
for (Value *V : PossibleRedVals) {
1710217078
size_t Key, Idx;
17103-
std::tie(Key, Idx) = generateKeySubkey(TreeN, &TLI, GenerateLoadsSubkey,
17079+
std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
1710417080
/*AllowAlternate=*/false);
1710517081
++PossibleReducedVals[Key][Idx]
17106-
.insert(std::make_pair(TreeN, 0))
17082+
.insert(std::make_pair(V, 0))
1710717083
.first->second;
1710817084
}
17085+
for (Instruction *I : reverse(PossibleReductionOps))
17086+
Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
1710917087
}
1711017088
auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
1711117089
// Sort values by the total number of values kinds to start the reduction
@@ -17182,18 +17160,9 @@ class HorizontalReduction {
1718217160

1718317161
// Track the reduced values in case if they are replaced by extractelement
1718417162
// because of the vectorization.
17185-
DenseMap<Value *, WeakTrackingVH> TrackedVals(
17186-
ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
17187-
BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
17163+
DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
17164+
ReducedVals.front().size());
1718817165
SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
17189-
ExternallyUsedValues.reserve(ExtraArgs.size() + 1);
17190-
// The same extra argument may be used several times, so log each attempt
17191-
// to use it.
17192-
for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
17193-
assert(Pair.first && "DebugLoc must be set.");
17194-
ExternallyUsedValues[Pair.second].push_back(Pair.first);
17195-
TrackedVals.try_emplace(Pair.second, Pair.second);
17196-
}
1719717166

1719817167
// The compare instruction of a min/max is the insertion point for new
1719917168
// instructions and may be replaced with a new compare instruction.
@@ -17228,13 +17197,9 @@ class HorizontalReduction {
1722817197
// Initialize the final value in the reduction.
1722917198
return Res;
1723017199
};
17231-
bool AnyBoolLogicOp =
17232-
any_of(ReductionOps.back(), [](Value *V) {
17233-
return isBoolLogicOp(cast<Instruction>(V));
17234-
});
17235-
// The reduction root is used as the insertion point for new instructions,
17236-
// so set it as externally used to prevent it from being deleted.
17237-
ExternallyUsedValues[ReductionRoot];
17200+
bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
17201+
return isBoolLogicOp(cast<Instruction>(V));
17202+
});
1723817203
SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
1723917204
ReductionOps.front().size());
1724017205
for (ReductionOpsType &RdxOps : ReductionOps)
@@ -17456,8 +17421,11 @@ class HorizontalReduction {
1745617421
V.reorderBottomToTop(/*IgnoreReorder=*/true);
1745717422
// Keep extracted other reduction values, if they are used in the
1745817423
// vectorization trees.
17459-
BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
17460-
ExternallyUsedValues);
17424+
BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues;
17425+
// The reduction root is used as the insertion point for new
17426+
// instructions, so set it as externally used to prevent it from being
17427+
// deleted.
17428+
LocalExternallyUsedValues[ReductionRoot];
1746117429
for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
1746217430
if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
1746317431
continue;
@@ -17504,23 +17472,6 @@ class HorizontalReduction {
1750417472
for (Value *RdxVal : VL)
1750517473
if (RequiredExtract.contains(RdxVal))
1750617474
LocalExternallyUsedValues[RdxVal];
17507-
// Update LocalExternallyUsedValues for the scalar, replaced by
17508-
// extractelement instructions.
17509-
DenseMap<Value *, Value *> ReplacementToExternal;
17510-
for (const std::pair<Value *, Value *> &Pair : ReplacedExternals)
17511-
ReplacementToExternal.try_emplace(Pair.second, Pair.first);
17512-
for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
17513-
Value *Ext = Pair.first;
17514-
auto RIt = ReplacementToExternal.find(Ext);
17515-
while (RIt != ReplacementToExternal.end()) {
17516-
Ext = RIt->second;
17517-
RIt = ReplacementToExternal.find(Ext);
17518-
}
17519-
auto *It = ExternallyUsedValues.find(Ext);
17520-
if (It == ExternallyUsedValues.end())
17521-
continue;
17522-
LocalExternallyUsedValues[Pair.second].append(It->second);
17523-
}
1752417475
V.buildExternalUses(LocalExternallyUsedValues);
1752517476

1752617477
V.computeMinimumValueSizes();
@@ -17722,11 +17673,6 @@ class HorizontalReduction {
1772217673
ExtraReductions.emplace_back(RedOp, RdxVal);
1772317674
}
1772417675
}
17725-
for (auto &Pair : ExternallyUsedValues) {
17726-
// Add each externally used value to the final reduction.
17727-
for (auto *I : Pair.second)
17728-
ExtraReductions.emplace_back(I, Pair.first);
17729-
}
1773017676
// Iterate through all not-vectorized reduction values/extra arguments.
1773117677
bool InitStep = true;
1773217678
while (ExtraReductions.size() > 1) {
@@ -17878,6 +17824,8 @@ class HorizontalReduction {
1787817824
assert(IsSupportedHorRdxIdentityOp &&
1787917825
"The optimization of matched scalar identity horizontal reductions "
1788017826
"must be supported.");
17827+
if (Cnt == 1)
17828+
return VectorizedValue;
1788117829
switch (RdxKind) {
1788217830
case RecurKind::Add: {
1788317831
// res = mul vv, n

llvm/test/Transforms/SLPVectorizer/X86/external-used-across-reductions.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,11 @@ define void @test() {
1313
; CHECK-NEXT: [[PHI1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX25:%.*]], [[LOOP]] ]
1414
; CHECK-NEXT: [[TMP6:%.*]] = phi <8 x i64> [ [[TMP0]], [[ENTRY]] ], [ [[TMP1]], [[LOOP]] ]
1515
; CHECK-NEXT: [[TMP7:%.*]] = mul <8 x i64> [[TMP6]], <i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4>
16+
; CHECK-NEXT: [[TMP5:%.*]] = mul <8 x i64> [[TMP1]], <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
1617
; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP7]])
17-
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP1]])
18-
; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP8]], 2
19-
; CHECK-NEXT: [[OP_RDX33:%.*]] = add i64 [[TMP10]], [[TMP9]]
20-
; CHECK-NEXT: [[OP_RDX25]] = add i64 [[OP_RDX33]], [[TMP3]]
18+
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
19+
; CHECK-NEXT: [[OP_RDX16:%.*]] = add i64 [[TMP9]], [[TMP8]]
20+
; CHECK-NEXT: [[OP_RDX25]] = add i64 [[OP_RDX16]], [[TMP3]]
2121
; CHECK-NEXT: br label [[LOOP]]
2222
;
2323
entry:

llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4,30 +4,24 @@
44
define i32 @foo(i32 %a) {
55
; CHECK-LABEL: @foo(
66
; CHECK-NEXT: entry:
7-
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[A:%.*]], i32 0
8-
; CHECK-NEXT: [[TMP1:%.*]] = sub nsw <2 x i32> zeroinitializer, [[TMP0]]
9-
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
10-
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
7+
; CHECK-NEXT: [[TMP0:%.*]] = sub nsw i32 0, [[A:%.*]]
8+
; CHECK-NEXT: [[LOCAL:%.*]] = sub nsw i32 0, 0
119
; CHECK-NEXT: br i1 false, label [[BB5:%.*]], label [[BB1:%.*]]
1210
; CHECK: bb1:
13-
; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i32> [[TMP1]], <i32 1, i32 3>
14-
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0
15-
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1
16-
; CHECK-NEXT: [[OP_RDX10:%.*]] = add i32 [[TMP6]], [[TMP5]]
17-
; CHECK-NEXT: [[OP_RDX11:%.*]] = add i32 [[OP_RDX10]], 0
11+
; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[LOCAL]], 3
12+
; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[TMP1]], [[TMP0]]
13+
; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[OP_RDX2]], 0
1814
; CHECK-NEXT: br label [[BB3:%.*]]
1915
; CHECK: bb2:
2016
; CHECK-NEXT: br label [[BB3]]
2117
; CHECK: bb3:
22-
; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OP_RDX11]], [[BB1]] ], [ 0, [[BB2:%.*]] ]
18+
; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OP_RDX3]], [[BB1]] ], [ 0, [[BB2:%.*]] ]
2319
; CHECK-NEXT: ret i32 0
2420
; CHECK: bb4:
25-
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
26-
; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP7]], [[TMP2]]
27-
; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
28-
; CHECK-NEXT: [[OP_RDX8:%.*]] = add i32 [[TMP9]], 0
29-
; CHECK-NEXT: [[OP_RDX9:%.*]] = add i32 [[OP_RDX8]], [[TMP3]]
30-
; CHECK-NEXT: ret i32 [[OP_RDX9]]
21+
; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[LOCAL]], 8
22+
; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP2]], [[TMP0]]
23+
; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], 0
24+
; CHECK-NEXT: ret i32 [[OP_RDX1]]
3125
; CHECK: bb5:
3226
; CHECK-NEXT: br label [[BB4:%.*]]
3327
;

0 commit comments

Comments
 (0)