Skip to content

Commit d7dd31e

Browse files
committed
[SLP] Better analysis of repeated instructions during operand reordering
When analyzing repeated instructions, it is better to treat the reordering as non-profitable if the number of unique instructions is not a power of 2. In that case, it is better to keep a power-of-2 number of elements, as this allows better vectorization. Fixes #109725
1 parent b9bd8ca commit d7dd31e

File tree

2 files changed

+45
-39
lines changed

2 files changed

+45
-39
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 34 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1930,30 +1930,38 @@ class BoUpSLP {
19301930
/// elements in the lane, it will be vectorized with higher probability
19311931
/// after removing duplicates. Currently the SLP vectorizer supports only
19321932
/// vectorization of the power-of-2 number of unique scalars.
1933-
int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1933+
int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
1934+
const SmallBitVector &UsedLanes) const {
19341935
Value *IdxLaneV = getData(Idx, Lane).V;
1935-
if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1936+
if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
1937+
isa<ExtractElementInst>(IdxLaneV))
19361938
return 0;
1937-
SmallPtrSet<Value *, 4> Uniques;
1938-
for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1939+
SmallDenseMap<Value *, unsigned, 4> Uniques;
1940+
for (unsigned Ln : seq<unsigned>(getNumLanes())) {
19391941
if (Ln == Lane)
19401942
continue;
19411943
Value *OpIdxLnV = getData(OpIdx, Ln).V;
19421944
if (!isa<Instruction>(OpIdxLnV))
19431945
return 0;
1944-
Uniques.insert(OpIdxLnV);
1946+
Uniques.try_emplace(OpIdxLnV, Ln);
19451947
}
1946-
int UniquesCount = Uniques.size();
1947-
int UniquesCntWithIdxLaneV =
1948-
Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1948+
unsigned UniquesCount = Uniques.size();
1949+
auto IdxIt = Uniques.find(IdxLaneV);
1950+
unsigned UniquesCntWithIdxLaneV =
1951+
IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
19491952
Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1950-
int UniquesCntWithOpIdxLaneV =
1951-
Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1953+
auto OpIdxIt = Uniques.find(OpIdxLaneV);
1954+
unsigned UniquesCntWithOpIdxLaneV =
1955+
OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
19521956
if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
19531957
return 0;
1954-
return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
1955-
UniquesCntWithOpIdxLaneV) -
1956-
(PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1958+
return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
1959+
UniquesCntWithOpIdxLaneV,
1960+
UniquesCntWithOpIdxLaneV -
1961+
bit_floor(UniquesCntWithOpIdxLaneV)) -
1962+
((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
1963+
? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
1964+
: bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
19571965
}
19581966

19591967
/// \param Lane lane of the operands under analysis.
@@ -1993,7 +2001,7 @@ class BoUpSLP {
19932001
/// predecessors.
19942002
int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
19952003
int Lane, unsigned OpIdx, unsigned Idx,
1996-
bool &IsUsed) {
2004+
bool &IsUsed, const SmallBitVector &UsedLanes) {
19972005
LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
19982006
LookAheadMaxDepth);
19992007
// Keep track of the instruction stack as we recurse into the operands
@@ -2002,11 +2010,10 @@ class BoUpSLP {
20022010
LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
20032011
/*CurrLevel=*/1, MainAltOps);
20042012
if (Score) {
2005-
int SplatScore = getSplatScore(Lane, OpIdx, Idx);
2013+
int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
20062014
if (Score <= -SplatScore) {
2007-
// Set the minimum score for splat-like sequence to avoid setting
2008-
// failed state.
2009-
Score = 1;
2015+
// Failed score.
2016+
Score = 0;
20102017
} else {
20112018
Score += SplatScore;
20122019
// Scale score to see the difference between different operands
@@ -2036,7 +2043,8 @@ class BoUpSLP {
20362043
std::optional<unsigned>
20372044
getBestOperand(unsigned OpIdx, int Lane, int LastLane,
20382045
ArrayRef<ReorderingMode> ReorderingModes,
2039-
ArrayRef<Value *> MainAltOps) {
2046+
ArrayRef<Value *> MainAltOps,
2047+
const SmallBitVector &UsedLanes) {
20402048
unsigned NumOperands = getNumOperands();
20412049

20422050
// The operand of the previous lane at OpIdx.
@@ -2092,7 +2100,7 @@ class BoUpSLP {
20922100
Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
20932101
Value *OpRight = (LeftToRight) ? Op : OpLastLane;
20942102
int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2095-
OpIdx, Idx, IsUsed);
2103+
OpIdx, Idx, IsUsed, UsedLanes);
20962104
if (Score > static_cast<int>(BestOp.Score) ||
20972105
(Score > 0 && Score == static_cast<int>(BestOp.Score) &&
20982106
Idx == OpIdx)) {
@@ -2507,20 +2515,24 @@ class BoUpSLP {
25072515
for (unsigned I = 0; I < NumOperands; ++I)
25082516
MainAltOps[I].push_back(getData(I, FirstLane).V);
25092517

2518+
SmallBitVector UsedLanes(NumLanes);
2519+
UsedLanes.set(FirstLane);
25102520
for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
25112521
// Visit the lane on the right and then the lane on the left.
25122522
for (int Direction : {+1, -1}) {
25132523
int Lane = FirstLane + Direction * Distance;
25142524
if (Lane < 0 || Lane >= (int)NumLanes)
25152525
continue;
2526+
UsedLanes.set(Lane);
25162527
int LastLane = Lane - Direction;
25172528
assert(LastLane >= 0 && LastLane < (int)NumLanes &&
25182529
"Out of bounds");
25192530
// Look for a good match for each operand.
25202531
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
25212532
// Search for the operand that matches SortedOps[OpIdx][Lane-1].
2522-
std::optional<unsigned> BestIdx = getBestOperand(
2523-
OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2533+
std::optional<unsigned> BestIdx =
2534+
getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
2535+
MainAltOps[OpIdx], UsedLanes);
25242536
// By not selecting a value, we allow the operands that follow to
25252537
// select a better matching value. We will get a non-null value in
25262538
// the next run of getBestOperand().

llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll

Lines changed: 11 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6,29 +6,23 @@ define i32 @a() {
66
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
77
; CHECK-NEXT: br label %[[BB1:.*]]
88
; CHECK: [[BB1]]:
9-
; CHECK-NEXT: [[TMP2:%.*]] = phi i8 [ 0, [[TMP0:%.*]] ], [ [[TMP7:%.*]], %[[BB1]] ]
10-
; CHECK-NEXT: [[TMP3:%.*]] = phi i8 [ 0, [[TMP0]] ], [ [[TMP8:%.*]], %[[BB1]] ]
11-
; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ zeroinitializer, [[TMP0]] ], [ [[TMP6:%.*]], %[[BB1]] ]
9+
; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i8> [ zeroinitializer, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[BB1]] ]
10+
; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i8> [ zeroinitializer, [[TMP0]] ], [ [[TMP17:%.*]], %[[BB1]] ]
1211
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
1312
; CHECK-NEXT: [[TMP6]] = load <4 x i8>, ptr null, align 4
14-
; CHECK-NEXT: [[TMP7]] = extractelement <4 x i8> [[TMP6]], i32 3
15-
; CHECK-NEXT: [[TMP8]] = extractelement <4 x i8> [[TMP6]], i32 2
16-
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i8> [[TMP6]], i32 0
17-
; CHECK-NEXT: [[TMP10:%.*]] = xor i8 [[TMP9]], [[TMP3]]
18-
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i8> [[TMP6]], i32 1
19-
; CHECK-NEXT: [[TMP12:%.*]] = xor i8 [[TMP11]], [[TMP2]]
20-
; CHECK-NEXT: [[TMP13:%.*]] = xor i8 [[TMP8]], [[TMP9]]
21-
; CHECK-NEXT: [[TMP14:%.*]] = xor i8 [[TMP7]], [[TMP11]]
22-
; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <8 x i32> <i32 poison, i32 0, i32 poison, i32 1, i32 poison, i32 2, i32 0, i32 3>
23-
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x i8> [[TMP15]], i8 [[TMP10]], i32 0
24-
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x i8> [[TMP16]], i8 [[TMP12]], i32 2
25-
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x i8> [[TMP17]], i8 [[TMP13]], i32 4
13+
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
14+
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
15+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP7]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
16+
; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i8> [[TMP6]], [[TMP8]]
17+
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <8 x i32> <i32 poison, i32 0, i32 poison, i32 1, i32 poison, i32 2, i32 poison, i32 3>
18+
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <8 x i32> <i32 0, i32 poison, i32 1, i32 poison, i32 2, i32 poison, i32 3, i32 poison>
19+
; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i8> [[TMP10]], <8 x i8> [[TMP11]], <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
2620
; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
27-
; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i8> [[TMP19]], <8 x i8> [[TMP18]], <8 x i32> <i32 1, i32 3, i32 2, i32 9, i32 3, i32 11, i32 poison, i32 13>
28-
; CHECK-NEXT: [[TMP21:%.*]] = insertelement <8 x i8> [[TMP20]], i8 [[TMP14]], i32 6
21+
; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x i8> [[TMP19]], <8 x i8> [[TMP18]], <8 x i32> <i32 1, i32 3, i32 2, i32 9, i32 3, i32 11, i32 9, i32 13>
2922
; CHECK-NEXT: [[TMP22:%.*]] = xor <8 x i8> [[TMP18]], [[TMP21]]
3023
; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i8> [[TMP22]], [[TMP5]]
3124
; CHECK-NEXT: store <8 x i8> [[TMP23]], ptr null, align 4
25+
; CHECK-NEXT: [[TMP17]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <2 x i32> <i32 2, i32 3>
3226
; CHECK-NEXT: br label %[[BB1]]
3327
;
3428
br label %1

0 commit comments

Comments
 (0)