Skip to content

Commit c7657cf

Browse files
[SLP]Keep externally used GEPs as GEPs, if possible instead of extractelement.
If the vectorized GEP instruction can still be kept as a scalar GEP, it is better to keep it as a scalar GEP instead of emitting an extractelement. In many cases this is more profitable.

Metric: size..text

Program size..text results results0 diff
test-suite :: SingleSource/Benchmarks/Misc/oourafft.test 18911.00 19695.00 4.1%
test-suite :: SingleSource/Benchmarks/Misc-C++-EH/spirit.test 59987.00 60707.00 1.2%
test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 1392209.00 1392753.00 0.0%
test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 1392209.00 1392753.00 0.0%
test-suite :: External/SPEC/CINT2006/400.perlbench/400.perlbench.test 1087996.00 1088236.00 0.0%
test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 309310.00 309342.00 0.0%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 664661.00 664693.00 0.0%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 664661.00 664693.00 0.0%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12354636.00 12354908.00 0.0%
test-suite :: External/SPEC/CFP2006/453.povray/453.povray.test 1152748.00 1152716.00 -0.0%
test-suite :: MultiSource/Applications/oggenc/oggenc.test 191787.00 191771.00 -0.0%
test-suite :: SingleSource/UnitTests/matrix-types-spec.test 480796.00 480476.00 -0.1%

Misc/oourafft - Extra code gets vectorized
Misc-C++-EH/spirit - same
CFP2017speed/638.imagick_s
CFP2017rate/538.imagick_r - same, extra code gets vectorized
CINT2006/400.perlbench - some extra 4 x ptr stores vectorized
Bullet/bullet - extra 4 x ptr store vectorized
CINT2017rate/525.x264_r
CINT2017speed/625.x264_s - same
CFP2017rate/526.blender_r - extra 8 x float stores (several), some extra 4 x ptr stores
CFP2006/453.povray - 2 x double loads/stores replaced by 4 x double loads/stores
Applications/oggenc - extra code is vectorized
UnitTests/matrix-types-spec - extra code gets vectorized

Reviewers: RKSimon
Reviewed By: RKSimon
Pull Request: #88877
1 parent 5422eb0 commit c7657cf

File tree

6 files changed

+71
-25
lines changed

6 files changed

+71
-25
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1134,6 +1134,7 @@ class BoUpSLP {
11341134
MustGather.clear();
11351135
EntryToLastInstruction.clear();
11361136
ExternalUses.clear();
1137+
ExternalUsesAsGEPs.clear();
11371138
for (auto &Iter : BlocksSchedules) {
11381139
BlockScheduling *BS = Iter.second.get();
11391140
BS->clear();
@@ -3154,6 +3155,10 @@ class BoUpSLP {
31543155
/// after vectorization.
31553156
UserList ExternalUses;
31563157

3158+
/// A list of GEPs which can be replaced by scalar GEPs instead of
3159+
/// extractelement instructions.
3160+
SmallPtrSet<Value *, 4> ExternalUsesAsGEPs;
3161+
31573162
/// Values used only by @llvm.assume calls.
31583163
SmallPtrSet<const Value *, 32> EphValues;
31593164

@@ -5541,6 +5546,7 @@ void BoUpSLP::buildExternalUses(
55415546
<< FoundLane << " from " << *Scalar << ".\n");
55425547
ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
55435548
ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
5549+
continue;
55445550
}
55455551
for (User *U : Scalar->users()) {
55465552
LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
@@ -9925,6 +9931,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
99259931
SmallVector<APInt> DemandedElts;
99269932
SmallDenseSet<Value *, 4> UsedInserts;
99279933
DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
9934+
std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
99289935
for (ExternalUser &EU : ExternalUses) {
99299936
// We only add extract cost once for the same scalar.
99309937
if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
@@ -10033,12 +10040,40 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1003310040
}
1003410041
}
1003510042
}
10043+
// Leave the GEPs as is, they are free in most cases and better to keep them
10044+
// as GEPs.
10045+
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10046+
if (auto *GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
10047+
if (!ValueToExtUses) {
10048+
ValueToExtUses.emplace();
10049+
for_each(enumerate(ExternalUses), [&](const auto &P) {
10050+
ValueToExtUses->try_emplace(P.value().Scalar, P.index());
10051+
});
10052+
}
10053+
// Can use original GEP, if no operands vectorized or they are marked as
10054+
// externally used already.
10055+
bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) {
10056+
if (!getTreeEntry(V))
10057+
return true;
10058+
auto It = ValueToExtUses->find(V);
10059+
if (It != ValueToExtUses->end()) {
10060+
// Replace all uses to avoid compiler crash.
10061+
ExternalUses[It->second].User = nullptr;
10062+
return true;
10063+
}
10064+
return false;
10065+
});
10066+
if (CanBeUsedAsGEP) {
10067+
ExtractCost += TTI->getInstructionCost(GEP, CostKind);
10068+
ExternalUsesAsGEPs.insert(EU.Scalar);
10069+
continue;
10070+
}
10071+
}
1003610072

1003710073
// If we plan to rewrite the tree in a smaller type, we will need to sign
1003810074
// extend the extracted value back to the original type. Here, we account
1003910075
// for the extract and the added cost of the sign extend if needed.
1004010076
auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
10041-
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1004210077
auto It = MinBWs.find(getTreeEntry(EU.Scalar));
1004310078
if (It != MinBWs.end()) {
1004410079
auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
@@ -13161,6 +13196,8 @@ Value *BoUpSLP::vectorizeTree(
1316113196
if (Scalar->getType() != Vec->getType()) {
1316213197
Value *Ex = nullptr;
1316313198
Value *ExV = nullptr;
13199+
auto *GEP = dyn_cast<GetElementPtrInst>(Scalar);
13200+
bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP);
1316413201
auto It = ScalarToEEs.find(Scalar);
1316513202
if (It != ScalarToEEs.end()) {
1316613203
// No need to emit many extracts, just move the only one in the
@@ -13186,6 +13223,15 @@ Value *BoUpSLP::vectorizeTree(
1318613223
if (const TreeEntry *ETE = getTreeEntry(V))
1318713224
V = ETE->VectorizedValue;
1318813225
Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
13226+
} else if (ReplaceGEP) {
13227+
// Leave the GEPs as is, they are free in most cases and better to
13228+
// keep them as GEPs.
13229+
auto *CloneGEP = GEP->clone();
13230+
CloneGEP->insertBefore(*Builder.GetInsertBlock(),
13231+
Builder.GetInsertPoint());
13232+
if (GEP->hasName())
13233+
CloneGEP->takeName(GEP);
13234+
Ex = CloneGEP;
1318913235
} else {
1319013236
Ex = Builder.CreateExtractElement(Vec, Lane);
1319113237
}
@@ -13224,6 +13270,8 @@ Value *BoUpSLP::vectorizeTree(
1322413270
assert((ExternallyUsedValues.count(Scalar) ||
1322513271
any_of(Scalar->users(),
1322613272
[&](llvm::User *U) {
13273+
if (ExternalUsesAsGEPs.contains(U))
13274+
return true;
1322713275
TreeEntry *UseEntry = getTreeEntry(U);
1322813276
return UseEntry &&
1322913277
(UseEntry->State == TreeEntry::Vectorize ||

llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ define i32 @fn1() {
1313
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP0]], i32 0
1414
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
1515
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> <i64 11, i64 56>
16-
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0
16+
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 11
1717
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64>
1818
; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[TMP4]], align 8
1919
; CHECK-NEXT: ret i32 undef
@@ -92,7 +92,7 @@ define void @externally_used_ptrs() {
9292
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP0]], i32 0
9393
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
9494
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> <i64 56, i64 11>
95-
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 1
95+
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 11
9696
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64>
9797
; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
9898
; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP5]], [[TMP6]]

llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,33 +13,34 @@ define dso_local i32 @g() local_unnamed_addr {
1313
; CHECK: while.body:
1414
; CHECK-NEXT: [[C_022:%.*]] = phi ptr [ [[C_022_BE:%.*]], [[WHILE_BODY_BACKEDGE:%.*]] ], [ undef, [[ENTRY:%.*]] ]
1515
; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x ptr> [ [[TMP14:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ]
16-
; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 1
17-
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[C_022]] to i64
16+
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
17+
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP9]] to i64
1818
; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
19+
; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 1
1920
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> <i64 1, i64 1>
2021
; CHECK-NEXT: switch i32 [[TMP3]], label [[WHILE_BODY_BACKEDGE]] [
21-
; CHECK-NEXT: i32 2, label [[SW_BB:%.*]]
22-
; CHECK-NEXT: i32 4, label [[SW_BB6:%.*]]
22+
; CHECK-NEXT: i32 2, label [[SW_BB:%.*]]
23+
; CHECK-NEXT: i32 4, label [[SW_BB6:%.*]]
2324
; CHECK-NEXT: ]
2425
; CHECK: sw.bb:
2526
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP4]], i32 0
2627
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64
2728
; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
28-
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> <i64 2, i64 2>
29-
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP4]], i32 1
30-
; CHECK-NEXT: store i32 [[TMP7]], ptr [[TMP9]], align 4
3129
; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 2
30+
; CHECK-NEXT: store i32 [[TMP7]], ptr [[INCDEC_PTR1]], align 4
31+
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> <i64 2, i64 2>
3232
; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]]
3333
; CHECK: sw.bb6:
3434
; CHECK-NEXT: [[INCDEC_PTR8:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 2
35+
; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 1
3536
; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[INCDEC_PTR]] to i64
3637
; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32
3738
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> <i64 2, i64 2>
3839
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x ptr> [[TMP4]], i32 0
3940
; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP13]], align 4
4041
; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]]
4142
; CHECK: while.body.backedge:
42-
; CHECK-NEXT: [[C_022_BE]] = phi ptr [ [[INCDEC_PTR]], [[WHILE_BODY]] ], [ [[INCDEC_PTR8]], [[SW_BB6]] ], [ [[INCDEC_PTR5]], [[SW_BB]] ]
43+
; CHECK-NEXT: [[C_022_BE]] = phi ptr [ [[INCDEC_PTR1]], [[WHILE_BODY]] ], [ [[INCDEC_PTR8]], [[SW_BB6]] ], [ [[INCDEC_PTR5]], [[SW_BB]] ]
4344
; CHECK-NEXT: [[TMP14]] = phi <2 x ptr> [ [[TMP4]], [[WHILE_BODY]] ], [ [[TMP12]], [[SW_BB6]] ], [ [[TMP8]], [[SW_BB]] ]
4445
; CHECK-NEXT: br label [[WHILE_BODY]]
4546
; CHECK: while.end:

llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -52,17 +52,14 @@ define void @test(ptr %r, ptr %p, ptr %q) #0 {
5252

5353
define void @test2(ptr %a, ptr %b) {
5454
; CHECK-LABEL: @test2(
55-
; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 1
56-
; CHECK-NEXT: [[A2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 2
57-
; CHECK-NEXT: [[I1:%.*]] = ptrtoint ptr [[A1]] to i64
58-
; CHECK-NEXT: [[B3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 3
59-
; CHECK-NEXT: [[I2:%.*]] = ptrtoint ptr [[B3]] to i64
60-
; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A1]], align 8
61-
; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 8
62-
; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[I1]], [[V1]]
63-
; CHECK-NEXT: [[ADD2:%.*]] = add i64 [[I2]], [[V2]]
64-
; CHECK-NEXT: store i64 [[ADD1]], ptr [[A1]], align 8
65-
; CHECK-NEXT: store i64 [[ADD2]], ptr [[A2]], align 8
55+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[A:%.*]], i32 0
56+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[B:%.*]], i32 1
57+
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> <i64 1, i64 3>
58+
; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 1
59+
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64>
60+
; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr [[A1]], align 8
61+
; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]]
62+
; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[A1]], align 8
6663
; CHECK-NEXT: ret void
6764
;
6865
%a1 = getelementptr inbounds i64, ptr %a, i64 1

llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ define void @"foo"(ptr addrspace(1) %0, ptr addrspace(1) %1) #0 {
99
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) [[TMP0:%.*]], i32 0
1010
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x ptr addrspace(1)> [[TMP3]], <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer
1111
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, <4 x ptr addrspace(1)> [[TMP4]], <4 x i64> <i64 8, i64 12, i64 28, i64 24>
12-
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr addrspace(1)> [[TMP5]], i32 0
12+
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 8
1313
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8
1414
; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p1(<4 x ptr addrspace(1)> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> poison)
1515
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <8 x i32> <i32 0, i32 3, i32 0, i32 3, i32 2, i32 1, i32 2, i32 1>

llvm/test/Transforms/SLPVectorizer/X86/stacksave-dependence.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ define void @allocas(ptr %a, ptr %b, ptr %c) {
3535
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[V1]], i32 0
3636
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[V2]], i32 1
3737
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i32> <i32 1, i32 1>
38-
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0
38+
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[V1]], i32 1
3939
; CHECK-NEXT: store ptr [[TMP4]], ptr [[A:%.*]], align 8
4040
; CHECK-NEXT: store <2 x ptr> [[TMP3]], ptr [[B:%.*]], align 8
4141
; CHECK-NEXT: ret void
@@ -127,7 +127,7 @@ define void @stacksave2(ptr %a, ptr %b, ptr %c) {
127127
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[V1]], i32 0
128128
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[V2]], i32 1
129129
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i32> <i32 1, i32 1>
130-
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0
130+
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[V1]], i32 1
131131
; CHECK-NEXT: store ptr [[TMP4]], ptr [[A:%.*]], align 8
132132
; CHECK-NEXT: call void @use(ptr inalloca(i8) [[V2]]) #[[ATTR5:[0-9]+]]
133133
; CHECK-NEXT: call void @llvm.stackrestore.p0(ptr [[STACK]])

0 commit comments

Comments
 (0)