Skip to content

[SLP]Keep externally used GEPs as GEPs, if possible instead of extractelement. #88877

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 49 additions & 1 deletion llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1134,6 +1134,7 @@ class BoUpSLP {
MustGather.clear();
EntryToLastInstruction.clear();
ExternalUses.clear();
ExternalUsesAsGEPs.clear();
for (auto &Iter : BlocksSchedules) {
BlockScheduling *BS = Iter.second.get();
BS->clear();
Expand Down Expand Up @@ -3154,6 +3155,10 @@ class BoUpSLP {
/// after vectorization.
UserList ExternalUses;

/// A list of GEPs which can be reaplced by scalar GEPs instead of
/// extractelement instructions.
SmallPtrSet<Value *, 4> ExternalUsesAsGEPs;

/// Values used only by @llvm.assume calls.
SmallPtrSet<const Value *, 32> EphValues;

Expand Down Expand Up @@ -5541,6 +5546,7 @@ void BoUpSLP::buildExternalUses(
<< FoundLane << " from " << *Scalar << ".\n");
ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
continue;
}
for (User *U : Scalar->users()) {
LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
Expand Down Expand Up @@ -9925,6 +9931,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
SmallVector<APInt> DemandedElts;
SmallDenseSet<Value *, 4> UsedInserts;
DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
Expand Down Expand Up @@ -10033,12 +10040,40 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
}
}
}
// Leave the GEPs as is, they are free in most cases and better to keep them
// as GEPs.
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why did you need to move this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To use it in TTI->getInstructionCost for adding an extra cost for the scalar GEP, which we're going to keep

if (auto *GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
if (!ValueToExtUses) {
ValueToExtUses.emplace();
for_each(enumerate(ExternalUses), [&](const auto &P) {
ValueToExtUses->try_emplace(P.value().Scalar, P.index());
});
}
// Can use original GEP, if no operands vectorized or they are marked as
// externally used already.
bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) {
if (!getTreeEntry(V))
return true;
auto It = ValueToExtUses->find(V);
if (It != ValueToExtUses->end()) {
// Replace all uses to avoid compiler crash.
ExternalUses[It->second].User = nullptr;
return true;
}
return false;
});
if (CanBeUsedAsGEP) {
ExtractCost += TTI->getInstructionCost(GEP, CostKind);
ExternalUsesAsGEPs.insert(EU.Scalar);
continue;
}
}

// If we plan to rewrite the tree in a smaller type, we will need to sign
// extend the extracted value back to the original type. Here, we account
// for the extract and the added cost of the sign extend if needed.
auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
auto It = MinBWs.find(getTreeEntry(EU.Scalar));
if (It != MinBWs.end()) {
auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
Expand Down Expand Up @@ -13161,6 +13196,8 @@ Value *BoUpSLP::vectorizeTree(
if (Scalar->getType() != Vec->getType()) {
Value *Ex = nullptr;
Value *ExV = nullptr;
auto *GEP = dyn_cast<GetElementPtrInst>(Scalar);
bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP);
auto It = ScalarToEEs.find(Scalar);
if (It != ScalarToEEs.end()) {
// No need to emit many extracts, just move the only one in the
Expand All @@ -13186,6 +13223,15 @@ Value *BoUpSLP::vectorizeTree(
if (const TreeEntry *ETE = getTreeEntry(V))
V = ETE->VectorizedValue;
Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
} else if (ReplaceGEP) {
// Leave the GEPs as is, they are free in most cases and better to
// keep them as GEPs.
auto *CloneGEP = GEP->clone();
CloneGEP->insertBefore(*Builder.GetInsertBlock(),
Builder.GetInsertPoint());
if (GEP->hasName())
CloneGEP->takeName(GEP);
Ex = CloneGEP;
} else {
Ex = Builder.CreateExtractElement(Vec, Lane);
}
Expand Down Expand Up @@ -13224,6 +13270,8 @@ Value *BoUpSLP::vectorizeTree(
assert((ExternallyUsedValues.count(Scalar) ||
any_of(Scalar->users(),
[&](llvm::User *U) {
if (ExternalUsesAsGEPs.contains(U))
return true;
TreeEntry *UseEntry = getTreeEntry(U);
return UseEntry &&
(UseEntry->State == TreeEntry::Vectorize ||
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ define i32 @fn1() {
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> <i64 11, i64 56>
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 11
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64>
; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[TMP4]], align 8
; CHECK-NEXT: ret i32 undef
Expand Down Expand Up @@ -92,7 +92,7 @@ define void @externally_used_ptrs() {
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> <i64 56, i64 11>
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 11
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64>
; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP5]], [[TMP6]]
Expand Down
17 changes: 9 additions & 8 deletions llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll
Original file line number Diff line number Diff line change
Expand Up @@ -13,33 +13,34 @@ define dso_local i32 @g() local_unnamed_addr {
; CHECK: while.body:
; CHECK-NEXT: [[C_022:%.*]] = phi ptr [ [[C_022_BE:%.*]], [[WHILE_BODY_BACKEDGE:%.*]] ], [ undef, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x ptr> [ [[TMP14:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ]
; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 1
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[C_022]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP9]] to i64
; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 1
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> <i64 1, i64 1>
; CHECK-NEXT: switch i32 [[TMP3]], label [[WHILE_BODY_BACKEDGE]] [
; CHECK-NEXT: i32 2, label [[SW_BB:%.*]]
; CHECK-NEXT: i32 4, label [[SW_BB6:%.*]]
; CHECK-NEXT: i32 2, label [[SW_BB:%.*]]
; CHECK-NEXT: i32 4, label [[SW_BB6:%.*]]
; CHECK-NEXT: ]
; CHECK: sw.bb:
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> <i64 2, i64 2>
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP4]], i32 1
; CHECK-NEXT: store i32 [[TMP7]], ptr [[TMP9]], align 4
; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 2
; CHECK-NEXT: store i32 [[TMP7]], ptr [[INCDEC_PTR1]], align 4
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> <i64 2, i64 2>
; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]]
; CHECK: sw.bb6:
; CHECK-NEXT: [[INCDEC_PTR8:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 2
; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 1
; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[INCDEC_PTR]] to i64
; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> <i64 2, i64 2>
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x ptr> [[TMP4]], i32 0
; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP13]], align 4
; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]]
; CHECK: while.body.backedge:
; CHECK-NEXT: [[C_022_BE]] = phi ptr [ [[INCDEC_PTR]], [[WHILE_BODY]] ], [ [[INCDEC_PTR8]], [[SW_BB6]] ], [ [[INCDEC_PTR5]], [[SW_BB]] ]
; CHECK-NEXT: [[C_022_BE]] = phi ptr [ [[INCDEC_PTR1]], [[WHILE_BODY]] ], [ [[INCDEC_PTR8]], [[SW_BB6]] ], [ [[INCDEC_PTR5]], [[SW_BB]] ]
; CHECK-NEXT: [[TMP14]] = phi <2 x ptr> [ [[TMP4]], [[WHILE_BODY]] ], [ [[TMP12]], [[SW_BB6]] ], [ [[TMP8]], [[SW_BB]] ]
; CHECK-NEXT: br label [[WHILE_BODY]]
; CHECK: while.end:
Expand Down
19 changes: 8 additions & 11 deletions llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll
Original file line number Diff line number Diff line change
Expand Up @@ -52,17 +52,14 @@ define void @test(ptr %r, ptr %p, ptr %q) #0 {

define void @test2(ptr %a, ptr %b) {
; CHECK-LABEL: @test2(
; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 1
; CHECK-NEXT: [[A2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 2
; CHECK-NEXT: [[I1:%.*]] = ptrtoint ptr [[A1]] to i64
; CHECK-NEXT: [[B3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 3
; CHECK-NEXT: [[I2:%.*]] = ptrtoint ptr [[B3]] to i64
; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A1]], align 8
; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 8
; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[I1]], [[V1]]
; CHECK-NEXT: [[ADD2:%.*]] = add i64 [[I2]], [[V2]]
; CHECK-NEXT: store i64 [[ADD1]], ptr [[A1]], align 8
; CHECK-NEXT: store i64 [[ADD2]], ptr [[A2]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[A:%.*]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[B:%.*]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> <i64 1, i64 3>
; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 1
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr [[A1]], align 8
; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]]
; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[A1]], align 8
; CHECK-NEXT: ret void
;
%a1 = getelementptr inbounds i64, ptr %a, i64 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ define void @"foo"(ptr addrspace(1) %0, ptr addrspace(1) %1) #0 {
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) [[TMP0:%.*]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x ptr addrspace(1)> [[TMP3]], <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, <4 x ptr addrspace(1)> [[TMP4]], <4 x i64> <i64 8, i64 12, i64 28, i64 24>
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr addrspace(1)> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 8
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8
; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p1(<4 x ptr addrspace(1)> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> poison)
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <8 x i32> <i32 0, i32 3, i32 0, i32 3, i32 2, i32 1, i32 2, i32 1>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ define void @allocas(ptr %a, ptr %b, ptr %c) {
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[V1]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[V2]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i32> <i32 1, i32 1>
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[V1]], i32 1
; CHECK-NEXT: store ptr [[TMP4]], ptr [[A:%.*]], align 8
; CHECK-NEXT: store <2 x ptr> [[TMP3]], ptr [[B:%.*]], align 8
; CHECK-NEXT: ret void
Expand Down Expand Up @@ -127,7 +127,7 @@ define void @stacksave2(ptr %a, ptr %b, ptr %c) {
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[V1]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[V2]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i32> <i32 1, i32 1>
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[V1]], i32 1
; CHECK-NEXT: store ptr [[TMP4]], ptr [[A:%.*]], align 8
; CHECK-NEXT: call void @use(ptr inalloca(i8) [[V2]]) #[[ATTR5:[0-9]+]]
; CHECK-NEXT: call void @llvm.stackrestore.p0(ptr [[STACK]])
Expand Down
Loading