[SLP]Represent externally used values as original scalars, if profitable. #100904

Merged
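
This patch generalizes an existing special case in how the SLP vectorizer handles "external uses", i.e. scalars that belong to a vectorized tree but also have users outside of it. Previously only GEPs could be left in their original scalar form (tracked in ExternalUsesAsGEPs); every other externally used scalar was fed to its outside users through an extractelement. With this change, any externally used instruction whose operands are either not vectorized or are themselves externally used can be kept as the original scalar when TTI says recomputing it is no more expensive than extracting it (the check is restricted to non-root tree entries, plus root entries of GEPs and loads), and the tracking set is renamed to ExternalUsesAsOriginalScalar. As a purely illustrative input, not taken from the PR's tests, consider a function where %a0 and %a1 form a vectorizable bundle but %a0 also has a scalar user:

define i32 @sketch(i32 %arg0, i32 %arg1, ptr %q) {
entry:
  %a0 = add i32 %arg0, 1
  %a1 = add i32 %arg1, 1
  store i32 %a0, ptr %q, align 4
  %q1 = getelementptr inbounds i32, ptr %q, i64 1
  store i32 %a1, ptr %q1, align 4
  ret i32 %a0                       ; external (scalar) use of %a0
}

Before this patch the ret would be rewired to an extractelement of lane 0 of the vectorized add; with it, the original scalar add may simply be kept (a hand-written sketch of both outcomes follows the SLPVectorizer.cpp diff below). The second test update at the bottom reflects the same effect on real output: a scalar fmul is kept for an external store float and the lane-1 extractelement disappears from the expected IR.
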
167 changes: 103 additions & 64 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1253,7 +1253,7 @@ class BoUpSLP {
NonScheduledFirst.clear();
EntryToLastInstruction.clear();
ExternalUses.clear();
ExternalUsesAsGEPs.clear();
ExternalUsesAsOriginalScalar.clear();
for (auto &Iter : BlocksSchedules) {
BlockScheduling *BS = Iter.second.get();
BS->clear();
@@ -3468,7 +3468,7 @@ class BoUpSLP {

/// A list of GEPs which can be replaced by scalar GEPs instead of
/// extractelement instructions.
SmallPtrSet<Value *, 4> ExternalUsesAsGEPs;
SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;

/// Values used only by @llvm.assume calls.
SmallPtrSet<const Value *, 32> EphValues;
@@ -10663,6 +10663,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
SmallDenseSet<Value *, 4> UsedInserts;
DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
@@ -10771,52 +10772,90 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
}
}
}
// Leave the GEPs as is, they are free in most cases and better to keep them
// as GEPs.

TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
if (auto *GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
// If we plan to rewrite the tree in a smaller type, we will need to sign
// extend the extracted value back to the original type. Here, we account
// for the extract and the added cost of the sign extend if needed.
InstructionCost ExtraCost = TTI::TCC_Free;
auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
const TreeEntry *Entry = getTreeEntry(EU.Scalar);
auto It = MinBWs.find(Entry);
if (It != MinBWs.end()) {
auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
unsigned Extend =
It->second.second ? Instruction::SExt : Instruction::ZExt;
VecTy = getWidenedType(MinTy, BundleWidth);
ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
VecTy, EU.Lane);
} else {
ExtraCost = TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
CostKind, EU.Lane);
}
// Leave the scalar instructions as is if they are cheaper than extracts.
if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
Entry->getOpcode() == Instruction::Load) {
if (!ValueToExtUses) {
ValueToExtUses.emplace();
for_each(enumerate(ExternalUses), [&](const auto &P) {
// Ignore phis in loops.
if (auto *Phi = dyn_cast_if_present<PHINode>(P.value().User)) {
auto *I = cast<Instruction>(P.value().Scalar);
const Loop *L = LI->getLoopFor(Phi->getParent());
if (L && (Phi->getParent() == I->getParent() ||
L == LI->getLoopFor(I->getParent())))
return;
}

ValueToExtUses->try_emplace(P.value().Scalar, P.index());
});
}
// Can use original GEP, if no operands vectorized or they are marked as
// externally used already.
bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) {
if (!getTreeEntry(V))
return true;
auto It = ValueToExtUses->find(V);
if (It != ValueToExtUses->end()) {
// Replace all uses to avoid compiler crash.
ExternalUses[It->second].User = nullptr;
// Can use original instruction, if no operands vectorized or they are
// marked as externally used already.
auto *Inst = cast<Instruction>(EU.Scalar);
bool CanBeUsedAsScalar = all_of(Inst->operands(), [&](Value *V) {
if (!getTreeEntry(V)) {
// Some extractelements might be not vectorized, but
// transformed into shuffle and removed from the function,
// consider it here.
if (auto *EE = dyn_cast<ExtractElementInst>(V))
return !EE->hasOneUse() || !MustGather.contains(EE);
return true;
}
return false;
return ValueToExtUses->contains(V);
});
if (CanBeUsedAsGEP) {
ExtractCost += TTI->getInstructionCost(GEP, CostKind);
ExternalUsesAsGEPs.insert(EU.Scalar);
continue;
if (CanBeUsedAsScalar) {
InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
bool KeepScalar = ScalarCost <= ExtraCost;
if (KeepScalar && ScalarCost != TTI::TCC_Free &&
ExtraCost - ScalarCost <= TTI::TCC_Basic) {
unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
return ValueToExtUses->contains(V);
});
auto It = ExtractsCount.find(Entry);
if (It != ExtractsCount.end())
ScalarUsesCount -= It->getSecond().size();
// Keep original scalar if number of externally used instructions in
// the same entry is not power of 2. It may help to do some extra
// vectorization for now.
KeepScalar = ScalarUsesCount <= 1 || !isPowerOf2_32(ScalarUsesCount);
}
if (KeepScalar) {
ExternalUsesAsOriginalScalar.insert(EU.Scalar);
for_each(Inst->operands(), [&](Value *V) {
auto It = ValueToExtUses->find(V);
if (It != ValueToExtUses->end()) {
// Replace all uses to avoid compiler crash.
ExternalUses[It->second].User = nullptr;
}
});
ExtraCost = ScalarCost;
ExtractsCount[Entry].insert(Inst);
}
}
}

// If we plan to rewrite the tree in a smaller type, we will need to sign
// extend the extracted value back to the original type. Here, we account
// for the extract and the added cost of the sign extend if needed.
auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
auto It = MinBWs.find(getTreeEntry(EU.Scalar));
if (It != MinBWs.end()) {
auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
unsigned Extend =
It->second.second ? Instruction::SExt : Instruction::ZExt;
VecTy = getWidenedType(MinTy, BundleWidth);
ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
VecTy, EU.Lane);
} else {
ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
CostKind, EU.Lane);
}
ExtractCost += ExtraCost;
}
// Add reduced value cost, if resized.
if (!VectorizedVals.empty()) {
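
The hunk above carries the new profitability check. Stripped of the surrounding bookkeeping (ValueToExtUses, ExtractsCount), the decision amounts to the following standalone C++ sketch; it is not the PR's code, ScalarCost/ExtractCost/NumExternalScalarsInEntry are illustrative stand-ins for the values the patch obtains from TTI and from counting externally used scalars per tree entry, and TCC_Free/TCC_Basic are written as 0 and 1:

#include <cstdint>

// Minimal sketch of the keep-scalar decision made in getTreeCost() above;
// the real code works on llvm::InstructionCost and TTI cost-kind queries.
static bool isPowerOfTwo(uint32_t V) { return V != 0 && (V & (V - 1)) == 0; }

bool keepOriginalScalar(int64_t ScalarCost, int64_t ExtractCost,
                        uint32_t NumExternalScalarsInEntry) {
  // Keep the scalar only if recomputing it is no more expensive than
  // extracting it from the vectorized value.
  bool KeepScalar = ScalarCost <= ExtractCost;
  // If the scalar is not free and wins only marginally (by at most one basic
  // instruction), mirror the patch's comment: keep the original scalar only
  // when the number of externally used scalars in the entry is not a power
  // of two, which "may help to do some extra vectorization for now".
  if (KeepScalar && ScalarCost != 0 && ExtractCost - ScalarCost <= 1)
    KeepScalar = NumExternalScalarsInEntry <= 1 ||
                 !isPowerOfTwo(NumExternalScalarsInEntry);
  return KeepScalar;
}

In the patch itself, a scalar that wins this comparison is recorded in ExternalUsesAsOriginalScalar, the pending extracts of its operands are cancelled (their ExternalUses entries get a null user), and the scalar cost is charged in place of the extract cost.
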
@@ -14067,8 +14106,7 @@ Value *BoUpSLP::vectorizeTree(
DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
// Maps extract Scalar to the corresponding extractelement instruction in the
// basic block. Only one extractelement per block should be emitted.
DenseMap<Value *,
DenseMap<BasicBlock *, std::pair<Instruction *, Instruction *>>>
DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
ScalarToEEs;
SmallDenseSet<Value *, 4> UsedInserts;
DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
Expand Down Expand Up @@ -14098,30 +14136,41 @@ Value *BoUpSLP::vectorizeTree(
if (Scalar->getType() != Vec->getType()) {
Value *Ex = nullptr;
Value *ExV = nullptr;
auto *GEP = dyn_cast<GetElementPtrInst>(Scalar);
bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP);
auto *Inst = dyn_cast<Instruction>(Scalar);
bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
auto It = ScalarToEEs.find(Scalar);
if (It != ScalarToEEs.end()) {
// No need to emit many extracts, just move the only one in the
// current block.
auto EEIt = It->second.find(Builder.GetInsertBlock());
auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
: Builder.GetInsertBlock());
if (EEIt != It->second.end()) {
Instruction *I = EEIt->second.first;
if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
Value *PrevV = EEIt->second.first;
if (auto *I = dyn_cast<Instruction>(PrevV);
I && !ReplaceInst &&
Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
Builder.GetInsertPoint()->comesBefore(I)) {
I->moveBefore(*Builder.GetInsertPoint()->getParent(),
Builder.GetInsertPoint());
if (auto *CI = EEIt->second.second)
if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
CI->moveAfter(I);
}
Ex = I;
Ex = PrevV;
ExV = EEIt->second.second ? EEIt->second.second : Ex;
}
}
if (!Ex) {
// "Reuse" the existing extract to improve final codegen.
if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
ES && isa<Instruction>(Vec)) {
if (ReplaceInst) {
// Leave the instruction as is, if it cheaper extracts and all
// operands are scalar.
auto *CloneInst = Inst->clone();
CloneInst->insertBefore(Inst);
if (Inst->hasName())
CloneInst->takeName(Inst);
Ex = CloneInst;
} else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
ES && isa<Instruction>(Vec)) {
Value *V = ES->getVectorOperand();
auto *IVec = cast<Instruction>(Vec);
if (const TreeEntry *ETE = getTreeEntry(V))
@@ -14132,18 +14181,6 @@ Value *BoUpSLP::vectorizeTree(
Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
else
Ex = Builder.CreateExtractElement(Vec, Lane);
} else if (ReplaceGEP) {
// Leave the GEPs as is, they are free in most cases and better to
// keep them as GEPs.
auto *CloneGEP = GEP->clone();
if (isa<Instruction>(Vec))
CloneGEP->insertBefore(*Builder.GetInsertBlock(),
Builder.GetInsertPoint());
else
CloneGEP->insertBefore(GEP);
if (GEP->hasName())
CloneGEP->takeName(GEP);
Ex = CloneGEP;
} else if (auto *VecTy =
dyn_cast<FixedVectorType>(Scalar->getType())) {
assert(SLPReVec && "FixedVectorType is not expected.");
Expand All @@ -14164,14 +14201,15 @@ Value *BoUpSLP::vectorizeTree(
if (Scalar->getType() != Ex->getType())
ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
MinBWs.find(E)->second.second);
if (auto *I = dyn_cast<Instruction>(Ex))
ScalarToEEs[Scalar].try_emplace(
Builder.GetInsertBlock(),
std::make_pair(I, cast<Instruction>(ExV)));
auto *I = dyn_cast<Instruction>(Ex);
ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
: &F->getEntryBlock(),
std::make_pair(Ex, ExV));
}
// The then branch of the previous if may produce constants, since 0
// operand might be a constant.
if (auto *ExI = dyn_cast<Instruction>(Ex)) {
if (auto *ExI = dyn_cast<Instruction>(Ex);
ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
GatherShuffleExtractSeq.insert(ExI);
CSEBlocks.insert(ExI->getParent());
}
@@ -14192,9 +14230,10 @@ Value *BoUpSLP::vectorizeTree(
continue;
assert((ExternallyUsedValues.count(Scalar) ||
Scalar->hasNUsesOrMore(UsesLimit) ||
ExternalUsesAsOriginalScalar.contains(Scalar) ||
any_of(Scalar->users(),
[&](llvm::User *U) {
if (ExternalUsesAsGEPs.contains(U))
if (ExternalUsesAsOriginalScalar.contains(U))
return true;
TreeEntry *UseEntry = getTreeEntry(U);
return UseEntry &&
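
The vectorizeTree() hunks above implement the codegen side of the change: for a scalar recorded in ExternalUsesAsOriginalScalar no extractelement is emitted; the instruction is cloned right before the original, takes over its name, and the external users are redirected to the clone. The ScalarToEEs cache is widened from Instruction pairs to Value pairs and keyed by the replacement's parent block (the entry block for constants), so such clones and folded constants can be reused like ordinary extracts. For the illustrative @sketch function near the top, the two possible outputs look roughly as follows; this is a hand-written sketch rather than actual SLP output, and it assumes the cost model keeps the scalar add in the second case:

; Old behavior: the external user reads lane 0 of the vectorized add.
define i32 @sketch_extract(i32 %arg0, i32 %arg1, ptr %q) {
entry:
  %v0 = insertelement <2 x i32> poison, i32 %arg0, i32 0
  %v1 = insertelement <2 x i32> %v0, i32 %arg1, i32 1
  %vadd = add <2 x i32> %v1, <i32 1, i32 1>
  store <2 x i32> %vadd, ptr %q, align 4
  %ext = extractelement <2 x i32> %vadd, i32 0
  ret i32 %ext
}

; New behavior: the original scalar add is kept for the external user.
define i32 @sketch_scalar(i32 %arg0, i32 %arg1, ptr %q) {
entry:
  %a0 = add i32 %arg0, 1
  %v0 = insertelement <2 x i32> poison, i32 %arg0, i32 0
  %v1 = insertelement <2 x i32> %v0, i32 %arg1, i32 1
  %vadd = add <2 x i32> %v1, <i32 1, i32 1>
  store <2 x i32> %vadd, ptr %q, align 4
  ret i32 %a0
}

The updated regression tests that follow show this pattern on actual FileCheck output.
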
(first updated regression test; file path not shown)
@@ -4,8 +4,9 @@
define i16 @foo(ptr %p1, ptr %p2) {
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
; CHECK-NEXT: store i32 0, ptr [[P1:%.*]], align 1
; CHECK-NEXT: [[CONST_MAT:%.*]] = or i32 0, 0
; CHECK-NEXT: [[CONST:%.*]] = bitcast i32 0 to i32
; CHECK-NEXT: store i32 [[CONST]], ptr [[P1:%.*]], align 1
; CHECK-NEXT: [[CONST_MAT:%.*]] = or i32 [[CONST]], 0
; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[P2:%.*]], align 1
; CHECK-NEXT: ret i16 0
;
(second updated regression test; file path not shown)
@@ -1236,20 +1236,20 @@ define void @crash_no_tracked_instructions(ptr %arg, ptr %arg.2, ptr %arg.3, i1
; CHECK: bb22:
; CHECK-NEXT: [[T23:%.*]] = fmul float [[T20]], 9.900000e+01
; CHECK-NEXT: [[T25:%.*]] = getelementptr inbounds float, ptr [[T19]], i64 2
; CHECK-NEXT: [[T26:%.*]] = fmul float [[T23]], 1.000000e+01
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[T23]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], <float 9.900000e+01, float 1.000000e+01>
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
; CHECK-NEXT: store float [[TMP4]], ptr [[T25]], align 4
; CHECK-NEXT: store float [[T26]], ptr [[T25]], align 4
; CHECK-NEXT: [[T27:%.*]] = load float, ptr [[ARG_2:%.*]], align 8
; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP3]], <float 2.000000e+01, float 2.000000e+01>
; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP3]], <float 2.000000e+01, float 2.000000e+01>
; CHECK-NEXT: br label [[BB30]]
; CHECK: bb30:
; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x float> [ [[TMP5]], [[BB22]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x float> [ [[TMP4]], [[BB22]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[BB36:%.*]]
; CHECK: bb36:
; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], <float 3.000000e+00, float 3.000000e+00>
; CHECK-NEXT: store <2 x float> [[TMP7]], ptr [[ARG_3]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x float> [[TMP5]], <float 3.000000e+00, float 3.000000e+00>
; CHECK-NEXT: store <2 x float> [[TMP6]], ptr [[ARG_3]], align 4
; CHECK-NEXT: br label [[BB41:%.*]]
; CHECK: bb41:
; CHECK-NEXT: ret void