
[SLP] Do not early exit if the number of unique elements is non-power-of-2. #65476
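Summary (a reader's gloss; the patch itself is the diff below): buildTree_rec previously gave up on a bundle as soon as the number of unique scalars was not a power of two. With this change, TryToFindDuplicates(S, /*DoNotFail=*/true) may instead pad the deduplicated bundle out to PowerOf2Ceil(UniqueValues.size()) by repeating the last unique value, and the reuse-shuffle mask restores the original lane order. A minimal standalone sketch of that idea, not the LLVM code (dedupAndPad and powerOf2Ceil are illustrative stand-ins):

// Sketch: deduplicate a bundle, record a reuse mask, and pad the unique
// set to the next power of two by repeating the last element instead of
// giving up on vectorization.
#include <cstdint>
#include <unordered_map>
#include <utility>
#include <vector>

static std::uint32_t powerOf2Ceil(std::uint32_t N) {
  std::uint32_t P = 1;
  while (P < N)
    P <<= 1;
  return P;
}

template <typename T>
std::pair<std::vector<T>, std::vector<int>>
dedupAndPad(const std::vector<T> &Bundle) {
  std::unordered_map<T, int> UniquePositions;
  std::vector<T> Unique;
  std::vector<int> ReuseMask; // maps each original lane to a unique slot
  for (const T &V : Bundle) {
    auto [It, Inserted] = UniquePositions.try_emplace(V, (int)Unique.size());
    if (Inserted)
      Unique.push_back(V);
    ReuseMask.push_back(It->second);
  }
  // The non-power-of-2 case: pad instead of early-exiting.
  for (std::uint32_t P = powerOf2Ceil((std::uint32_t)Unique.size());
       Unique.size() < P;)
    Unique.push_back(Unique.back());
  return {Unique, ReuseMask};
}

For example, a bundle {a, b, c, a} has three unique scalars, which used to block vectorization; the sketch pads the unique set to {a, b, c, c} and keeps the reuse mask {0, 1, 2, 0} for the original four lanes.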


Merged
79 changes: 54 additions & 25 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2911,7 +2911,8 @@ class BoUpSLP {
     }
     if (Last->State != TreeEntry::NeedToGather) {
       for (Value *V : VL) {
-        assert(!getTreeEntry(V) && "Scalar already in tree!");
+        [[maybe_unused]] const TreeEntry *TE = getTreeEntry(V);
+        assert((!TE || TE == Last) && "Scalar already in tree!");
         ScalarToTreeEntry[V] = Last;
       }
       // Update the scheduler bundle to point to this TreeEntry.
@@ -2924,7 +2925,8 @@ class BoUpSLP {
       for (Value *V : VL) {
         if (doesNotNeedToBeScheduled(V))
           continue;
-        assert(BundleMember && "Unexpected end of bundle.");
+        if (!BundleMember)
+          continue;
         BundleMember->TE = Last;
         BundleMember = BundleMember->NextInBundle;
       }
@@ -5583,9 +5585,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
 
   SmallVector<int> ReuseShuffleIndicies;
   SmallVector<Value *> UniqueValues;
-  auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues,
-                                &UserTreeIdx,
-                                this](const InstructionsState &S) {
+  SmallVector<Value *> NonUniqueValueVL;
+  auto TryToFindDuplicates = [&](const InstructionsState &S,
+                                 bool DoNotFail = false) {
     // Check that every instruction appears once in this bundle.
     DenseMap<Value *, unsigned> UniquePositions(VL.size());
     for (Value *V : VL) {
@@ -5612,6 +5614,24 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                                    !isConstant(V);
                                                  })) ||
           !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
+        if (DoNotFail && UniquePositions.size() > 1 &&
+            NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
+            all_of(UniqueValues, [=](Value *V) {
+              return isa<ExtractElementInst>(V) ||
+                     areAllUsersVectorized(cast<Instruction>(V),
+                                           UserIgnoreList);
+            })) {
+          unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
+          if (PWSz == VL.size()) {
+            ReuseShuffleIndicies.clear();
+          } else {
+            NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
+            NonUniqueValueVL.append(PWSz - UniqueValues.size(),
+                                    UniqueValues.back());
+            VL = NonUniqueValueVL;
+          }
+          return true;
+        }
         LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
         newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
         return false;
@@ -5857,7 +5877,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   }
 
   // Check that every instruction appears once in this bundle.
-  if (!TryToFindDuplicates(S))
+  if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
     return;
 
   // Perform specific checks for each particular instruction kind.
@@ -5877,7 +5897,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
 
   BlockScheduling &BS = *BSRef;
 
-  std::optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
+  std::optional<ScheduleData *> Bundle =
+      BS.tryScheduleBundle(UniqueValues, this, S);
 #ifdef EXPENSIVE_CHECKS
   // Make sure we didn't break any internal invariants
   BS.verify();
@@ -7537,7 +7558,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   Instruction *VL0 = E->getMainOp();
   unsigned ShuffleOrOp =
       E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
-  const unsigned Sz = VL.size();
+  SetVector<Value *> UniqueValues(VL.begin(), VL.end());
+  const unsigned Sz = UniqueValues.size();
   auto GetCostDiff =
       [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
           function_ref<InstructionCost(InstructionCost)> VectorCost) {
@@ -7644,7 +7666,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     // Count reused scalars.
     InstructionCost ScalarCost = 0;
     SmallPtrSet<const TreeEntry *, 4> CountedOps;
-    for (Value *V : VL) {
+    for (Value *V : UniqueValues) {
       auto *PHI = dyn_cast<PHINode>(V);
       if (!PHI)
         continue;
@@ -7665,8 +7687,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   }
   case Instruction::ExtractValue:
   case Instruction::ExtractElement: {
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *I = cast<Instruction>(VL[Idx]);
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *I = cast<Instruction>(UniqueValues[Idx]);
       VectorType *SrcVecTy;
       if (ShuffleOrOp == Instruction::ExtractElement) {
         auto *EE = cast<ExtractElementInst>(I);
@@ -7844,9 +7866,10 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
       }
     }
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *VI =
-          VL0->getOpcode() == Opcode ? cast<Instruction>(VL[Idx]) : nullptr;
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *VI = VL0->getOpcode() == Opcode
+                     ? cast<Instruction>(UniqueValues[Idx])
+                     : nullptr;
       return TTI->getCastInstrCost(Opcode, ScalarTy, SrcScalarTy,
                                    TTI::getCastContextHint(VI), CostKind, VI);
     };
@@ -7891,7 +7914,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                                ? CmpInst::BAD_FCMP_PREDICATE
                                : CmpInst::BAD_ICMP_PREDICATE;
     auto GetScalarCost = [&](unsigned Idx) {
-      auto *VI = cast<Instruction>(VL[Idx]);
+      auto *VI = cast<Instruction>(UniqueValues[Idx]);
       CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
                                            ? CmpInst::BAD_FCMP_PREDICATE
                                            : CmpInst::BAD_ICMP_PREDICATE;
@@ -7951,8 +7974,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   case Instruction::And:
   case Instruction::Or:
   case Instruction::Xor: {
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *VI = cast<Instruction>(VL[Idx]);
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *VI = cast<Instruction>(UniqueValues[Idx]);
       unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
       TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
       TTI::OperandValueInfo Op2Info =
@@ -7975,14 +7998,14 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     return CommonCost + GetGEPCostDiff(VL, VL0);
   }
   case Instruction::Load: {
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *VI = cast<LoadInst>(VL[Idx]);
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *VI = cast<LoadInst>(UniqueValues[Idx]);
       return TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(),
                                   VI->getPointerAddressSpace(), CostKind,
                                   TTI::OperandValueInfo(), VI);
     };
     auto *LI0 = cast<LoadInst>(VL0);
-    auto GetVectorCost = [=](InstructionCost CommonCost) {
+    auto GetVectorCost = [&](InstructionCost CommonCost) {
       InstructionCost VecLdCost;
       if (E->State == TreeEntry::Vectorize) {
         VecLdCost = TTI->getMemoryOpCost(
@@ -7993,7 +8016,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                 E->State == TreeEntry::PossibleStridedVectorize) &&
                "Unknown EntryState");
         Align CommonAlignment = LI0->getAlign();
-        for (Value *V : VL)
+        for (Value *V : UniqueValues)
           CommonAlignment =
               std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
         VecLdCost = TTI->getGatherScatterOpCost(
@@ -8045,8 +8068,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
            GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
   }
   case Instruction::Call: {
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *CI = cast<CallInst>(VL[Idx]);
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *CI = cast<CallInst>(UniqueValues[Idx]);
       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
       if (ID != Intrinsic::not_intrinsic) {
         IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
@@ -8087,8 +8110,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       }
       return false;
     };
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *VI = cast<Instruction>(VL[Idx]);
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *VI = cast<Instruction>(UniqueValues[Idx]);
       assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
       (void)E;
       return TTI->getInstructionCost(VI, CostKind);
@@ -8607,6 +8630,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
   SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
   SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
   SmallVector<APInt> DemandedElts;
+  SmallDenseSet<Value *, 4> UsedInserts;
   for (ExternalUser &EU : ExternalUses) {
     // We only add extract cost once for the same scalar.
     if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
@@ -8627,6 +8651,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
     // to detect it as a final shuffled/identity match.
     if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
       if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
+        if (!UsedInserts.insert(VU).second)
+          continue;
         std::optional<unsigned> InsertIdx = getInsertIndex(VU);
         if (InsertIdx) {
           const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
@@ -11008,6 +11034,7 @@ Value *BoUpSLP::vectorizeTree(
   // Maps extract Scalar to the corresponding extractelement instruction in the
   // basic block. Only one extractelement per block should be emitted.
   DenseMap<Value *, DenseMap<BasicBlock *, Instruction *>> ScalarToEEs;
+  SmallDenseSet<Value *, 4> UsedInserts;
   // Extract all of the elements with the external uses.
   for (const auto &ExternalUse : ExternalUses) {
     Value *Scalar = ExternalUse.Scalar;
@@ -11106,6 +11133,8 @@ Value *BoUpSLP::vectorizeTree(
       // Skip if the scalar is another vector op or Vec is not an instruction.
       if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
         if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
+          if (!UsedInserts.insert(VU).second)
+            continue;
           std::optional<unsigned> InsertIdx = getInsertIndex(VU);
           if (InsertIdx) {
             // Need to use original vector, if the root is truncated.
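The remaining hunks apply the same deduplication theme to costing and codegen: getEntryCost now prices only the unique scalars (the SetVector UniqueValues above), and getTreeCost/vectorizeTree guard insertelement users with a UsedInserts set, so a user reached through several duplicate lanes is handled once. A hedged sketch of that guard, with the surrounding types simplified to stand-ins:

// Sketch: visit each external insertelement user at most once, even when
// duplicate lanes generate several ExternalUser records for it.
#include <unordered_set>
#include <vector>

struct Value {}; // stand-in for llvm::Value

struct ExternalUser {
  Value *Scalar; // vectorized scalar that escapes the tree
  Value *User;   // e.g. an insertelement that consumes it
};

void processExternalUses(const std::vector<ExternalUser> &ExternalUses) {
  std::unordered_set<Value *> UsedInserts;
  for (const ExternalUser &EU : ExternalUses) {
    if (!UsedInserts.insert(EU.User).second)
      continue; // already costed/rewritten this insert
    // ... account for (or rewrite) the use of EU.Scalar by EU.User ...
  }
}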
29 changes: 13 additions & 16 deletions llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -123,22 +123,19 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
 
 define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_1(
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i64 0
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0
-; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP10:%.*]] = sub <2 x i32> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x i32> [[TMP2_31]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[V0]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[V1]], i64 1
+; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP0_1]], i64 0
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = sub <4 x i32> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP11]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
   %v0.1 = extractelement <2 x i32> %v0, i32 1