Skip to content

Commit 3604304

Browse files
committed
[SLP] Do not early exit if the number of unique elements is
non-power-of-2. We can still try to vectorize the bundle of instructions, even if the number of repeated instructions is non-power-of-2. In this case we need to adjust the cost (calculate the cost only for the unique scalar instructions) and the cost of the extracts. Also, when scheduling the bundle, we need to schedule only the unique scalars to avoid a compiler crash caused by multiple dependencies. This can be safely applied only if all of the scalars' users are also vectorized and do not require memory accesses (this is a temporary requirement that can be relaxed later).
1 parent 25fd5e6 commit 3604304

File tree

6 files changed

+110
-105
lines changed

6 files changed

+110
-105
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 56 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -2911,7 +2911,8 @@ class BoUpSLP {
29112911
}
29122912
if (Last->State != TreeEntry::NeedToGather) {
29132913
for (Value *V : VL) {
2914-
assert(!getTreeEntry(V) && "Scalar already in tree!");
2914+
[[maybe_unused]] const TreeEntry *TE = getTreeEntry(V);
2915+
assert((!TE || TE == Last) && "Scalar already in tree!");
29152916
ScalarToTreeEntry[V] = Last;
29162917
}
29172918
// Update the scheduler bundle to point to this TreeEntry.
@@ -2924,7 +2925,8 @@ class BoUpSLP {
29242925
for (Value *V : VL) {
29252926
if (doesNotNeedToBeScheduled(V))
29262927
continue;
2927-
assert(BundleMember && "Unexpected end of bundle.");
2928+
if (!BundleMember)
2929+
continue;
29282930
BundleMember->TE = Last;
29292931
BundleMember = BundleMember->NextInBundle;
29302932
}
@@ -5583,9 +5585,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
55835585

55845586
SmallVector<int> ReuseShuffleIndicies;
55855587
SmallVector<Value *> UniqueValues;
5586-
auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues,
5587-
&UserTreeIdx,
5588-
this](const InstructionsState &S) {
5588+
SmallVector<Value *> NonUniqueValueVL;
5589+
auto TryToFindDuplicates = [&](const InstructionsState &S,
5590+
bool DoNotFail = false) {
55895591
// Check that every instruction appears once in this bundle.
55905592
DenseMap<Value *, unsigned> UniquePositions(VL.size());
55915593
for (Value *V : VL) {
@@ -5612,6 +5614,26 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
56125614
!isConstant(V);
56135615
})) ||
56145616
!llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
5617+
SmallVector<Value *> IgnoredVals;
5618+
if (UserIgnoreList)
5619+
IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end());
5620+
if (DoNotFail && UniquePositions.size() > 1 &&
5621+
NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
5622+
all_of(UniqueValues, [=](Value *V) {
5623+
return isa<ExtractElementInst>(V) ||
5624+
areAllUsersVectorized(cast<Instruction>(V), IgnoredVals);
5625+
})) {
5626+
unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
5627+
if (PWSz == VL.size()) {
5628+
ReuseShuffleIndicies.clear();
5629+
} else {
5630+
NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
5631+
NonUniqueValueVL.append(PWSz - UniqueValues.size(),
5632+
UniqueValues.back());
5633+
VL = NonUniqueValueVL;
5634+
}
5635+
return true;
5636+
}
56155637
LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
56165638
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
56175639
return false;
@@ -5857,7 +5879,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
58575879
}
58585880

58595881
// Check that every instruction appears once in this bundle.
5860-
if (!TryToFindDuplicates(S))
5882+
if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
58615883
return;
58625884

58635885
// Perform specific checks for each particular instruction kind.
@@ -5877,7 +5899,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
58775899

58785900
BlockScheduling &BS = *BSRef;
58795901

5880-
std::optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
5902+
std::optional<ScheduleData *> Bundle =
5903+
BS.tryScheduleBundle(UniqueValues, this, S);
58815904
#ifdef EXPENSIVE_CHECKS
58825905
// Make sure we didn't break any internal invariants
58835906
BS.verify();
@@ -7537,7 +7560,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
75377560
Instruction *VL0 = E->getMainOp();
75387561
unsigned ShuffleOrOp =
75397562
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
7540-
const unsigned Sz = VL.size();
7563+
SetVector<Value *> UniqueValues(VL.begin(), VL.end());
7564+
const unsigned Sz = UniqueValues.size();
75417565
auto GetCostDiff =
75427566
[=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
75437567
function_ref<InstructionCost(InstructionCost)> VectorCost) {
@@ -7644,7 +7668,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
76447668
// Count reused scalars.
76457669
InstructionCost ScalarCost = 0;
76467670
SmallPtrSet<const TreeEntry *, 4> CountedOps;
7647-
for (Value *V : VL) {
7671+
for (Value *V : UniqueValues) {
76487672
auto *PHI = dyn_cast<PHINode>(V);
76497673
if (!PHI)
76507674
continue;
@@ -7665,8 +7689,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
76657689
}
76667690
case Instruction::ExtractValue:
76677691
case Instruction::ExtractElement: {
7668-
auto GetScalarCost = [=](unsigned Idx) {
7669-
auto *I = cast<Instruction>(VL[Idx]);
7692+
auto GetScalarCost = [&](unsigned Idx) {
7693+
auto *I = cast<Instruction>(UniqueValues[Idx]);
76707694
VectorType *SrcVecTy;
76717695
if (ShuffleOrOp == Instruction::ExtractElement) {
76727696
auto *EE = cast<ExtractElementInst>(I);
@@ -7844,9 +7868,10 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
78447868
Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
78457869
}
78467870
}
7847-
auto GetScalarCost = [=](unsigned Idx) {
7848-
auto *VI =
7849-
VL0->getOpcode() == Opcode ? cast<Instruction>(VL[Idx]) : nullptr;
7871+
auto GetScalarCost = [&](unsigned Idx) {
7872+
auto *VI = VL0->getOpcode() == Opcode
7873+
? cast<Instruction>(UniqueValues[Idx])
7874+
: nullptr;
78507875
return TTI->getCastInstrCost(Opcode, ScalarTy, SrcScalarTy,
78517876
TTI::getCastContextHint(VI), CostKind, VI);
78527877
};
@@ -7891,7 +7916,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
78917916
? CmpInst::BAD_FCMP_PREDICATE
78927917
: CmpInst::BAD_ICMP_PREDICATE;
78937918
auto GetScalarCost = [&](unsigned Idx) {
7894-
auto *VI = cast<Instruction>(VL[Idx]);
7919+
auto *VI = cast<Instruction>(UniqueValues[Idx]);
78957920
CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
78967921
? CmpInst::BAD_FCMP_PREDICATE
78977922
: CmpInst::BAD_ICMP_PREDICATE;
@@ -7951,8 +7976,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
79517976
case Instruction::And:
79527977
case Instruction::Or:
79537978
case Instruction::Xor: {
7954-
auto GetScalarCost = [=](unsigned Idx) {
7955-
auto *VI = cast<Instruction>(VL[Idx]);
7979+
auto GetScalarCost = [&](unsigned Idx) {
7980+
auto *VI = cast<Instruction>(UniqueValues[Idx]);
79567981
unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
79577982
TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
79587983
TTI::OperandValueInfo Op2Info =
@@ -7975,14 +8000,14 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
79758000
return CommonCost + GetGEPCostDiff(VL, VL0);
79768001
}
79778002
case Instruction::Load: {
7978-
auto GetScalarCost = [=](unsigned Idx) {
7979-
auto *VI = cast<LoadInst>(VL[Idx]);
8003+
auto GetScalarCost = [&](unsigned Idx) {
8004+
auto *VI = cast<LoadInst>(UniqueValues[Idx]);
79808005
return TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(),
79818006
VI->getPointerAddressSpace(), CostKind,
79828007
TTI::OperandValueInfo(), VI);
79838008
};
79848009
auto *LI0 = cast<LoadInst>(VL0);
7985-
auto GetVectorCost = [=](InstructionCost CommonCost) {
8010+
auto GetVectorCost = [&](InstructionCost CommonCost) {
79868011
InstructionCost VecLdCost;
79878012
if (E->State == TreeEntry::Vectorize) {
79888013
VecLdCost = TTI->getMemoryOpCost(
@@ -7993,7 +8018,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
79938018
E->State == TreeEntry::PossibleStridedVectorize) &&
79948019
"Unknown EntryState");
79958020
Align CommonAlignment = LI0->getAlign();
7996-
for (Value *V : VL)
8021+
for (Value *V : UniqueValues)
79978022
CommonAlignment =
79988023
std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
79998024
VecLdCost = TTI->getGatherScatterOpCost(
@@ -8045,8 +8070,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
80458070
GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
80468071
}
80478072
case Instruction::Call: {
8048-
auto GetScalarCost = [=](unsigned Idx) {
8049-
auto *CI = cast<CallInst>(VL[Idx]);
8073+
auto GetScalarCost = [&](unsigned Idx) {
8074+
auto *CI = cast<CallInst>(UniqueValues[Idx]);
80508075
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
80518076
if (ID != Intrinsic::not_intrinsic) {
80528077
IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
@@ -8087,8 +8112,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
80878112
}
80888113
return false;
80898114
};
8090-
auto GetScalarCost = [=](unsigned Idx) {
8091-
auto *VI = cast<Instruction>(VL[Idx]);
8115+
auto GetScalarCost = [&](unsigned Idx) {
8116+
auto *VI = cast<Instruction>(UniqueValues[Idx]);
80928117
assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
80938118
(void)E;
80948119
return TTI->getInstructionCost(VI, CostKind);
@@ -8607,6 +8632,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
86078632
SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
86088633
SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
86098634
SmallVector<APInt> DemandedElts;
8635+
SmallDenseSet<Value *, 4> UsedInserts;
86108636
for (ExternalUser &EU : ExternalUses) {
86118637
// We only add extract cost once for the same scalar.
86128638
if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
@@ -8627,6 +8653,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
86278653
// to detect it as a final shuffled/identity match.
86288654
if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
86298655
if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
8656+
if (!UsedInserts.insert(VU).second)
8657+
continue;
86308658
std::optional<unsigned> InsertIdx = getInsertIndex(VU);
86318659
if (InsertIdx) {
86328660
const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
@@ -11008,6 +11036,7 @@ Value *BoUpSLP::vectorizeTree(
1100811036
// Maps extract Scalar to the corresponding extractelement instruction in the
1100911037
// basic block. Only one extractelement per block should be emitted.
1101011038
DenseMap<Value *, DenseMap<BasicBlock *, Instruction *>> ScalarToEEs;
11039+
SmallDenseSet<Value *, 4> UsedInserts;
1101111040
// Extract all of the elements with the external uses.
1101211041
for (const auto &ExternalUse : ExternalUses) {
1101311042
Value *Scalar = ExternalUse.Scalar;
@@ -11106,6 +11135,8 @@ Value *BoUpSLP::vectorizeTree(
1110611135
// Skip if the scalar is another vector op or Vec is not an instruction.
1110711136
if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
1110811137
if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
11138+
if (!UsedInserts.insert(VU).second)
11139+
continue;
1110911140
std::optional<unsigned> InsertIdx = getInsertIndex(VU);
1111011141
if (InsertIdx) {
1111111142
// Need to use original vector, if the root is truncated.

llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -123,22 +123,19 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
123123

124124
define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
125125
; CHECK-LABEL: @build_vec_v4i32_reuse_1(
126-
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i64 1
127-
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i64 0
128-
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i64 1
129-
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i64 0
130-
; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]]
131-
; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]]
132-
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0
133-
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0
134-
; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]]
135-
; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]]
136-
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
137-
; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP8]], [[TMP9]]
138-
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 poison, i32 poison>
139-
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
140-
; CHECK-NEXT: [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
141-
; CHECK-NEXT: ret <4 x i32> [[TMP2_31]]
126+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
127+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
128+
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0]], i64 1
129+
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V1]], i64 1
130+
; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP4]]
131+
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
132+
; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]]
133+
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
134+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 2>
135+
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP0_1]], i64 0
136+
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
137+
; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP7]], [[TMP10]]
138+
; CHECK-NEXT: ret <4 x i32> [[TMP11]]
142139
;
143140
%v0.0 = extractelement <2 x i32> %v0, i32 0
144141
%v0.1 = extractelement <2 x i32> %v0, i32 1

llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -123,22 +123,19 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
123123

124124
define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
125125
; CHECK-LABEL: @build_vec_v4i32_reuse_1(
126-
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i64 1
127-
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i64 0
128-
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i64 1
129-
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i64 0
130-
; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]]
131-
; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]]
132-
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0
133-
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0
134-
; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]]
135-
; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]]
136-
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
137-
; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP8]], [[TMP9]]
138-
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 poison, i32 poison>
139-
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
140-
; CHECK-NEXT: [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
141-
; CHECK-NEXT: ret <4 x i32> [[TMP2_31]]
126+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
127+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
128+
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0]], i64 1
129+
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V1]], i64 1
130+
; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP4]]
131+
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
132+
; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]]
133+
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
134+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 2>
135+
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP0_1]], i64 0
136+
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
137+
; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP7]], [[TMP10]]
138+
; CHECK-NEXT: ret <4 x i32> [[TMP11]]
142139
;
143140
%v0.0 = extractelement <2 x i32> %v0, i32 0
144141
%v0.1 = extractelement <2 x i32> %v0, i32 1

0 commit comments

Comments
 (0)