diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d2570fdea4216..ffd58ca74109a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2911,7 +2911,8 @@ class BoUpSLP {
     }
     if (Last->State != TreeEntry::NeedToGather) {
       for (Value *V : VL) {
-        assert(!getTreeEntry(V) && "Scalar already in tree!");
+        [[maybe_unused]] const TreeEntry *TE = getTreeEntry(V);
+        assert((!TE || TE == Last) && "Scalar already in tree!");
         ScalarToTreeEntry[V] = Last;
       }
       // Update the scheduler bundle to point to this TreeEntry.
@@ -2924,7 +2925,8 @@ class BoUpSLP {
       for (Value *V : VL) {
         if (doesNotNeedToBeScheduled(V))
           continue;
-        assert(BundleMember && "Unexpected end of bundle.");
+        if (!BundleMember)
+          continue;
         BundleMember->TE = Last;
         BundleMember = BundleMember->NextInBundle;
       }
@@ -5583,9 +5585,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
 
   SmallVector<int> ReuseShuffleIndicies;
   SmallVector<Value *> UniqueValues;
-  auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues,
-                                &UserTreeIdx,
-                                this](const InstructionsState &S) {
+  SmallVector<Value *> NonUniqueValueVL;
+  auto TryToFindDuplicates = [&](const InstructionsState &S,
+                                 bool DoNotFail = false) {
     // Check that every instruction appears once in this bundle.
     DenseMap<Value *, unsigned> UniquePositions(VL.size());
     for (Value *V : VL) {
@@ -5612,6 +5614,24 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                       !isConstant(V);
                     })) ||
         !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
+      if (DoNotFail && UniquePositions.size() > 1 &&
+          NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
+          all_of(UniqueValues, [=](Value *V) {
+            return isa<UndefValue>(V) ||
+                   areAllUsersVectorized(cast<Instruction>(V),
+                                         UserIgnoreList);
+          })) {
+        unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
+        if (PWSz == VL.size()) {
+          ReuseShuffleIndicies.clear();
+        } else {
+          NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
+          NonUniqueValueVL.append(PWSz - UniqueValues.size(),
+                                  UniqueValues.back());
+          VL = NonUniqueValueVL;
+        }
+        return true;
+      }
       LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
       newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
       return false;
@@ -5857,7 +5877,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   }
 
   // Check that every instruction appears once in this bundle.
-  if (!TryToFindDuplicates(S))
+  if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
     return;
 
   // Perform specific checks for each particular instruction kind.
@@ -5877,7 +5897,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
 
   BlockScheduling &BS = *BSRef;
 
-  std::optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
+  std::optional<ScheduleData *> Bundle =
+      BS.tryScheduleBundle(UniqueValues, this, S);
 #ifdef EXPENSIVE_CHECKS
   // Make sure we didn't break any internal invariants
   BS.verify();
@@ -7537,7 +7558,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   Instruction *VL0 = E->getMainOp();
   unsigned ShuffleOrOp =
       E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
-  const unsigned Sz = VL.size();
+  SetVector<Value *> UniqueValues(VL.begin(), VL.end());
+  const unsigned Sz = UniqueValues.size();
   auto GetCostDiff =
       [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
           function_ref<InstructionCost(InstructionCost)> VectorCost) {
@@ -7644,7 +7666,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     // Count reused scalars.
     InstructionCost ScalarCost = 0;
     SmallPtrSet<const TreeEntry *, 4> CountedOps;
-    for (Value *V : VL) {
+    for (Value *V : UniqueValues) {
       auto *PHI = dyn_cast<PHINode>(V);
       if (!PHI)
         continue;
@@ -7665,8 +7687,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   }
   case Instruction::ExtractValue:
   case Instruction::ExtractElement: {
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *I = cast<Instruction>(VL[Idx]);
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *I = cast<Instruction>(UniqueValues[Idx]);
       VectorType *SrcVecTy;
       if (ShuffleOrOp == Instruction::ExtractElement) {
         auto *EE = cast<ExtractElementInst>(I);
@@ -7844,9 +7866,10 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
       }
     }
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *VI =
-          VL0->getOpcode() == Opcode ? cast<Instruction>(VL[Idx]) : nullptr;
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *VI = VL0->getOpcode() == Opcode
+                     ? cast<Instruction>(UniqueValues[Idx])
+                     : nullptr;
       return TTI->getCastInstrCost(Opcode, ScalarTy, SrcScalarTy,
                                    TTI::getCastContextHint(VI), CostKind, VI);
     };
@@ -7891,7 +7914,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                                       ? CmpInst::BAD_FCMP_PREDICATE
                                       : CmpInst::BAD_ICMP_PREDICATE;
     auto GetScalarCost = [&](unsigned Idx) {
-      auto *VI = cast<Instruction>(VL[Idx]);
+      auto *VI = cast<Instruction>(UniqueValues[Idx]);
       CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
                                            ? CmpInst::BAD_FCMP_PREDICATE
                                            : CmpInst::BAD_ICMP_PREDICATE;
@@ -7951,8 +7974,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   case Instruction::And:
   case Instruction::Or:
   case Instruction::Xor: {
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *VI = cast<Instruction>(VL[Idx]);
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *VI = cast<Instruction>(UniqueValues[Idx]);
       unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
       TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
       TTI::OperandValueInfo Op2Info =
@@ -7975,14 +7998,14 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     return CommonCost + GetGEPCostDiff(VL, VL0);
   }
   case Instruction::Load: {
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *VI = cast<LoadInst>(VL[Idx]);
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *VI = cast<LoadInst>(UniqueValues[Idx]);
       return TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(),
                                   VI->getPointerAddressSpace(), CostKind,
                                   TTI::OperandValueInfo(), VI);
     };
     auto *LI0 = cast<LoadInst>(VL0);
-    auto GetVectorCost = [=](InstructionCost CommonCost) {
+    auto GetVectorCost = [&](InstructionCost CommonCost) {
       InstructionCost VecLdCost;
       if (E->State == TreeEntry::Vectorize) {
         VecLdCost = TTI->getMemoryOpCost(
@@ -7993,7 +8016,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                 E->State == TreeEntry::PossibleStridedVectorize) &&
                "Unknown EntryState");
         Align CommonAlignment = LI0->getAlign();
-        for (Value *V : VL)
+        for (Value *V : UniqueValues)
           CommonAlignment =
               std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
         VecLdCost = TTI->getGatherScatterOpCost(
@@ -8045,8 +8068,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
            GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
   }
   case Instruction::Call: {
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *CI = cast<CallInst>(VL[Idx]);
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *CI = cast<CallInst>(UniqueValues[Idx]);
       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
       if (ID != Intrinsic::not_intrinsic) {
         IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
@@ -8087,8 +8110,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       }
       return false;
     };
-    auto GetScalarCost = [=](unsigned Idx) {
-      auto *VI = cast<Instruction>(VL[Idx]);
+    auto GetScalarCost = [&](unsigned Idx) {
+      auto *VI = cast<Instruction>(UniqueValues[Idx]);
       assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
       (void)E;
       return TTI->getInstructionCost(VI, CostKind);
@@ -8607,6 +8630,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
   SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
   SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
   SmallVector<APInt> DemandedElts;
+  SmallDenseSet<Value *, 4> UsedInserts;
   for (ExternalUser &EU : ExternalUses) {
     // We only add extract cost once for the same scalar.
     if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
@@ -8627,6 +8651,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
     // to detect it as a final shuffled/identity match.
     if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
       if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
+        if (!UsedInserts.insert(VU).second)
+          continue;
        std::optional<unsigned> InsertIdx = getInsertIndex(VU);
         if (InsertIdx) {
           const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
@@ -11008,6 +11034,7 @@ Value *BoUpSLP::vectorizeTree(
   // Maps extract Scalar to the corresponding extractelement instruction in the
   // basic block. Only one extractelement per block should be emitted.
   DenseMap<Value *, DenseMap<BasicBlock *, Instruction *>> ScalarToEEs;
+  SmallDenseSet<Value *, 4> UsedInserts;
   // Extract all of the elements with the external uses.
   for (const auto &ExternalUse : ExternalUses) {
     Value *Scalar = ExternalUse.Scalar;
@@ -11106,6 +11133,8 @@ Value *BoUpSLP::vectorizeTree(
       // Skip if the scalar is another vector op or Vec is not an instruction.
      if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
         if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
+          if (!UsedInserts.insert(VU).second)
+            continue;
           std::optional<unsigned> InsertIdx = getInsertIndex(VU);
           if (InsertIdx) {
             // Need to use original vector, if the root is truncated.
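Taken together, the SLPVectorizer.cpp changes make buildTree_rec tolerate bundles that contain repeated scalars: TryToFindDuplicates(S, /*DoNotFail=*/true) collects the unique scalars, pads them up to the next power of two by repeating the last unique value, and remembers the duplicate lanes in ReuseShuffleIndicies, so that scheduling (tryScheduleBundle) and per-entry costing (getEntryCost) operate on unique scalars only. Below is a minimal, self-contained C++ sketch of that dedup-and-pad step. It is an illustration of the idea, not the LLVM implementation: tryToFindDuplicates, DedupResult, and powerOf2Ceil here are hypothetical stand-ins (LLVM's real PowerOf2Ceil helper lives in llvm/Support/MathExtras.h), and string "scalars" stand in for Value pointers.

// dedup_pad_sketch.cpp -- hedged sketch of the DoNotFail duplicate handling.
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Round up to the next power of two (stand-in for llvm::PowerOf2Ceil).
static uint64_t powerOf2Ceil(uint64_t N) {
  uint64_t P = 1;
  while (P < N)
    P <<= 1;
  return P;
}

struct DedupResult {
  std::vector<std::string> Bundle;   // scalars actually handed to the tree
  std::vector<int> ReuseShuffleMask; // maps each original lane to a unique lane
};

static DedupResult tryToFindDuplicates(const std::vector<std::string> &VL) {
  std::unordered_map<std::string, int> UniquePos;
  DedupResult R;
  for (const std::string &V : VL) {
    // First occurrence gets a fresh lane; repeats point back at it.
    auto [It, Inserted] = UniquePos.try_emplace(V, (int)R.Bundle.size());
    if (Inserted)
      R.Bundle.push_back(V);
    R.ReuseShuffleMask.push_back(It->second);
  }
  // All scalars unique: no reuse mask is needed, keep the bundle as-is.
  if (R.Bundle.size() == VL.size()) {
    R.ReuseShuffleMask.clear();
    return R;
  }
  // Instead of rejecting the bundle, pad the unique scalars up to the next
  // power of two by repeating the last one (loosely mirroring the
  // DoNotFail path; the real patch also special-cases PWSz == VL.size()).
  uint64_t PWSz = powerOf2Ceil(R.Bundle.size());
  while (R.Bundle.size() < PWSz)
    R.Bundle.push_back(R.Bundle.back());
  return R;
}

int main() {
  // {a, b, c, a}: four lanes but only three unique scalars.
  DedupResult R = tryToFindDuplicates({"a", "b", "c", "a"});
  for (const std::string &V : R.Bundle)
    std::cout << V << ' ';
  std::cout << "| mask:";
  for (int M : R.ReuseShuffleMask)
    std::cout << ' ' << M;
  std::cout << '\n'; // prints: a b c c | mask: 0 1 2 0
}

Run on the bundle {a, b, c, a}, the sketch builds the tree over four power-of-two lanes {a, b, c, c} and records that original lane 3 reuses unique lane 0, which is exactly the information the reuse shuffle needs later.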
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
index dd65f126021bb..26d3a405019bf 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
@@ -123,22 +123,19 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
 
 define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_1(
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i64 0
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0
-; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <2 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = sub <2 x i32> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32>
-; CHECK-NEXT:    ret <4 x i32> [[TMP2_31]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[V0]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[V1]], i64 1
+; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP0_1]], i64 0
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = sub <4 x i32> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP11]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
   %v0.1 = extractelement <2 x i32> %v0, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
index 58444a257aa66..bbf56b9a86ce1 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -123,22 +123,19 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
 
 define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_1(
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i64 0
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0
-; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]]
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <2 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = sub <2 x i32> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32>
-; CHECK-NEXT:    ret <4 x i32> [[TMP2_31]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[V0]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[V1]], i64 1
+; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP0_1]], i64 0
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = sub <4 x i32> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP11]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
   %v0.1 = extractelement <2 x i32> %v0, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
index 69d0e6241b9c0..b59659ca75eb2 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
@@ -242,40 +242,31 @@ define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) {
 ; CHECK-LABEL: @select_uniform_ugt_16xi8(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR:%.*]], align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt <8 x i8> [[TMP0]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i8> poison, i8 [[X:%.*]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = select <8 x i1> [[TMP1]], <8 x i8> [[TMP0]], <8 x i8> [[TMP3]]
-; CHECK-NEXT:    store <8 x i8> [[TMP4]], ptr [[PTR]], align 2
 ; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 8
 ; CHECK-NEXT:    [[L_8:%.*]] = load i8, ptr [[GEP_8]], align 1
 ; CHECK-NEXT:    [[CMP_8:%.*]] = icmp ugt i8 [[L_8]], -1
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i8> [[TMP0]], i32 0
-; CHECK-NEXT:    [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP5]], i8 [[X]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i8> [[TMP4]], i32 0
-; CHECK-NEXT:    store i8 [[TMP6]], ptr [[GEP_8]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <8 x i8> [[TMP0]], i32 0
+; CHECK-NEXT:    [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]]
 ; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 9
-; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i8>, ptr [[GEP_9]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt <4 x i8> [[TMP7]], <i8 -1, i8 -1, i8 -1, i8 -1>
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i8> poison, i8 [[X]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[TMP8]], <4 x i8> [[TMP7]], <4 x i8> [[TMP10]]
-; CHECK-NEXT:    store <4 x i8> [[TMP11]], ptr [[GEP_9]], align 2
-; CHECK-NEXT:    [[GEP_13:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 13
-; CHECK-NEXT:    [[L_13:%.*]] = load i8, ptr [[GEP_13]], align 1
-; CHECK-NEXT:    [[CMP_13:%.*]] = icmp ugt i8 [[L_13]], -1
-; CHECK-NEXT:    [[S_13:%.*]] = select i1 [[CMP_13]], i8 [[L_13]], i8 [[X]]
-; CHECK-NEXT:    store i8 [[S_13]], ptr [[GEP_13]], align 2
-; CHECK-NEXT:    [[GEP_14:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 14
-; CHECK-NEXT:    [[L_14:%.*]] = load i8, ptr [[GEP_14]], align 1
-; CHECK-NEXT:    [[CMP_14:%.*]] = icmp ugt i8 [[L_14]], -1
-; CHECK-NEXT:    [[S_14:%.*]] = select i1 [[CMP_14]], i8 [[L_14]], i8 [[X]]
-; CHECK-NEXT:    store i8 [[S_14]], ptr [[GEP_14]], align 2
-; CHECK-NEXT:    [[GEP_15:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 15
-; CHECK-NEXT:    [[L_15:%.*]] = load i8, ptr [[GEP_15]], align 1
-; CHECK-NEXT:    [[CMP_15:%.*]] = icmp ugt i8 [[L_15]], -1
-; CHECK-NEXT:    [[S_15:%.*]] = select i1 [[CMP_15]], i8 [[L_15]], i8 [[X]]
-; CHECK-NEXT:    store i8 [[S_15]], ptr [[GEP_15]], align 2
+; CHECK-NEXT:    [[L_9:%.*]] = load i8, ptr [[GEP_9]], align 1
+; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 10
+; CHECK-NEXT:    [[L_10:%.*]] = load i8, ptr [[GEP_10]], align 1
+; CHECK-NEXT:    [[GEP_11:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 11
+; CHECK-NEXT:    [[L_11:%.*]] = load i8, ptr [[GEP_11]], align 1
+; CHECK-NEXT:    [[GEP_12:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 12
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_12]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[L_9]], i32 9
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[L_10]], i32 10
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[L_11]], i32 11
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ugt <16 x i8> [[TMP9]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP11]], <16 x i8> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = select <16 x i1> [[TMP10]], <16 x i8> [[TMP9]], <16 x i8> [[TMP12]]
+; CHECK-NEXT:    store <16 x i8> [[TMP13]], ptr [[PTR]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll
index b5602c795f42b..9c8569fcbdf75 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll
@@ -16,15 +16,10 @@ define i32 @test(ptr nocapture %G) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[G:%.*]], i64 5
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul <2 x double> [[TMP0]],
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[TMP1]],
-; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[G]], align 8
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds double, ptr [[G]], i64 2
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP0]], i32 1
-; CHECK-NEXT:    [[MUL11:%.*]] = fmul double [[TMP3]], 4.000000e+00
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP1]], double [[MUL11]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP4]],
-; CHECK-NEXT:    store <2 x double> [[TMP5]], ptr [[ARRAYIDX9]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <4 x double> [[TMP1]],
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP2]],
+; CHECK-NEXT:    store <4 x double> [[TMP3]], ptr [[G]], align 8
 ; CHECK-NEXT:    ret i32 undef
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll
index 740bf71f60e44..e7239f906c59d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll
@@ -5,15 +5,9 @@
 
 define dso_local <4 x float> @foo(<4 x i32> %0) {
 ; CHECK-LABEL: @foo(
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP0:%.*]], i64 1
-; CHECK-NEXT:    [[TMP3:%.*]] = sitofp i32 [[TMP2]] to float
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i64 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP3]], i64 1
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <2 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = sitofp <2 x i32> [[TMP6]] to <2 x float>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP8]], <4 x i32>
-; CHECK-NEXT:    ret <4 x float> [[TMP9]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP0:%.*]] to <4 x float>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32>
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
 ;
   %2 = extractelement <4 x i32> %0, i32 1
   %3 = sitofp i32 %2 to float
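The updated CHECK lines all show the same pattern: one wide vector operation over the deduplicated lanes, plus shufflevector masks with repeated indices that fan the unique lanes back out to their original positions (in pr49081.ll, a single sitofp <4 x i32> replaces the earlier scalar/2-wide mix). A companion sketch, in the same hypothetical C++ setting as the one above, shows why the padding is sound: applying the recorded reuse mask to the vectorized unique lanes reproduces the original bundle, duplicates included, which is the role the repeated-index shufflevector plays in the emitted IR.

// reuse_mask_sketch.cpp -- hedged sketch of re-expanding deduplicated lanes.
#include <iostream>
#include <vector>

// Each mask entry names the unique lane that feeds the corresponding
// original lane; duplicate entries simply read the same element twice.
static std::vector<int> applyReuseMask(const std::vector<int> &UniqueVec,
                                       const std::vector<int> &Mask) {
  std::vector<int> Out;
  Out.reserve(Mask.size());
  for (int Lane : Mask)
    Out.push_back(UniqueVec[Lane]);
  return Out;
}

int main() {
  // Vectorized result over the padded unique bundle {a=10, b=20, c=30, pad=30}.
  std::vector<int> UniqueVec = {10, 20, 30, 30};
  // Reuse mask recorded while deduplicating the original bundle {a, b, c, a}.
  std::vector<int> Mask = {0, 1, 2, 0};
  for (int V : applyReuseMask(UniqueVec, Mask))
    std::cout << V << ' '; // prints: 10 20 30 10
  std::cout << '\n';
}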