diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c0b7298f78005..c94fb71ab220b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -175,6 +175,15 @@ static cl::opt<int> RootLookAheadMaxDepth(
     "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
     cl::desc("The maximum look-ahead depth for searching best rooting option"));
 
+static cl::opt<unsigned> MinProfitableStridedLoads(
+    "slp-min-strided-loads", cl::init(2), cl::Hidden,
+    cl::desc("The minimum number of loads, which should be considered strided, "
+             "if the stride is > 1 or is runtime value"));
+
+static cl::opt<unsigned> MaxProfitableLoadStride(
+    "slp-max-stride", cl::init(8), cl::Hidden,
+    cl::desc("The maximum stride, considered to be profitable."));
+
 static cl::opt<bool> ViewSLPTree("view-slp-tree", cl::Hidden,
                                  cl::desc("Display the SLP trees with Graphviz"));
 
@@ -2575,7 +2584,7 @@ class BoUpSLP {
     enum EntryState {
       Vectorize,
       ScatterVectorize,
-      PossibleStridedVectorize,
+      StridedVectorize,
       NeedToGather
     };
     EntryState State;
@@ -2753,8 +2762,8 @@ class BoUpSLP {
       case ScatterVectorize:
         dbgs() << "ScatterVectorize\n";
         break;
-      case PossibleStridedVectorize:
-        dbgs() << "PossibleStridedVectorize\n";
+      case StridedVectorize:
+        dbgs() << "StridedVectorize\n";
         break;
       case NeedToGather:
         dbgs() << "NeedToGather\n";
@@ -3680,7 +3689,7 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
     if (Entry->State == TreeEntry::NeedToGather)
       return "color=red";
     if (Entry->State == TreeEntry::ScatterVectorize ||
-        Entry->State == TreeEntry::PossibleStridedVectorize)
+        Entry->State == TreeEntry::StridedVectorize)
       return "color=blue";
     return "";
   }
@@ -3842,12 +3851,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
 namespace {
 /// Tracks the state we can represent the loads in the given sequence.
-enum class LoadsState {
-  Gather,
-  Vectorize,
-  ScatterVectorize,
-  PossibleStridedVectorize
-};
+enum class LoadsState { Gather, Vectorize, ScatterVectorize, StridedVectorize };
 } // anonymous namespace
 
 static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
@@ -3878,6 +3882,14 @@ static Align computeCommonAlignment(ArrayRef<Value *> VL) {
   return CommonAlignment;
 }
 
+/// Check if \p Order represents reverse order.
+static bool isReverseOrder(ArrayRef<unsigned> Order) {
+  unsigned Sz = Order.size();
+  return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
+           return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
+         });
+}
+
 /// Checks if the given array of loads can be represented as a vectorized,
 /// scatter or just simple gather.
 static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
@@ -3900,7 +3912,8 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
   // Make sure all loads in the bundle are simple - we can't vectorize
   // atomic or volatile loads.
   PointerOps.clear();
-  PointerOps.resize(VL.size());
+  const unsigned Sz = VL.size();
+  PointerOps.resize(Sz);
   auto *POIter = PointerOps.begin();
   for (Value *V : VL) {
     auto *L = cast<LoadInst>(V);
@@ -3911,12 +3924,12 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
   }
 
   Order.clear();
+  auto *VecTy = FixedVectorType::get(ScalarTy, Sz);
   // Check the order of pointer operands or that all pointers are the same.
   bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order);
   if (IsSorted || all_of(PointerOps, [&](Value *P) {
         return arePointersCompatible(P, PointerOps.front(), TLI);
       })) {
-    bool IsPossibleStrided = false;
     if (IsSorted) {
       Value *Ptr0;
       Value *PtrN;
@@ -3930,30 +3943,71 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
       std::optional<int> Diff =
           getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
       // Check that the sorted loads are consecutive.
-      if (static_cast<unsigned>(*Diff) == VL.size() - 1)
+      if (static_cast<unsigned>(*Diff) == Sz - 1)
         return LoadsState::Vectorize;
       // Simple check if not a strided access - clear order.
-      IsPossibleStrided = *Diff % (VL.size() - 1) == 0;
+      bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
+      // Try to generate strided load node if:
+      // 1. Target with strided load support is detected.
+      // 2. The number of loads is greater than MinProfitableStridedLoads,
+      // or the potential stride <= MaxProfitableLoadStride and the
+      // potential stride is power-of-2 (to avoid perf regressions for the very
+      // small number of loads) and max distance > number of loads, or potential
+      // stride is -1.
+      // 3. The loads are ordered, or number of unordered loads <=
+      // MaxProfitableUnorderedLoads, or loads are in reversed order.
+      // (this check is to avoid extra costs for very expensive shuffles).
+      if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
+                                  (static_cast<unsigned>(std::abs(*Diff)) <=
+                                       MaxProfitableLoadStride * Sz &&
+                                   isPowerOf2_32(std::abs(*Diff)))) &&
+                                 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
+                                *Diff == -(static_cast<int>(Sz) - 1))) {
+        int Stride = *Diff / static_cast<int>(Sz - 1);
+        if (*Diff == Stride * static_cast<int>(Sz - 1)) {
+          Align Alignment =
+              cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
+                  ->getAlign();
+          if (TTI.isLegalStridedLoadStore(VecTy, Alignment)) {
+            // Iterate through all pointers and check if all distances are
+            // unique multiple of Dist.
+            SmallSet<int, 4> Dists;
+            for (Value *Ptr : PointerOps) {
+              int Dist = 0;
+              if (Ptr == PtrN)
+                Dist = *Diff;
+              else if (Ptr != Ptr0)
+                Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
+              // If the strides are not the same or repeated, we can't
+              // vectorize.
+              if (((Dist / Stride) * Stride) != Dist ||
+                  !Dists.insert(Dist).second)
+                break;
+            }
+            if (Dists.size() == Sz)
+              return LoadsState::StridedVectorize;
+          }
+        }
+      }
     }
     // TODO: need to improve analysis of the pointers, if not all of them are
     // GEPs or have > 2 operands, we end up with a gather node, which just
     // increases the cost.
     Loop *L = LI.getLoopFor(cast<LoadInst>(VL0)->getParent());
     bool ProfitableGatherPointers =
-        static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
-          return L && L->isLoopInvariant(V);
-        })) <= VL.size() / 2 && VL.size() > 2;
+        static_cast<unsigned>(count_if(
+            PointerOps,
+            [L](Value *V) { return L && L->isLoopInvariant(V); })) <= Sz / 2 &&
+        Sz > 2;
     if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
           auto *GEP = dyn_cast<GetElementPtrInst>(P);
           return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
                  (GEP && GEP->getNumOperands() == 2);
         })) {
       Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
-      auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
       if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&
          !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment))
-        return IsPossibleStrided ? LoadsState::PossibleStridedVectorize
-                                 : LoadsState::ScatterVectorize;
+        return LoadsState::ScatterVectorize;
     }
   }
 
@@ -4160,7 +4214,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
     return std::move(ResOrder);
   }
   if ((TE.State == TreeEntry::Vectorize ||
-       TE.State == TreeEntry::PossibleStridedVectorize) &&
+       TE.State == TreeEntry::StridedVectorize) &&
      (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
       (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
      !TE.isAltShuffle())
@@ -4418,7 +4472,7 @@ void BoUpSLP::reorderTopToBottom() {
     }
     VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
     if (!(TE->State == TreeEntry::Vectorize ||
-          TE->State == TreeEntry::PossibleStridedVectorize) ||
+          TE->State == TreeEntry::StridedVectorize) ||
         !TE->ReuseShuffleIndices.empty())
       GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
     if (TE->State == TreeEntry::Vectorize &&
@@ -4442,9 +4496,6 @@ void BoUpSLP::reorderTopToBottom() {
     MapVector<OrdersType, unsigned,
               DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
         OrdersUses;
-    // Last chance orders - scatter vectorize. Try to use their orders if no
-    // other orders or the order is counted already.
-    SmallVector<OrdersType> StridedVectorizeOrders;
     SmallPtrSet<Value *, 4> VisitedOps;
     for (const TreeEntry *OpTE : OrderedEntries) {
       // No need to reorder this nodes, still need to extend and to use shuffle,
@@ -4491,11 +4542,6 @@ void BoUpSLP::reorderTopToBottom() {
         if (Order.empty())
           continue;
       }
-      // Postpone scatter orders.
-      if (OpTE->State == TreeEntry::PossibleStridedVectorize) {
-        StridedVectorizeOrders.push_back(Order);
-        continue;
-      }
       // Stores actually store the mask, not the order, need to invert.
       if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
           OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
@@ -4512,22 +4558,6 @@ void BoUpSLP::reorderTopToBottom() {
         ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
       }
     }
-    // Set order of the user node.
-    if (OrdersUses.empty()) {
-      if (StridedVectorizeOrders.empty())
-        continue;
-      // Add (potentially!) strided vectorize orders.
-      for (OrdersType &Order : StridedVectorizeOrders)
-        ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
-    } else {
-      // Account (potentially!) strided vectorize orders only if it was used
-      // already.
-      for (OrdersType &Order : StridedVectorizeOrders) {
-        auto *It = OrdersUses.find(Order);
-        if (It != OrdersUses.end())
-          ++It->second;
-      }
-    }
     // Choose the most used order.
     ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
     unsigned Cnt = OrdersUses.front().second;
@@ -4569,7 +4599,7 @@ void BoUpSLP::reorderTopToBottom() {
         continue;
       }
       if ((TE->State == TreeEntry::Vectorize ||
-           TE->State == TreeEntry::PossibleStridedVectorize) &&
+           TE->State == TreeEntry::StridedVectorize) &&
          isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
             InsertElementInst>(TE->getMainOp()) &&
          !TE->isAltShuffle()) {
@@ -4610,10 +4640,6 @@ bool BoUpSLP::canReorderOperands(
         }))
       continue;
     if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
-      // FIXME: Do not reorder (possible!) strided vectorized nodes, they
-      // require reordering of the operands, which is not implemented yet.
-      if (TE->State == TreeEntry::PossibleStridedVectorize)
-        return false;
       // Do not reorder if operand node is used by many user nodes.
       if (any_of(TE->UserTreeIndices,
                  [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
@@ -4664,13 +4690,13 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
   SmallVector<TreeEntry *> NonVectorized;
   for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
     if (TE->State != TreeEntry::Vectorize &&
-        TE->State != TreeEntry::PossibleStridedVectorize)
+        TE->State != TreeEntry::StridedVectorize)
       NonVectorized.push_back(TE.get());
     if (std::optional<OrdersType> CurrentOrder =
             getReorderingData(*TE, /*TopToBottom=*/false)) {
       OrderedEntries.insert(TE.get());
       if (!(TE->State == TreeEntry::Vectorize ||
-            TE->State == TreeEntry::PossibleStridedVectorize) ||
+            TE->State == TreeEntry::StridedVectorize) ||
           !TE->ReuseShuffleIndices.empty())
         GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
     }
@@ -4688,7 +4714,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
     SmallVector<TreeEntry *> Filtered;
     for (TreeEntry *TE : OrderedEntries) {
       if (!(TE->State == TreeEntry::Vectorize ||
-            TE->State == TreeEntry::PossibleStridedVectorize ||
+            TE->State == TreeEntry::StridedVectorize ||
             (TE->State == TreeEntry::NeedToGather &&
              GathersToOrders.count(TE))) ||
           TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
@@ -4733,9 +4759,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
       MapVector<OrdersType, unsigned,
                 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
           OrdersUses;
-      // Last chance orders - scatter vectorize. Try to use their orders if no
-      // other orders or the order is counted already.
-      SmallVector<std::pair<OrdersType, unsigned>> StridedVectorizeOrders;
       // Do the analysis for each tree entry only once, otherwise the order of
       // the same node may be considered several times, though might be not
       // profitable.
@@ -4757,11 +4780,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
             Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
               return P.second == OpTE;
             });
-        // Postpone scatter orders.
-        if (OpTE->State == TreeEntry::PossibleStridedVectorize) {
-          StridedVectorizeOrders.emplace_back(Order, NumOps);
-          continue;
-        }
        // Stores actually store the mask, not the order, need to invert.
        if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
            OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
@@ -4819,30 +4837,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
             ++Res.first->second;
         }
       }
-      // If no orders - skip current nodes and jump to the next one, if any.
-      if (OrdersUses.empty()) {
-        if (StridedVectorizeOrders.empty() ||
-            (Data.first->ReorderIndices.empty() &&
-             Data.first->ReuseShuffleIndices.empty() &&
-             !(IgnoreReorder &&
-               Data.first == VectorizableTree.front().get()))) {
-          for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
-            OrderedEntries.remove(Op.second);
-          continue;
-        }
-        // Add (potentially!) strided vectorize orders.
-        for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders)
-          OrdersUses.insert(std::make_pair(Pair.first, 0)).first->second +=
-              Pair.second;
-      } else {
-        // Account (potentially!) strided vectorize orders only if it was used
-        // already.
-        for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders) {
-          auto *It = OrdersUses.find(Pair.first);
-          if (It != OrdersUses.end())
-            It->second += Pair.second;
-        }
-      }
       // Choose the best order.
       ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
       unsigned Cnt = OrdersUses.front().second;
@@ -4878,7 +4872,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
     }
     // Gathers are processed separately.
    if (TE->State != TreeEntry::Vectorize &&
-        TE->State != TreeEntry::PossibleStridedVectorize &&
+        TE->State != TreeEntry::StridedVectorize &&
        (TE->State != TreeEntry::ScatterVectorize ||
         TE->ReorderIndices.empty()))
      continue;
@@ -4910,7 +4904,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
      Data.first->reorderOperands(Mask);
      if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
          Data.first->isAltShuffle() ||
-          Data.first->State == TreeEntry::PossibleStridedVectorize) {
+          Data.first->State == TreeEntry::StridedVectorize) {
        reorderScalars(Data.first->Scalars, Mask);
        reorderOrder(Data.first->ReorderIndices, MaskOrder,
                     /*BottomOrder=*/true);
@@ -4973,7 +4967,6 @@ void BoUpSLP::buildExternalUses(
          // instructions. If that is the case, the one in FoundLane will
          // be used.
          if (UseEntry->State == TreeEntry::ScatterVectorize ||
-              UseEntry->State == TreeEntry::PossibleStridedVectorize ||
              !doesInTreeUserNeedToExtract(
                  Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
@@ -5331,8 +5324,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    return TreeEntry::Vectorize;
  case LoadsState::ScatterVectorize:
    return TreeEntry::ScatterVectorize;
-  case LoadsState::PossibleStridedVectorize:
-    return TreeEntry::PossibleStridedVectorize;
+  case LoadsState::StridedVectorize:
+    return TreeEntry::StridedVectorize;
  case LoadsState::Gather:
 #ifndef NDEBUG
    Type *ScalarTy = VL0->getType();
@@ -5753,8 +5746,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
  BasicBlock *BB = nullptr;
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
-      (UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize ||
-       UserTreeIdx.UserTE->State == TreeEntry::PossibleStridedVectorize);
+      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  bool AreAllSameInsts =
      (S.getOpcode() && allSameBlock(VL)) ||
      (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
@@ -5851,8 +5843,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
  // Special processing for sorted pointers for ScatterVectorize node with
  // constant indices only.
  if (AreAllSameInsts && UserTreeIdx.UserTE &&
-      (UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize ||
-       UserTreeIdx.UserTE->State == TreeEntry::PossibleStridedVectorize) &&
+      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
      !(S.getOpcode() && allSameBlock(VL))) {
    assert(S.OpValue->getType()->isPointerTy() &&
           count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >=
@@ -6049,18 +6040,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
      }
      TE->setOperandsInOrder();
      break;
-    case TreeEntry::PossibleStridedVectorize:
+    case TreeEntry::StridedVectorize:
      // Vectorizing non-consecutive loads with `llvm.masked.gather`.
      if (CurrentOrder.empty()) {
-        TE = newTreeEntry(VL, TreeEntry::PossibleStridedVectorize, Bundle, S,
+        TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                          UserTreeIdx, ReuseShuffleIndicies);
      } else {
-        TE = newTreeEntry(VL, TreeEntry::PossibleStridedVectorize, Bundle, S,
+        TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                          UserTreeIdx, ReuseShuffleIndicies, CurrentOrder);
      }
      TE->setOperandsInOrder();
-      buildTree_rec(PointerOps, Depth + 1, {TE, 0});
-      LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
+      LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
      break;
    case TreeEntry::ScatterVectorize:
      // Vectorizing non-consecutive loads with `llvm.masked.gather`.
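For reference, the LoadsState::StridedVectorize decision that feeds the new tree entries above reduces to integer arithmetic on the sorted pointer distances. A minimal standalone sketch of that classification (an illustration, not code from the patch: it ignores the TTI.isLegalStridedLoadStore query, the MinProfitableStridedLoads/MaxProfitableLoadStride thresholds, and negative strides for reverse-ordered loads):

  #include <set>
  #include <vector>

  enum class LoadsState { Vectorize, StridedVectorize, ScatterVectorize };

  // Dists holds the element distance of each pointer from Ptr0, sorted
  // ascending, so Dists.front() == 0 and Dists.back() is the *Diff of the
  // patch. At least two loads are assumed.
  LoadsState classifyLoads(const std::vector<int> &Dists) {
    int Sz = static_cast<int>(Dists.size());
    int Diff = Dists.back();
    if (Diff == Sz - 1)
      return LoadsState::Vectorize; // consecutive, e.g. {0, 1, 2, 3}
    if (Diff % (Sz - 1) == 0) {
      // Candidate stride; every distance must be a unique multiple of it,
      // e.g. {0, 4, 8, 12} -> stride 4.
      int Stride = Diff / (Sz - 1);
      std::set<int> Seen;
      bool Strided = true;
      for (int D : Dists)
        if (D % Stride != 0 || !Seen.insert(D).second) {
          Strided = false;
          break;
        }
      if (Strided)
        return LoadsState::StridedVectorize;
    }
    return LoadsState::ScatterVectorize; // e.g. {0, 3, 4, 9}
  }

With Dists = {0, 3, 4, 9} the candidate stride 3 is rejected at distance 4; in the patch that is exactly the case where the loop breaks early and the Dists.size() == Sz check fails.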
@@ -7091,7 +7081,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
         !isSplat(Gathers)) {
       InstructionCost BaseCost = R.getGatherCost(Gathers, !Root);
       SetVector<Value *> VectorizedLoads;
-      SmallVector<unsigned> VectorizedStarts;
+      SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
       SmallVector<unsigned> ScatterVectorized;
       unsigned StartIdx = 0;
       unsigned VF = VL.size() / 2;
@@ -7115,12 +7105,16 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
             switch (LS) {
             case LoadsState::Vectorize:
             case LoadsState::ScatterVectorize:
-            case LoadsState::PossibleStridedVectorize:
+            case LoadsState::StridedVectorize:
               // Mark the vectorized loads so that we don't vectorize them
               // again.
               // TODO: better handling of loads with reorders.
-              if (LS == LoadsState::Vectorize && CurrentOrder.empty())
-                VectorizedStarts.push_back(Cnt);
+              if (((LS == LoadsState::Vectorize ||
+                    LS == LoadsState::StridedVectorize) &&
+                   CurrentOrder.empty()) ||
+                  (LS == LoadsState::StridedVectorize &&
+                   isReverseOrder(CurrentOrder)))
+                VectorizedStarts.emplace_back(Cnt, LS);
               else
                 ScatterVectorized.push_back(Cnt);
               VectorizedLoads.insert(Slice.begin(), Slice.end());
@@ -7164,16 +7158,20 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
                                         CostKind, TTI::OperandValueInfo(), LI);
         }
         auto *LoadTy = FixedVectorType::get(VL.front()->getType(), VF);
-        for (unsigned P : VectorizedStarts) {
-          auto *LI = cast<LoadInst>(VL[P]);
+        for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
+          auto *LI = cast<LoadInst>(VL[P.first]);
           Align Alignment = LI->getAlign();
           GatherCost +=
-              TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
-                                  LI->getPointerAddressSpace(), CostKind,
-                                  TTI::OperandValueInfo(), LI);
+              P.second == LoadsState::Vectorize
+                  ? TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
+                                        LI->getPointerAddressSpace(), CostKind,
+                                        TTI::OperandValueInfo(), LI)
+                  : TTI.getStridedMemoryOpCost(
+                        Instruction::Load, LoadTy, LI->getPointerOperand(),
+                        /*VariableMask=*/false, Alignment, CostKind, LI);
           // Estimate GEP cost.
           SmallVector<Value *> PointerOps(VF);
-          for (auto [I, V] : enumerate(VL.slice(P, VF)))
+          for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
             PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
           auto [ScalarGEPCost, VectorGEPCost] =
               getGEPCosts(TTI, PointerOps, LI->getPointerOperand(),
@@ -7913,8 +7911,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   }
   InstructionCost CommonCost = 0;
   SmallVector<int> Mask;
+  bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
   if (!E->ReorderIndices.empty() &&
-      E->State != TreeEntry::PossibleStridedVectorize) {
+      (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
     SmallVector<int> NewMask;
     if (E->getOpcode() == Instruction::Store) {
       // For stores the order is actually a mask.
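Both StridedVectorize special cases above lean on the isReverseOrder helper introduced at the top of the patch: a reverse-ordered strided node needs no shuffle (and therefore no permute cost), because codegen simply starts at the last pointer and negates the stride. A standalone re-implementation of the helper, assuming the same semantics as the ArrayRef/enumerate version, with std::vector standing in for ArrayRef:

  #include <vector>

  // An entry equal to Order.size() marks an unset lane and matches any
  // position; all set lanes must form the exact reverse identity order.
  bool isReverseOrder(const std::vector<unsigned> &Order) {
    unsigned Sz = Order.size();
    if (Order.empty())
      return false;
    for (unsigned I = 0; I < Sz; ++I)
      if (Order[I] != Sz && Order[I] != Sz - I - 1)
        return false;
    return true;
  }

  // isReverseOrder({3, 2, 1, 0}) -> true
  // isReverseOrder({3, 4, 1, 0}) -> true (lane 1 is unset)
  // isReverseOrder({0, 1, 2, 3}) -> false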
@@ -7932,7 +7931,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
   assert((E->State == TreeEntry::Vectorize ||
           E->State == TreeEntry::ScatterVectorize ||
-          E->State == TreeEntry::PossibleStridedVectorize) &&
+          E->State == TreeEntry::StridedVectorize) &&
          "Unhandled state");
   assert(E->getOpcode() &&
          ((allSameType(VL) && allSameBlock(VL)) ||
@@ -7952,7 +7951,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   }
   auto GetCastContextHint = [&](Value *V) {
     if (const TreeEntry *OpTE = getTreeEntry(V)) {
-      if (OpTE->State == TreeEntry::ScatterVectorize)
+      if (OpTE->State == TreeEntry::ScatterVectorize ||
+          OpTE->State == TreeEntry::StridedVectorize)
         return TTI::CastContextHint::GatherScatter;
       if (OpTE->State == TreeEntry::Vectorize &&
           OpTE->getOpcode() == Instruction::Load && !OpTE->isAltShuffle()) {
@@ -8028,8 +8028,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   // Calculate cost difference from vectorizing set of GEPs.
   // Negative value means vectorizing is profitable.
   auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
-    assert(E->State == TreeEntry::Vectorize &&
-           "Entry state expected to be Vectorize here.");
+    assert((E->State == TreeEntry::Vectorize ||
+            E->State == TreeEntry::StridedVectorize) &&
+           "Entry state expected to be Vectorize or StridedVectorize here.");
     InstructionCost ScalarCost = 0;
     InstructionCost VecCost = 0;
     std::tie(ScalarCost, VecCost) = getGEPCosts(
@@ -8382,10 +8383,14 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         VecLdCost = TTI->getMemoryOpCost(
             Instruction::Load, VecTy, LI0->getAlign(),
             LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
+      } else if (E->State == TreeEntry::StridedVectorize) {
+        Align CommonAlignment =
+            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
+        VecLdCost = TTI->getStridedMemoryOpCost(
+            Instruction::Load, VecTy, LI0->getPointerOperand(),
+            /*VariableMask=*/false, CommonAlignment, CostKind);
       } else {
-        assert((E->State == TreeEntry::ScatterVectorize ||
-                E->State == TreeEntry::PossibleStridedVectorize) &&
-               "Unknown EntryState");
+        assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
         Align CommonAlignment =
             computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
         VecLdCost = TTI->getGatherScatterOpCost(
@@ -8398,8 +8403,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
     // If this node generates masked gather load then it is not a terminal node.
     // Hence address operand cost is estimated separately.
-    if (E->State == TreeEntry::ScatterVectorize ||
-        E->State == TreeEntry::PossibleStridedVectorize)
+    if (E->State == TreeEntry::ScatterVectorize)
       return Cost;
 
     // Estimate cost of GEPs since this tree node is a terminator.
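The matching codegen appears in the vectorizeTree hunks below: a StridedVectorize node becomes a single llvm.experimental.vp.strided.load call with an all-ones mask, an explicit vector length equal to the number of scalars, and a byte stride that is negated (and based at the last pointer) for reverse-ordered nodes. A minimal IRBuilder sketch of the same call shape (an illustration only; emitStridedLoad is a hypothetical helper name and the stride is assumed constant):

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Intrinsics.h"

  using namespace llvm;

  // Emits a <VF x Ty> strided load from BasePtr with the given byte stride
  // and attaches the common alignment to the pointer argument.
  static CallInst *emitStridedLoad(IRBuilder<> &Builder, FixedVectorType *VecTy,
                                   Value *BasePtr, Type *StrideTy,
                                   int64_t StrideBytes, Align Alignment) {
    CallInst *Load = Builder.CreateIntrinsic(
        Intrinsic::experimental_vp_strided_load,
        {VecTy, BasePtr->getType(), StrideTy},
        {BasePtr, ConstantInt::get(StrideTy, StrideBytes),
         Builder.getAllOnesMask(VecTy->getElementCount()),
         Builder.getInt32(VecTy->getNumElements())});
    Load->addParamAttr(/*ArgNo=*/0, Attribute::getWithAlignment(
                                        Load->getContext(), Alignment));
    return Load;
  }

The RISCV test update at the end of the patch shows the resulting IR, e.g. two i8 elements four bytes apart: call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, ..., i32 2).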
@@ -8608,7 +8612,7 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
   if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
       (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
        VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
-       VectorizableTree[0]->State != TreeEntry::PossibleStridedVectorize))
+       VectorizableTree[0]->State != TreeEntry::StridedVectorize))
     return false;
 
   return true;
@@ -10579,11 +10583,6 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
 Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
                                  bool PostponedPHIs) {
   ValueList &VL = E->getOperand(NodeIdx);
-  if (E->State == TreeEntry::PossibleStridedVectorize &&
-      !E->ReorderIndices.empty()) {
-    SmallVector<int> Mask(E->ReorderIndices.begin(), E->ReorderIndices.end());
-    reorderScalars(VL, Mask);
-  }
   const unsigned VF = VL.size();
   InstructionsState S = getSameOpcode(VL, *TLI);
   // Special processing for GEPs bundle, which may include non-gep values.
@@ -11157,6 +11156,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
     return Vec;
   }
 
+  bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
   auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy,
                           bool IsSigned) {
     if (V->getType() != VecTy)
@@ -11167,7 +11167,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
           ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                    E->ReorderIndices.size());
       ShuffleBuilder.add(V, Mask);
-    } else if (E->State == TreeEntry::PossibleStridedVectorize) {
+    } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
       ShuffleBuilder.addOrdered(V, std::nullopt);
     } else {
       ShuffleBuilder.addOrdered(V, E->ReorderIndices);
@@ -11177,7 +11177,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
 
   assert((E->State == TreeEntry::Vectorize ||
           E->State == TreeEntry::ScatterVectorize ||
-          E->State == TreeEntry::PossibleStridedVectorize) &&
+          E->State == TreeEntry::StridedVectorize) &&
          "Unhandled state");
   unsigned ShuffleOrOp =
       E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
@@ -11642,10 +11642,29 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       Value *PO = LI->getPointerOperand();
       if (E->State == TreeEntry::Vectorize) {
         NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
+      } else if (E->State == TreeEntry::StridedVectorize) {
+        Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
+        Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
+        PO = IsReverseOrder ? PtrN : Ptr0;
+        std::optional<int> Diff = getPointersDiff(
+            VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
+        Type *StrideTy = DL->getIndexType(PO->getType());
+        int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
+        Value *StrideVal =
+            ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
+                                           DL->getTypeAllocSize(ScalarTy));
+        Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
+        auto *Inst = Builder.CreateIntrinsic(
+            Intrinsic::experimental_vp_strided_load,
+            {VecTy, PO->getType(), StrideTy},
+            {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
+             Builder.getInt32(E->Scalars.size())});
+        Inst->addParamAttr(
+            /*ArgNo=*/0,
+            Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
+        NewLI = Inst;
       } else {
-        assert((E->State == TreeEntry::ScatterVectorize ||
-                E->State == TreeEntry::PossibleStridedVectorize) &&
-               "Unhandled state");
+        assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
         Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
         if (E->VectorizedValue) {
           LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
@@ -12069,8 +12088,11 @@ Value *BoUpSLP::vectorizeTree(
                        [&](llvm::User *U) {
                          TreeEntry *UseEntry = getTreeEntry(U);
                          return UseEntry &&
-                                UseEntry->State == TreeEntry::Vectorize &&
-                                E->State == TreeEntry::Vectorize &&
+                                (UseEntry->State == TreeEntry::Vectorize ||
+                                 UseEntry->State ==
+                                     TreeEntry::StridedVectorize) &&
+                                (E->State == TreeEntry::Vectorize ||
+                                 E->State == TreeEntry::StridedVectorize) &&
                                 doesInTreeUserNeedToExtract(
                                     Scalar,
                                     cast<Instruction>(UseEntry->Scalars.front()),
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index dc5fb91788634..e167b6a47af59 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-80 | FileCheck %s
+; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-40 | FileCheck %s
 
 define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.ptr, ptr %add.ptr64) {
 ; CHECK-LABEL: define i32 @test(
@@ -67,305 +67,303 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP47]], i32 0
 ; CHECK-NEXT: [[TMP49:%.*]] = extractelement <2 x i32> [[TMP47]], i32 1
 ; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[TMP48]], [[TMP49]]
-; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4
 ; CHECK-NEXT: [[TMP50:%.*]] = load i8, ptr null, align 1
 ; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
 ; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
 ; CHECK-NEXT: [[TMP51:%.*]] = load i8, ptr null, align 1
-; CHECK-NEXT: [[TMP52:%.*]] = insertelement <2 x ptr> , ptr [[ARRAYIDX20_3]], i32 1
+; CHECK-NEXT: [[TMP52:%.*]] = insertelement <2 x ptr> , ptr [[ARRAYIDX20_3]], i32 0
 ; CHECK-NEXT: [[TMP53:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP52]], i32 1, <2 x i1> , <2 x i8> poison)
 ; CHECK-NEXT: [[TMP54:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
-; CHECK-NEXT: [[TMP55:%.*]] = insertelement <2 x ptr> , ptr [[ARRAYIDX22_3]], i32 1
+; CHECK-NEXT: [[TMP55:%.*]] = insertelement <2 x ptr> , ptr [[ARRAYIDX22_3]], i32 0
 ; CHECK-NEXT: [[TMP56:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP55]], i32 1, <2 x i1> , <2 x i8> poison)
 ; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32>
 ; CHECK-NEXT: [[TMP58:%.*]] = sub <2 x i32> [[TMP54]], [[TMP57]]
-; CHECK-NEXT: [[TMP59:%.*]] 
= insertelement <2 x ptr> , ptr [[ARRAYIDX3_3]], i32 0 -; CHECK-NEXT: [[TMP60:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP59]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP61:%.*]] = zext <2 x i8> [[TMP60]] to <2 x i32> -; CHECK-NEXT: [[TMP62:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> ), i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP63:%.*]] = zext <2 x i8> [[TMP62]] to <2 x i32> -; CHECK-NEXT: [[TMP64:%.*]] = sub <2 x i32> [[TMP61]], [[TMP63]] -; CHECK-NEXT: [[TMP65:%.*]] = shl <2 x i32> [[TMP64]], -; CHECK-NEXT: [[TMP66:%.*]] = add <2 x i32> [[TMP65]], [[TMP58]] -; CHECK-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> ), i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP68:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32> -; CHECK-NEXT: [[TMP69:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> ), i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP70:%.*]] = zext <2 x i8> [[TMP69]] to <2 x i32> -; CHECK-NEXT: [[TMP71:%.*]] = sub <2 x i32> [[TMP68]], [[TMP70]] -; CHECK-NEXT: [[TMP72:%.*]] = insertelement <2 x i8> poison, i8 [[TMP50]], i32 0 -; CHECK-NEXT: [[TMP73:%.*]] = insertelement <2 x i8> [[TMP72]], i8 [[TMP51]], i32 1 -; CHECK-NEXT: [[TMP74:%.*]] = zext <2 x i8> [[TMP73]] to <2 x i32> -; CHECK-NEXT: [[TMP75:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> ), i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP76:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32> -; CHECK-NEXT: [[TMP77:%.*]] = sub <2 x i32> [[TMP74]], [[TMP76]] -; CHECK-NEXT: [[TMP78:%.*]] = shl <2 x i32> [[TMP77]], -; CHECK-NEXT: [[TMP79:%.*]] = add <2 x i32> [[TMP78]], [[TMP71]] -; CHECK-NEXT: [[TMP80:%.*]] = sub <2 x i32> [[TMP66]], [[TMP79]] -; CHECK-NEXT: [[TMP81:%.*]] = shufflevector <2 x i32> [[TMP79]], <2 x i32> [[TMP46]], <2 x i32> -; CHECK-NEXT: [[TMP82:%.*]] = shufflevector <2 x i32> [[TMP66]], <2 x i32> [[TMP30]], <2 x i32> -; CHECK-NEXT: [[TMP83:%.*]] = add <2 x i32> [[TMP81]], [[TMP82]] -; CHECK-NEXT: [[TMP84:%.*]] = shufflevector <2 x i32> [[TMP79]], <2 x i32> [[TMP46]], <2 x i32> -; CHECK-NEXT: [[TMP85:%.*]] = shufflevector <2 x i32> [[TMP66]], <2 x i32> [[TMP30]], <2 x i32> -; CHECK-NEXT: [[TMP86:%.*]] = add <2 x i32> [[TMP84]], [[TMP85]] -; CHECK-NEXT: [[TMP87:%.*]] = add <2 x i32> [[TMP86]], [[TMP83]] -; CHECK-NEXT: [[TMP88:%.*]] = sub <2 x i32> [[TMP83]], [[TMP86]] -; CHECK-NEXT: [[TMP89:%.*]] = extractelement <2 x i32> [[TMP80]], i32 0 -; CHECK-NEXT: [[TMP90:%.*]] = extractelement <2 x i32> [[TMP80]], i32 1 -; CHECK-NEXT: [[SUB59_3:%.*]] = sub i32 [[TMP89]], [[TMP90]] -; CHECK-NEXT: [[TMP91:%.*]] = extractelement <2 x i32> [[TMP87]], i32 0 -; CHECK-NEXT: [[TMP92:%.*]] = extractelement <2 x i32> [[TMP87]], i32 1 -; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[TMP91]], [[TMP92]] -; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[TMP92]], [[TMP91]] -; CHECK-NEXT: [[TMP93:%.*]] = extractelement <2 x i32> [[TMP54]], i32 0 -; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[TMP93]], 15 +; CHECK-NEXT: [[TMP59:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP60:%.*]] = zext <2 x i8> [[TMP59]] to <2 x i32> +; CHECK-NEXT: [[TMP61:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x 
ptr> zeroinitializer, <2 x i64> ), i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP62:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32> +; CHECK-NEXT: [[TMP63:%.*]] = sub <2 x i32> [[TMP60]], [[TMP62]] +; CHECK-NEXT: [[TMP64:%.*]] = shl <2 x i32> [[TMP63]], +; CHECK-NEXT: [[TMP65:%.*]] = add <2 x i32> [[TMP64]], [[TMP58]] +; CHECK-NEXT: [[TMP66:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> ), i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP67:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32> +; CHECK-NEXT: [[TMP68:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> ), i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP69:%.*]] = zext <2 x i8> [[TMP68]] to <2 x i32> +; CHECK-NEXT: [[TMP70:%.*]] = sub <2 x i32> [[TMP67]], [[TMP69]] +; CHECK-NEXT: [[TMP71:%.*]] = insertelement <2 x i8> poison, i8 [[TMP51]], i32 0 +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <2 x i8> [[TMP71]], i8 [[TMP50]], i32 1 +; CHECK-NEXT: [[TMP73:%.*]] = zext <2 x i8> [[TMP72]] to <2 x i32> +; CHECK-NEXT: [[TMP74:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> ), i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP75:%.*]] = zext <2 x i8> [[TMP74]] to <2 x i32> +; CHECK-NEXT: [[TMP76:%.*]] = sub <2 x i32> [[TMP73]], [[TMP75]] +; CHECK-NEXT: [[TMP77:%.*]] = shl <2 x i32> [[TMP76]], +; CHECK-NEXT: [[TMP78:%.*]] = add <2 x i32> [[TMP77]], [[TMP70]] +; CHECK-NEXT: [[TMP79:%.*]] = sub <2 x i32> [[TMP65]], [[TMP78]] +; CHECK-NEXT: [[TMP80:%.*]] = shufflevector <2 x i32> [[TMP78]], <2 x i32> [[TMP46]], <2 x i32> +; CHECK-NEXT: [[TMP81:%.*]] = shufflevector <2 x i32> [[TMP65]], <2 x i32> [[TMP30]], <2 x i32> +; CHECK-NEXT: [[TMP82:%.*]] = add <2 x i32> [[TMP80]], [[TMP81]] +; CHECK-NEXT: [[TMP83:%.*]] = shufflevector <2 x i32> [[TMP78]], <2 x i32> [[TMP46]], <2 x i32> +; CHECK-NEXT: [[TMP84:%.*]] = shufflevector <2 x i32> [[TMP65]], <2 x i32> [[TMP30]], <2 x i32> +; CHECK-NEXT: [[TMP85:%.*]] = add <2 x i32> [[TMP83]], [[TMP84]] +; CHECK-NEXT: [[TMP86:%.*]] = add <2 x i32> [[TMP85]], [[TMP82]] +; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP82]], [[TMP85]] +; CHECK-NEXT: [[TMP88:%.*]] = extractelement <2 x i32> [[TMP79]], i32 0 +; CHECK-NEXT: [[TMP89:%.*]] = extractelement <2 x i32> [[TMP79]], i32 1 +; CHECK-NEXT: [[SUB59_3:%.*]] = sub i32 [[TMP89]], [[TMP88]] +; CHECK-NEXT: [[TMP90:%.*]] = extractelement <2 x i32> [[TMP86]], i32 0 +; CHECK-NEXT: [[TMP91:%.*]] = extractelement <2 x i32> [[TMP86]], i32 1 +; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[TMP90]], [[TMP91]] +; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[TMP91]], [[TMP90]] +; CHECK-NEXT: [[TMP92:%.*]] = extractelement <2 x i32> [[TMP54]], i32 1 +; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[TMP92]], 15 ; CHECK-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537 ; CHECK-NEXT: [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535 -; CHECK-NEXT: [[TMP94:%.*]] = extractelement <2 x i32> [[TMP86]], i32 1 -; CHECK-NEXT: [[SHR_I49:%.*]] = lshr i32 [[TMP94]], 15 +; CHECK-NEXT: [[TMP93:%.*]] = extractelement <2 x i32> [[TMP85]], i32 1 +; CHECK-NEXT: [[SHR_I49:%.*]] = lshr i32 [[TMP93]], 15 ; CHECK-NEXT: [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537 ; CHECK-NEXT: [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535 -; CHECK-NEXT: [[TMP95:%.*]] = extractelement <2 x i32> [[TMP88]], i32 0 -; CHECK-NEXT: [[TMP96:%.*]] = extractelement <2 x i32> [[TMP88]], i32 1 -; CHECK-NEXT: [[ADD94_2:%.*]] = add i32 
[[TMP95]], [[TMP96]] -; CHECK-NEXT: [[TMP97:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20]], align 1 -; CHECK-NEXT: [[TMP98:%.*]] = zext <2 x i8> [[TMP97]] to <2 x i32> -; CHECK-NEXT: [[TMP99:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_2]], i32 0 -; CHECK-NEXT: [[TMP100:%.*]] = shufflevector <2 x i32> [[TMP99]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP101:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_3]], i32 0 -; CHECK-NEXT: [[TMP102:%.*]] = shufflevector <2 x i32> [[TMP101]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP103:%.*]] = add <2 x i32> [[TMP100]], [[TMP102]] -; CHECK-NEXT: [[TMP104:%.*]] = sub <2 x i32> [[TMP100]], [[TMP102]] -; CHECK-NEXT: [[TMP105:%.*]] = shufflevector <2 x i32> [[TMP103]], <2 x i32> [[TMP104]], <2 x i32> -; CHECK-NEXT: [[TMP106:%.*]] = load <2 x i8>, ptr [[PIX1]], align 1 -; CHECK-NEXT: [[TMP107:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32> -; CHECK-NEXT: [[TMP108:%.*]] = shufflevector <2 x i32> [[TMP107]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP109:%.*]] = insertelement <2 x ptr> [[TMP4]], ptr [[ARRAYIDX22]], i32 1 -; CHECK-NEXT: [[TMP110:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP109]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP111:%.*]] = zext <2 x i8> [[TMP110]] to <2 x i32> -; CHECK-NEXT: [[TMP112:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP2]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP113:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32> -; CHECK-NEXT: [[TMP114:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP5]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP115:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32> -; CHECK-NEXT: [[TMP116:%.*]] = sub <2 x i32> [[TMP113]], [[TMP115]] -; CHECK-NEXT: [[TMP117:%.*]] = shl <2 x i32> [[TMP116]], -; CHECK-NEXT: [[TMP118:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP6]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP119:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32> -; CHECK-NEXT: [[TMP120:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP7]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP121:%.*]] = zext <2 x i8> [[TMP120]] to <2 x i32> -; CHECK-NEXT: [[TMP122:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP8]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP123:%.*]] = zext <2 x i8> [[TMP122]] to <2 x i32> -; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP121]], [[TMP123]] -; CHECK-NEXT: [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], -; CHECK-NEXT: [[TMP126:%.*]] = shufflevector <2 x i32> [[TMP107]], <2 x i32> [[TMP98]], <2 x i32> -; CHECK-NEXT: [[TMP127:%.*]] = sub <2 x i32> [[TMP126]], [[TMP111]] -; CHECK-NEXT: [[TMP128:%.*]] = add <2 x i32> [[TMP117]], [[TMP127]] -; CHECK-NEXT: [[TMP129:%.*]] = shufflevector <2 x i32> [[TMP108]], <2 x i32> [[TMP98]], <2 x i32> -; CHECK-NEXT: [[TMP130:%.*]] = sub <2 x i32> [[TMP129]], [[TMP119]] -; CHECK-NEXT: [[TMP131:%.*]] = add <2 x i32> [[TMP125]], [[TMP130]] -; CHECK-NEXT: [[TMP132:%.*]] = extractelement <2 x i32> [[TMP128]], i32 1 -; CHECK-NEXT: [[TMP133:%.*]] = extractelement <2 x i32> [[TMP131]], i32 1 -; CHECK-NEXT: [[ADD46:%.*]] = add i32 [[TMP133]], [[TMP132]] -; CHECK-NEXT: [[TMP134:%.*]] = sub <2 x i32> [[TMP128]], [[TMP131]] -; CHECK-NEXT: [[TMP135:%.*]] = extractelement <2 x i32> [[TMP128]], i32 0 -; CHECK-NEXT: [[TMP136:%.*]] = extractelement <2 x i32> [[TMP131]], i32 0 -; CHECK-NEXT: [[ADD44:%.*]] = add i32 [[TMP136]], [[TMP135]] 
-; CHECK-NEXT: [[TMP137:%.*]] = lshr <2 x i32> [[TMP108]], -; CHECK-NEXT: [[TMP138:%.*]] = and <2 x i32> [[TMP137]], -; CHECK-NEXT: [[TMP139:%.*]] = mul <2 x i32> [[TMP138]], -; CHECK-NEXT: [[TMP140:%.*]] = extractelement <2 x i32> [[TMP134]], i32 0 -; CHECK-NEXT: [[TMP141:%.*]] = extractelement <2 x i32> [[TMP134]], i32 1 -; CHECK-NEXT: [[SUB59:%.*]] = sub i32 [[TMP140]], [[TMP141]] -; CHECK-NEXT: [[TMP142:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1 -; CHECK-NEXT: [[TMP143:%.*]] = zext <2 x i8> [[TMP142]] to <2 x i32> +; CHECK-NEXT: [[TMP94:%.*]] = extractelement <2 x i32> [[TMP87]], i32 0 +; CHECK-NEXT: [[TMP95:%.*]] = extractelement <2 x i32> [[TMP87]], i32 1 +; CHECK-NEXT: [[ADD94_2:%.*]] = add i32 [[TMP94]], [[TMP95]] +; CHECK-NEXT: [[TMP96:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20]], align 1 +; CHECK-NEXT: [[TMP97:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32> +; CHECK-NEXT: [[TMP98:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_2]], i32 0 +; CHECK-NEXT: [[TMP99:%.*]] = shufflevector <2 x i32> [[TMP98]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP100:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_3]], i32 0 +; CHECK-NEXT: [[TMP101:%.*]] = shufflevector <2 x i32> [[TMP100]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP102:%.*]] = add <2 x i32> [[TMP99]], [[TMP101]] +; CHECK-NEXT: [[TMP103:%.*]] = sub <2 x i32> [[TMP99]], [[TMP101]] +; CHECK-NEXT: [[TMP104:%.*]] = shufflevector <2 x i32> [[TMP102]], <2 x i32> [[TMP103]], <2 x i32> +; CHECK-NEXT: [[TMP105:%.*]] = load <2 x i8>, ptr [[PIX1]], align 1 +; CHECK-NEXT: [[TMP106:%.*]] = zext <2 x i8> [[TMP105]] to <2 x i32> +; CHECK-NEXT: [[TMP107:%.*]] = shufflevector <2 x i32> [[TMP106]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP108:%.*]] = insertelement <2 x ptr> [[TMP4]], ptr [[ARRAYIDX22]], i32 1 +; CHECK-NEXT: [[TMP109:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP108]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP110:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32> +; CHECK-NEXT: [[TMP111:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP2]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP112:%.*]] = zext <2 x i8> [[TMP111]] to <2 x i32> +; CHECK-NEXT: [[TMP113:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP5]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP114:%.*]] = zext <2 x i8> [[TMP113]] to <2 x i32> +; CHECK-NEXT: [[TMP115:%.*]] = sub <2 x i32> [[TMP112]], [[TMP114]] +; CHECK-NEXT: [[TMP116:%.*]] = shl <2 x i32> [[TMP115]], +; CHECK-NEXT: [[TMP117:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP6]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP118:%.*]] = zext <2 x i8> [[TMP117]] to <2 x i32> +; CHECK-NEXT: [[TMP119:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP7]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP120:%.*]] = zext <2 x i8> [[TMP119]] to <2 x i32> +; CHECK-NEXT: [[TMP121:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP8]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP122:%.*]] = zext <2 x i8> [[TMP121]] to <2 x i32> +; CHECK-NEXT: [[TMP123:%.*]] = sub <2 x i32> [[TMP120]], [[TMP122]] +; CHECK-NEXT: [[TMP124:%.*]] = shl <2 x i32> [[TMP123]], +; CHECK-NEXT: [[TMP125:%.*]] = shufflevector <2 x i32> [[TMP106]], <2 x i32> [[TMP97]], <2 x i32> +; CHECK-NEXT: [[TMP126:%.*]] = sub <2 x i32> [[TMP125]], [[TMP110]] +; CHECK-NEXT: [[TMP127:%.*]] = add <2 x i32> [[TMP116]], [[TMP126]] +; CHECK-NEXT: 
[[TMP128:%.*]] = shufflevector <2 x i32> [[TMP107]], <2 x i32> [[TMP97]], <2 x i32> +; CHECK-NEXT: [[TMP129:%.*]] = sub <2 x i32> [[TMP128]], [[TMP118]] +; CHECK-NEXT: [[TMP130:%.*]] = add <2 x i32> [[TMP124]], [[TMP129]] +; CHECK-NEXT: [[TMP131:%.*]] = extractelement <2 x i32> [[TMP127]], i32 1 +; CHECK-NEXT: [[TMP132:%.*]] = extractelement <2 x i32> [[TMP130]], i32 1 +; CHECK-NEXT: [[ADD46:%.*]] = add i32 [[TMP132]], [[TMP131]] +; CHECK-NEXT: [[TMP133:%.*]] = sub <2 x i32> [[TMP127]], [[TMP130]] +; CHECK-NEXT: [[TMP134:%.*]] = extractelement <2 x i32> [[TMP127]], i32 0 +; CHECK-NEXT: [[TMP135:%.*]] = extractelement <2 x i32> [[TMP130]], i32 0 +; CHECK-NEXT: [[ADD44:%.*]] = add i32 [[TMP135]], [[TMP134]] +; CHECK-NEXT: [[TMP136:%.*]] = lshr <2 x i32> [[TMP107]], +; CHECK-NEXT: [[TMP137:%.*]] = and <2 x i32> [[TMP136]], +; CHECK-NEXT: [[TMP138:%.*]] = mul <2 x i32> [[TMP137]], +; CHECK-NEXT: [[TMP139:%.*]] = extractelement <2 x i32> [[TMP133]], i32 0 +; CHECK-NEXT: [[TMP140:%.*]] = extractelement <2 x i32> [[TMP133]], i32 1 +; CHECK-NEXT: [[SUB59:%.*]] = sub i32 [[TMP139]], [[TMP140]] +; CHECK-NEXT: [[TMP141:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1 +; CHECK-NEXT: [[TMP142:%.*]] = zext <2 x i8> [[TMP141]] to <2 x i32> ; CHECK-NEXT: [[ADD_PTR644:%.*]] = getelementptr i8, ptr [[PIX2]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 2 -; CHECK-NEXT: [[TMP144:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR644]], i32 0 -; CHECK-NEXT: [[TMP145:%.*]] = insertelement <2 x ptr> [[TMP144]], ptr [[ARRAYIDX22_1]], i32 1 -; CHECK-NEXT: [[TMP146:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP145]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP147:%.*]] = zext <2 x i8> [[TMP146]] to <2 x i32> -; CHECK-NEXT: [[TMP148:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR3]], i32 0 -; CHECK-NEXT: [[TMP149:%.*]] = shufflevector <2 x ptr> [[TMP148]], <2 x ptr> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP150:%.*]] = getelementptr i8, <2 x ptr> [[TMP149]], <2 x i64> -; CHECK-NEXT: [[TMP151:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP150]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP152:%.*]] = zext <2 x i8> [[TMP151]] to <2 x i32> -; CHECK-NEXT: [[TMP153:%.*]] = shufflevector <2 x ptr> [[TMP145]], <2 x ptr> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP154:%.*]] = getelementptr i8, <2 x ptr> [[TMP153]], <2 x i64> -; CHECK-NEXT: [[TMP155:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP154]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP156:%.*]] = zext <2 x i8> [[TMP155]] to <2 x i32> -; CHECK-NEXT: [[TMP157:%.*]] = sub <2 x i32> [[TMP152]], [[TMP156]] -; CHECK-NEXT: [[TMP158:%.*]] = shl <2 x i32> [[TMP157]], -; CHECK-NEXT: [[TMP159:%.*]] = getelementptr i8, <2 x ptr> [[TMP153]], <2 x i64> -; CHECK-NEXT: [[TMP160:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP159]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP161:%.*]] = zext <2 x i8> [[TMP160]] to <2 x i32> -; CHECK-NEXT: [[TMP162:%.*]] = getelementptr i8, <2 x ptr> [[TMP149]], <2 x i64> -; CHECK-NEXT: [[TMP163:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP162]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP164:%.*]] = zext <2 x i8> [[TMP163]] to <2 x i32> -; CHECK-NEXT: [[TMP165:%.*]] = getelementptr i8, <2 x ptr> [[TMP153]], <2 x i64> -; CHECK-NEXT: [[TMP166:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> 
[[TMP165]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP167:%.*]] = zext <2 x i8> [[TMP166]] to <2 x i32> -; CHECK-NEXT: [[TMP168:%.*]] = sub <2 x i32> [[TMP164]], [[TMP167]] -; CHECK-NEXT: [[TMP169:%.*]] = shl <2 x i32> [[TMP168]], -; CHECK-NEXT: [[TMP170:%.*]] = insertelement <2 x i32> [[TMP143]], i32 [[CONV33_1]], i32 1 -; CHECK-NEXT: [[TMP171:%.*]] = sub <2 x i32> [[TMP170]], [[TMP161]] -; CHECK-NEXT: [[TMP172:%.*]] = add <2 x i32> [[TMP169]], [[TMP171]] -; CHECK-NEXT: [[TMP173:%.*]] = insertelement <2 x i32> [[TMP143]], i32 [[CONV_1]], i32 0 -; CHECK-NEXT: [[TMP174:%.*]] = sub <2 x i32> [[TMP173]], [[TMP147]] -; CHECK-NEXT: [[TMP175:%.*]] = add <2 x i32> [[TMP158]], [[TMP174]] -; CHECK-NEXT: [[TMP176:%.*]] = add <2 x i32> [[TMP172]], [[TMP175]] -; CHECK-NEXT: [[TMP177:%.*]] = sub <2 x i32> [[TMP175]], [[TMP172]] -; CHECK-NEXT: [[TMP178:%.*]] = extractelement <2 x i32> [[TMP176]], i32 0 -; CHECK-NEXT: [[TMP179:%.*]] = extractelement <2 x i32> [[TMP176]], i32 1 -; CHECK-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP178]], [[TMP179]] -; CHECK-NEXT: [[TMP180:%.*]] = shufflevector <2 x i32> [[TMP177]], <2 x i32> [[TMP134]], <2 x i32> -; CHECK-NEXT: [[TMP181:%.*]] = shufflevector <2 x i32> [[TMP177]], <2 x i32> [[TMP134]], <2 x i32> -; CHECK-NEXT: [[TMP182:%.*]] = add <2 x i32> [[TMP180]], [[TMP181]] -; CHECK-NEXT: [[TMP183:%.*]] = extractelement <2 x i32> [[TMP177]], i32 0 -; CHECK-NEXT: [[TMP184:%.*]] = extractelement <2 x i32> [[TMP177]], i32 1 -; CHECK-NEXT: [[SUB59_1:%.*]] = sub i32 [[TMP183]], [[TMP184]] -; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP179]], 15 +; CHECK-NEXT: [[TMP143:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR644]], i32 0 +; CHECK-NEXT: [[TMP144:%.*]] = insertelement <2 x ptr> [[TMP143]], ptr [[ARRAYIDX22_1]], i32 1 +; CHECK-NEXT: [[TMP145:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP144]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP146:%.*]] = zext <2 x i8> [[TMP145]] to <2 x i32> +; CHECK-NEXT: [[TMP147:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR3]], i32 0 +; CHECK-NEXT: [[TMP148:%.*]] = shufflevector <2 x ptr> [[TMP147]], <2 x ptr> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP149:%.*]] = getelementptr i8, <2 x ptr> [[TMP148]], <2 x i64> +; CHECK-NEXT: [[TMP150:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP149]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP151:%.*]] = zext <2 x i8> [[TMP150]] to <2 x i32> +; CHECK-NEXT: [[TMP152:%.*]] = shufflevector <2 x ptr> [[TMP144]], <2 x ptr> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP153:%.*]] = getelementptr i8, <2 x ptr> [[TMP152]], <2 x i64> +; CHECK-NEXT: [[TMP154:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP153]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP155:%.*]] = zext <2 x i8> [[TMP154]] to <2 x i32> +; CHECK-NEXT: [[TMP156:%.*]] = sub <2 x i32> [[TMP151]], [[TMP155]] +; CHECK-NEXT: [[TMP157:%.*]] = shl <2 x i32> [[TMP156]], +; CHECK-NEXT: [[TMP158:%.*]] = getelementptr i8, <2 x ptr> [[TMP152]], <2 x i64> +; CHECK-NEXT: [[TMP159:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP158]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP160:%.*]] = zext <2 x i8> [[TMP159]] to <2 x i32> +; CHECK-NEXT: [[TMP161:%.*]] = getelementptr i8, <2 x ptr> [[TMP148]], <2 x i64> +; CHECK-NEXT: [[TMP162:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP161]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP163:%.*]] = zext <2 x i8> [[TMP162]] to <2 
x i32>
+; CHECK-NEXT: [[TMP164:%.*]] = getelementptr i8, <2 x ptr> [[TMP152]], <2 x i64>
+; CHECK-NEXT: [[TMP165:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP164]], i32 1, <2 x i1> , <2 x i8> poison)
+; CHECK-NEXT: [[TMP166:%.*]] = zext <2 x i8> [[TMP165]] to <2 x i32>
+; CHECK-NEXT: [[TMP167:%.*]] = sub <2 x i32> [[TMP163]], [[TMP166]]
+; CHECK-NEXT: [[TMP168:%.*]] = shl <2 x i32> [[TMP167]],
+; CHECK-NEXT: [[TMP169:%.*]] = insertelement <2 x i32> [[TMP142]], i32 [[CONV33_1]], i32 1
+; CHECK-NEXT: [[TMP170:%.*]] = sub <2 x i32> [[TMP169]], [[TMP160]]
+; CHECK-NEXT: [[TMP171:%.*]] = add <2 x i32> [[TMP168]], [[TMP170]]
+; CHECK-NEXT: [[TMP172:%.*]] = insertelement <2 x i32> [[TMP142]], i32 [[CONV_1]], i32 0
+; CHECK-NEXT: [[TMP173:%.*]] = sub <2 x i32> [[TMP172]], [[TMP146]]
+; CHECK-NEXT: [[TMP174:%.*]] = add <2 x i32> [[TMP157]], [[TMP173]]
+; CHECK-NEXT: [[TMP175:%.*]] = add <2 x i32> [[TMP171]], [[TMP174]]
+; CHECK-NEXT: [[TMP176:%.*]] = sub <2 x i32> [[TMP174]], [[TMP171]]
+; CHECK-NEXT: [[TMP177:%.*]] = extractelement <2 x i32> [[TMP175]], i32 0
+; CHECK-NEXT: [[TMP178:%.*]] = extractelement <2 x i32> [[TMP175]], i32 1
+; CHECK-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP177]], [[TMP178]]
+; CHECK-NEXT: [[TMP179:%.*]] = shufflevector <2 x i32> [[TMP176]], <2 x i32> [[TMP133]], <2 x i32>
+; CHECK-NEXT: [[TMP180:%.*]] = shufflevector <2 x i32> [[TMP176]], <2 x i32> [[TMP133]], <2 x i32>
+; CHECK-NEXT: [[TMP181:%.*]] = add <2 x i32> [[TMP179]], [[TMP180]]
+; CHECK-NEXT: [[TMP182:%.*]] = extractelement <2 x i32> [[TMP176]], i32 0
+; CHECK-NEXT: [[TMP183:%.*]] = extractelement <2 x i32> [[TMP176]], i32 1
+; CHECK-NEXT: [[SUB59_1:%.*]] = sub i32 [[TMP182]], [[TMP183]]
+; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP178]], 15
 ; CHECK-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
 ; CHECK-NEXT: [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535
-; CHECK-NEXT: [[TMP185:%.*]] = lshr <2 x i32> [[TMP143]],
-; CHECK-NEXT: [[TMP186:%.*]] = and <2 x i32> [[TMP185]],
-; CHECK-NEXT: [[TMP187:%.*]] = mul <2 x i32> [[TMP186]],
-; CHECK-NEXT: [[TMP188:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_1]], i32 0
-; CHECK-NEXT: [[TMP189:%.*]] = shufflevector <2 x i32> [[TMP188]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP190:%.*]] = extractelement <2 x i32> [[TMP182]], i32 0
-; CHECK-NEXT: [[TMP191:%.*]] = extractelement <2 x i32> [[TMP182]], i32 1
-; CHECK-NEXT: [[ADD78_1:%.*]] = add i32 [[TMP190]], [[TMP191]]
-; CHECK-NEXT: [[TMP192:%.*]] = shufflevector <2 x i32> [[TMP33]], <2 x i32> [[TMP177]], <2 x i32>
-; CHECK-NEXT: [[TMP193:%.*]] = lshr <2 x i32> [[TMP192]],
-; CHECK-NEXT: [[TMP194:%.*]] = and <2 x i32> [[TMP193]],
-; CHECK-NEXT: [[TMP195:%.*]] = mul <2 x i32> [[TMP194]],
-; CHECK-NEXT: [[TMP196:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_1]], i32 0
-; CHECK-NEXT: [[TMP197:%.*]] = shufflevector <2 x i32> [[TMP196]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP198:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
-; CHECK-NEXT: [[TMP199:%.*]] = shufflevector <2 x i32> [[TMP198]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP200:%.*]] = insertelement <2 x i32> poison, i32 [[ADD44]], i32 0
-; CHECK-NEXT: [[TMP201:%.*]] = shufflevector <2 x i32> [[TMP200]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP202:%.*]] = insertelement <2 x i32> , i32 [[ADD46]], i32 1
-; CHECK-NEXT: [[TMP203:%.*]] = lshr <2 x i32> [[TMP201]], [[TMP202]]
-; CHECK-NEXT: [[TMP204:%.*]] = sub <2 x i32> [[TMP201]], [[TMP202]]
-; CHECK-NEXT: [[TMP205:%.*]] = shufflevector <2 x i32> [[TMP203]], <2 x i32> [[TMP204]], <2 x i32>
-; CHECK-NEXT: [[TMP206:%.*]] = extractelement <2 x i32> [[TMP205]], i32 1
-; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[SUB51_1]], [[TMP206]]
-; CHECK-NEXT: [[TMP207:%.*]] = insertelement <2 x i32> , i32 [[SUB51_1]], i32 1
-; CHECK-NEXT: [[TMP208:%.*]] = and <2 x i32> [[TMP205]], [[TMP207]]
-; CHECK-NEXT: [[TMP209:%.*]] = sub <2 x i32> [[TMP205]], [[TMP207]]
-; CHECK-NEXT: [[TMP210:%.*]] = shufflevector <2 x i32> [[TMP208]], <2 x i32> [[TMP209]], <2 x i32>
-; CHECK-NEXT: [[TMP211:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
-; CHECK-NEXT: [[TMP212:%.*]] = shufflevector <2 x i32> [[TMP211]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP213:%.*]] = add <2 x i32> [[TMP212]], [[TMP199]]
-; CHECK-NEXT: [[TMP214:%.*]] = sub <2 x i32> [[TMP212]], [[TMP199]]
-; CHECK-NEXT: [[TMP215:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> [[TMP214]], <2 x i32>
-; CHECK-NEXT: [[TMP216:%.*]] = insertelement <2 x i32> [[TMP134]], i32 [[CONV_1]], i32 0
-; CHECK-NEXT: [[TMP217:%.*]] = lshr <2 x i32> [[TMP216]],
-; CHECK-NEXT: [[TMP218:%.*]] = and <2 x i32> [[TMP217]],
-; CHECK-NEXT: [[TMP219:%.*]] = mul <2 x i32> [[TMP218]],
-; CHECK-NEXT: [[TMP220:%.*]] = shufflevector <2 x i32> [[TMP88]], <2 x i32> poison, <2 x i32>
-; CHECK-NEXT: [[TMP221:%.*]] = shufflevector <2 x i32> [[TMP220]], <2 x i32> [[TMP182]], <2 x i32>
-; CHECK-NEXT: [[TMP222:%.*]] = shufflevector <2 x i32> [[TMP88]], <2 x i32> [[TMP182]], <2 x i32>
-; CHECK-NEXT: [[TMP223:%.*]] = sub <2 x i32> [[TMP221]], [[TMP222]]
-; CHECK-NEXT: [[TMP224:%.*]] = shufflevector <2 x i32> [[TMP47]], <2 x i32> poison, <2 x i32>
-; CHECK-NEXT: [[TMP225:%.*]] = insertelement <2 x i32> [[TMP224]], i32 [[ADD46]], i32 1
-; CHECK-NEXT: [[TMP226:%.*]] = insertelement <2 x i32> [[TMP47]], i32 [[ADD44]], i32 1
-; CHECK-NEXT: [[TMP227:%.*]] = add <2 x i32> [[TMP225]], [[TMP226]]
-; CHECK-NEXT: [[TMP228:%.*]] = shufflevector <2 x i32> [[TMP80]], <2 x i32> [[TMP176]], <2 x i32>
-; CHECK-NEXT: [[TMP229:%.*]] = shufflevector <2 x i32> [[TMP80]], <2 x i32> [[TMP176]], <2 x i32>
-; CHECK-NEXT: [[TMP230:%.*]] = add <2 x i32> [[TMP228]], [[TMP229]]
-; CHECK-NEXT: [[TMP231:%.*]] = extractelement <2 x i32> [[TMP227]], i32 0
-; CHECK-NEXT: [[TMP232:%.*]] = extractelement <2 x i32> [[TMP230]], i32 0
-; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[TMP232]], [[TMP231]]
-; CHECK-NEXT: [[TMP233:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[ADD46]], i32 1
-; CHECK-NEXT: [[TMP234:%.*]] = lshr <2 x i32> [[TMP233]],
-; CHECK-NEXT: [[TMP235:%.*]] = and <2 x i32> [[TMP234]],
-; CHECK-NEXT: [[TMP236:%.*]] = mul <2 x i32> [[TMP235]],
-; CHECK-NEXT: [[TMP237:%.*]] = extractelement <2 x i32> [[TMP227]], i32 1
-; CHECK-NEXT: [[TMP238:%.*]] = extractelement <2 x i32> [[TMP230]], i32 1
-; CHECK-NEXT: [[ADD78:%.*]] = add i32 [[TMP238]], [[TMP237]]
-; CHECK-NEXT: [[TMP239:%.*]] = sub <2 x i32> [[TMP227]], [[TMP230]]
+; CHECK-NEXT: [[TMP184:%.*]] = lshr <2 x i32> [[TMP142]],
+; CHECK-NEXT: [[TMP185:%.*]] = and <2 x i32> [[TMP184]],
+; CHECK-NEXT: [[TMP186:%.*]] = mul <2 x i32> [[TMP185]],
+; CHECK-NEXT: [[TMP187:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_1]], i32 0
+; CHECK-NEXT: [[TMP188:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP189:%.*]] = extractelement <2 x i32> [[TMP181]], i32 0
+; CHECK-NEXT: [[TMP190:%.*]] = extractelement <2 x i32> [[TMP181]], i32 1
+; CHECK-NEXT: [[ADD78_1:%.*]] = add i32 [[TMP189]], [[TMP190]]
+; CHECK-NEXT: [[TMP191:%.*]] = shufflevector <2 x i32> [[TMP33]], <2 x i32> [[TMP176]], <2 x i32>
+; CHECK-NEXT: [[TMP192:%.*]] = lshr <2 x i32> [[TMP191]],
+; CHECK-NEXT: [[TMP193:%.*]] = and <2 x i32> [[TMP192]],
+; CHECK-NEXT: [[TMP194:%.*]] = mul <2 x i32> [[TMP193]],
+; CHECK-NEXT: [[TMP195:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_1]], i32 0
+; CHECK-NEXT: [[TMP196:%.*]] = shufflevector <2 x i32> [[TMP195]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP197:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0
+; CHECK-NEXT: [[TMP198:%.*]] = shufflevector <2 x i32> [[TMP197]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP199:%.*]] = insertelement <2 x i32> poison, i32 [[ADD44]], i32 0
+; CHECK-NEXT: [[TMP200:%.*]] = shufflevector <2 x i32> [[TMP199]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP201:%.*]] = insertelement <2 x i32> , i32 [[ADD46]], i32 1
+; CHECK-NEXT: [[TMP202:%.*]] = lshr <2 x i32> [[TMP200]], [[TMP201]]
+; CHECK-NEXT: [[TMP203:%.*]] = sub <2 x i32> [[TMP200]], [[TMP201]]
+; CHECK-NEXT: [[TMP204:%.*]] = shufflevector <2 x i32> [[TMP202]], <2 x i32> [[TMP203]], <2 x i32>
+; CHECK-NEXT: [[TMP205:%.*]] = extractelement <2 x i32> [[TMP204]], i32 1
+; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[SUB51_1]], [[TMP205]]
+; CHECK-NEXT: [[TMP206:%.*]] = insertelement <2 x i32> , i32 [[SUB51_1]], i32 1
+; CHECK-NEXT: [[TMP207:%.*]] = and <2 x i32> [[TMP204]], [[TMP206]]
+; CHECK-NEXT: [[TMP208:%.*]] = sub <2 x i32> [[TMP204]], [[TMP206]]
+; CHECK-NEXT: [[TMP209:%.*]] = shufflevector <2 x i32> [[TMP207]], <2 x i32> [[TMP208]], <2 x i32>
+; CHECK-NEXT: [[TMP210:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
+; CHECK-NEXT: [[TMP211:%.*]] = shufflevector <2 x i32> [[TMP210]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP212:%.*]] = add <2 x i32> [[TMP211]], [[TMP198]]
+; CHECK-NEXT: [[TMP213:%.*]] = sub <2 x i32> [[TMP211]], [[TMP198]]
+; CHECK-NEXT: [[TMP214:%.*]] = shufflevector <2 x i32> [[TMP212]], <2 x i32> [[TMP213]], <2 x i32>
+; CHECK-NEXT: [[TMP215:%.*]] = insertelement <2 x i32> [[TMP133]], i32 [[CONV_1]], i32 0
+; CHECK-NEXT: [[TMP216:%.*]] = lshr <2 x i32> [[TMP215]],
+; CHECK-NEXT: [[TMP217:%.*]] = and <2 x i32> [[TMP216]],
+; CHECK-NEXT: [[TMP218:%.*]] = mul <2 x i32> [[TMP217]],
+; CHECK-NEXT: [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP87]], <2 x i32> poison, <2 x i32>
+; CHECK-NEXT: [[TMP220:%.*]] = shufflevector <2 x i32> [[TMP219]], <2 x i32> [[TMP181]], <2 x i32>
+; CHECK-NEXT: [[TMP221:%.*]] = shufflevector <2 x i32> [[TMP87]], <2 x i32> [[TMP181]], <2 x i32>
+; CHECK-NEXT: [[TMP222:%.*]] = sub <2 x i32> [[TMP220]], [[TMP221]]
+; CHECK-NEXT: [[TMP223:%.*]] = shufflevector <2 x i32> [[TMP47]], <2 x i32> poison, <2 x i32>
+; CHECK-NEXT: [[TMP224:%.*]] = insertelement <2 x i32> [[TMP223]], i32 [[ADD46]], i32 1
+; CHECK-NEXT: [[TMP225:%.*]] = insertelement <2 x i32> [[TMP47]], i32 [[ADD44]], i32 1
+; CHECK-NEXT: [[TMP226:%.*]] = add <2 x i32> [[TMP224]], [[TMP225]]
+; CHECK-NEXT: [[TMP227:%.*]] = shufflevector <2 x i32> [[TMP79]], <2 x i32> [[TMP175]], <2 x i32>
+; CHECK-NEXT: [[TMP228:%.*]] = shufflevector <2 x i32> [[TMP79]], <2 x i32> [[TMP175]], <2 x i32>
+; CHECK-NEXT: [[TMP229:%.*]] = add <2 x i32> [[TMP227]], [[TMP228]]
+; CHECK-NEXT: [[TMP230:%.*]] = extractelement <2 x i32> [[TMP226]], i32 0
+; CHECK-NEXT: [[TMP231:%.*]] = extractelement <2 x i32> [[TMP229]], i32 0
+; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[TMP231]], [[TMP230]]
+; CHECK-NEXT: [[TMP232:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[ADD46]], i32 1
+; CHECK-NEXT: [[TMP233:%.*]] = lshr <2 x i32> [[TMP232]],
+; CHECK-NEXT: [[TMP234:%.*]] = and <2 x i32> [[TMP233]],
+; CHECK-NEXT: [[TMP235:%.*]] = mul <2 x i32> [[TMP234]],
+; CHECK-NEXT: [[TMP236:%.*]] = extractelement <2 x i32> [[TMP226]], i32 1
+; CHECK-NEXT: [[TMP237:%.*]] = extractelement <2 x i32> [[TMP229]], i32 1
+; CHECK-NEXT: [[ADD78:%.*]] = add i32 [[TMP237]], [[TMP236]]
+; CHECK-NEXT: [[TMP238:%.*]] = sub <2 x i32> [[TMP226]], [[TMP229]]
 ; CHECK-NEXT: [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]]
 ; CHECK-NEXT: [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]]
-; CHECK-NEXT: [[TMP240:%.*]] = extractelement <2 x i32> [[TMP239]], i32 1
-; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB102]], [[TMP240]]
+; CHECK-NEXT: [[TMP239:%.*]] = extractelement <2 x i32> [[TMP238]], i32 1
+; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB102]], [[TMP239]]
 ; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]]
-; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP93]]
+; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP92]]
 ; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]]
-; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP94]]
+; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP93]]
 ; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
-; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP179]]
+; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP178]]
 ; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
 ; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
-; CHECK-NEXT: [[TMP241:%.*]] = shufflevector <2 x i32> [[TMP223]], <2 x i32> poison, <2 x i32>
-; CHECK-NEXT: [[TMP242:%.*]] = insertelement <2 x i32> [[TMP241]], i32 [[SUB102]], i32 1
-; CHECK-NEXT: [[TMP243:%.*]] = add <2 x i32> [[TMP239]], [[TMP242]]
-; CHECK-NEXT: [[TMP244:%.*]] = sub <2 x i32> [[TMP239]], [[TMP242]]
-; CHECK-NEXT: [[TMP245:%.*]] = shufflevector <2 x i32> [[TMP243]], <2 x i32> [[TMP244]], <2 x i32>
-; CHECK-NEXT: [[TMP246:%.*]] = add <2 x i32> [[TMP236]], [[TMP245]]
-; CHECK-NEXT: [[TMP247:%.*]] = xor <2 x i32> [[TMP246]], [[TMP233]]
-; CHECK-NEXT: [[TMP248:%.*]] = extractelement <2 x i32> [[TMP247]], i32 1
-; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[TMP248]]
-; CHECK-NEXT: [[TMP249:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_1]], i32 0
-; CHECK-NEXT: [[TMP250:%.*]] = shufflevector <2 x i32> [[TMP249]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP251:%.*]] = add <2 x i32> [[TMP197]], [[TMP250]]
-; CHECK-NEXT: [[TMP252:%.*]] = sub <2 x i32> [[TMP197]], [[TMP250]]
-; CHECK-NEXT: [[TMP253:%.*]] = shufflevector <2 x i32> [[TMP251]], <2 x i32> [[TMP252]], <2 x i32>
-; CHECK-NEXT: [[TMP254:%.*]] = add <2 x i32> [[TMP195]], [[TMP253]]
-; CHECK-NEXT: [[TMP255:%.*]] = xor <2 x i32> [[TMP254]], [[TMP192]]
-; CHECK-NEXT: [[TMP256:%.*]] = extractelement <2 x i32> [[TMP247]], i32 0
-; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[TMP256]], [[ADD113]]
-; CHECK-NEXT: [[TMP257:%.*]] = extractelement <2 x i32> [[TMP255]], i32 0
-; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP257]]
-; CHECK-NEXT: [[TMP258:%.*]] = extractelement <2 x i32> [[TMP255]], i32 1
-; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP258]]
-; CHECK-NEXT: [[TMP259:%.*]] = shufflevector <2 x i32> [[TMP210]], <2 x i32> poison, <2 x i32>
-; CHECK-NEXT: [[TMP260:%.*]] = shufflevector <2 x i32> [[TMP259]], <2 x i32> [[TMP239]], <2 x i32>
-; CHECK-NEXT: [[TMP261:%.*]] = add <2 x i32> [[TMP223]], [[TMP260]]
-; CHECK-NEXT: [[TMP262:%.*]] = sub <2 x i32> [[TMP223]], [[TMP260]]
-; CHECK-NEXT: [[TMP263:%.*]] = shufflevector <2 x i32> [[TMP261]], <2 x i32> [[TMP262]], <2 x i32>
-; CHECK-NEXT: [[TMP264:%.*]] = add <2 x i32> [[TMP219]], [[TMP263]]
-; CHECK-NEXT: [[TMP265:%.*]] = xor <2 x i32> [[TMP264]], [[TMP216]]
-; CHECK-NEXT: [[TMP266:%.*]] = extractelement <2 x i32> [[TMP265]], i32 1
-; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[TMP266]]
-; CHECK-NEXT: [[TMP267:%.*]] = shufflevector <2 x i32> , <2 x i32> [[TMP223]], <2 x i32>
-; CHECK-NEXT: [[TMP268:%.*]] = mul <2 x i32> [[TMP210]], [[TMP267]]
-; CHECK-NEXT: [[TMP269:%.*]] = sub <2 x i32> [[TMP210]], [[TMP267]]
-; CHECK-NEXT: [[TMP270:%.*]] = shufflevector <2 x i32> [[TMP268]], <2 x i32> [[TMP269]], <2 x i32>
-; CHECK-NEXT: [[TMP271:%.*]] = add <2 x i32> [[TMP187]], [[TMP215]]
-; CHECK-NEXT: [[TMP272:%.*]] = xor <2 x i32> [[TMP271]], [[TMP143]]
-; CHECK-NEXT: [[TMP273:%.*]] = extractelement <2 x i32> [[TMP270]], i32 0
-; CHECK-NEXT: [[TMP274:%.*]] = extractelement <2 x i32> [[TMP270]], i32 1
-; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[TMP273]], [[TMP274]]
+; CHECK-NEXT: [[TMP240:%.*]] = shufflevector <2 x i32> [[TMP222]], <2 x i32> poison, <2 x i32>
+; CHECK-NEXT: [[TMP241:%.*]] = insertelement <2 x i32> [[TMP240]], i32 [[SUB102]], i32 1
+; CHECK-NEXT: [[TMP242:%.*]] = add <2 x i32> [[TMP238]], [[TMP241]]
+; CHECK-NEXT: [[TMP243:%.*]] = sub <2 x i32> [[TMP238]], [[TMP241]]
+; CHECK-NEXT: [[TMP244:%.*]] = shufflevector <2 x i32> [[TMP242]], <2 x i32> [[TMP243]], <2 x i32>
+; CHECK-NEXT: [[TMP245:%.*]] = add <2 x i32> [[TMP235]], [[TMP244]]
+; CHECK-NEXT: [[TMP246:%.*]] = xor <2 x i32> [[TMP245]], [[TMP232]]
+; CHECK-NEXT: [[TMP247:%.*]] = extractelement <2 x i32> [[TMP246]], i32 1
+; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[TMP247]]
+; CHECK-NEXT: [[TMP248:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_1]], i32 0
+; CHECK-NEXT: [[TMP249:%.*]] = shufflevector <2 x i32> [[TMP248]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP250:%.*]] = add <2 x i32> [[TMP196]], [[TMP249]]
+; CHECK-NEXT: [[TMP251:%.*]] = sub <2 x i32> [[TMP196]], [[TMP249]]
+; CHECK-NEXT: [[TMP252:%.*]] = shufflevector <2 x i32> [[TMP250]], <2 x i32> [[TMP251]], <2 x i32>
+; CHECK-NEXT: [[TMP253:%.*]] = add <2 x i32> [[TMP194]], [[TMP252]]
+; CHECK-NEXT: [[TMP254:%.*]] = xor <2 x i32> [[TMP253]], [[TMP191]]
+; CHECK-NEXT: [[TMP255:%.*]] = extractelement <2 x i32> [[TMP246]], i32 0
+; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[TMP255]], [[ADD113]]
+; CHECK-NEXT: [[TMP256:%.*]] = extractelement <2 x i32> [[TMP254]], i32 0
+; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP256]]
+; CHECK-NEXT: [[TMP257:%.*]] = extractelement <2 x i32> [[TMP254]], i32 1
+; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP257]]
+; CHECK-NEXT: [[TMP258:%.*]] = shufflevector <2 x i32> [[TMP209]], <2 x i32> poison, <2 x i32>
+; CHECK-NEXT: [[TMP259:%.*]] = shufflevector <2 x i32> [[TMP258]], <2 x i32> [[TMP238]], <2 x i32>
+; CHECK-NEXT: [[TMP260:%.*]] = add <2 x i32> [[TMP222]], [[TMP259]]
+; CHECK-NEXT: [[TMP261:%.*]] = sub <2 x i32> [[TMP222]], [[TMP259]]
+; CHECK-NEXT: [[TMP262:%.*]] = shufflevector <2 x i32> [[TMP260]], <2 x i32> [[TMP261]], <2 x i32>
+; CHECK-NEXT: [[TMP263:%.*]] = add <2 x i32> [[TMP218]], [[TMP262]]
+; CHECK-NEXT: [[TMP264:%.*]] = xor <2 x i32> [[TMP263]], [[TMP215]]
+; CHECK-NEXT: [[TMP265:%.*]] = extractelement <2 x i32> [[TMP264]], i32 1
+; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[TMP265]]
+; CHECK-NEXT: [[TMP266:%.*]] = shufflevector <2 x i32> , <2 x i32> [[TMP222]], <2 x i32>
+; CHECK-NEXT: [[TMP267:%.*]] = mul <2 x i32> [[TMP209]], [[TMP266]]
+; CHECK-NEXT: [[TMP268:%.*]] = sub <2 x i32> [[TMP209]], [[TMP266]]
+; CHECK-NEXT: [[TMP269:%.*]] = shufflevector <2 x i32> [[TMP267]], <2 x i32> [[TMP268]], <2 x i32>
+; CHECK-NEXT: [[TMP270:%.*]] = add <2 x i32> [[TMP186]], [[TMP214]]
+; CHECK-NEXT: [[TMP271:%.*]] = xor <2 x i32> [[TMP270]], [[TMP142]]
+; CHECK-NEXT: [[TMP272:%.*]] = extractelement <2 x i32> [[TMP269]], i32 0
+; CHECK-NEXT: [[TMP273:%.*]] = extractelement <2 x i32> [[TMP269]], i32 1
+; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[TMP272]], [[TMP273]]
 ; CHECK-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[ADD44]]
-; CHECK-NEXT: [[TMP275:%.*]] = extractelement <2 x i32> [[TMP265]], i32 0
-; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[TMP275]], [[ADD113_1]]
-; CHECK-NEXT: [[TMP276:%.*]] = extractelement <2 x i32> [[TMP272]], i32 0
-; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP276]]
-; CHECK-NEXT: [[TMP277:%.*]] = extractelement <2 x i32> [[TMP272]], i32 1
-; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP277]]
+; CHECK-NEXT: [[TMP274:%.*]] = extractelement <2 x i32> [[TMP264]], i32 0
+; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[TMP274]], [[ADD113_1]]
+; CHECK-NEXT: [[TMP275:%.*]] = extractelement <2 x i32> [[TMP271]], i32 0
+; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP275]]
+; CHECK-NEXT: [[TMP276:%.*]] = extractelement <2 x i32> [[TMP271]], i32 1
+; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP276]]
 ; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; CHECK-NEXT: [[TMP278:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59]], i32 0
-; CHECK-NEXT: [[TMP279:%.*]] = shufflevector <2 x i32> [[TMP278]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP280:%.*]] = add <2 x i32> [[TMP279]], [[TMP189]]
-; CHECK-NEXT: [[TMP281:%.*]] = sub <2 x i32> [[TMP279]], [[TMP189]]
-; CHECK-NEXT: [[TMP282:%.*]] = shufflevector <2 x i32> [[TMP280]], <2 x i32> [[TMP281]], <2 x i32>
-; CHECK-NEXT: [[TMP283:%.*]] = add <2 x i32> [[TMP105]], [[TMP282]]
-; CHECK-NEXT: [[TMP284:%.*]] = sub <2 x i32> [[TMP282]], [[TMP105]]
-; CHECK-NEXT: [[TMP285:%.*]] = add <2 x i32> [[TMP139]], [[TMP283]]
-; CHECK-NEXT: [[TMP286:%.*]] = xor <2 x i32> [[TMP285]], [[TMP108]]
-; CHECK-NEXT: [[TMP287:%.*]] = lshr <2 x i32> [[TMP98]],
-; CHECK-NEXT: [[TMP288:%.*]] = and <2 x i32> [[TMP287]],
-; CHECK-NEXT: [[TMP289:%.*]] = mul <2 x i32> [[TMP288]],
-; CHECK-NEXT: [[TMP290:%.*]] = add <2 x i32> [[TMP289]], [[TMP284]]
-; CHECK-NEXT: [[TMP291:%.*]] = xor <2 x i32> [[TMP290]], [[TMP98]]
-; CHECK-NEXT: [[TMP292:%.*]] = extractelement <2 x i32> [[TMP286]], i32 1
-; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[TMP292]], [[ADD113_2]]
-; CHECK-NEXT: [[TMP293:%.*]] = extractelement <2 x i32> [[TMP286]], i32 0
-; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP293]]
-; CHECK-NEXT: [[TMP294:%.*]] = extractelement <2 x i32> [[TMP291]], i32 0
-; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP294]]
-; CHECK-NEXT: [[TMP295:%.*]] = extractelement <2 x i32> [[TMP291]], i32 1
-; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[TMP295]]
+; CHECK-NEXT: [[TMP277:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59]], i32 0
+; CHECK-NEXT: [[TMP278:%.*]] = shufflevector <2 x i32> [[TMP277]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP279:%.*]] = add <2 x i32> [[TMP278]], [[TMP188]]
+; CHECK-NEXT: [[TMP280:%.*]] = sub <2 x i32> [[TMP278]], [[TMP188]]
+; CHECK-NEXT: [[TMP281:%.*]] = shufflevector <2 x i32> [[TMP279]], <2 x i32> [[TMP280]], <2 x i32>
+; CHECK-NEXT: [[TMP282:%.*]] = add <2 x i32> [[TMP104]], [[TMP281]]
+; CHECK-NEXT: [[TMP283:%.*]] = sub <2 x i32> [[TMP281]], [[TMP104]]
+; CHECK-NEXT: [[TMP284:%.*]] = add <2 x i32> [[TMP138]], [[TMP282]]
+; CHECK-NEXT: [[TMP285:%.*]] = xor <2 x i32> [[TMP284]], [[TMP107]]
+; CHECK-NEXT: [[TMP286:%.*]] = lshr <2 x i32> [[TMP97]],
+; CHECK-NEXT: [[TMP287:%.*]] = and <2 x i32> [[TMP286]],
+; CHECK-NEXT: [[TMP288:%.*]] = mul <2 x i32> [[TMP287]],
+; CHECK-NEXT: [[TMP289:%.*]] = add <2 x i32> [[TMP288]], [[TMP283]]
+; CHECK-NEXT: [[TMP290:%.*]] = xor <2 x i32> [[TMP289]], [[TMP97]]
+; CHECK-NEXT: [[TMP291:%.*]] = extractelement <2 x i32> [[TMP285]], i32 1
+; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[TMP291]], [[ADD113_2]]
+; CHECK-NEXT: [[TMP292:%.*]] = extractelement <2 x i32> [[TMP285]], i32 0
+; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP292]]
+; CHECK-NEXT: [[TMP293:%.*]] = extractelement <2 x i32> [[TMP290]], i32 0
+; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP293]]
+; CHECK-NEXT: [[TMP294:%.*]] = extractelement <2 x i32> [[TMP290]], i32 1
+; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[TMP294]]
 ; CHECK-NEXT: ret i32 [[ADD113_3]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
index a4cc311d12a21..4b0b41970bbb4 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
@@ -5,61 +5,12 @@ define void @test(ptr %p, ptr noalias %s) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 0
-; CHECK-NEXT: [[I:%.*]] = load float, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 30
-; CHECK-NEXT: [[I1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fsub fast float [[I1]], [[I]]
 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
-; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 4
-; CHECK-NEXT: [[I2:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 26
-; CHECK-NEXT: [[I3:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7:%.*]] = fsub fast float [[I3]], [[I2]]
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[S]], i64 1
-; CHECK-NEXT: store float [[ADD7]], ptr [[ARRAYIDX9]], align 4
-; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 8
-; CHECK-NEXT: [[I4:%.*]] = load float, ptr [[ARRAYIDX11]], align 4
-; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 22
-; CHECK-NEXT: [[I5:%.*]] = load float, ptr [[ARRAYIDX13]], align 4
-; CHECK-NEXT: [[ADD14:%.*]] = fsub fast float [[I5]], [[I4]]
-; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[S]], i64 2
-; CHECK-NEXT: store float [[ADD14]], ptr [[ARRAYIDX16]], align 4
-; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 12
-; CHECK-NEXT: [[I6:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
-; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 18
-; CHECK-NEXT: [[I7:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
-; CHECK-NEXT: [[ADD21:%.*]] = fsub fast float [[I7]], [[I6]]
-; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds float, ptr [[S]], i64 3
-; CHECK-NEXT: store float [[ADD21]], ptr [[ARRAYIDX23]], align 4
-; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 16
-; CHECK-NEXT: [[I8:%.*]] = load float, ptr [[ARRAYIDX25]], align 4
-; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 14
-; CHECK-NEXT: [[I9:%.*]] = load float, ptr [[ARRAYIDX27]], align 4
-; CHECK-NEXT: [[ADD28:%.*]] = fsub fast float [[I9]], [[I8]]
-; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, ptr [[S]], i64 4
-; CHECK-NEXT: store float [[ADD28]], ptr [[ARRAYIDX30]], align 4
-; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 20
-; CHECK-NEXT: [[I10:%.*]] = load float, ptr [[ARRAYIDX32]], align 4
-; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 10
-; CHECK-NEXT: [[I11:%.*]] = load float, ptr [[ARRAYIDX34]], align 4
-; CHECK-NEXT: [[ADD35:%.*]] = fsub fast float [[I11]], [[I10]]
-; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds float, ptr [[S]], i64 5
-; CHECK-NEXT: store float [[ADD35]], ptr [[ARRAYIDX37]], align 4
-; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 24
-; CHECK-NEXT: [[I12:%.*]] = load float, ptr [[ARRAYIDX39]], align 4
-; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 6
-; CHECK-NEXT: [[I13:%.*]] = load float, ptr [[ARRAYIDX41]], align 4
-; CHECK-NEXT: [[ADD42:%.*]] = fsub fast float [[I13]], [[I12]]
-; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds float, ptr [[S]], i64 6
-; CHECK-NEXT: store float [[ADD42]], ptr [[ARRAYIDX44]], align 4
-; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 28
-; CHECK-NEXT: [[I14:%.*]] = load float, ptr [[ARRAYIDX46]], align 4
-; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 2
-; CHECK-NEXT: [[I15:%.*]] = load float, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT: [[ADD49:%.*]] = fsub fast float [[I15]], [[I14]]
-; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds float, ptr [[S]], i64 7
-; CHECK-NEXT: store float [[ADD49]], ptr [[ARRAYIDX51]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> , i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 -16, <8 x i1> , i32 8)
+; CHECK-NEXT: [[TMP2:%.*]] = fsub fast <8 x float> [[TMP1]], [[TMP0]]
+; CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -262,67 +213,40 @@ define void @test2(ptr %p, ptr noalias %s, i32 %stride) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[STR:%.*]] = zext i32 [[STRIDE:%.*]] to i64
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 2
-; CHECK-NEXT: [[I:%.*]] = load float, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT: [[ST6:%.*]] = mul i64 [[STR]], 7
 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST6]]
 ; CHECK-NEXT: [[I1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fsub fast float [[I1]], [[I]]
 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
-; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 6
-; CHECK-NEXT: [[I2:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
 ; CHECK-NEXT: [[ST5:%.*]] = mul i64 [[STR]], 6
 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST5]]
 ; CHECK-NEXT: [[I3:%.*]] = load float, ptr [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7:%.*]] = fsub fast float [[I3]], [[I2]]
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[S]], i64 1
-; CHECK-NEXT: store float [[ADD7]], ptr [[ARRAYIDX9]], align 4
-; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 10
-; CHECK-NEXT: [[I4:%.*]] = load float, ptr [[ARRAYIDX11]], align 4
 ; CHECK-NEXT: [[ST4:%.*]] = mul i64 [[STR]], 5
 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST4]]
 ; CHECK-NEXT: [[I5:%.*]] = load float, ptr [[ARRAYIDX13]], align 4
-; CHECK-NEXT: [[ADD14:%.*]] = fsub fast float [[I5]], [[I4]]
-; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[S]], i64 2
-; CHECK-NEXT: store float [[ADD14]], ptr [[ARRAYIDX16]], align 4
-; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 14
-; CHECK-NEXT: [[I6:%.*]] = load float, ptr [[ARRAYIDX18]], align 4
 ; CHECK-NEXT: [[ST3:%.*]] = mul i64 [[STR]], 4
 ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST3]]
 ; CHECK-NEXT: [[I7:%.*]] = load float, ptr [[ARRAYIDX20]], align 4
-; CHECK-NEXT: [[ADD21:%.*]] = fsub fast float [[I7]], [[I6]]
-; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds float, ptr [[S]], i64 3
-; CHECK-NEXT: store float [[ADD21]], ptr [[ARRAYIDX23]], align 4
-; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 18
 ; CHECK-NEXT: [[ST2:%.*]] = mul i64 [[STR]], 3
-; CHECK-NEXT: [[I8:%.*]] = load float, ptr [[ARRAYIDX25]], align 4
 ; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST2]]
 ; CHECK-NEXT: [[I9:%.*]] = load float, ptr [[ARRAYIDX27]], align 4
-; CHECK-NEXT: [[ADD28:%.*]] = fsub fast float [[I9]], [[I8]]
-; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, ptr [[S]], i64 4
-; CHECK-NEXT: store float [[ADD28]], ptr [[ARRAYIDX30]], align 4
-; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 22
-; CHECK-NEXT: [[I10:%.*]] = load float, ptr [[ARRAYIDX32]], align 4
 ; CHECK-NEXT: [[ST1:%.*]] = mul i64 [[STR]], 2
 ; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST1]]
 ; CHECK-NEXT: [[I11:%.*]] = load float, ptr [[ARRAYIDX34]], align 4
-; CHECK-NEXT: [[ADD35:%.*]] = fsub fast float [[I11]], [[I10]]
-; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds float, ptr [[S]], i64 5
-; CHECK-NEXT: store float [[ADD35]], ptr [[ARRAYIDX37]], align 4
-; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 26
-; CHECK-NEXT: [[I12:%.*]] = load float, ptr [[ARRAYIDX39]], align 4
 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[STR]]
 ; CHECK-NEXT: [[I13:%.*]] = load float, ptr [[ARRAYIDX41]], align 4
-; CHECK-NEXT: [[ADD42:%.*]] = fsub fast float [[I13]], [[I12]]
-; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds float, ptr [[S]], i64 6
-; CHECK-NEXT: store float [[ADD42]], ptr [[ARRAYIDX44]], align 4
-; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 30
-; CHECK-NEXT: [[I14:%.*]] = load float, ptr [[ARRAYIDX46]], align 4
 ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 0
 ; CHECK-NEXT: [[I15:%.*]] = load float, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT: [[ADD49:%.*]] = fsub fast float [[I15]], [[I14]]
-; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds float, ptr [[S]], i64 7
-; CHECK-NEXT: store float [[ADD49]], ptr [[ARRAYIDX51]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> , i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[I1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float> [[TMP1]], float [[I3]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[I5]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[I7]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[I9]], i32 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[I11]], i32 5
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[I13]], i32 6
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[I15]], i32 7
+; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <8 x float> [[TMP8]], [[TMP0]]
+; CHECK-NEXT: store <8 x float> [[TMP9]], ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -397,27 +321,12 @@ define void @test3(ptr %p, ptr noalias %s) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 0
 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 4
-; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 8
-; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 12
-; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 16
-; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 20
-; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 24
-; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 28
 ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 23
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARRAYIDX]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> [[TMP0]], ptr [[ARRAYIDX4]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x ptr> [[TMP1]], ptr [[ARRAYIDX11]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[ARRAYIDX18]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> [[TMP3]], ptr [[ARRAYIDX25]], i32 4
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x ptr> [[TMP4]], ptr [[ARRAYIDX32]], i32 5
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x ptr> [[TMP5]], ptr [[ARRAYIDX39]], i32 6
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x ptr> [[TMP6]], ptr [[ARRAYIDX46]], i32 7
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP7]], i32 4, <8 x i1> , <8 x float> poison)
-; CHECK-NEXT: [[TMP9:%.*]] = load <8 x float>, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32>
-; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <8 x float> [[TMP10]], [[TMP8]]
-; CHECK-NEXT: store <8 x float> [[TMP11]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> , i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX48]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <8 x float> [[TMP2]], [[TMP0]]
+; CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
index 5aba9ea115a4b..ec152c707eec6 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
@@ -8,7 +8,7 @@ define i16 @test() {
 ; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[PPREV_058_I:%.*]] = getelementptr [[S:%.*]], ptr null, i64 -1
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> , ptr [[PPREV_058_I]], i32 0
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> , ptr [[PPREV_058_I]], i32 1
 ; CHECK-NEXT: br label [[WHILE_BODY_I:%.*]]
 ; CHECK: while.body.i:
 ; CHECK-NEXT: [[TMP1:%.*]] = phi i16 [ 0, [[WHILE_BODY_I]] ], [ 0, [[ENTRY:%.*]] ]
@@ -17,7 +17,7 @@ define i16 @test() {
 ; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> [[TMP3]], i32 2, <2 x i1> , <2 x i16> poison)
 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i16> [[TMP4]], i32 0
 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i16> [[TMP4]], i32 1
-; CHECK-NEXT: [[CMP_I178:%.*]] = icmp ult i16 [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[CMP_I178:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
 ; CHECK-NEXT: br label [[WHILE_BODY_I]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll
index 8f2c72bb4c685..8ab57cc73e646 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll
@@ -5,14 +5,11 @@ define i32 @sum_of_abs(ptr noalias %a, ptr noalias %b) {
 ; CHECK-LABEL: define i32 @sum_of_abs
 ; CHECK-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[A]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, <8 x ptr> [[TMP1]], <8 x i64>
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> [[TMP2]], i32 1, <8 x i1> , <8 x i8> poison)
-; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[TMP3]], i1 false)
-; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i8> [[TMP4]] to <8 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]])
-; CHECK-NEXT: ret i32 [[TMP6]]
+; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr align 1 [[A]], i64 64, <8 x i1> , i32 8)
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[TMP0]], i1 false)
+; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i8> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
+; CHECK-NEXT: ret i32 [[TMP3]]
 ;
 entry:
 %0 = load i8, ptr %a, align 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll
index 96d4c307f1c67..9e43cefef2801 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll
@@ -30,7 +30,7 @@ define void @test() {
 ; CHECK-SLP-THRESHOLD: bb:
 ; CHECK-SLP-THRESHOLD-NEXT: [[TMP0:%.*]] = insertelement <4 x ptr> poison, ptr [[COND_IN_V]], i32 0
 ; CHECK-SLP-THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; CHECK-SLP-THRESHOLD-NEXT: [[TMP2:%.*]] = getelementptr i64, <4 x ptr> [[TMP1]], <4 x i64>
+; CHECK-SLP-THRESHOLD-NEXT: [[TMP2:%.*]] = getelementptr i64, <4 x ptr> [[TMP1]], <4 x i64>
 ; CHECK-SLP-THRESHOLD-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP2]], i32 8, <4 x i1> , <4 x i64> poison)
 ; CHECK-SLP-THRESHOLD-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[TMP3]], zeroinitializer
 ; CHECK-SLP-THRESHOLD-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
index 1add732d32e85..3bc6e64606e39 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
@@ -7,7 +7,7 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ADDR:%.*]], i32 0
 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[TMP1]], <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[TMP1]], <8 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> , <8 x i32> poison)
 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0
 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP4]], <8 x ptr> poison, <8 x i32> zeroinitializer