From 92950afd39034c0184a3c807f8062e0053eead5c Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 1 Feb 2024 17:22:34 +0000 Subject: [PATCH 1/2] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20in?= =?UTF-8?q?itial=20version?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.5 --- .../llvm/Analysis/TargetTransformInfo.h | 34 ++ .../llvm/Analysis/TargetTransformInfoImpl.h | 13 + llvm/lib/Analysis/TargetTransformInfo.cpp | 14 + .../Target/RISCV/RISCVTargetTransformInfo.cpp | 23 + .../Target/RISCV/RISCVTargetTransformInfo.h | 23 + .../Transforms/Vectorize/SLPVectorizer.cpp | 397 ++++++++++++------ .../SLPVectorizer/RISCV/complex-loads.ll | 132 +++--- .../RISCV/strided-loads-vectorized.ll | 209 +-------- .../strided-loads-with-external-use-ptr.ll | 4 +- .../SLPVectorizer/RISCV/strided-loads.ll | 13 +- .../X86/gep-nodes-with-non-gep-inst.ll | 2 +- .../X86/remark_gather-load-redux-cost.ll | 2 +- 12 files changed, 478 insertions(+), 388 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 3b615bc700bbb..b0b6dab03fa38 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -781,6 +781,9 @@ class TargetTransformInfo { /// Return true if the target supports masked expand load. bool isLegalMaskedExpandLoad(Type *DataType) const; + /// Return true if the target supports strided load. + bool isLegalStridedLoad(Type *DataType, Align Alignment) const; + /// Return true if this is an alternating opcode pattern that can be lowered /// to a single instruction on the target. In X86 this is for the addsub /// instruction which corrsponds to a Shuffle + Fadd + FSub pattern in IR. 
@@ -1412,6 +1415,20 @@ class TargetTransformInfo { Align Alignment, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, const Instruction *I = nullptr) const; + /// \return The cost of strided memory operations. + /// \p Opcode - is a type of memory access Load or Store + /// \p DataTy - a vector type of the data to be loaded or stored + /// \p Ptr - pointer [or vector of pointers] - address[es] in memory + /// \p VariableMask - true when the memory access is predicated with a mask + /// that is not a compile-time constant + /// \p Alignment - alignment of single element + /// \p I - the optional original context instruction, if one exists, e.g. the + /// load/store to transform or the call to the gather/scatter intrinsic + InstructionCost getStridedMemoryOpCost( + unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, + Align Alignment, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, + const Instruction *I = nullptr) const; + /// \return The cost of the interleaved memory operation. /// \p Opcode is the memory operation code /// \p VecTy is the vector type of the interleaved access. 
@@ -1848,6 +1865,7 @@ class TargetTransformInfo::Concept { Align Alignment) = 0; virtual bool isLegalMaskedCompressStore(Type *DataType) = 0; virtual bool isLegalMaskedExpandLoad(Type *DataType) = 0; + virtual bool isLegalStridedLoad(Type *DataType, Align Alignment) = 0; virtual bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const = 0; @@ -2023,6 +2041,11 @@ class TargetTransformInfo::Concept { bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I = nullptr) = 0; + virtual InstructionCost + getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, + bool VariableMask, Align Alignment, + TTI::TargetCostKind CostKind, + const Instruction *I = nullptr) = 0; virtual InstructionCost getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, @@ -2341,6 +2364,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { bool isLegalMaskedExpandLoad(Type *DataType) override { return Impl.isLegalMaskedExpandLoad(DataType); } + bool isLegalStridedLoad(Type *DataType, Align Alignment) override { + return Impl.isLegalStridedLoad(DataType, Alignment); + } bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const override { return Impl.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask); @@ -2671,6 +2697,14 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, Alignment, CostKind, I); } + InstructionCost + getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, + bool VariableMask, Align Alignment, + TTI::TargetCostKind CostKind, + const Instruction *I = nullptr) override { + return Impl.getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask, + Alignment, CostKind, I); + } InstructionCost getInterleavedMemoryOpCost( unsigned Opcode, Type 
*VecTy, unsigned Factor, ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 9958b4daa6ed8..2a7e7b364ac40 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -304,6 +304,10 @@ class TargetTransformInfoImplBase { bool isLegalMaskedExpandLoad(Type *DataType) const { return false; } + bool isLegalStridedLoad(Type *DataType, Align Alignment) const { + return false; + } + bool enableOrderedReductions() const { return false; } bool hasDivRemOp(Type *DataType, bool IsSigned) const { return false; } @@ -687,6 +691,15 @@ class TargetTransformInfoImplBase { return 1; } + InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, + const Value *Ptr, bool VariableMask, + Align Alignment, + TTI::TargetCostKind CostKind, + const Instruction *I = nullptr) const { + return CostKind == TTI::TCK_RecipThroughput ? 
TTI::TCC_Expensive + : TTI::TCC_Basic; + } + unsigned getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 8902dde37cbca..b86397ae7d267 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -500,6 +500,11 @@ bool TargetTransformInfo::isLegalMaskedExpandLoad(Type *DataType) const { return TTIImpl->isLegalMaskedExpandLoad(DataType); } +bool TargetTransformInfo::isLegalStridedLoad(Type *DataType, + Align Alignment) const { + return TTIImpl->isLegalStridedLoad(DataType, Alignment); +} + bool TargetTransformInfo::enableOrderedReductions() const { return TTIImpl->enableOrderedReductions(); } @@ -1041,6 +1046,15 @@ InstructionCost TargetTransformInfo::getGatherScatterOpCost( return Cost; } +InstructionCost TargetTransformInfo::getStridedMemoryOpCost( + unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, + Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const { + InstructionCost Cost = TTIImpl->getStridedMemoryOpCost( + Opcode, DataTy, Ptr, VariableMask, Alignment, CostKind, I); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; +} + InstructionCost TargetTransformInfo::getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index fe1cdb2dfa423..9cec8ee4cb7f2 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -658,6 +658,29 @@ InstructionCost RISCVTTIImpl::getGatherScatterOpCost( return NumLoads * MemOpCost; } +InstructionCost 
RISCVTTIImpl::getStridedMemoryOpCost( + unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, + Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { + if (CostKind != TTI::TCK_RecipThroughput) + return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask, + Alignment, CostKind, I); + + if ((Opcode == Instruction::Load && !isLegalStridedLoad(DataTy, Alignment)) || + Opcode != Instruction::Load) + return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask, + Alignment, CostKind, I); + + // Cost is proportional to the number of memory operations implied. For + // scalable vectors, we use an estimate on that number since we don't + // know exactly what VL will be. + auto &VTy = *cast<VectorType>(DataTy); + InstructionCost MemOpCost = + getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind, + {TTI::OK_AnyValue, TTI::OP_None}, I); + unsigned NumLoads = getEstimatedVLFor(&VTy); + return NumLoads * MemOpCost; +} + // Currently, these represent both throughput and codesize costs // for the respective intrinsics. 
The costs in this table are simply // instruction counts with the following adjustments made: diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 0747a778fe9a2..742b1aadf00bd 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -143,6 +143,12 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> { TTI::TargetCostKind CostKind, const Instruction *I); + InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, + const Value *Ptr, bool VariableMask, + Align Alignment, + TTI::TargetCostKind CostKind, + const Instruction *I); + InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, @@ -250,6 +256,23 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> { return ST->is64Bit() && !ST->hasVInstructionsI64(); } + bool isLegalStridedLoad(Type *DataType, Align Alignment) { + if (!ST->hasVInstructions()) + return false; + + EVT DataTypeVT = TLI->getValueType(DL, DataType); + + // Only support fixed vectors if we know the minimum vector size. 
+ if (DataTypeVT.isFixedLengthVector() && !ST->useRVVForFixedLengthVectors()) + return false; + + EVT ElemType = DataTypeVT.getScalarType(); + if (!ST->hasFastUnalignedAccess() && Alignment < ElemType.getStoreSize()) + return false; + + return TLI->isLegalElementTypeForRVV(ElemType); + } + bool isVScaleKnownToBeAPowerOfTwo() const { return TLI->isVScaleKnownToBeAPowerOfTwo(); } diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index a8aea112bc28e..90b9b51c470bf 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -87,6 +87,7 @@ #include "llvm/Transforms/Utils/InjectTLIMappings.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include #include #include @@ -175,6 +176,15 @@ static cl::opt RootLookAheadMaxDepth( "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option")); +static cl::opt MinProfitableStridedLoads( + "slp-min-strided-loads", cl::init(2), cl::Hidden, + cl::desc("The minimum number of loads, which should be considered strided, " + "if the stride is > 1 or is runtime value")); + +static cl::opt MaxProfitableLoadStride( + "slp-max-stride", cl::init(8), cl::Hidden, + cl::desc("The maximum stride, considered to be profitable.")); + static cl::opt ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz")); @@ -2575,7 +2585,7 @@ class BoUpSLP { enum EntryState { Vectorize, ScatterVectorize, - PossibleStridedVectorize, + StridedVectorize, NeedToGather }; EntryState State; @@ -2753,8 +2763,8 @@ class BoUpSLP { case ScatterVectorize: dbgs() << "ScatterVectorize\n"; break; - case PossibleStridedVectorize: - dbgs() << "PossibleStridedVectorize\n"; + case StridedVectorize: + dbgs() << "StridedVectorize\n"; break; case NeedToGather: dbgs() << 
"NeedToGather\n"; @@ -3680,7 +3690,7 @@ template <> struct DOTGraphTraits : public DefaultDOTGraphTraits { if (Entry->State == TreeEntry::NeedToGather) return "color=red"; if (Entry->State == TreeEntry::ScatterVectorize || - Entry->State == TreeEntry::PossibleStridedVectorize) + Entry->State == TreeEntry::StridedVectorize) return "color=blue"; return ""; } @@ -3846,7 +3856,7 @@ enum class LoadsState { Gather, Vectorize, ScatterVectorize, - PossibleStridedVectorize + StridedVectorize }; } // anonymous namespace @@ -3878,6 +3888,130 @@ static Align computeCommonAlignment(ArrayRef VL) { return CommonAlignment; } +/// Check if \p Order represents reverse order. +static bool isReverseOrder(ArrayRef Order) { + unsigned Sz = Order.size(); + return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) { + return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value(); + }); +} + +/// Checks if the provided list of pointers \p Pointers represents the strided +/// pointers for type ElemTy. If they are not, std::nullopt is returned. +/// Otherwise, if \p Inst is not specified, just initialized optional value is +/// returned to show that the pointers represent strided pointers. If \p Inst +/// specified, the runtime stride is materialized before the given \p Inst. +/// \returns std::nullopt if the pointers are not pointers with the runtime +/// stride, nullptr or actual stride value, otherwise. 
+static std::optional +calculateRtStride(ArrayRef PointerOps, Type *ElemTy, + const DataLayout &DL, ScalarEvolution &SE, + SmallVectorImpl &SortedIndices, + Instruction *Inst = nullptr) { + SmallVector SCEVs; + const SCEV *PtrSCEVA = nullptr; + const SCEV *PtrSCEVB = nullptr; + for (Value *Ptr : PointerOps) { + const SCEV *PtrSCEV = SE.getSCEV(Ptr); + if (!PtrSCEV) + return std::nullopt; + SCEVs.push_back(PtrSCEV); + if (!PtrSCEVA && !PtrSCEVB) { + PtrSCEVA = PtrSCEVB = PtrSCEV; + continue; + } + const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVA); + if (!Diff || isa(Diff)) + return std::nullopt; + if (Diff->isNonConstantNegative()) { + PtrSCEVA = PtrSCEV; + continue; + } + const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVB, PtrSCEV); + if (!Diff1 || isa(Diff1)) + return std::nullopt; + if (Diff1->isNonConstantNegative()) { + PtrSCEVB = PtrSCEV; + continue; + } + } + const SCEV *Stride = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA); + if (!Stride) + return std::nullopt; + int Size = DL.getTypeStoreSize(ElemTy); + auto TryGetStride = [&](const SCEV *Dist, + const SCEV *Multiplier) -> const SCEV * { + if (const auto *M = dyn_cast(Dist)) { + if (M->getOperand(0) == Multiplier) + return M->getOperand(1); + if (M->getOperand(1) == Multiplier) + return M->getOperand(0); + return nullptr; + } + if (Multiplier == Dist) + return SE.getConstant(Dist->getType(), 1); + return SE.getUDivExactExpr(Dist, Multiplier); + }; + if (Size != 1 || SCEVs.size() > 2) { + const SCEV *Sz = + SE.getConstant(Stride->getType(), Size * (SCEVs.size() - 1)); + Stride = TryGetStride(Stride, Sz); + if (!Stride) + return std::nullopt; + } + if (!Stride || isa(Stride)) + return std::nullopt; + // Iterate through all pointers and check if all distances are + // unique multiple of Dist. 
+ using DistOrdPair = std::pair; + auto Compare = llvm::less_first(); + std::set Offsets(Compare); + int Cnt = 0; + bool IsConsecutive = true; + for (const SCEV *PtrSCEV : SCEVs) { + unsigned Dist = 0; + if (PtrSCEV != PtrSCEVA) { + const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVA); + const SCEV *Coeff = TryGetStride(Diff, Stride); + if (!Coeff) + return std::nullopt; + const auto *SC = dyn_cast(Coeff); + if (!SC || isa(SC)) + return std::nullopt; + if (!SE.getMinusSCEV(PtrSCEV, + SE.getAddExpr(PtrSCEVA, SE.getMulExpr(Stride, SC))) + ->isZero()) + return std::nullopt; + Dist = SC->getAPInt().getZExtValue(); + } + // If the strides are not the same or repeated, we can't vectorize. + if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size()) + return std::nullopt; + auto Res = Offsets.emplace(Dist, Cnt); + if (!Res.second) + return std::nullopt; + // Consecutive order if the inserted element is the last one. + IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end(); + ++Cnt; + } + if (Offsets.size() != SCEVs.size()) + return std::nullopt; + SortedIndices.clear(); + if (!IsConsecutive) { + // Fill SortedIndices array only if it is non-consecutive. + SortedIndices.resize(PointerOps.size()); + Cnt = 0; + for (const std::pair &Pair : Offsets) { + SortedIndices[Cnt] = Pair.second; + ++Cnt; + } + } + if (!Inst) + return nullptr; + SCEVExpander Expander(SE, DL, "strided-load-vec"); + return Expander.expandCodeFor(Stride, Stride->getType(), Inst); +} + /// Checks if the given array of loads can be represented as a vectorized, /// scatter or just simple gather. static LoadsState canVectorizeLoads(ArrayRef VL, const Value *VL0, @@ -3900,7 +4034,8 @@ static LoadsState canVectorizeLoads(ArrayRef VL, const Value *VL0, // Make sure all loads in the bundle are simple - we can't vectorize // atomic or volatile loads. 
PointerOps.clear(); - PointerOps.resize(VL.size()); + const unsigned Sz = VL.size(); + PointerOps.resize(Sz); auto *POIter = PointerOps.begin(); for (Value *V : VL) { auto *L = cast(V); @@ -3913,10 +4048,15 @@ static LoadsState canVectorizeLoads(ArrayRef VL, const Value *VL0, Order.clear(); // Check the order of pointer operands or that all pointers are the same. bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order); + Align CommonAlignment = computeCommonAlignment(VL); + auto *VecTy = FixedVectorType::get(ScalarTy, Sz); + if (!IsSorted && Sz > MinProfitableStridedLoads && TTI.isTypeLegal(VecTy) && + TTI.isLegalStridedLoad(VecTy, CommonAlignment) && + calculateRtStride(PointerOps, ScalarTy, DL, SE, Order)) + return LoadsState::StridedVectorize; if (IsSorted || all_of(PointerOps, [&](Value *P) { return arePointersCompatible(P, PointerOps.front(), TLI); })) { - bool IsPossibleStrided = false; if (IsSorted) { Value *Ptr0; Value *PtrN; @@ -3930,30 +4070,68 @@ static LoadsState canVectorizeLoads(ArrayRef VL, const Value *VL0, std::optional Diff = getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE); // Check that the sorted loads are consecutive. - if (static_cast(*Diff) == VL.size() - 1) + if (static_cast(*Diff) == Sz - 1) return LoadsState::Vectorize; // Simple check if not a strided access - clear order. - IsPossibleStrided = *Diff % (VL.size() - 1) == 0; + bool IsPossibleStrided = *Diff % (Sz - 1) == 0; + // Try to generate strided load node if: + // 1. Target with strided load support is detected. + // 2. The number of loads is greater than MinProfitableStridedLoads, + // or the potential stride <= MaxProfitableLoadStride and the + // potential stride is power-of-2 (to avoid perf regressions for the very + // small number of loads) and max distance > number of loads, or potential + // stride is -1. + // 3. The loads are ordered, or number of unordered loads <= + // MaxProfitableUnorderedLoads, or loads are in reversed order. 
+ // (this check is to avoid extra costs for very expensive shuffles). + if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads || + (static_cast(std::abs(*Diff)) <= + MaxProfitableLoadStride * Sz && + isPowerOf2_32(std::abs(*Diff)))) && + static_cast(std::abs(*Diff)) > Sz) || + *Diff == -(static_cast(Sz) - 1))) { + int Stride = *Diff / static_cast(Sz - 1); + if (*Diff == Stride * static_cast(Sz - 1)) { + if (TTI.isTypeLegal(VecTy) && + TTI.isLegalStridedLoad(VecTy, CommonAlignment)) { + // Iterate through all pointers and check if all distances are + // unique multiple of Dist. + SmallSet Dists; + for (Value *Ptr : PointerOps) { + int Dist = 0; + if (Ptr == PtrN) + Dist = *Diff; + else if (Ptr != Ptr0) + Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE); + // If the strides are not the same or repeated, we can't + // vectorize. + if (((Dist / Stride) * Stride) != Dist || + !Dists.insert(Dist).second) + break; + } + if (Dists.size() == Sz) + return LoadsState::StridedVectorize; + } + } + } } // TODO: need to improve analysis of the pointers, if not all of them are // GEPs or have > 2 operands, we end up with a gather node, which just // increases the cost. Loop *L = LI.getLoopFor(cast(VL0)->getParent()); bool ProfitableGatherPointers = - static_cast(count_if(PointerOps, [L](Value *V) { - return L && L->isLoopInvariant(V); - })) <= VL.size() / 2 && VL.size() > 2; + static_cast(count_if( + PointerOps, + [L](Value *V) { return L && L->isLoopInvariant(V); })) <= Sz / 2 && + Sz > 2; if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) { auto *GEP = dyn_cast(P); return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) || (GEP && GEP->getNumOperands() == 2); })) { - Align CommonAlignment = computeCommonAlignment(VL); - auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) && !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment)) - return IsPossibleStrided ? 
LoadsState::PossibleStridedVectorize - : LoadsState::ScatterVectorize; + return LoadsState::ScatterVectorize; } } @@ -4158,7 +4336,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { return std::move(ResOrder); } if ((TE.State == TreeEntry::Vectorize || - TE.State == TreeEntry::PossibleStridedVectorize) && + TE.State == TreeEntry::StridedVectorize) && (isa(TE.getMainOp()) || (TopToBottom && isa(TE.getMainOp()))) && !TE.isAltShuffle()) @@ -4414,7 +4592,7 @@ void BoUpSLP::reorderTopToBottom() { } VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get()); if (!(TE->State == TreeEntry::Vectorize || - TE->State == TreeEntry::PossibleStridedVectorize) || + TE->State == TreeEntry::StridedVectorize) || !TE->ReuseShuffleIndices.empty()) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); if (TE->State == TreeEntry::Vectorize && @@ -4438,9 +4616,6 @@ void BoUpSLP::reorderTopToBottom() { MapVector> OrdersUses; - // Last chance orders - scatter vectorize. Try to use their orders if no - // other orders or the order is counted already. - SmallVector StridedVectorizeOrders; SmallPtrSet VisitedOps; for (const TreeEntry *OpTE : OrderedEntries) { // No need to reorder this nodes, still need to extend and to use shuffle, @@ -4487,11 +4662,6 @@ void BoUpSLP::reorderTopToBottom() { if (Order.empty()) continue; } - // Postpone scatter orders. - if (OpTE->State == TreeEntry::PossibleStridedVectorize) { - StridedVectorizeOrders.push_back(Order); - continue; - } // Stores actually store the mask, not the order, need to invert. if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && OpTE->getOpcode() == Instruction::Store && !Order.empty()) { @@ -4508,22 +4678,6 @@ void BoUpSLP::reorderTopToBottom() { ++OrdersUses.insert(std::make_pair(Order, 0)).first->second; } } - // Set order of the user node. - if (OrdersUses.empty()) { - if (StridedVectorizeOrders.empty()) - continue; - // Add (potentially!) strided vectorize orders. 
- for (OrdersType &Order : StridedVectorizeOrders) - ++OrdersUses.insert(std::make_pair(Order, 0)).first->second; - } else { - // Account (potentially!) strided vectorize orders only if it was used - // already. - for (OrdersType &Order : StridedVectorizeOrders) { - auto *It = OrdersUses.find(Order); - if (It != OrdersUses.end()) - ++It->second; - } - } // Choose the most used order. ArrayRef BestOrder = OrdersUses.front().first; unsigned Cnt = OrdersUses.front().second; @@ -4565,7 +4719,7 @@ void BoUpSLP::reorderTopToBottom() { continue; } if ((TE->State == TreeEntry::Vectorize || - TE->State == TreeEntry::PossibleStridedVectorize) && + TE->State == TreeEntry::StridedVectorize) && isa(TE->getMainOp()) && !TE->isAltShuffle()) { @@ -4606,10 +4760,6 @@ bool BoUpSLP::canReorderOperands( })) continue; if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) { - // FIXME: Do not reorder (possible!) strided vectorized nodes, they - // require reordering of the operands, which is not implemented yet. - if (TE->State == TreeEntry::PossibleStridedVectorize) - return false; // Do not reorder if operand node is used by many user nodes. 
if (any_of(TE->UserTreeIndices, [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; })) @@ -4660,13 +4810,13 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { SmallVector NonVectorized; for (const std::unique_ptr &TE : VectorizableTree) { if (TE->State != TreeEntry::Vectorize && - TE->State != TreeEntry::PossibleStridedVectorize) + TE->State != TreeEntry::StridedVectorize) NonVectorized.push_back(TE.get()); if (std::optional CurrentOrder = getReorderingData(*TE, /*TopToBottom=*/false)) { OrderedEntries.insert(TE.get()); if (!(TE->State == TreeEntry::Vectorize || - TE->State == TreeEntry::PossibleStridedVectorize) || + TE->State == TreeEntry::StridedVectorize) || !TE->ReuseShuffleIndices.empty()) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); } @@ -4684,7 +4834,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { SmallVector Filtered; for (TreeEntry *TE : OrderedEntries) { if (!(TE->State == TreeEntry::Vectorize || - TE->State == TreeEntry::PossibleStridedVectorize || + TE->State == TreeEntry::StridedVectorize || (TE->State == TreeEntry::NeedToGather && GathersToOrders.count(TE))) || TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() || @@ -4729,9 +4879,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { MapVector> OrdersUses; - // Last chance orders - scatter vectorize. Try to use their orders if no - // other orders or the order is counted already. - SmallVector> StridedVectorizeOrders; // Do the analysis for each tree entry only once, otherwise the order of // the same node my be considered several times, though might be not // profitable. @@ -4753,11 +4900,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { Data.second, [OpTE](const std::pair &P) { return P.second == OpTE; }); - // Postpone scatter orders. - if (OpTE->State == TreeEntry::PossibleStridedVectorize) { - StridedVectorizeOrders.emplace_back(Order, NumOps); - continue; - } // Stores actually store the mask, not the order, need to invert. 
if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && OpTE->getOpcode() == Instruction::Store && !Order.empty()) { @@ -4816,30 +4958,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { ++Res.first->second; } } - // If no orders - skip current nodes and jump to the next one, if any. - if (OrdersUses.empty()) { - if (StridedVectorizeOrders.empty() || - (Data.first->ReorderIndices.empty() && - Data.first->ReuseShuffleIndices.empty() && - !(IgnoreReorder && - Data.first == VectorizableTree.front().get()))) { - for (const std::pair &Op : Data.second) - OrderedEntries.remove(Op.second); - continue; - } - // Add (potentially!) strided vectorize orders. - for (std::pair &Pair : StridedVectorizeOrders) - OrdersUses.insert(std::make_pair(Pair.first, 0)).first->second += - Pair.second; - } else { - // Account (potentially!) strided vectorize orders only if it was used - // already. - for (std::pair &Pair : StridedVectorizeOrders) { - auto *It = OrdersUses.find(Pair.first); - if (It != OrdersUses.end()) - It->second += Pair.second; - } - } // Choose the best order. ArrayRef BestOrder = OrdersUses.front().first; unsigned Cnt = OrdersUses.front().second; @@ -4875,7 +4993,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { } // Gathers are processed separately. 
if (TE->State != TreeEntry::Vectorize && - TE->State != TreeEntry::PossibleStridedVectorize && + TE->State != TreeEntry::StridedVectorize && (TE->State != TreeEntry::ScatterVectorize || TE->ReorderIndices.empty())) continue; @@ -4907,7 +5025,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { Data.first->reorderOperands(Mask); if (!isa(Data.first->getMainOp()) || Data.first->isAltShuffle() || - Data.first->State == TreeEntry::PossibleStridedVectorize) { + Data.first->State == TreeEntry::StridedVectorize) { reorderScalars(Data.first->Scalars, Mask); reorderOrder(Data.first->ReorderIndices, MaskOrder, /*BottomOrder=*/true); @@ -4970,7 +5088,6 @@ void BoUpSLP::buildExternalUses( // instructions. If that is the case, the one in FoundLane will // be used. if (UseEntry->State == TreeEntry::ScatterVectorize || - UseEntry->State == TreeEntry::PossibleStridedVectorize || !doesInTreeUserNeedToExtract( Scalar, cast(UseEntry->Scalars.front()), TLI)) { LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U @@ -5328,8 +5445,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( return TreeEntry::Vectorize; case LoadsState::ScatterVectorize: return TreeEntry::ScatterVectorize; - case LoadsState::PossibleStridedVectorize: - return TreeEntry::PossibleStridedVectorize; + case LoadsState::StridedVectorize: + return TreeEntry::StridedVectorize; case LoadsState::Gather: #ifndef NDEBUG Type *ScalarTy = VL0->getType(); @@ -5750,8 +5867,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, BasicBlock *BB = nullptr; bool IsScatterVectorizeUserTE = UserTreeIdx.UserTE && - (UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize || - UserTreeIdx.UserTE->State == TreeEntry::PossibleStridedVectorize); + UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize; bool AreAllSameInsts = (S.getOpcode() && allSameBlock(VL)) || (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE && @@ -5848,8 +5964,7 @@ void BoUpSLP::buildTree_rec(ArrayRef 
VL, unsigned Depth, // Special processing for sorted pointers for ScatterVectorize node with // constant indeces only. if (AreAllSameInsts && UserTreeIdx.UserTE && - (UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize || - UserTreeIdx.UserTE->State == TreeEntry::PossibleStridedVectorize) && + UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize && !(S.getOpcode() && allSameBlock(VL))) { assert(S.OpValue->getType()->isPointerTy() && count_if(VL, [](Value *V) { return isa(V); }) >= @@ -6046,18 +6161,17 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } TE->setOperandsInOrder(); break; - case TreeEntry::PossibleStridedVectorize: + case TreeEntry::StridedVectorize: // Vectorizing non-consecutive loads with `llvm.masked.gather`. if (CurrentOrder.empty()) { - TE = newTreeEntry(VL, TreeEntry::PossibleStridedVectorize, Bundle, S, + TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S, UserTreeIdx, ReuseShuffleIndicies); } else { - TE = newTreeEntry(VL, TreeEntry::PossibleStridedVectorize, Bundle, S, + TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S, UserTreeIdx, ReuseShuffleIndicies, CurrentOrder); } TE->setOperandsInOrder(); - buildTree_rec(PointerOps, Depth + 1, {TE, 0}); - LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n"); + LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n"); break; case TreeEntry::ScatterVectorize: // Vectorizing non-consecutive loads with `llvm.masked.gather`. @@ -7035,11 +7149,13 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { switch (LS) { case LoadsState::Vectorize: case LoadsState::ScatterVectorize: - case LoadsState::PossibleStridedVectorize: + case LoadsState::StridedVectorize: // Mark the vectorized loads so that we don't vectorize them // again. // TODO: better handling of loads with reorders. 
- if (LS == LoadsState::Vectorize && CurrentOrder.empty()) + if ((LS == LoadsState::Vectorize || + LS == LoadsState::StridedVectorize) && + CurrentOrder.empty()) VectorizedStarts.push_back(cast(Slice.front())); else ScatterVectorized.emplace_back(Cnt, VF); @@ -7799,8 +7915,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, } InstructionCost CommonCost = 0; SmallVector Mask; + bool IsReverseOrder = isReverseOrder(E->ReorderIndices); if (!E->ReorderIndices.empty() && - E->State != TreeEntry::PossibleStridedVectorize) { + (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) { SmallVector NewMask; if (E->getOpcode() == Instruction::Store) { // For stores the order is actually a mask. @@ -7818,7 +7935,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask); assert((E->State == TreeEntry::Vectorize || E->State == TreeEntry::ScatterVectorize || - E->State == TreeEntry::PossibleStridedVectorize) && + E->State == TreeEntry::StridedVectorize) && "Unhandled state"); assert(E->getOpcode() && ((allSameType(VL) && allSameBlock(VL)) || @@ -7921,7 +8038,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, // loads) or (2) when Ptrs are the arguments of loads or stores being // vectorized as plane wide unit-stride load/store since all the // loads/stores are known to be from/to adjacent locations. 
- assert(E->State == TreeEntry::Vectorize && + assert((E->State == TreeEntry::Vectorize || + E->State == TreeEntry::StridedVectorize) && "Entry state expected to be Vectorize here."); if (isa(VL0)) { // Case 2: estimate costs for pointer related costs when vectorizing to @@ -8334,10 +8452,14 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, VecLdCost = TTI->getMemoryOpCost( Instruction::Load, VecTy, LI0->getAlign(), LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo()); + } else if (E->State == TreeEntry::StridedVectorize) { + Align CommonAlignment = + computeCommonAlignment(UniqueValues.getArrayRef()); + VecLdCost = TTI->getStridedMemoryOpCost( + Instruction::Load, VecTy, LI0->getPointerOperand(), + /*VariableMask=*/false, CommonAlignment, CostKind); } else { - assert((E->State == TreeEntry::ScatterVectorize || - E->State == TreeEntry::PossibleStridedVectorize) && - "Unknown EntryState"); + assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState"); Align CommonAlignment = computeCommonAlignment(UniqueValues.getArrayRef()); VecLdCost = TTI->getGatherScatterOpCost( @@ -8350,8 +8472,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost); // If this node generates masked gather load then it is not a terminal node. // Hence address operand cost is estimated separately. - if (E->State == TreeEntry::ScatterVectorize || - E->State == TreeEntry::PossibleStridedVectorize) + if (E->State == TreeEntry::ScatterVectorize) return Cost; // Estimate cost of GEPs since this tree node is a terminator. 
@@ -8560,7 +8681,7 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { if (VectorizableTree[0]->State == TreeEntry::NeedToGather || (VectorizableTree[1]->State == TreeEntry::NeedToGather && VectorizableTree[0]->State != TreeEntry::ScatterVectorize && - VectorizableTree[0]->State != TreeEntry::PossibleStridedVectorize)) + VectorizableTree[0]->State != TreeEntry::StridedVectorize)) return false; return true; @@ -10529,11 +10650,6 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs) { ValueList &VL = E->getOperand(NodeIdx); - if (E->State == TreeEntry::PossibleStridedVectorize && - !E->ReorderIndices.empty()) { - SmallVector Mask(E->ReorderIndices.begin(), E->ReorderIndices.end()); - reorderScalars(VL, Mask); - } const unsigned VF = VL.size(); InstructionsState S = getSameOpcode(VL, *TLI); // Special processing for GEPs bundle, which may include non-gep values. 
@@ -11107,6 +11223,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { return Vec; } + bool IsReverseOrder = isReverseOrder(E->ReorderIndices); auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy, bool IsSigned) { if (V->getType() != VecTy) @@ -11117,7 +11234,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { ArrayRef(reinterpret_cast(E->ReorderIndices.begin()), E->ReorderIndices.size()); ShuffleBuilder.add(V, Mask); - } else if (E->State == TreeEntry::PossibleStridedVectorize) { + } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) { ShuffleBuilder.addOrdered(V, std::nullopt); } else { ShuffleBuilder.addOrdered(V, E->ReorderIndices); @@ -11127,7 +11244,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { assert((E->State == TreeEntry::Vectorize || E->State == TreeEntry::ScatterVectorize || - E->State == TreeEntry::PossibleStridedVectorize) && + E->State == TreeEntry::StridedVectorize) && "Unhandled state"); unsigned ShuffleOrOp = E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); @@ -11592,10 +11709,49 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { Value *PO = LI->getPointerOperand(); if (E->State == TreeEntry::Vectorize) { NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign()); + } else if (E->State == TreeEntry::StridedVectorize) { + Value *Ptr0 = cast(E->Scalars.front())->getPointerOperand(); + Value *PtrN = cast(E->Scalars.back())->getPointerOperand(); + PO = IsReverseOrder ? PtrN : Ptr0; + std::optional Diff = getPointersDiff( + VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE); + Type *StrideTy = DL->getIndexType(PO->getType()); + Value *StrideVal; + if (Diff) { + int Stride = *Diff / (static_cast(E->Scalars.size()) - 1); + StrideVal = + ConstantInt::get(StrideTy, (IsReverseOrder ? 
-1 : 1) * Stride * + DL->getTypeAllocSize(ScalarTy)); + } else { + SmallVector PointerOps(E->Scalars.size(), nullptr); + transform(E->Scalars, PointerOps.begin(), [](Value *V) { + return cast(V)->getPointerOperand(); + }); + OrdersType Order; + std::optional Stride = + calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order, + &*Builder.GetInsertPoint()); + Value *NewStride = + Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true); + StrideVal = Builder.CreateMul( + NewStride, + ConstantInt::get( + StrideTy, + (IsReverseOrder ? -1 : 1) * + static_cast(DL->getTypeAllocSize(ScalarTy)))); + } + Align CommonAlignment = computeCommonAlignment(E->Scalars); + auto *Inst = Builder.CreateIntrinsic( + Intrinsic::experimental_vp_strided_load, + {VecTy, PO->getType(), StrideTy}, + {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()), + Builder.getInt32(E->Scalars.size())}); + Inst->addParamAttr( + /*ArgNo=*/0, + Attribute::getWithAlignment(Inst->getContext(), CommonAlignment)); + NewLI = Inst; } else { - assert((E->State == TreeEntry::ScatterVectorize || - E->State == TreeEntry::PossibleStridedVectorize) && - "Unhandled state"); + assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state"); Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -12019,8 +12175,11 @@ Value *BoUpSLP::vectorizeTree( [&](llvm::User *U) { TreeEntry *UseEntry = getTreeEntry(U); return UseEntry && - UseEntry->State == TreeEntry::Vectorize && - E->State == TreeEntry::Vectorize && + (UseEntry->State == TreeEntry::Vectorize || + UseEntry->State == + TreeEntry::StridedVectorize) && + (E->State == TreeEntry::Vectorize || + E->State == TreeEntry::StridedVectorize) && doesInTreeUserNeedToExtract( Scalar, cast(UseEntry->Scalars.front()), diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll index 
ccc31193c7215..05c76469b641e 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll @@ -29,75 +29,73 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 ; CHECK-NEXT: [[TMP16:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 ; CHECK-NEXT: [[TMP17:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 -; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4 ; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x ptr> , ptr [[ARRAYIDX3_3]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP18]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP20:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[TMP18:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[TMP20:%.*]] = load <4 x i8>, ptr null, align 1 ; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i8>, ptr null, align 1 -; CHECK-NEXT: [[TMP22:%.*]] = load <4 x i8>, ptr null, align 1 -; CHECK-NEXT: [[TMP23:%.*]] = load i8, ptr null, align 1 -; CHECK-NEXT: [[TMP24:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP21]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <16 x i8> [[TMP25]], <16 x i8> [[TMP26]], <16 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <16 x i8> [[TMP27]], <16 x i8> [[TMP28]], <16 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: 
[[TMP31:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> [[TMP30]], <16 x i32> -; CHECK-NEXT: [[TMP32:%.*]] = zext <16 x i8> [[TMP31]] to <16 x i32> -; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <4 x i8> [[TMP22]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <16 x i8> [[TMP33]], <16 x i8> [[TMP34]], <16 x i32> -; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32> -; CHECK-NEXT: [[TMP41:%.*]] = sub <16 x i32> [[TMP32]], [[TMP40]] -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <16 x i8> poison, i8 [[TMP23]], i32 0 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <16 x i8> [[TMP42]], i8 [[TMP20]], i32 1 -; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <2 x i8> [[TMP19]], <2 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i8> [[TMP43]], <16 x i8> [[TMP44]], <16 x i32> -; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i8> [[TMP45]], <16 x i8> [[TMP46]], <16 x i32> -; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP49:%.*]] = shufflevector <16 x i8> [[TMP47]], <16 x i8> [[TMP48]], <16 x i32> -; CHECK-NEXT: [[TMP50:%.*]] = zext <16 x i8> [[TMP49]] to <16 x i32> -; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP50]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <4 x i8> [[TMP24]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP53:%.*]] = shufflevector 
<4 x i8> [[TMP17]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <16 x i8> [[TMP52]], <16 x i8> [[TMP53]], <16 x i32> -; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <8 x i8> [[TMP13]], <8 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i8> [[TMP54]], <16 x i8> [[TMP55]], <16 x i32> -; CHECK-NEXT: [[TMP57:%.*]] = zext <16 x i8> [[TMP56]] to <16 x i32> -; CHECK-NEXT: [[TMP58:%.*]] = sub <16 x i32> [[TMP51]], [[TMP57]] -; CHECK-NEXT: [[TMP59:%.*]] = shl <16 x i32> [[TMP58]], -; CHECK-NEXT: [[TMP60:%.*]] = add <16 x i32> [[TMP59]], [[TMP41]] -; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP62:%.*]] = add <16 x i32> [[TMP60]], [[TMP61]] -; CHECK-NEXT: [[TMP63:%.*]] = sub <16 x i32> [[TMP60]], [[TMP61]] -; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> -; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP66:%.*]] = add <16 x i32> [[TMP64]], [[TMP65]] -; CHECK-NEXT: [[TMP67:%.*]] = sub <16 x i32> [[TMP64]], [[TMP65]] -; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> -; CHECK-NEXT: [[TMP69:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP70:%.*]] = add <16 x i32> [[TMP68]], [[TMP69]] -; CHECK-NEXT: [[TMP71:%.*]] = sub <16 x i32> [[TMP68]], [[TMP69]] -; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP71]], <16 x i32> -; CHECK-NEXT: [[TMP73:%.*]] = shufflevector <16 x i32> [[TMP72]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP74:%.*]] = add <16 x i32> [[TMP72]], [[TMP73]] -; CHECK-NEXT: [[TMP75:%.*]] = sub <16 x i32> [[TMP72]], [[TMP73]] -; CHECK-NEXT: [[TMP76:%.*]] = shufflevector <16 x i32> [[TMP74]], <16 x i32> [[TMP75]], <16 x i32> -; CHECK-NEXT: [[TMP77:%.*]] = shufflevector <16 x i32> [[TMP32]], <16 x i32> 
[[TMP64]], <16 x i32> -; CHECK-NEXT: [[TMP78:%.*]] = lshr <16 x i32> [[TMP77]], -; CHECK-NEXT: [[TMP79:%.*]] = and <16 x i32> [[TMP78]], -; CHECK-NEXT: [[TMP80:%.*]] = mul <16 x i32> [[TMP79]], -; CHECK-NEXT: [[TMP81:%.*]] = add <16 x i32> [[TMP80]], [[TMP76]] -; CHECK-NEXT: [[TMP82:%.*]] = xor <16 x i32> [[TMP81]], [[TMP77]] -; CHECK-NEXT: [[TMP83:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP82]]) -; CHECK-NEXT: ret i32 [[TMP83]] +; CHECK-NEXT: [[TMP22:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[TMP23:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x i8> [[TMP20]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <16 x i8> [[TMP26]], <16 x i8> [[TMP27]], <16 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = zext <16 x i8> [[TMP30]] to <16 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP21]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <16 x i8> [[TMP32]], <16 x i8> [[TMP33]], <16 x i32> +; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i8> [[TMP34]], <16 x i8> [[TMP35]], <16 x i32> +; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <16 x i8> [[TMP36]], <16 x i8> [[TMP37]], <16 x i32> +; CHECK-NEXT: 
[[TMP39:%.*]] = zext <16 x i8> [[TMP38]] to <16 x i32> +; CHECK-NEXT: [[TMP40:%.*]] = sub <16 x i32> [[TMP31]], [[TMP39]] +; CHECK-NEXT: [[TMP41:%.*]] = insertelement <16 x i8> poison, i8 [[TMP22]], i32 0 +; CHECK-NEXT: [[TMP42:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP19]], i32 1 +; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <2 x i8> [[TMP18]], <2 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i8> [[TMP42]], <16 x i8> [[TMP43]], <16 x i32> +; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <16 x i8> [[TMP44]], <16 x i8> [[TMP45]], <16 x i32> +; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <16 x i8> [[TMP46]], <16 x i8> [[TMP47]], <16 x i32> +; CHECK-NEXT: [[TMP49:%.*]] = zext <16 x i8> [[TMP48]] to <16 x i32> +; CHECK-NEXT: [[TMP50:%.*]] = shufflevector <16 x i32> [[TMP49]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <4 x i8> [[TMP23]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <4 x i8> [[TMP17]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <16 x i8> [[TMP51]], <16 x i8> [[TMP52]], <16 x i32> +; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <8 x i8> [[TMP13]], <8 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <16 x i8> [[TMP53]], <16 x i8> [[TMP54]], <16 x i32> +; CHECK-NEXT: [[TMP56:%.*]] = zext <16 x i8> [[TMP55]] to <16 x i32> +; CHECK-NEXT: [[TMP57:%.*]] = sub <16 x i32> [[TMP50]], [[TMP56]] +; CHECK-NEXT: [[TMP58:%.*]] = shl <16 x i32> [[TMP57]], +; CHECK-NEXT: [[TMP59:%.*]] = add <16 x i32> [[TMP58]], [[TMP40]] +; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP61:%.*]] = add <16 x i32> [[TMP59]], [[TMP60]] +; CHECK-NEXT: [[TMP62:%.*]] = sub <16 x i32> [[TMP59]], [[TMP60]] +; 
CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> +; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP63]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP65:%.*]] = add <16 x i32> [[TMP63]], [[TMP64]] +; CHECK-NEXT: [[TMP66:%.*]] = sub <16 x i32> [[TMP63]], [[TMP64]] +; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP65]], <16 x i32> [[TMP66]], <16 x i32> +; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP67]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP69:%.*]] = add <16 x i32> [[TMP67]], [[TMP68]] +; CHECK-NEXT: [[TMP70:%.*]] = sub <16 x i32> [[TMP67]], [[TMP68]] +; CHECK-NEXT: [[TMP71:%.*]] = shufflevector <16 x i32> [[TMP69]], <16 x i32> [[TMP70]], <16 x i32> +; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP71]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP73:%.*]] = add <16 x i32> [[TMP71]], [[TMP72]] +; CHECK-NEXT: [[TMP74:%.*]] = sub <16 x i32> [[TMP71]], [[TMP72]] +; CHECK-NEXT: [[TMP75:%.*]] = shufflevector <16 x i32> [[TMP73]], <16 x i32> [[TMP74]], <16 x i32> +; CHECK-NEXT: [[TMP76:%.*]] = shufflevector <16 x i32> [[TMP31]], <16 x i32> [[TMP63]], <16 x i32> +; CHECK-NEXT: [[TMP77:%.*]] = lshr <16 x i32> [[TMP76]], +; CHECK-NEXT: [[TMP78:%.*]] = and <16 x i32> [[TMP77]], +; CHECK-NEXT: [[TMP79:%.*]] = mul <16 x i32> [[TMP78]], +; CHECK-NEXT: [[TMP80:%.*]] = add <16 x i32> [[TMP79]], [[TMP75]] +; CHECK-NEXT: [[TMP81:%.*]] = xor <16 x i32> [[TMP80]], [[TMP76]] +; CHECK-NEXT: [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP81]]) +; CHECK-NEXT: ret i32 [[TMP82]] ; entry: %0 = load i8, ptr %pix1, align 1 diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll index 27e8f084e553d..591ef8865021c 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll @@ -5,61 +5,12 @@ define void @test([48 x float]* %p, float* noalias %s) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[I:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 30 -; CHECK-NEXT: [[I1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fsub fast float [[I1]], [[I]] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0 -; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 4 -; CHECK-NEXT: [[I2:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 -; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 26 -; CHECK-NEXT: [[I3:%.*]] = load float, ptr [[ARRAYIDX6]], align 4 -; CHECK-NEXT: [[ADD7:%.*]] = fsub fast float [[I3]], [[I2]] -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[S]], i64 1 -; CHECK-NEXT: store float [[ADD7]], ptr [[ARRAYIDX9]], align 4 -; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 8 -; CHECK-NEXT: [[I4:%.*]] = load float, ptr [[ARRAYIDX11]], align 4 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 22 -; CHECK-NEXT: [[I5:%.*]] = load float, ptr [[ARRAYIDX13]], align 4 -; CHECK-NEXT: [[ADD14:%.*]] = fsub fast float [[I5]], [[I4]] -; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[S]], i64 2 -; CHECK-NEXT: store float [[ADD14]], ptr [[ARRAYIDX16]], align 4 -; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 12 -; CHECK-NEXT: [[I6:%.*]] = load float, ptr [[ARRAYIDX18]], align 4 -; CHECK-NEXT: [[ARRAYIDX20:%.*]] = 
getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 18 -; CHECK-NEXT: [[I7:%.*]] = load float, ptr [[ARRAYIDX20]], align 4 -; CHECK-NEXT: [[ADD21:%.*]] = fsub fast float [[I7]], [[I6]] -; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds float, ptr [[S]], i64 3 -; CHECK-NEXT: store float [[ADD21]], ptr [[ARRAYIDX23]], align 4 -; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 16 -; CHECK-NEXT: [[I8:%.*]] = load float, ptr [[ARRAYIDX25]], align 4 -; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 14 -; CHECK-NEXT: [[I9:%.*]] = load float, ptr [[ARRAYIDX27]], align 4 -; CHECK-NEXT: [[ADD28:%.*]] = fsub fast float [[I9]], [[I8]] -; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, ptr [[S]], i64 4 -; CHECK-NEXT: store float [[ADD28]], ptr [[ARRAYIDX30]], align 4 -; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 20 -; CHECK-NEXT: [[I10:%.*]] = load float, ptr [[ARRAYIDX32]], align 4 -; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 10 -; CHECK-NEXT: [[I11:%.*]] = load float, ptr [[ARRAYIDX34]], align 4 -; CHECK-NEXT: [[ADD35:%.*]] = fsub fast float [[I11]], [[I10]] -; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds float, ptr [[S]], i64 5 -; CHECK-NEXT: store float [[ADD35]], ptr [[ARRAYIDX37]], align 4 -; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 24 -; CHECK-NEXT: [[I12:%.*]] = load float, ptr [[ARRAYIDX39]], align 4 -; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 6 -; CHECK-NEXT: [[I13:%.*]] = load float, ptr [[ARRAYIDX41]], align 4 -; CHECK-NEXT: [[ADD42:%.*]] = fsub fast float [[I13]], [[I12]] -; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds float, ptr [[S]], i64 6 -; CHECK-NEXT: store float [[ADD42]], ptr [[ARRAYIDX44]], align 4 -; CHECK-NEXT: 
[[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 28 -; CHECK-NEXT: [[I14:%.*]] = load float, ptr [[ARRAYIDX46]], align 4 -; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 2 -; CHECK-NEXT: [[I15:%.*]] = load float, ptr [[ARRAYIDX48]], align 4 -; CHECK-NEXT: [[ADD49:%.*]] = fsub fast float [[I15]], [[I14]] -; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds float, ptr [[S]], i64 7 -; CHECK-NEXT: store float [[ADD49]], ptr [[ARRAYIDX51]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> , i32 8) +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 -16, <8 x i1> , i32 8) +; CHECK-NEXT: [[TMP2:%.*]] = fsub fast <8 x float> [[TMP1]], [[TMP0]] +; CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -127,67 +78,13 @@ define void @test1([48 x float]* %p, float* noalias %s, i32 %stride) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[STR:%.*]] = zext i32 [[STRIDE:%.*]] to i64 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[I:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 30 -; CHECK-NEXT: [[I1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fsub fast float [[I1]], [[I]] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0 -; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[STR]] -; CHECK-NEXT: [[I2:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 -; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 26 -; CHECK-NEXT: [[I3:%.*]] = 
load float, ptr [[ARRAYIDX6]], align 4 -; CHECK-NEXT: [[ADD7:%.*]] = fsub fast float [[I3]], [[I2]] -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[S]], i64 1 -; CHECK-NEXT: store float [[ADD7]], ptr [[ARRAYIDX9]], align 4 -; CHECK-NEXT: [[ST1:%.*]] = mul i64 [[STR]], 2 -; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST1]] -; CHECK-NEXT: [[I4:%.*]] = load float, ptr [[ARRAYIDX11]], align 4 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 22 -; CHECK-NEXT: [[I5:%.*]] = load float, ptr [[ARRAYIDX13]], align 4 -; CHECK-NEXT: [[ADD14:%.*]] = fsub fast float [[I5]], [[I4]] -; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[S]], i64 2 -; CHECK-NEXT: store float [[ADD14]], ptr [[ARRAYIDX16]], align 4 -; CHECK-NEXT: [[ST2:%.*]] = mul i64 [[STR]], 3 -; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST2]] -; CHECK-NEXT: [[I6:%.*]] = load float, ptr [[ARRAYIDX18]], align 4 -; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 18 -; CHECK-NEXT: [[I7:%.*]] = load float, ptr [[ARRAYIDX20]], align 4 -; CHECK-NEXT: [[ADD21:%.*]] = fsub fast float [[I7]], [[I6]] -; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds float, ptr [[S]], i64 3 -; CHECK-NEXT: store float [[ADD21]], ptr [[ARRAYIDX23]], align 4 -; CHECK-NEXT: [[ST3:%.*]] = mul i64 [[STR]], 4 -; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST3]] -; CHECK-NEXT: [[I8:%.*]] = load float, ptr [[ARRAYIDX25]], align 4 -; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 14 -; CHECK-NEXT: [[I9:%.*]] = load float, ptr [[ARRAYIDX27]], align 4 -; CHECK-NEXT: [[ADD28:%.*]] = fsub fast float [[I9]], [[I8]] -; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, ptr [[S]], i64 4 -; CHECK-NEXT: store float 
[[ADD28]], ptr [[ARRAYIDX30]], align 4 -; CHECK-NEXT: [[ST4:%.*]] = mul i64 [[STR]], 5 -; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST4]] -; CHECK-NEXT: [[I10:%.*]] = load float, ptr [[ARRAYIDX32]], align 4 -; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 10 -; CHECK-NEXT: [[I11:%.*]] = load float, ptr [[ARRAYIDX34]], align 4 -; CHECK-NEXT: [[ADD35:%.*]] = fsub fast float [[I11]], [[I10]] -; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds float, ptr [[S]], i64 5 -; CHECK-NEXT: store float [[ADD35]], ptr [[ARRAYIDX37]], align 4 -; CHECK-NEXT: [[ST5:%.*]] = mul i64 [[STR]], 6 -; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST5]] -; CHECK-NEXT: [[I12:%.*]] = load float, ptr [[ARRAYIDX39]], align 4 -; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 6 -; CHECK-NEXT: [[I13:%.*]] = load float, ptr [[ARRAYIDX41]], align 4 -; CHECK-NEXT: [[ADD42:%.*]] = fsub fast float [[I13]], [[I12]] -; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds float, ptr [[S]], i64 6 -; CHECK-NEXT: store float [[ADD42]], ptr [[ARRAYIDX44]], align 4 -; CHECK-NEXT: [[ST6:%.*]] = mul i64 [[STR]], 7 -; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST6]] -; CHECK-NEXT: [[I14:%.*]] = load float, ptr [[ARRAYIDX46]], align 4 -; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 2 -; CHECK-NEXT: [[I15:%.*]] = load float, ptr [[ARRAYIDX48]], align 4 -; CHECK-NEXT: [[ADD49:%.*]] = fsub fast float [[I15]], [[I14]] -; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds float, ptr [[S]], i64 7 -; CHECK-NEXT: store float [[ADD49]], ptr [[ARRAYIDX51]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[STR]], 4 +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 
[[ARRAYIDX]], i64 [[TMP0]], <8 x i1> , i32 8) +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 -16, <8 x i1> , i32 8) +; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <8 x float> [[TMP2]], [[TMP1]] +; CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -262,67 +159,14 @@ define void @test2([48 x float]* %p, float* noalias %s, i32 %stride) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[STR:%.*]] = zext i32 [[STRIDE:%.*]] to i64 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 2 -; CHECK-NEXT: [[I:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ST6:%.*]] = mul i64 [[STR]], 7 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST6]] -; CHECK-NEXT: [[I1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fsub fast float [[I1]], [[I]] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0 -; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 6 -; CHECK-NEXT: [[I2:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 -; CHECK-NEXT: [[ST5:%.*]] = mul i64 [[STR]], 6 -; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST5]] -; CHECK-NEXT: [[I3:%.*]] = load float, ptr [[ARRAYIDX6]], align 4 -; CHECK-NEXT: [[ADD7:%.*]] = fsub fast float [[I3]], [[I2]] -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[S]], i64 1 -; CHECK-NEXT: store float [[ADD7]], ptr [[ARRAYIDX9]], align 4 -; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 10 -; CHECK-NEXT: [[I4:%.*]] = load float, ptr [[ARRAYIDX11]], align 4 -; CHECK-NEXT: [[ST4:%.*]] = mul i64 [[STR]], 5 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x float], 
ptr [[P]], i64 0, i64 [[ST4]] -; CHECK-NEXT: [[I5:%.*]] = load float, ptr [[ARRAYIDX13]], align 4 -; CHECK-NEXT: [[ADD14:%.*]] = fsub fast float [[I5]], [[I4]] -; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[S]], i64 2 -; CHECK-NEXT: store float [[ADD14]], ptr [[ARRAYIDX16]], align 4 -; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 14 -; CHECK-NEXT: [[I6:%.*]] = load float, ptr [[ARRAYIDX18]], align 4 -; CHECK-NEXT: [[ST3:%.*]] = mul i64 [[STR]], 4 -; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST3]] -; CHECK-NEXT: [[I7:%.*]] = load float, ptr [[ARRAYIDX20]], align 4 -; CHECK-NEXT: [[ADD21:%.*]] = fsub fast float [[I7]], [[I6]] -; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds float, ptr [[S]], i64 3 -; CHECK-NEXT: store float [[ADD21]], ptr [[ARRAYIDX23]], align 4 -; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 18 -; CHECK-NEXT: [[ST2:%.*]] = mul i64 [[STR]], 3 -; CHECK-NEXT: [[I8:%.*]] = load float, ptr [[ARRAYIDX25]], align 4 -; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST2]] -; CHECK-NEXT: [[I9:%.*]] = load float, ptr [[ARRAYIDX27]], align 4 -; CHECK-NEXT: [[ADD28:%.*]] = fsub fast float [[I9]], [[I8]] -; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, ptr [[S]], i64 4 -; CHECK-NEXT: store float [[ADD28]], ptr [[ARRAYIDX30]], align 4 -; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 22 -; CHECK-NEXT: [[I10:%.*]] = load float, ptr [[ARRAYIDX32]], align 4 -; CHECK-NEXT: [[ST1:%.*]] = mul i64 [[STR]], 2 -; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[ST1]] -; CHECK-NEXT: [[I11:%.*]] = load float, ptr [[ARRAYIDX34]], align 4 -; CHECK-NEXT: [[ADD35:%.*]] = fsub fast float [[I11]], [[I10]] -; CHECK-NEXT: [[ARRAYIDX37:%.*]] = 
getelementptr inbounds float, ptr [[S]], i64 5 -; CHECK-NEXT: store float [[ADD35]], ptr [[ARRAYIDX37]], align 4 -; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 26 -; CHECK-NEXT: [[I12:%.*]] = load float, ptr [[ARRAYIDX39]], align 4 -; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 [[STR]] -; CHECK-NEXT: [[I13:%.*]] = load float, ptr [[ARRAYIDX41]], align 4 -; CHECK-NEXT: [[ADD42:%.*]] = fsub fast float [[I13]], [[I12]] -; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds float, ptr [[S]], i64 6 -; CHECK-NEXT: store float [[ADD42]], ptr [[ARRAYIDX44]], align 4 -; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 30 -; CHECK-NEXT: [[I14:%.*]] = load float, ptr [[ARRAYIDX46]], align 4 -; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 0 -; CHECK-NEXT: [[I15:%.*]] = load float, ptr [[ARRAYIDX48]], align 4 -; CHECK-NEXT: [[ADD49:%.*]] = fsub fast float [[I15]], [[I14]] -; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds float, ptr [[S]], i64 7 -; CHECK-NEXT: store float [[ADD49]], ptr [[ARRAYIDX51]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> , i32 8) +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[STR]], -4 +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 [[TMP1]], <8 x i1> , i32 8) +; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <8 x float> [[TMP2]], [[TMP0]] +; CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -397,27 +241,12 @@ define void @test3([48 x float]* %p, float* noalias %s) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 0 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, 
ptr [[S:%.*]], i64 0 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 4 -; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 8 -; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 12 -; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 16 -; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 20 -; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 24 -; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 28 ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 23 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARRAYIDX]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> [[TMP0]], ptr [[ARRAYIDX4]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x ptr> [[TMP1]], ptr [[ARRAYIDX11]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[ARRAYIDX18]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> [[TMP3]], ptr [[ARRAYIDX25]], i32 4 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x ptr> [[TMP4]], ptr [[ARRAYIDX32]], i32 5 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x ptr> [[TMP5]], ptr [[ARRAYIDX39]], i32 6 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x ptr> [[TMP6]], ptr [[ARRAYIDX46]], i32 7 -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP7]], i32 4, <8 x i1> , <8 x float> poison) -; CHECK-NEXT: [[TMP9:%.*]] = load <8 x float>, ptr [[ARRAYIDX48]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <8 x float> [[TMP10]], [[TMP8]] -; CHECK-NEXT: store <8 x float> [[TMP11]], ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = 
call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> , i32 8) +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX48]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <8 x float> [[TMP2]], [[TMP0]] +; CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll index 5aba9ea115a4b..ec152c707eec6 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll @@ -8,7 +8,7 @@ define i16 @test() { ; CHECK-SAME: () #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[PPREV_058_I:%.*]] = getelementptr [[S:%.*]], ptr null, i64 -1 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> , ptr [[PPREV_058_I]], i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> , ptr [[PPREV_058_I]], i32 1 ; CHECK-NEXT: br label [[WHILE_BODY_I:%.*]] ; CHECK: while.body.i: ; CHECK-NEXT: [[TMP1:%.*]] = phi i16 [ 0, [[WHILE_BODY_I]] ], [ 0, [[ENTRY:%.*]] ] @@ -17,7 +17,7 @@ define i16 @test() { ; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> [[TMP3]], i32 2, <2 x i1> , <2 x i16> poison) ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i16> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i16> [[TMP4]], i32 1 -; CHECK-NEXT: [[CMP_I178:%.*]] = icmp ult i16 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[CMP_I178:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]] ; CHECK-NEXT: br label [[WHILE_BODY_I]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll index 8f2c72bb4c685..8ab57cc73e646 
100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll @@ -5,14 +5,11 @@ define i32 @sum_of_abs(ptr noalias %a, ptr noalias %b) { ; CHECK-LABEL: define i32 @sum_of_abs ; CHECK-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[A]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, <8 x ptr> [[TMP1]], <8 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> [[TMP2]], i32 1, <8 x i1> , <8 x i8> poison) -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[TMP3]], i1 false) -; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i8> [[TMP4]] to <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) -; CHECK-NEXT: ret i32 [[TMP6]] +; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr align 1 [[A]], i64 64, <8 x i1> , i32 8) +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[TMP0]], i1 false) +; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i8> [[TMP1]] to <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) +; CHECK-NEXT: ret i32 [[TMP3]] ; entry: %0 = load i8, ptr %a, align 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll index 96d4c307f1c67..9e43cefef2801 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll @@ -30,7 +30,7 @@ define void @test() { ; CHECK-SLP-THRESHOLD: bb: ; CHECK-SLP-THRESHOLD-NEXT: [[TMP0:%.*]] = insertelement <4 x ptr> poison, ptr [[COND_IN_V]], i32 0 ; 
CHECK-SLP-THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <4 x i32> zeroinitializer -; CHECK-SLP-THRESHOLD-NEXT: [[TMP2:%.*]] = getelementptr i64, <4 x ptr> [[TMP1]], <4 x i64> +; CHECK-SLP-THRESHOLD-NEXT: [[TMP2:%.*]] = getelementptr i64, <4 x ptr> [[TMP1]], <4 x i64> ; CHECK-SLP-THRESHOLD-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP2]], i32 8, <4 x i1> , <4 x i64> poison) ; CHECK-SLP-THRESHOLD-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[TMP3]], zeroinitializer ; CHECK-SLP-THRESHOLD-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll index 1add732d32e85..3bc6e64606e39 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll @@ -7,7 +7,7 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ADDR:%.*]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[TMP1]], <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> , <8 x i32> poison) ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP4]], <8 x ptr> poison, <8 x i32> zeroinitializer From 8bdb5dfdc70545bb76fb01033df736c075f0c6a0 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 1 Feb 2024 17:29:45 +0000 Subject: [PATCH 2/2] Fix formatting Created using spr 1.3.5 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff 
--git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 90b9b51c470bf..e211907002b4f 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3852,12 +3852,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { namespace { /// Tracks the state we can represent the loads in the given sequence. -enum class LoadsState { - Gather, - Vectorize, - ScatterVectorize, - StridedVectorize -}; +enum class LoadsState { Gather, Vectorize, ScatterVectorize, StridedVectorize }; } // anonymous namespace static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,