From 5b9cc46436ab5c99724ed6ac6245b7a771831917 Mon Sep 17 00:00:00 2001
From: David Sherwood
Date: Wed, 13 Nov 2024 15:36:59 +0000
Subject: [PATCH 1/4] [LoopVectorize] Add support for reverse loops in
 isDereferenceableAndAlignedInLoop

Currently, when we encounter a negative step in the induction variable,
isDereferenceableAndAlignedInLoop bails out because the element size
compares signed-greater-than the step. This patch adds support for
negative steps in cases where we detect that the start address for the
load is of the form base + offset. In this case the address decrements
in each iteration, so we need to calculate the access size differently.
I have done this by calling getStartAndEndForAccess from
LoopAccessAnalysis.cpp.

The changed test in LoopVectorize/X86/load-deref-pred.ll now passes
because previously we were calculating the total access size
incorrectly, whereas now it is correctly computed as 412 bytes and fits
perfectly into the alloca.
---
 .../llvm/Analysis/LoopAccessAnalysis.h        |   9 +
 llvm/lib/Analysis/Loads.cpp                   | 122 ++++-----
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      |  48 ++--
 .../LoopVectorize/X86/load-deref-pred.ll      | 238 ++++++------
 .../LoopVectorize/load-deref-pred-align.ll    |  27 +-
 5 files changed, 183 insertions(+), 261 deletions(-)

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 31374a128856c..541ddc82b21c4 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -853,6 +853,15 @@ bool sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy, const DataLayout &DL,
 bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
                          ScalarEvolution &SE, bool CheckType = true);
 
+/// For a given Loop \p Lp and pointer \p PtrExpr return a pair of SCEV values
+/// representing the maximum range of addresses accessed in the loop, i.e.
+/// [min,max).
+std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
+    const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *MaxBECount,
+    ScalarEvolution *SE,
+    DenseMap<std::pair<const SCEV *, Type *>,
+             std::pair<const SCEV *, const SCEV *>> *PointerBounds);
+
 class LoopAccessInfoManager {
   /// The cache.
   DenseMap<Loop *, std::unique_ptr<LoopAccessInfo>> LoopAccessInfoMap;
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 7bbd469bd035d..7bf5d2c4a8390 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -13,6 +13,7 @@
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AssumeBundleQueries.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/MemoryLocation.h"
@@ -275,84 +276,85 @@ static bool AreEquivalentAddressValues(const Value *A, const Value *B) {
 bool llvm::isDereferenceableAndAlignedInLoop(
     LoadInst *LI, Loop *L, ScalarEvolution &SE, DominatorTree &DT,
     AssumptionCache *AC, SmallVectorImpl<const SCEVPredicate *> *Predicates) {
-  auto &DL = LI->getDataLayout();
-  Value *Ptr = LI->getPointerOperand();
-
-  APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getType()),
-                DL.getTypeStoreSize(LI->getType()).getFixedValue());
-  const Align Alignment = LI->getAlign();
+  const SCEV *Ptr = SE.getSCEV(LI->getPointerOperand());
+  auto *AddRec = dyn_cast<SCEVAddRecExpr>(Ptr);
 
-  Instruction *HeaderFirstNonPHI = L->getHeader()->getFirstNonPHI();
-
-  // If given a uniform (i.e. non-varying) address, see if we can prove the
-  // access is safe within the loop w/o needing predication.
-  if (L->isLoopInvariant(Ptr))
-    return isDereferenceableAndAlignedPointer(Ptr, Alignment, EltSize, DL,
-                                              HeaderFirstNonPHI, AC, &DT);
-
-  // Otherwise, check to see if we have a repeating access pattern where we can
-  // prove that all accesses are well aligned and dereferenceable.
-  auto *AddRec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Ptr));
+  // Check to see if we have a repeating access pattern and it's possible
+  // to prove all accesses are well aligned.
   if (!AddRec || AddRec->getLoop() != L || !AddRec->isAffine())
     return false;
+
   auto *Step = dyn_cast<SCEVConstant>(AddRec->getStepRecurrence(SE));
   if (!Step)
     return false;
 
-  auto TC = SE.getSmallConstantMaxTripCount(L, Predicates);
-  if (!TC)
+  // For the moment, restrict ourselves to the case where the access size is a
+  // multiple of the requested alignment and the base is aligned.
+  // TODO: generalize if a case found which warrants
+  const Align Alignment = LI->getAlign();
+  auto &DL = LI->getDataLayout();
+  APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getType()),
+                DL.getTypeStoreSize(LI->getType()).getFixedValue());
+  if (EltSize.urem(Alignment.value()) != 0)
     return false;
 
   // TODO: Handle overlapping accesses.
-  // We should be computing AccessSize as (TC - 1) * Step + EltSize.
-  if (EltSize.sgt(Step->getAPInt()))
+  if (EltSize.ugt(Step->getAPInt().abs()))
     return false;
 
-  // Compute the total access size for access patterns with unit stride and
-  // patterns with gaps. For patterns with unit stride, Step and EltSize are the
-  // same.
-  // For patterns with gaps (i.e. non unit stride), we are
-  // accessing EltSize bytes at every Step.
-  APInt AccessSize = TC * Step->getAPInt();
+  const SCEV *MaxBECount =
+      SE.getPredicatedSymbolicMaxBackedgeTakenCount(L, *Predicates);
+  if (isa<SCEVCouldNotCompute>(MaxBECount))
+    return false;
 
-  assert(SE.isLoopInvariant(AddRec->getStart(), L) &&
-         "implied by addrec definition");
-  Value *Base = nullptr;
-  if (auto *StartS = dyn_cast<SCEVUnknown>(AddRec->getStart())) {
-    Base = StartS->getValue();
-  } else if (auto *StartS = dyn_cast<SCEVAddExpr>(AddRec->getStart())) {
-    // Handle (NewBase + offset) as start value.
-    const auto *Offset = dyn_cast<SCEVConstant>(StartS->getOperand(0));
-    const auto *NewBase = dyn_cast<SCEVUnknown>(StartS->getOperand(1));
-    if (StartS->getNumOperands() == 2 && Offset && NewBase) {
-      // The following code below assumes the offset is unsigned, but GEP
-      // offsets are treated as signed so we can end up with a signed value
-      // here too. For example, suppose the initial PHI value is (i8 255),
-      // the offset will be treated as (i8 -1) and sign-extended to (i64 -1).
-      if (Offset->getAPInt().isNegative())
-        return false;
+  const auto &[AccessStart, AccessEnd] =
+      getStartAndEndForAccess(L, Ptr, LI->getType(), MaxBECount, &SE, nullptr);
+  if (isa<SCEVCouldNotCompute>(AccessStart) ||
+      isa<SCEVCouldNotCompute>(AccessEnd))
+    return false;
 
-      // For the moment, restrict ourselves to the case where the offset is a
-      // multiple of the requested alignment and the base is aligned.
-      // TODO: generalize if a case found which warrants
-      if (Offset->getAPInt().urem(Alignment.value()) != 0)
-        return false;
-      Base = NewBase->getValue();
-      bool Overflow = false;
-      AccessSize = AccessSize.uadd_ov(Offset->getAPInt(), Overflow);
-      if (Overflow)
-        return false;
-    }
-  }
+  // Try to get the access size.
+  const SCEV *PtrDiff = SE.getMinusSCEV(AccessEnd, AccessStart);
+  APInt MaxPtrDiff = SE.getUnsignedRangeMax(PtrDiff);
 
-  if (!Base)
+  // If the (max) pointer difference is > 32 bits then it's unlikely to be
+  // dereferenceable.
+  if (MaxPtrDiff.getActiveBits() > 32)
     return false;
 
-  // For the moment, restrict ourselves to the case where the access size is a
-  // multiple of the requested alignment and the base is aligned.
-  // TODO: generalize if a case found which warrants
-  if (EltSize.urem(Alignment.value()) != 0)
+  Value *Base = nullptr;
+  APInt AccessSize;
+  if (const SCEVUnknown *NewBase = dyn_cast<SCEVUnknown>(AccessStart)) {
+    Base = NewBase->getValue();
+    AccessSize = MaxPtrDiff;
+  } else if (auto *MinAdd = dyn_cast<SCEVAddExpr>(AccessStart)) {
+    if (MinAdd->getNumOperands() != 2)
+      return false;
+
+    const auto *Offset = dyn_cast<SCEVConstant>(MinAdd->getOperand(0));
+    const auto *NewBase = dyn_cast<SCEVUnknown>(MinAdd->getOperand(1));
+    if (!Offset || !NewBase)
+      return false;
+
+    // The following code below assumes the offset is unsigned, but GEP
+    // offsets are treated as signed so we can end up with a signed value
+    // here too. For example, suppose the initial PHI value is (i8 255),
+    // the offset will be treated as (i8 -1) and sign-extended to (i64 -1).
+    if (Offset->getAPInt().isNegative())
+      return false;
+
+    // For the moment, restrict ourselves to the case where the offset is a
+    // multiple of the requested alignment and the base is aligned.
+    // TODO: generalize if a case found which warrants
+    if (Offset->getAPInt().urem(Alignment.value()) != 0)
+      return false;
+
+    AccessSize = MaxPtrDiff + Offset->getAPInt();
+    Base = NewBase->getValue();
+  } else
     return false;
+
+  Instruction *HeaderFirstNonPHI = L->getHeader()->getFirstNonPHI();
   return isDereferenceableAndAlignedPointer(Base, Alignment, AccessSize, DL,
                                             HeaderFirstNonPHI, AC, &DT);
 }
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 2a68979add666..2bcdb5246eb70 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -203,18 +203,20 @@ RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
 ///
 /// There is no conflict when the intervals are disjoint:
 /// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
-static std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
-    const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy,
-    PredicatedScalarEvolution &PSE,
+std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
+    const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *MaxBECount,
+    ScalarEvolution *SE,
     DenseMap<std::pair<const SCEV *, Type *>,
-             std::pair<const SCEV *, const SCEV *>> &PointerBounds) {
-  ScalarEvolution *SE = PSE.getSE();
-
-  auto [Iter, Ins] = PointerBounds.insert(
-      {{PtrExpr, AccessTy},
-       {SE->getCouldNotCompute(), SE->getCouldNotCompute()}});
-  if (!Ins)
-    return Iter->second;
+             std::pair<const SCEV *, const SCEV *>> *PointerBounds) {
+  std::pair<const SCEV *, const SCEV *> *PtrBoundsPair;
+  if (PointerBounds) {
+    auto [Iter, Ins] = PointerBounds->insert(
+        {{PtrExpr, AccessTy},
+         {SE->getCouldNotCompute(), SE->getCouldNotCompute()}});
+    if (!Ins)
+      return Iter->second;
+    PtrBoundsPair = &Iter->second;
+  }
 
   const SCEV *ScStart;
   const SCEV *ScEnd;
@@ -222,10 +224,8 @@ static std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
   if (SE->isLoopInvariant(PtrExpr, Lp)) {
     ScStart = ScEnd = PtrExpr;
   } else if (auto *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr)) {
-    const SCEV *Ex = PSE.getSymbolicMaxBackedgeTakenCount();
-
     ScStart = AR->getStart();
-    ScEnd = AR->evaluateAtIteration(Ex, *SE);
+    ScEnd = AR->evaluateAtIteration(MaxBECount, *SE);
     const SCEV *Step = AR->getStepRecurrence(*SE);
 
     // For expressions with negative step, the upper bound is ScStart and the
@@ -244,7 +244,7 @@ static std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
       return {SE->getCouldNotCompute(), SE->getCouldNotCompute()};
 
   assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant");
-  assert(SE->isLoopInvariant(ScEnd, Lp)&& "ScEnd needs to be invariant");
+  assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant");
 
   // Add the size of the pointed element to ScEnd.
   auto &DL = Lp->getHeader()->getDataLayout();
   Type *IdxTy = DL.getIndexType(PtrExpr->getType());
   const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IdxTy, AccessTy);
   ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);
 
-  Iter->second = {ScStart, ScEnd};
-  return Iter->second;
+  std::pair<const SCEV *, const SCEV *> Res = {ScStart, ScEnd};
+  if (PointerBounds)
+    *PtrBoundsPair = Res;
+  return Res;
 }
 
 /// Calculate Start and End points of memory access using
@@ -263,8 +265,9 @@ void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
                                     unsigned DepSetId, unsigned ASId,
                                     PredicatedScalarEvolution &PSE,
                                     bool NeedsFreeze) {
+  const SCEV *MaxBECount = PSE.getSymbolicMaxBackedgeTakenCount();
   const auto &[ScStart, ScEnd] = getStartAndEndForAccess(
-      Lp, PtrExpr, AccessTy, PSE, DC.getPointerBounds());
+      Lp, PtrExpr, AccessTy, MaxBECount, PSE.getSE(), &DC.getPointerBounds());
   assert(!isa<SCEVCouldNotCompute>(ScStart) &&
          !isa<SCEVCouldNotCompute>(ScEnd) &&
          "must be able to compute both start and end expressions");
@@ -1938,10 +1941,11 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
   // required for correctness.
   if (SE.isLoopInvariant(Src, InnermostLoop) ||
       SE.isLoopInvariant(Sink, InnermostLoop)) {
-    const auto &[SrcStart_, SrcEnd_] =
-        getStartAndEndForAccess(InnermostLoop, Src, ATy, PSE, PointerBounds);
-    const auto &[SinkStart_, SinkEnd_] =
-        getStartAndEndForAccess(InnermostLoop, Sink, BTy, PSE, PointerBounds);
+    const SCEV *MaxBECount = PSE.getSymbolicMaxBackedgeTakenCount();
+    const auto &[SrcStart_, SrcEnd_] = getStartAndEndForAccess(
+        InnermostLoop, Src, ATy, MaxBECount, PSE.getSE(), &PointerBounds);
+    const auto &[SinkStart_, SinkEnd_] = getStartAndEndForAccess(
+        InnermostLoop, Sink, BTy, MaxBECount, PSE.getSE(), &PointerBounds);
 
     if (!isa<SCEVCouldNotCompute>(SrcStart_) &&
         !isa<SCEVCouldNotCompute>(SrcEnd_) &&
         !isa<SCEVCouldNotCompute>(SinkStart_) &&
diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
index 1433e48690bc6..3e50ee42866b9 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
@@ -2920,8 +2920,8 @@ loop_exit:
   ret i32 %accum.next
 }
 
-define i32 @neg_test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) {
-; CHECK-LABEL: @neg_test_non_unit_stride_off_by_four_bytes(
+define i32 @test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) {
+; CHECK-LABEL: @test_non_unit_stride_off_by_four_bytes(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [103 x i32], align 4
 ; CHECK-NEXT:    call void @init(ptr [[ALLOCA]])
@@ -2929,11 +2929,11 @@ define i32 @neg_test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base)
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE33:%.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP144:%.*]], [[PRED_LOAD_CONTINUE33]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP145:%.*]], [[PRED_LOAD_CONTINUE33]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP146:%.*]], [[PRED_LOAD_CONTINUE33]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP147:%.*]], [[PRED_LOAD_CONTINUE33]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP112:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP113:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP114:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP115:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2 @@ -2999,170 +2999,74 @@ define i32 @neg_test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) ; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3 -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <4 x i1> [[TMP39]], i32 0 -; CHECK-NEXT: br i1 [[TMP64]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] -; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP66:%.*]] = load i32, ptr [[TMP65]], align 4 -; CHECK-NEXT: [[TMP67:%.*]] = insertelement <4 x i32> poison, i32 [[TMP66]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] -; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP68:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP67]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP69:%.*]] = extractelement <4 x i1> [[TMP39]], i32 1 -; CHECK-NEXT: br i1 [[TMP69]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]] -; CHECK: pred.load.if4: -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP71:%.*]] = load i32, ptr [[TMP70]], align 4 -; CHECK-NEXT: [[TMP72:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP71]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE5]] -; CHECK: pred.load.continue5: -; CHECK-NEXT: [[TMP73:%.*]] = phi <4 x i32> [ [[TMP68]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP72]], [[PRED_LOAD_IF4]] ] -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i1> [[TMP39]], i32 2 -; CHECK-NEXT: br i1 [[TMP74]], label [[PRED_LOAD_IF6:%.*]], label [[PRED_LOAD_CONTINUE7:%.*]] -; CHECK: pred.load.if6: -; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP76:%.*]] = load i32, ptr [[TMP75]], align 4 -; CHECK-NEXT: [[TMP77:%.*]] = insertelement <4 x i32> [[TMP73]], i32 [[TMP76]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE7]] -; CHECK: pred.load.continue7: -; CHECK-NEXT: [[TMP78:%.*]] = phi <4 x i32> [ [[TMP73]], [[PRED_LOAD_CONTINUE5]] ], [ [[TMP77]], [[PRED_LOAD_IF6]] ] -; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i1> [[TMP39]], i32 3 -; CHECK-NEXT: br i1 [[TMP79]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]] -; CHECK: pred.load.if8: -; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP81:%.*]] = load i32, ptr [[TMP80]], align 4 -; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i32> [[TMP78]], i32 [[TMP81]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE9]] -; CHECK: pred.load.continue9: -; CHECK-NEXT: [[TMP83:%.*]] = phi <4 x i32> [ [[TMP78]], 
[[PRED_LOAD_CONTINUE7]] ], [ [[TMP82]], [[PRED_LOAD_IF8]] ] -; CHECK-NEXT: [[TMP84:%.*]] = extractelement <4 x i1> [[TMP47]], i32 0 -; CHECK-NEXT: br i1 [[TMP84]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]] -; CHECK: pred.load.if10: -; CHECK-NEXT: [[TMP85:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP86:%.*]] = load i32, ptr [[TMP85]], align 4 -; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> poison, i32 [[TMP86]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE11]] -; CHECK: pred.load.continue11: -; CHECK-NEXT: [[TMP88:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP87]], [[PRED_LOAD_IF10]] ] -; CHECK-NEXT: [[TMP89:%.*]] = extractelement <4 x i1> [[TMP47]], i32 1 -; CHECK-NEXT: br i1 [[TMP89]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]] -; CHECK: pred.load.if12: -; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP91:%.*]] = load i32, ptr [[TMP90]], align 4 -; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> [[TMP88]], i32 [[TMP91]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE13]] -; CHECK: pred.load.continue13: -; CHECK-NEXT: [[TMP93:%.*]] = phi <4 x i32> [ [[TMP88]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP92]], [[PRED_LOAD_IF12]] ] -; CHECK-NEXT: [[TMP94:%.*]] = extractelement <4 x i1> [[TMP47]], i32 2 -; CHECK-NEXT: br i1 [[TMP94]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]] -; CHECK: pred.load.if14: -; CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP95]], align 4 -; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP96]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE15]] -; CHECK: pred.load.continue15: -; CHECK-NEXT: [[TMP98:%.*]] = phi <4 x i32> [ [[TMP93]], [[PRED_LOAD_CONTINUE13]] ], [ [[TMP97]], [[PRED_LOAD_IF14]] ] -; CHECK-NEXT: [[TMP99:%.*]] = extractelement <4 x i1> [[TMP47]], i32 3 -; CHECK-NEXT: br i1 [[TMP99]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] -; CHECK: pred.load.if16: -; CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP101:%.*]] = load i32, ptr [[TMP100]], align 4 -; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP101]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE17]] -; CHECK: pred.load.continue17: -; CHECK-NEXT: [[TMP103:%.*]] = phi <4 x i32> [ [[TMP98]], [[PRED_LOAD_CONTINUE15]] ], [ [[TMP102]], [[PRED_LOAD_IF16]] ] -; CHECK-NEXT: [[TMP104:%.*]] = extractelement <4 x i1> [[TMP55]], i32 0 -; CHECK-NEXT: br i1 [[TMP104]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] -; CHECK: pred.load.if18: -; CHECK-NEXT: [[TMP105:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP105]], align 4 -; CHECK-NEXT: [[TMP107:%.*]] = insertelement <4 x i32> poison, i32 [[TMP106]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE19]] -; CHECK: pred.load.continue19: -; CHECK-NEXT: [[TMP108:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP107]], [[PRED_LOAD_IF18]] ] -; CHECK-NEXT: [[TMP109:%.*]] = extractelement <4 x i1> [[TMP55]], i32 1 -; CHECK-NEXT: br i1 [[TMP109]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]] -; CHECK: pred.load.if20: -; CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP111:%.*]] 
= load i32, ptr [[TMP110]], align 4 -; CHECK-NEXT: [[TMP112:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP111]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE21]] -; CHECK: pred.load.continue21: -; CHECK-NEXT: [[TMP113:%.*]] = phi <4 x i32> [ [[TMP108]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP112]], [[PRED_LOAD_IF20]] ] -; CHECK-NEXT: [[TMP114:%.*]] = extractelement <4 x i1> [[TMP55]], i32 2 -; CHECK-NEXT: br i1 [[TMP114]], label [[PRED_LOAD_IF22:%.*]], label [[PRED_LOAD_CONTINUE23:%.*]] -; CHECK: pred.load.if22: -; CHECK-NEXT: [[TMP115:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP116:%.*]] = load i32, ptr [[TMP115]], align 4 -; CHECK-NEXT: [[TMP117:%.*]] = insertelement <4 x i32> [[TMP113]], i32 [[TMP116]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE23]] -; CHECK: pred.load.continue23: -; CHECK-NEXT: [[TMP118:%.*]] = phi <4 x i32> [ [[TMP113]], [[PRED_LOAD_CONTINUE21]] ], [ [[TMP117]], [[PRED_LOAD_IF22]] ] -; CHECK-NEXT: [[TMP119:%.*]] = extractelement <4 x i1> [[TMP55]], i32 3 -; CHECK-NEXT: br i1 [[TMP119]], label [[PRED_LOAD_IF24:%.*]], label [[PRED_LOAD_CONTINUE25:%.*]] -; CHECK: pred.load.if24: -; CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP121:%.*]] = load i32, ptr [[TMP120]], align 4 -; CHECK-NEXT: [[TMP122:%.*]] = insertelement <4 x i32> [[TMP118]], i32 [[TMP121]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE25]] -; CHECK: pred.load.continue25: -; CHECK-NEXT: [[TMP123:%.*]] = phi <4 x i32> [ [[TMP118]], [[PRED_LOAD_CONTINUE23]] ], [ [[TMP122]], [[PRED_LOAD_IF24]] ] -; CHECK-NEXT: [[TMP124:%.*]] = extractelement <4 x i1> [[TMP63]], i32 0 -; CHECK-NEXT: br i1 [[TMP124]], label [[PRED_LOAD_IF26:%.*]], label [[PRED_LOAD_CONTINUE27:%.*]] -; CHECK: pred.load.if26: -; CHECK-NEXT: [[TMP125:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP126:%.*]] = load i32, ptr [[TMP125]], align 4 -; CHECK-NEXT: [[TMP127:%.*]] = insertelement <4 x i32> poison, i32 [[TMP126]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE27]] -; CHECK: pred.load.continue27: -; CHECK-NEXT: [[TMP128:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE25]] ], [ [[TMP127]], [[PRED_LOAD_IF26]] ] -; CHECK-NEXT: [[TMP129:%.*]] = extractelement <4 x i1> [[TMP63]], i32 1 -; CHECK-NEXT: br i1 [[TMP129]], label [[PRED_LOAD_IF28:%.*]], label [[PRED_LOAD_CONTINUE29:%.*]] -; CHECK: pred.load.if28: -; CHECK-NEXT: [[TMP130:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP131:%.*]] = load i32, ptr [[TMP130]], align 4 -; CHECK-NEXT: [[TMP132:%.*]] = insertelement <4 x i32> [[TMP128]], i32 [[TMP131]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE29]] -; CHECK: pred.load.continue29: -; CHECK-NEXT: [[TMP133:%.*]] = phi <4 x i32> [ [[TMP128]], [[PRED_LOAD_CONTINUE27]] ], [ [[TMP132]], [[PRED_LOAD_IF28]] ] -; CHECK-NEXT: [[TMP134:%.*]] = extractelement <4 x i1> [[TMP63]], i32 2 -; CHECK-NEXT: br i1 [[TMP134]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]] -; CHECK: pred.load.if30: -; CHECK-NEXT: [[TMP135:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP136:%.*]] = load i32, ptr [[TMP135]], align 4 -; CHECK-NEXT: [[TMP137:%.*]] = insertelement <4 x i32> [[TMP133]], i32 [[TMP136]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE31]] -; CHECK: pred.load.continue31: -; CHECK-NEXT: [[TMP138:%.*]] = phi <4 x i32> [ [[TMP133]], [[PRED_LOAD_CONTINUE29]] ], [ [[TMP137]], [[PRED_LOAD_IF30]] 
] -; CHECK-NEXT: [[TMP139:%.*]] = extractelement <4 x i1> [[TMP63]], i32 3 -; CHECK-NEXT: br i1 [[TMP139]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33]] -; CHECK: pred.load.if32: -; CHECK-NEXT: [[TMP140:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP141:%.*]] = load i32, ptr [[TMP140]], align 4 -; CHECK-NEXT: [[TMP142:%.*]] = insertelement <4 x i32> [[TMP138]], i32 [[TMP141]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE33]] -; CHECK: pred.load.continue33: -; CHECK-NEXT: [[TMP143:%.*]] = phi <4 x i32> [ [[TMP138]], [[PRED_LOAD_CONTINUE31]] ], [ [[TMP142]], [[PRED_LOAD_IF32]] ] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP83]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI34:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP103]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI35:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP123]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI36:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP143]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP144]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP145]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI34]] -; CHECK-NEXT: [[TMP146]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI35]] -; CHECK-NEXT: [[TMP147]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI36]] +; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP80:%.*]] = load i32, ptr [[TMP64]], align 4 +; CHECK-NEXT: [[TMP81:%.*]] = load i32, ptr [[TMP65]], align 4 +; CHECK-NEXT: [[TMP82:%.*]] = load i32, ptr [[TMP66]], align 4 +; CHECK-NEXT: [[TMP83:%.*]] = load i32, ptr [[TMP67]], align 4 +; CHECK-NEXT: [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0 +; CHECK-NEXT: [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1 +; CHECK-NEXT: [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2 +; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3 +; CHECK-NEXT: [[TMP88:%.*]] = load i32, ptr [[TMP68]], align 4 +; CHECK-NEXT: [[TMP89:%.*]] = load i32, ptr [[TMP69]], align 4 +; CHECK-NEXT: [[TMP90:%.*]] = load i32, ptr 
[[TMP70]], align 4 +; CHECK-NEXT: [[TMP91:%.*]] = load i32, ptr [[TMP71]], align 4 +; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0 +; CHECK-NEXT: [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1 +; CHECK-NEXT: [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2 +; CHECK-NEXT: [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3 +; CHECK-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP72]], align 4 +; CHECK-NEXT: [[TMP97:%.*]] = load i32, ptr [[TMP73]], align 4 +; CHECK-NEXT: [[TMP98:%.*]] = load i32, ptr [[TMP74]], align 4 +; CHECK-NEXT: [[TMP99:%.*]] = load i32, ptr [[TMP75]], align 4 +; CHECK-NEXT: [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0 +; CHECK-NEXT: [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1 +; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2 +; CHECK-NEXT: [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3 +; CHECK-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP76]], align 4 +; CHECK-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP77]], align 4 +; CHECK-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP78]], align 4 +; CHECK-NEXT: [[TMP107:%.*]] = load i32, ptr [[TMP79]], align 4 +; CHECK-NEXT: [[TMP108:%.*]] = insertelement <4 x i32> poison, i32 [[TMP104]], i32 0 +; CHECK-NEXT: [[TMP109:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP105]], i32 1 +; CHECK-NEXT: [[TMP110:%.*]] = insertelement <4 x i32> [[TMP109]], i32 [[TMP106]], i32 2 +; CHECK-NEXT: [[TMP111:%.*]] = insertelement <4 x i32> [[TMP110]], i32 [[TMP107]], i32 3 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP87]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI4:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP95]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI5:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP103]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI6:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP111]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP112]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP113]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI4]] +; CHECK-NEXT: [[TMP114]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI5]] +; CHECK-NEXT: [[TMP115]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI6]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP148:%.*]] = icmp eq i64 [[INDEX_NEXT]], 48 -; CHECK-NEXT: br i1 [[TMP148]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; CHECK-NEXT: [[TMP116:%.*]] = icmp eq i64 [[INDEX_NEXT]], 48 +; CHECK-NEXT: br i1 [[TMP116]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP145]], [[TMP144]] -; CHECK-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP146]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP147]], [[BIN_RDX37]] -; CHECK-NEXT: [[TMP149:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP113]], [[TMP112]] +; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP114]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP115]], [[BIN_RDX7]] +; CHECK-NEXT: [[TMP117:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]]) ; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 
[ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP149]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP117]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -3181,7 +3085,7 @@ define i32 @neg_test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 100 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP37:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP149]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP117]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll index 0f4e327891899..cbc483fabc184 100644 --- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll +++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll @@ -351,27 +351,30 @@ define void @test_rev_loops_deref_loads(ptr nocapture noundef writeonly %dest) { ; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD]], <2 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], splat (i32 3) ; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i32 -1 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[REVERSE2:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD1]], <2 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 ; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP0]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[REVERSE2]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = shl nsw i32 [[TMP11]], 2 ; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP10]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 ; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]] -; CHECK: pred.store.if1: +; CHECK: pred.store.if3: ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], -1 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP21]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[REVERSE2]], i32 1 ; CHECK-NEXT: [[TMP17:%.*]] = shl nsw i32 [[TMP16]], 2 ; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] -; CHECK: 
pred.store.continue2: +; CHECK: pred.store.continue4: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -665,12 +668,15 @@ define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], splat (i32 3) ; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true) ; CHECK-NEXT: [[TMP6:%.*]] = mul <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 ; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]] ; CHECK-NEXT: [[TMP15:%.*]] = shl nsw i32 [[TMP11]], 2 ; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 @@ -680,9 +686,6 @@ define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly ; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] ; CHECK: pred.store.if1: ; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], -1 -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP23]] -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP24]], align 4 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP17]] ; CHECK-NEXT: [[TMP19:%.*]] = shl nsw i32 [[TMP12]], 2 ; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4 From 1cbff9379bdb1cc636c58795e021e6753d459bf8 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Fri, 6 Dec 2024 10:33:56 +0000 Subject: [PATCH 2/4] Address review comments --- llvm/lib/Analysis/Loads.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 7bf5d2c4a8390..4b3d06882e39a 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -303,7 +303,7 @@ bool llvm::isDereferenceableAndAlignedInLoop( return false; const SCEV *MaxBECount = - SE.getPredicatedSymbolicMaxBackedgeTakenCount(L, *Predicates); + SE.getPredicatedConstantMaxBackedgeTakenCount(L, *Predicates); if (isa(MaxBECount)) return false; @@ -317,11 +317,6 @@ bool llvm::isDereferenceableAndAlignedInLoop( const SCEV *PtrDiff = SE.getMinusSCEV(AccessEnd, AccessStart); APInt MaxPtrDiff = SE.getUnsignedRangeMax(PtrDiff); - // If the (max) pointer difference is > 32 bits then it's unlikely to be - // dereferenceable. 
-  if (MaxPtrDiff.getActiveBits() > 32)
-    return false;
-
   Value *Base = nullptr;
   APInt AccessSize;
   if (const SCEVUnknown *NewBase = dyn_cast<SCEVUnknown>(AccessStart)) {

From 9aae1c7ad63fa7d16fd5273ffb7368e6b223c4c2 Mon Sep 17 00:00:00 2001
From: David Sherwood
Date: Mon, 6 Jan 2025 13:18:15 +0000
Subject: [PATCH 3/4] Re-add code for invariant pointers

---
 llvm/lib/Analysis/Loads.cpp | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 4b3d06882e39a..cc6760292c2ff 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -276,8 +276,20 @@ static bool AreEquivalentAddressValues(const Value *A, const Value *B) {
 bool llvm::isDereferenceableAndAlignedInLoop(
     LoadInst *LI, Loop *L, ScalarEvolution &SE, DominatorTree &DT,
     AssumptionCache *AC, SmallVectorImpl<const SCEVPredicate *> *Predicates) {
-  const SCEV *Ptr = SE.getSCEV(LI->getPointerOperand());
-  auto *AddRec = dyn_cast<SCEVAddRecExpr>(Ptr);
+  const Align Alignment = LI->getAlign();
+  auto &DL = LI->getDataLayout();
+  Value *Ptr = LI->getPointerOperand();
+  APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getType()),
+                DL.getTypeStoreSize(LI->getType()).getFixedValue());
+
+  // If given a uniform (i.e. non-varying) address, see if we can prove the
+  // access is safe within the loop w/o needing predication.
+  if (L->isLoopInvariant(Ptr))
+    return isDereferenceableAndAlignedPointer(
+        Ptr, Alignment, EltSize, DL, L->getHeader()->getFirstNonPHI(), AC, &DT);
+
+  const SCEV *PtrScev = SE.getSCEV(Ptr);
+  auto *AddRec = dyn_cast<SCEVAddRecExpr>(PtrScev);
 
   // Check to see if we have a repeating access pattern and it's possible
   // to prove all accesses are well aligned.
@@ -291,10 +303,6 @@ bool llvm::isDereferenceableAndAlignedInLoop(
   // For the moment, restrict ourselves to the case where the access size is a
   // multiple of the requested alignment and the base is aligned.
   // TODO: generalize if a case found which warrants
-  const Align Alignment = LI->getAlign();
-  auto &DL = LI->getDataLayout();
-  APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getType()),
-                DL.getTypeStoreSize(LI->getType()).getFixedValue());
   if (EltSize.urem(Alignment.value()) != 0)
     return false;
 
@@ -307,8 +315,8 @@ bool llvm::isDereferenceableAndAlignedInLoop(
   if (isa<SCEVCouldNotCompute>(MaxBECount))
     return false;
 
-  const auto &[AccessStart, AccessEnd] =
-      getStartAndEndForAccess(L, Ptr, LI->getType(), MaxBECount, &SE, nullptr);
+  const auto &[AccessStart, AccessEnd] = getStartAndEndForAccess(
+      L, PtrScev, LI->getType(), MaxBECount, &SE, nullptr);
   if (isa<SCEVCouldNotCompute>(AccessStart) ||
       isa<SCEVCouldNotCompute>(AccessEnd))
     return false;

From 0503de2abb01aad113fc1f4cb25c0af79ee33b3f Mon Sep 17 00:00:00 2001
From: David Sherwood
Date: Wed, 15 Jan 2025 10:01:43 +0000
Subject: [PATCH 4/4] Address review comment

---
 llvm/include/llvm/Analysis/LoopAccessAnalysis.h | 16 +++++++++++++---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp        | 13 -------------
 2 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 541ddc82b21c4..6fc6ca14d0889 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -853,9 +853,19 @@ bool sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy, const DataLayout &DL,
 bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
                          ScalarEvolution &SE, bool CheckType = true);
 
-/// For a given Loop \p Lp and pointer \p PtrExpr return a pair of SCEV values
-/// representing the maximum range of addresses accessed in the loop, i.e.
-/// [min,max).
+/// Calculate Start and End points of memory access.
+/// Let's assume A is the first access and B is a memory access on N-th loop
+/// iteration. Then B is calculated as:
+///   B = A + Step*N .
+/// Step value may be positive or negative.
+/// N is a calculated back-edge taken count:
+///     N = (TripCount > 0) ? RoundDown(TripCount -1 , VF) : 0
+/// Start and End points are calculated in the following way:
+///   Start = UMIN(A, B) ; End = UMAX(A, B) + SizeOfElt,
+/// where SizeOfElt is the size of single memory access in bytes.
+///
+/// There is no conflict when the intervals are disjoint:
+/// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
 std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
     const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *MaxBECount,
     ScalarEvolution *SE,
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 2bcdb5246eb70..11e0a221fc887 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -190,19 +190,6 @@ RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
   Members.push_back(Index);
 }
 
-/// Calculate Start and End points of memory access.
-/// Let's assume A is the first access and B is a memory access on N-th loop
-/// iteration. Then B is calculated as:
-///   B = A + Step*N .
-/// Step value may be positive or negative.
-/// N is a calculated back-edge taken count:
-///     N = (TripCount > 0) ? RoundDown(TripCount -1 , VF) : 0
-/// Start and End points are calculated in the following way:
-///   Start = UMIN(A, B) ; End = UMAX(A, B) + SizeOfElt,
-/// where SizeOfElt is the size of single memory access in bytes.
-///
-/// There is no conflict when the intervals are disjoint:
-/// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
 std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
     const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *MaxBECount,
     ScalarEvolution *SE,
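
Note: below is a reduced IR sketch of the reverse-loop case this series
enables. It is written for illustration only; the function name, constants and
attribute values are assumed and do not come from the tests in the series.
The load address has the SCEV {(1020 + %base),+,-4} and the backedge-taken
count is 255, so getStartAndEndForAccess (which swaps ScStart and ScEnd for a
negative step) computes the accessed range [%base, %base + 1024), and the
dereferenceable(1024) align 4 base lets isDereferenceableAndAlignedInLoop
answer true even though the step is negative:

define i32 @rev_sum(ptr align 4 dereferenceable(1024) %base) {
entry:
  br label %loop

loop:
  ; %iv counts down from 255 to 0, so the address decrements by 4 bytes per
  ; iteration; the first access is the highest address, %base + 1020.
  %iv = phi i64 [ 255, %entry ], [ %iv.next, %loop ]
  %sum = phi i32 [ 0, %entry ], [ %sum.next, %loop ]
  %gep = getelementptr inbounds i32, ptr %base, i64 %iv
  %val = load i32, ptr %gep, align 4
  %sum.next = add i32 %sum, %val
  %iv.next = add nsw i64 %iv, -1
  %cmp = icmp sgt i64 %iv, 0
  br i1 %cmp, label %loop, label %exit

exit:
  ret i32 %sum.next
}

Before patch 1, the EltSize.sgt(Step->getAPInt()) check rejected this loop
outright, since 4 is signed-greater-than -4; the new
EltSize.ugt(Step->getAPInt().abs()) check accepts it, and the [min, max)
bounds returned by getStartAndEndForAccess drive the dereferenceability query.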