diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 5df1061691a67..3f074c07b6479 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -992,7 +992,8 @@ class LoopVectorizationCostModel { /// If interleave count has been specified by metadata it will be returned. /// Otherwise, the interleave count is computed and returned. VF and LoopCost /// are the selected vectorization factor and the cost of the selected VF. - unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost); + unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF, + InstructionCost LoopCost); /// Memory access instruction may be vectorized in more than one way. /// Form of instruction after vectorization depends on cost. @@ -4871,8 +4872,233 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() { } } +/// Estimate the register usage for \p Plan and vectorization factors in \p VFs +/// by calculating the highest number of values that are live at a single +/// location as a rough estimate. Returns the register usage for each VF in \p +/// VFs. +static SmallVector +calculateRegisterUsage(VPlan &Plan, ArrayRef VFs, + const TargetTransformInfo &TTI, + const SmallPtrSetImpl &ValuesToIgnore) { + // Each 'key' in the map opens a new interval. The values + // of the map are the index of the 'last seen' usage of the + // recipe that is the key. + using IntervalMap = SmallDenseMap; + + // Maps indices to recipes. + SmallVector Idx2Recipe; + // Marks the end of each interval. + IntervalMap EndPoint; + // Saves the set of recipes that are used in the loop. + SmallPtrSet Ends; + // Saves the list of values that are used in the loop but are defined outside + // the loop (not including non-recipe values such as arguments and + // constants). 
+ SmallSetVector LoopInvariants; + LoopInvariants.insert(&Plan.getVectorTripCount()); + + // We scan the loop in topological order and assign a number to + // each recipe. We use RPO to ensure that defs are met before their users. We + // assume that each recipe that has in-loop users starts an interval. We + // record every time that an in-loop value is used, so we have a list of the + // first and last occurrences of each recipe. + ReversePostOrderTraversal> RPOT( + Plan.getVectorLoopRegion()); + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { + if (!VPBB->getParent()) + break; + for (VPRecipeBase &R : *VPBB) { + Idx2Recipe.push_back(&R); + + // Save the end location of each USE. + for (VPValue *U : R.operands()) { + auto *DefR = U->getDefiningRecipe(); + + // Ignore non-recipe values such as arguments, constants, etc. + // FIXME: Might need some motivation why these values are ignored. If + // for example an argument is used inside the loop it will increase the + // register pressure (so shouldn't we add it to LoopInvariants?). + if (!DefR && (!U->getLiveInIRValue() || + !isa(U->getLiveInIRValue()))) + continue; + + // No defining recipe: record the value as loop-invariant and continue. + if (!DefR) { + LoopInvariants.insert(U); + continue; + } + + // Overwrite previous end points. + EndPoint[DefR] = Idx2Recipe.size(); + Ends.insert(DefR); + } + } + if (VPBB == Plan.getVectorLoopRegion()->getExiting()) { + // VPWidenIntOrFpInductionRecipes are used implicitly at the end of the + // exiting block, where their increment will get materialized eventually. + for (auto &R : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { + if (isa(&R)) { + EndPoint[&R] = Idx2Recipe.size(); + Ends.insert(&R); + } + } + } + } + + // Saves the list of intervals that end with the index in 'key'. 
+ using RecipeList = SmallVector; + SmallDenseMap TransposeEnds; + + // Next, we transpose the EndPoints into a multi map that holds the list of + // intervals that *end* at a specific location. + for (auto &Interval : EndPoint) + TransposeEnds[Interval.second].push_back(Interval.first); + + SmallPtrSet OpenIntervals; + SmallVector RUs(VFs.size()); + SmallVector, 8> MaxUsages(VFs.size()); + + LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); + + VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); + + const auto &TTICapture = TTI; + auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { + if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) || + (VF.isScalable() && + !TTICapture.isElementTypeLegalForScalableVector(Ty))) + return 0; + return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); + }; + + // We scan the instructions linearly and record each time that a new interval + // starts, by placing it in a set. If we find this value in TransposeEnds then + // we remove it from the set. The max register usage is the maximum register + // usage of the recipes of the set. + for (unsigned int Idx = 0, Sz = Idx2Recipe.size(); Idx < Sz; ++Idx) { + VPRecipeBase *R = Idx2Recipe[Idx]; + + // Remove all of the recipes that end at this location. + RecipeList &List = TransposeEnds[Idx]; + for (VPRecipeBase *ToRemove : List) + OpenIntervals.erase(ToRemove); + + // Ignore recipes that are never used within the loop and do not have side + // effects. + if (!Ends.count(R) && !R->mayHaveSideEffects()) + continue; + + // Skip recipes for ignored values. + // TODO: Should mark recipes for ephemeral values that cannot be removed + // explicitly in VPlan. + if (isa(R) && + ValuesToIgnore.contains( + cast(R)->getUnderlyingValue())) + continue; + + // For each VF find the maximum usage of registers. 
+ for (unsigned J = 0, E = VFs.size(); J < E; ++J) { + // Count the number of registers used, per register class, given all open + // intervals. + // Note that elements in this SmallMapVector will be default constructed + // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if + // there is no previous entry for ClassID. + SmallMapVector RegUsage; + + for (auto *R : OpenIntervals) { + // Skip recipes that weren't present in the original loop. + // TODO: Remove after removing the legacy + // LoopVectorizationCostModel::calculateRegisterUsage + if (isa(R)) + continue; + + if (VFs[J].isScalar() || + isa(R) || + (isa(R) && + all_of(cast(R)->users(), [&](VPUser *U) { + return cast(U)->usesScalars(R->getVPSingleValue()); + }))) { + unsigned ClassID = TTI.getRegisterClassForType( + false, TypeInfo.inferScalarType(R->getVPSingleValue())); + // FIXME: The target might use more than one register for the type + // even in the scalar case. + RegUsage[ClassID] += 1; + } else { + for (VPValue *DefV : R->definedValues()) { + Type *ScalarTy = TypeInfo.inferScalarType(DefV); + unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy); + RegUsage[ClassID] += GetRegUsage(ScalarTy, VFs[J]); + } + } + } + + for (const auto &Pair : RegUsage) { + auto &Entry = MaxUsages[J][Pair.first]; + Entry = std::max(Entry, Pair.second); + } + } + + LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # " + << OpenIntervals.size() << '\n'); + + // Add the current recipe to the list of open intervals. + OpenIntervals.insert(R); + } + + // We also search for instructions that are defined outside the loop, but are + // used inside the loop. We need this number separately from the max-interval + // usage number because when we unroll, loop-invariant values do not take + // more registers. 
+ LoopVectorizationCostModel::RegisterUsage RU; + for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) { + // Note that elements in this SmallMapVector will be default constructed + // as 0. So we can use "Invariant[ClassID] += n" in the code below even if + // there is no previous entry for ClassID. + SmallMapVector Invariant; + + for (auto *In : LoopInvariants) { + // FIXME: The target might use more than one register for the type + // even in the scalar case. + bool IsScalar = all_of(In->users(), [&](VPUser *U) { + return cast(U)->usesScalars(In); + }); + + ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx]; + unsigned ClassID = TTI.getRegisterClassForType( + VF.isVector(), TypeInfo.inferScalarType(In)); + Invariant[ClassID] += GetRegUsage(TypeInfo.inferScalarType(In), VF); + } + + LLVM_DEBUG({ + dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n'; + dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size() + << " item\n"; + for (const auto &pair : MaxUsages[Idx]) { + dbgs() << "LV(REG): RegisterClass: " + << TTI.getRegisterClassName(pair.first) << ", " << pair.second + << " registers\n"; + } + dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() + << " item\n"; + for (const auto &pair : Invariant) { + dbgs() << "LV(REG): RegisterClass: " + << TTI.getRegisterClassName(pair.first) << ", " << pair.second + << " registers\n"; + } + }); + + RU.LoopInvariantRegs = Invariant; + RU.MaxLocalUsers = MaxUsages[Idx]; + RUs[Idx] = RU; + } + + return RUs; +} + unsigned -LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, +LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF, InstructionCost LoopCost) { // -- The interleave heuristics -- // We interleave the loop in order to expose ILP and reduce the loop overhead. 
@@ -4922,7 +5148,8 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, return 1; } - RegisterUsage R = calculateRegisterUsage({VF})[0]; + RegisterUsage R = + ::calculateRegisterUsage(Plan, {VF}, TTI, ValuesToIgnore)[0]; // We divide by these constants so assume that we have at least one // instruction that uses at least one register. for (auto &Pair : R.MaxLocalUsers) { @@ -5173,7 +5400,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { // We also search for instructions that are defined outside the loop, but are // used inside the loop. We need this number separately from the max-interval // usage number because when we unroll, loop-invariant values do not take - // more register. + // more registers. LoopBlocksDFS DFS(TheLoop); DFS.perform(LI); @@ -10760,7 +10987,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { AddBranchWeights, CM.CostKind); if (LVP.hasPlanWithVF(VF.Width)) { // Select the interleave count. - IC = CM.selectInterleaveCount(VF.Width, VF.Cost); + IC = CM.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost); unsigned SelectedIC = std::max(IC, UserIC); // Optimistically generate runtime checks if they are needed. 
Drop them if diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll index 69af51deea08e..0ec90b75002cd 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll @@ -8,8 +8,8 @@ target triple = "aarch64" ; CHECK-LABEL: LV: Checking a loop in 'or_reduction_neon' from ; CHECK: LV(REG): VF = 32 ; CHECK-NEXT: LV(REG): Found max usage: 2 item +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers ; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 72 registers -; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers define i1 @or_reduction_neon(i32 %arg, ptr %ptr) { entry: @@ -31,8 +31,8 @@ loop: ; CHECK-LABEL: LV: Checking a loop in 'or_reduction_sve' ; CHECK: LV(REG): VF = 64 ; CHECK-NEXT: LV(REG): Found max usage: 2 item +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers ; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers -; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers define i1 @or_reduction_sve(i32 %arg, ptr %ptr) vscale_range(2,2) "target-features"="+sve" { entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index 542008f34131f..bbcc6db020307 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -3240,10 +3240,10 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX58_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX58]], align 4 ; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX67_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX67]], align 4 ; CHECK-INTERLEAVED-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 -; CHECK-INTERLEAVED-NEXT: 
[[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 ; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 ; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX67_PROMOTED]], i32 0 ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX58_PROMOTED]], i32 0 @@ -3257,42 +3257,19 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP64:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP65:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ [[TMP1]], [[VECTOR_PH]] ], [ [[TMP58:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP59:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP52:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP53:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ 
[[TMP47:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI8:%.*]] = phi <4 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[TMP40:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI9:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI11:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI12:%.*]] = phi <4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI13:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI14:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI15:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 4 ; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 4 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <4 x i8> [[WIDE_LOAD16]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = shl nsw i64 [[INDEX]], 3 -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = shl nsw i64 [[TMP8]], 3 -; CHECK-INTERLEAVED-NEXT: 
[[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = shl nsw i64 [[INDEX]], 3 ; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP15]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_VEC:%.*]] = load <32 x i8>, ptr [[TMP16]], align 1 -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC17:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC18:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC19:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC20:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC21:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC22:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC23:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> ; CHECK-INTERLEAVED-NEXT: [[WIDE_VEC24:%.*]] = load <32 x i8>, ptr [[TMP17]], align 1 ; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC25:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> ; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC26:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> @@ -3302,74 +3279,42 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC30:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> ; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC31:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> ; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC32:%.*]] = shufflevector <32 x i8> 
[[WIDE_VEC24]], <32 x i8> poison, <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <4 x i8> [[STRIDED_VEC]] to <4 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext <4 x i8> [[STRIDED_VEC25]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nsw <4 x i32> [[TMP18]], [[TMP12]] ; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <4 x i32> [[TMP19]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP22]] = add <4 x i32> [[TMP20]], [[VEC_PHI14]] -; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add <4 x i32> [[TMP21]], [[VEC_PHI15]] -; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <4 x i8> [[STRIDED_VEC17]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP22]] = add <4 x i32> [[TMP21]], [[VEC_PHI14]] ; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <4 x i8> [[STRIDED_VEC26]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <4 x i32> [[TMP24]], [[TMP12]] ; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = mul nsw <4 x i32> [[TMP25]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP28]] = add <4 x i32> [[TMP26]], [[VEC_PHI12]] -; CHECK-INTERLEAVED-NEXT: [[TMP29]] = add <4 x i32> [[TMP27]], [[VEC_PHI13]] -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = sext <4 x i8> [[STRIDED_VEC18]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP28]] = add <4 x i32> [[TMP27]], [[VEC_PHI12]] ; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = sext <4 x i8> [[STRIDED_VEC27]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = mul nsw <4 x i32> [[TMP30]], [[TMP12]] ; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = mul nsw <4 x i32> [[TMP31]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP34]] = add <4 x i32> [[TMP32]], [[VEC_PHI10]] -; CHECK-INTERLEAVED-NEXT: [[TMP35]] = add <4 x i32> [[TMP33]], [[VEC_PHI11]] -; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = sext <4 x i8> [[STRIDED_VEC19]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP34]] = add <4 x i32> [[TMP33]], [[VEC_PHI10]] ; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext <4 x i8> [[STRIDED_VEC28]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = mul nsw <4 x i32> 
[[TMP36]], [[TMP12]] ; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = mul nsw <4 x i32> [[TMP37]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP40]] = add <4 x i32> [[TMP38]], [[VEC_PHI8]] -; CHECK-INTERLEAVED-NEXT: [[TMP41]] = add <4 x i32> [[TMP39]], [[VEC_PHI9]] -; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = sext <4 x i8> [[STRIDED_VEC20]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP40]] = add <4 x i32> [[TMP39]], [[VEC_PHI8]] ; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = sext <4 x i8> [[STRIDED_VEC29]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = mul nsw <4 x i32> [[TMP42]], [[TMP12]] ; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = mul nsw <4 x i32> [[TMP43]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP46]] = add <4 x i32> [[TMP44]], [[VEC_PHI6]] -; CHECK-INTERLEAVED-NEXT: [[TMP47]] = add <4 x i32> [[TMP45]], [[VEC_PHI7]] -; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <4 x i8> [[STRIDED_VEC21]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP46]] = add <4 x i32> [[TMP45]], [[VEC_PHI6]] ; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = sext <4 x i8> [[STRIDED_VEC30]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = mul nsw <4 x i32> [[TMP48]], [[TMP12]] ; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = mul nsw <4 x i32> [[TMP49]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP52]] = add <4 x i32> [[TMP50]], [[VEC_PHI4]] -; CHECK-INTERLEAVED-NEXT: [[TMP53]] = add <4 x i32> [[TMP51]], [[VEC_PHI5]] -; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = sext <4 x i8> [[STRIDED_VEC22]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP52]] = add <4 x i32> [[TMP51]], [[VEC_PHI4]] ; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = sext <4 x i8> [[STRIDED_VEC31]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = mul nsw <4 x i32> [[TMP54]], [[TMP12]] ; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = mul nsw <4 x i32> [[TMP55]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP58]] = add <4 x i32> [[TMP56]], [[VEC_PHI2]] -; CHECK-INTERLEAVED-NEXT: [[TMP59]] = add <4 x i32> [[TMP57]], [[VEC_PHI3]] -; CHECK-INTERLEAVED-NEXT: 
[[TMP60:%.*]] = sext <4 x i8> [[STRIDED_VEC23]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP58]] = add <4 x i32> [[TMP57]], [[VEC_PHI2]] ; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = sext <4 x i8> [[STRIDED_VEC32]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = mul nsw <4 x i32> [[TMP60]], [[TMP12]] ; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = mul nsw <4 x i32> [[TMP61]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP64]] = add <4 x i32> [[TMP62]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP65]] = add <4 x i32> [[TMP63]], [[VEC_PHI1]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP64]] = add <4 x i32> [[TMP63]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP66]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP65]], [[TMP64]] -; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX33:%.*]] = add <4 x i32> [[TMP59]], [[TMP58]] -; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX33]]) -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX34:%.*]] = add <4 x i32> [[TMP53]], [[TMP52]] -; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX34]]) -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX35:%.*]] = add <4 x i32> [[TMP47]], [[TMP46]] -; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX35]]) -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX36:%.*]] = add <4 x i32> [[TMP41]], [[TMP40]] -; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX36]]) -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> 
[[TMP35]], [[TMP34]] -; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX37]]) -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP29]], [[TMP28]] -; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]]) -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX39:%.*]] = add <4 x i32> [[TMP23]], [[TMP22]] -; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX39]]) +; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP64]]) +; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP58]]) +; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP52]]) +; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP46]]) +; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP40]]) +; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP34]]) +; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP28]]) +; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP22]]) ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVED: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll index 5c69956429442..ab12ca4254a09 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll @@ -16,11 +16,10 @@ define void @get_invariant_reg_usage(ptr %z) { ; CHECK-LABEL: LV: Checking a loop in 'get_invariant_reg_usage' ; CHECK: 
LV(REG): VF = vscale x 16 ; CHECK-NEXT: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers -; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 1 registers -; CHECK-NEXT: LV(REG): Found invariant usage: 2 item ; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers -; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 8 registers +; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 1 registers +; CHECK-NEXT: LV(REG): Found invariant usage: 1 item +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers L.entry: %0 = load i128, ptr %z, align 16 @@ -46,9 +45,9 @@ define void @load_and_compare_only_used_by_assume(ptr %a, ptr noalias %b) { ; CHECK-LABEL: LV: Checking a loop in 'load_and_compare_only_used_by_assume' ; CHECK: LV(REG): VF = vscale x 4 ; CHECK-NEXT: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers ; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 1 registers -; CHECK-NEXT: LV(REG): Found invariant usage: 0 item +; CHECK-NEXT: LV(REG): Found invariant usage: 1 item entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll index 021ef0d543a18..5baf1e013a50f 100644 --- a/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll @@ -9,17 +9,18 @@ define void @bar(ptr %A, i32 signext %n) { ; CHECK-LABEL: bar ; CHECK-SCALAR: LV(REG): Found max usage: 2 item -; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 2 registers +; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 3 registers ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: LoongArch::FPRRC, 1 registers ; CHECK-SCALAR-NEXT: LV(REG): Found invariant usage: 1 item ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: 
LoongArch::GPRRC, 1 registers ; CHECK-SCALAR-NEXT: LV: The target has 30 registers of LoongArch::GPRRC register class ; CHECK-SCALAR-NEXT: LV: The target has 32 registers of LoongArch::FPRRC register class ; CHECK-VECTOR: LV(REG): Found max usage: 2 item -; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::VRRC, 3 registers -; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 1 registers +; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 2 registers +; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::VRRC, 2 registers ; CHECK-VECTOR-NEXT: LV(REG): Found invariant usage: 1 item ; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 1 registers +; CHECK-VECTOR-NEXT: LV: The target has 30 registers of LoongArch::GPRRC register class ; CHECK-VECTOR-NEXT: LV: The target has 32 registers of LoongArch::VRRC register class entry: diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll index 36fff14528de9..f1947dec2ea23 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll @@ -18,10 +18,10 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) { ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP2]], 2 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] ; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[TMP2]], 24 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK3]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 24 ; CHECK-NEXT: [[N_VEC:%.*]] = sub 
i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: @@ -35,6 +35,10 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) { ; CHECK-NEXT: [[VEC_PHI15:%.*]] = phi <2 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP48:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <2 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP49:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI17:%.*]] = phi <2 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP50:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI18:%.*]] = phi <2 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP64:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI19:%.*]] = phi <2 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP65:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI20:%.*]] = phi <2 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP66:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI21:%.*]] = phi <2 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP67:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[STEP_ADD_2:%.*]] = add <2 x i64> [[STEP_ADD]], splat (i64 2) ; CHECK-NEXT: [[STEP_ADD_3:%.*]] = add <2 x i64> [[STEP_ADD_2]], splat (i64 2) @@ -42,6 +46,10 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) { ; CHECK-NEXT: [[STEP_ADD_5:%.*]] = add <2 x i64> [[STEP_ADD_4]], splat (i64 2) ; CHECK-NEXT: [[STEP_ADD_6:%.*]] = add <2 x i64> [[STEP_ADD_5]], splat (i64 2) ; CHECK-NEXT: [[STEP_ADD_7:%.*]] = add <2 x i64> [[STEP_ADD_6]], splat (i64 2) +; CHECK-NEXT: [[STEP_ADD_8:%.*]] = add <2 x i64> [[STEP_ADD_7]], splat (i64 2) +; CHECK-NEXT: [[STEP_ADD_9:%.*]] = add <2 x i64> [[STEP_ADD_8]], splat (i64 2) +; CHECK-NEXT: [[STEP_ADD_10:%.*]] = add <2 x i64> [[STEP_ADD_9]], splat (i64 2) +; CHECK-NEXT: [[STEP_ADD_11:%.*]] = add <2 x i64> [[STEP_ADD_10]], splat (i64 2) ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] ; CHECK-NEXT: 
[[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 2 @@ -51,6 +59,10 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) { ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 10 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 12 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 14 +; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 16 +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 18 +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 20 +; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 22 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP11]], align 1 ; CHECK-NEXT: [[WIDE_LOAD25:%.*]] = load <2 x i8>, ptr [[TMP12]], align 1 ; CHECK-NEXT: [[WIDE_LOAD26:%.*]] = load <2 x i8>, ptr [[TMP13]], align 1 @@ -59,6 +71,10 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) { ; CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <2 x i8>, ptr [[TMP16]], align 1 ; CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <2 x i8>, ptr [[TMP17]], align 1 ; CHECK-NEXT: [[WIDE_LOAD31:%.*]] = load <2 x i8>, ptr [[TMP18]], align 1 +; CHECK-NEXT: [[WIDE_LOAD22:%.*]] = load <2 x i8>, ptr [[TMP68]], align 1 +; CHECK-NEXT: [[WIDE_LOAD23:%.*]] = load <2 x i8>, ptr [[TMP69]], align 1 +; CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <2 x i8>, ptr [[TMP70]], align 1 +; CHECK-NEXT: [[WIDE_LOAD33:%.*]] = load <2 x i8>, ptr [[TMP71]], align 1 ; CHECK-NEXT: [[TMP19:%.*]] = zext <2 x i8> [[WIDE_LOAD]] to <2 x i64> ; CHECK-NEXT: [[TMP20:%.*]] = zext <2 x i8> [[WIDE_LOAD25]] to <2 x i64> ; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[WIDE_LOAD26]] to <2 x i64> @@ -67,6 +83,10 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) { ; CHECK-NEXT: [[TMP24:%.*]] = zext <2 x i8> [[WIDE_LOAD29]] to <2 x i64> ; CHECK-NEXT: [[TMP25:%.*]] = zext <2 x i8> [[WIDE_LOAD30]] to <2 x i64> ; 
CHECK-NEXT: [[TMP26:%.*]] = zext <2 x i8> [[WIDE_LOAD31]] to <2 x i64> +; CHECK-NEXT: [[TMP72:%.*]] = zext <2 x i8> [[WIDE_LOAD22]] to <2 x i64> +; CHECK-NEXT: [[TMP73:%.*]] = zext <2 x i8> [[WIDE_LOAD23]] to <2 x i64> +; CHECK-NEXT: [[TMP74:%.*]] = zext <2 x i8> [[WIDE_LOAD24]] to <2 x i64> +; CHECK-NEXT: [[TMP75:%.*]] = zext <2 x i8> [[WIDE_LOAD33]] to <2 x i64> ; CHECK-NEXT: [[TMP27:%.*]] = shl <2 x i64> [[VEC_IND]], splat (i64 1) ; CHECK-NEXT: [[TMP28:%.*]] = shl <2 x i64> [[STEP_ADD]], splat (i64 1) ; CHECK-NEXT: [[TMP29:%.*]] = shl <2 x i64> [[STEP_ADD_2]], splat (i64 1) @@ -75,6 +95,10 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) { ; CHECK-NEXT: [[TMP32:%.*]] = shl <2 x i64> [[STEP_ADD_5]], splat (i64 1) ; CHECK-NEXT: [[TMP33:%.*]] = shl <2 x i64> [[STEP_ADD_6]], splat (i64 1) ; CHECK-NEXT: [[TMP34:%.*]] = shl <2 x i64> [[STEP_ADD_7]], splat (i64 1) +; CHECK-NEXT: [[TMP76:%.*]] = shl <2 x i64> [[STEP_ADD_8]], splat (i64 1) +; CHECK-NEXT: [[TMP77:%.*]] = shl <2 x i64> [[STEP_ADD_9]], splat (i64 1) +; CHECK-NEXT: [[TMP78:%.*]] = shl <2 x i64> [[STEP_ADD_10]], splat (i64 1) +; CHECK-NEXT: [[TMP79:%.*]] = shl <2 x i64> [[STEP_ADD_11]], splat (i64 1) ; CHECK-NEXT: [[TMP35:%.*]] = shl <2 x i64> [[TMP19]], [[TMP27]] ; CHECK-NEXT: [[TMP36:%.*]] = shl <2 x i64> [[TMP20]], [[TMP28]] ; CHECK-NEXT: [[TMP37:%.*]] = shl <2 x i64> [[TMP21]], [[TMP29]] @@ -83,6 +107,10 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) { ; CHECK-NEXT: [[TMP40:%.*]] = shl <2 x i64> [[TMP24]], [[TMP32]] ; CHECK-NEXT: [[TMP41:%.*]] = shl <2 x i64> [[TMP25]], [[TMP33]] ; CHECK-NEXT: [[TMP42:%.*]] = shl <2 x i64> [[TMP26]], [[TMP34]] +; CHECK-NEXT: [[TMP80:%.*]] = shl <2 x i64> [[TMP72]], [[TMP76]] +; CHECK-NEXT: [[TMP81:%.*]] = shl <2 x i64> [[TMP73]], [[TMP77]] +; CHECK-NEXT: [[TMP82:%.*]] = shl <2 x i64> [[TMP74]], [[TMP78]] +; CHECK-NEXT: [[TMP83:%.*]] = shl <2 x i64> [[TMP75]], [[TMP79]] ; CHECK-NEXT: [[TMP43]] = or <2 x i64> [[TMP35]], [[VEC_PHI]] ; CHECK-NEXT: 
[[TMP44]] = or <2 x i64> [[TMP36]], [[VEC_PHI11]] ; CHECK-NEXT: [[TMP45]] = or <2 x i64> [[TMP37]], [[VEC_PHI12]] @@ -91,8 +119,12 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) { ; CHECK-NEXT: [[TMP48]] = or <2 x i64> [[TMP40]], [[VEC_PHI15]] ; CHECK-NEXT: [[TMP49]] = or <2 x i64> [[TMP41]], [[VEC_PHI16]] ; CHECK-NEXT: [[TMP50]] = or <2 x i64> [[TMP42]], [[VEC_PHI17]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD_7]], splat (i64 2) +; CHECK-NEXT: [[TMP64]] = or <2 x i64> [[TMP80]], [[VEC_PHI18]] +; CHECK-NEXT: [[TMP65]] = or <2 x i64> [[TMP81]], [[VEC_PHI19]] +; CHECK-NEXT: [[TMP66]] = or <2 x i64> [[TMP82]], [[VEC_PHI20]] +; CHECK-NEXT: [[TMP67]] = or <2 x i64> [[TMP83]], [[VEC_PHI21]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 24 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD_11]], splat (i64 2) ; CHECK-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP51]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -102,7 +134,11 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) { ; CHECK-NEXT: [[BIN_RDX20:%.*]] = or <2 x i64> [[TMP47]], [[BIN_RDX19]] ; CHECK-NEXT: [[BIN_RDX21:%.*]] = or <2 x i64> [[TMP48]], [[BIN_RDX20]] ; CHECK-NEXT: [[BIN_RDX22:%.*]] = or <2 x i64> [[TMP49]], [[BIN_RDX21]] -; CHECK-NEXT: [[BIN_RDX37:%.*]] = or <2 x i64> [[TMP50]], [[BIN_RDX22]] +; CHECK-NEXT: [[BIN_RDX31:%.*]] = or <2 x i64> [[TMP50]], [[BIN_RDX22]] +; CHECK-NEXT: [[BIN_RDX32:%.*]] = or <2 x i64> [[TMP64]], [[BIN_RDX31]] +; CHECK-NEXT: [[BIN_RDX33:%.*]] = or <2 x i64> [[TMP65]], [[BIN_RDX32]] +; CHECK-NEXT: [[BIN_RDX34:%.*]] = or <2 x i64> [[TMP66]], [[BIN_RDX33]] +; CHECK-NEXT: [[BIN_RDX37:%.*]] = or <2 x i64> [[TMP67]], [[BIN_RDX34]] ; CHECK-NEXT: [[TMP52:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[BIN_RDX37]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 
[[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] @@ -123,17 +159,17 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) { ; CHECK-NEXT: [[TMP57:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0 ; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT32:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX38:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT32:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND27:%.*]] = phi <2 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT28:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI29:%.*]] = phi <2 x i64> [ [[TMP57]], %[[VEC_EPILOG_PH]] ], [ [[TMP58:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[NEXT_GEP30:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] +; CHECK-NEXT: [[NEXT_GEP30:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX38]] ; CHECK-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[NEXT_GEP30]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD32:%.*]] = load <2 x i8>, ptr [[TMP60]], align 1 ; CHECK-NEXT: [[TMP61:%.*]] = zext <2 x i8> [[WIDE_LOAD32]] to <2 x i64> ; CHECK-NEXT: [[TMP62:%.*]] = shl <2 x i64> [[VEC_IND27]], splat (i64 1) ; CHECK-NEXT: [[TMP63:%.*]] = shl <2 x i64> [[TMP61]], [[TMP62]] ; CHECK-NEXT: [[TMP58]] = or <2 x i64> [[TMP63]], [[VEC_PHI29]] -; CHECK-NEXT: [[INDEX_NEXT32]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[INDEX_NEXT32]] = add nuw i64 [[INDEX38]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT28]] = add <2 x i64> [[VEC_IND27]], splat (i64 2) ; CHECK-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT32]], [[N_VEC25]] ; CHECK-NEXT: br i1 [[TMP54]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] @@ -142,14 +178,14 @@ define i1 @select_exit_cond(ptr %start, 
ptr %end, i64 %N) { ; CHECK-NEXT: [[CMP_N33:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC25]] ; CHECK-NEXT: br i1 [[CMP_N33]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL34:%.*]] = phi i64 [ [[N_VEC25]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX35:%.*]] = phi i64 [ [[TMP55]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP52]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL36:%.*]] = phi ptr [ [[TMP56]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL46:%.*]] = phi i64 [ [[N_VEC25]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX47:%.*]] = phi i64 [ [[TMP55]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP52]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL48:%.*]] = phi ptr [ [[TMP56]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL34]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX35]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL36]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL46]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX47]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL48]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[TMP53:%.*]] 
= load i8, ptr [[PTR_IV]], align 1 ; CHECK-NEXT: [[CONV3:%.*]] = zext i8 [[TMP53]] to i64 ; CHECK-NEXT: [[MUL:%.*]] = shl i64 [[IV]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll index 33f0452265c41..0b23206134bc0 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll @@ -10,71 +10,28 @@ define void @QLA_F3_r_veq_norm2_V(ptr noalias %r, ptr noalias %a, i32 %n) { ; CHECK-SAME: ptr noalias [[R:%.*]], ptr noalias [[A:%.*]], i32 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[CMP24:%.*]] = icmp sgt i32 [[N]], 0 -; CHECK-NEXT: br i1 [[CMP24]], label %[[ITER_CHECK:.*]], label %[[FOR_END13:.*]] -; CHECK: [[ITER_CHECK]]: +; CHECK-NEXT: br i1 [[CMP24]], label %[[FOR_COND1_PREHEADER_PREHEADER:.*]], label %[[FOR_END13:.*]] +; CHECK: [[FOR_COND1_PREHEADER_PREHEADER]]: ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] -; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: -; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, 
%[[VECTOR_PH]] ], [ [[TMP129:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP130:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP131:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP132:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP133:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP134:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP135:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP136:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 6 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 10 -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 12 -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 14 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDEX]], i64 0, i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP2]], i64 0, i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP3]], i64 0, i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP4]], i64 0, i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP5]], i64 0, i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP6]], i64 0, i32 0 -; 
CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP7]], i64 0, i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP8]], i64 0, i32 0 -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x float>, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x float> [[WIDE_VEC]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <12 x float> [[WIDE_VEC]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <12 x float> [[WIDE_VEC]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = shufflevector <12 x float> [[WIDE_VEC]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <12 x float> [[WIDE_VEC]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <12 x float> [[WIDE_VEC]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[WIDE_VEC14:%.*]] = load <12 x float>, ptr [[TMP10]], align 8 -; CHECK-NEXT: [[STRIDED_VEC15:%.*]] = shufflevector <12 x float> [[WIDE_VEC14]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC16:%.*]] = shufflevector <12 x float> [[WIDE_VEC14]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC17:%.*]] = shufflevector <12 x float> [[WIDE_VEC14]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC18:%.*]] = shufflevector <12 x float> [[WIDE_VEC14]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC19:%.*]] = shufflevector <12 x float> [[WIDE_VEC14]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC20:%.*]] = shufflevector <12 x float> [[WIDE_VEC14]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[WIDE_VEC21:%.*]] = load <12 x float>, ptr [[TMP11]], align 8 -; CHECK-NEXT: [[STRIDED_VEC22:%.*]] = shufflevector <12 x float> [[WIDE_VEC21]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC23:%.*]] = shufflevector <12 x float> [[WIDE_VEC21]], <12 
x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC24:%.*]] = shufflevector <12 x float> [[WIDE_VEC21]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC25:%.*]] = shufflevector <12 x float> [[WIDE_VEC21]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC26:%.*]] = shufflevector <12 x float> [[WIDE_VEC21]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC27:%.*]] = shufflevector <12 x float> [[WIDE_VEC21]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[WIDE_VEC28:%.*]] = load <12 x float>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[STRIDED_VEC29:%.*]] = shufflevector <12 x float> [[WIDE_VEC28]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC30:%.*]] = shufflevector <12 x float> [[WIDE_VEC28]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC31:%.*]] = shufflevector <12 x float> [[WIDE_VEC28]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC32:%.*]] = shufflevector <12 x float> [[WIDE_VEC28]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC33:%.*]] = shufflevector <12 x float> [[WIDE_VEC28]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC34:%.*]] = shufflevector <12 x float> [[WIDE_VEC28]], <12 x float> poison, <2 x i32> +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP69:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP65:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP66:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP67:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 6 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDEX]], i64 0, i32 0 
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP1]], i64 0, i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP2]], i64 0, i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP3]], i64 0, i32 0 ; CHECK-NEXT: [[WIDE_VEC35:%.*]] = load <12 x float>, ptr [[TMP13]], align 8 ; CHECK-NEXT: [[STRIDED_VEC36:%.*]] = shufflevector <12 x float> [[WIDE_VEC35]], <12 x float> poison, <2 x i32> ; CHECK-NEXT: [[STRIDED_VEC37:%.*]] = shufflevector <12 x float> [[WIDE_VEC35]], <12 x float> poison, <2 x i32> @@ -103,191 +60,83 @@ define void @QLA_F3_r_veq_norm2_V(ptr noalias %r, ptr noalias %a, i32 %n) { ; CHECK-NEXT: [[STRIDED_VEC60:%.*]] = shufflevector <12 x float> [[WIDE_VEC56]], <12 x float> poison, <2 x i32> ; CHECK-NEXT: [[STRIDED_VEC61:%.*]] = shufflevector <12 x float> [[WIDE_VEC56]], <12 x float> poison, <2 x i32> ; CHECK-NEXT: [[STRIDED_VEC62:%.*]] = shufflevector <12 x float> [[WIDE_VEC56]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = fmul fast <2 x float> [[STRIDED_VEC]], [[STRIDED_VEC]] -; CHECK-NEXT: [[TMP18:%.*]] = fmul fast <2 x float> [[STRIDED_VEC15]], [[STRIDED_VEC15]] -; CHECK-NEXT: [[TMP19:%.*]] = fmul fast <2 x float> [[STRIDED_VEC22]], [[STRIDED_VEC22]] -; CHECK-NEXT: [[TMP20:%.*]] = fmul fast <2 x float> [[STRIDED_VEC29]], [[STRIDED_VEC29]] -; CHECK-NEXT: [[TMP21:%.*]] = fmul fast <2 x float> [[STRIDED_VEC36]], [[STRIDED_VEC36]] -; CHECK-NEXT: [[TMP22:%.*]] = fmul fast <2 x float> [[STRIDED_VEC43]], [[STRIDED_VEC43]] -; CHECK-NEXT: [[TMP23:%.*]] = fmul fast <2 x float> [[STRIDED_VEC50]], [[STRIDED_VEC50]] -; CHECK-NEXT: [[TMP24:%.*]] = fmul fast <2 x float> [[STRIDED_VEC57]], [[STRIDED_VEC57]] -; CHECK-NEXT: [[TMP25:%.*]] = fmul fast <2 x float> [[STRIDED_VEC9]], [[STRIDED_VEC9]] -; CHECK-NEXT: [[TMP26:%.*]] = fmul fast <2 x float> [[STRIDED_VEC16]], [[STRIDED_VEC16]] -; CHECK-NEXT: 
[[TMP27:%.*]] = fmul fast <2 x float> [[STRIDED_VEC23]], [[STRIDED_VEC23]] -; CHECK-NEXT: [[TMP28:%.*]] = fmul fast <2 x float> [[STRIDED_VEC30]], [[STRIDED_VEC30]] -; CHECK-NEXT: [[TMP29:%.*]] = fmul fast <2 x float> [[STRIDED_VEC37]], [[STRIDED_VEC37]] -; CHECK-NEXT: [[TMP30:%.*]] = fmul fast <2 x float> [[STRIDED_VEC44]], [[STRIDED_VEC44]] -; CHECK-NEXT: [[TMP31:%.*]] = fmul fast <2 x float> [[STRIDED_VEC51]], [[STRIDED_VEC51]] -; CHECK-NEXT: [[TMP32:%.*]] = fmul fast <2 x float> [[STRIDED_VEC58]], [[STRIDED_VEC58]] -; CHECK-NEXT: [[TMP33:%.*]] = fadd fast <2 x float> [[TMP25]], [[TMP17]] -; CHECK-NEXT: [[TMP34:%.*]] = fadd fast <2 x float> [[TMP26]], [[TMP18]] -; CHECK-NEXT: [[TMP35:%.*]] = fadd fast <2 x float> [[TMP27]], [[TMP19]] -; CHECK-NEXT: [[TMP36:%.*]] = fadd fast <2 x float> [[TMP28]], [[TMP20]] -; CHECK-NEXT: [[TMP37:%.*]] = fadd fast <2 x float> [[TMP29]], [[TMP21]] -; CHECK-NEXT: [[TMP38:%.*]] = fadd fast <2 x float> [[TMP30]], [[TMP22]] -; CHECK-NEXT: [[TMP39:%.*]] = fadd fast <2 x float> [[TMP31]], [[TMP23]] -; CHECK-NEXT: [[TMP40:%.*]] = fadd fast <2 x float> [[TMP32]], [[TMP24]] -; CHECK-NEXT: [[TMP41:%.*]] = fpext <2 x float> [[TMP33]] to <2 x double> -; CHECK-NEXT: [[TMP42:%.*]] = fpext <2 x float> [[TMP34]] to <2 x double> -; CHECK-NEXT: [[TMP43:%.*]] = fpext <2 x float> [[TMP35]] to <2 x double> -; CHECK-NEXT: [[TMP44:%.*]] = fpext <2 x float> [[TMP36]] to <2 x double> -; CHECK-NEXT: [[TMP45:%.*]] = fpext <2 x float> [[TMP37]] to <2 x double> -; CHECK-NEXT: [[TMP46:%.*]] = fpext <2 x float> [[TMP38]] to <2 x double> -; CHECK-NEXT: [[TMP47:%.*]] = fpext <2 x float> [[TMP39]] to <2 x double> -; CHECK-NEXT: [[TMP48:%.*]] = fpext <2 x float> [[TMP40]] to <2 x double> -; CHECK-NEXT: [[TMP49:%.*]] = fadd fast <2 x double> [[TMP41]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP50:%.*]] = fadd fast <2 x double> [[TMP42]], [[VEC_PHI2]] -; CHECK-NEXT: [[TMP51:%.*]] = fadd fast <2 x double> [[TMP43]], [[VEC_PHI3]] -; CHECK-NEXT: [[TMP52:%.*]] = fadd fast <2 x 
double> [[TMP44]], [[VEC_PHI4]] -; CHECK-NEXT: [[TMP53:%.*]] = fadd fast <2 x double> [[TMP45]], [[VEC_PHI5]] -; CHECK-NEXT: [[TMP54:%.*]] = fadd fast <2 x double> [[TMP46]], [[VEC_PHI6]] -; CHECK-NEXT: [[TMP55:%.*]] = fadd fast <2 x double> [[TMP47]], [[VEC_PHI7]] -; CHECK-NEXT: [[TMP56:%.*]] = fadd fast <2 x double> [[TMP48]], [[VEC_PHI8]] -; CHECK-NEXT: [[TMP57:%.*]] = fmul fast <2 x float> [[STRIDED_VEC10]], [[STRIDED_VEC10]] -; CHECK-NEXT: [[TMP58:%.*]] = fmul fast <2 x float> [[STRIDED_VEC17]], [[STRIDED_VEC17]] -; CHECK-NEXT: [[TMP59:%.*]] = fmul fast <2 x float> [[STRIDED_VEC24]], [[STRIDED_VEC24]] -; CHECK-NEXT: [[TMP60:%.*]] = fmul fast <2 x float> [[STRIDED_VEC31]], [[STRIDED_VEC31]] -; CHECK-NEXT: [[TMP61:%.*]] = fmul fast <2 x float> [[STRIDED_VEC38]], [[STRIDED_VEC38]] -; CHECK-NEXT: [[TMP62:%.*]] = fmul fast <2 x float> [[STRIDED_VEC45]], [[STRIDED_VEC45]] -; CHECK-NEXT: [[TMP63:%.*]] = fmul fast <2 x float> [[STRIDED_VEC52]], [[STRIDED_VEC52]] -; CHECK-NEXT: [[TMP64:%.*]] = fmul fast <2 x float> [[STRIDED_VEC59]], [[STRIDED_VEC59]] -; CHECK-NEXT: [[TMP65:%.*]] = fmul fast <2 x float> [[STRIDED_VEC11]], [[STRIDED_VEC11]] -; CHECK-NEXT: [[TMP66:%.*]] = fmul fast <2 x float> [[STRIDED_VEC18]], [[STRIDED_VEC18]] -; CHECK-NEXT: [[TMP67:%.*]] = fmul fast <2 x float> [[STRIDED_VEC25]], [[STRIDED_VEC25]] -; CHECK-NEXT: [[TMP68:%.*]] = fmul fast <2 x float> [[STRIDED_VEC32]], [[STRIDED_VEC32]] -; CHECK-NEXT: [[TMP69:%.*]] = fmul fast <2 x float> [[STRIDED_VEC39]], [[STRIDED_VEC39]] -; CHECK-NEXT: [[TMP70:%.*]] = fmul fast <2 x float> [[STRIDED_VEC46]], [[STRIDED_VEC46]] -; CHECK-NEXT: [[TMP71:%.*]] = fmul fast <2 x float> [[STRIDED_VEC53]], [[STRIDED_VEC53]] -; CHECK-NEXT: [[TMP72:%.*]] = fmul fast <2 x float> [[STRIDED_VEC60]], [[STRIDED_VEC60]] -; CHECK-NEXT: [[TMP73:%.*]] = fadd fast <2 x float> [[TMP65]], [[TMP57]] -; CHECK-NEXT: [[TMP74:%.*]] = fadd fast <2 x float> [[TMP66]], [[TMP58]] -; CHECK-NEXT: [[TMP75:%.*]] = fadd fast <2 x float> [[TMP67]], 
[[TMP59]] -; CHECK-NEXT: [[TMP76:%.*]] = fadd fast <2 x float> [[TMP68]], [[TMP60]] -; CHECK-NEXT: [[TMP77:%.*]] = fadd fast <2 x float> [[TMP69]], [[TMP61]] -; CHECK-NEXT: [[TMP78:%.*]] = fadd fast <2 x float> [[TMP70]], [[TMP62]] -; CHECK-NEXT: [[TMP79:%.*]] = fadd fast <2 x float> [[TMP71]], [[TMP63]] +; CHECK-NEXT: [[TMP64:%.*]] = fmul fast <2 x float> [[STRIDED_VEC36]], [[STRIDED_VEC36]] +; CHECK-NEXT: [[TMP97:%.*]] = fmul fast <2 x float> [[STRIDED_VEC43]], [[STRIDED_VEC43]] +; CHECK-NEXT: [[TMP98:%.*]] = fmul fast <2 x float> [[STRIDED_VEC50]], [[STRIDED_VEC50]] +; CHECK-NEXT: [[TMP99:%.*]] = fmul fast <2 x float> [[STRIDED_VEC57]], [[STRIDED_VEC57]] +; CHECK-NEXT: [[TMP72:%.*]] = fmul fast <2 x float> [[STRIDED_VEC37]], [[STRIDED_VEC37]] +; CHECK-NEXT: [[TMP105:%.*]] = fmul fast <2 x float> [[STRIDED_VEC44]], [[STRIDED_VEC44]] +; CHECK-NEXT: [[TMP106:%.*]] = fmul fast <2 x float> [[STRIDED_VEC51]], [[STRIDED_VEC51]] +; CHECK-NEXT: [[TMP107:%.*]] = fmul fast <2 x float> [[STRIDED_VEC58]], [[STRIDED_VEC58]] ; CHECK-NEXT: [[TMP80:%.*]] = fadd fast <2 x float> [[TMP72]], [[TMP64]] -; CHECK-NEXT: [[TMP81:%.*]] = fpext <2 x float> [[TMP73]] to <2 x double> -; CHECK-NEXT: [[TMP82:%.*]] = fpext <2 x float> [[TMP74]] to <2 x double> -; CHECK-NEXT: [[TMP83:%.*]] = fpext <2 x float> [[TMP75]] to <2 x double> -; CHECK-NEXT: [[TMP84:%.*]] = fpext <2 x float> [[TMP76]] to <2 x double> -; CHECK-NEXT: [[TMP85:%.*]] = fpext <2 x float> [[TMP77]] to <2 x double> -; CHECK-NEXT: [[TMP86:%.*]] = fpext <2 x float> [[TMP78]] to <2 x double> -; CHECK-NEXT: [[TMP87:%.*]] = fpext <2 x float> [[TMP79]] to <2 x double> -; CHECK-NEXT: [[TMP88:%.*]] = fpext <2 x float> [[TMP80]] to <2 x double> -; CHECK-NEXT: [[TMP89:%.*]] = fadd fast <2 x double> [[TMP81]], [[TMP49]] -; CHECK-NEXT: [[TMP90:%.*]] = fadd fast <2 x double> [[TMP82]], [[TMP50]] -; CHECK-NEXT: [[TMP91:%.*]] = fadd fast <2 x double> [[TMP83]], [[TMP51]] -; CHECK-NEXT: [[TMP92:%.*]] = fadd fast <2 x double> [[TMP84]], 
[[TMP52]] -; CHECK-NEXT: [[TMP93:%.*]] = fadd fast <2 x double> [[TMP85]], [[TMP53]] -; CHECK-NEXT: [[TMP94:%.*]] = fadd fast <2 x double> [[TMP86]], [[TMP54]] -; CHECK-NEXT: [[TMP95:%.*]] = fadd fast <2 x double> [[TMP87]], [[TMP55]] -; CHECK-NEXT: [[TMP96:%.*]] = fadd fast <2 x double> [[TMP88]], [[TMP56]] -; CHECK-NEXT: [[TMP97:%.*]] = fmul fast <2 x float> [[STRIDED_VEC12]], [[STRIDED_VEC12]] -; CHECK-NEXT: [[TMP98:%.*]] = fmul fast <2 x float> [[STRIDED_VEC19]], [[STRIDED_VEC19]] -; CHECK-NEXT: [[TMP99:%.*]] = fmul fast <2 x float> [[STRIDED_VEC26]], [[STRIDED_VEC26]] -; CHECK-NEXT: [[TMP100:%.*]] = fmul fast <2 x float> [[STRIDED_VEC33]], [[STRIDED_VEC33]] -; CHECK-NEXT: [[TMP101:%.*]] = fmul fast <2 x float> [[STRIDED_VEC40]], [[STRIDED_VEC40]] -; CHECK-NEXT: [[TMP102:%.*]] = fmul fast <2 x float> [[STRIDED_VEC47]], [[STRIDED_VEC47]] -; CHECK-NEXT: [[TMP103:%.*]] = fmul fast <2 x float> [[STRIDED_VEC54]], [[STRIDED_VEC54]] -; CHECK-NEXT: [[TMP104:%.*]] = fmul fast <2 x float> [[STRIDED_VEC61]], [[STRIDED_VEC61]] -; CHECK-NEXT: [[TMP105:%.*]] = fmul fast <2 x float> [[STRIDED_VEC13]], [[STRIDED_VEC13]] -; CHECK-NEXT: [[TMP106:%.*]] = fmul fast <2 x float> [[STRIDED_VEC20]], [[STRIDED_VEC20]] -; CHECK-NEXT: [[TMP107:%.*]] = fmul fast <2 x float> [[STRIDED_VEC27]], [[STRIDED_VEC27]] -; CHECK-NEXT: [[TMP108:%.*]] = fmul fast <2 x float> [[STRIDED_VEC34]], [[STRIDED_VEC34]] -; CHECK-NEXT: [[TMP109:%.*]] = fmul fast <2 x float> [[STRIDED_VEC41]], [[STRIDED_VEC41]] -; CHECK-NEXT: [[TMP110:%.*]] = fmul fast <2 x float> [[STRIDED_VEC48]], [[STRIDED_VEC48]] -; CHECK-NEXT: [[TMP111:%.*]] = fmul fast <2 x float> [[STRIDED_VEC55]], [[STRIDED_VEC55]] -; CHECK-NEXT: [[TMP112:%.*]] = fmul fast <2 x float> [[STRIDED_VEC62]], [[STRIDED_VEC62]] ; CHECK-NEXT: [[TMP113:%.*]] = fadd fast <2 x float> [[TMP105]], [[TMP97]] ; CHECK-NEXT: [[TMP114:%.*]] = fadd fast <2 x float> [[TMP106]], [[TMP98]] ; CHECK-NEXT: [[TMP115:%.*]] = fadd fast <2 x float> [[TMP107]], [[TMP99]] +; 
CHECK-NEXT: [[TMP21:%.*]] = fpext <2 x float> [[TMP80]] to <2 x double> +; CHECK-NEXT: [[TMP22:%.*]] = fpext <2 x float> [[TMP113]] to <2 x double> +; CHECK-NEXT: [[TMP23:%.*]] = fpext <2 x float> [[TMP114]] to <2 x double> +; CHECK-NEXT: [[TMP24:%.*]] = fpext <2 x float> [[TMP115]] to <2 x double> +; CHECK-NEXT: [[TMP25:%.*]] = fadd fast <2 x double> [[TMP21]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP26:%.*]] = fadd fast <2 x double> [[TMP22]], [[VEC_PHI1]] +; CHECK-NEXT: [[TMP27:%.*]] = fadd fast <2 x double> [[TMP23]], [[VEC_PHI2]] +; CHECK-NEXT: [[TMP28:%.*]] = fadd fast <2 x double> [[TMP24]], [[VEC_PHI3]] +; CHECK-NEXT: [[TMP100:%.*]] = fmul fast <2 x float> [[STRIDED_VEC38]], [[STRIDED_VEC38]] +; CHECK-NEXT: [[TMP101:%.*]] = fmul fast <2 x float> [[STRIDED_VEC45]], [[STRIDED_VEC45]] +; CHECK-NEXT: [[TMP102:%.*]] = fmul fast <2 x float> [[STRIDED_VEC52]], [[STRIDED_VEC52]] +; CHECK-NEXT: [[TMP103:%.*]] = fmul fast <2 x float> [[STRIDED_VEC59]], [[STRIDED_VEC59]] +; CHECK-NEXT: [[TMP108:%.*]] = fmul fast <2 x float> [[STRIDED_VEC39]], [[STRIDED_VEC39]] +; CHECK-NEXT: [[TMP109:%.*]] = fmul fast <2 x float> [[STRIDED_VEC46]], [[STRIDED_VEC46]] +; CHECK-NEXT: [[TMP110:%.*]] = fmul fast <2 x float> [[STRIDED_VEC53]], [[STRIDED_VEC53]] +; CHECK-NEXT: [[TMP111:%.*]] = fmul fast <2 x float> [[STRIDED_VEC60]], [[STRIDED_VEC60]] ; CHECK-NEXT: [[TMP116:%.*]] = fadd fast <2 x float> [[TMP108]], [[TMP100]] ; CHECK-NEXT: [[TMP117:%.*]] = fadd fast <2 x float> [[TMP109]], [[TMP101]] ; CHECK-NEXT: [[TMP118:%.*]] = fadd fast <2 x float> [[TMP110]], [[TMP102]] ; CHECK-NEXT: [[TMP119:%.*]] = fadd fast <2 x float> [[TMP111]], [[TMP103]] +; CHECK-NEXT: [[TMP41:%.*]] = fpext <2 x float> [[TMP116]] to <2 x double> +; CHECK-NEXT: [[TMP42:%.*]] = fpext <2 x float> [[TMP117]] to <2 x double> +; CHECK-NEXT: [[TMP43:%.*]] = fpext <2 x float> [[TMP118]] to <2 x double> +; CHECK-NEXT: [[TMP44:%.*]] = fpext <2 x float> [[TMP119]] to <2 x double> +; CHECK-NEXT: [[TMP45:%.*]] = fadd fast <2 x 
double> [[TMP41]], [[TMP25]] +; CHECK-NEXT: [[TMP46:%.*]] = fadd fast <2 x double> [[TMP42]], [[TMP26]] +; CHECK-NEXT: [[TMP47:%.*]] = fadd fast <2 x double> [[TMP43]], [[TMP27]] +; CHECK-NEXT: [[TMP48:%.*]] = fadd fast <2 x double> [[TMP44]], [[TMP28]] +; CHECK-NEXT: [[TMP104:%.*]] = fmul fast <2 x float> [[STRIDED_VEC40]], [[STRIDED_VEC40]] +; CHECK-NEXT: [[TMP142:%.*]] = fmul fast <2 x float> [[STRIDED_VEC47]], [[STRIDED_VEC47]] +; CHECK-NEXT: [[TMP147:%.*]] = fmul fast <2 x float> [[STRIDED_VEC54]], [[STRIDED_VEC54]] +; CHECK-NEXT: [[TMP152:%.*]] = fmul fast <2 x float> [[STRIDED_VEC61]], [[STRIDED_VEC61]] +; CHECK-NEXT: [[TMP112:%.*]] = fmul fast <2 x float> [[STRIDED_VEC41]], [[STRIDED_VEC41]] +; CHECK-NEXT: [[TMP143:%.*]] = fmul fast <2 x float> [[STRIDED_VEC48]], [[STRIDED_VEC48]] +; CHECK-NEXT: [[TMP148:%.*]] = fmul fast <2 x float> [[STRIDED_VEC55]], [[STRIDED_VEC55]] +; CHECK-NEXT: [[TMP153:%.*]] = fmul fast <2 x float> [[STRIDED_VEC62]], [[STRIDED_VEC62]] ; CHECK-NEXT: [[TMP120:%.*]] = fadd fast <2 x float> [[TMP112]], [[TMP104]] -; CHECK-NEXT: [[TMP121:%.*]] = fpext <2 x float> [[TMP113]] to <2 x double> -; CHECK-NEXT: [[TMP122:%.*]] = fpext <2 x float> [[TMP114]] to <2 x double> -; CHECK-NEXT: [[TMP123:%.*]] = fpext <2 x float> [[TMP115]] to <2 x double> -; CHECK-NEXT: [[TMP124:%.*]] = fpext <2 x float> [[TMP116]] to <2 x double> -; CHECK-NEXT: [[TMP125:%.*]] = fpext <2 x float> [[TMP117]] to <2 x double> -; CHECK-NEXT: [[TMP126:%.*]] = fpext <2 x float> [[TMP118]] to <2 x double> -; CHECK-NEXT: [[TMP127:%.*]] = fpext <2 x float> [[TMP119]] to <2 x double> -; CHECK-NEXT: [[TMP128:%.*]] = fpext <2 x float> [[TMP120]] to <2 x double> -; CHECK-NEXT: [[TMP129]] = fadd fast <2 x double> [[TMP121]], [[TMP89]] -; CHECK-NEXT: [[TMP130]] = fadd fast <2 x double> [[TMP122]], [[TMP90]] -; CHECK-NEXT: [[TMP131]] = fadd fast <2 x double> [[TMP123]], [[TMP91]] -; CHECK-NEXT: [[TMP132]] = fadd fast <2 x double> [[TMP124]], [[TMP92]] -; CHECK-NEXT: [[TMP133]] = fadd 
fast <2 x double> [[TMP125]], [[TMP93]] -; CHECK-NEXT: [[TMP134]] = fadd fast <2 x double> [[TMP126]], [[TMP94]] -; CHECK-NEXT: [[TMP135]] = fadd fast <2 x double> [[TMP127]], [[TMP95]] -; CHECK-NEXT: [[TMP136]] = fadd fast <2 x double> [[TMP128]], [[TMP96]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP137:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP137]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP130]], [[TMP129]] -; CHECK-NEXT: [[BIN_RDX63:%.*]] = fadd fast <2 x double> [[TMP131]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX64:%.*]] = fadd fast <2 x double> [[TMP132]], [[BIN_RDX63]] -; CHECK-NEXT: [[BIN_RDX65:%.*]] = fadd fast <2 x double> [[TMP133]], [[BIN_RDX64]] -; CHECK-NEXT: [[BIN_RDX66:%.*]] = fadd fast <2 x double> [[TMP134]], [[BIN_RDX65]] -; CHECK-NEXT: [[BIN_RDX67:%.*]] = fadd fast <2 x double> [[TMP135]], [[BIN_RDX66]] -; CHECK-NEXT: [[BIN_RDX68:%.*]] = fadd fast <2 x double> [[TMP136]], [[BIN_RDX67]] -; CHECK-NEXT: [[TMP138:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[BIN_RDX68]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_FOR_END13_CRIT_EDGE:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] -; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] -; CHECK: [[VEC_EPILOG_PH]]: -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP138]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0.000000e+00, 
%[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_MOD_VF69:%.*]] = urem i64 [[TMP0]], 2 -; CHECK-NEXT: [[N_VEC70:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF69]] -; CHECK-NEXT: [[TMP139:%.*]] = insertelement <2 x double> zeroinitializer, double [[BC_MERGE_RDX]], i32 0 -; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] -; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: -; CHECK-NEXT: [[TMP140:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT80:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI72:%.*]] = phi <2 x double> [ [[TMP139]], %[[VEC_EPILOG_PH]] ], [ [[TMP156:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP141:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP140]], i64 0, i32 0 -; CHECK-NEXT: [[WIDE_VEC73:%.*]] = load <12 x float>, ptr [[TMP141]], align 8 -; CHECK-NEXT: [[STRIDED_VEC74:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC75:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC76:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC77:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC78:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC79:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP142:%.*]] = fmul fast <2 x float> [[STRIDED_VEC74]], [[STRIDED_VEC74]] -; CHECK-NEXT: [[TMP143:%.*]] = fmul fast <2 x float> [[STRIDED_VEC75]], [[STRIDED_VEC75]] ; CHECK-NEXT: [[TMP144:%.*]] = fadd fast <2 x float> [[TMP143]], [[TMP142]] -; CHECK-NEXT: [[TMP145:%.*]] = fpext <2 x float> [[TMP144]] to <2 x double> -; CHECK-NEXT: [[TMP146:%.*]] = fadd fast <2 x double> [[TMP145]], [[VEC_PHI72]] -; CHECK-NEXT: [[TMP147:%.*]] = fmul fast <2 x float> 
[[STRIDED_VEC76]], [[STRIDED_VEC76]] -; CHECK-NEXT: [[TMP148:%.*]] = fmul fast <2 x float> [[STRIDED_VEC77]], [[STRIDED_VEC77]] ; CHECK-NEXT: [[TMP149:%.*]] = fadd fast <2 x float> [[TMP148]], [[TMP147]] -; CHECK-NEXT: [[TMP150:%.*]] = fpext <2 x float> [[TMP149]] to <2 x double> -; CHECK-NEXT: [[TMP151:%.*]] = fadd fast <2 x double> [[TMP150]], [[TMP146]] -; CHECK-NEXT: [[TMP152:%.*]] = fmul fast <2 x float> [[STRIDED_VEC78]], [[STRIDED_VEC78]] -; CHECK-NEXT: [[TMP153:%.*]] = fmul fast <2 x float> [[STRIDED_VEC79]], [[STRIDED_VEC79]] ; CHECK-NEXT: [[TMP154:%.*]] = fadd fast <2 x float> [[TMP153]], [[TMP152]] +; CHECK-NEXT: [[TMP61:%.*]] = fpext <2 x float> [[TMP120]] to <2 x double> +; CHECK-NEXT: [[TMP62:%.*]] = fpext <2 x float> [[TMP144]] to <2 x double> +; CHECK-NEXT: [[TMP63:%.*]] = fpext <2 x float> [[TMP149]] to <2 x double> ; CHECK-NEXT: [[TMP155:%.*]] = fpext <2 x float> [[TMP154]] to <2 x double> -; CHECK-NEXT: [[TMP156]] = fadd fast <2 x double> [[TMP155]], [[TMP151]] -; CHECK-NEXT: [[INDEX_NEXT80]] = add nuw i64 [[TMP140]], 2 -; CHECK-NEXT: [[TMP157:%.*]] = icmp eq i64 [[INDEX_NEXT80]], [[N_VEC70]] -; CHECK-NEXT: br i1 [[TMP157]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP69]] = fadd fast <2 x double> [[TMP61]], [[TMP45]] +; CHECK-NEXT: [[TMP65]] = fadd fast <2 x double> [[TMP62]], [[TMP46]] +; CHECK-NEXT: [[TMP66]] = fadd fast <2 x double> [[TMP63]], [[TMP47]] +; CHECK-NEXT: [[TMP67]] = fadd fast <2 x double> [[TMP155]], [[TMP48]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP68:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP68]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP65]], [[TMP69]] +; CHECK-NEXT: [[BIN_RDX30:%.*]] = fadd fast <2 x double> 
[[TMP66]], [[BIN_RDX]] +; CHECK-NEXT: [[TMP156:%.*]] = fadd fast <2 x double> [[TMP67]], [[BIN_RDX30]] ; CHECK-NEXT: [[TMP158:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP156]]) -; CHECK-NEXT: [[CMP_N81:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC70]] -; CHECK-NEXT: br i1 [[CMP_N81]], label %[[FOR_COND_FOR_END13_CRIT_EDGE]], label %[[VEC_EPILOG_SCALAR_PH]] -; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC70]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX82:%.*]] = phi double [ [[TMP158]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP138]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0.000000e+00, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_FOR_END13_CRIT_EDGE:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_COND1_PREHEADER_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP158]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_COND1_PREHEADER_PREHEADER]] ] ; CHECK-NEXT: br label %[[FOR_COND1_PREHEADER:.*]] ; CHECK: [[FOR_COND1_PREHEADER]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[SUM_026:%.*]] = phi double [ [[ADD10_2:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[BC_MERGE_RDX82]], %[[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[SUM_026:%.*]] = phi double [ [[ADD10_2:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX5_REALP:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 0, i32 0 ; 
CHECK-NEXT: [[ARRAYIDX5_REAL:%.*]] = load float, ptr [[ARRAYIDX5_REALP]], align 8 ; CHECK-NEXT: [[ARRAYIDX5_IMAGP:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 0, i32 1 @@ -318,9 +167,9 @@ define void @QLA_F3_r_veq_norm2_V(ptr noalias %r, ptr noalias %a, i32 %n) { ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_COND_FOR_END13_CRIT_EDGE]], label %[[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_COND_FOR_END13_CRIT_EDGE]], label %[[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[FOR_COND_FOR_END13_CRIT_EDGE]]: -; CHECK-NEXT: [[ADD10_2_LCSSA:%.*]] = phi double [ [[ADD10_2]], %[[FOR_COND1_PREHEADER]] ], [ [[TMP138]], %[[MIDDLE_BLOCK]] ], [ [[TMP158]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD10_2_LCSSA:%.*]] = phi double [ [[ADD10_2]], %[[FOR_COND1_PREHEADER]] ], [ [[TMP158]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[PHITMP:%.*]] = fptrunc double [[ADD10_2_LCSSA]] to float ; CHECK-NEXT: br label %[[FOR_END13]] ; CHECK: [[FOR_END13]]: @@ -385,6 +234,5 @@ for.end13: ; preds = %for.cond.for.end13_ ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll index db4b580a39677..280b3af04a4db 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll @@ -132,7 +132,7 @@ define float @float_(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 % ;CHECK-LABEL: float_ ;CHECK: LV(REG): VF = 1 ;CHECK: LV(REG): Found max usage: 2 item -;CHECK-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers +;CHECK-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 3 registers ;CHECK-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 3 registers ;CHECK: LV(REG): Found invariant usage: 1 item ;CHECK-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 1 registers @@ -175,14 +175,14 @@ define void @double_(ptr nocapture %A, i32 %n) nounwind uwtable ssp { ;CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers ;CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 5 registers ;CHECK-PWR8: LV(REG): Found invariant usage: 1 item -;CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 1 registers +;CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers ;CHECK-PWR9: LV(REG): VF = 1 ;CHECK-PWR9: LV(REG): Found max usage: 2 item -;CHECK-PWR9-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 3 registers +;CHECK-PWR9-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers ;CHECK-PWR9-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 5 registers ;CHECK-PWR9: LV(REG): Found invariant usage: 1 item -;CHECK-PWR9-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 1 registers +;CHECK-PWR9-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers %1 = sext i32 %n to i64 br label %2 @@ -248,8 +248,12 @@ define void @fp16_(ptr nocapture readonly %pIn, ptr nocapture %pOut, i32 %numRow ;CHECK-LABEL: fp16_ ;CHECK: LV(REG): VF = 1 ;CHECK: LV(REG): Found max usage: 2 item -;CHECK: LV(REG): RegisterClass: PPC::GPRRC, 5 registers +;CHECK: LV(REG): RegisterClass: PPC::GPRRC, 3 registers ;CHECK: LV(REG): 
RegisterClass: PPC::VSXRC, 2 registers +;CHECK: LV(REG): Found invariant usage: 2 item +;CHECK: LV(REG): RegisterClass: PPC::GPRRC, 1 registers +;CHECK: LV(REG): RegisterClass: PPC::VSXRC, 1 registers + entry: %tmp.0.extract.trunc = trunc i32 %scale.coerce to i16 %0 = bitcast i16 %tmp.0.extract.trunc to half diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll index 40d6e8bc33471..4e3077cfcab67 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll @@ -4,7 +4,7 @@ define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i32 signext %size, ptr noalias nocapture writeonly %result) { ; CHECK-LABEL: add ; CHECK: LV(REG): Found max usage: 2 item -; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers +; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers ; CHECK-NEXT: LV(REG): Found invariant usage: 1 item ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll index e07c7b6b40729..8825065aa5fe8 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll @@ -5,12 +5,12 @@ define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i32 signext %size, ptr noalias nocapture writeonly %result) { ; CHECK-LABEL: add ; ZVFH: LV(REG): Found max usage: 2 item -; ZVFH-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers +; ZVFH-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers ; ZVFH-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers ; ZVFH-NEXT: LV(REG): Found invariant usage: 1 item ; ZVFH-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers ; 
ZVFHMIN: LV(REG): Found max usage: 2 item -; ZVFHMIN-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers +; ZVFHMIN-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers ; ZVFHMIN-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers ; ZVFHMIN-NEXT: LV(REG): Found invariant usage: 1 item ; ZVFHMIN-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll index a2f55e49c9e0e..9585d0d6d6cfd 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll @@ -23,27 +23,27 @@ define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i32 signext %size, ptr noalias nocapture writeonly %result) { ; CHECK-LABEL: add ; CHECK-SCALAR: LV(REG): Found max usage: 2 item -; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers +; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: RISCV::FPRRC, 2 registers ; CHECK-SCALAR-NEXT: LV(REG): Found invariant usage: 1 item ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers ; CHECK-LMUL1: LV(REG): Found max usage: 2 item -; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers +; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers ; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers ; CHECK-LMUL1-NEXT: LV(REG): Found invariant usage: 1 item ; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers ; CHECK-LMUL2: LV(REG): Found max usage: 2 item -; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers +; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers ; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 4 registers ; CHECK-LMUL2-NEXT: LV(REG): Found invariant usage: 1 item ; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 
1 registers ; CHECK-LMUL4: LV(REG): Found max usage: 2 item -; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers +; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers ; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 8 registers ; CHECK-LMUL4-NEXT: LV(REG): Found invariant usage: 1 item ; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers ; CHECK-LMUL8: LV(REG): Found max usage: 2 item -; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers +; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers ; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 16 registers ; CHECK-LMUL8-NEXT: LV(REG): Found invariant usage: 1 item ; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index 5e4bd284e1fa8..3683cfaa578f9 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -126,12 +126,15 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV(REG): At #2 Interval # 2 ; CHECK-NEXT: LV(REG): At #3 Interval # 2 ; CHECK-NEXT: LV(REG): At #4 Interval # 2 -; CHECK-NEXT: LV(REG): At #5 Interval # 3 +; CHECK-NEXT: LV(REG): At #5 Interval # 2 ; CHECK-NEXT: LV(REG): At #6 Interval # 3 ; CHECK-NEXT: LV(REG): At #7 Interval # 3 ; CHECK-NEXT: LV(REG): At #8 Interval # 3 -; CHECK-NEXT: LV(REG): At #9 Interval # 2 +; CHECK-NEXT: LV(REG): At #9 Interval # 3 ; CHECK-NEXT: LV(REG): At #10 Interval # 3 +; CHECK-NEXT: LV(REG): At #11 Interval # 3 +; CHECK-NEXT: LV(REG): At #12 Interval # 2 +; CHECK-NEXT: LV(REG): At #13 Interval # 2 ; CHECK-NEXT: LV(REG): VF = vscale x 4 ; CHECK-NEXT: LV(REG): Found max usage: 2 item ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers @@ -374,12 +377,15 @@ define 
void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV(REG): At #2 Interval # 2 ; CHECK-NEXT: LV(REG): At #3 Interval # 2 ; CHECK-NEXT: LV(REG): At #4 Interval # 2 -; CHECK-NEXT: LV(REG): At #5 Interval # 3 +; CHECK-NEXT: LV(REG): At #5 Interval # 2 ; CHECK-NEXT: LV(REG): At #6 Interval # 3 ; CHECK-NEXT: LV(REG): At #7 Interval # 3 ; CHECK-NEXT: LV(REG): At #8 Interval # 3 -; CHECK-NEXT: LV(REG): At #9 Interval # 2 +; CHECK-NEXT: LV(REG): At #9 Interval # 3 ; CHECK-NEXT: LV(REG): At #10 Interval # 3 +; CHECK-NEXT: LV(REG): At #11 Interval # 3 +; CHECK-NEXT: LV(REG): At #12 Interval # 2 +; CHECK-NEXT: LV(REG): At #13 Interval # 2 ; CHECK-NEXT: LV(REG): VF = vscale x 4 ; CHECK-NEXT: LV(REG): Found max usage: 2 item ; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers diff --git a/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll b/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll index ce0fc350246e4..3445d4ceff5ec 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll @@ -8,8 +8,8 @@ target triple = "x86_64" ; CHECK-LABEL: LV: Checking a loop in 'or_reduction_avx' from ; CHECK: LV(REG): VF = 64 ; CHECK-NEXT: LV(REG): Found max usage: 2 item +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers ; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers -; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers define i1 @or_reduction_avx(i32 %arg, ptr %ptr) "target-features"="+avx" { entry: diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll index ceafb54e1d539..c4291507e8d97 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll @@ -171,142 +171,65 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon ; AVX1-NEXT: entry: ; AVX1-NEXT: 
[[CMP30:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; AVX1-NEXT: br i1 [[CMP30]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] -; AVX1: iter.check: +; AVX1: for.body.preheader: ; AVX1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 -; AVX1-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 -; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] -; AVX1: vector.main.loop.iter.check: -; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8 ; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX1: vector.ph: -; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8 ; AVX1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX1: vector.body: ; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 -; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 -; AVX1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12 -; AVX1-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[INDEX]], 1 -; AVX1-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP1]], 1 -; AVX1-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP2]], 1 -; AVX1-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP3]], 1 -; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[S1:%.*]], i64 [[TMP4]] -; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP5]] -; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP6]] -; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP7]] -; AVX1-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP8]], align 2 -; AVX1-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: 
[[STRIDED_VEC7:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i16>, ptr [[TMP9]], align 2 -; AVX1-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP10]], align 2 +; AVX1-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[INDEX]], 1 +; AVX1-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1 +; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[S1:%.*]], i64 [[TMP7]] +; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP2]] +; AVX1-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP11]], align 2 ; AVX1-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[WIDE_VEC3:%.*]] = load <8 x i16>, ptr [[TMP11]], align 2 +; AVX1-NEXT: [[WIDE_VEC3:%.*]] = load <8 x i16>, ptr [[TMP4]], align 2 ; AVX1-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[TMP16:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32> -; AVX1-NEXT: [[TMP17:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32> -; AVX1-NEXT: [[TMP18:%.*]] = sext <4 x i16> [[STRIDED_VEC5]] to <4 x i32> -; AVX1-NEXT: [[TMP19:%.*]] = sext <4 x i16> [[STRIDED_VEC6]] to <4 x i32> -; AVX1-NEXT: [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP4]] -; AVX1-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP5]] -; AVX1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP6]] -; AVX1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP7]] -; 
AVX1-NEXT: [[WIDE_VEC11:%.*]] = load <8 x i16>, ptr [[TMP20]], align 2 -; AVX1-NEXT: [[STRIDED_VEC15:%.*]] = shufflevector <8 x i16> [[WIDE_VEC11]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC19:%.*]] = shufflevector <8 x i16> [[WIDE_VEC11]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[WIDE_VEC12:%.*]] = load <8 x i16>, ptr [[TMP21]], align 2 -; AVX1-NEXT: [[STRIDED_VEC16:%.*]] = shufflevector <8 x i16> [[WIDE_VEC12]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC20:%.*]] = shufflevector <8 x i16> [[WIDE_VEC12]], <8 x i16> poison, <4 x i32> +; AVX1-NEXT: [[TMP36:%.*]] = sext <4 x i16> [[STRIDED_VEC5]] to <4 x i32> +; AVX1-NEXT: [[TMP37:%.*]] = sext <4 x i16> [[STRIDED_VEC6]] to <4 x i32> +; AVX1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP7]] +; AVX1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP2]] ; AVX1-NEXT: [[WIDE_VEC13:%.*]] = load <8 x i16>, ptr [[TMP22]], align 2 ; AVX1-NEXT: [[STRIDED_VEC17:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC21:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[WIDE_VEC14:%.*]] = load <8 x i16>, ptr [[TMP23]], align 2 ; AVX1-NEXT: [[STRIDED_VEC18:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC22:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[TMP28:%.*]] = sext <4 x i16> [[STRIDED_VEC15]] to <4 x i32> -; AVX1-NEXT: [[TMP29:%.*]] = sext <4 x i16> [[STRIDED_VEC16]] to <4 x i32> -; AVX1-NEXT: [[TMP30:%.*]] = sext <4 x i16> [[STRIDED_VEC17]] to <4 x i32> -; AVX1-NEXT: [[TMP31:%.*]] = sext <4 x i16> [[STRIDED_VEC18]] to <4 x i32> -; AVX1-NEXT: [[TMP32:%.*]] = mul nsw <4 x i32> [[TMP28]], [[TMP16]] -; AVX1-NEXT: [[TMP33:%.*]] = mul nsw <4 x i32> [[TMP29]], [[TMP17]] -; AVX1-NEXT: [[TMP34:%.*]] = mul nsw <4 x i32> [[TMP30]], [[TMP18]] -; AVX1-NEXT: [[TMP35:%.*]] = mul 
nsw <4 x i32> [[TMP31]], [[TMP19]] -; AVX1-NEXT: [[TMP36:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32> -; AVX1-NEXT: [[TMP37:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32> +; AVX1-NEXT: [[TMP40:%.*]] = sext <4 x i16> [[STRIDED_VEC17]] to <4 x i32> +; AVX1-NEXT: [[TMP41:%.*]] = sext <4 x i16> [[STRIDED_VEC18]] to <4 x i32> +; AVX1-NEXT: [[TMP44:%.*]] = mul nsw <4 x i32> [[TMP40]], [[TMP36]] +; AVX1-NEXT: [[TMP45:%.*]] = mul nsw <4 x i32> [[TMP41]], [[TMP37]] ; AVX1-NEXT: [[TMP38:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32> ; AVX1-NEXT: [[TMP39:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32> -; AVX1-NEXT: [[TMP40:%.*]] = sext <4 x i16> [[STRIDED_VEC19]] to <4 x i32> -; AVX1-NEXT: [[TMP41:%.*]] = sext <4 x i16> [[STRIDED_VEC20]] to <4 x i32> ; AVX1-NEXT: [[TMP42:%.*]] = sext <4 x i16> [[STRIDED_VEC21]] to <4 x i32> ; AVX1-NEXT: [[TMP43:%.*]] = sext <4 x i16> [[STRIDED_VEC22]] to <4 x i32> -; AVX1-NEXT: [[TMP44:%.*]] = mul nsw <4 x i32> [[TMP40]], [[TMP36]] -; AVX1-NEXT: [[TMP45:%.*]] = mul nsw <4 x i32> [[TMP41]], [[TMP37]] ; AVX1-NEXT: [[TMP46:%.*]] = mul nsw <4 x i32> [[TMP42]], [[TMP38]] ; AVX1-NEXT: [[TMP47:%.*]] = mul nsw <4 x i32> [[TMP43]], [[TMP39]] -; AVX1-NEXT: [[TMP48:%.*]] = add nsw <4 x i32> [[TMP44]], [[TMP32]] -; AVX1-NEXT: [[TMP49:%.*]] = add nsw <4 x i32> [[TMP45]], [[TMP33]] -; AVX1-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[TMP46]], [[TMP34]] -; AVX1-NEXT: [[TMP51:%.*]] = add nsw <4 x i32> [[TMP47]], [[TMP35]] +; AVX1-NEXT: [[TMP48:%.*]] = add nsw <4 x i32> [[TMP46]], [[TMP44]] +; AVX1-NEXT: [[TMP49:%.*]] = add nsw <4 x i32> [[TMP47]], [[TMP45]] ; AVX1-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[INDEX]] ; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 0 ; AVX1-NEXT: [[TMP57:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 4 -; AVX1-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 8 -; AVX1-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr 
[[TMP52]], i32 12 ; AVX1-NEXT: store <4 x i32> [[TMP48]], ptr [[TMP56]], align 4 ; AVX1-NEXT: store <4 x i32> [[TMP49]], ptr [[TMP57]], align 4 -; AVX1-NEXT: store <4 x i32> [[TMP50]], ptr [[TMP58]], align 4 -; AVX1-NEXT: store <4 x i32> [[TMP51]], ptr [[TMP59]], align 4 -; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; AVX1-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] -; AVX1: vec.epilog.iter.check: -; AVX1-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 -; AVX1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]] -; AVX1: vec.epilog.ph: -; AVX1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; AVX1-NEXT: [[N_MOD_VF24:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 -; AVX1-NEXT: [[N_VEC25:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF24]] -; AVX1-NEXT: br label [[FOR_BODY:%.*]] -; AVX1: vec.epilog.vector.body: -; AVX1-NEXT: [[INDEX26:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT33:%.*]], [[FOR_BODY]] ] -; AVX1-NEXT: [[TMP68:%.*]] = shl nuw nsw i64 [[INDEX26]], 1 -; AVX1-NEXT: [[TMP69:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP68]] -; AVX1-NEXT: [[WIDE_VEC27:%.*]] = load <8 x i16>, ptr [[TMP69]], align 2 -; AVX1-NEXT: [[STRIDED_VEC28:%.*]] = shufflevector <8 x i16> [[WIDE_VEC27]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC29:%.*]] = shufflevector <8 x i16> [[WIDE_VEC27]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: 
[[TMP53:%.*]] = sext <4 x i16> [[STRIDED_VEC28]] to <4 x i32> -; AVX1-NEXT: [[TMP54:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP68]] -; AVX1-NEXT: [[WIDE_VEC30:%.*]] = load <8 x i16>, ptr [[TMP54]], align 2 -; AVX1-NEXT: [[STRIDED_VEC31:%.*]] = shufflevector <8 x i16> [[WIDE_VEC30]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[STRIDED_VEC32:%.*]] = shufflevector <8 x i16> [[WIDE_VEC30]], <8 x i16> poison, <4 x i32> -; AVX1-NEXT: [[TMP55:%.*]] = sext <4 x i16> [[STRIDED_VEC31]] to <4 x i32> -; AVX1-NEXT: [[TMP70:%.*]] = mul nsw <4 x i32> [[TMP55]], [[TMP53]] -; AVX1-NEXT: [[TMP71:%.*]] = sext <4 x i16> [[STRIDED_VEC29]] to <4 x i32> -; AVX1-NEXT: [[TMP72:%.*]] = sext <4 x i16> [[STRIDED_VEC32]] to <4 x i32> -; AVX1-NEXT: [[TMP73:%.*]] = mul nsw <4 x i32> [[TMP72]], [[TMP71]] -; AVX1-NEXT: [[TMP74:%.*]] = add nsw <4 x i32> [[TMP73]], [[TMP70]] -; AVX1-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[INDEX26]] -; AVX1-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, ptr [[TMP75]], i32 0 -; AVX1-NEXT: store <4 x i32> [[TMP74]], ptr [[TMP76]], align 4 -; AVX1-NEXT: [[INDEX_NEXT33]] = add nuw i64 [[INDEX26]], 4 -; AVX1-NEXT: [[TMP77:%.*]] = icmp eq i64 [[INDEX_NEXT33]], [[N_VEC25]] -; AVX1-NEXT: br i1 [[TMP77]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; AVX1: vec.epilog.middle.block: -; AVX1-NEXT: [[CMP_N34:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC25]] -; AVX1-NEXT: br i1 [[CMP_N34]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] -; AVX1: vec.epilog.scalar.ph: -; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC25]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; AVX1: scalar.ph: +; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; AVX1-NEXT: br label [[FOR_BODY1:%.*]] ; 
AVX1: for.body: -; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ] +; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ] ; AVX1-NEXT: [[TMP61:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP61]] ; AVX1-NEXT: [[TMP62:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 @@ -328,7 +251,7 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon ; AVX1-NEXT: store i32 [[ADD18]], ptr [[ARRAYIDX20]], align 4 ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP4:![0-9]+]] +; AVX1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]] ; AVX1: for.end.loopexit: ; AVX1-NEXT: br label [[FOR_END]] ; AVX1: for.end: diff --git a/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll index f6191cc53c971..784b030bf3ab3 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll @@ -94,9 +94,11 @@ define i64 @bar(ptr nocapture %a) { ; CHECK-LABEL: bar ; CHECK: LV(REG): VF = 2 ; CHECK-NEXT: LV(REG): Found max usage: 2 item +; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers ; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 3 registers +; CHECK-NEXT: LV(REG): Found invariant usage: 1 item ; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers -; CHECK-NEXT: LV(REG): Found invariant usage: 0 item + entry: br label %for.body