diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index f5be6bbe4a2b6..c05e4b822d2c4 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1253,7 +1253,7 @@ class BoUpSLP { NonScheduledFirst.clear(); EntryToLastInstruction.clear(); ExternalUses.clear(); - ExternalUsesAsGEPs.clear(); + ExternalUsesAsOriginalScalar.clear(); for (auto &Iter : BlocksSchedules) { BlockScheduling *BS = Iter.second.get(); BS->clear(); @@ -3468,7 +3468,7 @@ class BoUpSLP { /// A list of GEPs which can be reaplced by scalar GEPs instead of /// extractelement instructions. - SmallPtrSet ExternalUsesAsGEPs; + SmallPtrSet ExternalUsesAsOriginalScalar; /// Values used only by @llvm.assume calls. SmallPtrSet EphValues; @@ -10663,6 +10663,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { SmallDenseSet UsedInserts; DenseSet> VectorCasts; std::optional> ValueToExtUses; + DenseMap> ExtractsCount; for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. if (!isa_and_nonnull(EU.User) && @@ -10771,52 +10772,90 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { } } } - // Leave the GEPs as is, they are free in most cases and better to keep them - // as GEPs. + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - if (auto *GEP = dyn_cast(EU.Scalar)) { + // If we plan to rewrite the tree in a smaller type, we will need to sign + // extend the extracted value back to the original type. Here, we account + // for the extract and the added cost of the sign extend if needed. + InstructionCost ExtraCost = TTI::TCC_Free; + auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth); + const TreeEntry *Entry = getTreeEntry(EU.Scalar); + auto It = MinBWs.find(Entry); + if (It != MinBWs.end()) { + auto *MinTy = IntegerType::get(F->getContext(), It->second.first); + unsigned Extend = + It->second.second ? Instruction::SExt : Instruction::ZExt; + VecTy = getWidenedType(MinTy, BundleWidth); + ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), + VecTy, EU.Lane); + } else { + ExtraCost = TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, + CostKind, EU.Lane); + } + // Leave the scalar instructions as is if they are cheaper than extracts. + if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr || + Entry->getOpcode() == Instruction::Load) { if (!ValueToExtUses) { ValueToExtUses.emplace(); for_each(enumerate(ExternalUses), [&](const auto &P) { + // Ignore phis in loops. + if (auto *Phi = dyn_cast_if_present(P.value().User)) { + auto *I = cast(P.value().Scalar); + const Loop *L = LI->getLoopFor(Phi->getParent()); + if (L && (Phi->getParent() == I->getParent() || + L == LI->getLoopFor(I->getParent()))) + return; + } + ValueToExtUses->try_emplace(P.value().Scalar, P.index()); }); } - // Can use original GEP, if no operands vectorized or they are marked as - // externally used already. - bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) { - if (!getTreeEntry(V)) - return true; - auto It = ValueToExtUses->find(V); - if (It != ValueToExtUses->end()) { - // Replace all uses to avoid compiler crash. - ExternalUses[It->second].User = nullptr; + // Can use original instruction, if no operands vectorized or they are + // marked as externally used already. + auto *Inst = cast(EU.Scalar); + bool CanBeUsedAsScalar = all_of(Inst->operands(), [&](Value *V) { + if (!getTreeEntry(V)) { + // Some extractelements might be not vectorized, but + // transformed into shuffle and removed from the function, + // consider it here. + if (auto *EE = dyn_cast(V)) + return !EE->hasOneUse() || !MustGather.contains(EE); return true; } - return false; + return ValueToExtUses->contains(V); }); - if (CanBeUsedAsGEP) { - ExtractCost += TTI->getInstructionCost(GEP, CostKind); - ExternalUsesAsGEPs.insert(EU.Scalar); - continue; + if (CanBeUsedAsScalar) { + InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind); + bool KeepScalar = ScalarCost <= ExtraCost; + if (KeepScalar && ScalarCost != TTI::TCC_Free && + ExtraCost - ScalarCost <= TTI::TCC_Basic) { + unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) { + return ValueToExtUses->contains(V); + }); + auto It = ExtractsCount.find(Entry); + if (It != ExtractsCount.end()) + ScalarUsesCount -= It->getSecond().size(); + // Keep original scalar if number of externally used instructions in + // the same entry is not power of 2. It may help to do some extra + // vectorization for now. + KeepScalar = ScalarUsesCount <= 1 || !isPowerOf2_32(ScalarUsesCount); + } + if (KeepScalar) { + ExternalUsesAsOriginalScalar.insert(EU.Scalar); + for_each(Inst->operands(), [&](Value *V) { + auto It = ValueToExtUses->find(V); + if (It != ValueToExtUses->end()) { + // Replace all uses to avoid compiler crash. + ExternalUses[It->second].User = nullptr; + } + }); + ExtraCost = ScalarCost; + ExtractsCount[Entry].insert(Inst); + } } } - // If we plan to rewrite the tree in a smaller type, we will need to sign - // extend the extracted value back to the original type. Here, we account - // for the extract and the added cost of the sign extend if needed. - auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth); - auto It = MinBWs.find(getTreeEntry(EU.Scalar)); - if (It != MinBWs.end()) { - auto *MinTy = IntegerType::get(F->getContext(), It->second.first); - unsigned Extend = - It->second.second ? Instruction::SExt : Instruction::ZExt; - VecTy = getWidenedType(MinTy, BundleWidth); - ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), - VecTy, EU.Lane); - } else { - ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, - CostKind, EU.Lane); - } + ExtractCost += ExtraCost; } // Add reduced value cost, if resized. if (!VectorizedVals.empty()) { @@ -14067,8 +14106,7 @@ Value *BoUpSLP::vectorizeTree( DenseMap VectorToInsertElement; // Maps extract Scalar to the corresponding extractelement instruction in the // basic block. Only one extractelement per block should be emitted. - DenseMap>> + DenseMap>> ScalarToEEs; SmallDenseSet UsedInserts; DenseMap, Value *> VectorCasts; @@ -14098,30 +14136,41 @@ Value *BoUpSLP::vectorizeTree( if (Scalar->getType() != Vec->getType()) { Value *Ex = nullptr; Value *ExV = nullptr; - auto *GEP = dyn_cast(Scalar); - bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP); + auto *Inst = dyn_cast(Scalar); + bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst); auto It = ScalarToEEs.find(Scalar); if (It != ScalarToEEs.end()) { // No need to emit many extracts, just move the only one in the // current block. - auto EEIt = It->second.find(Builder.GetInsertBlock()); + auto EEIt = It->second.find(ReplaceInst ? Inst->getParent() + : Builder.GetInsertBlock()); if (EEIt != It->second.end()) { - Instruction *I = EEIt->second.first; - if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() && + Value *PrevV = EEIt->second.first; + if (auto *I = dyn_cast(PrevV); + I && !ReplaceInst && + Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() && Builder.GetInsertPoint()->comesBefore(I)) { I->moveBefore(*Builder.GetInsertPoint()->getParent(), Builder.GetInsertPoint()); - if (auto *CI = EEIt->second.second) + if (auto *CI = dyn_cast(EEIt->second.second)) CI->moveAfter(I); } - Ex = I; + Ex = PrevV; ExV = EEIt->second.second ? EEIt->second.second : Ex; } } if (!Ex) { // "Reuse" the existing extract to improve final codegen. - if (auto *ES = dyn_cast(Scalar); - ES && isa(Vec)) { + if (ReplaceInst) { + // Leave the instruction as is, if it cheaper extracts and all + // operands are scalar. + auto *CloneInst = Inst->clone(); + CloneInst->insertBefore(Inst); + if (Inst->hasName()) + CloneInst->takeName(Inst); + Ex = CloneInst; + } else if (auto *ES = dyn_cast(Scalar); + ES && isa(Vec)) { Value *V = ES->getVectorOperand(); auto *IVec = cast(Vec); if (const TreeEntry *ETE = getTreeEntry(V)) @@ -14132,18 +14181,6 @@ Value *BoUpSLP::vectorizeTree( Ex = Builder.CreateExtractElement(V, ES->getIndexOperand()); else Ex = Builder.CreateExtractElement(Vec, Lane); - } else if (ReplaceGEP) { - // Leave the GEPs as is, they are free in most cases and better to - // keep them as GEPs. - auto *CloneGEP = GEP->clone(); - if (isa(Vec)) - CloneGEP->insertBefore(*Builder.GetInsertBlock(), - Builder.GetInsertPoint()); - else - CloneGEP->insertBefore(GEP); - if (GEP->hasName()) - CloneGEP->takeName(GEP); - Ex = CloneGEP; } else if (auto *VecTy = dyn_cast(Scalar->getType())) { assert(SLPReVec && "FixedVectorType is not expected."); @@ -14164,14 +14201,15 @@ Value *BoUpSLP::vectorizeTree( if (Scalar->getType() != Ex->getType()) ExV = Builder.CreateIntCast(Ex, Scalar->getType(), MinBWs.find(E)->second.second); - if (auto *I = dyn_cast(Ex)) - ScalarToEEs[Scalar].try_emplace( - Builder.GetInsertBlock(), - std::make_pair(I, cast(ExV))); + auto *I = dyn_cast(Ex); + ScalarToEEs[Scalar].try_emplace(I ? I->getParent() + : &F->getEntryBlock(), + std::make_pair(Ex, ExV)); } // The then branch of the previous if may produce constants, since 0 // operand might be a constant. - if (auto *ExI = dyn_cast(Ex)) { + if (auto *ExI = dyn_cast(Ex); + ExI && !isa(ExI) && !mayHaveNonDefUseDependency(*ExI)) { GatherShuffleExtractSeq.insert(ExI); CSEBlocks.insert(ExI->getParent()); } @@ -14192,9 +14230,10 @@ Value *BoUpSLP::vectorizeTree( continue; assert((ExternallyUsedValues.count(Scalar) || Scalar->hasNUsesOrMore(UsesLimit) || + ExternalUsesAsOriginalScalar.contains(Scalar) || any_of(Scalar->users(), [&](llvm::User *U) { - if (ExternalUsesAsGEPs.contains(U)) + if (ExternalUsesAsOriginalScalar.contains(U)) return true; TreeEntry *UseEntry = getTreeEntry(U); return UseEntry && diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/external-non-inst-use.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/external-non-inst-use.ll index d4e3fb3e24853..0d6eb7b5e08aa 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/external-non-inst-use.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/external-non-inst-use.ll @@ -4,8 +4,9 @@ define i16 @foo(ptr %p1, ptr %p2) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: store i32 0, ptr [[P1:%.*]], align 1 -; CHECK-NEXT: [[CONST_MAT:%.*]] = or i32 0, 0 +; CHECK-NEXT: [[CONST:%.*]] = bitcast i32 0 to i32 +; CHECK-NEXT: store i32 [[CONST]], ptr [[P1:%.*]], align 1 +; CHECK-NEXT: [[CONST_MAT:%.*]] = or i32 [[CONST]], 0 ; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[P2:%.*]], align 1 ; CHECK-NEXT: ret i16 0 ; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll index 70cdd08548b2d..8f6d5d8f2d7ec 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll @@ -1236,20 +1236,20 @@ define void @crash_no_tracked_instructions(ptr %arg, ptr %arg.2, ptr %arg.3, i1 ; CHECK: bb22: ; CHECK-NEXT: [[T23:%.*]] = fmul float [[T20]], 9.900000e+01 ; CHECK-NEXT: [[T25:%.*]] = getelementptr inbounds float, ptr [[T19]], i64 2 +; CHECK-NEXT: [[T26:%.*]] = fmul float [[T23]], 1.000000e+01 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[T23]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 -; CHECK-NEXT: store float [[TMP4]], ptr [[T25]], align 4 +; CHECK-NEXT: store float [[T26]], ptr [[T25]], align 4 ; CHECK-NEXT: [[T27:%.*]] = load float, ptr [[ARG_2:%.*]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP3]], +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP3]], ; CHECK-NEXT: br label [[BB30]] ; CHECK: bb30: -; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x float> [ [[TMP5]], [[BB22]] ], [ [[TMP0]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x float> [ [[TMP4]], [[BB22]] ], [ [[TMP0]], [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[BB36:%.*]] ; CHECK: bb36: -; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], -; CHECK-NEXT: store <2 x float> [[TMP7]], ptr [[ARG_3]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x float> [[TMP5]], +; CHECK-NEXT: store <2 x float> [[TMP6]], ptr [[ARG_3]], align 4 ; CHECK-NEXT: br label [[BB41:%.*]] ; CHECK: bb41: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll index f85f658fed4d5..d89d628670360 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll @@ -29,151 +29,215 @@ define i64 @straight(ptr nocapture noundef readonly %p, i32 noundef %st) { ; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ADD_PTR_5]], align 2 ; CHECK-NEXT: [[ADD_PTR_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr [[ADD_PTR_6]], align 2 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <64 x i16> [[TMP8]], <64 x i16> [[TMP9]], <64 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <64 x i16> [[TMP10]], <64 x i16> [[TMP11]], <64 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <64 x i16> [[TMP12]], <64 x i16> [[TMP13]], <64 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <64 x i16> [[TMP14]], <64 x i16> [[TMP15]], <64 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <64 x i16> [[TMP16]], <64 x i16> [[TMP17]], <64 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <64 x i16> [[TMP18]], <64 x i16> [[TMP19]], <64 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <64 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <64 x i16> [[TMP20]], <64 x i16> [[TMP21]], <64 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = zext <64 x i16> [[TMP22]] to <64 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <64 x i32> [[TMP23]], i32 0 -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <64 x i32> [[TMP23]], i32 1 -; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i32 [[TMP24]], [[TMP25]] -; CHECK-NEXT: [[TMP26:%.*]] = mul nuw nsw <64 x i32> [[TMP23]], [[TMP23]] -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <64 x i32> [[TMP23]], i32 2 -; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[TMP27]] -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <64 x i32> [[TMP23]], i32 3 -; CHECK-NEXT: [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[TMP28]] -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <64 x i32> [[TMP23]], i32 4 -; CHECK-NEXT: [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[TMP29]] -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <64 x i32> [[TMP23]], i32 5 -; CHECK-NEXT: [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[TMP30]] -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <64 x i32> [[TMP23]], i32 6 -; CHECK-NEXT: [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[TMP31]] -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <64 x i32> [[TMP23]], i32 7 -; CHECK-NEXT: [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[TMP32]] -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <64 x i32> [[TMP23]], i32 8 -; CHECK-NEXT: [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[TMP33]] -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <64 x i32> [[TMP23]], i32 9 -; CHECK-NEXT: [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[TMP34]] -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <64 x i32> [[TMP23]], i32 10 -; CHECK-NEXT: [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[TMP35]] -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <64 x i32> [[TMP23]], i32 11 -; CHECK-NEXT: [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[TMP36]] -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <64 x i32> [[TMP23]], i32 12 -; CHECK-NEXT: [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[TMP37]] -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <64 x i32> [[TMP23]], i32 13 -; CHECK-NEXT: [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[TMP38]] -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <64 x i32> [[TMP23]], i32 14 -; CHECK-NEXT: [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[TMP39]] -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <64 x i32> [[TMP23]], i32 15 -; CHECK-NEXT: [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[TMP40]] -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <64 x i32> [[TMP23]], i32 16 -; CHECK-NEXT: [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[TMP41]] -; CHECK-NEXT: [[TMP42:%.*]] = extractelement <64 x i32> [[TMP23]], i32 17 -; CHECK-NEXT: [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[TMP42]] -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <64 x i32> [[TMP23]], i32 18 -; CHECK-NEXT: [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[TMP43]] -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <64 x i32> [[TMP23]], i32 19 -; CHECK-NEXT: [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[TMP44]] -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <64 x i32> [[TMP23]], i32 20 -; CHECK-NEXT: [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[TMP45]] -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <64 x i32> [[TMP23]], i32 21 -; CHECK-NEXT: [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[TMP46]] -; CHECK-NEXT: [[TMP47:%.*]] = extractelement <64 x i32> [[TMP23]], i32 22 -; CHECK-NEXT: [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[TMP47]] -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <64 x i32> [[TMP23]], i32 23 -; CHECK-NEXT: [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[TMP48]] -; CHECK-NEXT: [[TMP49:%.*]] = extractelement <64 x i32> [[TMP23]], i32 24 -; CHECK-NEXT: [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[TMP49]] -; CHECK-NEXT: [[TMP50:%.*]] = extractelement <64 x i32> [[TMP23]], i32 25 -; CHECK-NEXT: [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[TMP50]] -; CHECK-NEXT: [[TMP51:%.*]] = extractelement <64 x i32> [[TMP23]], i32 26 -; CHECK-NEXT: [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[TMP51]] -; CHECK-NEXT: [[TMP52:%.*]] = extractelement <64 x i32> [[TMP23]], i32 27 -; CHECK-NEXT: [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[TMP52]] -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <64 x i32> [[TMP23]], i32 28 -; CHECK-NEXT: [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[TMP53]] -; CHECK-NEXT: [[TMP54:%.*]] = extractelement <64 x i32> [[TMP23]], i32 29 -; CHECK-NEXT: [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[TMP54]] -; CHECK-NEXT: [[TMP55:%.*]] = extractelement <64 x i32> [[TMP23]], i32 30 -; CHECK-NEXT: [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[TMP55]] -; CHECK-NEXT: [[TMP56:%.*]] = extractelement <64 x i32> [[TMP23]], i32 31 -; CHECK-NEXT: [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[TMP56]] -; CHECK-NEXT: [[TMP57:%.*]] = extractelement <64 x i32> [[TMP23]], i32 32 -; CHECK-NEXT: [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[TMP57]] -; CHECK-NEXT: [[TMP58:%.*]] = extractelement <64 x i32> [[TMP23]], i32 33 -; CHECK-NEXT: [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[TMP58]] -; CHECK-NEXT: [[TMP59:%.*]] = extractelement <64 x i32> [[TMP23]], i32 34 -; CHECK-NEXT: [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[TMP59]] -; CHECK-NEXT: [[TMP60:%.*]] = extractelement <64 x i32> [[TMP23]], i32 35 -; CHECK-NEXT: [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[TMP60]] -; CHECK-NEXT: [[TMP61:%.*]] = extractelement <64 x i32> [[TMP23]], i32 36 -; CHECK-NEXT: [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[TMP61]] -; CHECK-NEXT: [[TMP62:%.*]] = extractelement <64 x i32> [[TMP23]], i32 37 -; CHECK-NEXT: [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[TMP62]] -; CHECK-NEXT: [[TMP63:%.*]] = extractelement <64 x i32> [[TMP23]], i32 38 -; CHECK-NEXT: [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[TMP63]] -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <64 x i32> [[TMP23]], i32 39 -; CHECK-NEXT: [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[TMP64]] -; CHECK-NEXT: [[TMP65:%.*]] = extractelement <64 x i32> [[TMP23]], i32 40 -; CHECK-NEXT: [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[TMP65]] -; CHECK-NEXT: [[TMP66:%.*]] = extractelement <64 x i32> [[TMP23]], i32 41 -; CHECK-NEXT: [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[TMP66]] -; CHECK-NEXT: [[TMP67:%.*]] = extractelement <64 x i32> [[TMP23]], i32 42 -; CHECK-NEXT: [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[TMP67]] -; CHECK-NEXT: [[TMP68:%.*]] = extractelement <64 x i32> [[TMP23]], i32 43 -; CHECK-NEXT: [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[TMP68]] -; CHECK-NEXT: [[TMP69:%.*]] = extractelement <64 x i32> [[TMP23]], i32 44 -; CHECK-NEXT: [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[TMP69]] -; CHECK-NEXT: [[TMP70:%.*]] = extractelement <64 x i32> [[TMP23]], i32 45 -; CHECK-NEXT: [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[TMP70]] -; CHECK-NEXT: [[TMP71:%.*]] = extractelement <64 x i32> [[TMP23]], i32 46 -; CHECK-NEXT: [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[TMP71]] -; CHECK-NEXT: [[TMP72:%.*]] = extractelement <64 x i32> [[TMP23]], i32 47 -; CHECK-NEXT: [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[TMP72]] -; CHECK-NEXT: [[TMP73:%.*]] = extractelement <64 x i32> [[TMP23]], i32 48 -; CHECK-NEXT: [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[TMP73]] -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <64 x i32> [[TMP23]], i32 49 -; CHECK-NEXT: [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[TMP74]] -; CHECK-NEXT: [[TMP75:%.*]] = extractelement <64 x i32> [[TMP23]], i32 50 -; CHECK-NEXT: [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[TMP75]] -; CHECK-NEXT: [[TMP76:%.*]] = extractelement <64 x i32> [[TMP23]], i32 51 -; CHECK-NEXT: [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[TMP76]] -; CHECK-NEXT: [[TMP77:%.*]] = extractelement <64 x i32> [[TMP23]], i32 52 -; CHECK-NEXT: [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[TMP77]] -; CHECK-NEXT: [[TMP78:%.*]] = extractelement <64 x i32> [[TMP23]], i32 53 -; CHECK-NEXT: [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[TMP78]] -; CHECK-NEXT: [[TMP79:%.*]] = extractelement <64 x i32> [[TMP23]], i32 54 -; CHECK-NEXT: [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[TMP79]] -; CHECK-NEXT: [[TMP80:%.*]] = extractelement <64 x i32> [[TMP23]], i32 55 -; CHECK-NEXT: [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[TMP80]] -; CHECK-NEXT: [[TMP81:%.*]] = extractelement <64 x i32> [[TMP23]], i32 56 -; CHECK-NEXT: [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[TMP81]] -; CHECK-NEXT: [[TMP82:%.*]] = extractelement <64 x i32> [[TMP23]], i32 57 -; CHECK-NEXT: [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[TMP82]] -; CHECK-NEXT: [[TMP83:%.*]] = extractelement <64 x i32> [[TMP23]], i32 58 -; CHECK-NEXT: [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[TMP83]] -; CHECK-NEXT: [[TMP84:%.*]] = extractelement <64 x i32> [[TMP23]], i32 59 -; CHECK-NEXT: [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[TMP84]] -; CHECK-NEXT: [[TMP85:%.*]] = extractelement <64 x i32> [[TMP23]], i32 60 -; CHECK-NEXT: [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[TMP85]] -; CHECK-NEXT: [[TMP86:%.*]] = extractelement <64 x i32> [[TMP23]], i32 61 -; CHECK-NEXT: [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[TMP86]] -; CHECK-NEXT: [[TMP87:%.*]] = extractelement <64 x i32> [[TMP23]], i32 62 -; CHECK-NEXT: [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[TMP87]] -; CHECK-NEXT: [[TMP88:%.*]] = extractelement <64 x i32> [[TMP23]], i32 63 -; CHECK-NEXT: [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[TMP88]] -; CHECK-NEXT: [[TMP89:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[TMP26]]) +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP7]], i32 7 +; CHECK-NEXT: [[CONV_7_7:%.*]] = zext i16 [[TMP8]] to i32 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <64 x i16> [[TMP9]], <64 x i16> [[TMP10]], <64 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <64 x i16> [[TMP11]], <64 x i16> [[TMP12]], <64 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <64 x i16> [[TMP13]], <64 x i16> [[TMP14]], <64 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <64 x i16> [[TMP15]], <64 x i16> [[TMP16]], <64 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <64 x i16> [[TMP17]], <64 x i16> [[TMP18]], <64 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <64 x i16> [[TMP19]], <64 x i16> [[TMP20]], <64 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <64 x i16> [[TMP21]], <64 x i16> [[TMP22]], <64 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = zext <64 x i16> [[TMP23]] to <64 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[TMP7]], i32 6 +; CHECK-NEXT: [[CONV_6_7:%.*]] = zext i16 [[TMP25]] to i32 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP7]], i32 5 +; CHECK-NEXT: [[CONV_5_7:%.*]] = zext i16 [[TMP26]] to i32 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i16> [[TMP7]], i32 4 +; CHECK-NEXT: [[CONV_4_7:%.*]] = zext i16 [[TMP27]] to i32 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i16> [[TMP7]], i32 3 +; CHECK-NEXT: [[CONV_3_7:%.*]] = zext i16 [[TMP28]] to i32 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i16> [[TMP7]], i32 2 +; CHECK-NEXT: [[CONV_2_7:%.*]] = zext i16 [[TMP29]] to i32 +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i16> [[TMP7]], i32 1 +; CHECK-NEXT: [[CONV_1_7:%.*]] = zext i16 [[TMP30]] to i32 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <8 x i16> [[TMP7]], i32 0 +; CHECK-NEXT: [[CONV_764:%.*]] = zext i16 [[TMP31]] to i32 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7 +; CHECK-NEXT: [[CONV_7_6:%.*]] = zext i16 [[TMP32]] to i32 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6 +; CHECK-NEXT: [[CONV_6_6:%.*]] = zext i16 [[TMP33]] to i32 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5 +; CHECK-NEXT: [[CONV_5_6:%.*]] = zext i16 [[TMP34]] to i32 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4 +; CHECK-NEXT: [[CONV_4_6:%.*]] = zext i16 [[TMP35]] to i32 +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 3 +; CHECK-NEXT: [[CONV_3_6:%.*]] = zext i16 [[TMP36]] to i32 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i16> [[TMP6]], i32 2 +; CHECK-NEXT: [[CONV_2_6:%.*]] = zext i16 [[TMP37]] to i32 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <8 x i16> [[TMP6]], i32 1 +; CHECK-NEXT: [[CONV_1_6:%.*]] = zext i16 [[TMP38]] to i32 +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[CONV_660:%.*]] = zext i16 [[TMP39]] to i32 +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7 +; CHECK-NEXT: [[CONV_7_5:%.*]] = zext i16 [[TMP40]] to i32 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6 +; CHECK-NEXT: [[CONV_6_5:%.*]] = zext i16 [[TMP41]] to i32 +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5 +; CHECK-NEXT: [[CONV_5_5:%.*]] = zext i16 [[TMP42]] to i32 +; CHECK-NEXT: [[TMP43:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4 +; CHECK-NEXT: [[CONV_4_5:%.*]] = zext i16 [[TMP43]] to i32 +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <8 x i16> [[TMP5]], i32 3 +; CHECK-NEXT: [[CONV_3_5:%.*]] = zext i16 [[TMP44]] to i32 +; CHECK-NEXT: [[TMP45:%.*]] = extractelement <8 x i16> [[TMP5]], i32 2 +; CHECK-NEXT: [[CONV_2_5:%.*]] = zext i16 [[TMP45]] to i32 +; CHECK-NEXT: [[TMP46:%.*]] = extractelement <8 x i16> [[TMP5]], i32 1 +; CHECK-NEXT: [[CONV_1_5:%.*]] = zext i16 [[TMP46]] to i32 +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <8 x i16> [[TMP5]], i32 0 +; CHECK-NEXT: [[CONV_556:%.*]] = zext i16 [[TMP47]] to i32 +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7 +; CHECK-NEXT: [[CONV_7_4:%.*]] = zext i16 [[TMP48]] to i32 +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6 +; CHECK-NEXT: [[CONV_6_4:%.*]] = zext i16 [[TMP49]] to i32 +; CHECK-NEXT: [[TMP50:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5 +; CHECK-NEXT: [[CONV_5_4:%.*]] = zext i16 [[TMP50]] to i32 +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4 +; CHECK-NEXT: [[CONV_4_4:%.*]] = zext i16 [[TMP51]] to i32 +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <8 x i16> [[TMP4]], i32 3 +; CHECK-NEXT: [[CONV_3_4:%.*]] = zext i16 [[TMP52]] to i32 +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <8 x i16> [[TMP4]], i32 2 +; CHECK-NEXT: [[CONV_2_4:%.*]] = zext i16 [[TMP53]] to i32 +; CHECK-NEXT: [[TMP54:%.*]] = extractelement <8 x i16> [[TMP4]], i32 1 +; CHECK-NEXT: [[CONV_1_4:%.*]] = zext i16 [[TMP54]] to i32 +; CHECK-NEXT: [[TMP55:%.*]] = extractelement <8 x i16> [[TMP4]], i32 0 +; CHECK-NEXT: [[CONV_452:%.*]] = zext i16 [[TMP55]] to i32 +; CHECK-NEXT: [[TMP56:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; CHECK-NEXT: [[CONV_7_3:%.*]] = zext i16 [[TMP56]] to i32 +; CHECK-NEXT: [[TMP57:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; CHECK-NEXT: [[CONV_6_3:%.*]] = zext i16 [[TMP57]] to i32 +; CHECK-NEXT: [[TMP58:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; CHECK-NEXT: [[CONV_5_3:%.*]] = zext i16 [[TMP58]] to i32 +; CHECK-NEXT: [[TMP59:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; CHECK-NEXT: [[CONV_4_3:%.*]] = zext i16 [[TMP59]] to i32 +; CHECK-NEXT: [[TMP60:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; CHECK-NEXT: [[CONV_3_3:%.*]] = zext i16 [[TMP60]] to i32 +; CHECK-NEXT: [[TMP61:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; CHECK-NEXT: [[CONV_2_3:%.*]] = zext i16 [[TMP61]] to i32 +; CHECK-NEXT: [[TMP62:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; CHECK-NEXT: [[CONV_1_3:%.*]] = zext i16 [[TMP62]] to i32 +; CHECK-NEXT: [[TMP63:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; CHECK-NEXT: [[CONV_348:%.*]] = zext i16 [[TMP63]] to i32 +; CHECK-NEXT: [[TMP64:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +; CHECK-NEXT: [[CONV_7_2:%.*]] = zext i16 [[TMP64]] to i32 +; CHECK-NEXT: [[TMP65:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6 +; CHECK-NEXT: [[CONV_6_2:%.*]] = zext i16 [[TMP65]] to i32 +; CHECK-NEXT: [[TMP66:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5 +; CHECK-NEXT: [[CONV_5_2:%.*]] = zext i16 [[TMP66]] to i32 +; CHECK-NEXT: [[TMP67:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4 +; CHECK-NEXT: [[CONV_4_2:%.*]] = zext i16 [[TMP67]] to i32 +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <8 x i16> [[TMP2]], i32 3 +; CHECK-NEXT: [[CONV_3_2:%.*]] = zext i16 [[TMP68]] to i32 +; CHECK-NEXT: [[TMP69:%.*]] = extractelement <8 x i16> [[TMP2]], i32 2 +; CHECK-NEXT: [[CONV_2_2:%.*]] = zext i16 [[TMP69]] to i32 +; CHECK-NEXT: [[TMP70:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1 +; CHECK-NEXT: [[CONV_1_2:%.*]] = zext i16 [[TMP70]] to i32 +; CHECK-NEXT: [[TMP71:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 +; CHECK-NEXT: [[CONV_244:%.*]] = zext i16 [[TMP71]] to i32 +; CHECK-NEXT: [[TMP72:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +; CHECK-NEXT: [[CONV_7_1:%.*]] = zext i16 [[TMP72]] to i32 +; CHECK-NEXT: [[TMP73:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 +; CHECK-NEXT: [[CONV_6_1:%.*]] = zext i16 [[TMP73]] to i32 +; CHECK-NEXT: [[TMP74:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 +; CHECK-NEXT: [[CONV_5_1:%.*]] = zext i16 [[TMP74]] to i32 +; CHECK-NEXT: [[TMP75:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4 +; CHECK-NEXT: [[CONV_4_1:%.*]] = zext i16 [[TMP75]] to i32 +; CHECK-NEXT: [[TMP76:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3 +; CHECK-NEXT: [[CONV_3_1:%.*]] = zext i16 [[TMP76]] to i32 +; CHECK-NEXT: [[TMP77:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 +; CHECK-NEXT: [[CONV_2_1:%.*]] = zext i16 [[TMP77]] to i32 +; CHECK-NEXT: [[TMP78:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 +; CHECK-NEXT: [[CONV_1_1:%.*]] = zext i16 [[TMP78]] to i32 +; CHECK-NEXT: [[TMP79:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 +; CHECK-NEXT: [[CONV_140:%.*]] = zext i16 [[TMP79]] to i32 +; CHECK-NEXT: [[TMP80:%.*]] = extractelement <8 x i16> [[TMP0]], i32 7 +; CHECK-NEXT: [[CONV_7:%.*]] = zext i16 [[TMP80]] to i32 +; CHECK-NEXT: [[TMP81:%.*]] = extractelement <8 x i16> [[TMP0]], i32 6 +; CHECK-NEXT: [[CONV_6:%.*]] = zext i16 [[TMP81]] to i32 +; CHECK-NEXT: [[TMP82:%.*]] = extractelement <8 x i16> [[TMP0]], i32 5 +; CHECK-NEXT: [[CONV_5:%.*]] = zext i16 [[TMP82]] to i32 +; CHECK-NEXT: [[TMP83:%.*]] = extractelement <8 x i16> [[TMP0]], i32 4 +; CHECK-NEXT: [[CONV_4:%.*]] = zext i16 [[TMP83]] to i32 +; CHECK-NEXT: [[TMP84:%.*]] = extractelement <8 x i16> [[TMP0]], i32 3 +; CHECK-NEXT: [[CONV_3:%.*]] = zext i16 [[TMP84]] to i32 +; CHECK-NEXT: [[TMP85:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2 +; CHECK-NEXT: [[CONV_2:%.*]] = zext i16 [[TMP85]] to i32 +; CHECK-NEXT: [[TMP86:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0 +; CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP86]] to i32 +; CHECK-NEXT: [[TMP87:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1 +; CHECK-NEXT: [[CONV_1:%.*]] = zext i16 [[TMP87]] to i32 +; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i32 [[CONV]], [[CONV_1]] +; CHECK-NEXT: [[TMP88:%.*]] = mul nuw nsw <64 x i32> [[TMP24]], [[TMP24]] +; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[CONV_2]] +; CHECK-NEXT: [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[CONV_3]] +; CHECK-NEXT: [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[CONV_4]] +; CHECK-NEXT: [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[CONV_5]] +; CHECK-NEXT: [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[CONV_6]] +; CHECK-NEXT: [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[CONV_7]] +; CHECK-NEXT: [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[CONV_140]] +; CHECK-NEXT: [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[CONV_1_1]] +; CHECK-NEXT: [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[CONV_2_1]] +; CHECK-NEXT: [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[CONV_3_1]] +; CHECK-NEXT: [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[CONV_4_1]] +; CHECK-NEXT: [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[CONV_5_1]] +; CHECK-NEXT: [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[CONV_6_1]] +; CHECK-NEXT: [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[CONV_7_1]] +; CHECK-NEXT: [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[CONV_244]] +; CHECK-NEXT: [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[CONV_1_2]] +; CHECK-NEXT: [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[CONV_2_2]] +; CHECK-NEXT: [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[CONV_3_2]] +; CHECK-NEXT: [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[CONV_4_2]] +; CHECK-NEXT: [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[CONV_5_2]] +; CHECK-NEXT: [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[CONV_6_2]] +; CHECK-NEXT: [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[CONV_7_2]] +; CHECK-NEXT: [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[CONV_348]] +; CHECK-NEXT: [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[CONV_1_3]] +; CHECK-NEXT: [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[CONV_2_3]] +; CHECK-NEXT: [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[CONV_3_3]] +; CHECK-NEXT: [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[CONV_4_3]] +; CHECK-NEXT: [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[CONV_5_3]] +; CHECK-NEXT: [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[CONV_6_3]] +; CHECK-NEXT: [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[CONV_7_3]] +; CHECK-NEXT: [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[CONV_452]] +; CHECK-NEXT: [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[CONV_1_4]] +; CHECK-NEXT: [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[CONV_2_4]] +; CHECK-NEXT: [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[CONV_3_4]] +; CHECK-NEXT: [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[CONV_4_4]] +; CHECK-NEXT: [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[CONV_5_4]] +; CHECK-NEXT: [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[CONV_6_4]] +; CHECK-NEXT: [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[CONV_7_4]] +; CHECK-NEXT: [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[CONV_556]] +; CHECK-NEXT: [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[CONV_1_5]] +; CHECK-NEXT: [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[CONV_2_5]] +; CHECK-NEXT: [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[CONV_3_5]] +; CHECK-NEXT: [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[CONV_4_5]] +; CHECK-NEXT: [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[CONV_5_5]] +; CHECK-NEXT: [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[CONV_6_5]] +; CHECK-NEXT: [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[CONV_7_5]] +; CHECK-NEXT: [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[CONV_660]] +; CHECK-NEXT: [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[CONV_1_6]] +; CHECK-NEXT: [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[CONV_2_6]] +; CHECK-NEXT: [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[CONV_3_6]] +; CHECK-NEXT: [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[CONV_4_6]] +; CHECK-NEXT: [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[CONV_5_6]] +; CHECK-NEXT: [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[CONV_6_6]] +; CHECK-NEXT: [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[CONV_7_6]] +; CHECK-NEXT: [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[CONV_764]] +; CHECK-NEXT: [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[CONV_1_7]] +; CHECK-NEXT: [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[CONV_2_7]] +; CHECK-NEXT: [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[CONV_3_7]] +; CHECK-NEXT: [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[CONV_4_7]] +; CHECK-NEXT: [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[CONV_5_7]] +; CHECK-NEXT: [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[CONV_6_7]] +; CHECK-NEXT: [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[CONV_7_7]] +; CHECK-NEXT: [[TMP89:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[TMP88]]) ; CHECK-NEXT: [[CONV15:%.*]] = zext i32 [[ADD_7_7]] to i64 ; CHECK-NEXT: [[CONV16:%.*]] = zext i32 [[TMP89]] to i64 ; CHECK-NEXT: [[SHL:%.*]] = shl nuw i64 [[CONV16]], 32 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll index c6209fd71063a..6f6b66255a434 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll @@ -6,7 +6,8 @@ define fastcc i64 @zot(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, ptr %arg5, i1 %arg6, i1 %arg7, i1 %arg8) { ; CHECK-LABEL: @zot( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> , float [[ARG:%.*]], i32 1 +; CHECK-NEXT: [[VAL9:%.*]] = fmul fast float 0.000000e+00, [[ARG:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> , float [[ARG]], i32 1 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[ARG3:%.*]], i32 2 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> , [[TMP2]] @@ -24,18 +25,17 @@ define fastcc i64 @zot(float %arg, float %arg1, float %arg2, float %arg3, float ; CHECK-NEXT: br i1 [[ARG7:%.*]], label [[BB25:%.*]], label [[BB57]] ; CHECK: bb25: ; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x float> [ [[TMP7]], [[BB18]] ] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 ; CHECK-NEXT: br label [[BB30:%.*]] ; CHECK: bb30: ; CHECK-NEXT: [[VAL31:%.*]] = phi float [ [[VAL55:%.*]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] -; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[TMP11]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = uitofp <4 x i8> [[TMP12]] to <4 x float> -; CHECK-NEXT: [[TMP14:%.*]] = fsub fast <4 x float> [[TMP13]], [[TMP3]] -; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <4 x float> [[TMP14]], [[TMP10]] -; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP15]]) +; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[VAL9]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = uitofp <4 x i8> [[TMP11]] to <4 x float> +; CHECK-NEXT: [[TMP13:%.*]] = fsub fast <4 x float> [[TMP12]], [[TMP3]] +; CHECK-NEXT: [[TMP14:%.*]] = fmul fast <4 x float> [[TMP13]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP14]]) ; CHECK-NEXT: [[VAL55]] = tail call fast float @llvm.minnum.f32(float [[VAL31]], float [[ARG1:%.*]]) -; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[TMP16]]) +; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[TMP15]]) ; CHECK-NEXT: call void @ham(float [[VAL55]], float [[VAL56]]) ; CHECK-NEXT: br i1 [[ARG8:%.*]], label [[BB30]], label [[BB57]] ; CHECK: bb57: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll index 0783a28f56d85..e39cd8aaa111b 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll @@ -7,9 +7,11 @@ define void @p(double %0) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> , double [[TMP0]], i32 3 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[MUL16_150_1_I:%.*]] = fmul double 0.000000e+00, 0.000000e+00 ; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x double> zeroinitializer, [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP14]], double [[MUL16_150_1_I]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> zeroinitializer, [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x double> [[TMP4]], zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll index 0b26c53ca4503..03f67ecb3e695 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll @@ -7,19 +7,21 @@ define void @slp_not_profitable_with_fast_fmf(ptr %A, ptr %B) { ; CHECK-LABEL: @slp_not_profitable_with_fast_fmf( ; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 ; CHECK-NEXT: [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 -; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[B_1]], [[A_0]] ; CHECK-NEXT: [[B_0:%.*]] = load float, ptr [[B]], align 4 ; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 ; CHECK-NEXT: [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 -; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[B_2]], [[B_0]] -; CHECK-NEXT: [[SUB:%.*]] = fsub fast float [[MUL_0]], [[MUL_1]] -; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[B_0]], [[B_1]] -; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[B_2]], [[A_0]] -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL_3]], [[MUL_2]] -; CHECK-NEXT: store float [[SUB]], ptr [[A]], align 4 -; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 -; CHECK-NEXT: store float [[ADD]], ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x float> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x float> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x float> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> +; CHECK-NEXT: store <2 x float> [[TMP11]], ptr [[A]], align 4 ; CHECK-NEXT: store float [[B_2]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; @@ -46,19 +48,21 @@ define void @slp_not_profitable_with_reassoc_fmf(ptr %A, ptr %B) { ; CHECK-LABEL: @slp_not_profitable_with_reassoc_fmf( ; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 ; CHECK-NEXT: [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 -; CHECK-NEXT: [[MUL_0:%.*]] = fmul reassoc float [[B_1]], [[A_0]] ; CHECK-NEXT: [[B_0:%.*]] = load float, ptr [[B]], align 4 ; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 ; CHECK-NEXT: [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 -; CHECK-NEXT: [[MUL_1:%.*]] = fmul float [[B_2]], [[B_0]] -; CHECK-NEXT: [[SUB:%.*]] = fsub reassoc float [[MUL_0]], [[MUL_1]] -; CHECK-NEXT: [[MUL_2:%.*]] = fmul float [[B_0]], [[B_1]] -; CHECK-NEXT: [[MUL_3:%.*]] = fmul reassoc float [[B_2]], [[A_0]] -; CHECK-NEXT: [[ADD:%.*]] = fadd reassoc float [[MUL_3]], [[MUL_2]] -; CHECK-NEXT: store float [[SUB]], ptr [[A]], align 4 -; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 -; CHECK-NEXT: store float [[ADD]], ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = fmul reassoc <2 x float> [[TMP1]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = fsub reassoc <2 x float> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = fadd reassoc <2 x float> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> +; CHECK-NEXT: store <2 x float> [[TMP11]], ptr [[A]], align 4 ; CHECK-NEXT: store float [[B_2]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; @@ -86,19 +90,21 @@ define void @slp_profitable_missing_fmf_on_fadd_fsub(ptr %A, ptr %B) { ; CHECK-LABEL: @slp_profitable_missing_fmf_on_fadd_fsub( ; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 ; CHECK-NEXT: [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 -; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[B_1]], [[A_0]] ; CHECK-NEXT: [[B_0:%.*]] = load float, ptr [[B]], align 4 ; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 ; CHECK-NEXT: [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 -; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[B_2]], [[B_0]] -; CHECK-NEXT: [[SUB:%.*]] = fsub float [[MUL_0]], [[MUL_1]] -; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[B_0]], [[B_1]] -; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[B_2]], [[A_0]] -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[MUL_3]], [[MUL_2]] -; CHECK-NEXT: store float [[SUB]], ptr [[A]], align 4 -; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 -; CHECK-NEXT: store float [[ADD]], ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x float> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = fsub <2 x float> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> +; CHECK-NEXT: store <2 x float> [[TMP11]], ptr [[A]], align 4 ; CHECK-NEXT: store float [[B_2]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; @@ -126,19 +132,21 @@ define void @slp_profitable_missing_fmf_on_fmul_fadd_fsub(ptr %A, ptr %B) { ; CHECK-LABEL: @slp_profitable_missing_fmf_on_fmul_fadd_fsub( ; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 ; CHECK-NEXT: [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 -; CHECK-NEXT: [[MUL_0:%.*]] = fmul float [[B_1]], [[A_0]] ; CHECK-NEXT: [[B_0:%.*]] = load float, ptr [[B]], align 4 ; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 ; CHECK-NEXT: [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 -; CHECK-NEXT: [[MUL_1:%.*]] = fmul float [[B_2]], [[B_0]] -; CHECK-NEXT: [[SUB:%.*]] = fsub float [[MUL_0]], [[MUL_1]] -; CHECK-NEXT: [[MUL_2:%.*]] = fmul float [[B_0]], [[B_1]] -; CHECK-NEXT: [[MUL_3:%.*]] = fmul float [[B_2]], [[A_0]] -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[MUL_3]], [[MUL_2]] -; CHECK-NEXT: store float [[SUB]], ptr [[A]], align 4 -; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 -; CHECK-NEXT: store float [[ADD]], ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP1]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = fsub <2 x float> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> +; CHECK-NEXT: store <2 x float> [[TMP11]], ptr [[A]], align 4 ; CHECK-NEXT: store float [[B_2]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; @@ -166,19 +174,21 @@ define void @slp_profitable_missing_fmf_nnans_only(ptr %A, ptr %B) { ; CHECK-LABEL: @slp_profitable_missing_fmf_nnans_only( ; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 ; CHECK-NEXT: [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 -; CHECK-NEXT: [[MUL_0:%.*]] = fmul nnan float [[B_1]], [[A_0]] ; CHECK-NEXT: [[B_0:%.*]] = load float, ptr [[B]], align 4 ; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 ; CHECK-NEXT: [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 -; CHECK-NEXT: [[MUL_1:%.*]] = fmul nnan float [[B_2]], [[B_0]] -; CHECK-NEXT: [[SUB:%.*]] = fsub nnan float [[MUL_0]], [[MUL_1]] -; CHECK-NEXT: [[MUL_2:%.*]] = fmul nnan float [[B_0]], [[B_1]] -; CHECK-NEXT: [[MUL_3:%.*]] = fmul nnan float [[B_2]], [[A_0]] -; CHECK-NEXT: [[ADD:%.*]] = fadd nnan float [[MUL_3]], [[MUL_2]] -; CHECK-NEXT: store float [[SUB]], ptr [[A]], align 4 -; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 -; CHECK-NEXT: store float [[ADD]], ptr [[GEP_A_1]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fmul nnan <2 x float> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = fmul nnan <2 x float> [[TMP1]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = fsub nnan <2 x float> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = fadd nnan <2 x float> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> +; CHECK-NEXT: store <2 x float> [[TMP11]], ptr [[A]], align 4 ; CHECK-NEXT: store float [[B_2]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll index cd4aa9a73dba2..1bd63b79b0f5c 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll @@ -124,18 +124,18 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) { define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_1( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i64 1 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V1]], i64 1 -; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP0_1]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP7]], [[TMP10]] -; CHECK-NEXT: ret <4 x i32> [[TMP11]] +; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i64 1 +; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP0_1]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP5]], [[TMP8]] +; CHECK-NEXT: ret <4 x i32> [[TMP9]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 %v0.1 = extractelement <2 x i32> %v0, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll index 1e0245812d8d7..f99f6ecd33382 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -124,18 +124,18 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) { define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_1( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i64 1 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V1]], i64 1 -; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP0_1]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP7]], [[TMP10]] -; CHECK-NEXT: ret <4 x i32> [[TMP11]] +; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i64 1 +; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP0_1]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP5]], [[TMP8]] +; CHECK-NEXT: ret <4 x i32> [[TMP9]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 %v0.1 = extractelement <2 x i32> %v0, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll index 55504985d9a6f..e2d1a29ee22de 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorize-free-extracts-inserts.ll @@ -12,13 +12,13 @@ define void @noop_extracts_first_2_lanes(ptr %ptr.1, ptr %ptr.2) { ; CHECK-LABEL: @noop_extracts_first_2_lanes( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[V_1:%.*]] = load <2 x double>, ptr [[PTR_1:%.*]], align 8 +; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <2 x double> [[V_1]], i32 0 +; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <2 x double> [[V_1]], i32 1 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> [[V_1]], [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[V_1]], i32 0 -; CHECK-NEXT: call void @use(double [[TMP2]]) -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[V_1]], i32 1 -; CHECK-NEXT: call void @use(double [[TMP3]]) +; CHECK-NEXT: call void @use(double [[V1_LANE_0]]) +; CHECK-NEXT: call void @use(double [[V1_LANE_1]]) ; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[PTR_1]], align 8 ; CHECK-NEXT: ret void ; @@ -127,14 +127,14 @@ define void @extract_reverse_order(ptr %ptr.1, ptr %ptr.2) { ; CHECK-LABEL: @extract_reverse_order( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[V_1:%.*]] = load <2 x double>, ptr [[PTR_1:%.*]], align 8 +; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <2 x double> [[V_1]], i32 0 +; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <2 x double> [[V_1]], i32 1 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> [[V_1]], [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[V_1]], i32 0 -; CHECK-NEXT: call void @use(double [[TMP3]]) -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[V_1]], i32 1 -; CHECK-NEXT: call void @use(double [[TMP4]]) +; CHECK-NEXT: call void @use(double [[V1_LANE_0]]) +; CHECK-NEXT: call void @use(double [[V1_LANE_1]]) ; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[PTR_1]], align 8 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/mixed-extracts-types.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/mixed-extracts-types.ll index 0d5c644b9cc0f..125fe69820d5c 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/mixed-extracts-types.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/mixed-extracts-types.ll @@ -9,10 +9,10 @@ define i32 @test() { ; CHECK-NEXT: [[CONV5:%.*]] = sext i8 [[VECTOR_RECUR_EXTRACT]] to i32 ; CHECK-NEXT: store i32 [[CONV5]], ptr getelementptr ([0 x i32], ptr null, i64 0, i64 -14), align 4 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr getelementptr ([9 x i8], ptr null, i64 -2, i64 5), align 1 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr ([9 x i8], ptr null, i64 -2, i64 5), align 1 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i8> [[TMP0]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i16> ; CHECK-NEXT: store <2 x i16> [[TMP2]], ptr getelementptr ([0 x i16], ptr null, i64 0, i64 -14), align 2 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP0]], i32 0 ; CHECK-NEXT: [[CONV5_1:%.*]] = sext i8 [[TMP3]] to i32 ; CHECK-NEXT: store i32 [[CONV5_1]], ptr getelementptr ([0 x i32], ptr null, i64 0, i64 -13), align 4 ; CHECK-NEXT: ret i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-indices.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-indices.ll index c72d6cc75d827..93f5b5e46d2c3 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-indices.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-indices.ll @@ -7,6 +7,7 @@ define void @test() { ; CHECK-LABEL: define void @test ; CHECK-SAME: () #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[SUB4_I_I65_US:%.*]] = or i64 0, 1 ; CHECK-NEXT: br label [[BODY:%.*]] ; CHECK: body: ; CHECK-NEXT: [[ADD_I_I62_US:%.*]] = shl i64 0, 0 @@ -17,8 +18,7 @@ define void @test() { ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1 ; CHECK-NEXT: [[CMP_I_I_I_I67_US:%.*]] = icmp slt i32 [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 -; CHECK-NEXT: [[SPEC_SELECT_I_I68_US:%.*]] = select i1 false, i64 [[TMP6]], i64 0 +; CHECK-NEXT: [[SPEC_SELECT_I_I68_US:%.*]] = select i1 false, i64 [[SUB4_I_I65_US]], i64 0 ; CHECK-NEXT: br label [[BODY]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-unsupported-type.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-unsupported-type.ll index 4fd22639d6371..c0e1ab56c110b 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-unsupported-type.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-unsupported-type.ll @@ -5,10 +5,10 @@ define void @loads() { ; CHECK-LABEL: define void @loads( ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x fp128>, ptr null, align 16 -; CHECK-NEXT: [[TMP1:%.*]] = fcmp une <2 x fp128> [[TMP0]], zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x fp128>, ptr null, align 16 +; CHECK-NEXT: [[TMP3:%.*]] = fcmp une <2 x fp128> [[TMP1]], zeroinitializer ; CHECK-NEXT: call void null(i32 0, ptr null, i32 0) -; CHECK-NEXT: [[TMP2:%.*]] = fcmp une <2 x fp128> [[TMP0]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = fcmp une <2 x fp128> [[TMP1]], zeroinitializer ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll index 473b37167409e..14685fcca5107 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll @@ -50,15 +50,10 @@ define void @i64_simplifiedi_reversed(ptr noalias %st, ptr noalias %ld) { define void @i64_simplifiedi_extract(ptr noalias %st, ptr noalias %ld) { ; CHECK-LABEL: @i64_simplifiedi_extract( ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[LD:%.*]], i64 1 -; CHECK-NEXT: [[T0:%.*]] = load i64, ptr [[LD]], align 8 ; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, ptr [[ST:%.*]], i64 1 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr [[ST]], i64 2 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, ptr [[ST]], i64 3 -; CHECK-NEXT: store i64 [[T0]], ptr [[ST]], align 8 -; CHECK-NEXT: store i64 [[T0]], ptr [[ARRAYIDX3]], align 8 -; CHECK-NEXT: store i64 [[T0]], ptr [[ARRAYIDX4]], align 8 -; CHECK-NEXT: store i64 [[T1]], ptr [[ARRAYIDX5]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[LD]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[ST:%.*]], align 8 ; CHECK-NEXT: store i64 [[T1]], ptr [[LD]], align 8 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll index 5ec7aac8a7935..e9aa434dec03d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll @@ -9,10 +9,11 @@ define void @mainTest(ptr %ptr) #0 { ; CHECK-NEXT: br i1 [[CMP]], label [[LOOP:%.*]], label [[BAIL_OUT:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[OP_RDX3:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP1]], [[TMP1]] ; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP3]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll index 5c261d69cd53e..143e09374a891 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -7,32 +7,32 @@ define void @Test(i32) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP9:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[LOCAL_8_43_US:%.*]] = phi i32 [ [[VAL_43:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ] ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) ; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> , i32 [[OP_RDX]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = and <2 x i32> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP9]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], <2 x i32> +; CHECK-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US]] +; CHECK-NEXT: [[VAL_43]] = add i32 [[LOCAL_8_43_US]], 14910 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0 +; CHECK-NEXT: [[TMP6]] = insertelement <2 x i32> [[TMP5]], i32 [[VAL_43]], i32 1 ; CHECK-NEXT: br label [[LOOP]] ; ; FORCE_REDUCTION-LABEL: @Test( ; FORCE_REDUCTION-NEXT: entry: ; FORCE_REDUCTION-NEXT: br label [[LOOP:%.*]] ; FORCE_REDUCTION: loop: -; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP9:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; FORCE_REDUCTION-NEXT: [[LOCAL_8_43_US:%.*]] = phi i32 [ [[VAL_43:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] +; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ] ; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> ; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], ; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) ; FORCE_REDUCTION-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]] -; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> , i32 [[OP_RDX]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> -; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = and <2 x i32> [[TMP5]], [[TMP6]] -; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP5]], [[TMP6]] -; FORCE_REDUCTION-NEXT: [[TMP9]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], <2 x i32> +; FORCE_REDUCTION-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US]] +; FORCE_REDUCTION-NEXT: [[VAL_43]] = add i32 [[LOCAL_8_43_US]], 14910 +; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0 +; FORCE_REDUCTION-NEXT: [[TMP6]] = insertelement <2 x i32> [[TMP5]], i32 [[VAL_43]], i32 1 ; FORCE_REDUCTION-NEXT: br label [[LOOP]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll index 2ea7f191947b4..194c7021f60f5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll @@ -7,17 +7,17 @@ define void @mainTest(i32 %param, ptr %vals, i32 %len) { ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[PARAM:%.*]], i32 0 ; CHECK-NEXT: br label [[BCI_15:%.*]] ; CHECK: bci_15: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP7:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 0 -; CHECK-NEXT: store atomic i32 [[TMP4]], ptr [[VALS:%.*]] unordered, align 4 -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP3]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP5]], [[TMP2]] -; CHECK-NEXT: [[V44:%.*]] = add i32 [[TMP2]], 16 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX]], i32 0 -; CHECK-NEXT: [[TMP7]] = insertelement <2 x i32> [[TMP6]], i32 [[V44]], i32 1 +; CHECK-NEXT: [[LOCAL_0_:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[BCI_15]] ], [ [[PARAM]], [[BCI_15_PREHEADER:%.*]] ] +; CHECK-NEXT: [[LOCAL_4_:%.*]] = phi i32 [ [[V44:%.*]], [[BCI_15]] ], [ 31, [[BCI_15_PREHEADER]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER]] ] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i32> [[TMP2]], +; CHECK-NEXT: store atomic i32 [[LOCAL_0_]], ptr [[VALS:%.*]] unordered, align 4 +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP3]]) +; CHECK-NEXT: [[OP_RDX]] = and i32 [[TMP4]], [[LOCAL_4_]] +; CHECK-NEXT: [[V44]] = add i32 [[LOCAL_4_]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX]], i32 0 +; CHECK-NEXT: [[TMP6]] = insertelement <2 x i32> [[TMP5]], i32 [[V44]], i32 1 ; CHECK-NEXT: br i1 true, label [[BCI_15]], label [[LOOPEXIT:%.*]] ; CHECK: loopexit: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll b/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll index 22cba328b180a..ae6e6723706cd 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll @@ -80,11 +80,11 @@ declare i32 @printf(ptr nocapture, ...) define float @merge_anyof_v4f32_wrong_first(<4 x float> %x) { ; CHECK-LABEL: @merge_anyof_v4f32_wrong_first( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3 -; CHECK-NEXT: [[CMP3WRONG:%.*]] = fcmp olt float [[TMP1]], 4.200000e+01 -; CHECK-NEXT: [[TMP2:%.*]] = fcmp ogt <4 x float> [[X]], -; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = or i1 [[TMP3]], [[CMP3WRONG]] +; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3 +; CHECK-NEXT: [[CMP3WRONG:%.*]] = fcmp olt float [[X3]], 4.200000e+01 +; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <4 x float> [[X]], +; CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP1]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = or i1 [[TMP2]], [[CMP3WRONG]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[OP_RDX]], float -1.000000e+00, float 1.000000e+00 ; CHECK-NEXT: ret float [[R]] ; @@ -107,11 +107,11 @@ define float @merge_anyof_v4f32_wrong_first(<4 x float> %x) { define float @merge_anyof_v4f32_wrong_last(<4 x float> %x) { ; CHECK-LABEL: @merge_anyof_v4f32_wrong_last( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3 -; CHECK-NEXT: [[CMP3WRONG:%.*]] = fcmp olt float [[TMP1]], 4.200000e+01 -; CHECK-NEXT: [[TMP2:%.*]] = fcmp ogt <4 x float> [[X]], -; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = or i1 [[TMP3]], [[CMP3WRONG]] +; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3 +; CHECK-NEXT: [[CMP3WRONG:%.*]] = fcmp olt float [[X3]], 4.200000e+01 +; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <4 x float> [[X]], +; CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP1]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = or i1 [[TMP2]], [[CMP3WRONG]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[OP_RDX]], float -1.000000e+00, float 1.000000e+00 ; CHECK-NEXT: ret float [[R]] ; @@ -134,11 +134,11 @@ define float @merge_anyof_v4f32_wrong_last(<4 x float> %x) { define i32 @merge_anyof_v4i32_wrong_middle(<4 x i32> %x) { ; CHECK-LABEL: @merge_anyof_v4i32_wrong_middle( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3 -; CHECK-NEXT: [[CMP3WRONG:%.*]] = icmp slt i32 [[TMP1]], 42 -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], -; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = or i1 [[TMP3]], [[CMP3WRONG]] +; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3 +; CHECK-NEXT: [[CMP3WRONG:%.*]] = icmp slt i32 [[X3]], 42 +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[X]], +; CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP1]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = or i1 [[TMP2]], [[CMP3WRONG]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[OP_RDX]], i32 -1, i32 1 ; CHECK-NEXT: ret i32 [[R]] ; @@ -164,12 +164,12 @@ define i32 @merge_anyof_v4i32_wrong_middle(<4 x i32> %x) { define i32 @merge_anyof_v4i32_wrong_middle_better_rdx(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: @merge_anyof_v4i32_wrong_middle_better_rdx( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 3 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3 -; CHECK-NEXT: [[CMP3WRONG:%.*]] = icmp slt i32 [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], [[Y]] -; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = or i1 [[TMP4]], [[CMP3WRONG]] +; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 3 +; CHECK-NEXT: [[Y3:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 3 +; CHECK-NEXT: [[CMP3WRONG:%.*]] = icmp slt i32 [[X3]], [[Y3]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[X]], [[Y]] +; CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP1]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = or i1 [[TMP2]], [[CMP3WRONG]] ; CHECK-NEXT: [[R:%.*]] = select i1 [[OP_RDX]], i32 -1, i32 1 ; CHECK-NEXT: ret i32 [[R]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll index 5a0deddb9247c..e3a860a4c6f06 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll @@ -20,18 +20,23 @@ define void @bar() { ; CHECK-LABEL: @bar( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[I:%.*]] = getelementptr inbounds [[TMP0:%.*]], ptr undef, i64 0, i32 1, i32 0 +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds [[TMP0]], ptr undef, i64 0, i32 1, i32 1 ; CHECK-NEXT: [[I2:%.*]] = getelementptr inbounds [[TMP0]], ptr undef, i64 0, i32 1, i32 0 +; CHECK-NEXT: [[I3:%.*]] = getelementptr inbounds [[TMP0]], ptr undef, i64 0, i32 1, i32 1 ; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds [[TMP0]], ptr undef, i64 0, i32 1, i32 0 ; CHECK-NEXT: br label [[BB6:%.*]] ; CHECK: bb6: -; CHECK-NEXT: [[TMP0]] = phi <2 x double> [ , [[BB:%.*]] ], [ [[TMP3:%.*]], [[BB17:%.*]] ], [ [[TMP3]], [[BB16:%.*]] ], [ [[TMP3]], [[BB16]] ] -; CHECK-NEXT: store <2 x double> [[TMP0]], ptr [[I]], align 8 -; CHECK-NEXT: [[TMP3]] = load <2 x double>, ptr [[I2]], align 8 +; CHECK-NEXT: [[I7:%.*]] = phi double [ 2.800000e+01, [[BB:%.*]] ], [ [[I10:%.*]], [[BB17:%.*]] ], [ [[I10]], [[BB16:%.*]] ], [ [[I10]], [[BB16]] ] +; CHECK-NEXT: [[I8:%.*]] = phi double [ 1.800000e+01, [[BB]] ], [ [[TMP1:%.*]], [[BB17]] ], [ [[TMP1]], [[BB16]] ], [ [[TMP1]], [[BB16]] ] +; CHECK-NEXT: store double [[I8]], ptr [[I]], align 8 +; CHECK-NEXT: store double [[I7]], ptr [[I1]], align 8 +; CHECK-NEXT: [[I10]] = load double, ptr [[I3]], align 8 +; CHECK-NEXT: [[TMP0]] = load <2 x double>, ptr [[I2]], align 8 ; CHECK-NEXT: br i1 undef, label [[BB11:%.*]], label [[BB12:%.*]] ; CHECK: bb11: ; CHECK-NEXT: ret void ; CHECK: bb12: -; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[I4]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP0]], ptr [[I4]], align 8 ; CHECK-NEXT: br i1 undef, label [[BB13:%.*]], label [[BB14:%.*]] ; CHECK: bb13: ; CHECK-NEXT: br label [[BB14]] @@ -40,9 +45,10 @@ define void @bar() { ; CHECK: bb15: ; CHECK-NEXT: unreachable ; CHECK: bb16: +; CHECK-NEXT: [[TMP1]] = extractelement <2 x double> [[TMP0]], i32 0 ; CHECK-NEXT: switch i32 undef, label [[BB17]] [ -; CHECK-NEXT: i32 32, label [[BB6]] -; CHECK-NEXT: i32 103, label [[BB6]] +; CHECK-NEXT: i32 32, label [[BB6]] +; CHECK-NEXT: i32 103, label [[BB6]] ; CHECK-NEXT: ] ; CHECK: bb17: ; CHECK-NEXT: br i1 undef, label [[BB6]], label [[BB18:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cse_extractelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/cse_extractelement.ll index bcee81f901987..73f9b42ee72b5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/cse_extractelement.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cse_extractelement.ll @@ -7,17 +7,17 @@ define void @test(ptr %ptr, ptr noalias %s) { ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[PTR:%.*]], null ; CHECK-NEXT: br i1 [[CMP]], label [[LOOP:%.*]], label [[BAIL_OUT:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 4 -; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[S:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[PTR]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR]], align 4 +; CHECK-NEXT: store <4 x i32> [[TMP0]], ptr [[S:%.*]], align 4 ; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop1: -; CHECK-NEXT: store i32 [[TMP3]], ptr [[S]], align 4 +; CHECK-NEXT: store i32 [[TMP1]], ptr [[S]], align 4 ; CHECK-NEXT: br i1 true, label [[LOOP1]], label [[CONT:%.*]] ; CHECK: cont: ; CHECK-NEXT: br i1 true, label [[LOOP]], label [[BAIL_OUT]] ; CHECK: bail_out: -; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[TMP3]], [[CONT]] ] +; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[TMP1]], [[CONT]] ] ; CHECK-NEXT: store i32 [[DUMMY_PHI]], ptr [[S]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll b/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll index b2bcdb178b21b..1b34d31eb623d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll @@ -60,12 +60,12 @@ define i32 @extr_user(ptr noalias nocapture %B, ptr noalias nocapture %A, i32 %n ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MUL238:%.*]] = add i32 [[M:%.*]], [[N:%.*]] ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP0]], [[TMP2]] -; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0 -; CHECK-NEXT: ret i32 [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[TMP0]], [[TMP3]] +; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: ret i32 [[TMP1]] ; entry: %0 = load i32, ptr %A, align 4 @@ -95,13 +95,14 @@ define i32 @extr_user1(ptr noalias nocapture %B, ptr noalias nocapture %A, i32 % ; CHECK-LABEL: @extr_user1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MUL238:%.*]] = add i32 [[M:%.*]], [[N:%.*]] -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP0]], [[TMP2]] -; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1 -; CHECK-NEXT: ret i32 [[TMP4]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[TMP0]], [[TMP3]] +; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: ret i32 [[TMP1]] ; entry: %0 = load i32, ptr %A, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll index 84f7e219f5066..f58379b46dc19 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll @@ -12,6 +12,7 @@ define i8 @test() { ; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP0]] to i32 ; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @c, align 2 ; CHECK-NEXT: [[CONV1:%.*]] = zext i16 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP13:%.*]] = or i32 [[CONV]], 32769 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[CONV]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i32> [[TMP3]], @@ -20,7 +21,6 @@ define i8 @test() { ; CHECK-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[TMP4]], [[TMP7]] ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP8]]) ; CHECK-NEXT: [[CONV4_30:%.*]] = trunc i32 [[TMP11]] to i8 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP4]], i32 7 ; CHECK-NEXT: [[XOR_31:%.*]] = and i32 [[TMP13]], -2 ; CHECK-NEXT: store i32 [[XOR_31]], ptr @d, align 4 ; CHECK-NEXT: ret i8 [[CONV4_30]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll index eb7498fea6f79..3b03ca13ea65d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll @@ -6,31 +6,30 @@ define i1 @test(float %0, double %1) { ; CHECK-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) { ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> , float [[TMP0]], i32 3 ; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> , <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x double> [[TMP8]], double [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP7]], <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x double> [[TMP10]], <2 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP11]], <4 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = fmul <4 x double> [[TMP9]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]] -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP15]], <8 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x double> [[TMP14]], <4 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP17]], <8 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x double> [[TMP18]], <8 x double> [[TMP19]], <8 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = fsub <8 x double> [[TMP16]], [[TMP20]] -; CHECK-NEXT: [[TMP22:%.*]] = fmul <8 x double> [[TMP16]], [[TMP20]] -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <8 x double> [[TMP21]], <8 x double> [[TMP22]], <8 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = fptrunc <8 x double> [[TMP23]] to <8 x float> -; CHECK-NEXT: [[TMP25:%.*]] = fmul <8 x float> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = fcmp oeq <8 x float> [[TMP25]], zeroinitializer -; CHECK-NEXT: [[TMP27:%.*]] = freeze <8 x i1> [[TMP26]] -; CHECK-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP27]]) -; CHECK-NEXT: ret i1 [[TMP28]] +; CHECK-NEXT: [[TMP5:%.*]] = fpext float 0.000000e+00 to double +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> zeroinitializer, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP8]], <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x double> , double [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> [[TMP10]], double [[TMP5]], i32 2 +; CHECK-NEXT: [[TMP12:%.*]] = fmul <4 x double> [[TMP9]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP14]], <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP16]], <8 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x double> [[TMP17]], <8 x double> [[TMP18]], <8 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = fsub <8 x double> [[TMP15]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = fmul <8 x double> [[TMP15]], [[TMP19]] +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x double> [[TMP20]], <8 x double> [[TMP21]], <8 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = fptrunc <8 x double> [[TMP22]] to <8 x float> +; CHECK-NEXT: [[TMP24:%.*]] = fmul <8 x float> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = fcmp oeq <8 x float> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = freeze <8 x i1> [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP26]]) +; CHECK-NEXT: ret i1 [[TMP27]] ; %3 = fpext float %0 to double %4 = fpext float 0.000000e+00 to double diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll index 1b54a604cd6f3..f90456297d7cb 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll @@ -6,8 +6,9 @@ define void @test() { ; CHECK-NEXT: bb: ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[TMP0:%.*]] = phi <8 x i32> [ poison, [[BB10:%.*]] ], [ zeroinitializer, [[BB:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[PHI7:%.*]] = phi i32 [ 0, [[BB10:%.*]] ], [ 0, [[BB:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <8 x i32> [ poison, [[BB10]] ], [ zeroinitializer, [[BB]] ] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> , i32 [[PHI7]], i32 0 ; CHECK-NEXT: switch i32 0, label [[BB16:%.*]] [ ; CHECK-NEXT: i32 0, label [[BB14:%.*]] ; CHECK-NEXT: i32 1, label [[BB11:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-vectorized-operand.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-vectorized-operand.ll index f1a5709d07f02..7a860719505f0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-vectorized-operand.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-vectorized-operand.ll @@ -4,19 +4,21 @@ define void @test() { ; CHECK-LABEL: define void @test() { ; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x ptr addrspace(1)> zeroinitializer, <2 x ptr addrspace(1)> zeroinitializer, <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x ptr addrspace(1)> [[TMP0]], i32 0 ; CHECK-NEXT: br label %[[BB43:.*]] ; CHECK: [[BB20:.*]]: ; CHECK-NEXT: br label %[[BB105:.*]] ; CHECK: [[BB43]]: -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x ptr addrspace(1)> [ [[TMP1:%.*]], %[[BB51:.*]] ], [ zeroinitializer, %[[BB]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x ptr addrspace(1)> [ [[TMP3:%.*]], %[[BB51:.*]] ], [ zeroinitializer, %[[BB]] ] ; CHECK-NEXT: br i1 false, label %[[BB105]], label %[[BB51]] ; CHECK: [[BB51]]: -; CHECK-NEXT: [[TMP1]] = phi <2 x ptr addrspace(1)> [ poison, %[[BB54:.*]] ], [ zeroinitializer, %[[BB43]] ] +; CHECK-NEXT: [[TMP3]] = phi <2 x ptr addrspace(1)> [ poison, %[[BB54:.*]] ], [ zeroinitializer, %[[BB43]] ] ; CHECK-NEXT: br label %[[BB43]] ; CHECK: [[BB54]]: ; CHECK-NEXT: br label %[[BB51]] ; CHECK: [[BB105]]: -; CHECK-NEXT: [[PHI106:%.*]] = phi ptr addrspace(1) [ null, %[[BB20]] ], [ null, %[[BB43]] ] +; CHECK-NEXT: [[PHI106:%.*]] = phi ptr addrspace(1) [ [[TMP1]], %[[BB20]] ], [ null, %[[BB43]] ] ; CHECK-NEXT: ret void ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll index 0eb18239ae3fb..6033e8def3436 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll @@ -89,10 +89,10 @@ define void @externally_used_ptrs() { ; CHECK-LABEL: @externally_used_ptrs( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr @a, align 8 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 11 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> -; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 11 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr [[ADD_PTR]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll index ba406c8f20bb0..73b73735da021 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll @@ -9,13 +9,14 @@ define void @test(double %i) { ; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> zeroinitializer, [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> , double [[I]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x double> zeroinitializer, [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; CHECK-NEXT: [[I75:%.*]] = fsub double 0.000000e+00, [[I]] ; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP3]], <4 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> [[TMP7]], <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x double> [[TMP8]], <8 x double> , <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP4]], i32 7 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> [[TMP7]], <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[TMP28]], <8 x double> , <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[I75]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> poison, <8 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = fmul <8 x double> zeroinitializer, [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = fadd <8 x double> zeroinitializer, [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = fadd <8 x double> [[TMP12]], zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll index f2b1c78ce0aac..aba45fe6bd519 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll @@ -9,10 +9,11 @@ define void @foo(double %i) { ; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> zeroinitializer, [[TMP0]] ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[I]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> zeroinitializer, [[TMP3]] -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP22]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> [[TMP5]], <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> , <8 x i32> +; CHECK-NEXT: [[I82:%.*]] = fsub double 0.000000e+00, poison +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[TMP8]], <8 x double> , <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x double> [[TMP5]], double [[I82]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <8 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = fmul <8 x double> , [[TMP7]] ; CHECK-NEXT: [[TMP13:%.*]] = fadd <8 x double> zeroinitializer, [[TMP12]] ; CHECK-NEXT: [[TMP14:%.*]] = fadd <8 x double> [[TMP13]], zeroinitializer @@ -26,6 +27,7 @@ define void @foo(double %i) { ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[TMP18]], i32 1 ; CHECK-NEXT: [[I118:%.*]] = fadd double [[TMP19]], [[TMP20]] ; CHECK-NEXT: [[TMP21:%.*]] = fmul <4 x double> zeroinitializer, [[TMP1]] +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> ; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x double> , <4 x double> [[TMP22]], <4 x i32> ; CHECK-NEXT: [[TMP24:%.*]] = fadd <4 x double> [[TMP21]], [[TMP23]] ; CHECK-NEXT: [[TMP25:%.*]] = fadd <4 x double> [[TMP24]], zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll index 0222e0aaeea3e..783eca2221357 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll @@ -4,11 +4,12 @@ define double @test() { ; CHECK-LABEL: define double @test() { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 6), align 16 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 5), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 9), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 8), align 16 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> , <4 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x double> , double [[TMP3]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP17]], double [[TMP2]], i32 3 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = call reassoc nsz double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP5]]) ; CHECK-NEXT: [[TMP7:%.*]] = fmul double [[TMP6]], 0.000000e+00 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extracts-with-undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/extracts-with-undefs.ll index abf277fb8ba34..b6de2d4fbcb11 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extracts-with-undefs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extracts-with-undefs.ll @@ -7,7 +7,7 @@ define void @test() { ; CHECK-NEXT: br label [[BODY:%.*]] ; CHECK: body: ; CHECK-NEXT: [[PHI1:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ 0.000000e+00, [[BODY]] ] -; CHECK-NEXT: [[PHI2:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ 0.000000e+00, [[BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ zeroinitializer, [[BODY]] ] ; CHECK-NEXT: [[MUL_I478_I:%.*]] = fmul fast double [[PHI1]], 0.000000e+00 ; CHECK-NEXT: [[MUL7_I485_I:%.*]] = fmul fast double undef, 0.000000e+00 ; CHECK-NEXT: [[ADD8_I_I:%.*]] = fadd fast double [[MUL_I478_I]], [[MUL7_I485_I]] @@ -16,16 +16,15 @@ define void @test() { ; CHECK: exit: ; CHECK-NEXT: br i1 false, label [[IF_THEN135_I:%.*]], label [[IF_END209_I:%.*]] ; CHECK: if.then135.i: -; CHECK-NEXT: [[CMP145_I:%.*]] = fcmp fast olt double [[PHI1]], 0.000000e+00 -; CHECK-NEXT: [[CMP152_I:%.*]] = fcmp fast olt double [[PHI2]], 0.000000e+00 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i1> , i1 [[CMP152_I]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> [[TMP0]], <2 x double> zeroinitializer, <2 x double> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x double> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x double> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = fcmp fast olt <2 x double> [[TMP0]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i1> , <2 x i1> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = select <2 x i1> [[TMP2]], <2 x double> zeroinitializer, <2 x double> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x double> zeroinitializer, [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <2 x double> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP5]], zeroinitializer ; CHECK-NEXT: br label [[IF_END209_I]] ; CHECK: if.end209.i: -; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x double> [ [[TMP4]], [[IF_THEN135_I]] ], [ zeroinitializer, [[EXIT]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x double> [ [[TMP6]], [[IF_THEN135_I]] ], [ zeroinitializer, [[EXIT]] ] ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll b/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll index f197b2480d61c..fa33621de5ae7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll @@ -25,6 +25,7 @@ define i64 @foo() { ; ; FORCED-LABEL: define i64 @foo() { ; FORCED-NEXT: bb: +; FORCED-NEXT: [[TMP8:%.*]] = add i64 0, 0 ; FORCED-NEXT: br label [[BB3:%.*]] ; FORCED: bb1: ; FORCED-NEXT: [[TMP0:%.*]] = phi <2 x i64> [ [[TMP5:%.*]], [[BB3]] ] @@ -38,7 +39,6 @@ define i64 @foo() { ; FORCED-NEXT: [[TMP5]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> ; FORCED-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> , <2 x i32> ; FORCED-NEXT: [[TMP7]] = add <2 x i64> [[TMP6]], [[TMP2]] -; FORCED-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1 ; FORCED-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr i64, ptr addrspace(1) null, i64 [[TMP8]] ; FORCED-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 ; FORCED-NEXT: [[ICMP:%.*]] = icmp ult i64 [[TMP9]], 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gep-with-extractelement-many-users.ll b/llvm/test/Transforms/SLPVectorizer/X86/gep-with-extractelement-many-users.ll index 37d166953c333..cea95c1102497 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gep-with-extractelement-many-users.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gep-with-extractelement-many-users.ll @@ -4,9 +4,7 @@ define void @test() { ; CHECK-LABEL: define void @test() { ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x ptr> zeroinitializer, i32 0 -; CHECK-NEXT: [[GETELEMENTPTR6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 872 -; CHECK-NEXT: store double 0.000000e+00, ptr [[GETELEMENTPTR6]], align 8 +; CHECK-NEXT: store double 0.000000e+00, ptr inttoptr (i64 872 to ptr), align 8 ; CHECK-NEXT: br label [[BB9:%.*]] ; CHECK: bb9: ; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x ptr> [ getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> ), [[BB:%.*]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll index e94dd2119270c..e0d7c12f70c2e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll @@ -14,6 +14,7 @@ define dso_local i32 @g() local_unnamed_addr { ; CHECK-NEXT: [[A_020:%.*]] = phi ptr [ [[A_020_BE:%.*]], [[WHILE_BODY_BACKEDGE:%.*]] ], [ undef, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x ptr> [ [[TMP14:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ] ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 1 ; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[TMP2]] to i64 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[A_020]], i64 1 @@ -32,7 +33,6 @@ define dso_local i32 @g() local_unnamed_addr { ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: sw.bb6: ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[A_020]], i64 2 -; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 1 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[INCDEC_PTR]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll index 7ff4a1a231c22..6956178518215 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -968,11 +968,11 @@ define i32 @wobble(i32 %arg, i32 %bar) { ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[BAR:%.*]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = sext <4 x i1> [[TMP6]] to <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[X4:%.*]] = xor i32 [[ARG]], [[BAR]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP7]], [[X4]] ; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], [[ARG]] ; CHECK-NEXT: ret i32 [[OP_RDX1]] ; @@ -983,11 +983,11 @@ define i32 @wobble(i32 %arg, i32 %bar) { ; THRESHOLD-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[BAR:%.*]], i32 0 ; THRESHOLD-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer ; THRESHOLD-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[TMP3]] -; THRESHOLD-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 -; THRESHOLD-NEXT: [[TMP6:%.*]] = icmp eq <4 x i32> [[TMP4]], zeroinitializer -; THRESHOLD-NEXT: [[TMP7:%.*]] = sext <4 x i1> [[TMP6]] to <4 x i32> -; THRESHOLD-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]]) -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP8]], [[TMP5]] +; THRESHOLD-NEXT: [[X4:%.*]] = xor i32 [[ARG]], [[BAR]] +; THRESHOLD-NEXT: [[TMP5:%.*]] = icmp eq <4 x i32> [[TMP4]], zeroinitializer +; THRESHOLD-NEXT: [[TMP6:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32> +; THRESHOLD-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]]) +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP7]], [[X4]] ; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], [[ARG]] ; THRESHOLD-NEXT: ret i32 [[OP_RDX1]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insertelement-uses-vectorized-index.ll b/llvm/test/Transforms/SLPVectorizer/X86/insertelement-uses-vectorized-index.ll index 799d0a055d5c4..78b3f8b101284 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/insertelement-uses-vectorized-index.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insertelement-uses-vectorized-index.ll @@ -7,6 +7,7 @@ define void @test(ptr %0) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> , ptr [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint <2 x ptr> [[TMP1]] to <2 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr null to i64 ; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32> ; CHECK-NEXT: switch i32 0, label %[[NEWFUNCROOT994:.*]] [ ; CHECK-NEXT: i32 1, label %[[NEWFUNCROOT994]] @@ -17,7 +18,6 @@ define void @test(ptr %0) { ; CHECK-NEXT: ret void ; CHECK: [[NEWFUNCROOT994]]: ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[TMP5]], i64 [[TMP6]] ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll index 2a9e40156420a..5a28581913b8c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll @@ -201,8 +201,9 @@ entry: define void @lookahead_external_uses(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S, ptr %Ext1, ptr %Ext2) { ; CHECK-LABEL: @lookahead_external_uses( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 1 ; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 2 -; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 2 +; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 2 ; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1 ; CHECK-NEXT: [[B0:%.*]] = load double, ptr [[B]], align 8 ; CHECK-NEXT: [[C0:%.*]] = load double, ptr [[C:%.*]], align 8 @@ -210,6 +211,7 @@ define void @lookahead_external_uses(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S, ptr ; CHECK-NEXT: [[B2:%.*]] = load double, ptr [[IDXB2]], align 8 ; CHECK-NEXT: [[A2:%.*]] = load double, ptr [[IDXA2]], align 8 ; CHECK-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8 +; CHECK-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1 @@ -221,8 +223,7 @@ define void @lookahead_external_uses(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S, ptr ; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP8]] ; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[S:%.*]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP0]], i32 1 -; CHECK-NEXT: store double [[TMP10]], ptr [[EXT1:%.*]], align 8 +; CHECK-NEXT: store double [[A1]], ptr [[EXT1:%.*]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -279,8 +280,9 @@ entry: define void @lookahead_limit_users_budget(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S, ptr %Ext1, ptr %Ext2, ptr %Ext3, ptr %Ext4, ptr %Ext5) { ; CHECK-LABEL: @lookahead_limit_users_budget( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 1 ; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 2 -; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 2 +; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 2 ; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1 ; CHECK-NEXT: [[B0:%.*]] = load double, ptr [[B]], align 8 ; CHECK-NEXT: [[C0:%.*]] = load double, ptr [[C:%.*]], align 8 @@ -288,6 +290,7 @@ define void @lookahead_limit_users_budget(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S ; CHECK-NEXT: [[B2:%.*]] = load double, ptr [[IDXB2]], align 8 ; CHECK-NEXT: [[A2:%.*]] = load double, ptr [[IDXA2]], align 8 ; CHECK-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8 +; CHECK-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1 @@ -299,10 +302,9 @@ define void @lookahead_limit_users_budget(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S ; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP8]] ; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[S:%.*]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP0]], i32 1 -; CHECK-NEXT: store double [[TMP10]], ptr [[EXT1:%.*]], align 8 -; CHECK-NEXT: store double [[TMP10]], ptr [[EXT2:%.*]], align 8 -; CHECK-NEXT: store double [[TMP10]], ptr [[EXT3:%.*]], align 8 +; CHECK-NEXT: store double [[A1]], ptr [[EXT1:%.*]], align 8 +; CHECK-NEXT: store double [[A1]], ptr [[EXT2:%.*]], align 8 +; CHECK-NEXT: store double [[A1]], ptr [[EXT3:%.*]], align 8 ; CHECK-NEXT: store double [[B1]], ptr [[EXT4:%.*]], align 8 ; CHECK-NEXT: store double [[B1]], ptr [[EXT5:%.*]], align 8 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll index a8d481a3e28a5..2a5bfa7390770 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll @@ -5,21 +5,16 @@ define i32 @test() { ; CHECK-LABEL: define i32 @test() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A_PROMOTED:%.*]] = load i8, ptr null, align 1 -; CHECK-NEXT: [[DEC_4:%.*]] = add i8 [[A_PROMOTED]], 0 -; CHECK-NEXT: [[CONV_I_4:%.*]] = zext i8 [[DEC_4]] to i32 -; CHECK-NEXT: [[SUB_I_4:%.*]] = add nuw nsw i32 [[CONV_I_4]], 0 -; CHECK-NEXT: [[DEC_5:%.*]] = add i8 [[A_PROMOTED]], 0 -; CHECK-NEXT: [[CONV_I_5:%.*]] = zext i8 [[DEC_5]] to i32 -; CHECK-NEXT: [[SUB_I_5:%.*]] = add nuw nsw i32 [[CONV_I_5]], 65535 -; CHECK-NEXT: [[TMP0:%.*]] = or i32 [[SUB_I_4]], [[SUB_I_5]] -; CHECK-NEXT: [[DEC_6:%.*]] = or i8 [[A_PROMOTED]], 0 -; CHECK-NEXT: [[CONV_I_6:%.*]] = zext i8 [[DEC_6]] to i32 -; CHECK-NEXT: [[SUB_I_6:%.*]] = add nuw nsw i32 [[CONV_I_6]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[TMP0]], [[SUB_I_6]] ; CHECK-NEXT: [[TMP10:%.*]] = or i8 [[A_PROMOTED]], 0 -; CHECK-NEXT: [[CONV_I_7:%.*]] = zext i8 [[TMP10]] to i32 -; CHECK-NEXT: [[SUB_I_7:%.*]] = add nuw nsw i32 [[CONV_I_7]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = or i32 [[TMP1]], [[SUB_I_7]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i8> poison, i8 [[A_PROMOTED]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i8> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i8> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i16> [[TMP5]], +; CHECK-NEXT: [[TMP7:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = zext i16 [[TMP7]] to i32 ; CHECK-NEXT: [[TMP9:%.*]] = and i32 [[TMP8]], 65535 ; CHECK-NEXT: store i8 [[TMP10]], ptr null, align 1 ; CHECK-NEXT: [[CALL3:%.*]] = tail call i32 (ptr, ...) null(ptr null, i32 [[TMP9]]) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll index 9df2b9a8e8f3e..61938d01e57ac 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll @@ -345,7 +345,7 @@ define void @good_load_order() { ; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr @a, align 16 ; CHECK-NEXT: br label [[FOR_BODY3:%.*]] ; CHECK: for.body3: -; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP12:%.*]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP13:%.*]], [[FOR_BODY3]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ] ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1 @@ -355,17 +355,17 @@ define void @good_load_order() { ; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 4 ; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x float> [[TMP7]], [[TMP9]] -; CHECK-NEXT: store <4 x float> [[TMP10]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX31]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]] +; CHECK-NEXT: store <4 x float> [[TMP11]], ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5 -; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP11]] -; CHECK-NEXT: [[TMP12]] = load float, ptr [[ARRAYIDX41]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP7]], i64 3 -; CHECK-NEXT: [[MUL45:%.*]] = fmul float [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP12]] +; CHECK-NEXT: [[TMP13]] = load float, ptr [[ARRAYIDX41]], align 4 +; CHECK-NEXT: [[MUL45:%.*]] = fmul float [[TMP13]], [[TMP7]] ; CHECK-NEXT: store float [[MUL45]], ptr [[ARRAYIDX31]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP14]], 31995 @@ -380,7 +380,7 @@ define void @good_load_order() { ; SSE2-NEXT: [[TMP0:%.*]] = load float, ptr @a, align 16 ; SSE2-NEXT: br label [[FOR_BODY3:%.*]] ; SSE2: for.body3: -; SSE2-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP12:%.*]], [[FOR_BODY3]] ] +; SSE2-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP13:%.*]], [[FOR_BODY3]] ] ; SSE2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ] ; SSE2-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; SSE2-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1 @@ -390,17 +390,17 @@ define void @good_load_order() { ; SSE2-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; SSE2-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 4 ; SSE2-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP6]] -; SSE2-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 -; SSE2-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> -; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP1]], i64 0 -; SSE2-NEXT: [[TMP10:%.*]] = fmul <4 x float> [[TMP7]], [[TMP9]] -; SSE2-NEXT: store <4 x float> [[TMP10]], ptr [[ARRAYIDX5]], align 4 +; SSE2-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX31]], align 4 +; SSE2-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 +; SSE2-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP1]], i64 0 +; SSE2-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]] +; SSE2-NEXT: store <4 x float> [[TMP11]], ptr [[ARRAYIDX5]], align 4 ; SSE2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5 -; SSE2-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; SSE2-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP11]] -; SSE2-NEXT: [[TMP12]] = load float, ptr [[ARRAYIDX41]], align 4 -; SSE2-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP7]], i64 3 -; SSE2-NEXT: [[MUL45:%.*]] = fmul float [[TMP12]], [[TMP13]] +; SSE2-NEXT: [[TMP12:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; SSE2-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP12]] +; SSE2-NEXT: [[TMP13]] = load float, ptr [[ARRAYIDX41]], align 4 +; SSE2-NEXT: [[MUL45:%.*]] = fmul float [[TMP13]], [[TMP7]] ; SSE2-NEXT: store float [[MUL45]], ptr [[ARRAYIDX31]], align 4 ; SSE2-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; SSE2-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP14]], 31995 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll b/llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll index efb11d2756c3c..e1c794a6fd279 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll @@ -10,15 +10,15 @@ define void @f(i1 %x) #0 { ; CHECK-LABEL: @f( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([[STRUCT_A:%.*]], ptr @a, i32 0, i32 0, i32 1), align 8 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr @a, align 8 ; CHECK-NEXT: br i1 [[X:%.*]], label [[WHILE_BODY_LR_PH:%.*]], label [[WHILE_END:%.*]] ; CHECK: while.body.lr.ph: -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 -; CHECK-NEXT: [[ICMP_A1:%.*]] = icmp eq i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @b, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i1> poison, i1 [[ICMP_A1]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i1> [[TMP3]], <2 x i1> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = select <2 x i1> [[SHUFFLE]], <2 x i64> [[TMP2]], <2 x i64> [[TMP0]] +; CHECK-NEXT: [[ICMP_A1:%.*]] = icmp eq i64 [[A1]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @b, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i1> poison, i1 [[ICMP_A1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i1> [[TMP2]], <2 x i1> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = select <2 x i1> [[TMP3]], <2 x i64> [[TMP1]], <2 x i64> [[TMP0]] ; CHECK-NEXT: br label [[WHILE_END]] ; CHECK: while.end: ; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i64> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP4]], [[WHILE_BODY_LR_PH]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr27163.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr27163.ll index ea6989b8bbabb..9979bb9170d48 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr27163.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr27163.ll @@ -8,19 +8,19 @@ target triple = "x86_64-pc-windows-msvc18.0.0" define void @test1(ptr %p) personality ptr @__CxxFrameHandler3 { ; CHECK-LABEL: @test1( ; CHECK-NEXT: invoke.cont: -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P:%.*]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 -; CHECK-NEXT: store <2 x i64> [[TMP1]], ptr [[P]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[P:%.*]], align 8 +; CHECK-NEXT: [[LOAD1:%.*]] = load i64, ptr [[P]], align 8 +; CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[P]], align 8 ; CHECK-NEXT: invoke void @throw() -; CHECK-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[CATCH_DISPATCH:%.*]] +; CHECK-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[CATCH_DISPATCH:%.*]] ; CHECK: catch.dispatch: ; CHECK-NEXT: [[CS:%.*]] = catchswitch within none [label %invoke.cont1] unwind label [[EHCLEANUP:%.*]] ; CHECK: invoke.cont1: ; CHECK-NEXT: [[CATCH:%.*]] = catchpad within [[CS]] [ptr null, i32 64, ptr null] ; CHECK-NEXT: invoke void @throw() [ "funclet"(token [[CATCH]]) ] -; CHECK-NEXT: to label [[UNREACHABLE]] unwind label [[EHCLEANUP]] +; CHECK-NEXT: to label [[UNREACHABLE]] unwind label [[EHCLEANUP]] ; CHECK: ehcleanup: -; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[TMP2]], [[CATCH_DISPATCH]] ], [ 9, [[INVOKE_CONT1:%.*]] ] +; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[LOAD1]], [[CATCH_DISPATCH]] ], [ 9, [[INVOKE_CONT1:%.*]] ] ; CHECK-NEXT: [[CLEANUP:%.*]] = cleanuppad within none [] ; CHECK-NEXT: call void @release(i64 [[PHI]]) [ "funclet"(token [[CLEANUP]]) ] ; CHECK-NEXT: cleanupret from [[CLEANUP]] unwind to caller diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-value-replace-extractelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-value-replace-extractelement.ll index edf8756fd06df..5cbf78435233b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-value-replace-extractelement.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-value-replace-extractelement.ll @@ -4,6 +4,7 @@ define void @test() { ; CHECK-LABEL: define void @test() { ; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 0 to i32 ; CHECK-NEXT: br label %[[BB1:.*]] ; CHECK: [[BB1]]: ; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP4:%.*]], %[[BB1]] ] @@ -11,7 +12,7 @@ define void @test() { ; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> zeroinitializer) ; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i32 ; CHECK-NEXT: [[OP_RDX:%.*]] = mul i32 [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = mul i32 [[OP_RDX]], 0 +; CHECK-NEXT: [[OP_RDX1:%.*]] = mul i32 [[OP_RDX]], [[TRUNC]] ; CHECK-NEXT: [[TMP4]] = insertelement <2 x i32> , i32 [[OP_RDX1]], i32 1 ; CHECK-NEXT: br label %[[BB1]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll index 254525c942356..6d6dd502415e5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll @@ -6,10 +6,11 @@ define void @test() { ; CHECK-NEXT: br i1 false, label [[PH:%.*]], label [[EXIT:%.*]] ; CHECK: ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> zeroinitializer) -; CHECK-NEXT: [[OP_RDX2:%.*]] = and i8 0, [[TMP0]] +; CHECK-NEXT: [[OP_RDX:%.*]] = and i8 0, [[TMP0]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = and i8 [[OP_RDX]], 0 ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: -; CHECK-NEXT: [[PHI:%.*]] = phi i8 [ [[OP_RDX2]], [[PH]] ], [ 0, [[BB:%.*]] ] +; CHECK-NEXT: [[PHI:%.*]] = phi i8 [ [[OP_RDX1]], [[PH]] ], [ 0, [[BB:%.*]] ] ; CHECK-NEXT: ret void ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-node.ll index 1940e1bc8d18a..3a456798d7818 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-node.ll @@ -7,8 +7,9 @@ define void @test(ptr noalias %arg, ptr noalias %arg1, ptr %arg2) { ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP_I_I:%.*]] = getelementptr i8, ptr [[ARG1]], i64 24 ; CHECK-NEXT: [[TMP_I_I4:%.*]] = getelementptr i8, ptr [[ARG]], i64 24 +; CHECK-NEXT: [[TMP_I_I13:%.*]] = getelementptr i8, ptr [[ARG1]], i64 28 +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[TMP_I_I13]], align 4 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[TMP_I_I]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; CHECK-NEXT: store float [[TMP1]], ptr [[ARG2]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = fcmp olt <4 x float> [[TMP0]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reordering-single-phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/reordering-single-phi.ll index d88135df5c96a..bc1eaaac5d1bb 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reordering-single-phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reordering-single-phi.ll @@ -15,6 +15,7 @@ define void @test() { ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4 ; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX31]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP14]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP0]], i32 0 @@ -23,7 +24,6 @@ define void @test() { ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV_NEXT]] ; CHECK-NEXT: [[TMP16]] = load float, ptr [[ARRAYIDX41]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP14]], i32 3 ; CHECK-NEXT: [[MUL45:%.*]] = fmul fast float [[TMP16]], [[TMP6]] ; CHECK-NEXT: store float [[MUL45]], ptr [[ARRAYIDX31]], align 4 ; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i64 [[INDVARS_IV]], 31990 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll index 9df7aa1c727c8..8fa84699a267c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll @@ -5,23 +5,22 @@ define void @test(i32 %0, ptr %p) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: i32 [[TMP0:%.*]], ptr [[P:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[PH:%.*]] ; CHECK: ph: -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> , i32 [[TMP0]], i32 2 ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: -; CHECK-NEXT: [[TMP5:%.*]] = phi <4 x i32> [ [[TMP2]], [[ENTRY:%.*]] ], [ zeroinitializer, [[PH]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x i32> [ [[TMP2]], [[ENTRY]] ], [ [[TMP4]], [[PH]] ] -; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i32> [ [[TMP2]], [[ENTRY]] ], [ zeroinitializer, [[PH]] ] -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]]) -; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP6]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = or i32 [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP9:%.*]] = phi <8 x i32> [ [[TMP8]], [[ENTRY:%.*]] ], [ [[TMP6]], [[PH]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i32> [ [[TMP5]], [[ENTRY]] ], [ zeroinitializer, [[PH]] ] +; CHECK-NEXT: [[OP_RDX:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP9]]) ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP7]]) -; CHECK-NEXT: [[OP_RDX1:%.*]] = or i32 [[OP_RDX]], [[TMP10]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = or i32 [[OP_RDX1]], [[TMP3]] +; CHECK-NEXT: [[OP_RDX5:%.*]] = or i32 [[TMP10]], [[TMP3]] +; CHECK-NEXT: [[OP_RDX2:%.*]] = or i32 [[OP_RDX5]], [[OP_RDX]] ; CHECK-NEXT: store i32 [[OP_RDX2]], ptr [[P]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll b/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll index f1be11d0d0fc5..8bcf650d41d93 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll @@ -5,6 +5,7 @@ define void @test(i32 %arg) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: i32 [[ARG:%.*]]) { ; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[ARG]] to i64 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[ARG]], i32 0 ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: @@ -14,8 +15,6 @@ define void @test(i32 %arg) { ; CHECK-NEXT: i32 1, label [[BB4:%.*]] ; CHECK-NEXT: ] ; CHECK: bb3: -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 ; CHECK-NEXT: switch i32 0, label [[BB10]] [ ; CHECK-NEXT: i32 18, label [[BB7:%.*]] ; CHECK-NEXT: i32 1, label [[BB7]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll index e146a0a365a84..55e155840f858 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll @@ -6,10 +6,11 @@ define i16 @D134605() { ; CHECK-LABEL: @D134605( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr poison, align 1 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[TMP0]], i32 3 -; CHECK-NEXT: [[REASS_ADD:%.*]] = add i16 poison, [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP0]]) +; CHECK-NEXT: [[ARRAYIDX81:%.*]] = getelementptr inbounds [32 x i16], ptr poison, i16 0, i16 3 +; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX81]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr poison, align 1 +; CHECK-NEXT: [[REASS_ADD:%.*]] = add i16 poison, [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP1]]) ; CHECK-NEXT: [[TMP3:%.*]] = mul i16 [[TMP2]], 2 ; CHECK-NEXT: [[OP_RDX:%.*]] = add i16 [[TMP3]], poison ; CHECK-NEXT: [[REASS_MUL24:%.*]] = shl i16 [[OP_RDX]], 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll b/llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll index 51ce40b7a178b..d9496a3e3e343 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll @@ -5,15 +5,17 @@ define dso_local void @rftbsub(ptr %a) local_unnamed_addr #0 { ; CHECK-LABEL: @rftbsub( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 2 +; CHECK-NEXT: [[TMP0:%.*]] = or disjoint i64 2, 1 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP0]] ; CHECK-NEXT: [[SUB22:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP2]], undef +; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX12]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 8 +; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP1]], undef ; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[ADD16]] ; CHECK-NEXT: [[ADD19:%.*]] = fadd double undef, [[MUL18]] ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[ADD19]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[SUB22]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] ; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[ARRAYIDX6]], align 8 ; CHECK-NEXT: unreachable ;