diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index b5f87e458833d..0fde9bbacf801 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -161,33 +161,34 @@ class VPBuilder { return tryInsertInstruction( new VPInstruction(Opcode, Operands, WrapFlags, DL, Name)); } - VPValue *createNot(VPValue *Operand, DebugLoc DL = {}, - const Twine &Name = "") { + VPInstruction *createNot(VPValue *Operand, DebugLoc DL = {}, + const Twine &Name = "") { return createInstruction(VPInstruction::Not, {Operand}, DL, Name); } - VPValue *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL = {}, - const Twine &Name = "") { + VPInstruction *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL = {}, + const Twine &Name = "") { return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL, Name); } - VPValue *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL = {}, - const Twine &Name = "") { + VPInstruction *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL = {}, + const Twine &Name = "") { return tryInsertInstruction(new VPInstruction( Instruction::BinaryOps::Or, {LHS, RHS}, VPRecipeWithIRFlags::DisjointFlagsTy(false), DL, Name)); } - VPValue *createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL = {}, - const Twine &Name = "") { + VPInstruction *createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL = {}, + const Twine &Name = "") { return tryInsertInstruction( new VPInstruction(VPInstruction::LogicalAnd, {LHS, RHS}, DL, Name)); } - VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, - DebugLoc DL = {}, const Twine &Name = "", - std::optional FMFs = std::nullopt) { + VPInstruction * + createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, + DebugLoc DL = {}, const Twine &Name = "", + std::optional FMFs = std::nullopt) { auto *Select = FMFs ? new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal}, *FMFs, DL, Name) @@ -199,8 +200,8 @@ class VPBuilder { /// Create a new ICmp VPInstruction with predicate \p Pred and operands \p A /// and \p B. /// TODO: add createFCmp when needed. - VPValue *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, - DebugLoc DL = {}, const Twine &Name = ""); + VPInstruction *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, + DebugLoc DL = {}, const Twine &Name = ""); //===--------------------------------------------------------------------===// // RAII helpers. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 0200525a718d5..f29271883f161 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6931,8 +6931,9 @@ void LoopVectorizationCostModel::collectInLoopReductions() { } } -VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, - DebugLoc DL, const Twine &Name) { +VPInstruction *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, + VPValue *B, DebugLoc DL, + const Twine &Name) { assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE && Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate"); return tryInsertInstruction( diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 5f86f2c969651..935419c8f9c70 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -75,8 +75,9 @@ template struct specific_intval { if (!CI) return false; - assert((BitWidth == 0 || CI->getBitWidth() == BitWidth) && - "Trying the match constant with unexpected bitwidth."); + if (BitWidth != 0 && CI->getBitWidth() != BitWidth) + return false; + return APInt::isSameValue(CI->getValue(), Val); } }; @@ -87,6 +88,8 @@ inline specific_intval<0> m_SpecificInt(uint64_t V) { inline specific_intval<1> m_False() { return specific_intval<1>(APInt(64, 0)); } +inline specific_intval<1> m_True() { return specific_intval<1>(APInt(64, 1)); } + /// Matching combinators template struct match_combine_or { LTy L; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 9796ee64f6ef9..8d80db9a138f8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -25,6 +25,7 @@ #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/PatternMatch.h" +#include using namespace llvm; @@ -852,8 +853,10 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) { } } -/// Try to simplify recipe \p R. -static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { +/// Try to simplify recipe \p R. Returns any new recipes introduced during +/// simplification, as candidates for further simplification. +static SmallVector +simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo, VPlan &Plan) { using namespace llvm::VPlanPatternMatch; if (auto *Blend = dyn_cast(&R)) { @@ -868,11 +871,11 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { if (UniqueValues.size() == 1) { Blend->replaceAllUsesWith(*UniqueValues.begin()); Blend->eraseFromParent(); - return; + return {}; } if (Blend->isNormalized()) - return; + return {}; // Normalize the blend so its first incoming value is used as the initial // value with the others blended into it. @@ -907,7 +910,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { Blend->replaceAllUsesWith(NewBlend); Blend->eraseFromParent(); recursivelyDeleteDeadRecipes(DeadMask); - return; + return {}; } VPValue *A; @@ -920,7 +923,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { } else { // Don't replace a scalarizing recipe with a widened cast. if (isa(&R)) - return; + return {}; if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) { unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue())) @@ -955,24 +958,73 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV)); } #endif + return {}; + } + + VPValue *X, *X1, *Y, *Z; + LLVMContext &Ctx = TypeInfo.getContext(); + + // (X || !X) -> true. + if (match(&R, m_c_BinaryOr(m_VPValue(X), m_Not(m_VPValue(X1)))) && X == X1) { + VPValue *VPV = Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx)); + R.getVPSingleValue()->replaceAllUsesWith(VPV); + return {}; } - // Simplify (X && Y) || (X && !Y) -> X. - // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X - // && (Y || Z) and (X || !X) into true. This requires queuing newly created - // recipes to be visited during simplification. - VPValue *X, *Y, *X1, *Y1; - if (match(&R, - m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)), - m_LogicalAnd(m_VPValue(X1), m_Not(m_VPValue(Y1))))) && - X == X1 && Y == Y1) { + // (X || true) -> true. + if (match(&R, m_c_BinaryOr(m_VPValue(X), m_True()))) { + VPValue *VPV = Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx)); + R.getVPSingleValue()->replaceAllUsesWith(VPV); + return {}; + } + + // (X || false) -> X. + if (match(&R, m_c_BinaryOr(m_VPValue(X), m_False()))) { R.getVPSingleValue()->replaceAllUsesWith(X); + return {}; + } + + // (X && !X) -> false. + if (match(&R, m_LogicalAnd(m_VPValue(X), m_Not(m_VPValue(X1)))) && X == X1) { + VPValue *VPV = Plan.getOrAddLiveIn(ConstantInt::getFalse(Ctx)); + R.getVPSingleValue()->replaceAllUsesWith(VPV); + return {}; + } + + // (X && true) -> X. + if (match(&R, m_LogicalAnd(m_VPValue(X), m_True()))) { + R.getVPSingleValue()->replaceAllUsesWith(X); + return {}; + } + + // (X && false) -> false. + if (match(&R, m_LogicalAnd(m_VPValue(X), m_False()))) { + VPValue *VPV = Plan.getOrAddLiveIn(ConstantInt::getFalse(Ctx)); + R.getVPSingleValue()->replaceAllUsesWith(VPV); + return {}; + } + + // (X * 1) -> X. + if (match(&R, m_c_Mul(m_VPValue(X), m_SpecificInt(1)))) { + R.getVPSingleValue()->replaceAllUsesWith(X); + return {}; + } + + // (X && Y) || (X && Z) -> X && (Y || Z). + if (match(&R, m_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)), + m_LogicalAnd(m_VPValue(X1), m_VPValue(Z)))) && + X == X1) { + VPBuilder Builder(&R); + VPInstruction *YorZ = Builder.createOr(Y, Z, R.getDebugLoc()); + VPInstruction *VPI = Builder.createLogicalAnd(X, YorZ, R.getDebugLoc()); + R.getVPSingleValue()->replaceAllUsesWith(VPI); R.eraseFromParent(); - return; + // Order of simplification matters: simplify sub-recipes before root + // recipes. + return {YorZ, VPI}; } - if (match(&R, m_c_Mul(m_VPValue(A), m_SpecificInt(1)))) - return R.getVPSingleValue()->replaceAllUsesWith(A); + return {}; } /// Try to simplify the recipes in \p Plan. @@ -981,8 +1033,17 @@ static void simplifyRecipes(VPlan &Plan, LLVMContext &Ctx) { Plan.getEntry()); VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(), Ctx); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { - for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { - simplifyRecipe(R, TypeInfo); + // Order of simplification matters: add new candidates for simplification to + // the back of the Worklist, while the Worklist processes recipes from the + // front. + std::deque Worklist; + for (auto &R : make_early_inc_range(*VPBB)) { + Worklist.emplace_front(&R); + while (!Worklist.empty()) { + VPRecipeBase *R = Worklist.front(); + Worklist.pop_front(); + append_range(Worklist, simplifyRecipe(*R, TypeInfo, Plan)); + } } } } diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll index 07a1cca1bc21e..203abe6c91312 100644 --- a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll +++ b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll @@ -7,8 +7,6 @@ define void @test(ptr %p, i40 %a) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i40> poison, i40 [[A]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i40> [[BROADCAST_SPLATINSERT1]], <16 x i40> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE32:%.*]] ] @@ -16,126 +14,102 @@ define void @test(ptr %p, i40 %a) { ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[VEC_IV:%.*]] = add <16 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <16 x i32> [[VEC_IV]], -; CHECK-NEXT: [[TMP1:%.*]] = shl <16 x i40> [[BROADCAST_SPLAT2]], -; CHECK-NEXT: [[TMP2:%.*]] = ashr <16 x i40> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = trunc <16 x i40> [[TMP2]] to <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = trunc <16 x i32> [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <16 x i1> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult <16 x i1> zeroinitializer, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i1> [[TMP6]], -; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <16 x i1> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i1> [[TMP0]], i32 0 ; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0 -; CHECK-NEXT: store i1 [[TMP10]], ptr [[P]], align 1 +; CHECK-NEXT: store i1 false, ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x i1> [[TMP0]], i32 1 ; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] -; CHECK: pred.store.if3: -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i1> [[TMP8]], i32 1 -; CHECK-NEXT: store i1 [[TMP12]], ptr [[P]], align 1 +; CHECK: pred.store.if1: +; CHECK-NEXT: store i1 false, ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] +; CHECK: pred.store.continue2: +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x i1> [[TMP0]], i32 2 +; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]] +; CHECK: pred.store.if3: +; CHECK-NEXT: store i1 false, ptr [[P]], align 1 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE5]] ; CHECK: pred.store.continue4: -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x i1> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x i1> [[TMP0]], i32 3 ; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] ; CHECK: pred.store.if5: -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i1> [[TMP8]], i32 2 -; CHECK-NEXT: store i1 [[TMP14]], ptr [[P]], align 1 +; CHECK-NEXT: store i1 false, ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] ; CHECK: pred.store.continue6: -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i1> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i1> [[TMP0]], i32 4 ; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] ; CHECK: pred.store.if7: -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i1> [[TMP8]], i32 3 -; CHECK-NEXT: store i1 [[TMP16]], ptr [[P]], align 1 +; CHECK-NEXT: store i1 false, ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] ; CHECK: pred.store.continue8: -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP0]], i32 4 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP0]], i32 5 ; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] ; CHECK: pred.store.if9: -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i1> [[TMP8]], i32 4 -; CHECK-NEXT: store i1 [[TMP18]], ptr [[P]], align 1 +; CHECK-NEXT: store i1 false, ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]] ; CHECK: pred.store.continue10: -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x i1> [[TMP0]], i32 5 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x i1> [[TMP0]], i32 6 ; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] ; CHECK: pred.store.if11: -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i1> [[TMP8]], i32 5 -; CHECK-NEXT: store i1 [[TMP20]], ptr [[P]], align 1 +; CHECK-NEXT: store i1 false, ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]] ; CHECK: pred.store.continue12: -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x i1> [[TMP0]], i32 6 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x i1> [[TMP0]], i32 7 ; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]] ; CHECK: pred.store.if13: -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP8]], i32 6 -; CHECK-NEXT: store i1 [[TMP22]], ptr [[P]], align 1 +; CHECK-NEXT: store i1 false, ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE14]] ; CHECK: pred.store.continue14: -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x i1> [[TMP0]], i32 7 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x i1> [[TMP0]], i32 8 ; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]] ; CHECK: pred.store.if15: -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i1> [[TMP8]], i32 7 -; CHECK-NEXT: store i1 [[TMP24]], ptr [[P]], align 1 +; CHECK-NEXT: store i1 false, ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE16]] ; CHECK: pred.store.continue16: -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x i1> [[TMP0]], i32 8 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x i1> [[TMP0]], i32 9 ; CHECK-NEXT: br i1 [[TMP25]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] ; CHECK: pred.store.if17: -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i1> [[TMP8]], i32 8 -; CHECK-NEXT: store i1 [[TMP26]], ptr [[P]], align 1 +; CHECK-NEXT: store i1 false, ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE18]] ; CHECK: pred.store.continue18: -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP0]], i32 9 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP0]], i32 10 ; CHECK-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] ; CHECK: pred.store.if19: -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP8]], i32 9 -; CHECK-NEXT: store i1 [[TMP28]], ptr [[P]], align 1 +; CHECK-NEXT: store i1 false, ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE20]] ; CHECK: pred.store.continue20: -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x i1> [[TMP0]], i32 10 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x i1> [[TMP0]], i32 11 ; CHECK-NEXT: br i1 [[TMP29]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] ; CHECK: pred.store.if21: -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i1> [[TMP8]], i32 10 -; CHECK-NEXT: store i1 [[TMP30]], ptr [[P]], align 1 +; CHECK-NEXT: store i1 false, ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE22]] ; CHECK: pred.store.continue22: -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x i1> [[TMP0]], i32 11 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x i1> [[TMP0]], i32 12 ; CHECK-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] ; CHECK: pred.store.if23: -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP8]], i32 11 -; CHECK-NEXT: store i1 [[TMP32]], ptr [[P]], align 1 +; CHECK-NEXT: store i1 false, ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE24]] ; CHECK: pred.store.continue24: -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x i1> [[TMP0]], i32 12 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x i1> [[TMP0]], i32 13 ; CHECK-NEXT: br i1 [[TMP33]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] ; CHECK: pred.store.if25: -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i1> [[TMP8]], i32 12 -; CHECK-NEXT: store i1 [[TMP34]], ptr [[P]], align 1 +; CHECK-NEXT: store i1 false, ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE26]] ; CHECK: pred.store.continue26: -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x i1> [[TMP0]], i32 13 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x i1> [[TMP0]], i32 14 ; CHECK-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] ; CHECK: pred.store.if27: -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i1> [[TMP8]], i32 13 -; CHECK-NEXT: store i1 [[TMP36]], ptr [[P]], align 1 +; CHECK-NEXT: store i1 false, ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE28]] ; CHECK: pred.store.continue28: -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP0]], i32 14 -; CHECK-NEXT: br i1 [[TMP37]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i1> [[TMP0]], i32 15 +; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE32]] ; CHECK: pred.store.if29: -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i1> [[TMP8]], i32 14 -; CHECK-NEXT: store i1 [[TMP38]], ptr [[P]], align 1 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE30]] -; CHECK: pred.store.continue30: -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <16 x i1> [[TMP0]], i32 15 -; CHECK-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32]] -; CHECK: pred.store.if31: -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i1> [[TMP8]], i32 15 -; CHECK-NEXT: store i1 [[TMP40]], ptr [[P]], align 1 +; CHECK-NEXT: store i1 false, ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE32]] -; CHECK: pred.store.continue32: +; CHECK: pred.store.continue30: ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll index 060d2ecc385f7..55b20843a557f 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll @@ -975,7 +975,8 @@ define void @switch_under_br_default_common_dest_with_case(ptr %start, ptr %end, ; COST-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], ; COST-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP7]], <4 x i1> [[TMP8]], <4 x i1> zeroinitializer ; COST-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP7]], <4 x i1> [[TMP9]], <4 x i1> zeroinitializer -; COST-NEXT: [[TMP12:%.*]] = or <4 x i1> [[TMP10]], [[TMP11]] +; COST-NEXT: [[TMP15:%.*]] = or <4 x i1> [[TMP8]], [[TMP9]] +; COST-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP7]], <4 x i1> [[TMP15]], <4 x i1> zeroinitializer ; COST-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP12]], ; COST-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP7]], <4 x i1> [[TMP13]], <4 x i1> zeroinitializer ; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP6]], i32 1, <4 x i1> [[TMP11]]) @@ -1056,8 +1057,10 @@ define void @switch_under_br_default_common_dest_with_case(ptr %start, ptr %end, ; FORCED-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> [[TMP12]], <4 x i1> zeroinitializer ; FORCED-NEXT: [[TMP25:%.*]] = select <4 x i1> [[TMP9]], <4 x i1> [[TMP13]], <4 x i1> zeroinitializer ; FORCED-NEXT: [[TMP26:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> [[TMP14]], <4 x i1> zeroinitializer -; FORCED-NEXT: [[TMP27:%.*]] = or <4 x i1> [[TMP15]], [[TMP25]] -; FORCED-NEXT: [[TMP20:%.*]] = or <4 x i1> [[TMP16]], [[TMP26]] +; FORCED-NEXT: [[TMP28:%.*]] = or <4 x i1> [[TMP11]], [[TMP13]] +; FORCED-NEXT: [[TMP29:%.*]] = or <4 x i1> [[TMP12]], [[TMP14]] +; FORCED-NEXT: [[TMP27:%.*]] = select <4 x i1> [[TMP9]], <4 x i1> [[TMP28]], <4 x i1> zeroinitializer +; FORCED-NEXT: [[TMP20:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> [[TMP29]], <4 x i1> zeroinitializer ; FORCED-NEXT: [[TMP21:%.*]] = xor <4 x i1> [[TMP27]], ; FORCED-NEXT: [[TMP22:%.*]] = xor <4 x i1> [[TMP20]], ; FORCED-NEXT: [[TMP23:%.*]] = select <4 x i1> [[TMP9]], <4 x i1> [[TMP21]], <4 x i1> zeroinitializer diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll index c9fc8beb006d9..d7813101ee274 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll @@ -138,8 +138,7 @@ define void @blend_chain_iv(i1 %c) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[VEC_IND]], <4 x i64> undef -; CHECK-NEXT: [[PREDPHI1:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> [[PREDPHI]], <4 x i64> undef +; CHECK-NEXT: [[PREDPHI1:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[VEC_IND]], <4 x i64> undef ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 1