diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index fcaa6ea7ce3a3..69578b5b9fcfe 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8040,21 +8040,14 @@ void VPRecipeBuilder::createHeaderMask() { return; } - // Introduce the early-exit compare IV <= BTC to form header block mask. - // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by - // constructing the desired canonical IV in the header block as its first - // non-phi instructions. - + // Introduce an abstract header-mask VPInstruction. This will be lowered later + // depending on target preference. VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); - auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); - HeaderVPBB->insert(IV, NewInsertionPoint); - VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); - VPValue *BlockMask = nullptr; - VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); - BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC); + VPValue *BlockMask = + Builder.createNaryOp(VPInstruction::HeaderMask, {Plan.getCanonicalIV()}); BlockMaskCache[Header] = BlockMask; } @@ -8558,9 +8551,13 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, VPlanTransforms::truncateToMinimalBitwidths( *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext()); VPlanTransforms::optimize(*Plan, *PSE.getSE()); - // TODO: try to put it close to addActiveLaneMask(). + // TODO: The three passes that lower the header mask (addActiveLaneMask, + // addExplicitVectorLength, lowerRecipes) should arguably be applied + // together, depending on tail folding style, inside + // VPlanTransforms::optimize(). 
if (CM.foldTailWithEVL()) VPlanTransforms::addExplicitVectorLength(*Plan); + VPlanTransforms::lowerRecipes(*Plan); assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); VPlans.push_back(std::move(Plan)); } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 67059bc17c7ca..8014f331d6132 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1176,6 +1176,12 @@ class VPInstruction : public VPRecipeWithIRFlags { BranchOnCount, BranchOnCond, ComputeReductionResult, + // An abstract representation of the vector loops header mask, to be lowered + // later depending on target preference. Relevant only when the header may + // have a partial mask, i.e., when tail folding. A mask known to always be + // full is represented by null, w/o a HeaderMask recipe. A header mask may + // not be empty. + HeaderMask, // Add an offset in bytes (second operand) to a base pointer (first // operand). Only generates scalar values (either for the first lane only or // for all lanes, depending on its uses). @@ -2688,14 +2694,13 @@ class VPEVLBasedIVPHIRecipe : public VPHeaderPHIRecipe { /// A Recipe for widening the canonical induction variable of the vector loop. 
class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe { public: - VPWidenCanonicalIVRecipe(VPCanonicalIVPHIRecipe *CanonicalIV) - : VPSingleDefRecipe(VPDef::VPWidenCanonicalIVSC, {CanonicalIV}) {} + VPWidenCanonicalIVRecipe(VPValue *Start) + : VPSingleDefRecipe(VPDef::VPWidenCanonicalIVSC, {Start}) {} ~VPWidenCanonicalIVRecipe() override = default; VPWidenCanonicalIVRecipe *clone() override { - return new VPWidenCanonicalIVRecipe( - cast<VPCanonicalIVPHIRecipe>(getOperand(0))); + return new VPWidenCanonicalIVRecipe(getOperand(0)); } VP_CLASSOF_IMPL(VPDef::VPWidenCanonicalIVSC) @@ -2710,12 +2715,6 @@ class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe { void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; #endif - - /// Returns the scalar type of the induction. - const Type *getScalarType() const { - return cast<VPCanonicalIVPHIRecipe>(getOperand(0)->getDefiningRecipe()) - ->getScalarType(); - } }; /// A recipe for converting the input value \p IV value to the corresponding @@ -3055,6 +3054,9 @@ class VPRegionBlock : public VPBlockBase { /// Clone all blocks in the single-entry single-exit region of the block and /// their recipes without updating the operands of the cloned recipes. VPRegionBlock *clone() override; + + /// Return the header mask recipe of the VPlan, if there is one. 
+ VPInstruction *getHeaderMask(VPlan &Plan) const; }; /// VPlan models a candidate for vectorization, encoding various decisions take diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 018436326b7a5..3e6c870ec8c9e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -137,6 +137,7 @@ bool VPRecipeBase::mayHaveSideEffects() const { case VPInstruction::Not: case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: + case VPInstruction::HeaderMask: case VPInstruction::PtrAdd: return false; default: @@ -690,6 +691,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::ComputeReductionResult: O << "compute-reduction-result"; break; + case VPInstruction::HeaderMask: + O << "header-mask"; + break; case VPInstruction::PtrAdd: O << "ptradd"; break; @@ -1897,13 +1901,12 @@ void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent, #endif void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { - Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true); - Type *STy = CanonicalIV->getType(); + Value *Start = State.get(getOperand(0), 0, /*IsScalar*/ true); + Type *STy = Start->getType(); IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); ElementCount VF = State.VF; - Value *VStart = VF.isScalar() - ? CanonicalIV - : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast"); + Value *VStart = + VF.isScalar() ? 
Start : Builder.CreateVectorSplat(VF, Start, "broadcast"); for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { Value *VStep = createStepForVF(Builder, STy, VF, Part); if (VF.isVector()) { @@ -1911,8 +1914,8 @@ void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { VStep = Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType())); } - Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); - State.set(this, CanonicalVectorIV, Part); + Value *Res = Builder.CreateAdd(VStart, VStep, "vec.iv"); + State.set(this, Res, Part); } } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index c0eb6d710ad34..8f59da961fa03 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -434,43 +434,6 @@ static void removeRedundantInductionCasts(VPlan &Plan) { } } -/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV -/// recipe, if it exists. -static void removeRedundantCanonicalIVs(VPlan &Plan) { - VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV(); - VPWidenCanonicalIVRecipe *WidenNewIV = nullptr; - for (VPUser *U : CanonicalIV->users()) { - WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U); - if (WidenNewIV) - break; - } - - if (!WidenNewIV) - return; - - VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); - for (VPRecipeBase &Phi : HeaderVPBB->phis()) { - auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi); - - if (!WidenOriginalIV || !WidenOriginalIV->isCanonical()) - continue; - - // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides - // everything WidenNewIV's users need. That is, WidenOriginalIV will - // generate a vector phi or all users of WidenNewIV demand the first lane - // only. 
- if (any_of(WidenOriginalIV->users(), - [WidenOriginalIV](VPUser *U) { - return !U->usesScalars(WidenOriginalIV); - }) || - vputils::onlyFirstLaneUsed(WidenNewIV)) { - WidenNewIV->replaceAllUsesWith(WidenOriginalIV); - WidenNewIV->eraseFromParent(); - return; - } - } -} - /// Returns true if \p R is dead and can be removed. static bool isDeadRecipe(VPRecipeBase &R) { using namespace llvm::PatternMatch; @@ -552,6 +515,16 @@ createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, return Steps; } +/// Return the header mask recipe of the VPlan, if there is one. +static VPInstruction *getHeaderMask(VPlan &Plan) { + VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); + auto R = find_if(*HeaderVPBB, [](VPRecipeBase &R) { + using namespace llvm::VPlanPatternMatch; + return match(&R, m_VPInstruction<VPInstruction::HeaderMask>(m_VPValue())); + }); + return R == HeaderVPBB->end() ? nullptr : cast<VPInstruction>(&*R); +} + /// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd /// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as /// VPWidenPointerInductionRecipe will generate vectors only. If some users @@ -595,6 +568,19 @@ static void legalizeAndOptimizeInductions(VPlan &Plan, ScalarEvolution &SE) { auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi); if (!WideIV) continue; + + // If there is a header mask, check if WideIV is canonical IV with other + // wide users. If that is the case, use it as HeaderMask's operand, so it + // can be used when lowering the recipe. 
+ if (VPInstruction *HeaderMask = getHeaderMask(Plan)) { + if (WideIV->isCanonical() && + (!HasOnlyVectorVFs || any_of(WideIV->users(), [WideIV](VPUser *U) { + return !U->usesScalars(WideIV); + }))) { + HeaderMask->setOperand(0, WideIV); + } + } + if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) { return U->usesScalars(WideIV); })) @@ -1085,7 +1071,6 @@ void VPlanTransforms::truncateToMinimalBitwidths( } void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) { - removeRedundantCanonicalIVs(Plan); removeRedundantInductionCasts(Plan); simplifyRecipes(Plan, SE.getContext()); @@ -1202,52 +1187,23 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( return LaneMaskPhi; } -/// Collect all VPValues representing a header mask through the (ICMP_ULE, -/// WideCanonicalIV, backedge-taken-count) pattern. -/// TODO: Introduce explicit recipe for header-mask instead of searching -/// for the header-mask pattern manually. -static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) { - SmallVector<VPValue *> WideCanonicalIVs; - auto *FoundWidenCanonicalIVUser = - find_if(Plan.getCanonicalIV()->users(), - [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }); - assert(count_if(Plan.getCanonicalIV()->users(), - [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }) <= - 1 && - "Must have at most one VPWideCanonicalIVRecipe"); - if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) { - auto *WideCanonicalIV = - cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser); - WideCanonicalIVs.push_back(WideCanonicalIV); - } - - // Also include VPWidenIntOrFpInductionRecipes that represent a widened - // version of the canonical induction. 
- VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); - for (VPRecipeBase &Phi : HeaderVPBB->phis()) { - auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi); - if (WidenOriginalIV && WidenOriginalIV->isCanonical()) - WideCanonicalIVs.push_back(WidenOriginalIV); - } - - // Walk users of wide canonical IVs and collect to all compares of the form - // (ICMP_ULE, WideCanonicalIV, backedge-taken-count). - SmallVector<VPValue *> HeaderMasks; - VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); - for (auto *Wide : WideCanonicalIVs) { - for (VPUser *U : SmallVector<VPUser *>(Wide->users())) { - auto *HeaderMask = dyn_cast<VPInstruction>(U); - if (!HeaderMask || HeaderMask->getOpcode() != Instruction::ICmp || - HeaderMask->getPredicate() != CmpInst::ICMP_ULE || - HeaderMask->getOperand(1) != BTC) - continue; - - assert(HeaderMask->getOperand(0) == Wide && - "WidenCanonicalIV must be the first operand of the compare"); - HeaderMasks.push_back(HeaderMask); +static VPValue *getOrCreateWideCanonicalIV(VPlan &Plan, + VPInstruction *HeaderMask) { + VPValue *Op = HeaderMask->getOperand(0); + if (isa<VPWidenCanonicalIVRecipe>(Op)) + return Op; + // Check if there is a wide canonical IV that can be re-used. 
+ if (auto *CanIV = dyn_cast<VPCanonicalIVPHIRecipe>(Op)) { + VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); + for (VPRecipeBase &R : HeaderVPBB->phis()) { + auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R); + if (WideIV && WideIV->isCanonical()) + return WideIV; } } - return HeaderMasks; + auto *IV = new VPWidenCanonicalIVRecipe(Op); + IV->insertBefore(HeaderMask); + return IV; } void VPlanTransforms::addActiveLaneMask( @@ -1257,30 +1213,21 @@ void VPlanTransforms::addActiveLaneMask( UseActiveLaneMaskForControlFlow) && "DataAndControlFlowWithoutRuntimeCheck implies " "UseActiveLaneMaskForControlFlow"); - - auto FoundWidenCanonicalIVUser = - find_if(Plan.getCanonicalIV()->users(), - [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }); - assert(FoundWidenCanonicalIVUser && - "Must have widened canonical IV when tail folding!"); - auto *WideCanonicalIV = - cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser); + VPInstruction *HeaderMask = getHeaderMask(Plan); + assert(HeaderMask && "Active-lane-mask not needed?"); VPSingleDefRecipe *LaneMask; if (UseActiveLaneMaskForControlFlow) { LaneMask = addVPLaneMaskPhiAndUpdateExitBranch( Plan, DataAndControlFlowWithoutRuntimeCheck); } else { - VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV); - LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask, - {WideCanonicalIV, Plan.getTripCount()}, nullptr, - "active.lane.mask"); + VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); + VPBuilder B(&*HeaderVPBB->getFirstNonPhi()); + LaneMask = B.createNaryOp( + VPInstruction::ActiveLaneMask, + {getOrCreateWideCanonicalIV(Plan, HeaderMask), Plan.getTripCount()}, + nullptr, "active.lane.mask"); } - - // Walk users of WideCanonicalIV and replace all compares of the form - // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an - // active-lane-mask. 
- for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) - HeaderMask->replaceAllUsesWith(LaneMask); + HeaderMask->replaceAllUsesWith(LaneMask); } /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and @@ -1306,6 +1253,10 @@ void VPlanTransforms::addActiveLaneMask( /// ... /// void VPlanTransforms::addExplicitVectorLength(VPlan &Plan) { + VPValue *HeaderMask = getHeaderMask(Plan); + if (!HeaderMask) + return; + VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock(); auto *CanonicalIVPHI = Plan.getCanonicalIV(); VPValue *StartV = CanonicalIVPHI->getStartValue(); @@ -1335,29 +1286,28 @@ void VPlanTransforms::addExplicitVectorLength(VPlan &Plan) { NextEVLIV->insertBefore(CanonicalIVIncrement); EVLPhi->addOperand(NextEVLIV); - for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) { - for (VPUser *U : collectUsersRecursively(HeaderMask)) { - auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U); - if (!MemR) - continue; - VPValue *OrigMask = MemR->getMask(); - assert(OrigMask && "Unmasked widen memory recipe when folding tail"); - VPValue *NewMask = HeaderMask == OrigMask ? nullptr : OrigMask; - if (auto *L = dyn_cast<VPWidenLoadRecipe>(MemR)) { - auto *N = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask); - N->insertBefore(L); - L->replaceAllUsesWith(N); - L->eraseFromParent(); - } else if (auto *S = dyn_cast<VPWidenStoreRecipe>(MemR)) { - auto *N = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask); - N->insertBefore(S); - S->eraseFromParent(); - } else { - llvm_unreachable("unsupported recipe"); - } + for (VPUser *U : collectUsersRecursively(HeaderMask)) { + auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U); + if (!MemR) + continue; + VPValue *OrigMask = MemR->getMask(); + assert(OrigMask && "Unmasked widen memory recipe when folding tail"); + VPValue *NewMask = HeaderMask == OrigMask ? 
nullptr : OrigMask; + if (auto *L = dyn_cast<VPWidenLoadRecipe>(MemR)) { + auto *N = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask); + N->insertBefore(L); + L->replaceAllUsesWith(N); + L->eraseFromParent(); + } else if (auto *S = dyn_cast<VPWidenStoreRecipe>(MemR)) { + auto *N = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask); + N->insertBefore(S); + S->eraseFromParent(); + } else { + llvm_unreachable("unsupported recipe"); } - recursivelyDeleteDeadRecipes(HeaderMask); } + recursivelyDeleteDeadRecipes(HeaderMask); + // Replace all uses of VPCanonicalIVPHIRecipe by // VPEVLBasedIVPHIRecipe except for the canonical IV increment. CanonicalIVPHI->replaceAllUsesWith(EVLPhi); @@ -1462,3 +1412,16 @@ void VPlanTransforms::dropPoisonGeneratingRecipes( } } } + +void VPlanTransforms::lowerRecipes(VPlan &Plan) { + VPInstruction *HeaderMask = getHeaderMask(Plan); + if (!HeaderMask) + return; + + VPValue *IV = getOrCreateWideCanonicalIV(Plan, HeaderMask); + VPBuilder Builder(HeaderMask); + VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); + VPValue *M = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC); + HeaderMask->replaceAllUsesWith(M); + HeaderMask->eraseFromParent(); +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 0cbc70713d9c1..607c43458c1b4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -105,6 +105,10 @@ struct VPlanTransforms { /// VPCanonicalIVPHIRecipe is only used to control the loop after /// this transformation. static void addExplicitVectorLength(VPlan &Plan); + + /// Lower abstract VPInstruction recipes to a concrete sequence of recipes for + /// which code can be generated. + static void lowerRecipes(VPlan &Plan); }; } // namespace llvm