|
123 | 123 | #include "llvm/IR/User.h"
|
124 | 124 | #include "llvm/IR/Value.h"
|
125 | 125 | #include "llvm/IR/ValueHandle.h"
|
| 126 | +#include "llvm/IR/VectorBuilder.h" |
126 | 127 | #include "llvm/IR/Verifier.h"
|
127 | 128 | #include "llvm/Support/Casting.h"
|
128 | 129 | #include "llvm/Support/CommandLine.h"
|
@@ -247,10 +248,12 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
|
247 | 248 | clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
|
248 | 249 | "Create lane mask using active.lane.mask intrinsic, and use "
|
249 | 250 | "it for both data and control flow"),
|
250 | | - clEnumValN( |
251 | | - TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, |
252 | | - "data-and-control-without-rt-check", |
253 | | - "Similar to data-and-control, but remove the runtime check"))); |
| 251 | + clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, |
| 252 | + "data-and-control-without-rt-check", |
| 253 | + "Similar to data-and-control, but remove the runtime check"), |
| 254 | + clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", |
| 255 | + "Use predicated EVL instructions for tail folding if the " |
| 256 | + "target supports vector length predication"))); |
254 | 257 |
|
255 | 258 | static cl::opt<bool> MaximizeBandwidth(
|
256 | 259 | "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
|
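The data-with-evl style added above changes the shape of the tail-folded loop: instead of executing a fixed-width vector body under a lane mask, every iteration asks the target how many lanes it may process and advances by exactly that amount, so no scalar epilogue or remainder handling is needed. A minimal scalar model in plain C++ (a sketch only; get_vector_length stands in for the llvm.experimental.get.vector.length intrinsic, and the 8-lane cap is an arbitrary assumption):

// Stand-in for the target query; a real target clamps the request to its
// maximum hardware vector length.
static long get_vector_length(long Remaining) {
  const long MaxVL = 8; // assumed hardware vector length
  return Remaining < MaxVL ? Remaining : MaxVL;
}

// Scalar model of an EVL tail-folded loop: the explicit vector length
// bounds every step, including the final partial one.
void saxpy_evl(float A, const float *X, float *Y, long N) {
  for (long I = 0; I < N;) {
    long EVL = get_vector_length(N - I);
    for (long L = 0; L < EVL; ++L) // becomes one vp.load/vp.store pair
      Y[I + L] = A * X[I + L] + Y[I + L];
    I += EVL;
  }
}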
@@ -1098,9 +1101,7 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
|
1098 | 1101 | // handled.
|
1099 | 1102 | if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
|
1100 | 1103 | isa<VPInterleaveRecipe>(CurRec) ||
|
1101 | | - isa<VPScalarIVStepsRecipe>(CurRec) || |
1102 | | - isa<VPCanonicalIVPHIRecipe>(CurRec) || |
1103 | | - isa<VPActiveLaneMaskPHIRecipe>(CurRec)) |
| 1104 | + isa<VPScalarIVStepsRecipe>(CurRec) || isa<VPHeaderPHIRecipe>(CurRec)) |
1104 | 1105 | continue;
|
1105 | 1106 |
|
1106 | 1107 | // This recipe contributes to the address computation of a widen
|
@@ -1640,6 +1641,23 @@ class LoopVectorizationCostModel {
|
1640 | 1641 | return foldTailByMasking() || Legal->blockNeedsPredication(BB);
|
1641 | 1642 | }
|
1642 | 1643 |
|
| 1644 | + /// Returns true if VP intrinsics with explicit vector length support should |
| 1645 | + /// be generated in the tail folded loop. |
| 1646 | + bool useVPIWithVPEVLVectorization() const { |
| 1647 | + return PreferEVL && !EnableVPlanNativePath && |
| 1648 | + getTailFoldingStyle() == TailFoldingStyle::DataWithEVL && |
| 1649 | + // FIXME: implement support for max safe dependency distance. |
| 1650 | + Legal->isSafeForAnyVectorWidth() && |
| 1651 | + // FIXME: remove this once reductions are supported. |
| 1652 | + Legal->getReductionVars().empty() && |
| 1653 | + // FIXME: remove this once vp_reverse is supported. |
| 1654 | + none_of( |
| 1655 | + WideningDecisions, |
| 1656 | + [](const std::pair<std::pair<Instruction *, ElementCount>, |
| 1657 | + std::pair<InstWidening, InstructionCost>> |
| 1658 | + &Data) { return Data.second.first == CM_Widen_Reverse; }); |
| 1659 | + } |
| 1660 | + |
1643 | 1661 | /// Returns true if the Phi is part of an inloop reduction.
|
1644 | 1662 | bool isInLoopReduction(PHINode *Phi) const {
|
1645 | 1663 | return InLoopReductions.contains(Phi);
|
@@ -1785,6 +1803,10 @@ class LoopVectorizationCostModel {
|
1785 | 1803 | /// All blocks of loop are to be masked to fold tail of scalar iterations.
|
1786 | 1804 | bool CanFoldTailByMasking = false;
|
1787 | 1805 |
|
| 1806 | + /// Control whether to generate VP intrinsics with explicit-vector-length |
| 1807 | + /// support in vectorized code. |
| 1808 | + bool PreferEVL = false; |
| 1809 | + |
1788 | 1810 | /// A map holding scalar costs for different vectorization factors. The
|
1789 | 1811 | /// presence of a cost for an instruction in the mapping indicates that the
|
1790 | 1812 | /// instruction will be scalarized when vectorizing with the associated
|
@@ -4690,6 +4712,39 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
|
4690 | 4712 | // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
|
4691 | 4713 | if (Legal->prepareToFoldTailByMasking()) {
|
4692 | 4714 | CanFoldTailByMasking = true;
|
| 4715 | + if (getTailFoldingStyle() == TailFoldingStyle::None) |
| 4716 | + return MaxFactors; |
| 4717 | + |
| 4718 | + if (UserIC > 1) { |
| 4719 | + LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. Will " |
| 4720 | + "not generate VP intrinsics since interleave count " |
| 4721 | + "specified is greater than 1.\n"); |
| 4722 | + return MaxFactors; |
| 4723 | + } |
| 4724 | + |
| 4725 | + if (MaxFactors.ScalableVF.isVector()) { |
| 4726 | + assert(MaxFactors.ScalableVF.isScalable() && |
| 4727 | + "Expected scalable vector factor."); |
| 4728 | + // FIXME: use actual opcode/data type for analysis here. |
| 4729 | + PreferEVL = getTailFoldingStyle() == TailFoldingStyle::DataWithEVL && |
| 4730 | + TTI.hasActiveVectorLength(0, nullptr, Align()); |
| 4731 | +#ifndef NDEBUG |
| 4732 | + if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) { |
| 4733 | + if (PreferEVL) |
| 4734 | + dbgs() << "LV: Preference for VP intrinsics indicated. Will " |
| 4735 | + "try to generate VP intrinsics.\n"; |
| 4736 | + else |
| 4737 | + dbgs() << "LV: Preference for VP intrinsics indicated. Will " |
| 4738 | + "not try to generate VP intrinsics since the target " |
| 4739 | + "does not support vector length predication.\n"; |
| 4740 | + } |
| 4741 | +#endif // !NDEBUG |
| 4742 | + |
| 4743 | + // A tail-folded loop using VP intrinsics restricts the VF to be scalable. |
| 4744 | + if (PreferEVL) |
| 4745 | + MaxFactors.FixedVF = ElementCount::getFixed(1); |
| 4746 | + } |
| 4747 | + |
4693 | 4748 | return MaxFactors;
|
4694 | 4749 | }
|
4695 | 4750 |
|
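PreferEVL hinges on TTI.hasActiveVectorLength(0, nullptr, Align()), a deliberately conservative probe (no opcode, no data type), as the FIXME above notes. Below is a sketch of the target side of that handshake, modeled on how a VL-predicated target such as RISC-V's RVV could plausibly answer; MyTTIImpl is hypothetical, and only the hook's signature is taken from the call site above:

// Hypothetical target hook: answering true to the conservative
// (0, nullptr, Align()) probe is what allows PreferEVL to be set.
bool MyTTIImpl::hasActiveVectorLength(unsigned Opcode, Type *DataType,
                                      Align Alignment) const {
  // This target can apply an explicit vector length to any operation,
  // so the opcode, type, and alignment do not narrow the answer.
  return true;
}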
@@ -5299,6 +5354,10 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
|
5299 | 5354 | if (!isScalarEpilogueAllowed())
|
5300 | 5355 | return 1;
|
5301 | 5356 |
|
| 5357 | + // Do not interleave if EVL is preferred; a user-specified IC > 1 disables EVL. |
| 5358 | + if (useVPIWithVPEVLVectorization()) |
| 5359 | + return 1; |
| 5360 | + |
5302 | 5361 | // We used the distance for the interleave count.
|
5303 | 5362 | if (!Legal->isSafeForAnyVectorWidth())
|
5304 | 5363 | return 1;
|
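One way to see why interleaving is disabled here (a reading of the code, not a rationale stated in the patch): with UF > 1, each unrolled part needs a static lane offset from the induction variable, but under EVL the offset of part 1 depends on the vector length part 0 was actually granted, which is only known at run time. Reusing the get_vector_length stand-in from the earlier sketch:

// Scalar model of why UF = 2 breaks down under EVL: part 1 cannot start
// until part 0's dynamically granted length is known.
long I = 0;
long EVL0 = get_vector_length(N - I);        // part 0
long EVL1 = get_vector_length(N - I - EVL0); // part 1 depends on EVL0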
@@ -8553,6 +8612,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
|
8553 | 8612 | VPlanTransforms::truncateToMinimalBitwidths(
|
8554 | 8613 | *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
|
8555 | 8614 | VPlanTransforms::optimize(*Plan, *PSE.getSE());
|
| 8615 | + if (CM.useVPIWithVPEVLVectorization()) |
| 8616 | + VPlanTransforms::addExplicitVectorLength(*Plan); |
8556 | 8617 | assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
|
8557 | 8618 | VPlans.push_back(std::move(Plan));
|
8558 | 8619 | }
|
@@ -9414,6 +9475,52 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
|
9414 | 9475 | State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
|
9415 | 9476 | }
|
9416 | 9477 |
|
| 9478 | +/// Creates either a vp_store or a vp_scatter intrinsic call to represent a |
| 9479 | +/// predicated store/scatter. |
| 9480 | +static Instruction * |
| 9481 | +lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr, |
| 9482 | + Value *StoredVal, bool IsScatter, Value *Mask, |
| 9483 | + Value *EVLPart, const Align &Alignment) { |
| 9484 | + CallInst *Call; |
| 9485 | + if (IsScatter) { |
| 9486 | + Call = Builder.CreateIntrinsic(Type::getVoidTy(EVLPart->getContext()), |
| 9487 | + Intrinsic::vp_scatter, |
| 9488 | + {StoredVal, Addr, Mask, EVLPart}); |
| 9489 | + } else { |
| 9490 | + VectorBuilder VBuilder(Builder); |
| 9491 | + VBuilder.setEVL(EVLPart).setMask(Mask); |
| 9492 | + Call = cast<CallInst>(VBuilder.createVectorInstruction( |
| 9493 | + Instruction::Store, Type::getVoidTy(EVLPart->getContext()), |
| 9494 | + {StoredVal, Addr})); |
| 9495 | + } |
| 9496 | + Call->addParamAttr( |
| 9497 | + 1, Attribute::getWithAlignment(Call->getContext(), Alignment)); |
| 9498 | + return Call; |
| 9499 | +} |
| 9500 | + |
| 9501 | +/// Creates either a vp_load or a vp_gather intrinsic call to represent a |
| 9502 | +/// predicated load/gather. |
| 9503 | +static Instruction *lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder, |
| 9504 | + VectorType *DataTy, |
| 9505 | + Value *Addr, bool IsGather, |
| 9506 | + Value *Mask, Value *EVLPart, |
| 9507 | + const Align &Alignment) { |
| 9508 | + CallInst *Call; |
| 9509 | + if (IsGather) { |
| 9510 | + Call = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, |
| 9511 | + {Addr, Mask, EVLPart}, nullptr, |
| 9512 | + "wide.masked.gather"); |
| 9513 | + } else { |
| 9514 | + VectorBuilder VBuilder(Builder); |
| 9515 | + VBuilder.setEVL(EVLPart).setMask(Mask); |
| 9516 | + Call = cast<CallInst>(VBuilder.createVectorInstruction( |
| 9517 | + Instruction::Load, DataTy, Addr, "vp.op.load")); |
| 9518 | + } |
| 9519 | + Call->addParamAttr( |
| 9520 | + 0, Attribute::getWithAlignment(Call->getContext(), Alignment)); |
| 9521 | + return Call; |
| 9522 | +} |
| 9523 | + |
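A note on the VectorBuilder path used in both helpers: createVectorInstruction maps an IR opcode to the matching llvm.vp.* intrinsic and appends the mask and EVL operands itself, which is why only the natural store/load operands are passed explicitly. A standalone sketch of the pattern with illustrative names (Builder, Mask, EVL, DataTy, and Ptr are assumed to be in scope):

// Build a VP load through VectorBuilder; the mask and EVL set on the
// builder are appended to every instruction it creates.
VectorBuilder VBuilder(Builder);
VBuilder.setMask(Mask).setEVL(EVL);
Value *VPLoad = VBuilder.createVectorInstruction(
    Instruction::Load, DataTy, Ptr, "vp.op.load");
// Roughly the IR this yields for DataTy = <vscale x 4 x i32>:
//   %vp.op.load = call <vscale x 4 x i32>
//       @llvm.vp.load.nxv4i32.p0(ptr %p, <vscale x 4 x i1> %m, i32 %evl)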
9417 | 9524 | void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
|
9418 | 9525 | VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
|
9419 | 9526 |
|
@@ -9445,14 +9552,31 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
|
9445 | 9552 | }
|
9446 | 9553 | }
|
9447 | 9554 |
|
| 9555 | + auto MaskValue = [&](unsigned Part) -> Value * { |
| 9556 | + if (isMaskRequired) |
| 9557 | + return BlockInMaskParts[Part]; |
| 9558 | + return nullptr; |
| 9559 | + }; |
| 9560 | + |
9448 | 9561 | // Handle Stores:
|
9449 | 9562 | if (SI) {
|
9450 | 9563 | State.setDebugLocFrom(SI->getDebugLoc());
|
9451 | 9564 |
|
9452 | 9565 | for (unsigned Part = 0; Part < State.UF; ++Part) {
|
9453 | 9566 | Instruction *NewSI = nullptr;
|
9454 | 9567 | Value *StoredVal = State.get(StoredValue, Part);
|
9455 | | - if (CreateGatherScatter) { |
| 9568 | + if (State.EVL) { |
| 9569 | + Value *EVLPart = State.get(State.EVL, Part); |
| 9570 | + // If EVL is not nullptr, then EVL must be a valid value set during plan |
| 9571 | + // creation, possibly the default value, i.e., the whole vector register |
| 9572 | + // length. EVL is created only if TTI prefers predicated vectorization; |
| 9573 | + // a non-null EVL therefore also implies a preference for predicated |
| 9574 | + // vectorization. |
| 9575 | + // FIXME: Support reverse store after vp_reverse is added. |
| 9576 | + NewSI = lowerStoreUsingVectorIntrinsics( |
| 9577 | + Builder, State.get(getAddr(), Part), StoredVal, CreateGatherScatter, |
| 9578 | + MaskValue(Part), EVLPart, Alignment); |
| 9579 | + } else if (CreateGatherScatter) { |
9456 | 9580 | Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
|
9457 | 9581 | Value *VectorGep = State.get(getAddr(), Part);
|
9458 | 9582 | NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
|
@@ -9482,7 +9606,18 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
|
9482 | 9606 | State.setDebugLocFrom(LI->getDebugLoc());
|
9483 | 9607 | for (unsigned Part = 0; Part < State.UF; ++Part) {
|
9484 | 9608 | Value *NewLI;
|
9485 | | - if (CreateGatherScatter) { |
| 9609 | + if (State.EVL) { |
| 9610 | + Value *EVLPart = State.get(State.EVL, Part); |
| 9611 | + // If EVL is not nullptr, then EVL must be a valid value set during plan |
| 9612 | + // creation, possibly the default value, i.e., the whole vector register |
| 9613 | + // length. EVL is created only if TTI prefers predicated vectorization; |
| 9614 | + // a non-null EVL therefore also implies a preference for predicated |
| 9615 | + // vectorization. |
| 9616 | + // FIXME: Support reverse loading after vp_reverse is added. |
| 9617 | + NewLI = lowerLoadUsingVectorIntrinsics( |
| 9618 | + Builder, DataTy, State.get(getAddr(), Part), CreateGatherScatter, |
| 9619 | + MaskValue(Part), EVLPart, Alignment); |
| 9620 | + } else if (CreateGatherScatter) { |
9486 | 9621 | Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
|
9487 | 9622 | Value *VectorGep = State.get(getAddr(), Part);
|
9488 | 9623 | NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
|