Skip to content

[LV][NFC] Refactor code for extracting first active element #131118

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -877,9 +877,8 @@ class VPInstruction : public VPRecipeWithIRFlags,
// Returns a scalar boolean value, which is true if any lane of its (only
// boolean) vector operand is true.
AnyOf,
// Extracts the first active lane of a vector, where the first operand is
// the predicate, and the second operand is the vector to extract.
ExtractFirstActive,
// Calculates the first active lane index of the vector predicate operand.
FirstActiveLane,
};

private:
Expand Down
5 changes: 4 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
return SetResultTyFromOp();

switch (Opcode) {
case Instruction::ExtractElement:
return inferScalarType(R->getOperand(0));
case Instruction::Select: {
Type *ResTy = inferScalarType(R->getOperand(1));
VPValue *OtherV = R->getOperand(2);
Expand Down Expand Up @@ -82,7 +84,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::AnyOf:
return SetResultTyFromOp();
case VPInstruction::ExtractFirstActive:
case VPInstruction::FirstActiveLane:
return Type::getIntNTy(Ctx, 64);
case VPInstruction::ExtractFromEnd: {
Type *BaseTy = inferScalarType(R->getOperand(0));
if (auto *VecTy = dyn_cast<VectorType>(BaseTy))
Expand Down
45 changes: 27 additions & 18 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -468,6 +468,12 @@ Value *VPInstruction::generate(VPTransformState &State) {
Value *A = State.get(getOperand(0));
return Builder.CreateNot(A, Name);
}
case Instruction::ExtractElement: {
assert(State.VF.isVector() && "Only extract elements from vectors");
Value *Vec = State.get(getOperand(0));
Value *Idx = State.get(getOperand(1), /*IsScalar=*/true);
return Builder.CreateExtractElement(Vec, Idx, Name);
}
case Instruction::ICmp: {
bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
Expand Down Expand Up @@ -723,12 +729,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
Value *A = State.get(getOperand(0));
return Builder.CreateOrReduce(A);
}
case VPInstruction::ExtractFirstActive: {
Value *Vec = State.get(getOperand(0));
Value *Mask = State.get(getOperand(1));
Value *Ctz = Builder.CreateCountTrailingZeroElems(
Builder.getInt64Ty(), Mask, true, "first.active.lane");
return Builder.CreateExtractElement(Vec, Ctz, "early.exit.value");
case VPInstruction::FirstActiveLane: {
Value *Mask = State.get(getOperand(0));
return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask,
true, Name);
}
default:
llvm_unreachable("Unsupported opcode for instruction");
Expand All @@ -755,22 +759,24 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
}

switch (getOpcode()) {
case Instruction::ExtractElement: {
// Add on the cost of extracting the element.
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
Ctx.CostKind);
}
case VPInstruction::AnyOf: {
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
return Ctx.TTI.getArithmeticReductionCost(
Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
}
case VPInstruction::ExtractFirstActive: {
case VPInstruction::FirstActiveLane: {
// Calculate the cost of determining the lane index.
auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(1)), VF);
auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
Type::getInt64Ty(Ctx.LLVMCtx),
{PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
// Add on the cost of extracting the element.
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
return Cost + Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
Ctx.CostKind);
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
}
case VPInstruction::FirstOrderRecurrenceSplice: {
assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
Expand All @@ -793,7 +799,8 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,

bool VPInstruction::isVectorToScalar() const {
return getOpcode() == VPInstruction::ExtractFromEnd ||
getOpcode() == VPInstruction::ExtractFirstActive ||
getOpcode() == Instruction::ExtractElement ||
getOpcode() == VPInstruction::FirstActiveLane ||
getOpcode() == VPInstruction::ComputeReductionResult ||
getOpcode() == VPInstruction::AnyOf;
}
Expand Down Expand Up @@ -853,13 +860,14 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
if (Instruction::isBinaryOp(getOpcode()))
return false;
switch (getOpcode()) {
case Instruction::ExtractElement:
case Instruction::ICmp:
case Instruction::Select:
case VPInstruction::AnyOf:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ExtractFromEnd:
case VPInstruction::ExtractFirstActive:
case VPInstruction::FirstActiveLane:
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::LogicalAnd:
case VPInstruction::Not:
Expand All @@ -878,6 +886,8 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
switch (getOpcode()) {
default:
return false;
case Instruction::ExtractElement:
return Op == getOperand(1);
case Instruction::PHI:
return true;
case Instruction::ICmp:
Expand Down Expand Up @@ -970,7 +980,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::Broadcast:
O << "broadcast";
break;

case VPInstruction::ExtractFromEnd:
O << "extract-from-end";
break;
Expand All @@ -986,8 +995,8 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::AnyOf:
O << "any-of";
break;
case VPInstruction::ExtractFirstActive:
O << "extract-first-active";
case VPInstruction::FirstActiveLane:
O << "first-active-lane";
break;
default:
O << Instruction::getOpcodeName(getOpcode());
Expand Down
12 changes: 8 additions & 4 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2158,10 +2158,14 @@ void VPlanTransforms::handleUncountableEarlyExit(
ExitIRI->extractLastLaneOfOperand(MiddleBuilder);
}
// Add the incoming value from the early exit.
if (!IncomingFromEarlyExit->isLiveIn())
IncomingFromEarlyExit =
EarlyExitB.createNaryOp(VPInstruction::ExtractFirstActive,
{IncomingFromEarlyExit, EarlyExitTakenCond});
if (!IncomingFromEarlyExit->isLiveIn()) {
VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
VPInstruction::FirstActiveLane, {EarlyExitTakenCond}, nullptr,
"first.active.lane");
IncomingFromEarlyExit = EarlyExitB.createNaryOp(
Instruction::ExtractElement, {IncomingFromEarlyExit, FirstActiveLane},
nullptr, "early.exit.value");
}
ExitIRI->addOperand(IncomingFromEarlyExit);
}
MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
Expand Down
12 changes: 8 additions & 4 deletions llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@ define i64 @same_exit_block_pre_inc_use1_sve() #1 {
; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_sve'
; CHECK: LV: Selecting VF: vscale x 16
; CHECK: Calculating cost of work in exit block vector.early.exit
; CHECK-NEXT: Cost of 6 for VF vscale x 16: EMIT vp<{{.*}}> = extract-first-active
; CHECK-NEXT: Cost of 6 for VF vscale x 16: EMIT vp<{{.*}}> = extract-first-active
; CHECK-NEXT: Cost of 4 for VF vscale x 16: EMIT vp<{{.*}}> = first-active-lane vp<{{.*}}>
; CHECK-NEXT: Cost of 2 for VF vscale x 16: EMIT vp<{{.*}}> = extractelement ir<{{.*}}>, vp<{{.*}}>
; CHECK-NEXT: Cost of 4 for VF vscale x 16: EMIT vp<{{.*}}>.1 = first-active-lane vp<{{.*}}>
; CHECK-NEXT: Cost of 2 for VF vscale x 16: EMIT vp<{{.*}}>.1 = extractelement ir<{{.*}}>, vp<{{.*}}>.1
; CHECK: LV: Minimum required TC for runtime checks to be profitable:32
entry:
%p1 = alloca [1024 x i8]
Expand Down Expand Up @@ -48,8 +50,10 @@ define i64 @same_exit_block_pre_inc_use1_nosve() {
; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_nosve'
; CHECK: LV: Selecting VF: 16
; CHECK: Calculating cost of work in exit block vector.early.exit
; CHECK-NEXT: Cost of 50 for VF 16: EMIT vp<{{.*}}> = extract-first-active
; CHECK-NEXT: Cost of 50 for VF 16: EMIT vp<{{.*}}> = extract-first-active
; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}> = first-active-lane vp<{{.*}}>
; CHECK-NEXT: Cost of 2 for VF 16: EMIT vp<{{.*}}> = extractelement ir<{{.*}}>, vp<{{.*}}>
; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}>.1 = first-active-lane vp<{{.*}}>
; CHECK-NEXT: Cost of 2 for VF 16: EMIT vp<{{.*}}>.1 = extractelement ir<{{.*}}>, vp<{{.*}}>.1
; CHECK: LV: Minimum required TC for runtime checks to be profitable:176
; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 176)
; CHECK-NEXT: LV: Too many memory checks needed.
Expand Down
Loading