diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index f78eb84b0c445..2205e87d2bc17 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -877,9 +877,8 @@ class VPInstruction : public VPRecipeWithIRFlags, // Returns a scalar boolean value, which is true if any lane of its (only // boolean) vector operand is true. AnyOf, - // Extracts the first active lane of a vector, where the first operand is - // the predicate, and the second operand is the vector to extract. - ExtractFirstActive, + // Calculates the first active lane index of the vector predicate operand. + FirstActiveLane, }; private: diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 9b0720760df40..e780e6934eb44 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -50,6 +50,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { return SetResultTyFromOp(); switch (Opcode) { + case Instruction::ExtractElement: + return inferScalarType(R->getOperand(0)); case Instruction::Select: { Type *ResTy = inferScalarType(R->getOperand(1)); VPValue *OtherV = R->getOperand(2); @@ -82,7 +84,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::AnyOf: return SetResultTyFromOp(); - case VPInstruction::ExtractFirstActive: + case VPInstruction::FirstActiveLane: + return Type::getIntNTy(Ctx, 64); case VPInstruction::ExtractFromEnd: { Type *BaseTy = inferScalarType(R->getOperand(0)); if (auto *VecTy = dyn_cast(BaseTy)) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 6e396eda6aac6..d97805d874955 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -468,6 +468,12 @@ Value *VPInstruction::generate(VPTransformState &State) { Value *A = State.get(getOperand(0)); return Builder.CreateNot(A, Name); } + case Instruction::ExtractElement: { + assert(State.VF.isVector() && "Only extract elements from vectors"); + Value *Vec = State.get(getOperand(0)); + Value *Idx = State.get(getOperand(1), /*IsScalar=*/true); + return Builder.CreateExtractElement(Vec, Idx, Name); + } case Instruction::ICmp: { bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); @@ -723,12 +729,10 @@ Value *VPInstruction::generate(VPTransformState &State) { Value *A = State.get(getOperand(0)); return Builder.CreateOrReduce(A); } - case VPInstruction::ExtractFirstActive: { - Value *Vec = State.get(getOperand(0)); - Value *Mask = State.get(getOperand(1)); - Value *Ctz = Builder.CreateCountTrailingZeroElems( - Builder.getInt64Ty(), Mask, true, "first.active.lane"); - return Builder.CreateExtractElement(Vec, Ctz, "early.exit.value"); + case VPInstruction::FirstActiveLane: { + Value *Mask = State.get(getOperand(0)); + return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask, + true, Name); } default: llvm_unreachable("Unsupported opcode for instruction"); @@ -755,22 +759,24 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, } switch (getOpcode()) { + case Instruction::ExtractElement: { + // Add on the cost of extracting the element. + auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); + return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, + Ctx.CostKind); + } case VPInstruction::AnyOf: { auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(this), VF); return Ctx.TTI.getArithmeticReductionCost( Instruction::Or, cast(VecTy), std::nullopt, Ctx.CostKind); } - case VPInstruction::ExtractFirstActive: { + case VPInstruction::FirstActiveLane: { // Calculate the cost of determining the lane index. - auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(1)), VF); + auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Type::getInt64Ty(Ctx.LLVMCtx), {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)}); - InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); - // Add on the cost of extracting the element. - auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); - return Cost + Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, - Ctx.CostKind); + return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); } case VPInstruction::FirstOrderRecurrenceSplice: { assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?"); @@ -793,7 +799,8 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, bool VPInstruction::isVectorToScalar() const { return getOpcode() == VPInstruction::ExtractFromEnd || - getOpcode() == VPInstruction::ExtractFirstActive || + getOpcode() == Instruction::ExtractElement || + getOpcode() == VPInstruction::FirstActiveLane || getOpcode() == VPInstruction::ComputeReductionResult || getOpcode() == VPInstruction::AnyOf; } @@ -853,13 +860,14 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { if (Instruction::isBinaryOp(getOpcode())) return false; switch (getOpcode()) { + case Instruction::ExtractElement: case Instruction::ICmp: case Instruction::Select: case VPInstruction::AnyOf: case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::ExtractFromEnd: - case VPInstruction::ExtractFirstActive: + case VPInstruction::FirstActiveLane: case VPInstruction::FirstOrderRecurrenceSplice: case VPInstruction::LogicalAnd: case VPInstruction::Not: @@ -878,6 +886,8 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { switch (getOpcode()) { default: return false; + case Instruction::ExtractElement: + return Op == getOperand(1); case Instruction::PHI: return true; case Instruction::ICmp: @@ -970,7 +980,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::Broadcast: O << "broadcast"; break; - case VPInstruction::ExtractFromEnd: O << "extract-from-end"; break; @@ -986,8 +995,8 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::AnyOf: O << "any-of"; break; - case VPInstruction::ExtractFirstActive: - O << "extract-first-active"; + case VPInstruction::FirstActiveLane: + O << "first-active-lane"; break; default: O << Instruction::getOpcodeName(getOpcode()); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index dacd2c2b0070b..9aae383d35d91 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2158,10 +2158,14 @@ void VPlanTransforms::handleUncountableEarlyExit( ExitIRI->extractLastLaneOfOperand(MiddleBuilder); } // Add the incoming value from the early exit. - if (!IncomingFromEarlyExit->isLiveIn()) - IncomingFromEarlyExit = - EarlyExitB.createNaryOp(VPInstruction::ExtractFirstActive, - {IncomingFromEarlyExit, EarlyExitTakenCond}); + if (!IncomingFromEarlyExit->isLiveIn()) { + VPValue *FirstActiveLane = EarlyExitB.createNaryOp( + VPInstruction::FirstActiveLane, {EarlyExitTakenCond}, nullptr, + "first.active.lane"); + IncomingFromEarlyExit = EarlyExitB.createNaryOp( + Instruction::ExtractElement, {IncomingFromEarlyExit, FirstActiveLane}, + nullptr, "early.exit.value"); + } ExitIRI->addOperand(IncomingFromEarlyExit); } MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken}); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll index 55c6c43b6306a..4d7c5d088034d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll @@ -11,8 +11,10 @@ define i64 @same_exit_block_pre_inc_use1_sve() #1 { ; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_sve' ; CHECK: LV: Selecting VF: vscale x 16 ; CHECK: Calculating cost of work in exit block vector.early.exit -; CHECK-NEXT: Cost of 6 for VF vscale x 16: EMIT vp<{{.*}}> = extract-first-active -; CHECK-NEXT: Cost of 6 for VF vscale x 16: EMIT vp<{{.*}}> = extract-first-active +; CHECK-NEXT: Cost of 4 for VF vscale x 16: EMIT vp<{{.*}}> = first-active-lane vp<{{.*}}> +; CHECK-NEXT: Cost of 2 for VF vscale x 16: EMIT vp<{{.*}}> = extractelement ir<{{.*}}>, vp<{{.*}}> +; CHECK-NEXT: Cost of 4 for VF vscale x 16: EMIT vp<{{.*}}>.1 = first-active-lane vp<{{.*}}> +; CHECK-NEXT: Cost of 2 for VF vscale x 16: EMIT vp<{{.*}}>.1 = extractelement ir<{{.*}}>, vp<{{.*}}>.1 ; CHECK: LV: Minimum required TC for runtime checks to be profitable:32 entry: %p1 = alloca [1024 x i8] @@ -48,8 +50,10 @@ define i64 @same_exit_block_pre_inc_use1_nosve() { ; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_nosve' ; CHECK: LV: Selecting VF: 16 ; CHECK: Calculating cost of work in exit block vector.early.exit -; CHECK-NEXT: Cost of 50 for VF 16: EMIT vp<{{.*}}> = extract-first-active -; CHECK-NEXT: Cost of 50 for VF 16: EMIT vp<{{.*}}> = extract-first-active +; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}> = first-active-lane vp<{{.*}}> +; CHECK-NEXT: Cost of 2 for VF 16: EMIT vp<{{.*}}> = extractelement ir<{{.*}}>, vp<{{.*}}> +; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}>.1 = first-active-lane vp<{{.*}}> +; CHECK-NEXT: Cost of 2 for VF 16: EMIT vp<{{.*}}>.1 = extractelement ir<{{.*}}>, vp<{{.*}}>.1 ; CHECK: LV: Minimum required TC for runtime checks to be profitable:176 ; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 176) ; CHECK-NEXT: LV: Too many memory checks needed.