Skip to content

Commit 4eb30cf

Browse files
authored
[LV][EVL] Support in-loop reduction using tail folding with EVL. (#90184)
Following on from #87816, add VPReductionEVLRecipe to describe vector-predicated reductions. This addresses one of the TODOs from #76172.
1 parent 5d12fa7 commit 4eb30cf

18 files changed

+5344
-96
lines changed

llvm/include/llvm/IR/VectorBuilder.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#ifndef LLVM_IR_VECTORBUILDER_H
1616
#define LLVM_IR_VECTORBUILDER_H
1717

18+
#include <llvm/Analysis/IVDescriptors.h>
1819
#include <llvm/IR/IRBuilder.h>
1920
#include <llvm/IR/InstrTypes.h>
2021
#include <llvm/IR/Instruction.h>
@@ -57,6 +58,11 @@ class VectorBuilder {
5758
return RetType();
5859
}
5960

61+
/// Helper function for creating VP intrinsic call.
62+
Value *createVectorInstructionImpl(Intrinsic::ID VPID, Type *ReturnTy,
63+
ArrayRef<Value *> VecOpArray,
64+
const Twine &Name = Twine());
65+
6066
public:
6167
VectorBuilder(IRBuilderBase &Builder,
6268
Behavior ErrorHandling = Behavior::ReportAndAbort)
@@ -92,6 +98,15 @@ class VectorBuilder {
9298
Value *createVectorInstruction(unsigned Opcode, Type *ReturnTy,
9399
ArrayRef<Value *> VecOpArray,
94100
const Twine &Name = Twine());
101+
102+
/// Emit a VP reduction intrinsic call for recurrence kind.
103+
/// \param Kind The kind of recurrence
104+
/// \param ValTy The type of the operand on which the reduction operation is
105+
/// performed.
106+
/// \param VecOpArray The operand list.
107+
Value *createSimpleTargetReduction(RecurKind Kind, Type *ValTy,
108+
ArrayRef<Value *> VecOpArray,
109+
const Twine &Name = Twine());
95110
};
96111

97112
} // namespace llvm

llvm/include/llvm/Transforms/Utils/LoopUtils.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
#include "llvm/Analysis/IVDescriptors.h"
1717
#include "llvm/Analysis/LoopAccessAnalysis.h"
18+
#include "llvm/IR/VectorBuilder.h"
1819
#include "llvm/Transforms/Utils/ValueMapper.h"
1920

2021
namespace llvm {
@@ -394,6 +395,10 @@ Value *getShuffleReduction(IRBuilderBase &Builder, Value *Src, unsigned Op,
394395
/// Fast-math-flags are propagated using the IRBuilder's setting.
395396
Value *createSimpleTargetReduction(IRBuilderBase &B, Value *Src,
396397
RecurKind RdxKind);
398+
/// Overloaded function to generate vector-predication intrinsics for target
399+
/// reduction.
400+
Value *createSimpleTargetReduction(VectorBuilder &VB, Value *Src,
401+
const RecurrenceDescriptor &Desc);
397402

398403
/// Create a target reduction of the given vector \p Src for a reduction of the
399404
/// kind RecurKind::IAnyOf or RecurKind::FAnyOf. The reduction operation is
@@ -414,6 +419,11 @@ Value *createTargetReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc,
414419
Value *createOrderedReduction(IRBuilderBase &B,
415420
const RecurrenceDescriptor &Desc, Value *Src,
416421
Value *Start);
422+
/// Overloaded function to generate vector-predication intrinsics for ordered
423+
/// reduction.
424+
Value *createOrderedReduction(VectorBuilder &VB,
425+
const RecurrenceDescriptor &Desc, Value *Src,
426+
Value *Start);
417427

418428
/// Get the intersection (logical and) of all of the potential IR flags
419429
/// of each scalar operation (VL) that will be converted into a vector (I).

llvm/lib/IR/VectorBuilder.cpp

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,70 @@ Value *VectorBuilder::createVectorInstruction(unsigned Opcode, Type *ReturnTy,
5757
auto VPID = VPIntrinsic::getForOpcode(Opcode);
5858
if (VPID == Intrinsic::not_intrinsic)
5959
return returnWithError<Value *>("No VPIntrinsic for this opcode");
60+
return createVectorInstructionImpl(VPID, ReturnTy, InstOpArray, Name);
61+
}
62+
63+
Value *VectorBuilder::createSimpleTargetReduction(RecurKind Kind, Type *ValTy,
64+
ArrayRef<Value *> InstOpArray,
65+
const Twine &Name) {
66+
Intrinsic::ID VPID;
67+
switch (Kind) {
68+
case RecurKind::Add:
69+
VPID = Intrinsic::vp_reduce_add;
70+
break;
71+
case RecurKind::Mul:
72+
VPID = Intrinsic::vp_reduce_mul;
73+
break;
74+
case RecurKind::And:
75+
VPID = Intrinsic::vp_reduce_and;
76+
break;
77+
case RecurKind::Or:
78+
VPID = Intrinsic::vp_reduce_or;
79+
break;
80+
case RecurKind::Xor:
81+
VPID = Intrinsic::vp_reduce_xor;
82+
break;
83+
case RecurKind::FMulAdd:
84+
case RecurKind::FAdd:
85+
VPID = Intrinsic::vp_reduce_fadd;
86+
break;
87+
case RecurKind::FMul:
88+
VPID = Intrinsic::vp_reduce_fmul;
89+
break;
90+
case RecurKind::SMax:
91+
VPID = Intrinsic::vp_reduce_smax;
92+
break;
93+
case RecurKind::SMin:
94+
VPID = Intrinsic::vp_reduce_smin;
95+
break;
96+
case RecurKind::UMax:
97+
VPID = Intrinsic::vp_reduce_umax;
98+
break;
99+
case RecurKind::UMin:
100+
VPID = Intrinsic::vp_reduce_umin;
101+
break;
102+
case RecurKind::FMax:
103+
VPID = Intrinsic::vp_reduce_fmax;
104+
break;
105+
case RecurKind::FMin:
106+
VPID = Intrinsic::vp_reduce_fmin;
107+
break;
108+
case RecurKind::FMaximum:
109+
VPID = Intrinsic::vp_reduce_fmaximum;
110+
break;
111+
case RecurKind::FMinimum:
112+
VPID = Intrinsic::vp_reduce_fminimum;
113+
break;
114+
default:
115+
llvm_unreachable("No VPIntrinsic for this reduction");
116+
}
117+
return createVectorInstructionImpl(VPID, ValTy, InstOpArray, Name);
118+
}
60119

120+
Value *VectorBuilder::createVectorInstructionImpl(Intrinsic::ID VPID,
121+
Type *ReturnTy,
122+
ArrayRef<Value *> InstOpArray,
123+
const Twine &Name) {
61124
auto MaskPosOpt = VPIntrinsic::getMaskParamPos(VPID);
62125
auto VLenPosOpt = VPIntrinsic::getVectorLengthParamPos(VPID);
63126
size_t NumInstParams = InstOpArray.size();

llvm/lib/Transforms/Utils/LoopUtils.cpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1192,6 +1192,19 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, Value *Src,
11921192
}
11931193
}
11941194

1195+
/// VectorBuilder overload: emit a vector-predication reduction of \p Src for
/// the recurrence described by \p Desc. The recurrence identity for the
/// element type of \p Src is passed as the first operand, followed by \p Src
/// itself. AnyOf recurrences are rejected by assertion.
Value *llvm::createSimpleTargetReduction(VectorBuilder &VBuilder, Value *Src,
                                         const RecurrenceDescriptor &Desc) {
  const RecurKind RdxKind = Desc.getRecurrenceKind();
  assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) &&
         "AnyOf reduction is not supported.");

  auto *VecTy = cast<VectorType>(Src->getType());
  // The identity value is built for the element type, not the vector type.
  Value *Identity = Desc.getRecurrenceIdentity(
      RdxKind, VecTy->getElementType(), Desc.getFastMathFlags());

  Value *ReductionOps[] = {Identity, Src};
  return VBuilder.createSimpleTargetReduction(RdxKind, VecTy, ReductionOps);
}
1207+
11951208
Value *llvm::createTargetReduction(IRBuilderBase &B,
11961209
const RecurrenceDescriptor &Desc, Value *Src,
11971210
PHINode *OrigPhi) {
@@ -1220,6 +1233,20 @@ Value *llvm::createOrderedReduction(IRBuilderBase &B,
12201233
return B.CreateFAddReduce(Start, Src);
12211234
}
12221235

1236+
/// VectorBuilder overload: emit a vector-predication reduction of the vector
/// \p Src with the scalar \p Start as the first operand. Only FAdd and
/// FMulAdd recurrences are accepted (checked by assertion); both are emitted
/// as a RecurKind::FAdd reduction.
Value *llvm::createOrderedReduction(VectorBuilder &VBuilder,
                                    const RecurrenceDescriptor &Desc,
                                    Value *Src, Value *Start) {
  assert((Desc.getRecurrenceKind() == RecurKind::FAdd ||
          Desc.getRecurrenceKind() == RecurKind::FMulAdd) &&
         "Unexpected reduction kind");
  assert(Src->getType()->isVectorTy() && "Expected a vector type");
  assert(!Start->getType()->isVectorTy() && "Expected a scalar type");

  // Operands are passed as {start value, vector to reduce}.
  Value *ReductionOps[] = {Start, Src};
  return VBuilder.createSimpleTargetReduction(
      RecurKind::FAdd, cast<VectorType>(Src->getType()), ReductionOps);
}
1249+
12231250
void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL, Value *OpValue,
12241251
bool IncludeWrapFlags) {
12251252
auto *VecOp = dyn_cast<Instruction>(I);

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1516,9 +1516,7 @@ class LoopVectorizationCostModel {
15161516
TTI.hasActiveVectorLength(0, nullptr, Align()) &&
15171517
!EnableVPlanNativePath &&
15181518
// FIXME: implement support for max safe dependency distance.
1519-
Legal->isSafeForAnyVectorWidth() &&
1520-
// FIXME: remove this once reductions are supported.
1521-
Legal->getReductionVars().empty();
1519+
Legal->isSafeForAnyVectorWidth();
15221520
if (!EVLIsLegal) {
15231521
// If for some reason EVL mode is unsupported, fallback to
15241522
// DataWithoutLaneMask to try to vectorize the loop with folded tail

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 76 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -909,6 +909,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
909909
case VPRecipeBase::VPEVLBasedIVPHISC:
910910
case VPRecipeBase::VPExpandSCEVSC:
911911
case VPRecipeBase::VPInstructionSC:
912+
case VPRecipeBase::VPReductionEVLSC:
912913
case VPRecipeBase::VPReductionSC:
913914
case VPRecipeBase::VPReplicateSC:
914915
case VPRecipeBase::VPScalarIVStepsSC:
@@ -2171,17 +2172,27 @@ class VPReductionRecipe : public VPSingleDefRecipe {
21712172
/// The recurrence descriptor for the reduction in question.
21722173
const RecurrenceDescriptor &RdxDesc;
21732174
bool IsOrdered;
2175+
/// Whether the reduction is conditional.
2176+
bool IsConditional = false;
2177+
2178+
protected:
2179+
VPReductionRecipe(const unsigned char SC, const RecurrenceDescriptor &R,
2180+
Instruction *I, ArrayRef<VPValue *> Operands,
2181+
VPValue *CondOp, bool IsOrdered)
2182+
: VPSingleDefRecipe(SC, Operands, I), RdxDesc(R), IsOrdered(IsOrdered) {
2183+
if (CondOp) {
2184+
IsConditional = true;
2185+
addOperand(CondOp);
2186+
}
2187+
}
21742188

21752189
public:
21762190
VPReductionRecipe(const RecurrenceDescriptor &R, Instruction *I,
21772191
VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
21782192
bool IsOrdered)
2179-
: VPSingleDefRecipe(VPDef::VPReductionSC,
2180-
ArrayRef<VPValue *>({ChainOp, VecOp}), I),
2181-
RdxDesc(R), IsOrdered(IsOrdered) {
2182-
if (CondOp)
2183-
addOperand(CondOp);
2184-
}
2193+
: VPReductionRecipe(VPDef::VPReductionSC, R, I,
2194+
ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
2195+
IsOrdered) {}
21852196

21862197
~VPReductionRecipe() override = default;
21872198

@@ -2190,7 +2201,15 @@ class VPReductionRecipe : public VPSingleDefRecipe {
21902201
getVecOp(), getCondOp(), IsOrdered);
21912202
}
21922203

2193-
VP_CLASSOF_IMPL(VPDef::VPReductionSC)
2204+
static inline bool classof(const VPRecipeBase *R) {
2205+
return R->getVPDefID() == VPRecipeBase::VPReductionSC ||
2206+
R->getVPDefID() == VPRecipeBase::VPReductionEVLSC;
2207+
}
2208+
2209+
static inline bool classof(const VPUser *U) {
2210+
auto *R = dyn_cast<VPRecipeBase>(U);
2211+
return R && classof(R);
2212+
}
21942213

21952214
/// Generate the reduction in the loop
21962215
void execute(VPTransformState &State) override;
@@ -2201,13 +2220,62 @@ class VPReductionRecipe : public VPSingleDefRecipe {
22012220
VPSlotTracker &SlotTracker) const override;
22022221
#endif
22032222

2223+
/// Return the recurrence descriptor for the in-loop reduction.
2224+
const RecurrenceDescriptor &getRecurrenceDescriptor() const {
2225+
return RdxDesc;
2226+
}
2227+
/// Return true if the in-loop reduction is ordered.
2228+
bool isOrdered() const { return IsOrdered; };
2229+
/// Return true if the in-loop reduction is conditional.
2230+
bool isConditional() const { return IsConditional; };
22042231
/// The VPValue of the scalar Chain being accumulated.
22052232
VPValue *getChainOp() const { return getOperand(0); }
22062233
/// The VPValue of the vector value to be reduced.
22072234
VPValue *getVecOp() const { return getOperand(1); }
22082235
/// The VPValue of the condition for the block.
22092236
VPValue *getCondOp() const {
2210-
return getNumOperands() > 2 ? getOperand(2) : nullptr;
2237+
return isConditional() ? getOperand(getNumOperands() - 1) : nullptr;
2238+
}
2239+
};
2240+
2241+
/// A recipe to represent inloop reduction operations with vector-predication
2242+
/// intrinsics, performing a reduction on a vector operand with the explicit
2243+
/// vector length (EVL) into a scalar value, and adding the result to a chain.
2244+
/// The Operands are {ChainOp, VecOp, EVL, [Condition]}.
2245+
class VPReductionEVLRecipe : public VPReductionRecipe {
2246+
public:
2247+
VPReductionEVLRecipe(VPReductionRecipe *R, VPValue *EVL, VPValue *CondOp)
2248+
: VPReductionRecipe(
2249+
VPDef::VPReductionEVLSC, R->getRecurrenceDescriptor(),
2250+
cast_or_null<Instruction>(R->getUnderlyingValue()),
2251+
ArrayRef<VPValue *>({R->getChainOp(), R->getVecOp(), EVL}), CondOp,
2252+
R->isOrdered()) {}
2253+
2254+
~VPReductionEVLRecipe() override = default;
2255+
2256+
VPReductionEVLRecipe *clone() override {
2257+
llvm_unreachable("cloning not implemented yet");
2258+
}
2259+
2260+
VP_CLASSOF_IMPL(VPDef::VPReductionEVLSC)
2261+
2262+
/// Generate the reduction in the loop
2263+
void execute(VPTransformState &State) override;
2264+
2265+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2266+
/// Print the recipe.
2267+
void print(raw_ostream &O, const Twine &Indent,
2268+
VPSlotTracker &SlotTracker) const override;
2269+
#endif
2270+
2271+
/// The VPValue of the explicit vector length.
2272+
VPValue *getEVL() const { return getOperand(2); }
2273+
2274+
/// Returns true if the recipe only uses the first lane of operand \p Op.
2275+
bool onlyFirstLaneUsed(const VPValue *Op) const override {
2276+
assert(is_contained(operands(), Op) &&
2277+
"Op must be an operand of the recipe");
2278+
return Op == getEVL();
22112279
}
22122280
};
22132281

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,9 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
274274
[](const VPScalarCastRecipe *R) { return R->getResultType(); })
275275
.Case<VPExpandSCEVRecipe>([](const VPExpandSCEVRecipe *R) {
276276
return R->getSCEV()->getType();
277+
})
278+
.Case<VPReductionRecipe>([this](const auto *R) {
279+
return inferScalarType(R->getChainOp());
277280
});
278281

279282
assert(ResultTy && "could not infer type for the given VPValue");

0 commit comments

Comments
 (0)