
Commit c73ad7b

[VPlan] Add transformation to narrow interleave groups.
This patch adds a new narrowInterleaveGroups transform, which tries to convert a plan with interleave groups of VF elements into a plan that instead replaces the interleave groups with wide loads and stores processing VF elements. This is effectively a very simple form of loop-aware SLP, where interleave groups are used to identify candidates.

This initial version is quite restricted and hopefully serves as a starting point for how to best model those kinds of transforms. For now it only transforms load interleave groups feeding store groups.

Depends on #106431. This lands the main parts of the approved #106441, as suggested, to break things up a bit more.
1 parent 523cf65 commit c73ad7b
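
For illustration only (this loop is not part of the commit; the function name copy_pairs is hypothetical), the transform targets loops of roughly the shape below, modeled on the load_store_interleave_group tests further down: both members of each consecutive pair are loaded and stored unmodified, so the vectorizer forms a load interleave group whose members directly feed a store interleave group with the same factor.

    // With VF = 2, the two strided loads form one interleave group of factor 2
    // with 2 members, and the two strided stores form a matching store group.
    void copy_pairs(long long *__restrict__ dst,
                    const long long *__restrict__ src, long long n) {
      for (long long i = 0; i != n; ++i) {
        dst[2 * i] = src[2 * i];         // member 0 of load group and store group
        dst[2 * i + 1] = src[2 * i + 1]; // member 1 of load group and store group
      }
    }

After narrowing, each such pair of groups is emitted as one wide load and one wide store of VF elements per original iteration, instead of the wide-load/shuffle/shuffle/wide-store sequence visible in the test diffs below.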

File tree

6 files changed, +156 -41 lines changed


llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+3
@@ -7720,6 +7720,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   VPlanTransforms::materializeBroadcasts(BestVPlan);
   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
   VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());
+  VPlanTransforms::narrowInterleaveGroups(
+      BestVPlan, BestVF,
+      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
   VPlanTransforms::removeDeadRecipes(BestVPlan);
   VPlanTransforms::convertToConcreteRecipes(BestVPlan);

llvm/lib/Transforms/Vectorize/VPlan.cpp

-2
@@ -898,8 +898,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
 
   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
   // FIXME: Model VF * UF computation completely in VPlan.
-  assert((!getVectorLoopRegion() || VFxUF.getNumUsers()) &&
-         "VFxUF expected to always have users");
   unsigned UF = getUF();
   if (VF.getNumUsers()) {
     Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF);

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

+129
@@ -2235,3 +2235,132 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
     });
   }
 }
+
+/// Returns true if \p IR is a full interleave group with factor and number of
+/// members both equal to \p VF. The interleave group must also access the full
+/// vector width \p VectorRegWidth.
+static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
+                                         unsigned VF, VPTypeAnalysis &TypeInfo,
+                                         unsigned VectorRegWidth) {
+  if (!InterleaveR)
+    return false;
+
+  Type *GroupElementTy = nullptr;
+  if (InterleaveR->getStoredValues().empty()) {
+    GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
+    if (!all_of(InterleaveR->definedValues(),
+                [&TypeInfo, GroupElementTy](VPValue *Op) {
+                  return TypeInfo.inferScalarType(Op) == GroupElementTy;
+                }))
+      return false;
+  } else {
+    GroupElementTy =
+        TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
+    if (!all_of(InterleaveR->getStoredValues(),
+                [&TypeInfo, GroupElementTy](VPValue *Op) {
+                  return TypeInfo.inferScalarType(Op) == GroupElementTy;
+                }))
+      return false;
+  }
+
+  unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
+  auto IG = InterleaveR->getInterleaveGroup();
+  return IG->getFactor() == VF && IG->getNumMembers() == VF &&
+         GroupSize == VectorRegWidth;
+}
+
+void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
+                                             unsigned VectorRegWidth) {
+  using namespace llvm::VPlanPatternMatch;
+  VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
+  if (VF.isScalable() || !VectorLoop)
+    return;
+
+  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
+  Type *CanonicalIVType = CanonicalIV->getScalarType();
+  VPTypeAnalysis TypeInfo(CanonicalIVType);
+
+  unsigned FixedVF = VF.getFixedValue();
+  SmallVector<VPInterleaveRecipe *> StoreGroups;
+  for (auto &R : *VectorLoop->getEntryBasicBlock()) {
+    if (isa<VPCanonicalIVPHIRecipe>(&R) ||
+        match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
+      continue;
+
+    // Bail out on recipes not supported at the moment:
+    // * phi recipes other than the canonical induction
+    // * recipes writing to memory except interleave groups
+    // Only support plans with a canonical induction phi.
+    if (R.isPhi())
+      return;
+
+    auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
+    if (R.mayWriteToMemory() && !InterleaveR)
+      return;
+
+    if (!InterleaveR)
+      continue;
+
+    // Bail out on non-consecutive interleave groups.
+    if (!isConsecutiveInterleaveGroup(InterleaveR, FixedVF, TypeInfo,
+                                      VectorRegWidth))
+      return;
+
+    // Skip read interleave groups.
+    if (InterleaveR->getStoredValues().empty())
+      continue;
+
+    // For now, we only support full interleave groups storing load interleave
+    // groups.
+    if (!all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
+          VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
+          if (!DefR)
+            return false;
+          auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
+          return IR &&
+                 IR->getInterleaveGroup()->getFactor() ==
+                     IR->getInterleaveGroup()->getNumMembers() &&
+                 IR->getVPValue(Op.index()) == Op.value();
+        })) {
+      return;
+    }
+    StoreGroups.push_back(InterleaveR);
+  }
+
+  if (StoreGroups.empty())
+    return;
+
+  // Convert InterleaveGroup R to a single VPWidenLoadRecipe.
+  auto NarrowOp = [](VPRecipeBase *R) -> VPValue * {
+    auto *LoadGroup = cast<VPInterleaveRecipe>(R);
+    // Narrow interleave group to wide load, as transformed VPlan will only
+    // process one original iteration.
+    auto *L = new VPWidenLoadRecipe(
+        *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
+        LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
+        /*Reverse=*/false, LoadGroup->getDebugLoc());
+    L->insertBefore(LoadGroup);
+    return L;
+  };
+
+  // Narrow operation tree rooted at store groups.
+  for (auto *StoreGroup : StoreGroups) {
+    VPValue *Res =
+        NarrowOp(StoreGroup->getStoredValues()[0]->getDefiningRecipe());
+
+    auto *S = new VPWidenStoreRecipe(
+        *cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
+        StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
+        /*Reverse=*/false, StoreGroup->getDebugLoc());
+    S->insertBefore(StoreGroup);
+    StoreGroup->eraseFromParent();
+  }
+
+  // Adjust induction to reflect that the transformed plan only processes one
+  // original iteration.
+  auto *CanIV = Plan.getCanonicalIV();
+  auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
+  Inc->setOperand(1, Plan.getOrAddLiveIn(ConstantInt::get(
+                         CanIV->getScalarType(), 1 * Plan.getUF())));
+  removeDeadRecipes(Plan);
+}
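
To make the gating and the induction change concrete, here is a standalone arithmetic sketch (assumed values only, no LLVM API) of why the <2 x i64> groups in the AArch64 tests below satisfy isConsecutiveInterleaveGroup, and how the canonical IV step is adjusted afterwards:

    #include <cassert>

    int main() {
      // Assumed values matching the AArch64 tests: i64 elements, VF = 2, and a
      // 128-bit fixed-width (NEON) vector register.
      unsigned VF = 2, ElementBits = 64, VectorRegWidth = 128;
      unsigned Factor = 2, NumMembers = 2; // shape of each interleave group

      // Mirrors isConsecutiveInterleaveGroup: a full group whose members
      // together cover exactly one vector register.
      unsigned GroupSize = ElementBits * VF;
      assert(Factor == VF && NumMembers == VF && GroupSize == VectorRegWidth);

      // After narrowing, each vector iteration handles one original iteration
      // per unrolled part, so the canonical IV increment drops from VF * UF to
      // 1 * UF (4 -> 2 in the UF = 2 unroll test below).
      unsigned UF = 2;
      assert(VF * UF == 4 && 1 * UF == 2);
      return 0;
    }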

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

+9
@@ -192,6 +192,15 @@ struct VPlanTransforms {
 
   /// Add explicit broadcasts for live-ins and VPValues defined in \p Plan's entry block if they are used as vectors.
   static void materializeBroadcasts(VPlan &Plan);
+
+  /// Try to convert a plan with interleave groups with VF elements to a plan
+  /// with the interleave groups replaced by wide loads and stores processing VF
+  /// elements, if all transformed interleave groups access the full vector
+  /// width (checked via \p VectorRegWidth). This effectively is a very simple
+  /// form of loop-aware SLP, where we use interleave groups to identify
+  /// candidates.
+  static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
+                                     unsigned VectorRegWidth);
 };
 
 } // namespace llvm

llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll

+5 -13
@@ -19,19 +19,11 @@ define void @load_store_interleave_group(ptr noalias %data) {
 ; CHECK-NEXT: [[TMP3:%.*]] = shl nsw i64 [[TMP1]], 1
 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP2]]
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC3]], <2 x i64> [[STRIDED_VEC4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x i64> [[INTERLEAVED_VEC5]], ptr [[TMP5]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8
+; CHECK-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP4]], align 8
+; CHECK-NEXT: store <2 x i64> [[WIDE_LOAD1]], ptr [[TMP5]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
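
As a rough C++ rendering (illustrative only, not code produced by the commit), the narrowed vector body checked above amounts to the following: with UF = 2, each iteration now copies two consecutive 2 x i64 chunks in place, and the index advances by 2 original iterations instead of 4.

    #include <cstdint>
    #include <cstring>

    // Plain-C++ stand-in for the narrowed vector body of the unroll test;
    // memcpy models the <2 x i64> wide loads and stores.
    void narrowed_body(int64_t *data) {
      for (int64_t index = 0; index != 100; index += 2) {
        int64_t chunk0[2], chunk1[2];
        std::memcpy(chunk0, &data[2 * index], sizeof(chunk0));       // WIDE_LOAD
        std::memcpy(chunk1, &data[2 * (index + 1)], sizeof(chunk1)); // WIDE_LOAD1
        std::memcpy(&data[2 * index], chunk0, sizeof(chunk0));       // store WIDE_LOAD
        std::memcpy(&data[2 * (index + 1)], chunk1, sizeof(chunk1)); // store WIDE_LOAD1
      }
    }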

llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll

+10 -26
@@ -20,19 +20,11 @@ define void @load_store_interleave_group(ptr noalias %data) {
 ; VF2-NEXT: [[TMP8:%.*]] = shl nsw i64 [[TMP6]], 1
 ; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP1]]
 ; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP8]]
-; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
-; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
-; VF2-NEXT: [[WIDE_VEC2:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
-; VF2-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
-; VF2-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
-; VF2-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC3]], <2 x i64> [[STRIDED_VEC4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC5]], ptr [[TMP5]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8
+; VF2-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP2]], align 8
+; VF2-NEXT: store <2 x i64> [[WIDE_LOAD1]], ptr [[TMP5]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; VF2-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; VF2: [[MIDDLE_BLOCK]]:
@@ -133,21 +125,13 @@ define void @load_store_interleave_group_different_objecs(ptr noalias %src, ptr
 ; VF2-NEXT: [[TMP8:%.*]] = shl nsw i64 [[TMP6]], 1
 ; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP1]]
 ; VF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP8]]
-; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
-; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
-; VF2-NEXT: [[WIDE_VEC2:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8
-; VF2-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP10]], align 8
 ; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP1]]
 ; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP8]]
-; VF2-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
-; VF2-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC3]], <2 x i64> [[STRIDED_VEC4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC5]], ptr [[TMP7]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP3]], align 8
+; VF2-NEXT: store <2 x i64> [[WIDE_LOAD1]], ptr [[TMP7]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; VF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; VF2: [[MIDDLE_BLOCK]]:
