Commit dfa665f

[VPlan] Add transformation to narrow interleave groups. (#106441)
This patch adds a new narrowInterleaveGroups transform, which tries to convert a plan with interleave groups with VF elements to a plan that instead replaces the interleave groups with wide loads and stores processing VF elements. This is effectively a very simple form of loop-aware SLP, where interleave groups are used to identify candidates. This initial version is quite restricted and hopefully serves as a starting point for how to best model these kinds of transforms.

Depends on #106431. Fixes #82936. PR: #106441
1 parent 0d3ba08 commit dfa665f
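For orientation, here is a minimal source-level sketch of the kind of loop the transform targets, reconstructed from the test_complex_add_double test updated below; the struct and function names are illustrative, not from the patch. Accessing both members of each pair in every iteration produces load and store interleave groups whose factor equals their number of members, which is exactly the pattern the transform narrows:

// Hedged sketch: each iteration touches both members of a { double, double }
// pair, so the vectorizer forms full interleave groups for A, B, and Res.
struct Complex { double Re, Im; };

void complex_add(Complex *Res, const Complex *A, const Complex *B, long N) {
  for (long I = 0; I != N; ++I) {
    Res[I].Re = A[I].Re + B[I].Re; // member 0 of each interleave group
    Res[I].Im = A[I].Im + B[I].Im; // member 1 of each interleave group
  }
}

As the first test diff below shows, the narrowed vector body replaces <4 x double> interleaved loads plus shufflevectors with plain <2 x double> loads and stores, and the induction-variable step drops from 4 to 2 accordingly.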

4 files changed, +126 −92 lines changed

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

+80 −12
@@ -2236,6 +2236,36 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
 }
 }
 
+/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
+/// converted to a narrower recipe. \p V is used by a wide recipe \p WideMember
+/// that feeds a store interleave group at index \p Idx, \p WideMember0 is the
+/// recipe feeding the same interleave group at index 0. A VPWidenLoadRecipe can
+/// be narrowed to an index-independent load if it feeds all wide ops at all
+/// indices (checked via the operands of the wide recipe at lane 0, \p
+/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
+/// is defined at \p Idx of a load interleave group.
+static bool canNarrowLoad(VPWidenRecipe *WideMember0, VPWidenRecipe *WideMember,
+                          VPValue *V, unsigned Idx) {
+  auto *DefR = V->getDefiningRecipe();
+  if (!DefR)
+    return false;
+  if (auto *W = dyn_cast<VPWidenLoadRecipe>(DefR))
+    return !W->getMask() &&
+           all_of(zip(WideMember0->operands(), WideMember->operands()),
+                  [V](const auto P) {
+                    // V must be at the same places in both WideMember0 and
+                    // WideMember.
+                    const auto &[WideMember0Op, WideMemberOp] = P;
+                    return (WideMember0Op == V) == (WideMemberOp == V);
+                  });
+
+  if (auto *IR = dyn_cast<VPInterleaveRecipe>(DefR))
+    return IR->getInterleaveGroup()->getFactor() ==
+               IR->getInterleaveGroup()->getNumMembers() &&
+           IR->getVPValue(Idx) == V;
+  return false;
+}
+
 /// Returns true if \p IR is a full interleave group with factor and number of
 /// members both equal to \p VF. The interleave group must also access the full
 /// vector width \p VectorRegWidth.
@@ -2298,6 +2328,8 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
     if (R.mayWriteToMemory() && !InterleaveR)
       return;
 
+    // All other ops are allowed, but we reject uses that cannot be converted
+    // when checking all allowed consumers (store interleave groups) below.
     if (!InterleaveR)
       continue;
 
@@ -2312,7 +2344,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
 
     // For now, we only support full interleave groups storing load interleave
     // groups.
-    if (!all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
+    if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
           VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
           if (!DefR)
             return false;
@@ -2322,31 +2354,67 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
                      IR->getInterleaveGroup()->getNumMembers() &&
                  IR->getVPValue(Op.index()) == Op.value();
           })) {
+      StoreGroups.push_back(InterleaveR);
+      continue;
+    }
+
+    // Check if all values feeding InterleaveR are matching wide recipes, whose
+    // operands can be narrowed.
+    auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
+        InterleaveR->getStoredValues()[0]->getDefiningRecipe());
+    if (!WideMember0)
       return;
+    for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
+      auto *R = dyn_cast<VPWidenRecipe>(V->getDefiningRecipe());
+      if (!R || R->getOpcode() != WideMember0->getOpcode() ||
+          R->getNumOperands() > 2)
+        return;
+      if (any_of(R->operands(), [WideMember0, Idx = I, R](VPValue *V) {
+            return !canNarrowLoad(WideMember0, R, V, Idx);
+          }))
+        return;
     }
     StoreGroups.push_back(InterleaveR);
   }
 
   if (StoreGroups.empty())
     return;
 
-  // Convert InterleaveGroup R to a single VPWidenLoadRecipe.
+  // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
   auto NarrowOp = [](VPRecipeBase *R) -> VPValue * {
-    auto *LoadGroup = cast<VPInterleaveRecipe>(R);
-    // Narrow interleave group to wide load, as transformed VPlan will only
+    if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
+      // Narrow interleave group to wide load, as transformed VPlan will only
+      // process one original iteration.
+      auto *L = new VPWidenLoadRecipe(
+          *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
+          LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
+          /*Reverse=*/false, LoadGroup->getDebugLoc());
+      L->insertBefore(LoadGroup);
+      return L;
+    }
+
+    auto *WideLoad = cast<VPWidenLoadRecipe>(R);
+
+    // Narrow wide load to uniform scalar load, as transformed VPlan will only
     // process one original iteration.
-    auto *L = new VPWidenLoadRecipe(
-        *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
-        LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
-        /*Reverse=*/false, LoadGroup->getDebugLoc());
-    L->insertBefore(LoadGroup);
-    return L;
+    auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(),
+                                    WideLoad->operands(), /*IsUniform*/ true);
+    N->insertBefore(WideLoad);
+    return N;
   };
 
   // Narrow operation tree rooted at store groups.
   for (auto *StoreGroup : StoreGroups) {
-    VPValue *Res =
-        NarrowOp(StoreGroup->getStoredValues()[0]->getDefiningRecipe());
+    VPValue *Res = nullptr;
+    if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(
+            StoreGroup->getStoredValues()[0]->getDefiningRecipe())) {
+      for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
+        WideMember0->setOperand(
+            Idx, NarrowOp(WideMember0->getOperand(Idx)->getDefiningRecipe()));
+      Res = WideMember0;
+    } else {
+      Res = NarrowOp(StoreGroup->getStoredValues()[0]->getDefiningRecipe());
+    }
 
     auto *S = new VPWidenStoreRecipe(
         *cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
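The operand-position check in canNarrowLoad can be illustrated in isolation. The following standalone C++ sketch is not LLVM code; it uses hypothetical integer IDs in place of VPValue pointers to mirror the `(WideMember0Op == V) == (WideMemberOp == V)` comparison above: a load may only be narrowed to a single index-independent load if it occupies exactly the same operand slots in the recipe feeding member 0 as in the recipe feeding member Idx.

#include <cassert>
#include <cstddef>
#include <vector>

// Hypothetical stand-in: operands are identified by integer IDs instead of
// VPValue pointers.
using ValueID = int;

// Mirrors the zip/all_of loop in canNarrowLoad: V must appear in exactly the
// same operand positions of the member-0 recipe and the member-Idx recipe.
static bool usedAtSamePositions(const std::vector<ValueID> &Member0Ops,
                                const std::vector<ValueID> &MemberIdxOps,
                                ValueID V) {
  assert(Member0Ops.size() == MemberIdxOps.size());
  for (std::size_t I = 0; I != Member0Ops.size(); ++I)
    if ((Member0Ops[I] == V) != (MemberIdxOps[I] == V))
      return false;
  return true;
}

int main() {
  // A value shared by both recipes in slot 1 (ID 7) passes the check ...
  assert(usedAtSamePositions({3, 7}, {4, 7}, /*V=*/7));
  // ... but if it swapped slots between members, narrowing it to one
  // index-independent load would be unsound, so the check rejects it.
  assert(!usedAtSamePositions({7, 3}, {4, 7}, /*V=*/7));
  return 0;
}

The check is deliberately positional: equality of the operand sets alone would not guarantee that the same lanes combine after narrowing.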

llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll

+8 −22
@@ -99,31 +99,17 @@ define void @test_complex_add_double(ptr %res, ptr noalias %A, ptr noalias %B, i
 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP1]]
 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[TMP0]]
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP3]], align 4
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[WIDE_VEC5:%.*]] = load <4 x double>, ptr [[TMP4]], align 4
-; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <4 x double> [[WIDE_VEC5]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <4 x double> [[WIDE_VEC5]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load <4 x double>, ptr [[TMP5]], align 4
-; CHECK-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <4 x double> [[WIDE_VEC8]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <4 x double> [[WIDE_VEC8]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[STRIDED_VEC]], [[STRIDED_VEC6]]
-; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[STRIDED_VEC3]], [[STRIDED_VEC9]]
-; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[STRIDED_VEC7]]
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = load <2 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = load <2 x double>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x double>, ptr [[TMP5]], align 4
 ; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[STRIDED_VEC4]], [[STRIDED_VEC10]]
+; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC11]], ptr [[TMP11]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[TMP10]], align 4
+; CHECK-NEXT: store <2 x double> [[TMP7]], ptr [[TMP11]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:

llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll

+30 −41
@@ -19,24 +19,14 @@ define void @test_2xi64_unary_op_load_interleave_group(ptr noalias %data, ptr no
 ; VF2-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 1
 ; VF2-NEXT: [[TMP12:%.*]] = shl nsw i64 [[TMP10]], 1
 ; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP1]]
-; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP12]]
-; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP2]], align 8
-; VF2-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; VF2-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP13]], align 8
-; VF2-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; VF2-NEXT: [[TMP3:%.*]] = fneg <2 x double> [[STRIDED_VEC]]
-; VF2-NEXT: [[TMP14:%.*]] = fneg <2 x double> [[STRIDED_VEC3]]
-; VF2-NEXT: [[TMP4:%.*]] = fneg <2 x double> [[STRIDED_VEC1]]
-; VF2-NEXT: [[TMP9:%.*]] = fneg <2 x double> [[STRIDED_VEC4]]
-; VF2-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
-; VF2-NEXT: [[TMP11:%.*]] = shufflevector <2 x double> [[TMP14]], <2 x double> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x double> [[INTERLEAVED_VEC5]], ptr [[TMP13]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[DATA]], i64 [[TMP12]]
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
+; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
+; VF2-NEXT: [[TMP9:%.*]] = fneg <2 x double> [[WIDE_LOAD]]
+; VF2-NEXT: [[TMP11:%.*]] = fneg <2 x double> [[WIDE_LOAD1]]
+; VF2-NEXT: store <2 x double> [[TMP9]], ptr [[TMP2]], align 8
+; VF2-NEXT: store <2 x double> [[TMP11]], ptr [[TMP5]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; VF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; VF2: [[MIDDLE_BLOCK]]:
@@ -200,18 +190,15 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor) {
 ; VF2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[TMP0]]
 ; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8
+; VF2-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
+; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; VF2-NEXT: [[TMP6:%.*]] = shl nsw i64 [[TMP0]], 1
 ; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
-; VF2-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8
-; VF2-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 0, i32 2>
-; VF2-NEXT: [[TMP23:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> <i32 1, i32 3>
-; VF2-NEXT: [[TMP12:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP11]]
-; VF2-NEXT: [[TMP24:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[TMP23]]
-; VF2-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP24]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x i64>, ptr [[TMP7]], align 8
+; VF2-NEXT: [[TMP8:%.*]] = mul <2 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
+; VF2-NEXT: store <2 x i64> [[TMP8]], ptr [[TMP7]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
 ; VF2-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; VF2-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; VF2: [[MIDDLE_BLOCK]]:
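The test_2xi64 hunk above also exercises the VPReplicateRecipe path of NarrowOp: the wide load of the factor becomes a uniform scalar load plus broadcast, because the narrowed plan processes only one original iteration per vector step. A hedged source sketch of that test's loop, reconstructed from the CHECK lines (names illustrative):

// Each iteration scales both members of a pair by the same factor[i], so the
// factor load is uniform across the store group members and can be narrowed
// to a scalar load + splat, while the pair itself becomes one <2 x i64> op.
void scale_pairs(long *data, const long *factor, long n) {
  for (long i = 0; i != n; ++i) {
    data[2 * i] *= factor[i];     // member 0 of the store interleave group
    data[2 * i + 1] *= factor[i]; // member 1 of the store interleave group
  }
}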
@@ -1014,28 +1001,30 @@ define void @test_2xi64_sub_of_wide_loads(ptr noalias %data, ptr noalias %A, ptr
 ; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
 ; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
 ; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2
-; VF2-NEXT: [[BROADCAST_SPLAT:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8
-; VF2-NEXT: [[BROADCAST_SPLAT4:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
+; VF2-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8
+; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i64 0
+; VF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; VF2-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8
+; VF2-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP6]], i64 0
+; VF2-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT3]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
 ; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
 ; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 2
-; VF2-NEXT: [[BROADCAST_SPLAT2:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8
-; VF2-NEXT: [[BROADCAST_SPLAT6:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8
+; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8
+; VF2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i64 0
+; VF2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer
+; VF2-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8
+; VF2-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <2 x i64> poison, i64 [[TMP11]], i64 0
+; VF2-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT5]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; VF2-NEXT: [[TMP12:%.*]] = sub <2 x i64> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]]
 ; VF2-NEXT: [[TMP13:%.*]] = sub <2 x i64> [[BROADCAST_SPLAT4]], [[BROADCAST_SPLAT6]]
 ; VF2-NEXT: [[TMP19:%.*]] = shl nsw i64 [[TMP0]], 1
 ; VF2-NEXT: [[TMP20:%.*]] = shl nsw i64 [[TMP1]], 1
 ; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP19]]
 ; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP20]]
-; VF2-NEXT: [[TMP14:%.*]] = sub <2 x i64> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]]
-; VF2-NEXT: [[TMP15:%.*]] = sub <2 x i64> [[BROADCAST_SPLAT4]], [[BROADCAST_SPLAT6]]
-; VF2-NEXT: [[TMP16:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP16]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[DATA_0]], align 8
-; VF2-NEXT: [[TMP17:%.*]] = shufflevector <2 x i64> [[TMP13]], <2 x i64> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC4:%.*]] = shufflevector <4 x i64> [[TMP17]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC4]], ptr [[DATA_1]], align 8
-; VF2-NEXT: [[IV_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF2-NEXT: store <2 x i64> [[TMP12]], ptr [[DATA_0]], align 8
+; VF2-NEXT: store <2 x i64> [[TMP13]], ptr [[DATA_1]], align 8
+; VF2-NEXT: [[IV_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
 ; VF2-NEXT: br i1 [[EC]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; VF2: [[MIDDLE_BLOCK]]:
