Skip to content

Commit df019e7

Browse files
committed
[LV] Convert uniform-address scatters to scalar store when unmasked or header-masked
This patch optimizes vector scatters that have a uniform (single-scalar) address by replacing them with "extract-last-element + scalar store" when the scatter is unmasked, or masked by the header mask. In all of these cases, at least one lane is guaranteed to execute in each vector iteration, so storing the last active element is sufficient. Implementation: - Add optimizeScatterWithUniformAddr(VPlan &), and invoke it from VPlanTransforms::optimize(). - Identify non-consecutive VPWidenStoreRecipe/VPWidenStoreEVLRecipe with uniform addresses. - Require either no mask, an all-true mask, or the loop header mask. - Replace the scatter with VPInstruction::ExtractLastElement of the stored value and a VPReplicate (scalar) store. Notes: - The legacy cost model can scalarize a store if both the address and the value are uniform. In VPlan we materialize the stored value via ExtractLastElement, so only the address must be uniform. - Some of the loops won't be vectorized anymore since no vector instructions will be generated.
1 parent 902b0bd commit df019e7

File tree

9 files changed

+163
-126
lines changed

9 files changed

+163
-126
lines changed

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1372,6 +1372,50 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan) {
13721372
}
13731373
}
13741374

1375+
static VPSingleDefRecipe *findHeaderMask(VPlan &Plan);
1376+
1377+
/// Convert scatters with a uniform address that are either unmasked or
1378+
/// masked by the header mask into an extract-last-element + scalar store.
1379+
// TODO: Add a profitability check comparing the cost of a scatter vs.
1380+
// extract + scalar store.
1381+
static void optimizeScatterWithUniformAddr(VPlan &Plan) {
1382+
VPValue *HeaderMask = findHeaderMask(Plan);
1383+
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1384+
vp_depth_first_deep(Plan.getEntry()))) {
1385+
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1386+
1387+
// Only transform store recipes.
1388+
if (!isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(&R))
1389+
continue;
1390+
1391+
auto StoreR = cast<VPWidenMemoryRecipe>(&R);
1392+
if (StoreR->isConsecutive() ||
1393+
!vputils::isSingleScalar(StoreR->getAddr()))
1394+
continue;
1395+
1396+
assert(!StoreR->isReverse() &&
1397+
"Not consecutive memory recipes shouldn't be reversed");
1398+
VPValue *Mask = StoreR->getMask();
1399+
1400+
// Only convert the scatter to a scalar store if it is unmasked or masked
1401+
// by the header mask, which guarantees that at least one lane is active.
1402+
if (Mask && Mask != HeaderMask)
1403+
continue;
1404+
1405+
auto *Extract = new VPInstruction(VPInstruction::ExtractLastElement,
1406+
{StoreR->getOperand(1)});
1407+
Extract->insertBefore(StoreR);
1408+
1409+
// TODO: Sink the scalar store recipe to middle block if possible.
1410+
auto *ScalarStore = new VPReplicateRecipe(
1411+
&StoreR->getIngredient(), {Extract, StoreR->getAddr()},
1412+
true /*IsSingleScalar*/, nullptr /*Mask*/, *StoreR /*Metadata*/);
1413+
ScalarStore->insertBefore(StoreR);
1414+
StoreR->eraseFromParent();
1415+
}
1416+
}
1417+
}
1418+
13751419
static void narrowToSingleScalarRecipes(VPlan &Plan) {
13761420
if (Plan.hasScalarVFOnly())
13771421
return;
@@ -2320,6 +2364,7 @@ void VPlanTransforms::optimize(VPlan &Plan) {
23202364
runPass(removeDeadRecipes, Plan);
23212365
runPass(simplifyBlends, Plan);
23222366
runPass(legalizeAndOptimizeInductions, Plan);
2367+
runPass(optimizeScatterWithUniformAddr, Plan);
23232368
runPass(narrowToSingleScalarRecipes, Plan);
23242369
runPass(removeRedundantExpandSCEVRecipes, Plan);
23252370
runPass(simplifyRecipes, Plan);

llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -31,17 +31,19 @@ define void @predicated_uniform_load(ptr %src, i32 %n, ptr %dst, i1 %cond) {
3131
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[COND:%.*]], i64 0
3232
; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
3333
; CHECK-NEXT: [[TMP13:%.*]] = xor <vscale x 4 x i1> [[BROADCAST_SPLAT1]], splat (i1 true)
34-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[BOXES]], i64 0
35-
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
36-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[NBRBOXES]], i64 0
34+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[BOXES]], i64 0
3735
; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
3836
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
3937
; CHECK: vector.body:
4038
; CHECK-NEXT: [[AVL:%.*]] = phi i32 [ [[TMP3]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
4139
; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 4, i1 true)
42-
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[TMP13]], i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]]
40+
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[BROADCAST_SPLAT4]], <vscale x 4 x i1> [[TMP13]], i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]]
4341
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[BROADCAST_SPLAT1]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_GATHER]]
44-
; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x ptr> align 4 [[BROADCAST_SPLAT4]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]]), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
42+
; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32()
43+
; CHECK-NEXT: [[TMP18:%.*]] = mul nuw i32 [[TMP12]], 4
44+
; CHECK-NEXT: [[TMP14:%.*]] = sub i32 [[TMP18]], 1
45+
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i32> [[PREDPHI]], i32 [[TMP14]]
46+
; CHECK-NEXT: store i32 [[TMP15]], ptr [[NBRBOXES]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
4547
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP10]]
4648
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
4749
; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -63,7 +65,7 @@ define void @predicated_uniform_load(ptr %src, i32 %n, ptr %dst, i1 %cond) {
6365
; CHECK-NEXT: store i32 [[STORE]], ptr [[NBRBOXES]], align 4
6466
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
6567
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp sgt i32 [[IV]], [[IBOX]]
66-
; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
68+
; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
6769
; CHECK: exit:
6870
; CHECK-NEXT: ret void
6971
;
@@ -114,7 +116,7 @@ define void @predicated_strided_store(ptr %start) {
114116
; RVA23-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]]
115117
; RVA23-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
116118
; RVA23-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
117-
; RVA23-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
119+
; RVA23-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
118120
; RVA23: middle.block:
119121
; RVA23-NEXT: br label [[LOOP:%.*]]
120122
; RVA23: exit:
@@ -141,7 +143,7 @@ define void @predicated_strided_store(ptr %start) {
141143
; RVA23ZVL1024B-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]]
142144
; RVA23ZVL1024B-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
143145
; RVA23ZVL1024B-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
144-
; RVA23ZVL1024B-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
146+
; RVA23ZVL1024B-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
145147
; RVA23ZVL1024B: middle.block:
146148
; RVA23ZVL1024B-NEXT: br label [[LOOP:%.*]]
147149
; RVA23ZVL1024B: exit:
@@ -188,13 +190,14 @@ define void @store_to_addr_generated_from_invariant_addr(ptr noalias %p0, ptr no
188190
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP6]], i64 0
189191
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
190192
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[P3:%.*]], <vscale x 2 x i64> [[BROADCAST_SPLAT2]]
191-
; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x ptr> align 4 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
192-
; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x ptr> align 4 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
193-
; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i8.nxv2p0(<vscale x 2 x i8> zeroinitializer, <vscale x 2 x ptr> align 1 [[TMP7]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
193+
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 2 x ptr> [[TMP7]], i32 0
194+
; CHECK-NEXT: store i32 0, ptr [[TMP8]], align 4
195+
; CHECK-NEXT: store i32 0, ptr [[TMP8]], align 4
196+
; CHECK-NEXT: store i8 0, ptr [[TMP8]], align 1
194197
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP4]]
195198
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
196199
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
197-
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
200+
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
198201
; CHECK: middle.block:
199202
; CHECK-NEXT: br label [[LOOP:%.*]]
200203
; CHECK: exit:

llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -126,8 +126,6 @@ define void @test_3_inductions(ptr noalias %dst, ptr noalias %src, i64 %n) #1 {
126126
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
127127
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
128128
; CHECK: [[VECTOR_PH]]:
129-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[DST]], i64 0
130-
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
131129
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
132130
; CHECK-NEXT: [[TMP2:%.*]] = mul <vscale x 2 x i32> [[TMP1]], splat (i32 2)
133131
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i32> splat (i32 1), [[TMP2]]
@@ -144,7 +142,11 @@ define void @test_3_inductions(ptr noalias %dst, ptr noalias %src, i64 %n) #1 {
144142
; CHECK-NEXT: [[TMP5:%.*]] = or <vscale x 2 x i32> [[VEC_IND2]], [[VEC_IND]]
145143
; CHECK-NEXT: [[TMP6:%.*]] = sext <vscale x 2 x i32> [[TMP5]] to <vscale x 2 x i64>
146144
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[SRC]], <vscale x 2 x i64> [[TMP6]]
147-
; CHECK-NEXT: call void @llvm.vp.scatter.nxv2p0.nxv2p0(<vscale x 2 x ptr> [[TMP7]], <vscale x 2 x ptr> align 8 [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP3]])
145+
; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32()
146+
; CHECK-NEXT: [[TMP13:%.*]] = mul nuw i32 [[TMP12]], 2
147+
; CHECK-NEXT: [[TMP10:%.*]] = sub i32 [[TMP13]], 1
148+
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <vscale x 2 x ptr> [[TMP7]], i32 [[TMP10]]
149+
; CHECK-NEXT: store ptr [[TMP11]], ptr [[DST]], align 8
148150
; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP3]] to i64
149151
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
150152
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT4]]

llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll

Lines changed: 37 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -6,29 +6,46 @@
66
define void @pr154103(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalias %d) {
77
; CHECK-LABEL: define void @pr154103(
88
; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], ptr noalias [[D:%.*]]) #[[ATTR0:[0-9]+]] {
9-
; CHECK-NEXT: [[ENTRY:.*]]:
10-
; CHECK-NEXT: br label %[[LOOP:.*]]
11-
; CHECK: [[LOOP]]:
12-
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
13-
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
14-
; CHECK-NEXT: [[X:%.*]] = load i8, ptr [[GEP]], align 1
15-
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[X]] to i64
16-
; CHECK-NEXT: [[DIV:%.*]] = sdiv i64 0, [[CONV]]
17-
; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[DIV]], 0
18-
; CHECK-NEXT: br i1 [[CMP]], label %[[THEN:.*]], label %[[LATCH]]
19-
; CHECK: [[THEN]]:
20-
; CHECK-NEXT: [[Y:%.*]] = load i8, ptr [[B]], align 1
21-
; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[Y]] to i64
22-
; CHECK-NEXT: [[NOT:%.*]] = xor i64 [[ZEXT]], 0
23-
; CHECK-NEXT: br label %[[LATCH]]
24-
; CHECK: [[LATCH]]:
25-
; CHECK-NEXT: [[COND:%.*]] = phi i64 [ [[NOT]], %[[THEN]] ], [ 0, %[[LOOP]] ]
26-
; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[COND]] to i16
9+
; CHECK-NEXT: [[ENTRY:.*:]]
10+
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
11+
; CHECK: [[VECTOR_PH]]:
12+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[B]], i64 0
13+
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
14+
; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
15+
; CHECK-NEXT: [[TMP1:%.*]] = mul <vscale x 4 x i64> [[TMP0]], splat (i64 7)
16+
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> splat (i64 1), [[TMP1]]
17+
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
18+
; CHECK: [[VECTOR_BODY]]:
19+
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
20+
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ -7905747460161236406, %[[VECTOR_PH]] ], [ [[IV:%.*]], %[[VECTOR_BODY]] ]
21+
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
22+
; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
23+
; CHECK-NEXT: [[TMP4:%.*]] = mul i64 7, [[TMP3]]
24+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP4]], i64 0
25+
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
26+
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[A]], <vscale x 4 x i64> [[VEC_IND]]
27+
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i8> @llvm.vp.gather.nxv4i8.nxv4p0(<vscale x 4 x ptr> align 1 [[TMP5]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP2]])
28+
; CHECK-NEXT: [[TMP6:%.*]] = zext <vscale x 4 x i8> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i64>
29+
; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i64> @llvm.vp.merge.nxv4i64(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i64> [[TMP6]], <vscale x 4 x i64> splat (i64 1), i32 [[TMP2]])
30+
; CHECK-NEXT: [[TMP8:%.*]] = sdiv <vscale x 4 x i64> zeroinitializer, [[TMP7]]
31+
; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt <vscale x 4 x i64> [[TMP8]], zeroinitializer
32+
; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i8> @llvm.vp.gather.nxv4i8.nxv4p0(<vscale x 4 x ptr> align 1 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[TMP9]], i32 [[TMP2]])
33+
; CHECK-NEXT: [[TMP10:%.*]] = zext <vscale x 4 x i8> [[WIDE_MASKED_GATHER3]] to <vscale x 4 x i64>
34+
; CHECK-NEXT: [[TMP11:%.*]] = xor <vscale x 4 x i64> [[TMP10]], zeroinitializer
35+
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i64> [[TMP11]], <vscale x 4 x i64> zeroinitializer
36+
; CHECK-NEXT: [[TMP12:%.*]] = trunc <vscale x 4 x i64> [[PREDPHI]] to <vscale x 4 x i16>
37+
; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
38+
; CHECK-NEXT: [[TMP14:%.*]] = mul nuw i32 [[TMP13]], 4
39+
; CHECK-NEXT: [[TMP15:%.*]] = sub i32 [[TMP14]], 1
40+
; CHECK-NEXT: [[TRUNC:%.*]] = extractelement <vscale x 4 x i16> [[TMP12]], i32 [[TMP15]]
2741
; CHECK-NEXT: store i16 [[TRUNC]], ptr [[C]], align 2
2842
; CHECK-NEXT: store i32 0, ptr [[D]], align 4
29-
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 7
43+
; CHECK-NEXT: [[IV]] = sub nuw i64 [[AVL]], [[TMP3]]
44+
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
3045
; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV]], 0
31-
; CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]]
46+
; CHECK-NEXT: br i1 [[DONE]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
47+
; CHECK: [[MIDDLE_BLOCK]]:
48+
; CHECK-NEXT: br label %[[EXIT:.*]]
3249
; CHECK: [[EXIT]]:
3350
; CHECK-NEXT: ret void
3451
;

0 commit comments

Comments
 (0)