Commit 4a90b09

[LV] Convert uniform-address scatters to scalar store when unmasked.
This patch optimizes vector scatters that have a uniform (single-scalar) address by replacing them with an extract-last-element + scalar store when the scatter is unmasked. In that case at least one lane is guaranteed to execute in every vector iteration, so storing the last element is sufficient.

Implementation:
- Perform the conversion in narrowToSingleScalarRecipes, which runs as part of VPlanTransforms::optimize().
- Identify non-consecutive VPWidenStoreRecipe/VPWidenStoreEVLRecipe with uniform addresses.
- Replace the scatter with a VPInstruction::ExtractLastElement of the stored value followed by a single-scalar (VPReplicateRecipe) store.

Notes:
- The legacy cost model only scalarizes a store if both the address and the stored value are uniform. In VPlan the stored value is materialized via ExtractLastElement, so only the address needs to be uniform.
- Some loops will no longer be vectorized, since no vector instructions would be generated for them.

I plan to follow up with a patch that converts uniform-address scatters to scalar stores when the mask is the header mask. That requires `extract-last-active-element` to get the correct value to store.
1 parent 00b1bd1 commit 4a90b09
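
For illustration, a hypothetical C++ loop of the kind this patch targets; the function, names, and types below are illustrative only and are not part of the commit (the RISC-V test changed below exercises the same pattern in IR):

// Hypothetical example: `dst` is loop-invariant, so a vectorized loop would
// otherwise store through a broadcast address vector, i.e. a scatter with a
// uniform address. With this patch the vectorizer can instead extract the
// last lane of the value vector and issue one scalar store per vector
// iteration.
void truncate_and_store(const short *src, char *dst, long n) {
  for (long i = 0; i < n; ++i)
    *dst = static_cast<char>(src[i]); // uniform (single-scalar) store address
}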

2 files changed: +50, -23 lines

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 36 additions & 4 deletions
@@ -1392,14 +1392,46 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
        vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
     for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
       if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPWidenSelectRecipe,
-               VPReplicateRecipe>(&R))
+               VPReplicateRecipe, VPWidenMemoryRecipe>(&R))
         continue;
       auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
       if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
         continue;
 
-      auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
-      if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
+      // Convert scatters with a uniform address that is unmasked into an
+      // extract-last-element + scalar store.
+      // TODO: Add a profitability check comparing the cost of a scatter vs.
+      // extract + scalar store.
+      auto *WidenStoreR = dyn_cast<VPWidenMemoryRecipe>(&R);
+      if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
+          !WidenStoreR->isConsecutive() &&
+          isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(WidenStoreR)) {
+        assert(!WidenStoreR->isReverse() &&
+               "Not consecutive memory recipes shouldn't be reversed");
+        VPValue *Mask = WidenStoreR->getMask();
+
+        // Only convert the scatter to a scalar store if it is unmasked.
+        // TODO: Support converting scatter masked by the header mask to scalar
+        // store.
+        if (Mask)
+          continue;
+
+        auto *Extract = new VPInstruction(VPInstruction::ExtractLastElement,
+                                          {WidenStoreR->getOperand(1)});
+        Extract->insertBefore(WidenStoreR);
+
+        // TODO: Sink the scalar store recipe to middle block if possible.
+        auto *ScalarStore = new VPReplicateRecipe(
+            &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
+            true /*IsSingleScalar*/, nullptr /*Mask*/,
+            *WidenStoreR /*Metadata*/);
+        ScalarStore->insertBefore(WidenStoreR);
+        WidenStoreR->eraseFromParent();
+        continue;
+      }
+
+      auto *RepOrWidenR = dyn_cast<VPSingleDefRecipe>(&R);
+      if (RepR && RepOrWidenR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
           vputils::isSingleScalar(RepR->getOperand(1))) {
         auto *Clone = new VPReplicateRecipe(
             RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
@@ -1419,7 +1451,7 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
       // Skip recipes that aren't single scalars or don't have only their
       // scalar results used. In the latter case, we would introduce extra
       // broadcasts.
-      if (!vputils::isSingleScalar(RepOrWidenR) ||
+      if (!RepOrWidenR || !vputils::isSingleScalar(RepOrWidenR) ||
           !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
             return U->usesScalars(RepOrWidenR) ||
                    match(cast<VPRecipeBase>(U),
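
As a rough guide to the rewrite above, the following comment sketch shows the intended before/after shape of the affected recipes; the names and notation are illustrative only and do not come from this commit:

// Before: a non-consecutive widened store whose address %dst is a single
// scalar; when executed it lowers to a masked scatter through a broadcast
// of %dst.
//   widened-store value=%v, addr=%dst             (scatter)
//
// After: the last lane of %v is extracted and stored through %dst via a
// single-scalar VPReplicateRecipe; the widened store is erased.
//   %last = extract-last-element %v               (VPInstruction)
//   replicate-store value=%last, addr=%dst        (single scalar, unmasked)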

llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll

Lines changed: 14 additions & 19 deletions
@@ -49,25 +49,21 @@ exit: ; preds = %loop
 define void @truncate_i16_to_i8_cse(ptr noalias %src, ptr noalias %dst) {
 ; CHECK-LABEL: define void @truncate_i16_to_i8_cse(
 ; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
-; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT]], <2 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
-; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[COUNT:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[COUNT_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[VAL:%.*]] = load i16, ptr [[SRC]], align 2
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT1]], <2 x i16> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i16> [[BROADCAST_SPLAT2]] to <2 x i8>
-; CHECK-NEXT:    call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> [[TMP1]], <2 x ptr> align 1 zeroinitializer, <2 x i1> splat (i1 true))
-; CHECK-NEXT:    call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> [[TMP1]], <2 x ptr> align 1 [[BROADCAST_SPLAT]], <2 x i1> splat (i1 true))
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4294967296
-; CHECK-NEXT:    br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK-NEXT:    [[VAL_ZEXT:%.*]] = zext i16 [[VAL]] to i64
+; CHECK-NEXT:    [[VAL_TRUNC_ZEXT:%.*]] = trunc i64 [[VAL_ZEXT]] to i8
+; CHECK-NEXT:    store i8 [[VAL_TRUNC_ZEXT]], ptr null, align 1
+; CHECK-NEXT:    [[VAL_TRUNC:%.*]] = trunc i16 [[VAL]] to i8
+; CHECK-NEXT:    store i8 [[VAL_TRUNC]], ptr [[DST]], align 1
+; CHECK-NEXT:    [[COUNT_NEXT]] = add i32 [[COUNT]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[COUNT_NEXT]], 0
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[EXIT:.*]], label %[[LOOP]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -96,5 +92,4 @@ exit: ; preds = %loop
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
 ;.
