Skip to content

Commit 275f344

Browse files
committed
LoopVectorize: optimize VF for low TC, when tail-folding
9a087a3 (LoopVectorize: MaxVF should not be larger than the loop trip count) was the first commit to add the condition PowerOf2_32() on the trip-count to what is now getMaximizedVFForTarget(). It made sense at the time, as there was no tail-folding support. Much later, 2025e09 ([LV] Make sure VF doesn't exceed compile time known TC) came along to patch this with an extra condition on FoldTailByMasking, in order to ensure that the VF doesn't exceed the trip-count. However, it didn't go far enough, and we can do better, as there is existing code to clamp the trip-count, and do tail-folding. Fixes #82626.
1 parent 9f888fc commit 275f344

File tree

3 files changed

+23
-26
lines changed

3 files changed

+23
-26
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4776,16 +4776,16 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
47764776
if (MaxTripCount > 0 && requiresScalarEpilogue(true))
47774777
MaxTripCount -= 1;
47784778

4779-
if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4780-
(!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4779+
if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC) {
47814780
// If upper bound loop trip count (TC) is known at compile time there is no
47824781
// point in choosing VF greater than TC (as done in the loop below). Select
47834782
// maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
47844783
// scalable, we only fall back on a fixed VF when the TC is less than or
47854784
// equal to the known number of lanes.
4786-
auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4787-
LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4788-
"exceeding the constant trip count: "
4785+
auto ClampedUpperTripCount = FoldTailByMasking
4786+
? llvm::bit_ceil(MaxTripCount)
4787+
: llvm::bit_floor(MaxTripCount);
4788+
LLVM_DEBUG(dbgs() << "LV: Clamping the trip count to a power of two: "
47894789
<< ClampedUpperTripCount << "\n");
47904790
return ElementCount::get(
47914791
ClampedUpperTripCount,

llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,6 @@ target triple = "aarch64"
1111
; so will be masked.
1212

1313
; COST: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %0 = load
14-
; COST: LV: Found an estimated cost of 3000000 for VF 4 For instruction: %0 = load
15-
; COST: LV: Found an estimated cost of 3000000 for VF 8 For instruction: %0 = load
16-
; COST: LV: Found an estimated cost of 3000000 for VF 16 For instruction: %0 = load
1714
; COST: LV: Selecting VF: 1.
1815

1916
define i32 @test(ptr nocapture noundef readonly %pInVec, ptr nocapture noundef readonly %pInA1, ptr nocapture noundef readonly %pInA2, ptr nocapture noundef readonly %pInA3, ptr nocapture noundef readonly %pInA4, i32 noundef %numCols) {

llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -46,26 +46,26 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
4646
; CHECK-LABEL: @trip3_i8(
4747
; CHECK-NEXT: entry:
4848
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
49-
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
49+
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
5050
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
51-
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
51+
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
5252
; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
5353
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 3, [[TMP4]]
5454
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
5555
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
5656
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
57-
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16
57+
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
5858
; CHECK-NEXT: [[TMP7:%.*]] = add i64 0, 0
59-
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP7]], i64 3)
59+
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP7]], i64 3)
6060
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]]
6161
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
62-
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP9]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
63-
; CHECK-NEXT: [[TMP10:%.*]] = shl <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
62+
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP9]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
63+
; CHECK-NEXT: [[TMP10:%.*]] = shl <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i8> insertelement (<vscale x 4 x i8> poison, i8 1, i64 0), <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer)
6464
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]]
6565
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
66-
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
67-
; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 16 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
68-
; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
66+
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
67+
; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
68+
; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
6969
; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add i64 0, [[TMP6]]
7070
; CHECK-NEXT: ret void
7171
;
@@ -93,26 +93,26 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
9393
; CHECK-LABEL: @trip5_i8(
9494
; CHECK-NEXT: entry:
9595
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
96-
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
96+
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
9797
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
98-
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
98+
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
9999
; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
100100
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 5, [[TMP4]]
101101
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
102102
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
103103
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
104-
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16
104+
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
105105
; CHECK-NEXT: [[TMP7:%.*]] = add i64 0, 0
106-
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP7]], i64 5)
106+
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP7]], i64 5)
107107
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]]
108108
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
109-
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP9]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
110-
; CHECK-NEXT: [[TMP10:%.*]] = shl <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
109+
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP9]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
110+
; CHECK-NEXT: [[TMP10:%.*]] = shl <vscale x 8 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 1, i64 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)
111111
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]]
112112
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
113-
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
114-
; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 16 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
115-
; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
113+
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP12]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
114+
; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 8 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
115+
; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
116116
; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add i64 0, [[TMP6]]
117117
; CHECK-NEXT: ret void
118118
;

0 commit comments

Comments
 (0)