[LV] add test for low TC under ARM MVE #93181
Conversation
As a rule of thumb, keeping the number of instructions the same, a lower VF is cheaper than a higher one on most targets. ARM MVE is unusual in that 128-bit vectors are cheaper than narrower ones. As preparatory work for optimizing LoopVectorize for low trip counts (TCs) that require either tail-folding or a scalar epilogue, add a test for low TCs under the ARM MVE target.
@llvm/pr-subscribers-llvm-transforms
Author: Ramkumar Ramachandra (artagnon)
Changes: as described above. Patch is 31.87 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/93181.diff 2 Files Affected:
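For orientation, each tripN_i8 function in the patch below is the IR for the same scalar loop, dst[i] += src[i] << 1, with only the trip count varied per function. A minimal C sketch of that source loop (the function name trip_i8 and the constant N here are illustrative, not taken from the patch):

#include <stddef.h>

/* The scalar loop each tripN_i8 test models: dst[i] += src[i] << 1.
 * restrict/const correspond to the noalias/readonly attributes in the
 * IR; N stands for the per-test trip count (1, 3, 5, 8, 16, 24, 32). */
void trip_i8(unsigned char *restrict dst,
             const unsigned char *restrict src) {
  enum { N = 5 }; /* illustrative; each test hard-codes its own N */
  for (size_t i = 0; i < N; i++)
    dst[i] += src[i] << 1;
}

What varies per test is the VF the vectorizer picks: the checks below show, for example, trip count 3 tail-folded at VF 4, trip count 5 tail-folded at VF 8, and trip count 8 handled as a single unmasked VF-8 iteration.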
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/ARM/low-trip-count.ll
new file mode 100644
index 0000000000000..d59dad2a82bd2
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/low-trip-count.ll
@@ -0,0 +1,272 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize,simplifycfg -mtriple=thumbv8.1m -mattr=+mve -S < %s | FileCheck %s
+
+define void @trip1_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
+; CHECK-LABEL: @trip1_i8(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[I_08]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP0]], 1
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[I_08]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP1]]
+; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 1
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08
+ %0 = load i8, ptr %arrayidx, align 1
+ %mul = shl i8 %0, 1
+ %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08
+ %1 = load i8, ptr %arrayidx1, align 1
+ %add = add i8 %mul, %1
+ store i8 %add, ptr %arrayidx1, align 1
+ %inc = add nuw nsw i64 %i.08, 1
+ %exitcond.not = icmp eq i64 %inc, 1
+ br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
+; CHECK-LABEL: @trip3_i8(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 0, 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP0]], i64 3)
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr [[TMP2]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison)
+; CHECK-NEXT: [[TMP3:%.*]] = shl <4 x i8> [[WIDE_MASKED_LOAD]], <i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr [[TMP5]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison)
+; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i8> [[TMP3]], [[WIDE_MASKED_LOAD1]]
+; CHECK-NEXT: call void @llvm.masked.store.v4i8.p0(<4 x i8> [[TMP6]], ptr [[TMP5]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add i64 0, 4
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08
+ %0 = load i8, ptr %arrayidx, align 1
+ %mul = shl i8 %0, 1
+ %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08
+ %1 = load i8, ptr %arrayidx1, align 1
+ %add = add i8 %mul, %1
+ store i8 %add, ptr %arrayidx1, align 1
+ %inc = add nuw nsw i64 %i.08, 1
+ %exitcond.not = icmp eq i64 %inc, 3
+ br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
+; CHECK-LABEL: @trip5_i8(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 0, 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[TMP0]], i64 5)
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP2]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison)
+; CHECK-NEXT: [[TMP3:%.*]] = shl <8 x i8> [[WIDE_MASKED_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP5]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison)
+; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i8> [[TMP3]], [[WIDE_MASKED_LOAD1]]
+; CHECK-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP6]], ptr [[TMP5]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add i64 0, 8
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08
+ %0 = load i8, ptr %arrayidx, align 1
+ %mul = shl i8 %0, 1
+ %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08
+ %1 = load i8, ptr %arrayidx1, align 1
+ %add = add i8 %mul, %1
+ store i8 %add, ptr %arrayidx1, align 1
+ %inc = add nuw nsw i64 %i.08, 1
+ %exitcond.not = icmp eq i64 %inc, 5
+ br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
+; CHECK-LABEL: @trip8_i8(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 0, 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = shl <8 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i8> [[TMP3]], [[WIDE_LOAD1]]
+; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1
+; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add nuw i64 0, 8
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08
+ %0 = load i8, ptr %arrayidx, align 1
+ %mul = shl i8 %0, 1
+ %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08
+ %1 = load i8, ptr %arrayidx1, align 1
+ %add = add i8 %mul, %1
+ store i8 %add, ptr %arrayidx1, align 1
+ %inc = add nuw nsw i64 %i.08, 1
+ %exitcond.not = icmp eq i64 %inc, 8
+ br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
+; CHECK-LABEL: @trip16_i8(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 0, 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = shl <16 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = add <16 x i8> [[TMP3]], [[WIDE_LOAD1]]
+; CHECK-NEXT: store <16 x i8> [[TMP6]], ptr [[TMP5]], align 1
+; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add nuw i64 0, 16
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08
+ %0 = load i8, ptr %arrayidx, align 1
+ %mul = shl i8 %0, 1
+ %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08
+ %1 = load i8, ptr %arrayidx1, align 1
+ %add = add i8 %mul, %1
+ store i8 %add, ptr %arrayidx1, align 1
+ %inc = add nuw nsw i64 %i.08, 1
+ %exitcond.not = icmp eq i64 %inc, 16
+ br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
+; CHECK-LABEL: @trip32_i8(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = shl <16 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = add <16 x i8> [[TMP3]], [[WIDE_LOAD1]]
+; CHECK-NEXT: store <16 x i8> [[TMP6]], ptr [[TMP5]], align 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
+; CHECK-NEXT: br i1 [[TMP7]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08
+ %0 = load i8, ptr %arrayidx, align 1
+ %mul = shl i8 %0, 1
+ %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08
+ %1 = load i8, ptr %arrayidx1, align 1
+ %add = add i8 %mul, %1
+ store i8 %add, ptr %arrayidx1, align 1
+ %inc = add nuw nsw i64 %i.08, 1
+ %exitcond.not = icmp eq i64 %inc, 32
+ br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
+; CHECK-LABEL: @trip24_i8(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = shl <8 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i8> [[TMP3]], [[WIDE_LOAD1]]
+; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
+; CHECK-NEXT: br i1 [[TMP7]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08
+ %0 = load i8, ptr %arrayidx, align 1
+ %mul = shl i8 %0, 1
+ %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08
+ %1 = load i8, ptr %arrayidx1, align 1
+ %add = add i8 %mul, %1
+ store i8 %add, ptr %arrayidx1, align 1
+ %inc = add nuw nsw i64 %i.08, 1
+ %exitcond.not = icmp eq i64 %inc, 24
+ br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
index 7ccbc98d26567..2e631348ea992 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=loop-vectorize -riscv-v-vector-bits-min=128 -scalable-vectorization=on -force-target-instruction-cost=1 -S < %s | FileCheck %s
+; RUN: opt -passes=loop-vectorize,simplifycfg -riscv-v-vector-bits-min=128 -scalable-vectorization=on -force-target-instruction-cost=1 -S < %s | FileCheck %s
target triple = "riscv64"
@@ -45,8 +45,6 @@ for.end: ; preds = %for.body
define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
; CHECK-LABEL: @trip3_i8(
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
@@ -57,10 +55,7 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 0, 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP7]], i64 3)
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
@@ -71,26 +66,7 @@ define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 16 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
-; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP14]], 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
-; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]]
-; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 3
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: for.end:
+; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add i64 0, [[TMP6]]
; CHECK-NEXT: ret void
;
entry:
@@ -116,8 +92,6 @@ for.end: ; preds = %for.body
define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
; CHECK-LABEL: @trip5_i8(
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
@@ -128,10 +102,7 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 0, 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP7]], i64 5)
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
@@ -142,26 +113,7 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 16 x i8> [[TMP10]], [[WIDE_MASKED_LOAD1]]
; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP13]], ptr [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br...
[truncated]
Subsumed by 46541aa3 ([ARM] Add a extra MVE low-trip-count loop. NFC).