Skip to content

Commit 9f888fc

Browse files
committed
[LV] add test for low TC under ARM MVE
As a rule of thumb, for the same number of instructions, a lower vectorization factor (VF) is cheaper than a higher one on most targets. ARM MVE is unusual in that 128-bit vectors are cheaper than narrower ones. As preparatory work for optimizing LoopVectorize for low trip counts (TCs) that require either tail-folding or a scalar epilogue, add a test for low TCs under the ARM MVE target.
1 parent 55e5842 commit 9f888fc

File tree

2 files changed

+285
-152
lines changed

2 files changed

+285
-152
lines changed
Lines changed: 272 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,272 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
; RUN: opt -passes=loop-vectorize,simplifycfg -mtriple=thumbv8.1m -mattr=+mve -S < %s | FileCheck %s
3+
4+
; trip1_i8: i8 loop with a constant trip count of 1.
;
; Input IR: a single-block loop that, for each index i starting at 0, loads
; src[i], shifts it left by one, adds the current value of dst[i], and stores
; the sum back to dst[i]. The induction variable is incremented by 1 and the
; loop exits as soon as the incremented value equals 1, so the body executes
; exactly once.
;
; Expected output (autogenerated FileCheck assertions below): the loop is left
; in scalar form; the same phi/load/shl/load/add/store/compare sequence is
; expected in for.body, with no vector types or masked intrinsics.
define void @trip1_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
5+
; CHECK-LABEL: @trip1_i8(
6+
; CHECK-NEXT: entry:
7+
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
8+
; CHECK: for.body:
9+
; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
10+
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[I_08]]
11+
; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
12+
; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP0]], 1
13+
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[I_08]]
14+
; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
15+
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP1]]
16+
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
17+
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
18+
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 1
19+
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
20+
; CHECK: for.end:
21+
; CHECK-NEXT: ret void
22+
;
23+
entry:
24+
br label %for.body
25+
26+
for.body: ; preds = %entry, %for.body
27+
%i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
28+
%arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08
29+
%0 = load i8, ptr %arrayidx, align 1
30+
%mul = shl i8 %0, 1
31+
%arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08
32+
%1 = load i8, ptr %arrayidx1, align 1
33+
%add = add i8 %mul, %1
34+
store i8 %add, ptr %arrayidx1, align 1
35+
%inc = add nuw nsw i64 %i.08, 1
36+
%exitcond.not = icmp eq i64 %inc, 1
37+
br i1 %exitcond.not, label %for.end, label %for.body
38+
39+
for.end: ; preds = %for.body
40+
ret void
41+
}
42+
43+
; trip3_i8: i8 loop with a constant trip count of 3.
;
; Input IR: the same dst[i] = (src[i] << 1) + dst[i] loop as the other
; functions in this file, exiting once the incremented induction variable
; equals 3, so the body executes three times.
;
; Expected output (autogenerated FileCheck assertions below): no loop remains.
; The entry block holds one straight-line vector iteration on <4 x i8>, using
; masked loads and a masked store whose <4 x i1> mask comes from
; llvm.get.active.lane.mask with base 0 and limit 3, followed directly by
; ret void.
define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
44+
; CHECK-LABEL: @trip3_i8(
45+
; CHECK-NEXT: entry:
46+
; CHECK-NEXT: [[TMP0:%.*]] = add i64 0, 0
47+
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP0]], i64 3)
48+
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
49+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
50+
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr [[TMP2]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison)
51+
; CHECK-NEXT: [[TMP3:%.*]] = shl <4 x i8> [[WIDE_MASKED_LOAD]], <i8 1, i8 1, i8 1, i8 1>
52+
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
53+
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
54+
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr [[TMP5]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison)
55+
; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i8> [[TMP3]], [[WIDE_MASKED_LOAD1]]
56+
; CHECK-NEXT: call void @llvm.masked.store.v4i8.p0(<4 x i8> [[TMP6]], ptr [[TMP5]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]])
57+
; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add i64 0, 4
58+
; CHECK-NEXT: ret void
59+
;
60+
entry:
61+
br label %for.body
62+
63+
for.body: ; preds = %entry, %for.body
64+
%i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
65+
%arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08
66+
%0 = load i8, ptr %arrayidx, align 1
67+
%mul = shl i8 %0, 1
68+
%arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08
69+
%1 = load i8, ptr %arrayidx1, align 1
70+
%add = add i8 %mul, %1
71+
store i8 %add, ptr %arrayidx1, align 1
72+
%inc = add nuw nsw i64 %i.08, 1
73+
%exitcond.not = icmp eq i64 %inc, 3
74+
br i1 %exitcond.not, label %for.end, label %for.body
75+
76+
for.end: ; preds = %for.body
77+
ret void
78+
}
79+
80+
; trip5_i8: i8 loop with a constant trip count of 5.
;
; Input IR: the same dst[i] = (src[i] << 1) + dst[i] loop as the other
; functions in this file, exiting once the incremented induction variable
; equals 5, so the body executes five times.
;
; Expected output (autogenerated FileCheck assertions below): no loop remains.
; The entry block holds one straight-line vector iteration on <8 x i8>, using
; masked loads and a masked store whose <8 x i1> mask comes from
; llvm.get.active.lane.mask with base 0 and limit 5, followed directly by
; ret void.
define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
81+
; CHECK-LABEL: @trip5_i8(
82+
; CHECK-NEXT: entry:
83+
; CHECK-NEXT: [[TMP0:%.*]] = add i64 0, 0
84+
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[TMP0]], i64 5)
85+
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
86+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
87+
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP2]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison)
88+
; CHECK-NEXT: [[TMP3:%.*]] = shl <8 x i8> [[WIDE_MASKED_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
89+
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
90+
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
91+
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP5]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison)
92+
; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i8> [[TMP3]], [[WIDE_MASKED_LOAD1]]
93+
; CHECK-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP6]], ptr [[TMP5]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]])
94+
; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add i64 0, 8
95+
; CHECK-NEXT: ret void
96+
;
97+
entry:
98+
br label %for.body
99+
100+
for.body: ; preds = %entry, %for.body
101+
%i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
102+
%arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08
103+
%0 = load i8, ptr %arrayidx, align 1
104+
%mul = shl i8 %0, 1
105+
%arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08
106+
%1 = load i8, ptr %arrayidx1, align 1
107+
%add = add i8 %mul, %1
108+
store i8 %add, ptr %arrayidx1, align 1
109+
%inc = add nuw nsw i64 %i.08, 1
110+
%exitcond.not = icmp eq i64 %inc, 5
111+
br i1 %exitcond.not, label %for.end, label %for.body
112+
113+
for.end: ; preds = %for.body
114+
ret void
115+
}
116+
117+
; trip8_i8: i8 loop with a constant trip count of 8.
;
; Input IR: the same dst[i] = (src[i] << 1) + dst[i] loop as the other
; functions in this file, exiting once the incremented induction variable
; equals 8, so the body executes eight times.
;
; Expected output (autogenerated FileCheck assertions below): no loop remains.
; The entry block holds one straight-line vector iteration on <8 x i8> using
; plain (unmasked) wide loads and a plain wide store, followed directly by
; ret void.
define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
118+
; CHECK-LABEL: @trip8_i8(
119+
; CHECK-NEXT: entry:
120+
; CHECK-NEXT: [[TMP0:%.*]] = add i64 0, 0
121+
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
122+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
123+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
124+
; CHECK-NEXT: [[TMP3:%.*]] = shl <8 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
125+
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
126+
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
127+
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
128+
; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i8> [[TMP3]], [[WIDE_LOAD1]]
129+
; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1
130+
; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add nuw i64 0, 8
131+
; CHECK-NEXT: ret void
132+
;
133+
entry:
134+
br label %for.body
135+
136+
for.body: ; preds = %entry, %for.body
137+
%i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
138+
%arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08
139+
%0 = load i8, ptr %arrayidx, align 1
140+
%mul = shl i8 %0, 1
141+
%arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08
142+
%1 = load i8, ptr %arrayidx1, align 1
143+
%add = add i8 %mul, %1
144+
store i8 %add, ptr %arrayidx1, align 1
145+
%inc = add nuw nsw i64 %i.08, 1
146+
%exitcond.not = icmp eq i64 %inc, 8
147+
br i1 %exitcond.not, label %for.end, label %for.body
148+
149+
for.end: ; preds = %for.body
150+
ret void
151+
}
152+
153+
; trip16_i8: i8 loop with a constant trip count of 16.
;
; Input IR: the same dst[i] = (src[i] << 1) + dst[i] loop as the other
; functions in this file, exiting once the incremented induction variable
; equals 16, so the body executes sixteen times.
;
; Expected output (autogenerated FileCheck assertions below): no loop remains.
; The entry block holds one straight-line vector iteration on <16 x i8> using
; plain (unmasked) wide loads and a plain wide store, followed directly by
; ret void.
define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
154+
; CHECK-LABEL: @trip16_i8(
155+
; CHECK-NEXT: entry:
156+
; CHECK-NEXT: [[TMP0:%.*]] = add i64 0, 0
157+
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
158+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
159+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
160+
; CHECK-NEXT: [[TMP3:%.*]] = shl <16 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
161+
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
162+
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
163+
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
164+
; CHECK-NEXT: [[TMP6:%.*]] = add <16 x i8> [[TMP3]], [[WIDE_LOAD1]]
165+
; CHECK-NEXT: store <16 x i8> [[TMP6]], ptr [[TMP5]], align 1
166+
; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add nuw i64 0, 16
167+
; CHECK-NEXT: ret void
168+
;
169+
entry:
170+
br label %for.body
171+
172+
for.body: ; preds = %entry, %for.body
173+
%i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
174+
%arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08
175+
%0 = load i8, ptr %arrayidx, align 1
176+
%mul = shl i8 %0, 1
177+
%arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08
178+
%1 = load i8, ptr %arrayidx1, align 1
179+
%add = add i8 %mul, %1
180+
store i8 %add, ptr %arrayidx1, align 1
181+
%inc = add nuw nsw i64 %i.08, 1
182+
%exitcond.not = icmp eq i64 %inc, 16
183+
br i1 %exitcond.not, label %for.end, label %for.body
184+
185+
for.end: ; preds = %for.body
186+
ret void
187+
}
188+
189+
190+
; trip32_i8: i8 loop with a constant trip count of 32.
;
; Input IR: the same dst[i] = (src[i] << 1) + dst[i] loop as the other
; functions in this file, exiting once the incremented induction variable
; equals 32, so the body executes thirty-two times.
;
; Expected output (autogenerated FileCheck assertions below): a vector.body
; loop operating on <16 x i8> with plain (unmasked) wide loads and a plain
; wide store. The vector index advances by 16 per iteration and the loop
; exits when it reaches 32; the back-edge branch carries !llvm.loop metadata.
define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
191+
; CHECK-LABEL: @trip32_i8(
192+
; CHECK-NEXT: entry:
193+
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
194+
; CHECK: vector.body:
195+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
196+
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
197+
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
198+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
199+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
200+
; CHECK-NEXT: [[TMP3:%.*]] = shl <16 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
201+
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
202+
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
203+
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
204+
; CHECK-NEXT: [[TMP6:%.*]] = add <16 x i8> [[TMP3]], [[WIDE_LOAD1]]
205+
; CHECK-NEXT: store <16 x i8> [[TMP6]], ptr [[TMP5]], align 1
206+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
207+
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
208+
; CHECK-NEXT: br i1 [[TMP7]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
209+
; CHECK: for.end:
210+
; CHECK-NEXT: ret void
211+
;
212+
entry:
213+
br label %for.body
214+
215+
for.body: ; preds = %entry, %for.body
216+
%i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
217+
%arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08
218+
%0 = load i8, ptr %arrayidx, align 1
219+
%mul = shl i8 %0, 1
220+
%arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08
221+
%1 = load i8, ptr %arrayidx1, align 1
222+
%add = add i8 %mul, %1
223+
store i8 %add, ptr %arrayidx1, align 1
224+
%inc = add nuw nsw i64 %i.08, 1
225+
%exitcond.not = icmp eq i64 %inc, 32
226+
br i1 %exitcond.not, label %for.end, label %for.body
227+
228+
for.end: ; preds = %for.body
229+
ret void
230+
}
231+
232+
; trip24_i8: i8 loop with a constant trip count of 24.
;
; Input IR: the same dst[i] = (src[i] << 1) + dst[i] loop as the other
; functions in this file, exiting once the incremented induction variable
; equals 24, so the body executes twenty-four times.
;
; Expected output (autogenerated FileCheck assertions below): a vector.body
; loop operating on <8 x i8> with plain (unmasked) wide loads and a plain
; wide store. The vector index advances by 8 per iteration and the loop
; exits when it reaches 24; the back-edge branch carries !llvm.loop metadata.
define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
233+
; CHECK-LABEL: @trip24_i8(
234+
; CHECK-NEXT: entry:
235+
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
236+
; CHECK: vector.body:
237+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
238+
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
239+
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]]
240+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
241+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
242+
; CHECK-NEXT: [[TMP3:%.*]] = shl <8 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
243+
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
244+
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0
245+
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
246+
; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i8> [[TMP3]], [[WIDE_LOAD1]]
247+
; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1
248+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
249+
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
250+
; CHECK-NEXT: br i1 [[TMP7]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
251+
; CHECK: for.end:
252+
; CHECK-NEXT: ret void
253+
;
254+
entry:
255+
br label %for.body
256+
257+
for.body: ; preds = %entry, %for.body
258+
%i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
259+
%arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08
260+
%0 = load i8, ptr %arrayidx, align 1
261+
%mul = shl i8 %0, 1
262+
%arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08
263+
%1 = load i8, ptr %arrayidx1, align 1
264+
%add = add i8 %mul, %1
265+
store i8 %add, ptr %arrayidx1, align 1
266+
%inc = add nuw nsw i64 %i.08, 1
267+
%exitcond.not = icmp eq i64 %inc, 24
268+
br i1 %exitcond.not, label %for.end, label %for.body
269+
270+
for.end: ; preds = %for.body
271+
ret void
272+
}

0 commit comments

Comments
 (0)