+ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -opaque-pointers=0 -O3 -mtriple=thumb-eabi -mcpu=cortex-a9 %s -o - | FileCheck %s -check-prefix=A9

; @simple is the most basic chain of address induction variables. Chaining
; saves at least one register and avoids complex addressing and setup
; code.
;
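; (Editorial sketch, not part of the test or this diff: a plausible C source for
; @simple, inferred from the generated code below; the exact source is an
; assumption.)
;   int simple(int *a, int *b, int x) {
;     int s = 0;
;     do {
;       s += a[0] + a[x] + a[2*x] + a[3*x]; /* four loads share one chained IV */
;       a += 4*x;                           /* single pointer bump per iteration */
;     } while (a != b);
;     return s;
;   }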
- ; A9: @simple
; no expensive address computation in the preheader
- ; A9: lsl
- ; A9-NOT: lsl
- ; A9: %loop
; no complex address modes
- ; A9-NOT: lsl
define i32 @simple(i32* %a, i32* %b, i32 %x) nounwind {
+ ; A9-LABEL: simple:
+ ; A9: @ %bb.0: @ %entry
+ ; A9-NEXT: .save {r4, r5, r6, lr}
+ ; A9-NEXT: push {r4, r5, r6, lr}
+ ; A9-NEXT: mov r3, r0
+ ; A9-NEXT: lsls r2, r2, #2
+ ; A9-NEXT: movs r0, #0
+ ; A9-NEXT: .LBB0_1: @ %loop
+ ; A9-NEXT: @ =>This Inner Loop Header: Depth=1
+ ; A9-NEXT: add.w lr, r3, r2
+ ; A9-NEXT: ldr.w r12, [r3, r2]
+ ; A9-NEXT: ldr r3, [r3]
+ ; A9-NEXT: add.w r4, lr, r2
+ ; A9-NEXT: ldr.w r6, [lr, r2]
+ ; A9-NEXT: add r0, r3
+ ; A9-NEXT: adds r3, r4, r2
+ ; A9-NEXT: add r0, r12
+ ; A9-NEXT: ldr r5, [r4, r2]
+ ; A9-NEXT: add r0, r6
+ ; A9-NEXT: add r3, r2
+ ; A9-NEXT: add r0, r5
+ ; A9-NEXT: cmp r3, r1
+ ; A9-NEXT: bne .LBB0_1
+ ; A9-NEXT: @ %bb.2: @ %exit
+ ; A9-NEXT: pop {r4, r5, r6, pc}
entry:
  br label %loop
loop:
@@ -37,15 +58,34 @@ exit:

; @user is not currently chained because the IV is live across memory ops.
;
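; (Editorial sketch, not part of the test or this diff: a plausible C source for
; @user, inferred from the generated code below; the store through the base
; pointer is what keeps the IV live across memory ops.)
;   int user(int *a, int *b, int x) {
;     int s = 0;
;     do {
;       s += a[0] + a[x] + a[2*x] + a[3*x];
;       *a = s;       /* base pointer reused for the store, so it stays live */
;       a += 4*x;
;     } while (a != b);
;     return s;
;   }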
- ; A9: @user
; stride multiples computed in the preheader
- ; A9: lsl
- ; A9: lsl
- ; A9: %loop
; complex address modes
- ; A9: lsl
- ; A9: lsl
define i32 @user(i32* %a, i32* %b, i32 %x) nounwind {
+ ; A9-LABEL: user:
+ ; A9: @ %bb.0: @ %entry
+ ; A9-NEXT: .save {r4, r5, r6, r7, lr}
+ ; A9-NEXT: push {r4, r5, r6, r7, lr}
+ ; A9-NEXT: add.w r3, r2, r2, lsl #1
+ ; A9-NEXT: lsl.w r12, r2, #4
+ ; A9-NEXT: lsl.w lr, r3, #2
+ ; A9-NEXT: movs r3, #0
+ ; A9-NEXT: .LBB1_1: @ %loop
+ ; A9-NEXT: @ =>This Inner Loop Header: Depth=1
+ ; A9-NEXT: ldr r4, [r0]
+ ; A9-NEXT: ldr.w r5, [r0, r2, lsl #3]
+ ; A9-NEXT: ldr.w r6, [r0, r2, lsl #2]
+ ; A9-NEXT: add r3, r4
+ ; A9-NEXT: ldr.w r7, [r0, lr]
+ ; A9-NEXT: add r3, r6
+ ; A9-NEXT: add r3, r5
+ ; A9-NEXT: add r3, r7
+ ; A9-NEXT: str r3, [r0]
+ ; A9-NEXT: add r0, r12
+ ; A9-NEXT: cmp r0, r1
+ ; A9-NEXT: bne .LBB1_1
+ ; A9-NEXT: @ %bb.2: @ %exit
+ ; A9-NEXT: mov r0, r3
+ ; A9-NEXT: pop {r4, r5, r6, r7, pc}
entry:
  br label %loop
loop:
@@ -75,16 +115,43 @@ exit:
; used to do, and exactly what we don't want to do. LSR's new IV
; chaining feature should now undo the damage.
;
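; (Editorial sketch, not part of the test or this diff: roughly the loop this IR
; walks, inferred from the generated code below; parameter roles and the exact
; pointer arithmetic are assumptions.)
;   void extrastride(char *main, int main_stride, int *res, int x, int y, int z) {
;     for (int i = 0; i < z; ++i) {
;       char *p = main;
;       int s = 0;
;       for (int j = 0; j < 5; ++j) { s += *(int *)p; p += main_stride; }
;       *res = s;
;       res += y;
;       main = p + x;   /* the extra stride beyond the chained steps */
;     }
;   }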
- ; A9: extrastride:
; no spills
- ; A9-NOT: str
; only one stride multiple in the preheader
- ; A9: lsl
- ; A9-NOT: {{str r|lsl}}
- ; A9: %for.body{{$}}
; no complex address modes or reloads
- ; A9-NOT: {{ldr .*[sp]|lsl}}
define void @extrastride(i8* nocapture %main, i32 %main_stride, i32* nocapture %res, i32 %x, i32 %y, i32 %z) nounwind {
+ ; A9-LABEL: extrastride:
+ ; A9: @ %bb.0: @ %entry
+ ; A9-NEXT: .save {r4, r5, r6, r7, lr}
+ ; A9-NEXT: push {r4, r5, r6, r7, lr}
+ ; A9-NEXT: ldr.w r12, [sp, #24]
+ ; A9-NEXT: cmp.w r12, #0
+ ; A9-NEXT: beq .LBB2_3
+ ; A9-NEXT: @ %bb.1: @ %for.body.lr.ph
+ ; A9-NEXT: ldr r4, [sp, #20]
+ ; A9-NEXT: add.w lr, r3, r1
+ ; A9-NEXT: lsls r3, r4, #2
+ ; A9-NEXT: .LBB2_2: @ %for.body
+ ; A9-NEXT: @ =>This Inner Loop Header: Depth=1
+ ; A9-NEXT: adds r5, r0, r1
+ ; A9-NEXT: ldr r4, [r0, r1]
+ ; A9-NEXT: ldr r0, [r0]
+ ; A9-NEXT: subs.w r12, r12, #1
+ ; A9-NEXT: ldr r6, [r5, r1]
+ ; A9-NEXT: add r5, r1
+ ; A9-NEXT: add r0, r4
+ ; A9-NEXT: ldr r7, [r5, r1]
+ ; A9-NEXT: add r5, r1
+ ; A9-NEXT: add r0, r6
+ ; A9-NEXT: ldr r4, [r5, r1]
+ ; A9-NEXT: add r0, r7
+ ; A9-NEXT: add r0, r4
+ ; A9-NEXT: str r0, [r2]
+ ; A9-NEXT: add.w r0, r5, r1
+ ; A9-NEXT: add r2, r3
+ ; A9-NEXT: add r0, lr
+ ; A9-NEXT: bne .LBB2_2
+ ; A9-NEXT: .LBB2_3: @ %for.end
+ ; A9-NEXT: pop {r4, r5, r6, r7, pc}
entry:
  %cmp8 = icmp eq i32 %z, 0
  br i1 %cmp8, label %for.end, label %for.body.lr.ph
@@ -136,10 +203,38 @@ for.end: ; preds = %for.body, %entry
; }
; where 's' can be folded into the addressing mode.
; Consequently, we should *not* form any chains.
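; (Editorial note, not part of the test or this diff.) A folded index means the
; constant offset is absorbed into the memory operation itself, for example the
; form checked below:
;   ldrb r3, [r3, #3]   @ a[i+3] with no separate address computation
; so chaining the induction variable would buy nothing here.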
- ;
- ; A9: foldedidx:
- ; A9: ldrb{{(.w)?}} {{r[0-9]|lr}}, [{{r[0-9]|lr}}, #3]
define void @foldedidx(i8* nocapture %a, i8* nocapture %b, i8* nocapture %c) nounwind ssp {
+ ; A9-LABEL: foldedidx:
+ ; A9: @ %bb.0: @ %entry
+ ; A9-NEXT: .save {r4, r5, r6, lr}
+ ; A9-NEXT: push {r4, r5, r6, lr}
+ ; A9-NEXT: mov.w lr, #0
+ ; A9-NEXT: .LBB3_1: @ %for.body
+ ; A9-NEXT: @ =>This Inner Loop Header: Depth=1
+ ; A9-NEXT: ldrb.w r12, [r0, lr]
+ ; A9-NEXT: add.w r4, r1, lr
+ ; A9-NEXT: ldrb.w r3, [r1, lr]
+ ; A9-NEXT: add r3, r12
+ ; A9-NEXT: strb.w r3, [r2, lr]
+ ; A9-NEXT: add.w r3, r0, lr
+ ; A9-NEXT: ldrb.w r12, [r3, #1]
+ ; A9-NEXT: ldrb r5, [r4, #1]
+ ; A9-NEXT: add r12, r5
+ ; A9-NEXT: add.w r5, r2, lr
+ ; A9-NEXT: strb.w r12, [r5, #1]
+ ; A9-NEXT: add.w lr, lr, #4
+ ; A9-NEXT: cmp.w lr, #400
+ ; A9-NEXT: ldrb.w r12, [r3, #2]
+ ; A9-NEXT: ldrb r6, [r4, #2]
+ ; A9-NEXT: add r6, r12
+ ; A9-NEXT: strb r6, [r5, #2]
+ ; A9-NEXT: ldrb r3, [r3, #3]
+ ; A9-NEXT: ldrb r6, [r4, #3]
+ ; A9-NEXT: add r3, r6
+ ; A9-NEXT: strb r3, [r5, #3]
+ ; A9-NEXT: bne .LBB3_1
+ ; A9-NEXT: @ %bb.2: @ %for.end
+ ; A9-NEXT: pop {r4, r5, r6, pc}
entry:
  br label %for.body

@@ -200,14 +295,45 @@ for.end: ; preds = %for.body
;
; Loads and stores should use post-increment addressing, no add's or add.w's.
; Most importantly, there should be no spills or reloads!
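; (Editorial note, not part of the test or this diff.) Post-increment addressing
; folds the pointer bump into the memory operation, for example the form checked
; below:
;   vld1.64 {d18}, [r0], r1   @ load from [r0], then r0 += r1
; instead of a plain vld1.64 followed by a separate add of r1.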
- ;
- ; A9: testNeon:
- ; A9: %.lr.ph
- ; A9-NOT: lsl.w
- ; A9-NOT: {{ldr|str|adds|add r}}
- ; A9-NOT: add.w r
- ; A9: bne
define hidden void @testNeon(i8* %ref_data, i32 %ref_stride, i32 %limit, <16 x i8>* nocapture %data) nounwind optsize {
+ ; A9-LABEL: testNeon:
+ ; A9: @ %bb.0:
+ ; A9-NEXT: .save {r4, r5, r7, lr}
+ ; A9-NEXT: push {r4, r5, r7, lr}
+ ; A9-NEXT: vmov.i32 q8, #0x0
+ ; A9-NEXT: cmp r2, #1
+ ; A9-NEXT: blt .LBB4_4
+ ; A9-NEXT: @ %bb.1: @ %.lr.ph
+ ; A9-NEXT: movs r5, #0
+ ; A9-NEXT: movw r4, #64464
+ ; A9-NEXT: sub.w r12, r5, r2, lsl #6
+ ; A9-NEXT: sub.w lr, r1, r1, lsl #4
+ ; A9-NEXT: movt r4, #65535
+ ; A9-NEXT: mov r5, r3
+ ; A9-NEXT: .LBB4_2: @ =>This Inner Loop Header: Depth=1
+ ; A9-NEXT: vld1.64 {d18}, [r0], r1
+ ; A9-NEXT: subs r2, #1
+ ; A9-NEXT: vld1.64 {d19}, [r0], r1
+ ; A9-NEXT: vst1.8 {d18, d19}, [r5]!
+ ; A9-NEXT: vld1.64 {d20}, [r0], r1
+ ; A9-NEXT: vld1.64 {d21}, [r0], r1
+ ; A9-NEXT: vst1.8 {d20, d21}, [r5]!
+ ; A9-NEXT: vld1.64 {d22}, [r0], r1
+ ; A9-NEXT: vadd.i8 q9, q9, q10
+ ; A9-NEXT: vld1.64 {d23}, [r0], r1
+ ; A9-NEXT: vst1.8 {d22, d23}, [r5]!
+ ; A9-NEXT: vld1.64 {d20}, [r0], r1
+ ; A9-NEXT: vadd.i8 q9, q9, q11
+ ; A9-NEXT: vld1.64 {d21}, [r0], lr
+ ; A9-NEXT: vadd.i8 q9, q9, q10
+ ; A9-NEXT: vadd.i8 q8, q8, q9
+ ; A9-NEXT: vst1.8 {d20, d21}, [r5], r4
+ ; A9-NEXT: bne .LBB4_2
+ ; A9-NEXT: @ %bb.3: @ %._crit_edge
+ ; A9-NEXT: add.w r3, r3, r12, lsl #4
+ ; A9-NEXT: .LBB4_4:
+ ; A9-NEXT: vst1.32 {d16, d17}, [r3]
+ ; A9-NEXT: pop {r4, r5, r7, pc}
  %1 = icmp sgt i32 %limit, 0
  br i1 %1, label %.lr.ph, label %45

@@ -284,24 +410,41 @@ declare <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8*, i32) nounwind readonly
; Handle chains in which the same offset is used for both loads and
; stores to the same array.
; rdar://11410078.
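; (Editorial note, not part of the test or this diff.) With the base register
; chained once, each load and each later store can use the same post-incremented
; pointer, matching the pattern checked below:
;   vld1.8 {d16}, [r0], r1   @ read a row, then r0 += stride
;   ...
;   vst1.8 {d16}, [r0], r1   @ write back through the same chained base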
- ;
- ; A9: @testReuse
- ; A9: %for.body
- ; A9: vld1.8 {d{{[0-9]+}}}, [[BASE:[r[0-9]+]]], [[INC:r[0-9]]]
- ; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
- ; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
- ; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
- ; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
- ; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
- ; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
- ; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
- ; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
- ; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
- ; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
- ; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
- ; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]]
- ; A9: bne
define void @testReuse(i8* %src, i32 %stride) nounwind ssp {
+ ; A9-LABEL: testReuse:
+ ; A9: @ %bb.0: @ %entry
+ ; A9-NEXT: sub.w r12, r0, r1, lsl #2
+ ; A9-NEXT: sub.w r0, r1, r1, lsl #2
+ ; A9-NEXT: lsls r2, r0, #1
+ ; A9-NEXT: movs r3, #0
+ ; A9-NEXT: .LBB5_1: @ %for.body
+ ; A9-NEXT: @ =>This Inner Loop Header: Depth=1
+ ; A9-NEXT: add.w r0, r12, r3
+ ; A9-NEXT: adds r3, #8
+ ; A9-NEXT: vld1.8 {d16}, [r0], r1
+ ; A9-NEXT: cmp r3, #32
+ ; A9-NEXT: vld1.8 {d17}, [r0], r1
+ ; A9-NEXT: vhadd.u8 d16, d16, d17
+ ; A9-NEXT: vld1.8 {d18}, [r0], r1
+ ; A9-NEXT: vhadd.u8 d17, d17, d18
+ ; A9-NEXT: vld1.8 {d19}, [r0], r1
+ ; A9-NEXT: vhadd.u8 d18, d18, d19
+ ; A9-NEXT: vld1.8 {d20}, [r0], r1
+ ; A9-NEXT: vhadd.u8 d19, d19, d20
+ ; A9-NEXT: vld1.8 {d21}, [r0], r1
+ ; A9-NEXT: vhadd.u8 d20, d20, d21
+ ; A9-NEXT: vld1.8 {d22}, [r0], r1
+ ; A9-NEXT: vhadd.u8 d21, d21, d22
+ ; A9-NEXT: vld1.8 {d23}, [r0], r2
+ ; A9-NEXT: vst1.8 {d16}, [r0], r1
+ ; A9-NEXT: vst1.8 {d17}, [r0], r1
+ ; A9-NEXT: vst1.8 {d18}, [r0], r1
+ ; A9-NEXT: vst1.8 {d19}, [r0], r1
+ ; A9-NEXT: vst1.8 {d20}, [r0], r1
+ ; A9-NEXT: vst1.8 {d21}, [r0]
+ ; A9-NEXT: bne .LBB5_1
+ ; A9-NEXT: @ %bb.2: @ %for.end
+ ; A9-NEXT: bx lr
entry:
  %mul = shl nsw i32 %stride, 2
  %idx.neg = sub i32 0, %mul