
Commit 325a1de

Sam Tebbs authored and Will Deacon committed
arm64: Import updated version of Cortex Strings' strlen
Import an updated version of the former Cortex Strings - now Arm
Optimized Routines - strlen function. The latest version introduces
Advanced SIMD usage which rules it out for our purposes, but we can
still pick an intermediate improvement from the previous version,
namely string/aarch64/strlen.S at commit 98e4d6a from
https://github.com/ARM-software/optimized-routines

Note that for simplicity Arm have chosen to contribute this code
to Linux under GPLv2 rather than the original MIT license.

Signed-off-by: Sam Tebbs <[email protected]>
[ rm: update attribution and commit message ]
Signed-off-by: Robin Murphy <[email protected]>
Link: https://lore.kernel.org/r/32e3489398a24b23ae6e996935ac4818f8fd9dfd.1622128527.git.robin.murphy@arm.com
Signed-off-by: Will Deacon <[email protected]>
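As a rough illustration of the word-at-a-time NUL detection that both the old and new versions of the routine build on, the two checks described in the diff's comments can be sketched in C along these lines (an informal sketch only; the helper names are ours, not part of the kernel):

```c
#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL
#define REP8_80 0x8080808080808080ULL

/* Accurate check: non-zero iff some byte of x is zero. Bit 7 of each
   zero byte is set; borrows from the subtraction can also set bits
   above the first zero byte, which is why the big-endian path in the
   assembly byte-swaps the data and recomputes the syndrome. */
static inline uint64_t has_nul(uint64_t x)
{
	return (x - REP8_01) & ~(x | REP8_7f);
}

/* Faster check used in the new 32-bytes-per-iteration main loop:
   zero when every byte is a non-NUL ASCII character, but gives false
   hits for byte values 129..255, so a hit must be confirmed with
   has_nul() above. */
static inline uint64_t has_nul_fast(uint64_t x)
{
	return (x - REP8_01) & REP8_80;
}
```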
1 parent 758602c commit 325a1de

File tree: 1 file changed, +173 −85 lines


arch/arm64/lib/strlen.S

Lines changed: 173 additions & 85 deletions
@@ -1,115 +1,203 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
+ * Copyright (c) 2013, Arm Limited.
  *
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
- *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
+ * Adapted from the original at:
+ * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/strlen.S
  */
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 
-/*
- * calculate the length of a string
+/* Assumptions:
  *
- * Parameters:
- *	x0 - const string pointer
- * Returns:
- *	x0 - the return length of specific string
+ * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
  */
 
+#define L(label) .L ## label
+
 /* Arguments and results.  */
-srcin		.req	x0
-len		.req	x0
+#define srcin		x0
+#define len		x0
 
 /* Locals and temporaries.  */
-src		.req	x1
-data1		.req	x2
-data2		.req	x3
-data2a		.req	x4
-has_nul1	.req	x5
-has_nul2	.req	x6
-tmp1		.req	x7
-tmp2		.req	x8
-tmp3		.req	x9
-tmp4		.req	x10
-zeroones	.req	x11
-pos		.req	x12
+#define src		x1
+#define data1		x2
+#define data2		x3
+#define has_nul1	x4
+#define has_nul2	x5
+#define tmp1		x4
+#define tmp2		x5
+#define tmp3		x6
+#define tmp4		x7
+#define zeroones	x8
+
+	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	   can be done in parallel across the entire word.  A faster check
+	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
+	   false hits for characters 129..255.  */
 
 #define REP8_01 0x0101010101010101
 #define REP8_7f 0x7f7f7f7f7f7f7f7f
 #define REP8_80 0x8080808080808080
 
+#define MIN_PAGE_SIZE 4096
+
+	/* Since strings are short on average, we check the first 16 bytes
+	   of the string for a NUL character.  In order to do an unaligned ldp
+	   safely we have to do a page cross check first.  If there is a NUL
+	   byte we calculate the length from the 2 8-byte words using
+	   conditional select to reduce branch mispredictions (it is unlikely
+	   strlen will be repeatedly called on strings with the same length).
+
+	   If the string is longer than 16 bytes, we align src so don't need
+	   further page cross checks, and process 32 bytes per iteration
+	   using the fast NUL check.  If we encounter non-ASCII characters,
+	   fallback to a second loop using the full NUL check.
+
+	   If the page cross check fails, we read 16 bytes from an aligned
+	   address, remove any characters before the string, and continue
+	   in the main loop using aligned loads.  Since strings crossing a
+	   page in the first 16 bytes are rare (probability of
+	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
+
+	   AArch64 systems have a minimum page size of 4k.  We don't bother
+	   checking for larger page sizes - the cost of setting up the correct
+	   page size is just not worth the extra gain from a small reduction in
+	   the cases taking the slow path.  Note that we only care about
+	   whether the first fetch, which may be misaligned, crosses a page
+	   boundary.  */
+
 SYM_FUNC_START_WEAK_PI(strlen)
-	mov	zeroones, #REP8_01
-	bic	src, srcin, #15
-	ands	tmp1, srcin, #15
-	b.ne	.Lmisaligned
-	/*
-	 * NUL detection works on the principle that (X - 1) & (~X) & 0x80
-	 * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
-	 * can be done in parallel across the entire word.
-	 */
-	/*
-	 * The inner loop deals with two Dwords at a time.  This has a
-	 * slightly higher start-up cost, but we should win quite quickly,
-	 * especially on cores with a high number of issue slots per
-	 * cycle, as we get much better parallelism out of the operations.
-	 */
-.Lloop:
-	ldp	data1, data2, [src], #16
-.Lrealigned:
+	and	tmp1, srcin, MIN_PAGE_SIZE - 1
+	mov	zeroones, REP8_01
+	cmp	tmp1, MIN_PAGE_SIZE - 16
+	b.gt	L(page_cross)
+	ldp	data1, data2, [srcin]
+#ifdef __AARCH64EB__
+	/* For big-endian, carry propagation (if the final byte in the
+	   string is 0x01) means we cannot use has_nul1/2 directly.
+	   Since we expect strings to be small and early-exit,
+	   byte-swap the data now so has_null1/2 will be correct.  */
+	rev	data1, data1
+	rev	data2, data2
+#endif
 	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
+	orr	tmp2, data1, REP8_7f
 	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, #REP8_7f
-	bic	has_nul1, tmp1, tmp2
-	bics	has_nul2, tmp3, tmp4
-	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
-	b.eq	.Lloop
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(main_loop_entry)
+
+	/* Enter with C = has_nul1 == 0.  */
+	csel	has_nul1, has_nul1, has_nul2, cc
+	mov	len, 8
+	rev	has_nul1, has_nul1
+	clz	tmp1, has_nul1
+	csel	len, xzr, len, cc
+	add	len, len, tmp1, lsr 3
+	ret
 
+	/* The inner loop processes 32 bytes per iteration and uses the fast
+	   NUL check.  If we encounter non-ASCII characters, use a second
+	   loop with the accurate NUL check.  */
+	.p2align 4
+L(main_loop_entry):
+	bic	src, srcin, 15
+	sub	src, src, 16
+L(main_loop):
+	ldp	data1, data2, [src, 32]!
+L(page_cross_entry):
+	sub	tmp1, data1, zeroones
+	sub	tmp3, data2, zeroones
+	orr	tmp2, tmp1, tmp3
+	tst	tmp2, zeroones, lsl 7
+	bne	1f
+	ldp	data1, data2, [src, 16]
+	sub	tmp1, data1, zeroones
+	sub	tmp3, data2, zeroones
+	orr	tmp2, tmp1, tmp3
+	tst	tmp2, zeroones, lsl 7
+	beq	L(main_loop)
+	add	src, src, 16
+1:
+	/* The fast check failed, so do the slower, accurate NUL check.  */
+	orr	tmp2, data1, REP8_7f
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(nonascii_loop)
+
+	/* Enter with C = has_nul1 == 0.  */
+L(tail):
+#ifdef __AARCH64EB__
+	/* For big-endian, carry propagation (if the final byte in the
+	   string is 0x01) means we cannot use has_nul1/2 directly.  The
+	   easiest way to get the correct byte is to byte-swap the data
+	   and calculate the syndrome a second time.  */
+	csel	data1, data1, data2, cc
+	rev	data1, data1
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	bic	has_nul1, tmp1, tmp2
+#else
+	csel	has_nul1, has_nul1, has_nul2, cc
+#endif
 	sub	len, src, srcin
-	cbz	has_nul1, .Lnul_in_data2
-CPU_BE(	mov	data2, data1 )	/*prepare data to re-calculate the syndrome*/
-	sub	len, len, #8
-	mov	has_nul2, has_nul1
-.Lnul_in_data2:
-	/*
-	 * For big-endian, carry propagation (if the final byte in the
-	 * string is 0x01) means we cannot use has_nul directly.  The
-	 * easiest way to get the correct byte is to byte-swap the data
-	 * and calculate the syndrome a second time.
-	 */
-CPU_BE( rev	data2, data2 )
-CPU_BE( sub	tmp1, data2, zeroones )
-CPU_BE( orr	tmp2, data2, #REP8_7f )
-CPU_BE( bic	has_nul2, tmp1, tmp2 )
-
-	sub	len, len, #8
-	rev	has_nul2, has_nul2
-	clz	pos, has_nul2
-	add	len, len, pos, lsr #3	/* Bits to bytes.  */
+	rev	has_nul1, has_nul1
+	add	tmp2, len, 8
+	clz	tmp1, has_nul1
+	csel	len, len, tmp2, cc
+	add	len, len, tmp1, lsr 3
 	ret
 
-.Lmisaligned:
-	cmp	tmp1, #8
-	neg	tmp1, tmp1
-	ldp	data1, data2, [src], #16
-	lsl	tmp1, tmp1, #3	/* Bytes beyond alignment -> bits.  */
-	mov	tmp2, #~0
-	/* Big-endian.  Early bytes are at MSB.  */
-CPU_BE( lsl	tmp2, tmp2, tmp1 )	/* Shift (tmp1 & 63).  */
+L(nonascii_loop):
+	ldp	data1, data2, [src, 16]!
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	bne	L(tail)
+	ldp	data1, data2, [src, 16]!
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(nonascii_loop)
+	b	L(tail)
+
+	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
+	   srcin to 0x7f, so we ignore any NUL bytes before the string.
+	   Then continue in the aligned loop.  */
+L(page_cross):
+	bic	src, srcin, 15
+	ldp	data1, data2, [src]
+	lsl	tmp1, srcin, 3
+	mov	tmp4, -1
+#ifdef __AARCH64EB__
+	/* Big-endian.  Early bytes are at MSB.  */
+	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
+#else
 	/* Little-endian.  Early bytes are at LSB.  */
-CPU_LE( lsr	tmp2, tmp2, tmp1 )	/* Shift (tmp1 & 63).  */
+	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
+#endif
+	orr	tmp1, tmp1, REP8_80
+	orn	data1, data1, tmp1
+	orn	tmp2, data2, tmp1
+	tst	srcin, 8
+	csel	data1, data1, tmp4, eq
+	csel	data2, data2, tmp2, eq
+	b	L(page_cross_entry)
 
-	orr	data1, data1, tmp2
-	orr	data2a, data2, tmp2
-	csinv	data1, data1, xzr, le
-	csel	data2, data2, data2a, le
-	b	.Lrealigned
 SYM_FUNC_END_PI(strlen)
 EXPORT_SYMBOL_NOKASAN(strlen)
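The page-cross test and the final length computation are the least obvious parts of the new routine. A rough user-space rendering of both (our own names, GCC/Clang builtins assumed, little-endian case only) could look like this:

```c
#include <stddef.h>
#include <stdint.h>

#define MIN_PAGE_SIZE 4096

/* The unaligned 16-byte ldp at function entry is safe only if it
   cannot cross a page, i.e. if the offset of s within a minimum-sized
   page is at most MIN_PAGE_SIZE - 16.  This mirrors the and/cmp/b.gt
   entry sequence in the assembly. */
static int may_cross_page(const char *s)
{
	return ((uintptr_t)s & (MIN_PAGE_SIZE - 1)) > MIN_PAGE_SIZE - 16;
}

/* Given a non-zero NUL-detection syndrome for a little-endian load,
   recover the index of the first zero byte exactly as the assembly's
   rev + clz + lsr #3 does: byte-reverse so the first zero byte
   supplies the most significant set bit, count leading zeros, and
   convert bits to bytes.  Precondition: syndrome != 0. */
static size_t first_nul_index(uint64_t syndrome)
{
	return (size_t)__builtin_clzll(__builtin_bswap64(syndrome)) >> 3;
}
```

With these two pieces, the shape of the routine is: when the first fetch cannot cross a page, test the first 16 bytes directly; otherwise take the L(page_cross) path with an aligned load. Once a syndrome is found, first_nul_index() plus the offset of the 8-byte word that produced it gives the length.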
