|
1 | 1 | /* SPDX-License-Identifier: GPL-2.0-only */
|
2 | 2 | /*
|
3 |
| - * Copyright (C) 2013 ARM Ltd. |
4 |
| - * Copyright (C) 2013 Linaro. |
| 3 | + * Copyright (c) 2013, Arm Limited. |
5 | 4 | *
|
6 |
| - * This code is based on glibc cortex strings work originally authored by Linaro |
7 |
| - * be found @ |
8 |
| - * |
9 |
| - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ |
10 |
| - * files/head:/src/aarch64/ |
| 5 | + * Adapted from the original at: |
| 6 | + * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/strlen.S |
11 | 7 | */
|
12 | 8 |
|
13 | 9 | #include <linux/linkage.h>
|
14 | 10 | #include <asm/assembler.h>
|
15 | 11 |
|
16 |
| -/* |
17 |
| - * calculate the length of a string |
| 12 | +/* Assumptions: |
18 | 13 | *
|
19 |
| - * Parameters: |
20 |
| - * x0 - const string pointer |
21 |
| - * Returns: |
22 |
| - * x0 - the return length of specific string |
| 14 | + * ARMv8-a, AArch64, unaligned accesses, min page size 4k. |
23 | 15 | */
|
24 | 16 |
|
| 17 | +#define L(label) .L ## label |
| 18 | + |
25 | 19 | /* Arguments and results. */
|
26 |
| -srcin .req x0 |
27 |
| -len .req x0 |
| 20 | +#define srcin x0 |
| 21 | +#define len x0 |
28 | 22 |
|
29 | 23 | /* Locals and temporaries. */
|
30 |
| -src .req x1 |
31 |
| -data1 .req x2 |
32 |
| -data2 .req x3 |
33 |
| -data2a .req x4 |
34 |
| -has_nul1 .req x5 |
35 |
| -has_nul2 .req x6 |
36 |
| -tmp1 .req x7 |
37 |
| -tmp2 .req x8 |
38 |
| -tmp3 .req x9 |
39 |
| -tmp4 .req x10 |
40 |
| -zeroones .req x11 |
41 |
| -pos .req x12 |
| 24 | +#define src x1 |
| 25 | +#define data1 x2 |
| 26 | +#define data2 x3 |
| 27 | +#define has_nul1 x4 |
| 28 | +#define has_nul2 x5 |
| 29 | +#define tmp1 x4 |
| 30 | +#define tmp2 x5 |
| 31 | +#define tmp3 x6 |
| 32 | +#define tmp4 x7 |
| 33 | +#define zeroones x8 |
| 34 | + |
| 35 | + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 |
| 36 | + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and |
| 37 | + can be done in parallel across the entire word. A faster check |
| 38 | + (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives |
| 39 | + false hits for characters 129..255. */ |
42 | 40 |
|
43 | 41 | #define REP8_01 0x0101010101010101
|
44 | 42 | #define REP8_7f 0x7f7f7f7f7f7f7f7f
|
45 | 43 | #define REP8_80 0x8080808080808080
|
46 | 44 |
|
| 45 | +#define MIN_PAGE_SIZE 4096 |
| 46 | + |
| 47 | + /* Since strings are short on average, we check the first 16 bytes |
| 48 | + of the string for a NUL character. In order to do an unaligned ldp |
| 49 | + safely we have to do a page cross check first. If there is a NUL |
| 50 | + byte we calculate the length from the 2 8-byte words using |
| 51 | + conditional select to reduce branch mispredictions (it is unlikely |
| 52 | + strlen will be repeatedly called on strings with the same length). |
| 53 | + |
| 54 | + If the string is longer than 16 bytes, we align src so we don't need |
| 55 | + further page cross checks, and process 32 bytes per iteration |
| 56 | + using the fast NUL check. If we encounter non-ASCII characters, |
| 57 | + fall back to a second loop using the full NUL check. |
| 58 | + |
| 59 | + If the page cross check fails, we read 16 bytes from an aligned |
| 60 | + address, remove any characters before the string, and continue |
| 61 | + in the main loop using aligned loads. Since strings crossing a |
| 62 | + page in the first 16 bytes are rare (probability of |
| 63 | + 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized. |
| 64 | + |
| 65 | + AArch64 systems have a minimum page size of 4k. We don't bother |
| 66 | + checking for larger page sizes - the cost of setting up the correct |
| 67 | + page size is just not worth the extra gain from a small reduction in |
| 68 | + the cases taking the slow path. Note that we only care about |
| 69 | + whether the first fetch, which may be misaligned, crosses a page |
| 70 | + boundary. */ |
| 71 | + |
47 | 72 | SYM_FUNC_START_WEAK_PI(strlen)
|
48 |
| - mov zeroones, #REP8_01 |
49 |
| - bic src, srcin, #15 |
50 |
| - ands tmp1, srcin, #15 |
51 |
| - b.ne .Lmisaligned |
52 |
| - /* |
53 |
| - * NUL detection works on the principle that (X - 1) & (~X) & 0x80 |
54 |
| - * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and |
55 |
| - * can be done in parallel across the entire word. |
56 |
| - */ |
57 |
| - /* |
58 |
| - * The inner loop deals with two Dwords at a time. This has a |
59 |
| - * slightly higher start-up cost, but we should win quite quickly, |
60 |
| - * especially on cores with a high number of issue slots per |
61 |
| - * cycle, as we get much better parallelism out of the operations. |
62 |
| - */ |
63 |
| -.Lloop: |
64 |
| - ldp data1, data2, [src], #16 |
65 |
| -.Lrealigned: |
| 73 | + and tmp1, srcin, MIN_PAGE_SIZE - 1 |
| 74 | + mov zeroones, REP8_01 |
| 75 | + cmp tmp1, MIN_PAGE_SIZE - 16 |
| 76 | + b.gt L(page_cross) |
| 77 | + ldp data1, data2, [srcin] |
| 78 | +#ifdef __AARCH64EB__ |
| 79 | + /* For big-endian, carry propagation (if the final byte in the |
| 80 | + string is 0x01) means we cannot use has_nul1/2 directly. |
| 81 | + Since we expect strings to be small and early-exit, |
| 82 | + byte-swap the data now so has_nul1/2 will be correct. */ |
| 83 | + rev data1, data1 |
| 84 | + rev data2, data2 |
| 85 | +#endif |
66 | 86 | sub tmp1, data1, zeroones
|
67 |
| - orr tmp2, data1, #REP8_7f |
| 87 | + orr tmp2, data1, REP8_7f |
68 | 88 | sub tmp3, data2, zeroones
|
69 |
| - orr tmp4, data2, #REP8_7f |
70 |
| - bic has_nul1, tmp1, tmp2 |
71 |
| - bics has_nul2, tmp3, tmp4 |
72 |
| - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ |
73 |
| - b.eq .Lloop |
| 89 | + orr tmp4, data2, REP8_7f |
| 90 | + bics has_nul1, tmp1, tmp2 |
| 91 | + bic has_nul2, tmp3, tmp4 |
| 92 | + ccmp has_nul2, 0, 0, eq |
| 93 | + beq L(main_loop_entry) |
| 94 | + |
| 95 | + /* Enter with C = has_nul1 == 0. */ |
| 96 | + csel has_nul1, has_nul1, has_nul2, cc |
| 97 | + mov len, 8 |
| 98 | + rev has_nul1, has_nul1 |
| 99 | + clz tmp1, has_nul1 |
| 100 | + csel len, xzr, len, cc |
| 101 | + add len, len, tmp1, lsr 3 |
| 102 | + ret |
74 | 103 |
|
| 104 | + /* The inner loop processes 32 bytes per iteration and uses the fast |
| 105 | + NUL check. If we encounter non-ASCII characters, use a second |
| 106 | + loop with the accurate NUL check. */ |
| 107 | + .p2align 4 |
| 108 | +L(main_loop_entry): |
| 109 | + bic src, srcin, 15 |
| 110 | + sub src, src, 16 |
| 111 | +L(main_loop): |
| 112 | + ldp data1, data2, [src, 32]! |
| 113 | +L(page_cross_entry): |
| 114 | + sub tmp1, data1, zeroones |
| 115 | + sub tmp3, data2, zeroones |
| 116 | + orr tmp2, tmp1, tmp3 |
| 117 | + tst tmp2, zeroones, lsl 7 |
| 118 | + bne 1f |
| 119 | + ldp data1, data2, [src, 16] |
| 120 | + sub tmp1, data1, zeroones |
| 121 | + sub tmp3, data2, zeroones |
| 122 | + orr tmp2, tmp1, tmp3 |
| 123 | + tst tmp2, zeroones, lsl 7 |
| 124 | + beq L(main_loop) |
| 125 | + add src, src, 16 |
| 126 | +1: |
| 127 | + /* The fast check failed, so do the slower, accurate NUL check. */ |
| 128 | + orr tmp2, data1, REP8_7f |
| 129 | + orr tmp4, data2, REP8_7f |
| 130 | + bics has_nul1, tmp1, tmp2 |
| 131 | + bic has_nul2, tmp3, tmp4 |
| 132 | + ccmp has_nul2, 0, 0, eq |
| 133 | + beq L(nonascii_loop) |
| 134 | + |
| 135 | + /* Enter with C = has_nul1 == 0. */ |
| 136 | +L(tail): |
| 137 | +#ifdef __AARCH64EB__ |
| 138 | + /* For big-endian, carry propagation (if the final byte in the |
| 139 | + string is 0x01) means we cannot use has_nul1/2 directly. The |
| 140 | + easiest way to get the correct byte is to byte-swap the data |
| 141 | + and calculate the syndrome a second time. */ |
| 142 | + csel data1, data1, data2, cc |
| 143 | + rev data1, data1 |
| 144 | + sub tmp1, data1, zeroones |
| 145 | + orr tmp2, data1, REP8_7f |
| 146 | + bic has_nul1, tmp1, tmp2 |
| 147 | +#else |
| 148 | + csel has_nul1, has_nul1, has_nul2, cc |
| 149 | +#endif |
75 | 150 | sub len, src, srcin
|
76 |
| - cbz has_nul1, .Lnul_in_data2 |
77 |
| -CPU_BE( mov data2, data1 ) /*prepare data to re-calculate the syndrome*/ |
78 |
| - sub len, len, #8 |
79 |
| - mov has_nul2, has_nul1 |
80 |
| -.Lnul_in_data2: |
81 |
| - /* |
82 |
| - * For big-endian, carry propagation (if the final byte in the |
83 |
| - * string is 0x01) means we cannot use has_nul directly. The |
84 |
| - * easiest way to get the correct byte is to byte-swap the data |
85 |
| - * and calculate the syndrome a second time. |
86 |
| - */ |
87 |
| -CPU_BE( rev data2, data2 ) |
88 |
| -CPU_BE( sub tmp1, data2, zeroones ) |
89 |
| -CPU_BE( orr tmp2, data2, #REP8_7f ) |
90 |
| -CPU_BE( bic has_nul2, tmp1, tmp2 ) |
91 |
| - |
92 |
| - sub len, len, #8 |
93 |
| - rev has_nul2, has_nul2 |
94 |
| - clz pos, has_nul2 |
95 |
| - add len, len, pos, lsr #3 /* Bits to bytes. */ |
| 151 | + rev has_nul1, has_nul1 |
| 152 | + add tmp2, len, 8 |
| 153 | + clz tmp1, has_nul1 |
| 154 | + csel len, len, tmp2, cc |
| 155 | + add len, len, tmp1, lsr 3 |
96 | 156 | ret
|
97 | 157 |
|
98 |
| -.Lmisaligned: |
99 |
| - cmp tmp1, #8 |
100 |
| - neg tmp1, tmp1 |
101 |
| - ldp data1, data2, [src], #16 |
102 |
| - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ |
103 |
| - mov tmp2, #~0 |
104 |
| - /* Big-endian. Early bytes are at MSB. */ |
105 |
| -CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ |
| 158 | +L(nonascii_loop): |
| 159 | + ldp data1, data2, [src, 16]! |
| 160 | + sub tmp1, data1, zeroones |
| 161 | + orr tmp2, data1, REP8_7f |
| 162 | + sub tmp3, data2, zeroones |
| 163 | + orr tmp4, data2, REP8_7f |
| 164 | + bics has_nul1, tmp1, tmp2 |
| 165 | + bic has_nul2, tmp3, tmp4 |
| 166 | + ccmp has_nul2, 0, 0, eq |
| 167 | + bne L(tail) |
| 168 | + ldp data1, data2, [src, 16]! |
| 169 | + sub tmp1, data1, zeroones |
| 170 | + orr tmp2, data1, REP8_7f |
| 171 | + sub tmp3, data2, zeroones |
| 172 | + orr tmp4, data2, REP8_7f |
| 173 | + bics has_nul1, tmp1, tmp2 |
| 174 | + bic has_nul2, tmp3, tmp4 |
| 175 | + ccmp has_nul2, 0, 0, eq |
| 176 | + beq L(nonascii_loop) |
| 177 | + b L(tail) |
| 178 | + |
| 179 | + /* Load 16 bytes from [srcin & ~15] and force the bytes that precede |
| 180 | + srcin to 0x7f, so we ignore any NUL bytes before the string. |
| 181 | + Then continue in the aligned loop. */ |
| 182 | +L(page_cross): |
| 183 | + bic src, srcin, 15 |
| 184 | + ldp data1, data2, [src] |
| 185 | + lsl tmp1, srcin, 3 |
| 186 | + mov tmp4, -1 |
| 187 | +#ifdef __AARCH64EB__ |
| 188 | + /* Big-endian. Early bytes are at MSB. */ |
| 189 | + lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ |
| 190 | +#else |
106 | 191 | /* Little-endian. Early bytes are at LSB. */
|
107 |
| -CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ |
| 192 | + lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ |
| 193 | +#endif |
| 194 | + orr tmp1, tmp1, REP8_80 |
| 195 | + orn data1, data1, tmp1 |
| 196 | + orn tmp2, data2, tmp1 |
| 197 | + tst srcin, 8 |
| 198 | + csel data1, data1, tmp4, eq |
| 199 | + csel data2, data2, tmp2, eq |
| 200 | + b L(page_cross_entry) |
108 | 201 |
|
109 |
| - orr data1, data1, tmp2 |
110 |
| - orr data2a, data2, tmp2 |
111 |
| - csinv data1, data1, xzr, le |
112 |
| - csel data2, data2, data2a, le |
113 |
| - b .Lrealigned |
114 | 202 | SYM_FUNC_END_PI(strlen)
|
115 | 203 | EXPORT_SYMBOL_NOKASAN(strlen)
|
0 commit comments