Skip to content

Commit e18d07d

Browse files
4a6f656cmengzhuo
authored and committed
runtime: optimise memmove on riscv64
Implement a more optimised memmove on riscv64, where up to 64 bytes are moved per loop after achieving alignment. In the unaligned case, memory is moved at up to 8 bytes per loop. This also avoids doing unaligned loads and stores, which results in kernel traps and a significant performance penalty. Fixes #48248. name old speed new speed delta Memmove/1-4 31.3MB/s _ 0% 26.6MB/s _ 0% -14.95% (p=0.000 n=3+3) Memmove/2-4 50.6MB/s _ 1% 42.6MB/s _ 0% -15.75% (p=0.000 n=3+3) Memmove/3-4 64.5MB/s _ 1% 53.4MB/s _ 2% -17.11% (p=0.001 n=3+3) Memmove/4-4 74.9MB/s _ 0% 99.2MB/s _ 0% +32.55% (p=0.000 n=3+3) Memmove/5-4 82.3MB/s _ 0% 99.0MB/s _ 1% +20.29% (p=0.000 n=3+3) Memmove/6-4 88.2MB/s _ 0% 102.3MB/s _ 1% +15.87% (p=0.000 n=3+3) Memmove/7-4 93.4MB/s _ 0% 102.0MB/s _ 0% +9.18% (p=0.000 n=3+3) Memmove/8-4 188MB/s _ 3% 188MB/s _ 6% ~ (p=0.964 n=3+3) Memmove/9-4 182MB/s _ 6% 163MB/s _ 1% ~ (p=0.069 n=3+3) Memmove/10-4 177MB/s _ 0% 149MB/s _ 4% -15.93% (p=0.012 n=3+3) Memmove/11-4 171MB/s _ 6% 148MB/s _ 0% -13.65% (p=0.045 n=3+3) Memmove/12-4 166MB/s _ 5% 209MB/s _ 0% +26.12% (p=0.009 n=3+3) Memmove/13-4 170MB/s _ 1% 188MB/s _ 4% +10.76% (p=0.039 n=3+3) Memmove/14-4 158MB/s _ 0% 185MB/s _ 0% +17.13% (p=0.000 n=3+3) Memmove/15-4 166MB/s _ 0% 175MB/s _ 0% +5.38% (p=0.000 n=3+3) Memmove/16-4 320MB/s _ 6% 343MB/s _ 0% ~ (p=0.149 n=3+3) Memmove/32-4 493MB/s _ 5% 628MB/s _ 1% +27.51% (p=0.008 n=3+3) Memmove/64-4 706MB/s _ 0% 1132MB/s _ 0% +60.32% (p=0.000 n=3+3) Memmove/128-4 837MB/s _ 1% 1623MB/s _ 1% +93.96% (p=0.000 n=3+3) Memmove/256-4 960MB/s _ 0% 2070MB/s _ 6% +115.68% (p=0.003 n=3+3) Memmove/512-4 1.04GB/s _ 0% 2.55GB/s _ 0% +146.05% (p=0.000 n=3+3) Memmove/1024-4 1.08GB/s _ 0% 2.76GB/s _ 0% +155.62% (p=0.000 n=3+3) Memmove/2048-4 1.10GB/s _ 0% 2.90GB/s _ 1% +164.31% (p=0.000 n=3+3) Memmove/4096-4 1.11GB/s _ 0% 2.98GB/s _ 0% +169.77% (p=0.000 n=3+3) MemmoveOverlap/32-4 443MB/s _ 0% 500MB/s _ 0% +12.81% (p=0.000 n=3+3) MemmoveOverlap/64-4 635MB/s _ 0% 908MB/s _ 0% +42.92% 
(p=0.000 n=3+3) MemmoveOverlap/128-4 789MB/s _ 0% 1423MB/s _ 0% +80.28% (p=0.000 n=3+3) MemmoveOverlap/256-4 925MB/s _ 0% 1941MB/s _ 0% +109.86% (p=0.000 n=3+3) MemmoveOverlap/512-4 1.01GB/s _ 2% 2.37GB/s _ 0% +134.86% (p=0.000 n=3+3) MemmoveOverlap/1024-4 1.06GB/s _ 0% 2.68GB/s _ 1% +151.67% (p=0.000 n=3+3) MemmoveOverlap/2048-4 1.09GB/s _ 0% 2.89GB/s _ 0% +164.82% (p=0.000 n=3+3) MemmoveOverlap/4096-4 1.11GB/s _ 0% 3.01GB/s _ 0% +171.30% (p=0.000 n=3+3) MemmoveUnalignedDst/1-4 24.1MB/s _ 1% 21.3MB/s _ 0% -11.76% (p=0.000 n=3+3) MemmoveUnalignedDst/2-4 41.6MB/s _ 1% 35.9MB/s _ 0% -13.72% (p=0.000 n=3+3) MemmoveUnalignedDst/3-4 54.0MB/s _ 0% 45.5MB/s _ 2% -15.76% (p=0.004 n=3+3) MemmoveUnalignedDst/4-4 63.9MB/s _ 1% 81.6MB/s _ 0% +27.70% (p=0.000 n=3+3) MemmoveUnalignedDst/5-4 69.4MB/s _ 6% 84.8MB/s _ 0% +22.08% (p=0.015 n=3+3) MemmoveUnalignedDst/6-4 77.8MB/s _ 2% 89.0MB/s _ 0% +14.53% (p=0.004 n=3+3) MemmoveUnalignedDst/7-4 83.0MB/s _ 0% 90.7MB/s _ 1% +9.30% (p=0.000 n=3+3) MemmoveUnalignedDst/8-4 6.97MB/s _ 2% 127.73MB/s _ 0% +1732.57% (p=0.000 n=3+3) MemmoveUnalignedDst/9-4 7.81MB/s _ 1% 125.41MB/s _ 0% +1506.45% (p=0.000 n=3+3) MemmoveUnalignedDst/10-4 8.59MB/s _ 2% 123.52MB/s _ 0% +1337.43% (p=0.000 n=3+3) MemmoveUnalignedDst/11-4 9.23MB/s _ 6% 119.81MB/s _ 4% +1197.55% (p=0.000 n=3+3) MemmoveUnalignedDst/12-4 10.3MB/s _ 0% 155.9MB/s _ 7% +1416.08% (p=0.001 n=3+3) MemmoveUnalignedDst/13-4 10.9MB/s _ 3% 155.1MB/s _ 0% +1321.26% (p=0.000 n=3+3) MemmoveUnalignedDst/14-4 11.4MB/s _ 5% 151.0MB/s _ 0% +1229.37% (p=0.000 n=3+3) MemmoveUnalignedDst/15-4 12.6MB/s _ 0% 147.0MB/s _ 0% +1066.39% (p=0.000 n=3+3) MemmoveUnalignedDst/16-4 7.17MB/s _ 0% 184.33MB/s _ 5% +2470.90% (p=0.001 n=3+3) MemmoveUnalignedDst/32-4 7.26MB/s _ 0% 252.00MB/s _ 2% +3371.12% (p=0.000 n=3+3) MemmoveUnalignedDst/64-4 7.25MB/s _ 2% 306.37MB/s _ 1% +4125.75% (p=0.000 n=3+3) MemmoveUnalignedDst/128-4 7.32MB/s _ 1% 338.03MB/s _ 1% +4517.85% (p=0.000 n=3+3) MemmoveUnalignedDst/256-4 7.31MB/s _ 0% 
361.06MB/s _ 0% +4841.47% (p=0.000 n=3+3) MemmoveUnalignedDst/512-4 7.35MB/s _ 0% 373.55MB/s _ 0% +4982.36% (p=0.000 n=3+3) MemmoveUnalignedDst/1024-4 7.33MB/s _ 0% 379.00MB/s _ 2% +5068.18% (p=0.000 n=3+3) MemmoveUnalignedDst/2048-4 7.31MB/s _ 2% 383.05MB/s _ 0% +5142.47% (p=0.000 n=3+3) MemmoveUnalignedDst/4096-4 7.35MB/s _ 1% 385.97MB/s _ 1% +5151.25% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/32-4 9.43MB/s _ 0% 233.72MB/s _ 0% +2377.56% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/64-4 8.13MB/s _ 3% 288.77MB/s _ 0% +3451.91% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/128-4 7.77MB/s _ 0% 326.62MB/s _ 3% +4103.65% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/256-4 7.28MB/s _ 6% 357.24MB/s _ 0% +4804.85% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/512-4 7.44MB/s _ 0% 363.63MB/s _ 7% +4787.54% (p=0.001 n=3+3) MemmoveUnalignedDstOverlap/1024-4 7.37MB/s _ 0% 383.17MB/s _ 0% +5101.40% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/2048-4 7.29MB/s _ 2% 387.69MB/s _ 0% +5215.68% (p=0.000 n=3+3) MemmoveUnalignedDstOverlap/4096-4 7.18MB/s _ 5% 389.22MB/s _ 0% +5320.84% (p=0.000 n=3+3) MemmoveUnalignedSrc/1-4 24.2MB/s _ 0% 21.4MB/s _ 1% -11.70% (p=0.001 n=3+3) MemmoveUnalignedSrc/2-4 41.7MB/s _ 0% 36.0MB/s _ 0% -13.71% (p=0.000 n=3+3) MemmoveUnalignedSrc/3-4 52.1MB/s _ 6% 46.4MB/s _ 1% ~ (p=0.074 n=3+3) MemmoveUnalignedSrc/4-4 60.4MB/s _ 0% 76.4MB/s _ 0% +26.39% (p=0.000 n=3+3) MemmoveUnalignedSrc/5-4 71.2MB/s _ 1% 84.7MB/s _ 0% +18.90% (p=0.000 n=3+3) MemmoveUnalignedSrc/6-4 77.7MB/s _ 0% 88.7MB/s _ 0% +14.06% (p=0.000 n=3+3) MemmoveUnalignedSrc/7-4 82.9MB/s _ 1% 90.7MB/s _ 1% +9.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/8-4 74.6MB/s _ 0% 120.6MB/s _ 0% +61.62% (p=0.000 n=3+3) MemmoveUnalignedSrc/9-4 78.7MB/s _ 1% 123.9MB/s _ 1% +57.42% (p=0.000 n=3+3) MemmoveUnalignedSrc/10-4 82.1MB/s _ 0% 121.7MB/s _ 0% +48.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/11-4 83.7MB/s _ 5% 122.0MB/s _ 0% +45.79% (p=0.003 n=3+3) MemmoveUnalignedSrc/12-4 88.6MB/s _ 0% 160.8MB/s _ 0% +81.56% (p=0.000 n=3+3) 
MemmoveUnalignedSrc/13-4 91.0MB/s _ 0% 155.0MB/s _ 0% +70.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/14-4 92.0MB/s _ 2% 151.0MB/s _ 0% +64.09% (p=0.000 n=3+3) MemmoveUnalignedSrc/15-4 12.6MB/s _ 0% 146.6MB/s _ 0% +1063.32% (p=0.000 n=3+3) MemmoveUnalignedSrc/16-4 13.3MB/s _ 0% 188.8MB/s _ 2% +1319.02% (p=0.000 n=3+3) MemmoveUnalignedSrc/32-4 9.44MB/s _ 0% 254.24MB/s _ 1% +2594.21% (p=0.000 n=3+3) MemmoveUnalignedSrc/64-4 8.27MB/s _ 0% 302.33MB/s _ 2% +3555.78% (p=0.000 n=3+3) MemmoveUnalignedSrc/128-4 7.73MB/s _ 3% 338.82MB/s _ 0% +4281.29% (p=0.000 n=3+3) MemmoveUnalignedSrc/256-4 7.58MB/s _ 0% 362.19MB/s _ 0% +4678.23% (p=0.000 n=3+3) MemmoveUnalignedSrc/512-4 7.44MB/s _ 1% 374.49MB/s _ 0% +4933.51% (p=0.000 n=3+3) MemmoveUnalignedSrc/1024-4 7.30MB/s _ 2% 379.74MB/s _ 0% +5099.54% (p=0.000 n=3+3) MemmoveUnalignedSrc/2048-4 7.34MB/s _ 2% 385.50MB/s _ 0% +5154.38% (p=0.000 n=3+3) MemmoveUnalignedSrc/4096-4 7.35MB/s _ 1% 383.64MB/s _ 0% +5119.59% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/32-4 7.22MB/s _ 0% 254.94MB/s _ 0% +3432.66% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/64-4 7.29MB/s _ 1% 296.99MB/s _ 5% +3973.89% (p=0.001 n=3+3) MemmoveUnalignedSrcOverlap/128-4 7.32MB/s _ 1% 336.73MB/s _ 1% +4500.09% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/256-4 7.30MB/s _ 1% 361.41MB/s _ 0% +4850.82% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/512-4 7.34MB/s _ 0% 374.92MB/s _ 0% +5007.90% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/1024-4 7.34MB/s _ 0% 380.15MB/s _ 0% +5079.16% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/2048-4 7.36MB/s _ 0% 383.78MB/s _ 0% +5116.76% (p=0.000 n=3+3) MemmoveUnalignedSrcOverlap/4096-4 7.35MB/s _ 0% 386.32MB/s _ 0% +5156.05% (p=0.000 n=3+3) Change-Id: Ibc13230af7b1e205ed95a6470e2cf64ff4251405 Reviewed-on: https://go-review.googlesource.com/c/go/+/426256 TryBot-Result: Gopher Robot <[email protected]> Reviewed-by: Meng Zhuo <[email protected]> Reviewed-by: Than McIntosh <[email protected]> Reviewed-by: Joedian Reid <[email protected]> Run-TryBot: 
Joel Sing <[email protected]>
1 parent c13ce29 commit e18d07d

File tree

1 file changed

+296
-76
lines changed

1 file changed

+296
-76
lines changed

src/runtime/memmove_riscv64.s

Lines changed: 296 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -8,91 +8,311 @@
88

99
// void runtime·memmove(void*, void*, uintptr)
1010
TEXT runtime·memmove<ABIInternal>(SB),NOSPLIT,$-0-24
11-
// A0 = to
12-
// A1 = from
13-
// A2 = n
14-
ADD A1, A2, T5
11+
// X10 = to
12+
// X11 = from
13+
// X12 = n
14+
BEQ X10, X11, done
15+
BEQZ X12, done
1516

1617
// If the destination is ahead of the source, start at the end of the
1718
// buffer and go backward.
18-
BLTU A1, A0, b
19+
BGTU X10, X11, backward
1920

20-
// If less than eight bytes, do one byte at a time.
21-
SLTU $8, A2, T3
22-
BNE T3, ZERO, f_outcheck
21+
// If less than 8 bytes, do single byte copies.
22+
MOV $8, X9
23+
BLT X12, X9, f_loop4_check
2324

24-
// Do one byte at a time until from is eight-aligned.
25-
JMP f_aligncheck
25+
// Check alignment - if alignment differs we have to do one byte at a time.
26+
AND $3, X10, X5
27+
AND $3, X11, X6
28+
BNE X5, X6, f_loop8_unaligned_check
29+
BEQZ X5, f_loop_check
30+
31+
// Move one byte at a time until we reach 8 byte alignment.
32+
SUB X5, X12, X12
2633
f_align:
27-
MOVB (A1), T3
28-
MOVB T3, (A0)
29-
ADD $1, A0
30-
ADD $1, A1
31-
f_aligncheck:
32-
AND $7, A1, T3
33-
BNE T3, ZERO, f_align
34-
35-
// Do eight bytes at a time as long as there is room.
36-
ADD $-7, T5, T6
37-
JMP f_wordscheck
38-
f_words:
39-
MOV (A1), T3
40-
MOV T3, (A0)
41-
ADD $8, A0
42-
ADD $8, A1
43-
f_wordscheck:
44-
SLTU T6, A1, T3
45-
BNE T3, ZERO, f_words
46-
47-
// Finish off the remaining partial word.
48-
JMP f_outcheck
49-
f_out:
50-
MOVB (A1), T3
51-
MOVB T3, (A0)
52-
ADD $1, A0
53-
ADD $1, A1
54-
f_outcheck:
55-
BNE A1, T5, f_out
34+
ADD $-1, X5
35+
MOVB 0(X11), X14
36+
MOVB X14, 0(X10)
37+
ADD $1, X10
38+
ADD $1, X11
39+
BNEZ X5, f_align
5640

57-
RET
41+
f_loop_check:
42+
MOV $16, X9
43+
BLT X12, X9, f_loop8_check
44+
MOV $32, X9
45+
BLT X12, X9, f_loop16_check
46+
MOV $64, X9
47+
BLT X12, X9, f_loop32_check
48+
f_loop64:
49+
MOV 0(X11), X14
50+
MOV 8(X11), X15
51+
MOV 16(X11), X16
52+
MOV 24(X11), X17
53+
MOV 32(X11), X18
54+
MOV 40(X11), X19
55+
MOV 48(X11), X20
56+
MOV 56(X11), X21
57+
MOV X14, 0(X10)
58+
MOV X15, 8(X10)
59+
MOV X16, 16(X10)
60+
MOV X17, 24(X10)
61+
MOV X18, 32(X10)
62+
MOV X19, 40(X10)
63+
MOV X20, 48(X10)
64+
MOV X21, 56(X10)
65+
ADD $64, X10
66+
ADD $64, X11
67+
ADD $-64, X12
68+
BGE X12, X9, f_loop64
69+
BEQZ X12, done
70+
71+
f_loop32_check:
72+
MOV $32, X9
73+
BLT X12, X9, f_loop16_check
74+
f_loop32:
75+
MOV 0(X11), X14
76+
MOV 8(X11), X15
77+
MOV 16(X11), X16
78+
MOV 24(X11), X17
79+
MOV X14, 0(X10)
80+
MOV X15, 8(X10)
81+
MOV X16, 16(X10)
82+
MOV X17, 24(X10)
83+
ADD $32, X10
84+
ADD $32, X11
85+
ADD $-32, X12
86+
BGE X12, X9, f_loop32
87+
BEQZ X12, done
88+
89+
f_loop16_check:
90+
MOV $16, X9
91+
BLT X12, X9, f_loop8_check
92+
f_loop16:
93+
MOV 0(X11), X14
94+
MOV 8(X11), X15
95+
MOV X14, 0(X10)
96+
MOV X15, 8(X10)
97+
ADD $16, X10
98+
ADD $16, X11
99+
ADD $-16, X12
100+
BGE X12, X9, f_loop16
101+
BEQZ X12, done
102+
103+
f_loop8_check:
104+
MOV $8, X9
105+
BLT X12, X9, f_loop4_check
106+
f_loop8:
107+
MOV 0(X11), X14
108+
MOV X14, 0(X10)
109+
ADD $8, X10
110+
ADD $8, X11
111+
ADD $-8, X12
112+
BGE X12, X9, f_loop8
113+
BEQZ X12, done
114+
JMP f_loop4_check
115+
116+
f_loop8_unaligned_check:
117+
MOV $8, X9
118+
BLT X12, X9, f_loop4_check
119+
f_loop8_unaligned:
120+
MOVB 0(X11), X14
121+
MOVB 1(X11), X15
122+
MOVB 2(X11), X16
123+
MOVB 3(X11), X17
124+
MOVB 4(X11), X18
125+
MOVB 5(X11), X19
126+
MOVB 6(X11), X20
127+
MOVB 7(X11), X21
128+
MOVB X14, 0(X10)
129+
MOVB X15, 1(X10)
130+
MOVB X16, 2(X10)
131+
MOVB X17, 3(X10)
132+
MOVB X18, 4(X10)
133+
MOVB X19, 5(X10)
134+
MOVB X20, 6(X10)
135+
MOVB X21, 7(X10)
136+
ADD $8, X10
137+
ADD $8, X11
138+
ADD $-8, X12
139+
BGE X12, X9, f_loop8_unaligned
140+
141+
f_loop4_check:
142+
MOV $4, X9
143+
BLT X12, X9, f_loop1
144+
f_loop4:
145+
MOVB 0(X11), X14
146+
MOVB 1(X11), X15
147+
MOVB 2(X11), X16
148+
MOVB 3(X11), X17
149+
MOVB X14, 0(X10)
150+
MOVB X15, 1(X10)
151+
MOVB X16, 2(X10)
152+
MOVB X17, 3(X10)
153+
ADD $4, X10
154+
ADD $4, X11
155+
ADD $-4, X12
156+
BGE X12, X9, f_loop4
157+
158+
f_loop1:
159+
BEQZ X12, done
160+
MOVB 0(X11), X14
161+
MOVB X14, 0(X10)
162+
ADD $1, X10
163+
ADD $1, X11
164+
ADD $-1, X12
165+
JMP f_loop1
166+
167+
backward:
168+
ADD X10, X12, X10
169+
ADD X11, X12, X11
58170

59-
b:
60-
ADD A0, A2, T4
61-
// If less than eight bytes, do one byte at a time.
62-
SLTU $8, A2, T3
63-
BNE T3, ZERO, b_outcheck
171+
// If less than 8 bytes, do single byte copies.
172+
MOV $8, X9
173+
BLT X12, X9, b_loop4_check
64174

65-
// Do one byte at a time until from+n is eight-aligned.
66-
JMP b_aligncheck
175+
// Check alignment - if alignment differs we have to do one byte at a time.
176+
AND $3, X10, X5
177+
AND $3, X11, X6
178+
BNE X5, X6, b_loop8_unaligned_check
179+
BEQZ X5, b_loop_check
180+
181+
// Move one byte at a time until we reach 8 byte alignment.
182+
SUB X5, X12, X12
67183
b_align:
68-
ADD $-1, T4
69-
ADD $-1, T5
70-
MOVB (T5), T3
71-
MOVB T3, (T4)
72-
b_aligncheck:
73-
AND $7, T5, T3
74-
BNE T3, ZERO, b_align
75-
76-
// Do eight bytes at a time as long as there is room.
77-
ADD $7, A1, T6
78-
JMP b_wordscheck
79-
b_words:
80-
ADD $-8, T4
81-
ADD $-8, T5
82-
MOV (T5), T3
83-
MOV T3, (T4)
84-
b_wordscheck:
85-
SLTU T5, T6, T3
86-
BNE T3, ZERO, b_words
87-
88-
// Finish off the remaining partial word.
89-
JMP b_outcheck
90-
b_out:
91-
ADD $-1, T4
92-
ADD $-1, T5
93-
MOVB (T5), T3
94-
MOVB T3, (T4)
95-
b_outcheck:
96-
BNE T5, A1, b_out
184+
ADD $-1, X5
185+
ADD $-1, X10
186+
ADD $-1, X11
187+
MOVB 0(X11), X14
188+
MOVB X14, 0(X10)
189+
BNEZ X5, b_align
190+
191+
b_loop_check:
192+
MOV $16, X9
193+
BLT X12, X9, b_loop8_check
194+
MOV $32, X9
195+
BLT X12, X9, b_loop16_check
196+
MOV $64, X9
197+
BLT X12, X9, b_loop32_check
198+
b_loop64:
199+
ADD $-64, X10
200+
ADD $-64, X11
201+
MOV 0(X11), X14
202+
MOV 8(X11), X15
203+
MOV 16(X11), X16
204+
MOV 24(X11), X17
205+
MOV 32(X11), X18
206+
MOV 40(X11), X19
207+
MOV 48(X11), X20
208+
MOV 56(X11), X21
209+
MOV X14, 0(X10)
210+
MOV X15, 8(X10)
211+
MOV X16, 16(X10)
212+
MOV X17, 24(X10)
213+
MOV X18, 32(X10)
214+
MOV X19, 40(X10)
215+
MOV X20, 48(X10)
216+
MOV X21, 56(X10)
217+
ADD $-64, X12
218+
BGE X12, X9, b_loop64
219+
BEQZ X12, done
220+
221+
b_loop32_check:
222+
MOV $32, X9
223+
BLT X12, X9, b_loop16_check
224+
b_loop32:
225+
ADD $-32, X10
226+
ADD $-32, X11
227+
MOV 0(X11), X14
228+
MOV 8(X11), X15
229+
MOV 16(X11), X16
230+
MOV 24(X11), X17
231+
MOV X14, 0(X10)
232+
MOV X15, 8(X10)
233+
MOV X16, 16(X10)
234+
MOV X17, 24(X10)
235+
ADD $-32, X12
236+
BGE X12, X9, b_loop32
237+
BEQZ X12, done
238+
239+
b_loop16_check:
240+
MOV $16, X9
241+
BLT X12, X9, b_loop8_check
242+
b_loop16:
243+
ADD $-16, X10
244+
ADD $-16, X11
245+
MOV 0(X11), X14
246+
MOV 8(X11), X15
247+
MOV X14, 0(X10)
248+
MOV X15, 8(X10)
249+
ADD $-16, X12
250+
BGE X12, X9, b_loop16
251+
BEQZ X12, done
252+
253+
b_loop8_check:
254+
MOV $8, X9
255+
BLT X12, X9, b_loop4_check
256+
b_loop8:
257+
ADD $-8, X10
258+
ADD $-8, X11
259+
MOV 0(X11), X14
260+
MOV X14, 0(X10)
261+
ADD $-8, X12
262+
BGE X12, X9, b_loop8
263+
BEQZ X12, done
264+
JMP b_loop4_check
265+
266+
b_loop8_unaligned_check:
267+
MOV $8, X9
268+
BLT X12, X9, b_loop4_check
269+
b_loop8_unaligned:
270+
ADD $-8, X10
271+
ADD $-8, X11
272+
MOVB 0(X11), X14
273+
MOVB 1(X11), X15
274+
MOVB 2(X11), X16
275+
MOVB 3(X11), X17
276+
MOVB 4(X11), X18
277+
MOVB 5(X11), X19
278+
MOVB 6(X11), X20
279+
MOVB 7(X11), X21
280+
MOVB X14, 0(X10)
281+
MOVB X15, 1(X10)
282+
MOVB X16, 2(X10)
283+
MOVB X17, 3(X10)
284+
MOVB X18, 4(X10)
285+
MOVB X19, 5(X10)
286+
MOVB X20, 6(X10)
287+
MOVB X21, 7(X10)
288+
ADD $-8, X12
289+
BGE X12, X9, b_loop8_unaligned
290+
291+
b_loop4_check:
292+
MOV $4, X9
293+
BLT X12, X9, b_loop1
294+
b_loop4:
295+
ADD $-4, X10
296+
ADD $-4, X11
297+
MOVB 0(X11), X14
298+
MOVB 1(X11), X15
299+
MOVB 2(X11), X16
300+
MOVB 3(X11), X17
301+
MOVB X14, 0(X10)
302+
MOVB X15, 1(X10)
303+
MOVB X16, 2(X10)
304+
MOVB X17, 3(X10)
305+
ADD $-4, X12
306+
BGE X12, X9, b_loop4
307+
308+
b_loop1:
309+
BEQZ X12, done
310+
ADD $-1, X10
311+
ADD $-1, X11
312+
MOVB 0(X11), X14
313+
MOVB X14, 0(X10)
314+
ADD $-1, X12
315+
JMP b_loop1
97316

317+
done:
98318
RET

0 commit comments

Comments
 (0)