Skip to content

Commit 261fe25

Browse files
committed
internal/bytealg: simplify and improve compare on riscv64
Remove some unnecessary loops and pull the comparison code out from the
compare/loop code. Add an unaligned 8 byte comparison, which reads 8 bytes
from each input before comparing them. This gives a reasonable gain in
performance for the large unaligned case.

Updates #50615

name                                 old time/op    new time/op    delta
CompareBytesEqual-4                     116ns ± 0%     111ns ± 0%   -4.10%  (p=0.000 n=5+5)
CompareBytesToNil-4                    34.9ns ± 0%    35.0ns ± 0%   +0.45%  (p=0.002 n=5+5)
CompareBytesEmpty-4                    29.6ns ± 1%    29.8ns ± 0%   +0.71%  (p=0.016 n=5+5)
CompareBytesIdentical-4                29.8ns ± 0%    29.9ns ± 1%   +0.50%  (p=0.036 n=5+5)
CompareBytesSameLength-4               66.1ns ± 0%    60.4ns ± 0%   -8.59%  (p=0.000 n=5+5)
CompareBytesDifferentLength-4          63.1ns ± 0%    60.5ns ± 0%   -4.20%  (p=0.000 n=5+5)
CompareBytesBigUnaligned/offset=1-4    6.84ms ± 3%    6.04ms ± 5%  -11.70%  (p=0.001 n=5+5)
CompareBytesBigUnaligned/offset=2-4    6.99ms ± 4%    5.93ms ± 6%  -15.22%  (p=0.000 n=5+5)
CompareBytesBigUnaligned/offset=3-4    6.74ms ± 1%    6.00ms ± 5%  -10.94%  (p=0.001 n=5+5)
CompareBytesBigUnaligned/offset=4-4    7.20ms ± 6%    5.97ms ± 6%  -17.05%  (p=0.000 n=5+5)
CompareBytesBigUnaligned/offset=5-4    6.75ms ± 1%    5.81ms ± 8%  -13.93%  (p=0.001 n=5+5)
CompareBytesBigUnaligned/offset=6-4    6.89ms ± 5%    5.75ms ± 2%  -16.58%  (p=0.000 n=5+4)
CompareBytesBigUnaligned/offset=7-4    6.91ms ± 6%    6.13ms ± 6%  -11.27%  (p=0.001 n=5+5)
CompareBytesBig-4                      2.75ms ± 5%    2.71ms ± 8%     ~     (p=0.651 n=5+5)
CompareBytesBigIdentical-4             29.9ns ± 1%    29.8ns ± 0%     ~     (p=0.751 n=5+5)

name                                 old speed      new speed      delta
CompareBytesBigUnaligned/offset=1-4   153MB/s ± 3%   174MB/s ± 6%  +13.40%  (p=0.003 n=5+5)
CompareBytesBigUnaligned/offset=2-4   150MB/s ± 4%   177MB/s ± 6%  +18.06%  (p=0.001 n=5+5)
CompareBytesBigUnaligned/offset=3-4   156MB/s ± 1%   175MB/s ± 5%  +12.39%  (p=0.002 n=5+5)
CompareBytesBigUnaligned/offset=4-4   146MB/s ± 6%   176MB/s ± 6%  +20.67%  (p=0.001 n=5+5)
CompareBytesBigUnaligned/offset=5-4   155MB/s ± 1%   181MB/s ± 7%  +16.35%  (p=0.002 n=5+5)
CompareBytesBigUnaligned/offset=6-4   152MB/s ± 5%   182MB/s ± 2%  +19.74%  (p=0.000 n=5+4)
CompareBytesBigUnaligned/offset=7-4   152MB/s ± 6%   171MB/s ± 6%  +12.70%  (p=0.001 n=5+5)
CompareBytesBig-4                     382MB/s ± 5%   388MB/s ± 9%     ~     (p=0.616 n=5+5)
CompareBytesBigIdentical-4           35.1TB/s ± 1%  35.1TB/s ± 0%     ~     (p=0.800 n=5+5)

Change-Id: I127edc376e62a2c529719a4ab172f481e0a81357
Reviewed-on: https://go-review.googlesource.com/c/go/+/431100
Reviewed-by: Cherry Mui <[email protected]>
Reviewed-by: Meng Zhuo <[email protected]>
Reviewed-by: Bryan Mills <[email protected]>
TryBot-Result: Gopher Robot <[email protected]>
Reviewed-by: Joedian Reid <[email protected]>
Run-TryBot: Joel Sing <[email protected]>
1 parent e03ee85 commit 261fe25

File tree

1 file changed

+103
-70
lines changed

1 file changed

+103
-70
lines changed

src/internal/bytealg/compare_riscv64.s

Lines changed: 103 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -40,13 +40,13 @@ use_a_len:
4040
BEQZ X5, cmp_len
4141

4242
MOV $32, X6
43-
BLT X5, X6, loop4_check
43+
BLT X5, X6, check8_unaligned
4444

4545
// Check alignment - if alignment differs we have to do one byte at a time.
4646
AND $7, X10, X7
4747
AND $7, X12, X8
48-
BNE X7, X8, loop4_check
49-
BEQZ X7, loop32_check
48+
BNE X7, X8, check8_unaligned
49+
BEQZ X7, compare32
5050

5151
// Check one byte at a time until we reach 8 byte alignment.
5252
SUB X7, X5, X5
@@ -59,122 +59,155 @@ align:
5959
ADD $1, X12
6060
BNEZ X7, align
6161

62-
loop32_check:
63-
MOV $32, X7
64-
BLT X5, X7, loop16_check
65-
loop32:
62+
check32:
63+
MOV $32, X6
64+
BLT X5, X6, compare16
65+
compare32:
6666
MOV 0(X10), X15
6767
MOV 0(X12), X16
6868
MOV 8(X10), X17
6969
MOV 8(X12), X18
70-
BEQ X15, X16, loop32a
71-
JMP cmp8a
72-
loop32a:
73-
BEQ X17, X18, loop32b
74-
JMP cmp8b
75-
loop32b:
70+
BNE X15, X16, cmp8a
71+
BNE X17, X18, cmp8b
7672
MOV 16(X10), X15
7773
MOV 16(X12), X16
7874
MOV 24(X10), X17
7975
MOV 24(X12), X18
80-
BEQ X15, X16, loop32c
81-
JMP cmp8a
82-
loop32c:
83-
BEQ X17, X18, loop32d
84-
JMP cmp8b
85-
loop32d:
76+
BNE X15, X16, cmp8a
77+
BNE X17, X18, cmp8b
8678
ADD $32, X10
8779
ADD $32, X12
8880
ADD $-32, X5
89-
BGE X5, X7, loop32
81+
BGE X5, X6, compare32
9082
BEQZ X5, cmp_len
9183

92-
loop16_check:
84+
check16:
9385
MOV $16, X6
94-
BLT X5, X6, loop4_check
95-
loop16:
86+
BLT X5, X6, check8_unaligned
87+
compare16:
9688
MOV 0(X10), X15
9789
MOV 0(X12), X16
9890
MOV 8(X10), X17
9991
MOV 8(X12), X18
100-
BEQ X15, X16, loop16a
101-
JMP cmp8a
102-
loop16a:
103-
BEQ X17, X18, loop16b
104-
JMP cmp8b
105-
loop16b:
92+
BNE X15, X16, cmp8a
93+
BNE X17, X18, cmp8b
10694
ADD $16, X10
10795
ADD $16, X12
10896
ADD $-16, X5
109-
BGE X5, X6, loop16
11097
BEQZ X5, cmp_len
11198

112-
loop4_check:
113-
MOV $4, X6
114-
BLT X5, X6, loop1
115-
loop4:
99+
check8_unaligned:
100+
MOV $8, X6
101+
BLT X5, X6, check4_unaligned
102+
compare8_unaligned:
116103
MOVBU 0(X10), X8
104+
MOVBU 1(X10), X15
105+
MOVBU 2(X10), X17
106+
MOVBU 3(X10), X19
107+
MOVBU 4(X10), X21
108+
MOVBU 5(X10), X23
109+
MOVBU 6(X10), X25
110+
MOVBU 7(X10), X29
117111
MOVBU 0(X12), X9
112+
MOVBU 1(X12), X16
113+
MOVBU 2(X12), X18
114+
MOVBU 3(X12), X20
115+
MOVBU 4(X12), X22
116+
MOVBU 5(X12), X24
117+
MOVBU 6(X12), X28
118+
MOVBU 7(X12), X30
119+
BNE X8, X9, cmp1a
120+
BNE X15, X16, cmp1b
121+
BNE X17, X18, cmp1c
122+
BNE X19, X20, cmp1d
123+
BNE X21, X22, cmp1e
124+
BNE X23, X24, cmp1f
125+
BNE X25, X28, cmp1g
126+
BNE X29, X30, cmp1h
127+
ADD $8, X10
128+
ADD $8, X12
129+
ADD $-8, X5
130+
BGE X5, X6, compare8_unaligned
131+
BEQZ X5, cmp_len
132+
133+
check4_unaligned:
134+
MOV $4, X6
135+
BLT X5, X6, compare1
136+
compare4_unaligned:
137+
MOVBU 0(X10), X8
118138
MOVBU 1(X10), X15
139+
MOVBU 2(X10), X17
140+
MOVBU 3(X10), X19
141+
MOVBU 0(X12), X9
119142
MOVBU 1(X12), X16
120-
BEQ X8, X9, loop4a
121-
SLTU X9, X8, X5
122-
SLTU X8, X9, X6
123-
JMP cmp_ret
124-
loop4a:
125-
BEQ X15, X16, loop4b
126-
SLTU X16, X15, X5
127-
SLTU X15, X16, X6
128-
JMP cmp_ret
129-
loop4b:
130-
MOVBU 2(X10), X21
131-
MOVBU 2(X12), X22
132-
MOVBU 3(X10), X23
133-
MOVBU 3(X12), X24
134-
BEQ X21, X22, loop4c
135-
SLTU X22, X21, X5
136-
SLTU X21, X22, X6
137-
JMP cmp_ret
138-
loop4c:
139-
BEQ X23, X24, loop4d
140-
SLTU X24, X23, X5
141-
SLTU X23, X24, X6
142-
JMP cmp_ret
143-
loop4d:
143+
MOVBU 2(X12), X18
144+
MOVBU 3(X12), X20
145+
BNE X8, X9, cmp1a
146+
BNE X15, X16, cmp1b
147+
BNE X17, X18, cmp1c
148+
BNE X19, X20, cmp1d
144149
ADD $4, X10
145150
ADD $4, X12
146151
ADD $-4, X5
147-
BGE X5, X6, loop4
152+
BGE X5, X6, compare4_unaligned
148153

149-
loop1:
154+
compare1:
150155
BEQZ X5, cmp_len
151156
MOVBU 0(X10), X8
152157
MOVBU 0(X12), X9
153158
BNE X8, X9, cmp
154159
ADD $1, X10
155160
ADD $1, X12
156161
ADD $-1, X5
157-
JMP loop1
162+
JMP compare1
158163

159164
// Compare 8 bytes of memory in X15/X16 that are known to differ.
160165
cmp8a:
161-
MOV $0xff, X19
162-
cmp8a_loop:
163-
AND X15, X19, X8
164-
AND X16, X19, X9
165-
BNE X8, X9, cmp
166-
SLLI $8, X19
167-
JMP cmp8a_loop
166+
MOV X15, X17
167+
MOV X16, X18
168168

169169
// Compare 8 bytes of memory in X17/X18 that are known to differ.
170170
cmp8b:
171171
MOV $0xff, X19
172-
cmp8b_loop:
172+
cmp8_loop:
173173
AND X17, X19, X8
174174
AND X18, X19, X9
175175
BNE X8, X9, cmp
176176
SLLI $8, X19
177-
JMP cmp8b_loop
177+
JMP cmp8_loop
178+
179+
cmp1a:
180+
SLTU X9, X8, X5
181+
SLTU X8, X9, X6
182+
JMP cmp_ret
183+
cmp1b:
184+
SLTU X16, X15, X5
185+
SLTU X15, X16, X6
186+
JMP cmp_ret
187+
cmp1c:
188+
SLTU X18, X17, X5
189+
SLTU X17, X18, X6
190+
JMP cmp_ret
191+
cmp1d:
192+
SLTU X20, X19, X5
193+
SLTU X19, X20, X6
194+
JMP cmp_ret
195+
cmp1e:
196+
SLTU X22, X21, X5
197+
SLTU X21, X22, X6
198+
JMP cmp_ret
199+
cmp1f:
200+
SLTU X24, X23, X5
201+
SLTU X23, X24, X6
202+
JMP cmp_ret
203+
cmp1g:
204+
SLTU X28, X25, X5
205+
SLTU X25, X28, X6
206+
JMP cmp_ret
207+
cmp1h:
208+
SLTU X30, X29, X5
209+
SLTU X29, X30, X6
210+
JMP cmp_ret
178211

179212
cmp_len:
180213
MOV X11, X8

0 commit comments

Comments
 (0)