Skip to content

Commit 291bda8

Browse files
committed
internal/bytealg: optimise compare on riscv64
Implement compare using loops that process 32 bytes, 16 bytes, 4 bytes or 1 byte depending on size and alignment. For comparisons that are less than 32 bytes the overhead of checking and adjusting alignment usually exceeds the overhead of reading and processing 4 bytes at a time. Updates #50615 name old time/op new time/op delta BytesCompare/1-4 68.4ns ± 1% 61.0ns ± 0% -10.78% (p=0.001 n=3+3) BytesCompare/2-4 82.9ns ± 0% 71.0ns ± 1% -14.31% (p=0.000 n=3+3) BytesCompare/4-4 107ns ± 0% 70ns ± 0% -34.96% (p=0.000 n=3+3) BytesCompare/8-4 156ns ± 0% 90ns ± 0% -42.36% (p=0.000 n=3+3) BytesCompare/16-4 267ns ±11% 130ns ± 0% -51.10% (p=0.011 n=3+3) BytesCompare/32-4 446ns ± 0% 74ns ± 0% -83.31% (p=0.000 n=3+3) BytesCompare/64-4 840ns ± 2% 91ns ± 0% -89.17% (p=0.000 n=3+3) BytesCompare/128-4 1.60µs ± 0% 0.13µs ± 0% -92.18% (p=0.000 n=3+3) BytesCompare/256-4 3.15µs ± 0% 0.19µs ± 0% -93.91% (p=0.000 n=3+3) BytesCompare/512-4 6.25µs ± 0% 0.33µs ± 0% -94.80% (p=0.000 n=3+3) BytesCompare/1024-4 12.5µs ± 0% 0.6µs ± 0% -95.23% (p=0.000 n=3+3) BytesCompare/2048-4 24.8µs ± 0% 1.1µs ± 0% -95.46% (p=0.000 n=3+3) CompareBytesEqual-4 225ns ± 0% 131ns ± 0% -41.69% (p=0.000 n=3+3) CompareBytesToNil-4 45.3ns ± 7% 46.7ns ± 0% ~ (p=0.452 n=3+3) CompareBytesEmpty-4 41.0ns ± 1% 40.6ns ± 0% ~ (p=0.071 n=3+3) CompareBytesIdentical-4 48.9ns ± 0% 41.3ns ± 1% -15.58% (p=0.000 n=3+3) CompareBytesSameLength-4 127ns ± 0% 77ns ± 0% -39.48% (p=0.000 n=3+3) CompareBytesDifferentLength-4 136ns ±12% 78ns ± 0% -42.65% (p=0.018 n=3+3) CompareBytesBigUnaligned-4 14.9ms ± 1% 7.3ms ± 1% -50.95% (p=0.000 n=3+3) CompareBytesBig-4 14.9ms ± 1% 2.7ms ± 8% -82.10% (p=0.000 n=3+3) CompareBytesBigIdentical-4 52.5ns ± 0% 44.9ns ± 0% -14.53% (p=0.000 n=3+3) name old speed new speed delta CompareBytesBigUnaligned-4 70.5MB/s ± 1% 143.8MB/s ± 1% +103.87% (p=0.000 n=3+3) CompareBytesBig-4 70.3MB/s ± 1% 393.8MB/s ± 8% +460.43% (p=0.003 n=3+3) CompareBytesBigIdentical-4 20.0TB/s ± 0% 23.4TB/s ± 0% +17.00% (p=0.000 n=3+3)
Change-Id: Ie18712a9009d425c75e1ab49d5a673d84e73a1eb Reviewed-on: https://go-review.googlesource.com/c/go/+/380076 Trust: Joel Sing <[email protected]> Trust: mzh <[email protected]> Reviewed-by: Cherry Mui <[email protected]>
1 parent 3c42ebf commit 291bda8

File tree

3 files changed

+187
-2
lines changed

3 files changed

+187
-2
lines changed

src/internal/bytealg/compare_generic.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
//go:build !386 && !amd64 && !s390x && !arm && !arm64 && !ppc64 && !ppc64le && !mips && !mipsle && !wasm && !mips64 && !mips64le
5+
//go:build !386 && !amd64 && !s390x && !arm && !arm64 && !ppc64 && !ppc64le && !mips && !mipsle && !wasm && !mips64 && !mips64le && !riscv64
66

77
package bytealg
88

src/internal/bytealg/compare_native.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
//go:build 386 || amd64 || s390x || arm || arm64 || ppc64 || ppc64le || mips || mipsle || wasm || mips64 || mips64le
5+
//go:build 386 || amd64 || s390x || arm || arm64 || ppc64 || ppc64le || mips || mipsle || wasm || mips64 || mips64le || riscv64
66

77
package bytealg
88

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
// Copyright 2022 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
#include "go_asm.h"
6+
#include "textflag.h"
7+
8+
// Compare entry point.
// Copies the stack arguments into the register layout compare<> expects
// (X5/X6 = a base/len, X7/X8 = b base/len, X9 = address of the result
// slot) and tail-jumps to it. No frame is set up, so compare<> returns
// directly to this function's caller.
TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56
	MOV	$ret+48(FP), X9		// X9 = &ret (address, not value)
	MOV	b_len+32(FP), X8	// X8 = length of b
	MOV	b_base+24(FP), X7	// X7 = start of b
	MOV	a_len+8(FP), X6		// X6 = length of a
	MOV	a_base+0(FP), X5	// X5 = start of a
	JMP	compare<>(SB)
15+
16+
// runtime·cmpstring entry point.
// Copies the stack arguments into the register layout compare<> expects
// (X5/X6 = a base/len, X7/X8 = b base/len, X9 = address of the result
// slot) and tail-jumps to it. No frame is set up, so compare<> returns
// directly to this function's caller.
TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
	MOV	$ret+32(FP), X9		// X9 = &ret (address, not value)
	MOV	b_len+24(FP), X8	// X8 = length of b
	MOV	b_base+16(FP), X7	// X7 = start of b
	MOV	a_len+8(FP), X6		// X6 = length of a
	MOV	a_base+0(FP), X5	// X5 = start of a
	JMP	compare<>(SB)
23+
24+
// compare<> lexicographically compares two byte sequences and stores
// -1, 0 or +1 through X9.
//
// On entry:
// X5 points to start of a
// X6 length of a
// X7 points to start of b
// X8 length of b
// X9 points to the address to store the return value (-1/0/1)
//
// Scratch registers used: X10-X19 and X21-X24.
// X10 holds the number of common-prefix bytes still to be compared.
TEXT compare<>(SB),NOSPLIT|NOFRAME,$0
	// Identical start pointers: the common prefix is trivially equal,
	// so only the lengths decide the result.
	BEQ	X5, X7, cmp_len

	MOV	X6, X10
	BGE	X8, X10, use_a_len	// X10 = min(len(a), len(b))
	MOV	X8, X10
use_a_len:
	BEQZ	X10, cmp_len

	// Below 32 bytes, go straight to the byte-load loops rather than
	// paying for the alignment check.
	MOV	$32, X11
	BLT	X10, X11, loop4_check

	// Check alignment - if alignment differs we have to do one byte at a time.
	// The wide loops below use 8 byte loads, so both pointers must have
	// the same offset within an 8 byte word (mask 7, not 3).
	AND	$7, X5, X12
	AND	$7, X7, X13
	BNE	X12, X13, loop4_check
	BEQZ	X12, loop32_check

	// Check one byte at a time until we reach 8 byte alignment.
	// X12 is the misalignment (1..7), so 8-X12 bytes must be consumed.
	// X10 >= 32 here, so the remaining count stays positive.
	SUB	X12, X0, X12		// X12 = -misalignment
	ADD	$8, X12, X12		// X12 = 8 - misalignment
	SUB	X12, X10, X10
align:
	ADD	$-1, X12
	MOVBU	0(X5), X13
	MOVBU	0(X7), X14
	BNE	X13, X14, cmp
	ADD	$1, X5
	ADD	$1, X7
	BNEZ	X12, align

	// Both pointers are 8 byte aligned from here until loop4_check.
loop32_check:
	MOV	$32, X12
	BLT	X10, X12, loop16_check
loop32:
	MOV	0(X5), X15
	MOV	0(X7), X16
	MOV	8(X5), X17
	MOV	8(X7), X18
	BNE	X15, X16, cmp8a
	BNE	X17, X18, cmp8b
	MOV	16(X5), X15
	MOV	16(X7), X16
	MOV	24(X5), X17
	MOV	24(X7), X18
	BNE	X15, X16, cmp8a
	BNE	X17, X18, cmp8b
	ADD	$32, X5
	ADD	$32, X7
	ADD	$-32, X10
	BGE	X10, X12, loop32
	BEQZ	X10, cmp_len

loop16_check:
	MOV	$16, X11
	BLT	X10, X11, loop4_check
loop16:
	MOV	0(X5), X15
	MOV	0(X7), X16
	MOV	8(X5), X17
	MOV	8(X7), X18
	BNE	X15, X16, cmp8a
	BNE	X17, X18, cmp8b
	ADD	$16, X5
	ADD	$16, X7
	ADD	$-16, X10
	BGE	X10, X11, loop16
	BEQZ	X10, cmp_len

	// Byte-load loop, four bytes per iteration; has no alignment requirement.
	// On a mismatch X10/X11 are reused for the SLTU results, which is safe
	// because the remaining count is no longer needed.
loop4_check:
	MOV	$4, X11
	BLT	X10, X11, loop1
loop4:
	MOVBU	0(X5), X13
	MOVBU	0(X7), X14
	MOVBU	1(X5), X15
	MOVBU	1(X7), X16
	BEQ	X13, X14, loop4a
	SLTU	X14, X13, X10
	SLTU	X13, X14, X11
	JMP	cmp_ret
loop4a:
	BEQ	X15, X16, loop4b
	SLTU	X16, X15, X10
	SLTU	X15, X16, X11
	JMP	cmp_ret
loop4b:
	MOVBU	2(X5), X21
	MOVBU	2(X7), X22
	MOVBU	3(X5), X23
	MOVBU	3(X7), X24
	BEQ	X21, X22, loop4c
	SLTU	X22, X21, X10
	SLTU	X21, X22, X11
	JMP	cmp_ret
loop4c:
	BEQ	X23, X24, loop4d
	SLTU	X24, X23, X10
	SLTU	X23, X24, X11
	JMP	cmp_ret
loop4d:
	ADD	$4, X5
	ADD	$4, X7
	ADD	$-4, X10
	BGE	X10, X11, loop4

	// Tail: fewer than 4 bytes remain.
loop1:
	BEQZ	X10, cmp_len
	MOVBU	0(X5), X13
	MOVBU	0(X7), X14
	BNE	X13, X14, cmp
	ADD	$1, X5
	ADD	$1, X7
	ADD	$-1, X10
	JMP	loop1

	// Compare 8 bytes of memory in X15/X16 that are known to differ.
	// The byte mask in X19 moves up from the low byte; this relies on
	// little-endian loads placing the lowest-addressed byte in the low
	// 8 bits. The loop terminates because the words differ in some byte.
cmp8a:
	MOV	$0xff, X19
cmp8a_loop:
	AND	X15, X19, X13
	AND	X16, X19, X14
	BNE	X13, X14, cmp
	SLLI	$8, X19
	JMP	cmp8a_loop

	// Compare 8 bytes of memory in X17/X18 that are known to differ.
	// Same scheme as cmp8a.
cmp8b:
	MOV	$0xff, X19
cmp8b_loop:
	AND	X17, X19, X13
	AND	X18, X19, X14
	BNE	X13, X14, cmp
	SLLI	$8, X19
	JMP	cmp8b_loop

	// Common prefix is equal: the result is decided by the lengths.
cmp_len:
	MOV	X6, X13
	MOV	X8, X14
	// X13/X14 hold the deciding values from a/b (unsigned comparison).
cmp:
	SLTU	X14, X13, X10		// X10 = 1 if a-value < b-value
	SLTU	X13, X14, X11		// X11 = 1 if b-value < a-value
	// Result = X11 - X10, i.e. -1, 0 or +1.
cmp_ret:
	SUB	X10, X11, X12
	MOV	X12, (X9)
	RET

0 commit comments

Comments
 (0)