Skip to content

Commit 3c42ebf

Browse files
committed
internal/bytealg: optimise memequal on riscv64
Implement memequal using loops that process 32 bytes, 16 bytes, 4 bytes or 1 byte depending on size and alignment. For comparisons that are less than 32 bytes the overhead of checking and adjusting alignment usually exceeds the overhead of reading and processing 4 bytes at a time.

Updates #50615

name                 old time/op   new time/op   delta
Equal/0-4            38.3ns ± 0%   43.1ns ± 0%   +12.54%  (p=0.000 n=3+3)
Equal/1-4            77.7ns ± 0%   90.3ns ± 0%   +16.27%  (p=0.000 n=3+3)
Equal/6-4            116ns ± 0%    121ns ± 0%    +3.85%  (p=0.002 n=3+3)
Equal/9-4            137ns ± 0%    126ns ± 0%    -7.98%  (p=0.000 n=3+3)
Equal/15-4           179ns ± 0%    170ns ± 0%    -4.77%  (p=0.001 n=3+3)
Equal/16-4           186ns ± 0%    159ns ± 0%    -14.65%  (p=0.000 n=3+3)
Equal/20-4           215ns ± 0%    178ns ± 0%    -17.18%  (p=0.000 n=3+3)
Equal/32-4           298ns ± 0%    101ns ± 0%    -66.22%  (p=0.000 n=3+3)
Equal/4K-4           28.9µs ± 0%   2.2µs ± 0%    -92.56%  (p=0.000 n=3+3)
Equal/4M-4           29.6ms ± 0%   2.2ms ± 0%    -92.72%  (p=0.000 n=3+3)
Equal/64M-4          758ms ±75%    35ms ± 0%     ~  (p=0.127 n=3+3)
CompareBytesEqual-4  226ns ± 0%    131ns ± 0%    -41.76%  (p=0.000 n=3+3)

name                 old speed      new speed      delta
Equal/1-4            12.9MB/s ± 0%  11.1MB/s ± 0%  -13.98%  (p=0.000 n=3+3)
Equal/6-4            51.7MB/s ± 0%  49.8MB/s ± 0%  -3.72%  (p=0.002 n=3+3)
Equal/9-4            65.7MB/s ± 0%  71.4MB/s ± 0%  +8.67%  (p=0.000 n=3+3)
Equal/15-4           83.8MB/s ± 0%  88.0MB/s ± 0%  +5.02%  (p=0.001 n=3+3)
Equal/16-4           85.9MB/s ± 0%  100.6MB/s ± 0%  +17.19%  (p=0.000 n=3+3)
Equal/20-4           93.2MB/s ± 0%  112.6MB/s ± 0%  +20.74%  (p=0.000 n=3+3)
Equal/32-4           107MB/s ± 0%   317MB/s ± 0%   +195.97%  (p=0.000 n=3+3)
Equal/4K-4           142MB/s ± 0%   1902MB/s ± 0%  +1243.76%  (p=0.000 n=3+3)
Equal/4M-4           142MB/s ± 0%   1946MB/s ± 0%  +1274.22%  (p=0.000 n=3+3)
Equal/64M-4          111MB/s ±55%   1941MB/s ± 0%  +1641.21%  (p=0.000 n=3+3)

Change-Id: I9af7e82de3c4c5af8813772ed139230900c03b92
Reviewed-on: https://go-review.googlesource.com/c/go/+/380075
Trust: Joel Sing <[email protected]>
Trust: mzh <[email protected]>
Reviewed-by: mzh <[email protected]>
Run-TryBot: Joel Sing <[email protected]>
TryBot-Result: Gopher Robot <[email protected]>
Reviewed-by: Cherry Mui <[email protected]>
1 parent 7fd9564 commit 3c42ebf

File tree

1 file changed

+111
-33
lines changed

1 file changed

+111
-33
lines changed

src/internal/bytealg/equal_riscv64.s

Lines changed: 111 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -9,41 +9,119 @@
99

1010
// func memequal(a, b unsafe.Pointer, size uintptr) bool
//
// Entry point for a fixed-size comparison. It only unpacks the stack
// arguments into the registers memequal<> expects and then tail-jumps to it:
//	X5  = a
//	X6  = b
//	X7  = size
//	X19 = address of the result byte in this function's argument frame
TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
	MOV	$ret+24(FP), X19	// where memequal<> must store the bool
	MOV	size+16(FP), X7
	MOV	b+8(FP), X6
	MOV	a+0(FP), X5
	JMP	memequal<>(SB)		// no frame of our own, so memequal<>'s RET returns to our caller
3217

3318
// func memequal_varlen(a, b unsafe.Pointer) bool
//
// Variant of memequal whose length is not a stack argument: it is read from
// the closure context register. The arguments are placed in the registers
// memequal<> expects (X5 = a, X6 = b, X7 = size, X19 = address of the result
// byte) and control is transferred there with a tail jump.
TEXT runtime·memequal_varlen(SB),NOSPLIT|NOFRAME,$0-17
	MOV	$ret+16(FP), X19	// where memequal<> must store the bool
	MOV	8(CTXT), X7		// compiler stores size at offset 8 in the closure
	MOV	b+8(FP), X6
	MOV	a+0(FP), X5
	JMP	memequal<>(SB)		// no frame of our own, so memequal<>'s RET returns to our caller
25+
26+
// memequal<> compares two byte ranges of equal length.
//
// On entry X5 and X6 contain pointers, X7 contains length.
// X19 contains address for return value: the byte 1 is stored there if the
// ranges are identical (or the pointers are the same), otherwise the byte 0.
//
// Clobbers X5 through X17.
//
// Strategy:
//   - length < 32, or the pointers have different offsets within an 8 byte
//     word: compare using byte loads only (4 per iteration, then 1).
//   - otherwise: compare single bytes until both pointers are 8 byte aligned,
//     then use 8 byte loads, 32 bytes per iteration, then 16, then fall
//     through to the byte loops for the tail.
TEXT memequal<>(SB),NOSPLIT|NOFRAME,$0
	BEQ	X5, X6, eq		// identical pointers are trivially equal

	MOV	$32, X8
	BLT	X7, X8, loop4_check	// short input: alignment work is not worth it

	// Check alignment - the 8 byte loads used below need both pointers to
	// be 8 byte aligned at the same time, which is only reachable when they
	// share the same offset within an 8 byte word. If they differ, use the
	// byte-load loops instead.
	AND	$7, X5, X9
	AND	$7, X6, X10
	BNE	X9, X10, loop4_check
	BEQZ	X9, loop32_check	// already aligned

	// Check one byte at a time until we reach 8 byte alignment.
	// X9 = 8 - (pointer & 7) is the distance to the next 8 byte boundary
	// (1 to 7). X7 is at least 32 here, so the subtraction cannot underflow.
	SUB	X9, ZERO, X9
	ADD	$8, X9, X9
	SUB	X9, X7, X7
align:
	ADD	$-1, X9
	MOVBU	0(X5), X10
	MOVBU	0(X6), X11
	BNE	X10, X11, not_eq
	ADD	$1, X5
	ADD	$1, X6
	BNEZ	X9, align

loop32_check:
	MOV	$32, X9
	BLT	X7, X9, loop16_check
loop32:
	// Four 8 byte words per iteration; loads are issued in pairs ahead of
	// the compares that consume them.
	MOV	0(X5), X10
	MOV	0(X6), X11
	MOV	8(X5), X12
	MOV	8(X6), X13
	BNE	X10, X11, not_eq
	BNE	X12, X13, not_eq
	MOV	16(X5), X14
	MOV	16(X6), X15
	MOV	24(X5), X16
	MOV	24(X6), X17
	BNE	X14, X15, not_eq
	BNE	X16, X17, not_eq
	ADD	$32, X5
	ADD	$32, X6
	ADD	$-32, X7
	BGE	X7, X9, loop32
	BEQZ	X7, eq

loop16_check:
	MOV	$16, X8
	BLT	X7, X8, loop4_check
loop16:
	// Two 8 byte words per iteration.
	MOV	0(X5), X10
	MOV	0(X6), X11
	MOV	8(X5), X12
	MOV	8(X6), X13
	BNE	X10, X11, not_eq
	BNE	X12, X13, not_eq
	ADD	$16, X5
	ADD	$16, X6
	ADD	$-16, X7
	BGE	X7, X8, loop16
	BEQZ	X7, eq

loop4_check:
	MOV	$4, X8
	BLT	X7, X8, loop1
loop4:
	// Four bytes per iteration using byte loads, so no alignment is
	// required of either pointer.
	MOVBU	0(X5), X10
	MOVBU	0(X6), X11
	MOVBU	1(X5), X12
	MOVBU	1(X6), X13
	BNE	X10, X11, not_eq
	BNE	X12, X13, not_eq
	MOVBU	2(X5), X14
	MOVBU	2(X6), X15
	MOVBU	3(X5), X16
	MOVBU	3(X6), X17
	BNE	X14, X15, not_eq
	BNE	X16, X17, not_eq
	ADD	$4, X5
	ADD	$4, X6
	ADD	$-4, X7
	BGE	X7, X8, loop4

loop1:
	// Remaining 0 to 3 bytes (or the whole input when it is shorter than 4).
	BEQZ	X7, eq
	MOVBU	0(X5), X10
	MOVBU	0(X6), X11
	BNE	X10, X11, not_eq
	ADD	$1, X5
	ADD	$1, X6
	ADD	$-1, X7
	JMP	loop1

not_eq:
	MOVB	ZERO, (X19)		// result = false
	RET
eq:
	MOV	$1, X5
	MOVB	X5, (X19)		// result = true
	RET

0 commit comments

Comments
 (0)