Skip to content

Commit d5e9cb2

Browse files
committed
Add optimized assembly xorBytes for ARM (NEON and non-NEON)
1 parent 2bd1cf9 commit d5e9cb2

File tree

3 files changed

+173
-1
lines changed

3 files changed

+173
-1
lines changed

src/crypto/cipher/xor_arm.go

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
// Copyright 2022 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package cipher
6+
7+
import (
8+
"internal/cpu"
9+
"unsafe"
10+
)
11+
12+
const wordSize = int(unsafe.Sizeof(uintptr(0)))
13+
14+
var hasNEON = cpu.HWCap&(1<<12) != 0
15+
16+
func isAligned(a *byte) bool {
17+
return uintptr(unsafe.Pointer(a))%uintptr(wordSize) == 0
18+
}
19+
20+
// xorBytes xors the bytes in a and b. The destination should have enough
21+
// space, otherwise xorBytes will panic. Returns the number of bytes xor'd.
22+
func xorBytes(dst, a, b []byte) int {
23+
n := len(a)
24+
if len(b) < n {
25+
n = len(b)
26+
}
27+
if n == 0 {
28+
return 0
29+
}
30+
// make sure dst has enough space
31+
_ = dst[n-1]
32+
33+
if hasNEON {
34+
xorBytesNEON32(&dst[0], &a[0], &b[0], n)
35+
} else if isAligned(&dst[0]) && isAligned(&a[0]) && isAligned(&b[0]) {
36+
xorBytesARM32(&dst[0], &a[0], &b[0], n)
37+
} else {
38+
safeXORBytes(dst, a, b, n)
39+
}
40+
return n
41+
}
42+
43+
// safeXORBytes stores a[i] ^ b[i] into dst[i] for every i in [0, n), one
// byte at a time, using only bounds-checked Go code.
//
// n must be less than or equal to the lengths of dst, a and b; otherwise the
// indexing panics once i reaches the shortest slice's length.
func safeXORBytes(dst, a, b []byte, n int) {
	for i := 0; i < n; i++ {
		dst[i] = a[i] ^ b[i]
	}
}
49+
50+
func xorWords(dst, a, b []byte) {
51+
xorBytes(dst, a, b)
52+
}
53+
54+
// xorBytesARM32 XORs n bytes read from a and b into dst. It has no Go body;
// the implementation is the assembly routine of the same name. xorBytes only
// calls it when dst, a and b are all word aligned.
//
//go:noescape
func xorBytesARM32(dst, a, b *byte, n int)
56+
57+
// xorBytesNEON32 XORs n bytes read from a and b into dst. It has no Go body;
// the implementation is the assembly routine of the same name. xorBytes only
// calls it when hasNEON is set.
//
//go:noescape
func xorBytesNEON32(dst, a, b *byte, n int)

src/crypto/cipher/xor_arm.s

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
// Copyright 2022 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
#include "textflag.h"
6+
7+
// func xorBytesARM32(dst, a, b *byte, n int)
//
// XORs n bytes of a and b into dst using only core integer instructions.
// Register use: R0 = dst, R1 = a, R2 = b, R3 = bytes remaining,
// R4 and R5 = scratch.
// The argument size is 16 bytes: three 4-byte pointers and one 4-byte int.
TEXT ·xorBytesARM32(SB), NOSPLIT|NOFRAME, $0-16
	MOVW	dst+0(FP), R0
	MOVW	a+4(FP), R1
	MOVW	b+8(FP), R2
	MOVW	n+12(FP), R3
	// Skip the word loop when fewer than 4 bytes were requested.
	CMP	$4, R3
	BLT	less_than4

loop_4:
	// One 32-bit word per iteration; .P post-increments each pointer by 4.
	MOVW.P	4(R1), R4
	MOVW.P	4(R2), R5
	EOR	R4, R5, R5
	MOVW.P	R5, 4(R0)

	SUB	$4, R3
	CMP	$4, R3
	BGE	loop_4

less_than4:
	// At most 3 bytes remain: handle one halfword if there are at least 2.
	CMP	$2, R3
	BLT	less_than2
	MOVH.P	2(R1), R4
	MOVH.P	2(R2), R5
	EOR	R4, R5, R5
	MOVH.P	R5, 2(R0)

	SUB	$2, R3

less_than2:
	// At most 1 byte remains.
	CMP	$0, R3
	BEQ	end
	MOVB	(R1), R4
	MOVB	(R2), R5
	EOR	R4, R5, R5
	MOVB	R5, (R0)
end:
	RET
45+
46+
// func xorBytesNEON32(dst, a, b *byte, n int)
//
// XORs n bytes of a and b into dst, using NEON for chunks of 32, 16 and 8
// bytes and core integer instructions for the last 0-7 bytes.
// Register use: R0 = dst, R1 = a, R2 = b, R3 = bytes remaining,
// R4 and R5 = scratch; NEON registers q0-q3 (d0-d7) hold the vector data.
// The NEON instructions are emitted as raw WORD encodings; the intended
// mnemonic is given in the comment on each line.
// The argument size is 16 bytes: three 4-byte pointers and one 4-byte int.
TEXT ·xorBytesNEON32(SB), NOSPLIT|NOFRAME, $0-16
	MOVW	dst+0(FP), R0
	MOVW	a+4(FP), R1
	MOVW	b+8(FP), R2
	MOVW	n+12(FP), R3
	// Skip the 32-byte loop when fewer than 32 bytes were requested.
	CMP	$32, R3
	BLT	less_than32

loop_32:
	// 32 bytes per iteration; the "!" forms write the advanced address
	// back to the pointer register.
	WORD	$0xF421020D // vld1.u8 {q0, q1}, [r1]!
	WORD	$0xF422420D // vld1.u8 {q2, q3}, [r2]!
	WORD	$0xF3004154 // veor q2, q0, q2
	WORD	$0xF3026156 // veor q3, q1, q3
	WORD	$0xF400420D // vst1.u8 {q2, q3}, [r0]!

	SUB	$32, R3
	CMP	$32, R3
	BGE	loop_32

less_than32:
	// At most 31 bytes remain: handle one 16-byte chunk if possible.
	CMP	$16, R3
	BLT	less_than16
	WORD	$0xF4210A0D // vld1.u8 q0, [r1]!
	WORD	$0xF4222A0D // vld1.u8 q1, [r2]!
	WORD	$0xF3002152 // veor q1, q0, q1
	WORD	$0xF4002A0D // vst1.u8 {q1}, [r0]!

	SUB	$16, R3

less_than16:
	// At most 15 bytes remain: handle one 8-byte chunk if possible.
	CMP	$8, R3
	BLT	less_than8
	WORD	$0xF421070D // vld1.u8 d0, [r1]!
	WORD	$0xF422170D // vld1.u8 d1, [r2]!
	WORD	$0xF3001111 // veor d1, d0, d1
	WORD	$0xF400170D // vst1.u8 {d1}, [r0]!

	SUB	$8, R3

less_than8:
	// At most 7 bytes remain: finish with word, halfword and byte steps.
	CMP	$4, R3
	BLT	less_than4
	MOVW.P	4(R1), R4
	MOVW.P	4(R2), R5
	EOR	R4, R5, R5
	MOVW.P	R5, 4(R0)

	SUB	$4, R3

less_than4:
	CMP	$2, R3
	BLT	less_than2
	MOVH.P	2(R1), R4
	MOVH.P	2(R2), R5
	EOR	R4, R5, R5
	MOVH.P	R5, 2(R0)

	SUB	$2, R3

less_than2:
	// At most 1 byte remains.
	CMP	$0, R3
	BEQ	end
	MOVB	(R1), R4
	MOVB	(R2), R5
	EOR	R4, R5, R5
	MOVB	R5, (R0)
end:
	RET

src/crypto/cipher/xor_generic.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
//go:build !amd64 && !ppc64 && !ppc64le && !arm64
5+
//go:build !amd64 && !ppc64 && !ppc64le && !arm64 && !arm
66

77
package cipher
88

0 commit comments

Comments
 (0)