Skip to content

Commit 5168fcf

Browse files
templexxx authored and bradfitz committed
crypto/cipher: use SIMD for xor on amd64
cpu: Intel(R) Core(TM) i7-7700HQ CPU @ 2.80GHz

Benchmark: xor

name old time/op new time/op delta
XORBytes/8Bytes-8 8.21ns ± 1% 6.35ns ± 3% -22.66% (p=0.008 n=5+5)
XORBytes/128Bytes-8 17.9ns ± 1% 10.4ns ± 1% -41.68% (p=0.008 n=5+5)
XORBytes/2048Bytes-8 187ns ± 1% 78ns ± 0% -58.44% (p=0.008 n=5+5)
XORBytes/32768Bytes-8 2.87µs ± 1% 1.38µs ± 0% -52.05% (p=0.008 n=5+5)

name old speed new speed delta
XORBytes/8Bytes-8 974MB/s ± 1% 1260MB/s ± 2% +29.33% (p=0.008 n=5+5)
XORBytes/128Bytes-8 7.15GB/s ± 0% 12.25GB/s ± 1% +71.17% (p=0.008 n=5+5)
XORBytes/2048Bytes-8 10.9GB/s ± 1% 26.4GB/s ± 0% +140.99% (p=0.008 n=5+5)
XORBytes/32768Bytes-8 11.4GB/s ± 1% 23.8GB/s ± 0% +108.52% (p=0.008 n=5+5)

Benchmark: cipher

name old time/op new time/op delta
AESGCMSeal1K-8 269ns ± 6% 261ns ± 2% ~ (p=0.246 n=5+5)
AESGCMOpen1K-8 242ns ± 1% 240ns ± 2% ~ (p=0.190 n=5+5)
AESGCMSign8K-8 869ns ± 0% 870ns ± 1% ~ (p=0.683 n=5+5)
AESGCMSeal8K-8 1.64µs ± 6% 1.59µs ± 7% ~ (p=0.151 n=5+5)
AESGCMOpen8K-8 1.48µs ± 2% 1.46µs ± 0% -1.39% (p=0.008 n=5+5)
AESCFBEncrypt1K-8 1.88µs ± 5% 1.62µs ± 1% -13.52% (p=0.008 n=5+5)
AESCFBDecrypt1K-8 1.76µs ± 1% 1.58µs ± 1% -10.24% (p=0.016 n=4+5)
AESOFB1K-8 1.10µs ± 4% 1.03µs ± 2% -6.36% (p=0.008 n=5+5)
AESCTR1K-8 1.24µs ± 1% 1.17µs ± 0% -5.96% (p=0.008 n=5+5)
AESCBCEncrypt1K-8 1.74µs ± 0% 1.14µs ± 1% -34.36% (p=0.008 n=5+5)
AESCBCDecrypt1K-8 1.28µs ± 1% 1.10µs ± 1% -14.04% (p=0.008 n=5+5)

name old speed new speed delta
AESGCMSeal1K-8 3.81GB/s ± 6% 3.91GB/s ± 2% ~ (p=0.310 n=5+5)
AESGCMOpen1K-8 4.23GB/s ± 1% 4.27GB/s ± 2% ~ (p=0.222 n=5+5)
AESGCMSign8K-8 9.43GB/s ± 0% 9.41GB/s ± 1% ~ (p=0.841 n=5+5)
AESGCMSeal8K-8 5.01GB/s ± 6% 5.16GB/s ± 6% ~ (p=0.151 n=5+5)
AESGCMOpen8K-8 5.54GB/s ± 2% 5.62GB/s ± 0% +1.41% (p=0.008 n=5+5)
AESCFBEncrypt1K-8 543MB/s ± 5% 627MB/s ± 1% +15.55% (p=0.008 n=5+5)
AESCFBDecrypt1K-8 580MB/s ± 1% 646MB/s ± 1% +11.40% (p=0.016 n=4+5)
AESOFB1K-8 925MB/s ± 4% 988MB/s ± 2% +6.73% (p=0.008 n=5+5)
AESCTR1K-8 821MB/s ± 1% 873MB/s ± 1% +6.34% (p=0.008 n=5+5)
AESCBCEncrypt1K-8 588MB/s ± 1% 897MB/s ± 1% +52.36% (p=0.008 n=5+5)
AESCBCDecrypt1K-8 799MB/s ± 1% 929MB/s ± 1% +16.32% (p=0.008 n=5+5)

Change-Id: I42e6ba66c23dad853d33c924fca7b0ed805cefdd
Reviewed-on: https://go-review.googlesource.com/c/125316
Reviewed-by: Ilya Tocar <[email protected]>
Run-TryBot: Ilya Tocar <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
1 parent 0e0f798 commit 5168fcf

File tree

5 files changed

+177
-42
lines changed

5 files changed

+177
-42
lines changed

src/crypto/cipher/export_test.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
// Copyright 2018 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package cipher
6+
7+
// Export internal functions for testing.
8+
var XorBytes = xorBytes

src/crypto/cipher/xor_amd64.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
// Copyright 2018 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package cipher
6+
7+
// xorBytes XORs the first n bytes of a and b into dst, where n is the smaller
8+
// of len(a) and len(b), and returns n. It panics if dst is shorter than n.
9+
func xorBytes(dst, a, b []byte) int {
10+
n := len(a)
11+
if len(b) < n {
12+
n = len(b) // n is now the smaller of len(a) and len(b)
13+
}
14+
if n == 0 {
15+
return 0 // nothing to XOR; also keeps &a[0] and &b[0] below from indexing an empty slice
16+
}
17+
_ = dst[n-1] // panics here if dst has fewer than n bytes, before any pointer is handed to the assembly
18+
xorBytesSSE2(&dst[0], &a[0], &b[0], n) // amd64 must have SSE2
19+
return n
20+
}
21+
22+
// xorWords XORs a and b into dst. In this amd64 file it simply forwards to
// xorBytes and discards the returned byte count.
func xorWords(dst, a, b []byte) {
23+
xorBytes(dst, a, b)
24+
}
25+
26+
// xorBytesSSE2 XORs n bytes read from a and b and stores the result at dst.
// It has no Go body; the implementation is the TEXT ·xorBytesSSE2 routine in
// xor_amd64.s.
// NOTE(review): the 16-byte loop there appears to assume n > 0; xorBytes
// returns early for n == 0 before calling it.
//
//go:noescape
27+
func xorBytesSSE2(dst, a, b *byte, n int)

src/crypto/cipher/xor_amd64.s

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
// Copyright 2018 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
#include "textflag.h"
6+
7+
// func xorBytesSSE2(dst, a, b *byte, n int)
//
// Stores a[i] ^ b[i] into dst[i] for the n bytes at a and b.
// Registers: BX = dst, SI = a, CX = b, DX = number of bytes still to process.
// "aligned" below means DX is a multiple of 16; it does not refer to pointer
// alignment (every 16-byte access uses MOVOU).
// Trailing bytes are XORed backwards one at a time until DX is a multiple of 8,
// then one 8-byte word is XORed if DX is not yet a multiple of 16, and the
// remaining prefix is XORed forwards 16 bytes at a time.
8+
TEXT ·xorBytesSSE2(SB), NOSPLIT, $0
9+
MOVQ dst+0(FP), BX
10+
MOVQ a+8(FP), SI
11+
MOVQ b+16(FP), CX
12+
MOVQ n+24(FP), DX
13+
TESTQ $15, DX // if n is not a multiple of 16, jump to not_aligned.
14+
JNZ not_aligned
15+
16+
aligned:
17+
MOVQ $0, AX // AX = byte offset into dst, a and b, starting at 0.
18+
19+
loop16b:
20+
MOVOU (SI)(AX*1), X0 // XOR 16 bytes at a time, moving forwards.
21+
MOVOU (CX)(AX*1), X1
22+
PXOR X1, X0
23+
MOVOU X0, (BX)(AX*1)
24+
ADDQ $16, AX
25+
CMPQ DX, AX // loop until the offset reaches the remaining length in DX.
26+
JNE loop16b
27+
RET
28+
29+
loop_1b:
30+
SUBQ $1, DX // XOR 1 byte at a time, moving backwards from the end.
31+
MOVB (SI)(DX*1), DI
32+
MOVB (CX)(DX*1), AX
33+
XORB AX, DI
34+
MOVB DI, (BX)(DX*1)
35+
TESTQ $7, DX // if the remaining length is not a multiple of 8, keep going byte by byte.
36+
JNZ loop_1b
37+
CMPQ DX, $0 // if nothing remains, return.
38+
JE ret
39+
TESTQ $15, DX // if the remaining length is a multiple of 16, jump to aligned; otherwise fall through to not_aligned.
40+
JZ aligned
41+
42+
not_aligned:
43+
TESTQ $7, DX // if the remaining length is not a multiple of 8, jump to loop_1b.
44+
JNE loop_1b
45+
SUBQ $8, DX // XOR the last 8 remaining bytes as one 64-bit word.
46+
MOVQ (SI)(DX*1), DI
47+
MOVQ (CX)(DX*1), AX
48+
XORQ AX, DI
49+
MOVQ DI, (BX)(DX*1)
50+
CMPQ DX, $16 // the remaining length is now a multiple of 16: if it is at least 16, finish in the 16-byte loop; otherwise it is 0.
51+
JGE aligned
52+
53+
ret:
54+
RET

src/crypto/cipher/xor.go renamed to src/crypto/cipher/xor_generic.go

Lines changed: 32 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -2,26 +2,47 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5+
// +build !amd64
6+
57
package cipher
68

79
import (
810
"runtime"
911
"unsafe"
1012
)
1113

12-
const wordSize = int(unsafe.Sizeof(uintptr(0)))
13-
const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "amd64" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x"
14-
15-
// fastXORBytes xors in bulk. It only works on architectures that
16-
// support unaligned read/writes.
17-
func fastXORBytes(dst, a, b []byte) int {
14+
// xorBytes xors the bytes in a and b. The destination should have enough
15+
// space, otherwise xorBytes will panic. Returns the number of bytes xor'd.
16+
func xorBytes(dst, a, b []byte) int {
1817
n := len(a)
1918
if len(b) < n {
2019
n = len(b)
2120
}
2221
if n == 0 {
2322
return 0
2423
}
24+
25+
switch {
26+
case supportsUnaligned:
27+
fastXORBytes(dst, a, b, n)
28+
default:
29+
// TODO(hanwen): if (dst, a, b) have common alignment
30+
// we could still try fastXORBytes. It is not clear
31+
// how often this happens, and it's only worth it if
32+
// the block encryption itself is hardware
33+
// accelerated.
34+
safeXORBytes(dst, a, b, n)
35+
}
36+
return n
37+
}
38+
39+
const wordSize = int(unsafe.Sizeof(uintptr(0)))
40+
const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x"
41+
42+
// fastXORBytes xors in bulk. It only works on architectures that
43+
// support unaligned read/writes.
44+
// n must be less than or equal to the lengths of both a and b.
45+
func fastXORBytes(dst, a, b []byte, n int) {
2546
// Assert dst has enough space
2647
_ = dst[n-1]
2748

@@ -38,34 +59,13 @@ func fastXORBytes(dst, a, b []byte) int {
3859
for i := (n - n%wordSize); i < n; i++ {
3960
dst[i] = a[i] ^ b[i]
4061
}
41-
42-
return n
4362
}
4463

45-
func safeXORBytes(dst, a, b []byte) int {
46-
n := len(a)
47-
if len(b) < n {
48-
n = len(b)
49-
}
64+
// n must be less than or equal to the lengths of both a and b.
65+
func safeXORBytes(dst, a, b []byte, n int) {
5066
for i := 0; i < n; i++ {
5167
dst[i] = a[i] ^ b[i]
5268
}
53-
return n
54-
}
55-
56-
// xorBytes xors the bytes in a and b. The destination should have enough
57-
// space, otherwise xorBytes will panic. Returns the number of bytes xor'd.
58-
func xorBytes(dst, a, b []byte) int {
59-
if supportsUnaligned {
60-
return fastXORBytes(dst, a, b)
61-
} else {
62-
// TODO(hanwen): if (dst, a, b) have common alignment
63-
// we could still try fastXORBytes. It is not clear
64-
// how often this happens, and it's only worth it if
65-
// the block encryption itself is hardware
66-
// accelerated.
67-
return safeXORBytes(dst, a, b)
68-
}
6969
}
7070

7171
// fastXORWords XORs multiples of 4 or 8 bytes (depending on architecture.)
@@ -80,10 +80,12 @@ func fastXORWords(dst, a, b []byte) {
8080
}
8181
}
8282

83+
// xorWords XORs multiples of 4 or 8 bytes (depending on architecture).
84+
// The slice arguments a and b are assumed to be of equal length.
8385
func xorWords(dst, a, b []byte) {
8486
if supportsUnaligned {
8587
fastXORWords(dst, a, b)
8688
} else {
87-
safeXORBytes(dst, a, b)
89+
safeXORBytes(dst, a, b, len(b))
8890
}
8991
}

src/crypto/cipher/xor_test.go

Lines changed: 56 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,27 +2,71 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
package cipher
5+
package cipher_test
66

77
import (
88
"bytes"
9+
"crypto/cipher"
10+
"crypto/rand"
11+
"fmt"
12+
"io"
913
"testing"
1014
)
1115

1216
func TestXOR(t *testing.T) {
13-
for alignP := 0; alignP < 2; alignP++ {
14-
for alignQ := 0; alignQ < 2; alignQ++ {
15-
for alignD := 0; alignD < 2; alignD++ {
16-
p := make([]byte, 1024)[alignP:]
17-
q := make([]byte, 1024)[alignQ:]
18-
d1 := make([]byte, 1024+alignD)[alignD:]
19-
d2 := make([]byte, 1024+alignD)[alignD:]
20-
xorBytes(d1, p, q)
21-
safeXORBytes(d2, p, q)
22-
if !bytes.Equal(d1, d2) {
23-
t.Error("not equal")
17+
for j := 1; j <= 1024; j++ {
18+
for alignP := 0; alignP < 2; alignP++ {
19+
for alignQ := 0; alignQ < 2; alignQ++ {
20+
for alignD := 0; alignD < 2; alignD++ {
21+
p := make([]byte, j)[alignP:]
22+
q := make([]byte, j)[alignQ:]
23+
d1 := make([]byte, j+alignD)[alignD:]
24+
d2 := make([]byte, j+alignD)[alignD:]
25+
if _, err := io.ReadFull(rand.Reader, p); err != nil {
26+
t.Fatal(err)
27+
}
28+
if _, err := io.ReadFull(rand.Reader, q); err != nil {
29+
t.Fatal(err)
30+
}
31+
cipher.XorBytes(d1, p, q)
32+
n := min(p, q)
33+
for i := 0; i < n; i++ {
34+
d2[i] = p[i] ^ q[i]
35+
}
36+
if !bytes.Equal(d1, d2) {
37+
t.Logf("p: %#v", p)
38+
t.Logf("q: %#v", q)
39+
t.Logf("expect: %#v", d2)
40+
t.Logf("result: %#v", d1)
41+
t.Fatal("not equal")
42+
}
2443
}
2544
}
2645
}
2746
}
2847
}
48+
49+
// min returns the smaller of len(a) and len(b).
// NOTE(review): on Go 1.21 and later this declaration shadows the built-in
// min within the test package.
func min(a, b []byte) int {
50+
n := len(a)
51+
if len(b) < n {
52+
n = len(b)
53+
}
54+
return n
55+
}
56+
57+
// BenchmarkXORBytes measures cipher.XorBytes on equal-length inputs of
// 8, 128, 2048 and 32768 bytes, one sub-benchmark per size.
func BenchmarkXORBytes(b *testing.B) {
58+
dst := make([]byte, 1<<15) // sized for the largest case and reused by every sub-benchmark
59+
data0 := make([]byte, 1<<15)
60+
data1 := make([]byte, 1<<15)
61+
sizes := []int64{1 << 3, 1 << 7, 1 << 11, 1 << 15} // 8, 128, 2048 and 32768 bytes
62+
for _, size := range sizes {
63+
b.Run(fmt.Sprintf("%dBytes", size), func(b *testing.B) {
64+
s0 := data0[:size]
65+
s1 := data1[:size]
66+
b.SetBytes(int64(size)) // bytes processed per iteration, so the benchmark reports throughput; size is already an int64
67+
for i := 0; i < b.N; i++ {
68+
cipher.XorBytes(dst, s0, s1)
69+
}
70+
})
71+
}
72+
}

0 commit comments

Comments
 (0)