Skip to content

Commit 4f1f503

Browse files
vkrasnov authored and bradfitz committed
crypto/aes: implement AES-GCM AEAD for arm64
Use the dedicated AES* and PMULL* instructions to accelerate AES-GCM name old time/op new time/op delta AESGCMSeal1K-46 12.1µs ± 0% 0.9µs ± 0% -92.66% (p=0.000 n=9+10) AESGCMOpen1K-46 12.1µs ± 0% 0.9µs ± 0% -92.43% (p=0.000 n=10+10) AESGCMSign8K-46 58.6µs ± 0% 2.1µs ± 0% -96.41% (p=0.000 n=9+8) AESGCMSeal8K-46 92.8µs ± 0% 5.7µs ± 0% -93.86% (p=0.000 n=9+9) AESGCMOpen8K-46 92.9µs ± 0% 5.7µs ± 0% -93.84% (p=0.000 n=8+9) name old speed new speed delta AESGCMSeal1K-46 84.7MB/s ± 0% 1153.4MB/s ± 0% +1262.21% (p=0.000 n=9+10) AESGCMOpen1K-46 84.4MB/s ± 0% 1115.2MB/s ± 0% +1220.53% (p=0.000 n=10+10) AESGCMSign8K-46 140MB/s ± 0% 3894MB/s ± 0% +2687.50% (p=0.000 n=9+10) AESGCMSeal8K-46 88.2MB/s ± 0% 1437.5MB/s ± 0% +1529.30% (p=0.000 n=9+9) AESGCMOpen8K-46 88.2MB/s ± 0% 1430.5MB/s ± 0% +1522.01% (p=0.000 n=8+9) This change mirrors the current amd64 implementation, and provides optimal performance on a range of arm64 processors including Centriq 2400 and Apple A12. By and large it is implicitly tested by the robustness of the already existing amd64 implementation. The implementation interleaves GHASH with CTR mode to achieve the highest possible throughput, it also aggregates GHASH with a factor of 8, to decrease the cost of the reduction step. Even thought there is a significant amount of assembly, the code reuses the go code for the amd64 implementation, so there is little additional go code. Since AES-GCM is critical for performance of all web servers, this change is required to level the playfield for arm64 CPUs, where amd64 currently enjoys an unfair advantage. Ideally both amd64 and arm64 codepaths could be replaced by hypothetical AES and CLMUL intrinsics, with a few additional vector instructions. Fixes #18498 Fixes #19840 Change-Id: Icc57b868cd1f67ac695c1ac163a8e215f74c7910 Reviewed-on: https://go-review.googlesource.com/107298 Run-TryBot: Vlad Krasnov <[email protected]> TryBot-Result: Gobot Gobot <[email protected]> Reviewed-by: Brad Fitzpatrick <[email protected]>
1 parent c814ac4 commit 4f1f503

File tree

8 files changed

+1217
-151
lines changed

8 files changed

+1217
-151
lines changed

src/crypto/aes/aes_gcm.go

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
// +build amd64
5+
// +build amd64 arm64
66

77
package aes
88

@@ -13,10 +13,7 @@ import (
1313
"errors"
1414
)
1515

16-
// The following functions are defined in gcm_amd64.s.
17-
18-
//go:noescape
19-
func aesEncBlock(dst, src *[16]byte, ks []uint32)
16+
// The following functions are defined in gcm_*.s.
2017

2118
//go:noescape
2219
func gcmAesInit(productTable *[256]byte, ks []uint32)
@@ -118,7 +115,7 @@ func (g *gcmAsm) Seal(dst, nonce, plaintext, data []byte) []byte {
118115
gcmAesFinish(&g.productTable, &tagMask, &counter, uint64(len(nonce)), uint64(0))
119116
}
120117

121-
aesEncBlock(&tagMask, &counter, g.ks)
118+
encryptBlockAsm(len(g.ks)/4-1, &g.ks[0], &tagMask[0], &counter[0])
122119

123120
var tagOut [gcmTagSize]byte
124121
gcmAesData(&g.productTable, data, &tagOut)
@@ -171,7 +168,7 @@ func (g *gcmAsm) Open(dst, nonce, ciphertext, data []byte) ([]byte, error) {
171168
gcmAesFinish(&g.productTable, &tagMask, &counter, uint64(len(nonce)), uint64(0))
172169
}
173170

174-
aesEncBlock(&tagMask, &counter, g.ks)
171+
encryptBlockAsm(len(g.ks)/4-1, &g.ks[0], &tagMask[0], &counter[0])
175172

176173
var expectedTag [gcmTagSize]byte
177174
gcmAesData(&g.productTable, data, &expectedTag)

src/crypto/aes/asm_arm64.s

Lines changed: 175 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,12 @@
33
// license that can be found in the LICENSE file.
44

55
#include "textflag.h"
6-
6+
DATA rotInvSRows<>+0x00(SB)/8, $0x080f0205040b0e01
7+
DATA rotInvSRows<>+0x08(SB)/8, $0x00070a0d0c030609
8+
GLOBL rotInvSRows<>(SB), (NOPTR+RODATA), $16
9+
DATA invSRows<>+0x00(SB)/8, $0x0b0e0104070a0d00
10+
DATA invSRows<>+0x08(SB)/8, $0x0306090c0f020508
11+
GLOBL invSRows<>(SB), (NOPTR+RODATA), $16
712
// func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
813
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
914
MOVD nr+0(FP), R9
@@ -105,3 +110,172 @@ dec128:
105110
VEOR V0.B16, V15.B16, V0.B16
106111
VST1 [V0.B16], (R11)
107112
RET
113+
114+
// func expandKeyAsm(nr int, key *byte, enc, dec *uint32) {
115+
// Note that round keys are stored in uint128 format, not uint32
116+
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
117+
MOVD nr+0(FP), R8
118+
MOVD key+8(FP), R9
119+
MOVD enc+16(FP), R10
120+
MOVD dec+24(FP), R11
121+
LDP rotInvSRows<>(SB), (R0, R1)
122+
VMOV R0, V3.D[0]
123+
VMOV R1, V3.D[1]
124+
VEOR V0.B16, V0.B16, V0.B16 // All zeroes
125+
MOVW $1, R13
126+
TBZ $1, R8, ks192
127+
TBNZ $2, R8, ks256
128+
LDPW (R9), (R4, R5)
129+
LDPW 8(R9), (R6, R7)
130+
STPW.P (R4, R5), 8(R10)
131+
STPW.P (R6, R7), 8(R10)
132+
MOVW $0x1b, R14
133+
ks128Loop:
134+
VMOV R7, V2.S[0]
135+
WORD $0x4E030042 // TBL V3.B16, [V2.B16], V2.B16
136+
AESE V0.B16, V2.B16 // Use AES to compute the SBOX
137+
EORW R13, R4
138+
LSLW $1, R13 // Compute next Rcon
139+
ANDSW $0x100, R13, ZR
140+
CSELW NE, R14, R13, R13 // Fake modulo
141+
SUBS $1, R8
142+
VMOV V2.S[0], R0
143+
EORW R0, R4
144+
EORW R4, R5
145+
EORW R5, R6
146+
EORW R6, R7
147+
STPW.P (R4, R5), 8(R10)
148+
STPW.P (R6, R7), 8(R10)
149+
BNE ks128Loop
150+
CBZ R11, ksDone // If dec is nil we are done
151+
SUB $176, R10
152+
// Decryption keys are encryption keys with InverseMixColumns applied
153+
VLD1.P 64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
154+
VMOV V0.B16, V7.B16
155+
AESIMC V1.B16, V6.B16
156+
AESIMC V2.B16, V5.B16
157+
AESIMC V3.B16, V4.B16
158+
VLD1.P 64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
159+
AESIMC V0.B16, V11.B16
160+
AESIMC V1.B16, V10.B16
161+
AESIMC V2.B16, V9.B16
162+
AESIMC V3.B16, V8.B16
163+
VLD1 (R10), [V0.B16, V1.B16, V2.B16]
164+
AESIMC V0.B16, V14.B16
165+
AESIMC V1.B16, V13.B16
166+
VMOV V2.B16, V12.B16
167+
VST1.P [V12.B16, V13.B16, V14.B16], 48(R11)
168+
VST1.P [V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
169+
VST1 [V4.B16, V5.B16, V6.B16, V7.B16], (R11)
170+
B ksDone
171+
ks192:
172+
LDPW (R9), (R2, R3)
173+
LDPW 8(R9), (R4, R5)
174+
LDPW 16(R9), (R6, R7)
175+
STPW.P (R2, R3), 8(R10)
176+
STPW.P (R4, R5), 8(R10)
177+
SUB $4, R8
178+
ks192Loop:
179+
STPW.P (R6, R7), 8(R10)
180+
VMOV R7, V2.S[0]
181+
WORD $0x4E030042 //TBL V3.B16, [V2.B16], V2.B16
182+
AESE V0.B16, V2.B16
183+
EORW R13, R2
184+
LSLW $1, R13
185+
SUBS $1, R8
186+
VMOV V2.S[0], R0
187+
EORW R0, R2
188+
EORW R2, R3
189+
EORW R3, R4
190+
EORW R4, R5
191+
EORW R5, R6
192+
EORW R6, R7
193+
STPW.P (R2, R3), 8(R10)
194+
STPW.P (R4, R5), 8(R10)
195+
BNE ks192Loop
196+
CBZ R11, ksDone
197+
SUB $208, R10
198+
VLD1.P 64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
199+
VMOV V0.B16, V7.B16
200+
AESIMC V1.B16, V6.B16
201+
AESIMC V2.B16, V5.B16
202+
AESIMC V3.B16, V4.B16
203+
VLD1.P 64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
204+
AESIMC V0.B16, V11.B16
205+
AESIMC V1.B16, V10.B16
206+
AESIMC V2.B16, V9.B16
207+
AESIMC V3.B16, V8.B16
208+
VLD1.P 64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
209+
AESIMC V0.B16, V15.B16
210+
AESIMC V1.B16, V14.B16
211+
AESIMC V2.B16, V13.B16
212+
AESIMC V3.B16, V12.B16
213+
VLD1 (R10), [V0.B16]
214+
VST1.P [V0.B16], 16(R11)
215+
VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(R11)
216+
VST1.P [V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
217+
VST1 [V4.B16, V5.B16, V6.B16, V7.B16], (R11)
218+
B ksDone
219+
ks256:
220+
LDP invSRows<>(SB), (R0, R1)
221+
VMOV R0, V4.D[0]
222+
VMOV R1, V4.D[1]
223+
LDPW (R9), (R0, R1)
224+
LDPW 8(R9), (R2, R3)
225+
LDPW 16(R9), (R4, R5)
226+
LDPW 24(R9), (R6, R7)
227+
STPW.P (R0, R1), 8(R10)
228+
STPW.P (R2, R3), 8(R10)
229+
SUB $7, R8
230+
ks256Loop:
231+
STPW.P (R4, R5), 8(R10)
232+
STPW.P (R6, R7), 8(R10)
233+
VMOV R7, V2.S[0]
234+
WORD $0x4E030042 //TBL V3.B16, [V2.B16], V2.B16
235+
AESE V0.B16, V2.B16
236+
EORW R13, R0
237+
LSLW $1, R13
238+
SUBS $1, R8
239+
VMOV V2.S[0], R9
240+
EORW R9, R0
241+
EORW R0, R1
242+
EORW R1, R2
243+
EORW R2, R3
244+
VMOV R3, V2.S[0]
245+
WORD $0x4E040042 //TBL V3.B16, [V2.B16], V2.B16
246+
AESE V0.B16, V2.B16
247+
VMOV V2.S[0], R9
248+
EORW R9, R4
249+
EORW R4, R5
250+
EORW R5, R6
251+
EORW R6, R7
252+
STPW.P (R0, R1), 8(R10)
253+
STPW.P (R2, R3), 8(R10)
254+
BNE ks256Loop
255+
CBZ R11, ksDone
256+
SUB $240, R10
257+
VLD1.P 64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
258+
VMOV V0.B16, V7.B16
259+
AESIMC V1.B16, V6.B16
260+
AESIMC V2.B16, V5.B16
261+
AESIMC V3.B16, V4.B16
262+
VLD1.P 64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
263+
AESIMC V0.B16, V11.B16
264+
AESIMC V1.B16, V10.B16
265+
AESIMC V2.B16, V9.B16
266+
AESIMC V3.B16, V8.B16
267+
VLD1.P 64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
268+
AESIMC V0.B16, V15.B16
269+
AESIMC V1.B16, V14.B16
270+
AESIMC V2.B16, V13.B16
271+
AESIMC V3.B16, V12.B16
272+
VLD1 (R10), [V0.B16, V1.B16, V2.B16]
273+
AESIMC V0.B16, V18.B16
274+
AESIMC V1.B16, V17.B16
275+
VMOV V2.B16, V16.B16
276+
VST1.P [V16.B16, V17.B16, V18.B16], 48(R11)
277+
VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(R11)
278+
VST1.P [V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
279+
VST1 [V4.B16, V5.B16, V6.B16, V7.B16], (R11)
280+
ksDone:
281+
RET

src/crypto/aes/cipher_arm64.go

Lines changed: 0 additions & 80 deletions
This file was deleted.

src/crypto/aes/cipher_amd64.go renamed to src/crypto/aes/cipher_asm.go

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5+
// +build amd64 arm64
6+
57
package aes
68

79
import (
@@ -10,23 +12,31 @@ import (
1012
"internal/cpu"
1113
)
1214

13-
// defined in asm_amd64.s
15+
// defined in asm_*.s
1416

17+
//go:noescape
1518
func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
19+
20+
//go:noescape
1621
func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)
22+
23+
//go:noescape
1724
func expandKeyAsm(nr int, key *byte, enc *uint32, dec *uint32)
1825

1926
type aesCipherAsm struct {
2027
aesCipher
2128
}
2229

30+
var supportsAES = cpu.X86.HasAES || cpu.ARM64.HasAES
31+
var supportsGFMUL = cpu.X86.HasPCLMULQDQ || cpu.ARM64.HasPMULL
32+
2333
func newCipher(key []byte) (cipher.Block, error) {
24-
if !cpu.X86.HasAES {
34+
if !supportsAES {
2535
return newCipherGeneric(key)
2636
}
2737
n := len(key) + 28
2838
c := aesCipherAsm{aesCipher{make([]uint32, n), make([]uint32, n)}}
29-
rounds := 10
39+
var rounds int
3040
switch len(key) {
3141
case 128 / 8:
3242
rounds = 10
@@ -37,10 +47,9 @@ func newCipher(key []byte) (cipher.Block, error) {
3747
}
3848

3949
expandKeyAsm(rounds, &key[0], &c.enc[0], &c.dec[0])
40-
if cpu.X86.HasAES && cpu.X86.HasPCLMULQDQ {
50+
if supportsAES && supportsGFMUL {
4151
return &aesCipherGCM{c}, nil
4252
}
43-
4453
return &c, nil
4554
}
4655

@@ -75,7 +84,7 @@ func (c *aesCipherAsm) Decrypt(dst, src []byte) {
7584
// expandKey is used by BenchmarkExpand to ensure that the asm implementation
7685
// of key expansion is used for the benchmark when it is available.
7786
func expandKey(key []byte, enc, dec []uint32) {
78-
if cpu.X86.HasAES {
87+
if supportsAES {
7988
rounds := 10 // rounds needed for AES128
8089
switch len(key) {
8190
case 192 / 8:

0 commit comments

Comments
 (0)