Commit e087300

crypto/aes: speedup CTR mode on AMD64 and ARM64
The implementation runs up to 8 AES instructions in different registers one after another in ASM code. Because the CPU pipelines instructions and the instructions do not depend on each other, they can run in parallel with this code layout. This results in a significant speedup compared to the regular implementation, in which blocks are processed in the same registers, so the AES instructions cannot run in parallel. GCM mode already uses this approach.

The ASM-backed type implementing ctrAble has most of its code in the XORKeyStreamAt method, which takes an additional argument, offset. This allows the stream to be used statelessly and to jump to any location in the keystream. The method does not exist in the pure Go and boringcrypto implementations.

AES CTR benchmark delta:

$ go test crypto/cipher -bench 'BenchmarkAESCTR*'

AMD64, Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz:

name                 old time/op  new time/op  delta
BenchmarkAESCTR1K-2  1259ns       266.9ns      -78.8%
BenchmarkAESCTR8K-2  9859ns       1953ns       -80.1%

ARM64, ARM Neoverse-N1 (AWS EC2 t4g.small instance):

name                 old time/op  new time/op  delta
BenchmarkAESCTR1K-2  1098ns       481.1ns      -56.2%
BenchmarkAESCTR8K-2  8447ns       3452ns       -59.1%

Original issue: #20967
Investigation and initial implementation: https://github.com/mmcloughlin/aesnix/
Full implementation in external repo: https://github.com/starius/aesctrat
1 parent 23ac159 commit e087300
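
For intuition about the XORKeyStreamAt semantics described in the message, here is a minimal pure-Go reference sketch built only on the public crypto/cipher API. The helper name xorKeyStreamAt is hypothetical and not part of this commit; unlike the assembly version, which computes the starting block counter directly from offset/BlockSize, this sketch simply discards the first offset keystream bytes.

package main

import (
	"crypto/aes"
	"crypto/cipher"
	"fmt"
)

// xorKeyStreamAt is a hypothetical reference model: XOR src into dst with the
// CTR keystream starting at byte position offset, i.e. as if the first offset
// bytes of the stream had already been produced and thrown away.
func xorKeyStreamAt(block cipher.Block, iv, dst, src []byte, offset uint64) {
	stream := cipher.NewCTR(block, iv)
	// Skip offset keystream bytes. The optimized assembly does not do this;
	// it derives the starting counter value directly from offset.
	skip := make([]byte, offset)
	stream.XORKeyStream(skip, skip)
	stream.XORKeyStream(dst, src)
}

func main() {
	key := make([]byte, 16) // illustrative all-zero AES-128 key
	iv := make([]byte, aes.BlockSize)
	block, err := aes.NewCipher(key)
	if err != nil {
		panic(err)
	}

	src := []byte("seekable CTR keystream")
	dst := make([]byte, len(src))
	xorKeyStreamAt(block, iv, dst, src, 100) // start at keystream byte 100
	fmt.Printf("%x\n", dst)
}
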

6 files changed: +2170, -0 lines

src/crypto/aes/ctr_multiblock.go

Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build amd64 || arm64

package aes

import (
	"crypto/cipher"
	"crypto/internal/alias"
)

//go:generate sh -c "go run ./ctr_multiblock_amd64_gen.go | asmfmt > ctr_multiblock_amd64.s"
//go:generate sh -c "go run ./ctr_multiblock_arm64_gen.go | asmfmt > ctr_multiblock_arm64.s"

// defined in ctr_multiblock_*.s

//go:noescape
func rev16Asm(iv *byte)

//go:noescape
func ctrBlocks1Asm(nr int, xk *uint32, dst, src, ivRev *byte, blockIndex uint64)

//go:noescape
func ctrBlocks2Asm(nr int, xk *uint32, dst, src, ivRev *byte, blockIndex uint64)

//go:noescape
func ctrBlocks4Asm(nr int, xk *uint32, dst, src, ivRev *byte, blockIndex uint64)

//go:noescape
func ctrBlocks8Asm(nr int, xk *uint32, dst, src, ivRev *byte, blockIndex uint64)

type aesCtrWithIV struct {
	enc    []uint32
	rounds int
	ivRev  [BlockSize]byte
	offset uint64
}

// NewCTR implements crypto/cipher.ctrAble so that crypto/cipher.NewCTR
// will use the optimised implementation in this file when possible.
func (c *aesCipherAsm) NewCTR(iv []byte) cipher.Stream {
	if len(iv) != BlockSize {
		panic("bad IV length")
	}

	// Reverse IV once, because it is needed in reversed form
	// in all subsequent ASM calls.
	var ivRev [BlockSize]byte
	copy(ivRev[:], iv)
	rev16Asm(&ivRev[0])

	return &aesCtrWithIV{
		enc:    c.enc,
		rounds: len(c.enc)/4 - 1,
		ivRev:  ivRev,
		offset: 0,
	}
}

func (c *aesCtrWithIV) XORKeyStream(dst, src []byte) {
	c.XORKeyStreamAt(dst, src, c.offset)
	c.offset += uint64(len(src))
}

func (c *aesCtrWithIV) XORKeyStreamAt(dst, src []byte, offset uint64) {
	if len(dst) < len(src) {
		panic("len(dst) < len(src)")
	}
	dst = dst[:len(src)]

	if alias.InexactOverlap(dst, src) {
		panic("crypto/aes: invalid buffer overlap")
	}

	offsetMod16 := offset % BlockSize

	if offsetMod16 != 0 {
		// We have a partial block at the beginning.
		plaintext := make([]byte, BlockSize)
		copy(plaintext[offsetMod16:BlockSize], src)
		ciphertext := make([]byte, BlockSize)
		ctrBlocks1Asm(c.rounds, &c.enc[0], &ciphertext[0], &plaintext[0], &c.ivRev[0], offset/BlockSize)
		progress := BlockSize - offsetMod16
		if progress > uint64(len(src)) {
			progress = uint64(len(src))
		}
		copy(dst[:progress], ciphertext[offsetMod16:BlockSize])
		src = src[progress:]
		dst = dst[progress:]
		offset += progress
	}

	for len(src) >= 8*BlockSize {
		ctrBlocks8Asm(c.rounds, &c.enc[0], &dst[0], &src[0], &c.ivRev[0], offset/BlockSize)
		src = src[8*BlockSize:]
		dst = dst[8*BlockSize:]
		offset += 8 * BlockSize
	}
	// The tail can contain at most one each of 4, 2, and 1 blocks,
	// so use if, not for.
	if len(src) >= 4*BlockSize {
		ctrBlocks4Asm(c.rounds, &c.enc[0], &dst[0], &src[0], &c.ivRev[0], offset/BlockSize)
		src = src[4*BlockSize:]
		dst = dst[4*BlockSize:]
		offset += 4 * BlockSize
	}
	if len(src) >= 2*BlockSize {
		ctrBlocks2Asm(c.rounds, &c.enc[0], &dst[0], &src[0], &c.ivRev[0], offset/BlockSize)
		src = src[2*BlockSize:]
		dst = dst[2*BlockSize:]
		offset += 2 * BlockSize
	}
	if len(src) >= 1*BlockSize {
		ctrBlocks1Asm(c.rounds, &c.enc[0], &dst[0], &src[0], &c.ivRev[0], offset/BlockSize)
		src = src[1*BlockSize:]
		dst = dst[1*BlockSize:]
		offset += 1 * BlockSize
	}

	if len(src) != 0 {
		// We have a partial block at the end.
		plaintext := make([]byte, BlockSize)
		copy(plaintext, src)
		ciphertext := make([]byte, BlockSize)
		ctrBlocks1Asm(c.rounds, &c.enc[0], &ciphertext[0], &plaintext[0], &c.ivRev[0], offset/BlockSize)
		copy(dst, ciphertext)
	}
}
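
Callers do not need any code changes to benefit from this: crypto/cipher.NewCTR checks whether the cipher.Block implements the unexported ctrAble interface and, when it does, delegates to the block's own NewCTR, which on amd64/arm64 with hardware AES support now returns the multiblock stream defined above. A minimal usage sketch (key and data are placeholders):

package main

import (
	"crypto/aes"
	"crypto/cipher"
	"fmt"
)

func main() {
	key := make([]byte, 32) // AES-256; use a properly generated key in practice
	iv := make([]byte, aes.BlockSize)

	block, err := aes.NewCipher(key)
	if err != nil {
		panic(err)
	}

	// On amd64/arm64 with AES instructions, NewCTR takes the ctrAble fast
	// path added by this commit; elsewhere it falls back to the generic
	// CTR stream.
	stream := cipher.NewCTR(block, iv)

	plaintext := []byte("hello, CTR mode")
	ciphertext := make([]byte, len(plaintext))
	stream.XORKeyStream(ciphertext, plaintext)
	fmt.Printf("%x\n", ciphertext)
}
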
