Skip to content

Commit b49d69b

Browse files
committed
internal/chacha20: add s390x SIMD implementation
Based on the SIMD algorithm described in:

ChaCha, a variant of Salsa20
by Daniel J. Bernstein
https://cr.yp.to/chacha/chacha-20080128.pdf

Requires the vector facility (vx).

name              old speed     new speed      delta
ChaCha20/32       178MB/s ± 0%   174MB/s ± 0%    -2.10%  (p=0.000 n=9+10)
ChaCha20/63       341MB/s ± 0%   337MB/s ± 0%    -1.16%  (p=0.000 n=10+10)
ChaCha20/64       367MB/s ± 0%   335MB/s ± 0%    -8.73%  (p=0.000 n=10+10)
ChaCha20/256      404MB/s ± 0%  1448MB/s ± 0%  +258.61%  (p=0.000 n=9+10)
ChaCha20/1024     410MB/s ± 0%  1568MB/s ± 0%  +282.73%  (p=0.000 n=9+10)
ChaCha20/1350     393MB/s ± 0%  1389MB/s ± 0%  +253.58%  (p=0.000 n=10+10)
ChaCha20/65536    414MB/s ± 0%  1634MB/s ± 0%  +294.79%  (p=0.000 n=10+10)

Change-Id: I9a600fb5ae8ee3f3b81ae6b01cff139c1272d684
Reviewed-on: https://go-review.googlesource.com/35842
Run-TryBot: Michael Munday <[email protected]>
Reviewed-by: Brad Fitzpatrick <[email protected]>
1 parent 754cb46 commit b49d69b

File tree

4 files changed

+336
-3
lines changed

4 files changed

+336
-3
lines changed

internal/chacha20/asm_s390x.s

Lines changed: 283 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,283 @@
1+
// Copyright 2018 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
// +build s390x,!gccgo,!appengine
6+
7+
#include "go_asm.h"
8+
#include "textflag.h"
9+
10+
// This is an implementation of the ChaCha20 encryption algorithm as
11+
// specified in RFC 7539. It uses vector instructions to compute
12+
// 4 keystream blocks in parallel (256 bytes) which are then XORed
13+
// with the bytes in the input slice.
14+
15+
// constants is a 32-byte read-only table holding two 16-byte vectors.
// xorKeyStreamVX loads both with a single VLM into BSWAP and J0.
GLOBL ·constants<>(SB), RODATA|NOPTR, $32
16+
// BSWAP: swap bytes in each 4-byte element.
// Used as the VPERM byte-selection mask by PERMUTE (via XORV).
17+
DATA ·constants<>+0x00(SB)/4, $0x03020100
18+
DATA ·constants<>+0x04(SB)/4, $0x07060504
19+
DATA ·constants<>+0x08(SB)/4, $0x0b0a0908
20+
DATA ·constants<>+0x0c(SB)/4, $0x0f0e0d0c
21+
// J0: [j0, j1, j2, j3], the four constant words that xorKeyStreamVX
// broadcasts into state words X0-X3. Read as little-endian bytes they
// spell the ASCII string "expand 32-byte k".
22+
DATA ·constants<>+0x10(SB)/4, $0x61707865
23+
DATA ·constants<>+0x14(SB)/4, $0x3320646e
24+
DATA ·constants<>+0x18(SB)/4, $0x79622d32
25+
DATA ·constants<>+0x1c(SB)/4, $0x6b206574
26+
27+
// EXRL targets:
28+
// mvcSrcToBuf is not meant to be called. Its MVC instruction is the target
// of the EXRL in xorKeyStreamVX's tail-buffer setup, which executes it with
// R12 as the EXRL register operand. At that point R1 holds the address of
// the partial final chunk of src and R8 holds buf, so the chunk is copied
// from src into buf.
TEXT ·mvcSrcToBuf(SB), NOFRAME|NOSPLIT, $0
29+
MVC $1, (R1), (R8)
30+
RET
31+
32+
// mvcBufToDst is not meant to be called. Its MVC instruction is the target
// of the EXRL at the end of xorKeyStreamVX, which executes it with R12 as
// the EXRL register operand. At that point R8 holds buf and R9 holds the
// saved dst pointer for the partial final chunk, so the processed chunk is
// copied from buf into dst.
TEXT ·mvcBufToDst(SB), NOFRAME|NOSPLIT, $0
33+
MVC $1, (R8), (R9)
34+
RET
35+
36+
// Vector register assignments used by xorKeyStreamVX and the macros below.

// BSWAP and J0 are loaded from the constants table.
#define BSWAP V5
37+
#define J0 V6
38+
// KEY0 and KEY1 are loaded together from the key pointer (32 bytes).
#define KEY0 V7
39+
#define KEY1 V8
40+
// NONCE is loaded from the nonce pointer and shifted before use.
#define NONCE V9
41+
// CTR holds one block counter per 32-bit element (one per parallel block).
#define CTR V10
42+
// M0-M3 hold 64 bytes of input in XORV and serve as scratch in SHUFFLE.
#define M0 V11
43+
#define M1 V12
44+
#define M2 V13
45+
#define M3 V14
46+
// INC is the per-element increment applied to CTR.
#define INC V15
47+
// X0-X15 hold the 16 state words. During the rounds, register Xi carries
// state word i for each of the four blocks being computed in parallel.
#define X0 V16
48+
#define X1 V17
49+
#define X2 V18
50+
#define X3 V19
51+
#define X4 V20
52+
#define X5 V21
53+
#define X6 V22
54+
#define X7 V23
55+
#define X8 V24
56+
#define X9 V25
57+
#define X10 V26
58+
#define X11 V27
59+
#define X12 V28
60+
#define X13 V29
61+
#define X14 V30
62+
#define X15 V31
63+
64+
// NUM_ROUNDS is the total number of rounds. The main loop performs two
// ROUND4 invocations per iteration and so iterates NUM_ROUNDS/2 times.
#define NUM_ROUNDS 20
65+
66+
// ROUND4 performs four independent quarter rounds, interleaved instruction
// by instruction, one per argument group (a*, b*, c*, d*). Within a group
// (x0, x1, x2, x3) the sequence on 32-bit elements is:
//
//	x0 += x1; x2 ^= x0; x2 <<<= 16
//	x3 += x2; x1 ^= x3; x1 <<<= 12
//	x0 += x1; x2 ^= x0; x2 <<<= 8
//	x3 += x2; x1 ^= x3; x1 <<<= 7
//
// In terms of the RFC 7539 quarter round QUARTERROUND(a, b, c, d), the
// arguments of each group are therefore passed in the order (a, b, d, c).
// All sixteen registers are updated in place.
#define ROUND4(a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3) \
67+
VAF a1, a0, a0 \
68+
VAF b1, b0, b0 \
69+
VAF c1, c0, c0 \
70+
VAF d1, d0, d0 \
71+
VX a0, a2, a2 \
72+
VX b0, b2, b2 \
73+
VX c0, c2, c2 \
74+
VX d0, d2, d2 \
75+
VERLLF $16, a2, a2 \
76+
VERLLF $16, b2, b2 \
77+
VERLLF $16, c2, c2 \
78+
VERLLF $16, d2, d2 \
79+
VAF a2, a3, a3 \
80+
VAF b2, b3, b3 \
81+
VAF c2, c3, c3 \
82+
VAF d2, d3, d3 \
83+
VX a3, a1, a1 \
84+
VX b3, b1, b1 \
85+
VX c3, c1, c1 \
86+
VX d3, d1, d1 \
87+
VERLLF $12, a1, a1 \
88+
VERLLF $12, b1, b1 \
89+
VERLLF $12, c1, c1 \
90+
VERLLF $12, d1, d1 \
91+
VAF a1, a0, a0 \
92+
VAF b1, b0, b0 \
93+
VAF c1, c0, c0 \
94+
VAF d1, d0, d0 \
95+
VX a0, a2, a2 \
96+
VX b0, b2, b2 \
97+
VX c0, c2, c2 \
98+
VX d0, d2, d2 \
99+
VERLLF $8, a2, a2 \
100+
VERLLF $8, b2, b2 \
101+
VERLLF $8, c2, c2 \
102+
VERLLF $8, d2, d2 \
103+
VAF a2, a3, a3 \
104+
VAF b2, b3, b3 \
105+
VAF c2, c3, c3 \
106+
VAF d2, d3, d3 \
107+
VX a3, a1, a1 \
108+
VX b3, b1, b1 \
109+
VX c3, c1, c1 \
110+
VX d3, d1, d1 \
111+
VERLLF $7, a1, a1 \
112+
VERLLF $7, b1, b1 \
113+
VERLLF $7, c1, c1 \
114+
VERLLF $7, d1, d1
115+
116+
// PERMUTE rearranges the bytes of v0, v1, v2 and v3 in place, applying the
// same VPERM byte-selection mask to each register.
#define PERMUTE(mask, v0, v1, v2, v3) \
117+
VPERM v0, v0, mask, v0 \
118+
VPERM v1, v1, mask, v1 \
119+
VPERM v2, v2, mask, v2 \
120+
VPERM v3, v3, mask, v3
121+
122+
// ADDV adds vector x to each of v0, v1, v2 and v3 in place, as four
// independent 32-bit element additions per register.
#define ADDV(x, v0, v1, v2, v3) \
123+
VAF x, v0, v0 \
124+
VAF x, v1, v1 \
125+
VAF x, v2, v2 \
126+
VAF x, v3, v3
127+
128+
// XORV XORs one 64-byte block of input with keystream and stores the result:
//
//	dst[off:off+64] = src[off:off+64] ^ byteswap32(v0 || v1 || v2 || v3)
//
// src and dst are address registers, off is a constant displacement.
// v0-v3 are byte-swapped in place (BSWAP mask) and M0-M3 are clobbered.
#define XORV(off, dst, src, v0, v1, v2, v3) \
129+
VLM off(src), M0, M3 \
130+
PERMUTE(BSWAP, v0, v1, v2, v3) \
131+
VX v0, M0, M0 \
132+
VX v1, M1, M1 \
133+
VX v2, M2, M2 \
134+
VX v3, M3, M3 \
135+
VSTM M0, M3, off(dst)
136+
137+
// SHUFFLE transposes the 4x4 matrix of 32-bit elements whose rows are the
// registers a, b, c and d. On exit a holds element 0 of each input row,
// b element 1, c element 2 and d element 3. t, u, v and w are scratch
// registers and are clobbered.
#define SHUFFLE(a, b, c, d, t, u, v, w) \
138+
VMRHF a, c, t \ // t = {a[0], c[0], a[1], c[1]}
139+
VMRHF b, d, u \ // u = {b[0], d[0], b[1], d[1]}
140+
VMRLF a, c, v \ // v = {a[2], c[2], a[3], c[3]}
141+
VMRLF b, d, w \ // w = {b[2], d[2], b[3], d[3]}
142+
VMRHF t, u, a \ // a = {a[0], b[0], c[0], d[0]}
143+
VMRLF t, u, b \ // b = {a[1], b[1], c[1], d[1]}
144+
VMRHF v, w, c \ // c = {a[2], b[2], c[2], d[2]}
145+
VMRLF v, w, d // d = {a[3], b[3], c[3], d[3]}
146+
147+
// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32, buf *[256]byte, len *int)
//
// xorKeyStreamVX XORs src with ChaCha20 keystream and writes the result to
// dst, producing four 64-byte blocks (256 bytes) per iteration of the main
// loop. *counter is read on entry and written back on return.
//
// When len(src) is not a multiple of 256 the final partial chunk is copied
// into buf, processed there as a full 256-byte chunk, and the leading
// len(src)%256 bytes are copied from buf to dst. In that case *len is set
// to 256 - len(src)%256 before any keystream is generated.
//
// Register use after the prologue:
//	R2 = dst pointer      R3 = src pointer     R4 = remaining length
//	R5 = key              R6 = nonce           R7 = counter
//	R8 = buf              R9 = len pointer, later reused for the dst tail
//	R12 = (len(src)-1) & 255, the EXRL length operand; 255 means no tail
148+
TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
149+
MOVD $·constants<>(SB), R1
150+
MOVD dst+0(FP), R2 // R2=&dst[0]
151+
LMG src+24(FP), R3, R4 // R3=&src[0] R4=len(src)
152+
MOVD key+48(FP), R5 // R5=key
153+
MOVD nonce+56(FP), R6 // R6=nonce
154+
MOVD counter+64(FP), R7 // R7=counter
155+
MOVD buf+72(FP), R8 // R8=buf
156+
MOVD len+80(FP), R9 // R9=len
157+
158+
// load BSWAP and J0
159+
VLM (R1), BSWAP, J0
160+
161+
// set up tail buffer
// R12 = low byte of len(src)-1. It equals 255 exactly when len(src) is a
// multiple of 256, in which case there is no partial chunk to buffer.
162+
ADD $-1, R4, R12
163+
MOVBZ R12, R12
164+
CMPUBEQ R12, $255, aligned
// R1 = &src[len(src) &^ 255], the start of the partial final chunk.
165+
MOVD R4, R1
166+
AND $~255, R1
167+
MOVD $(R3)(R1*1), R1
// Copy the partial chunk from src into buf (see mvcSrcToBuf).
168+
EXRL $·mvcSrcToBuf(SB), R12
// *len = 255 - R12 = 256 - len(src)%256.
169+
MOVD $255, R0
170+
SUB R12, R0
171+
MOVD R0, (R9) // update len
172+
173+
aligned:
174+
// setup
// NOTE(review): VLL takes the index of the highest byte to load. 95 looks
// like a bit count (96 bits - 1) rather than the byte index 11 of a 12-byte
// nonce. Presumably the load saturates at a full 16 bytes and the 4 extra
// bytes are discarded by the VSRLB below; confirm against the z/Architecture
// Principles of Operation.
175+
MOVD $95, R0
176+
VLM (R5), KEY0, KEY1
177+
VLL R0, (R6), NONCE
// Shift NONCE right using a shift count of 32 placed in byte 7 of M0.
// The code below reads the three nonce words from elements 1-3, and
// element 0 is the position the block counter occupies in the state.
178+
VZERO M0
179+
VLEIB $7, $32, M0
180+
VSRLB M0, NONCE, NONCE
181+
182+
// initialize counter values
// CTR = {c, c+1, c+2, c+3} where c = *counter; afterwards INC = {4, 4, 4, 4}.
183+
VLREPF (R7), CTR
184+
VZERO INC
185+
VLEIF $1, $1, INC
186+
VLEIF $2, $2, INC
187+
VLEIF $3, $3, INC
188+
VAF INC, CTR, CTR
189+
VREPIF $4, INC
190+
191+
chacha:
// Build the initial state for four blocks: each Xi gets state word i
// replicated across all four elements, except X12 which takes the
// per-block counters from CTR.
192+
VREPF $0, J0, X0
193+
VREPF $1, J0, X1
194+
VREPF $2, J0, X2
195+
VREPF $3, J0, X3
196+
VREPF $0, KEY0, X4
197+
VREPF $1, KEY0, X5
198+
VREPF $2, KEY0, X6
199+
VREPF $3, KEY0, X7
200+
VREPF $0, KEY1, X8
201+
VREPF $1, KEY1, X9
202+
VREPF $2, KEY1, X10
203+
VREPF $3, KEY1, X11
204+
VLR CTR, X12
205+
VREPF $1, NONCE, X13
206+
VREPF $2, NONCE, X14
207+
VREPF $3, NONCE, X15
208+
209+
MOVD $(NUM_ROUNDS/2), R1
210+
211+
loop:
// One column round followed by one diagonal round per iteration.
// ROUND4 takes each quarter round's words in (a, b, d, c) order.
212+
ROUND4(X0, X4, X12, X8, X1, X5, X13, X9, X2, X6, X14, X10, X3, X7, X15, X11)
213+
ROUND4(X0, X5, X15, X10, X1, X6, X12, X11, X2, X7, X13, X8, X3, X4, X14, X9)
214+
215+
ADD $-1, R1
216+
BNE loop
217+
218+
// decrement length
// A negative result means fewer than 256 bytes remained: switch to buf.
219+
ADD $-256, R4
220+
BLT tail
221+
222+
continue:
223+
// rearrange vectors
// Transpose each group of four state words so that every register holds
// four consecutive words of a single block, then add the matching words
// of the initial state. The per-block counters are added to X12 before
// the transpose because they differ between blocks.
224+
SHUFFLE(X0, X1, X2, X3, M0, M1, M2, M3)
225+
ADDV(J0, X0, X1, X2, X3)
226+
SHUFFLE(X4, X5, X6, X7, M0, M1, M2, M3)
227+
ADDV(KEY0, X4, X5, X6, X7)
228+
SHUFFLE(X8, X9, X10, X11, M0, M1, M2, M3)
229+
ADDV(KEY1, X8, X9, X10, X11)
230+
VAF CTR, X12, X12
231+
SHUFFLE(X12, X13, X14, X15, M0, M1, M2, M3)
232+
ADDV(NONCE, X12, X13, X14, X15)
233+
234+
// increment counters
235+
VAF INC, CTR, CTR
236+
237+
// xor keystream with plaintext
// Block k (k = 0..3) consists of registers Xk, X(4+k), X(8+k), X(12+k).
238+
XORV(0*64, R2, R3, X0, X4, X8, X12)
239+
XORV(1*64, R2, R3, X1, X5, X9, X13)
240+
XORV(2*64, R2, R3, X2, X6, X10, X14)
241+
XORV(3*64, R2, R3, X3, X7, X11, X15)
242+
243+
// increment pointers
244+
MOVD $256(R2), R2
245+
MOVD $256(R3), R3
246+
247+
CMPBNE R4, $0, chacha
// All input consumed. If a partial chunk went through buf (R12 != 255),
// copy its processed bytes from buf to the saved dst pointer in R9.
248+
CMPUBEQ R12, $255, return
249+
EXRL $·mvcBufToDst(SB), R12 // len was updated during setup
250+
251+
return:
// Write element 0 of CTR, the counter for the next block, back to *counter.
252+
VSTEF $0, CTR, (R7)
253+
RET
254+
255+
tail:
// Final partial chunk: remember where its output belongs in dst (R9),
// redirect both the source and destination pointers to buf, and force the
// remaining length to zero so the main loop exits after this pass.
256+
MOVD R2, R9
257+
MOVD R8, R2
258+
MOVD R8, R3
259+
MOVD $0, R4
260+
JMP continue
261+
262+
// func hasVectorFacility() bool
//
// hasVectorFacility stores the facility list into a 24-byte stack buffer
// with STFLE, tests one bit of it, and, if that bit is set, executes two
// vector instructions and checks their result. It returns true only when
// both checks pass.
263+
TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1
264+
MOVD $x-24(SP), R1
265+
XC $24, 0(R1), 0(R1) // clear the storage
266+
MOVD $2, R0 // R0 is the number of double words stored -1
267+
WORD $0xB2B01000 // STFLE 0(R1)
268+
XOR R0, R0 // reset the value of R0
// z-8(SP) is byte 16 of the 24-byte buffer, i.e. facility bits 128-135.
// The 0x40 mask selects bit 129, presumably the vector facility bit;
// confirm against the STFLE facility-bit assignments.
269+
MOVBZ z-8(SP), R1
270+
AND $0x40, R1
271+
BEQ novector
272+
273+
vectorinstalled:
274+
// check if the vector instruction has been enabled
// Insert 0xF into byte 0 of V16 and read it back into R1.
275+
VLEIB $0, $0xF, V16
276+
VLGVB $0, V16, R1
277+
CMPBNE R1, $0xF, novector
278+
MOVB $1, ret+0(FP) // have vx
279+
RET
280+
281+
novector:
282+
MOVB $0, ret+0(FP) // no vx
283+
RET

internal/chacha20/chacha_generic.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,10 @@ var _ cipher.Stream = (*Cipher)(nil)
1818
// and nonce. A *Cipher implements the cipher.Stream interface.
1919
type Cipher struct {
2020
key [8]uint32
21+
counter uint32 // incremented after each block
2122
nonce [3]uint32
22-
counter uint32 // incremented after each block
23-
buf [64]byte // buffer for unused keystream bytes
24-
len int // number of unused keystream bytes at end of buf
23+
buf [bufSize]byte // buffer for unused keystream bytes
24+
len int // number of unused keystream bytes at end of buf
2525
}
2626

2727
// New creates a new ChaCha20 stream cipher with the given key and nonce.
@@ -63,6 +63,10 @@ func (s *Cipher) XORKeyStream(dst, src []byte) {
6363
if len(src) == 0 {
6464
return
6565
}
66+
if haveAsm {
67+
s.xorKeyStreamAsm(dst, src)
68+
return
69+
}
6670

6771
// set up a 64-byte buffer to pad out the final block if needed
6872
// (hoisted out of the main loop to avoid spills)

internal/chacha20/chacha_noasm.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
// Copyright 2018 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
// +build !s390x gccgo appengine
6+
7+
package chacha20
8+
9+
// Build configuration for platforms without an assembly implementation.
const (
10+
// bufSize is the size in bytes of Cipher.buf.
bufSize = 64
11+
// haveAsm is false here, so XORKeyStream never calls xorKeyStreamAsm.
haveAsm = false
12+
)
13+
14+
// xorKeyStreamAsm is a placeholder so that XORKeyStream compiles on
// platforms without an assembly implementation. It always panics;
// XORKeyStream only calls it when haveAsm is true, which is never the
// case in this build configuration.
func (*Cipher) xorKeyStreamAsm(dst, src []byte) {
15+
panic("not implemented")
16+
}

internal/chacha20/chacha_s390x.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
// Copyright 2018 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
// +build s390x,!gccgo,!appengine
6+
7+
package chacha20
8+
9+
// haveAsm reports whether the assembly implementation may be used. It is
// computed once, at package initialization, by probing for the vector
// facility.
var haveAsm = hasVectorFacility()
10+
11+
// bufSize is the size in bytes of Cipher.buf. It matches the [256]byte
// buffer that xorKeyStreamVX expects.
const bufSize = 256
12+
13+
// hasVectorFacility reports whether the machine supports the vector
14+
// facility (vx). It inspects the facility list stored by STFLE and then
// executes a vector instruction to check that the facility is usable.
15+
// Implementation in asm_s390x.s.
16+
func hasVectorFacility() bool
17+
18+
// xorKeyStreamVX is an assembly implementation of XORKeyStream. It must only
19+
// be called when the vector facility is available.
//
// It XORs src with keystream derived from key, nonce and *counter, writes
// the result to dst, and stores the advanced block counter back to *counter.
// If len(src) is not a multiple of 256, the final partial chunk is processed
// inside buf and *len is set to 256 - len(src)%256.
//
// NOTE(review): the bytes of buf past the partial chunk are XORed with
// keystream in place, so presumably buf must be zeroed by the caller for
// them to be reusable as keystream; verify against XORKeyStream.
//
20+
// Implementation in asm_s390x.s.
21+
//go:noescape
22+
func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32, buf *[256]byte, len *int)
23+
24+
// xorKeyStreamAsm XORs src with the cipher's keystream into dst using the
// vector assembly routine. It passes pointers to the cipher's key, nonce,
// counter, keystream buffer and buffered-byte count, so the routine can
// update the counter, buffer and count in place.
func (c *Cipher) xorKeyStreamAsm(dst, src []byte) {
25+
xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter, &c.buf, &c.len)
26+
}
27+
28+
// EXRL targets, DO NOT CALL!
//
// These symbols exist only so that xorKeyStreamVX can reference the MVC
// instruction each one contains through EXRL. They expect specific register
// contents set up by xorKeyStreamVX. Implementation in asm_s390x.s.
29+
func mvcSrcToBuf()
30+
func mvcBufToDst()

0 commit comments

Comments
 (0)