Skip to content

Commit 4381c61

Browse files
committed
crypto/sha512: optimize ARM64 sha512 implementation
This CL enables an assembly implementation of sha512 for arm64, yielding a ~390% performance improvement. Contributed under the Go License with permission of Linaro by Carlos Eduardo Seo <[email protected]> https://perf.golang.org/search?q=upload:20220526.18 Hash8Bytes/New 16.0MB/s ± 0% 61.3MB/s ± 0% +283.97% (p=0.000 n=9+9) Hash8Bytes/Sum384 16.4MB/s ± 0% 64.8MB/s ± 0% +295.31% (p=0.000 n=8+9) Hash8Bytes/Sum512 16.3MB/s ± 0% 64.2MB/s ± 0% +293.37% (p=0.000 n=10+10) Hash1K/New 252MB/s ± 0% 1217MB/s ± 0% +383.00% (p=0.000 n=9+10) Hash1K/Sum384 253MB/s ± 0% 1237MB/s ± 0% +389.25% (p=0.000 n=10+10) Hash1K/Sum512 253MB/s ± 0% 1231MB/s ± 0% +387.37% (p=0.000 n=10+8) Hash8K/New 284MB/s ± 0% 1405MB/s ± 2% +395.19% (p=0.000 n=9+8) Hash8K/Sum384 284MB/s ± 0% 1413MB/s ± 0% +397.76% (p=0.000 n=10+8) Hash8K/Sum512 284MB/s ± 0% 1411MB/s ± 0% +397.19% (p=0.000 n=10+10) Change-Id: I4476da23d8cd376bf1f75d946d6b0c58470df1b8 Reviewed-on: https://go-review.googlesource.com/c/go/+/180257 Reviewed-by: Carlos Eduardo Seo <[email protected]> Reviewed-by: Ard Biesheuvel <[email protected]> Reviewed-by: Heschi Kreinick <[email protected]> Reviewed-by: Filippo Valsorda <[email protected]> Run-TryBot: Meng Zhuo <[email protected]> TryBot-Result: Gopher Robot <[email protected]>
1 parent e8f0340 commit 4381c61

File tree

3 files changed

+154
-1
lines changed

3 files changed

+154
-1
lines changed
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
// Copyright 2022 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package sha512
6+
7+
import "internal/cpu"
8+
9+
// block hashes the bytes of p into dig. It hands the work to the assembly
// routine blockAsm when the CPU reports the SHA512 feature
// (cpu.ARM64.HasSHA512); on any other CPU it uses blockGeneric.
func block(dig *digest, p []byte) {
10+
if !cpu.ARM64.HasSHA512 {
11+
blockGeneric(dig, p)
12+
return
13+
}
14+
blockAsm(dig, p)
15+
}
16+
17+
// blockAsm is the arm64 assembly counterpart of block; it has no Go body and
// is implemented by the TEXT ·blockAsm symbol in sha512block_arm64.s. It is
// only called from block, and only when cpu.ARM64.HasSHA512 is true.
//
// The go:noescape directive below promises the compiler that neither dig nor
// p escapes through this call.
//go:noescape
18+
func blockAsm(dig *digest, p []byte)

src/crypto/sha512/sha512block_arm64.s

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
// Copyright 2022 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
// Based on the Linux Kernel with the following comment:
6+
// Algorithm based on https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fb87127bcefc17efab757606e1b1e333fd614dd0
7+
// Originally written by Ard Biesheuvel <[email protected]>
8+
9+
#include "textflag.h"
10+
11+
// SHA512TRANS is the common prologue shared by every round macro below. It
//   - adds the round-constant pair rc0 to the message pair in0 into scratch V5,
//   - swaps the two 64-bit halves of V5 (VEXT $8 of a register with itself),
//   - builds V6 from i3/i2 and V7 from i2/i1 with 8-byte VEXT extracts, and
//   - accumulates V5 into i3.
// V5, V6 and V7 are clobbered. i0 and i4 are accepted for a uniform argument
// list but are not referenced here.
// The last line ends in a continuation backslash, so the line directly after
// the macro must stay empty.
#define SHA512TRANS(i0, i1, i2, i3, i4, rc0, in0) \
12+
VADD in0.D2, rc0.D2, V5.D2 \
13+
VEXT $8, i3.B16, i2.B16, V6.B16 \
14+
VEXT $8, V5.B16, V5.B16, V5.B16 \
15+
VEXT $8, i2.B16, i1.B16, V7.B16 \
16+
VADD V5.D2, i3.D2, i3.D2 \
17+
18+
// SHA512ROUND is the full round step, used while the message schedule still
// has to be extended. In order it:
//   - loads the next 16 bytes of round constants from the table cursor R4
//     into rc1 and advances R4 by 16 (rc1 is not consumed by this step; the
//     call sites pass it as rc0 of a later step),
//   - runs SHA512TRANS on the state registers with rc0 and in0,
//   - extends the schedule in place in in0: SHA512SU0 with in1, then
//     SHA512SU1 with V5 (an 8-byte VEXT of in4/in3) and in2,
//   - updates the state with SHA512H (V7, V6 -> i3) and SHA512H2
//     (i0, i1 -> i3), writing i3+i1 into i4 between the two.
// Clobbers V5-V7. i0-i4 are the five state registers, whose roles rotate
// from one call site to the next.
#define SHA512ROUND(i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \
19+
VLD1.P 16(R4), [rc1.D2] \
20+
SHA512TRANS(i0, i1, i2, i3, i4, rc0, in0) \
21+
VEXT $8, in4.B16, in3.B16, V5.B16 \
22+
SHA512SU0 in1.D2, in0.D2 \
23+
SHA512H V7.D2, V6, i3 \
24+
SHA512SU1 V5.D2, in2.D2, in0.D2 \
25+
VADD i3.D2, i1.D2, i4.D2 \
26+
SHA512H2 i0.D2, i1, i3
27+
28+
// SHA512ROUND_NO_UPDATE is SHA512ROUND without the message-schedule
// extension: it has no SHA512SU0/SHA512SU1 and leaves in0 unmodified. It
// still loads the next 16 bytes of round constants from R4 into rc1
// (advancing R4 by 16), runs SHA512TRANS, and applies SHA512H / SHA512H2 to
// the state, with i4 = i3+i1 computed between them. Clobbers V5-V7.
#define SHA512ROUND_NO_UPDATE(i0, i1, i2, i3, i4, rc0, rc1, in0) \
29+
VLD1.P 16(R4), [rc1.D2] \
30+
SHA512TRANS(i0, i1, i2, i3, i4, rc0, in0) \
31+
SHA512H V7.D2, V6, i3 \
32+
VADD i3.D2, i1.D2, i4.D2 \
33+
SHA512H2 i0.D2, i1, i3
34+
35+
// SHA512ROUND_LAST is the closing round step: it neither extends the message
// schedule nor loads further round constants (R4 is untouched), and works
// only from the already-loaded rc0 and in0. It runs SHA512TRANS and then
// SHA512H / SHA512H2 on the state, with i4 = i3+i1 computed between them.
// Clobbers V5-V7.
#define SHA512ROUND_LAST(i0, i1, i2, i3, i4, rc0, in0) \
36+
SHA512TRANS(i0, i1, i2, i3, i4, rc0, in0) \
37+
SHA512H V7.D2, V6, i3 \
38+
VADD i3.D2, i1.D2, i4.D2 \
39+
SHA512H2 i0.D2, i1, i3
40+
41+
// blockAsm folds the message bytes in p into the hash state at dig, consuming
// 128 bytes of p per loop iteration.
//
// Register use, as set up below:
//   R0       dig pointer; 64 bytes of state are loaded from it on entry and
//            stored back to it on exit
//   R1       read cursor into p (post-incremented by the message loads)
//   R2       number of bytes of p still to process
//   R3       value loaded from ·_K, used as the base address of the
//            round-constant table
//   R4       per-block cursor into that table (reset from R3 every iteration)
//   V0-V4    working state, rotated through the round macros
//   V5-V7    scratch, clobbered by the round macros
//   V8-V11   running digest
//   V12-V19  message schedule
//   V20-V31  round constants
//
// Each iteration reads 640 bytes of round constants: 64 up front plus 16 in
// each of the 36 SHA512ROUND / SHA512ROUND_NO_UPDATE invocations.
//
// NOTE(review): the remaining length is tested only after a block has been
// processed, and only against zero, so len(p) has to be a nonzero multiple of
// 128 or the loop reads past the end of p. Confirm every caller guarantees it.
// NOTE(review): the VLD1/VST1 on (R0) assume the eight 64-bit hash words are
// the first 64 bytes of the digest struct, which is declared elsewhere.
// func blockAsm(dig *digest, p []byte)
42+
TEXT ·blockAsm(SB),NOSPLIT,$0
43+
MOVD dig+0(FP), R0
44+
MOVD p_base+8(FP), R1
45+
MOVD p_len+16(FP), R2
// First word stored at ·_K; presumably the data pointer of the round-constant
// slice (its declaration is not in this file).
46+
MOVD ·_K+0(SB), R3
47+
48+
// prefetch the start of the round-constant table well before its first use
49+
PRFM (R3), PLDL3KEEP
50+
// load the current digest (64 bytes at dig) into V8-V11
51+
VLD1 (R0), [V8.D2, V9.D2, V10.D2, V11.D2]
52+
loop:
53+
// copy the digest into the working state V0-V3, keeping the original in V8-V11
54+
VMOV V8.B16, V0.B16
55+
VMOV V9.B16, V1.B16
56+
VMOV V10.B16, V2.B16
57+
VMOV V11.B16, V3.B16
58+
59+
// load 128 bytes of message data into V12-V19, advancing R1 past them
60+
VLD1.P 64(R1), [V12.D2, V13.D2, V14.D2, V15.D2]
61+
VLD1.P 64(R1), [V16.D2, V17.D2, V18.D2, V19.D2]
62+
63+
// reverse the bytes of every 64-bit message word (big-endian words in memory
// become native register values)
64+
VREV64 V12.B16, V12.B16
65+
VREV64 V13.B16, V13.B16
66+
VREV64 V14.B16, V14.B16
67+
VREV64 V15.B16, V15.B16
68+
VREV64 V16.B16, V16.B16
69+
VREV64 V17.B16, V17.B16
70+
VREV64 V18.B16, V18.B16
71+
VREV64 V19.B16, V19.B16
72+
73+
// rewind the constant cursor to the start of the table for this block
MOVD R3, R4
74+
// load the first 64 bytes of round constants into V20-V23
75+
VLD1.P 64(R4), [V20.D2, V21.D2, V22.D2, V23.D2]
76+
77+
// 32 full round steps. The state arguments cycle with period 5 and the
// message arguments with period 8; each step consumes the constant register
// that was filled four steps earlier (V20-V23 come from the load above).
SHA512ROUND(V0, V1, V2, V3, V4, V20, V24, V12, V13, V19, V16, V17)
78+
SHA512ROUND(V3, V0, V4, V2, V1, V21, V25, V13, V14, V12, V17, V18)
79+
SHA512ROUND(V2, V3, V1, V4, V0, V22, V26, V14, V15, V13, V18, V19)
80+
SHA512ROUND(V4, V2, V0, V1, V3, V23, V27, V15, V16, V14, V19, V12)
81+
SHA512ROUND(V1, V4, V3, V0, V2, V24, V28, V16, V17, V15, V12, V13)
82+
83+
SHA512ROUND(V0, V1, V2, V3, V4, V25, V29, V17, V18, V16, V13, V14)
84+
SHA512ROUND(V3, V0, V4, V2, V1, V26, V30, V18, V19, V17, V14, V15)
85+
SHA512ROUND(V2, V3, V1, V4, V0, V27, V31, V19, V12, V18, V15, V16)
86+
SHA512ROUND(V4, V2, V0, V1, V3, V28, V24, V12, V13, V19, V16, V17)
87+
SHA512ROUND(V1, V4, V3, V0, V2, V29, V25, V13, V14, V12, V17, V18)
88+
89+
SHA512ROUND(V0, V1, V2, V3, V4, V30, V26, V14, V15, V13, V18, V19)
90+
SHA512ROUND(V3, V0, V4, V2, V1, V31, V27, V15, V16, V14, V19, V12)
91+
SHA512ROUND(V2, V3, V1, V4, V0, V24, V28, V16, V17, V15, V12, V13)
92+
SHA512ROUND(V4, V2, V0, V1, V3, V25, V29, V17, V18, V16, V13, V14)
93+
SHA512ROUND(V1, V4, V3, V0, V2, V26, V30, V18, V19, V17, V14, V15)
94+
95+
SHA512ROUND(V0, V1, V2, V3, V4, V27, V31, V19, V12, V18, V15, V16)
96+
SHA512ROUND(V3, V0, V4, V2, V1, V28, V24, V12, V13, V19, V16, V17)
97+
SHA512ROUND(V2, V3, V1, V4, V0, V29, V25, V13, V14, V12, V17, V18)
98+
SHA512ROUND(V4, V2, V0, V1, V3, V30, V26, V14, V15, V13, V18, V19)
99+
SHA512ROUND(V1, V4, V3, V0, V2, V31, V27, V15, V16, V14, V19, V12)
100+
101+
SHA512ROUND(V0, V1, V2, V3, V4, V24, V28, V16, V17, V15, V12, V13)
102+
SHA512ROUND(V3, V0, V4, V2, V1, V25, V29, V17, V18, V16, V13, V14)
103+
SHA512ROUND(V2, V3, V1, V4, V0, V26, V30, V18, V19, V17, V14, V15)
104+
SHA512ROUND(V4, V2, V0, V1, V3, V27, V31, V19, V12, V18, V15, V16)
105+
SHA512ROUND(V1, V4, V3, V0, V2, V28, V24, V12, V13, V19, V16, V17)
106+
107+
SHA512ROUND(V0, V1, V2, V3, V4, V29, V25, V13, V14, V12, V17, V18)
108+
SHA512ROUND(V3, V0, V4, V2, V1, V30, V26, V14, V15, V13, V18, V19)
109+
SHA512ROUND(V2, V3, V1, V4, V0, V31, V27, V15, V16, V14, V19, V12)
110+
SHA512ROUND(V4, V2, V0, V1, V3, V24, V28, V16, V17, V15, V12, V13)
111+
SHA512ROUND(V1, V4, V3, V0, V2, V25, V29, V17, V18, V16, V13, V14)
112+
113+
SHA512ROUND(V0, V1, V2, V3, V4, V26, V30, V18, V19, V17, V14, V15)
114+
SHA512ROUND(V3, V0, V4, V2, V1, V27, V31, V19, V12, V18, V15, V16)
115+
116+
// 4 steps that no longer extend the message schedule but still stream in the
// last constants (into V24-V27)
SHA512ROUND_NO_UPDATE(V2, V3, V1, V4, V0, V28, V24, V12)
117+
SHA512ROUND_NO_UPDATE(V4, V2, V0, V1, V3, V29, V25, V13)
118+
SHA512ROUND_NO_UPDATE(V1, V4, V3, V0, V2, V30, V26, V14)
119+
SHA512ROUND_NO_UPDATE(V0, V1, V2, V3, V4, V31, V27, V15)
120+
121+
// final 4 steps: use the constants already held in V24-V27; nothing more is
// read from the table
SHA512ROUND_LAST(V3, V0, V4, V2, V1, V24, V16)
122+
SHA512ROUND_LAST(V2, V3, V1, V4, V0, V25, V17)
123+
SHA512ROUND_LAST(V4, V2, V0, V1, V3, V26, V18)
124+
SHA512ROUND_LAST(V1, V4, V3, V0, V2, V27, V19)
125+
126+
// add the working state V0-V3 into the running digest V8-V11
127+
VADD V0.D2, V8.D2, V8.D2
128+
VADD V1.D2, V9.D2, V9.D2
129+
VADD V2.D2, V10.D2, V10.D2
130+
VADD V3.D2, V11.D2, V11.D2
131+
// one 128-byte block consumed; keep looping until no bytes remain
SUB $128, R2
132+
CBNZ R2, loop
133+
134+
// write the updated digest back to dig
VST1 [V8.D2, V9.D2, V10.D2, V11.D2], (R0)
135+
RET

src/crypto/sha512/sha512block_generic.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
//go:build !amd64 && !s390x && !ppc64le && !ppc64
5+
//go:build !amd64 && !arm64 && !s390x && !ppc64le && !ppc64
66

77
package sha512
88

0 commit comments

Comments
 (0)