Skip to content

Commit 55d08e5

Browse files
klauspost authored and 4a6f656c committed
crypto/md5: optimize amd64 assembly
* Use two ADDL instead of LEAL
* Keep ones in R11
* Use XORL with lower latency instead of NOTL
* Remove loads and load the correct value in the previous round
* Reduce dependency chain in round 2.
* Remove MOVL in round 3.

name                    old time/op    new time/op    delta
Hash8Bytes-32            104ns ± 0%      96ns ± 1%    -7.83%  (p=0.000 n=9+10)
Hash64-32                169ns ± 0%     155ns ± 0%    -7.97%  (p=0.000 n=10+10)
Hash128-32               244ns ± 0%     224ns ± 0%    -8.16%  (p=0.000 n=9+10)
Hash256-32               396ns ± 0%     360ns ± 1%    -9.01%  (p=0.000 n=10+10)
Hash512-32               700ns ± 1%     634ns ± 1%    -9.43%  (p=0.000 n=10+10)
Hash1K-32               1.30µs ± 0%    1.18µs ± 1%    -9.32%  (p=0.000 n=9+10)
Hash8K-32               9.77µs ± 0%    8.81µs ± 0%    -9.78%  (p=0.000 n=9+10)
Hash1M-32               1.24ms ± 1%    1.12ms ± 1%    -9.54%  (p=0.000 n=10+10)
Hash8M-32               10.0ms ± 1%     9.0ms ± 1%   -10.04%  (p=0.000 n=10+10)
Hash8BytesUnaligned-32   104ns ± 0%      96ns ± 0%    -7.50%  (p=0.000 n=10+10)
Hash1KUnaligned-32      1.32µs ± 1%    1.18µs ± 1%   -10.42%  (p=0.000 n=10+10)
Hash8KUnaligned-32      9.80µs ± 0%    8.79µs ± 1%   -10.29%  (p=0.000 n=10+10)

name                    old speed      new speed      delta
Hash8Bytes-32           77.1MB/s ± 0%  83.6MB/s ± 1%   +8.49%  (p=0.000 n=9+10)
Hash64-32                379MB/s ± 0%   412MB/s ± 0%   +8.66%  (p=0.000 n=10+10)
Hash128-32               525MB/s ± 0%   572MB/s ± 0%   +8.89%  (p=0.000 n=9+10)
Hash256-32               646MB/s ± 0%   710MB/s ± 1%   +9.90%  (p=0.000 n=10+10)
Hash512-32               732MB/s ± 1%   808MB/s ± 1%  +10.41%  (p=0.000 n=10+10)
Hash1K-32                786MB/s ± 0%   866MB/s ± 1%  +10.30%  (p=0.000 n=9+10)
Hash8K-32                839MB/s ± 0%   930MB/s ± 0%  +10.79%  (p=0.000 n=10+10)
Hash1M-32                849MB/s ± 1%   938MB/s ± 1%  +10.54%  (p=0.000 n=10+10)
Hash8M-32                841MB/s ± 1%   935MB/s ± 1%  +11.16%  (p=0.000 n=10+10)
Hash8BytesUnaligned-32  77.1MB/s ± 0%  83.4MB/s ± 0%   +8.12%  (p=0.000 n=10+10)
Hash1KUnaligned-32       778MB/s ± 1%   869MB/s ± 1%  +11.64%  (p=0.000 n=10+10)
Hash8KUnaligned-32       836MB/s ± 0%   932MB/s ± 1%  +11.47%  (p=0.000 n=10+10)

Change-Id: I02b31229b857e9257dc9d36538883eb3af4ad993

This PR will be imported into Gerrit with the title and first comment (this text) used to generate the subject and body of the Gerrit change.
Change-Id: I02b31229b857e9257dc9d36538883eb3af4ad993
GitHub-Last-Rev: ec8b15d
GitHub-Pull-Request: #43690
Reviewed-on: https://go-review.googlesource.com/c/go/+/283538
Run-TryBot: Joel Sing <[email protected]>
Reviewed-by: Matthew Dempsky <[email protected]>
Reviewed-by: David Chase <[email protected]>
TryBot-Result: Gopher Robot <[email protected]>
Reviewed-by: Joel Sing <[email protected]>
1 parent f7b4f02 commit 55d08e5

File tree

1 file changed

+40
-24
lines changed

1 file changed

+40
-24
lines changed

src/crypto/md5/md5block_amd64.s

Lines changed: 40 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ TEXT ·block(SB),NOSPLIT,$8-32
2525
MOVL (1*4)(BP), BX
2626
MOVL (2*4)(BP), CX
2727
MOVL (3*4)(BP), DX
28+
MOVL $0xffffffff, R11
2829

2930
CMPQ SI, DI
3031
JEQ end
@@ -40,14 +41,15 @@ loop:
4041

4142
#define ROUND1(a, b, c, d, index, const, shift) \
4243
XORL c, R9; \
43-
LEAL const(a)(R8*1), a; \
44+
ADDL $const, a; \
45+
ADDL R8, a; \
4446
ANDL b, R9; \
45-
XORL d, R9; \
46-
MOVL (index*4)(SI), R8; \
47-
ADDL R9, a; \
48-
ROLL $shift, a; \
49-
MOVL c, R9; \
50-
ADDL b, a
47+
XORL d, R9; \
48+
MOVL (index*4)(SI), R8; \
49+
ADDL R9, a; \
50+
ROLL $shift, a; \
51+
MOVL c, R9; \
52+
ADDL b, a
5153

5254
ROUND1(AX,BX,CX,DX, 1,0xd76aa478, 7);
5355
ROUND1(DX,AX,BX,CX, 2,0xe8c7b756,12);
@@ -64,21 +66,23 @@ loop:
6466
ROUND1(AX,BX,CX,DX,13,0x6b901122, 7);
6567
ROUND1(DX,AX,BX,CX,14,0xfd987193,12);
6668
ROUND1(CX,DX,AX,BX,15,0xa679438e,17);
67-
ROUND1(BX,CX,DX,AX, 0,0x49b40821,22);
69+
ROUND1(BX,CX,DX,AX, 1,0x49b40821,22);
6870

69-
MOVL (1*4)(SI), R8
7071
MOVL DX, R9
7172
MOVL DX, R10
7273

74+
// Uses https://github.com/animetosho/md5-optimisation#dependency-shortcut-in-g-function
75+
7376
#define ROUND2(a, b, c, d, index, const, shift) \
74-
NOTL R9; \
75-
LEAL const(a)(R8*1),a; \
77+
XORL R11, R9; \
78+
ADDL $const, a; \
79+
ADDL R8, a; \
7680
ANDL b, R10; \
7781
ANDL c, R9; \
7882
MOVL (index*4)(SI),R8; \
79-
ORL R9, R10; \
83+
ADDL R9, a; \
84+
ADDL R10, a; \
8085
MOVL c, R9; \
81-
ADDL R10, a; \
8286
MOVL c, R10; \
8387
ROLL $shift, a; \
8488
ADDL b, a
@@ -98,22 +102,34 @@ loop:
98102
ROUND2(AX,BX,CX,DX, 2,0xa9e3e905, 5);
99103
ROUND2(DX,AX,BX,CX, 7,0xfcefa3f8, 9);
100104
ROUND2(CX,DX,AX,BX,12,0x676f02d9,14);
101-
ROUND2(BX,CX,DX,AX, 0,0x8d2a4c8a,20);
105+
ROUND2(BX,CX,DX,AX, 5,0x8d2a4c8a,20);
102106

103-
MOVL (5*4)(SI), R8
104107
MOVL CX, R9
105108

106-
#define ROUND3(a, b, c, d, index, const, shift) \
107-
LEAL const(a)(R8*1),a; \
109+
// Uses https://github.com/animetosho/md5-optimisation#h-function-re-use
110+
111+
#define ROUND3FIRST(a, b, c, d, index, const, shift) \
112+
MOVL d, R9; \
113+
XORL c, R9; \
114+
XORL b, R9; \
115+
ADDL $const, a; \
116+
ADDL R8, a; \
108117
MOVL (index*4)(SI),R8; \
109-
XORL d, R9; \
118+
ADDL R9, a; \
119+
ROLL $shift, a; \
120+
ADDL b, a
121+
122+
#define ROUND3(a, b, c, d, index, const, shift) \
123+
XORL a, R9; \
110124
XORL b, R9; \
125+
ADDL $const, a; \
126+
ADDL R8, a; \
127+
MOVL (index*4)(SI),R8; \
111128
ADDL R9, a; \
112129
ROLL $shift, a; \
113-
MOVL b, R9; \
114130
ADDL b, a
115131

116-
ROUND3(AX,BX,CX,DX, 8,0xfffa3942, 4);
132+
ROUND3FIRST(AX,BX,CX,DX, 8,0xfffa3942, 4);
117133
ROUND3(DX,AX,BX,CX,11,0x8771f681,11);
118134
ROUND3(CX,DX,AX,BX,14,0x6d9d6122,16);
119135
ROUND3(BX,CX,DX,AX, 1,0xfde5380c,23);
@@ -130,13 +146,13 @@ loop:
130146
ROUND3(CX,DX,AX,BX, 2,0x1fa27cf8,16);
131147
ROUND3(BX,CX,DX,AX, 0,0xc4ac5665,23);
132148

133-
MOVL (0*4)(SI), R8
134-
MOVL $0xffffffff, R9
149+
MOVL R11, R9
135150
XORL DX, R9
136151

137152
#define ROUND4(a, b, c, d, index, const, shift) \
138-
LEAL const(a)(R8*1),a; \
139-
ORL b, R9; \
153+
ADDL $const, a; \
154+
ADDL R8, a; \
155+
ORL b, R9; \
140156
XORL c, R9; \
141157
ADDL R9, a; \
142158
MOVL (index*4)(SI),R8; \

0 commit comments

Comments
 (0)