Skip to content

Commit d8ec5eb

Browse files
committed
crypto/md5: Optimize amd64 assembly
* Use two ADDL instead of LEAL
* Keep ones in R11
* Use XORL with lower latency instead of NOTL
* Remove loads and load the correct value in the previous round

```
name                    old time/op   new time/op   delta
Hash8Bytes-32           106ns ± 0%    103ns ± 0%    -2.37%  (p=0.000 n=10+10)
Hash1K-32               1.33µs ± 0%   1.26µs ± 0%   -4.78%  (p=0.000 n=8+10)
Hash8K-32               9.95µs ± 0%   9.46µs ± 0%   -4.90%  (p=0.000 n=10+10)
Hash8BytesUnaligned-32  106ns ± 0%    103ns ± 0%    -2.37%  (p=0.000 n=10+10)
Hash1KUnaligned-32      1.33µs ± 0%   1.26µs ± 0%   -4.76%  (p=0.000 n=10+9)
Hash8KUnaligned-32      10.0µs ± 0%   9.5µs ± 0%    -4.88%  (p=0.000 n=10+10)

name                    old speed      new speed      delta
Hash8Bytes-32           75.8MB/s ± 0%  77.8MB/s ± 0%  +2.70%  (p=0.000 n=10+10)
Hash1K-32               772MB/s ± 0%   810MB/s ± 0%   +4.99%  (p=0.000 n=9+10)
Hash8K-32               823MB/s ± 0%   866MB/s ± 0%   +5.15%  (p=0.000 n=10+10)
Hash8BytesUnaligned-32  75.8MB/s ± 0%  77.8MB/s ± 0%  +2.64%  (p=0.000 n=10+10)
Hash1KUnaligned-32      771MB/s ± 0%   810MB/s ± 0%   +4.96%  (p=0.000 n=10+10)
Hash8KUnaligned-32      823MB/s ± 0%   866MB/s ± 0%   +5.13%  (p=0.000 n=10+10)
```

Change-Id: I02b31229b857e9257dc9d36538883eb3af4ad993
1 parent 7eb31d9 commit d8ec5eb

File tree

1 file changed

+13
-11
lines changed

1 file changed

+13
-11
lines changed

src/crypto/md5/md5block_amd64.s

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ TEXT ·block(SB),NOSPLIT,$8-32
2525
MOVL (1*4)(BP), BX
2626
MOVL (2*4)(BP), CX
2727
MOVL (3*4)(BP), DX
28+
MOVL $0xffffffff, R11
2829

2930
CMPQ SI, DI
3031
JEQ end
@@ -40,7 +41,8 @@ loop:
4041

4142
#define ROUND1(a, b, c, d, index, const, shift) \
4243
XORL c, R9; \
43-
LEAL const(a)(R8*1), a; \
44+
ADDL $const, a; \
45+
ADDL R8, a; \
4446
ANDL b, R9; \
4547
XORL d, R9; \
4648
MOVL (index*4)(SI), R8; \
@@ -64,15 +66,15 @@ loop:
6466
ROUND1(AX,BX,CX,DX,13,0x6b901122, 7);
6567
ROUND1(DX,AX,BX,CX,14,0xfd987193,12);
6668
ROUND1(CX,DX,AX,BX,15,0xa679438e,17);
67-
ROUND1(BX,CX,DX,AX, 0,0x49b40821,22);
69+
ROUND1(BX,CX,DX,AX, 1,0x49b40821,22);
6870

69-
MOVL (1*4)(SI), R8
7071
MOVL DX, R9
7172
MOVL DX, R10
7273

7374
#define ROUND2(a, b, c, d, index, const, shift) \
74-
NOTL R9; \
75-
LEAL const(a)(R8*1),a; \
75+
XORL R11, R9; \
76+
ADDL $const, a; \
77+
ADDL R8, a; \
7678
ANDL b, R10; \
7779
ANDL c, R9; \
7880
MOVL (index*4)(SI),R8; \
@@ -98,13 +100,13 @@ loop:
98100
ROUND2(AX,BX,CX,DX, 2,0xa9e3e905, 5);
99101
ROUND2(DX,AX,BX,CX, 7,0xfcefa3f8, 9);
100102
ROUND2(CX,DX,AX,BX,12,0x676f02d9,14);
101-
ROUND2(BX,CX,DX,AX, 0,0x8d2a4c8a,20);
103+
ROUND2(BX,CX,DX,AX, 5,0x8d2a4c8a,20);
102104

103-
MOVL (5*4)(SI), R8
104105
MOVL CX, R9
105106

106107
#define ROUND3(a, b, c, d, index, const, shift) \
107-
LEAL const(a)(R8*1),a; \
108+
ADDL $const, a; \
109+
ADDL R8, a; \
108110
MOVL (index*4)(SI),R8; \
109111
XORL d, R9; \
110112
XORL b, R9; \
@@ -130,12 +132,12 @@ loop:
130132
ROUND3(CX,DX,AX,BX, 2,0x1fa27cf8,16);
131133
ROUND3(BX,CX,DX,AX, 0,0xc4ac5665,23);
132134

133-
MOVL (0*4)(SI), R8
134-
MOVL $0xffffffff, R9
135+
MOVL R11, R9
135136
XORL DX, R9
136137

137138
#define ROUND4(a, b, c, d, index, const, shift) \
138-
LEAL const(a)(R8*1),a; \
139+
ADDL $const, a; \
140+
ADDL R8, a; \
139141
ORL b, R9; \
140142
XORL c, R9; \
141143
ADDL R9, a; \

0 commit comments

Comments
 (0)