crypto/md5: optimize amd64 assembly

klauspost · 4a6f656c · commit 55d08e5010c3 · 2023-08-04T16:02:36.000Z
* Use two ADDL instead of LEAL * Keep ones in R11 * Use XORL with lower latency instead of NOTL * Remove loads and load the correct value in the previous round * Reduce dependency chain in round 2. * Remove MOVL in round 3. name old time/op new time/op delta Hash8Bytes-32 104ns ± 0% 96ns ± 1% -7.83% (p=0.000 n=9+10) Hash64-32 169ns ± 0% 155ns ± 0% -7.97% (p=0.000 n=10+10) Hash128-32 244ns ± 0% 224ns ± 0% -8.16% (p=0.000 n=9+10) Hash256-32 396ns ± 0% 360ns ± 1% -9.01% (p=0.000 n=10+10) Hash512-32 700ns ± 1% 634ns ± 1% -9.43% (p=0.000 n=10+10) Hash1K-32 1.30µs ± 0% 1.18µs ± 1% -9.32% (p=0.000 n=9+10) Hash8K-32 9.77µs ± 0% 8.81µs ± 0% -9.78% (p=0.000 n=9+10) Hash1M-32 1.24ms ± 1% 1.12ms ± 1% -9.54% (p=0.000 n=10+10) Hash8M-32 10.0ms ± 1% 9.0ms ± 1% -10.04% (p=0.000 n=10+10) Hash8BytesUnaligned-32 104ns ± 0% 96ns ± 0% -7.50% (p=0.000 n=10+10) Hash1KUnaligned-32 1.32µs ± 1% 1.18µs ± 1% -10.42% (p=0.000 n=10+10) Hash8KUnaligned-32 9.80µs ± 0% 8.79µs ± 1% -10.29% (p=0.000 n=10+10) name old speed new speed delta Hash8Bytes-32 77.1MB/s ± 0% 83.6MB/s ± 1% +8.49% (p=0.000 n=9+10) Hash64-32 379MB/s ± 0% 412MB/s ± 0% +8.66% (p=0.000 n=10+10) Hash128-32 525MB/s ± 0% 572MB/s ± 0% +8.89% (p=0.000 n=9+10) Hash256-32 646MB/s ± 0% 710MB/s ± 1% +9.90% (p=0.000 n=10+10) Hash512-32 732MB/s ± 1% 808MB/s ± 1% +10.41% (p=0.000 n=10+10) Hash1K-32 786MB/s ± 0% 866MB/s ± 1% +10.30% (p=0.000 n=9+10) Hash8K-32 839MB/s ± 0% 930MB/s ± 0% +10.79% (p=0.000 n=10+10) Hash1M-32 849MB/s ± 1% 938MB/s ± 1% +10.54% (p=0.000 n=10+10) Hash8M-32 841MB/s ± 1% 935MB/s ± 1% +11.16% (p=0.000 n=10+10) Hash8BytesUnaligned-32 77.1MB/s ± 0% 83.4MB/s ± 0% +8.12% (p=0.000 n=10+10) Hash1KUnaligned-32 778MB/s ± 1% 869MB/s ± 1% +11.64% (p=0.000 n=10+10) Hash8KUnaligned-32 836MB/s ± 0% 932MB/s ± 1% +11.47% (p=0.000 n=10+10) Change-Id: I02b31229b857e9257dc9d36538883eb3af4ad993 This PR will be imported into Gerrit with the title and first comment (this text) used to generate the subject and body of the Gerrit change. Change-Id: I02b31229b857e9257dc9d36538883eb3af4ad993 GitHub-Last-Rev: ec8b15d GitHub-Pull-Request: #43690 Reviewed-on: https://go-review.googlesource.com/c/go/+/283538 Run-TryBot: Joel Sing <joel@sing.id.au> Reviewed-by: Matthew Dempsky <mdempsky@google.com> Reviewed-by: David Chase <drchase@google.com> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Joel Sing <joel@sing.id.au>
diff --git a/src/crypto/md5/md5block_amd64.s b/src/crypto/md5/md5block_amd64.s
@@ -25,6 +25,7 @@ TEXT	·block(SB),NOSPLIT,$8-32
 	MOVL	(1*4)(BP),	BX
 	MOVL	(2*4)(BP),	CX
 	MOVL	(3*4)(BP),	DX
+	MOVL	$0xffffffff,	R11
 
 	CMPQ	SI,		DI
 	JEQ	end
@@ -40,14 +41,15 @@ loop:
 
 #define ROUND1(a, b, c, d, index, const, shift) \
 	XORL	c, R9; \
-	LEAL	const(a)(R8*1), a; \
+	ADDL	$const, a; \
+	ADDL	R8, a; \
 	ANDL	b, R9; \
-	XORL d, R9; \
-	MOVL (index*4)(SI), R8; \
-	ADDL R9, a; \
-	ROLL $shift, a; \
-	MOVL c, R9; \
-	ADDL b, a
+	XORL	d, R9; \
+	MOVL	(index*4)(SI), R8; \
+	ADDL	R9, a; \
+	ROLL	$shift, a; \
+	MOVL	c, R9; \
+	ADDL	b, a
 
 	ROUND1(AX,BX,CX,DX, 1,0xd76aa478, 7);
 	ROUND1(DX,AX,BX,CX, 2,0xe8c7b756,12);
@@ -64,21 +66,23 @@ loop:
 	ROUND1(AX,BX,CX,DX,13,0x6b901122, 7);
 	ROUND1(DX,AX,BX,CX,14,0xfd987193,12);
 	ROUND1(CX,DX,AX,BX,15,0xa679438e,17);
-	ROUND1(BX,CX,DX,AX, 0,0x49b40821,22);
+	ROUND1(BX,CX,DX,AX, 1,0x49b40821,22);
 
-	MOVL	(1*4)(SI),	R8
 	MOVL	DX,		R9
 	MOVL	DX,		R10
 
+// Uses https://github.com/animetosho/md5-optimisation#dependency-shortcut-in-g-function
+
 #define ROUND2(a, b, c, d, index, const, shift) \
-	NOTL	R9; \
-	LEAL	const(a)(R8*1),a; \
+	XORL	R11, R9; \
+	ADDL	$const,	a; \
+	ADDL	R8,	a; \
 	ANDL	b,		R10; \
 	ANDL	c,		R9; \
 	MOVL	(index*4)(SI),R8; \
-	ORL	R9,		R10; \
+	ADDL	R9,	a; \
+	ADDL	R10,	a; \
 	MOVL	c,		R9; \
-	ADDL	R10,		a; \
 	MOVL	c,		R10; \
 	ROLL	$shift,	a; \
 	ADDL	b,		a
@@ -98,22 +102,34 @@ loop:
 	ROUND2(AX,BX,CX,DX, 2,0xa9e3e905, 5);
 	ROUND2(DX,AX,BX,CX, 7,0xfcefa3f8, 9);
 	ROUND2(CX,DX,AX,BX,12,0x676f02d9,14);
-	ROUND2(BX,CX,DX,AX, 0,0x8d2a4c8a,20);
+	ROUND2(BX,CX,DX,AX, 5,0x8d2a4c8a,20);
 
-	MOVL	(5*4)(SI),	R8
 	MOVL	CX,		R9
 
-#define ROUND3(a, b, c, d, index, const, shift) \
-	LEAL	const(a)(R8*1),a; \
+// Uses https://github.com/animetosho/md5-optimisation#h-function-re-use
+
+#define ROUND3FIRST(a, b, c, d, index, const, shift) \
+	MOVL	d,		R9; \
+	XORL	c,		R9; \
+	XORL	b,		R9; \
+	ADDL	$const,	a; \
+	ADDL	R8,		a; \
 	MOVL	(index*4)(SI),R8; \
-	XORL	d,		R9; \
+	ADDL	R9,		a; \
+	ROLL	$shift,		a; \
+	ADDL	b,		a
+
+#define ROUND3(a, b, c, d, index, const, shift) \
+	XORL	a,		R9; \
 	XORL	b,		R9; \
+	ADDL	$const,	a; \
+	ADDL	R8,		a; \
+	MOVL	(index*4)(SI),R8; \
 	ADDL	R9,		a; \
 	ROLL	$shift,		a; \
-	MOVL	b,		R9; \
 	ADDL	b,		a
 
-	ROUND3(AX,BX,CX,DX, 8,0xfffa3942, 4);
+	ROUND3FIRST(AX,BX,CX,DX, 8,0xfffa3942, 4);
 	ROUND3(DX,AX,BX,CX,11,0x8771f681,11);
 	ROUND3(CX,DX,AX,BX,14,0x6d9d6122,16);
 	ROUND3(BX,CX,DX,AX, 1,0xfde5380c,23);
@@ -130,13 +146,13 @@ loop:
 	ROUND3(CX,DX,AX,BX, 2,0x1fa27cf8,16);
 	ROUND3(BX,CX,DX,AX, 0,0xc4ac5665,23);
 
-	MOVL	(0*4)(SI),	R8
-	MOVL	$0xffffffff,	R9
+	MOVL	R11,	R9
 	XORL	DX,		R9
 
 #define ROUND4(a, b, c, d, index, const, shift) \
-	LEAL	const(a)(R8*1),a; \
-	ORL	b,		R9; \
+	ADDL	$const,	a; \
+	ADDL	R8,		a; \
+	ORL		b,		R9; \
 	XORL	c,		R9; \
 	ADDL	R9,		a; \
 	MOVL	(index*4)(SI),R8; \