crypto/subtle: improve xorBytes assembler on PPC64

laboger · pull[bot] · commit 20150702e51c · 2024-02-01T19:34:51.000Z
This makes some improvements to the xorBytes assembler implementation for PPC64 targets. The loops to process large streams of bytes has been changed to do 64 bytes at a time. Other changes were made to prevent degradations in some of the common sizes like 8, 16. The case for < 8 bytes on power10 has been modified to use the LXVL and STXVL instructions. Change-Id: I7477d12d5375d484af8c274443d595ccdafbda7c Reviewed-on: https://go-review.googlesource.com/c/go/+/530877 Reviewed-by: Paul Murphy <murp@ibm.com> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Jayanth Krishnamurthy <jayanth.krishnamurthy@ibm.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Michael Pratt <mpratt@google.com> Reviewed-by: Benny Siegert <bsiegert@gmail.com> Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
diff --git a/src/crypto/subtle/xor_ppc64x.s b/src/crypto/subtle/xor_ppc64x.s
@@ -13,75 +13,130 @@ TEXT ·xorBytes(SB), NOSPLIT, $0
 	MOVD	b+16(FP), R5	// R5 = b
 	MOVD	n+24(FP), R6	// R6 = n
 
-	CMPU	R6, $32, CR7	// Check if n ≥ 32 bytes
+	CMPU	R6, $64, CR7	// Check if n ≥ 64 bytes
 	MOVD	R0, R8		// R8 = index
-	CMPU	R6, $8, CR6	// Check if 8 ≤ n < 32 bytes
-	BLT	CR6, small	// Smaller than 8
-	BLT	CR7, xor16	// Case for 16 ≤ n < 32 bytes
+	CMPU	R6, $8, CR6	// Check if 8 ≤ n < 64 bytes
+	BLE	CR6, small	// <= 8
+	BLT	CR7, xor32	// Case for 32 ≤ n < 64 bytes
 
-	// Case for n ≥ 32 bytes
-preloop32:
-	SRD	$5, R6, R7	// Setup loop counter
+	// Case for n ≥ 64 bytes
+preloop64:
+	SRD	$6, R6, R7	// Set up loop counter
 	MOVD	R7, CTR
 	MOVD	$16, R10
-	ANDCC	$31, R6, R9	// Check for tailing bytes for later
-loop32:
-	LXVD2X		(R4)(R8), VS32		// VS32 = a[i,...,i+15]
-	LXVD2X		(R4)(R10), VS34
-	LXVD2X		(R5)(R8), VS33		// VS33 = b[i,...,i+15]
-	LXVD2X		(R5)(R10), VS35
-	XXLXOR		VS32, VS33, VS32	// VS34 = a[] ^ b[]
-	XXLXOR		VS34, VS35, VS34
-	STXVD2X		VS32, (R3)(R8)		// Store to dst
-	STXVD2X		VS34, (R3)(R10)
-	ADD		$32, R8			// Update index
-	ADD		$32, R10
-	BC		16, 0, loop32		// bdnz loop16
-
-	BEQ		CR0, done
-
-	MOVD		R9, R6
-	CMP		R6, $8
-	BLT		small
+	MOVD	$32, R14
+	MOVD	$48, R15
+	ANDCC	$63, R6, R9	// Check for tailing bytes for later
+	PCALIGN $16
+	// Case for >= 64 bytes
+	// Process 64 bytes per iteration
+	// Load 4 vectors of a and b
+	// XOR the corresponding vectors
+	// from a and b and store the result
+loop64:
+	LXVD2X	(R4)(R8), VS32
+	LXVD2X	(R4)(R10), VS34
+	LXVD2X	(R4)(R14), VS36
+	LXVD2X	(R4)(R15), VS38
+	LXVD2X	(R5)(R8), VS33
+	LXVD2X	(R5)(R10), VS35
+	LXVD2X	(R5)(R14), VS37
+	LXVD2X	(R5)(R15), VS39
+	XXLXOR	VS32, VS33, VS32
+	XXLXOR	VS34, VS35, VS34
+	XXLXOR	VS36, VS37, VS36
+	XXLXOR	VS38, VS39, VS38
+	STXVD2X	VS32, (R3)(R8)
+	STXVD2X	VS34, (R3)(R10)
+	STXVD2X	VS36, (R3)(R14)
+	STXVD2X	VS38, (R3)(R15)
+	ADD	$64, R8
+	ADD	$64, R10
+	ADD	$64, R14
+	ADD	$64, R15
+	BDNZ	loop64
+	BC	12,2,LR		// BEQLR
+	MOVD	R9, R6
+	CMP	R6, $8
+	BLE	small
+	// Case for 8 <= n < 64 bytes
+	// Process 32 bytes if available
+xor32:
+	CMP	R6, $32
+	BLT	xor16
+	ADD	$16, R8, R9
+	LXVD2X	(R4)(R8), VS32
+	LXVD2X	(R4)(R9), VS33
+	LXVD2X	(R5)(R8), VS34
+	LXVD2X	(R5)(R9), VS35
+	XXLXOR	VS32, VS34, VS32
+	XXLXOR	VS33, VS35, VS33
+	STXVD2X	VS32, (R3)(R8)
+	STXVD2X	VS33, (R3)(R9)
+	ADD	$32, R8
+	ADD	$-32, R6
+	CMP	R6, $8
+	BLE	small
+	// Case for 8 <= n < 32 bytes
+	// Process 16 bytes if available
 xor16:
-	CMP		R6, $16
-	BLT		xor8
-	LXVD2X		(R4)(R8), VS32
-	LXVD2X		(R5)(R8), VS33
-	XXLXOR		VS32, VS33, VS32
-	STXVD2X		VS32, (R3)(R8)
-	ADD		$16, R8
-	ADD		$-16, R6
-	CMP		R6, $8
-	BLT		small
+	CMP	R6, $16
+	BLT	xor8
+	LXVD2X	(R4)(R8), VS32
+	LXVD2X	(R5)(R8), VS33
+	XXLXOR	VS32, VS33, VS32
+	STXVD2X	VS32, (R3)(R8)
+	ADD	$16, R8
+	ADD	$-16, R6
+small:
+	CMP	R6, R0
+	BC	12,2,LR		// BEQLR
 xor8:
-	// Case for 8 ≤ n < 16 bytes
-	MOVD    (R4)(R8), R14   // R14 = a[i,...,i+7]
-	MOVD    (R5)(R8), R15   // R15 = b[i,...,i+7]
-	XOR     R14, R15, R16   // R16 = a[] ^ b[]
-	SUB     $8, R6          // n = n - 8
-	MOVD    R16, (R3)(R8)   // Store to dst
-	ADD     $8, R8
-
-	// Check if we're finished
-	CMP     R6, R0
-	BGT     small
+#ifdef GOPPC64_power10
+	SLD	$56,R6,R17
+	ADD	R4,R8,R18
+	ADD	R5,R8,R19
+	ADD	R3,R8,R20
+	LXVL	R18,R17,V0
+	LXVL	R19,R17,V1
+	VXOR	V0,V1,V1
+	STXVL	V1,R20,R17
 	RET
-
-	// Case for n < 8 bytes and tailing bytes from the
-	// previous cases.
-small:
+#else
+	CMP	R6, $8
+	BLT	xor4
+	// Case for 8 ≤ n < 16 bytes
+	MOVD	(R4)(R8), R14   // R14 = a[i,...,i+7]
+	MOVD	(R5)(R8), R15   // R15 = b[i,...,i+7]
+	XOR	R14, R15, R16   // R16 = a[] ^ b[]
+	SUB	$8, R6          // n = n - 8
+	MOVD	R16, (R3)(R8)   // Store to dst
+	ADD	$8, R8
+xor4:
+	CMP	R6, $4
+	BLT	xor2
+	MOVWZ	(R4)(R8), R14
+	MOVWZ	(R5)(R8), R15
+	XOR	R14, R15, R16
+	MOVW	R16, (R3)(R8)
+	ADD	$4,R8
+	ADD	$-4,R6
+xor2:
+	CMP	R6, $2
+	BLT	xor1
+	MOVHZ	(R4)(R8), R14
+	MOVHZ	(R5)(R8), R15
+	XOR	R14, R15, R16
+	MOVH	R16, (R3)(R8)
+	ADD	$2,R8
+	ADD	$-2,R6
+xor1:
 	CMP	R6, R0
-	BEQ	done
-	MOVD	R6, CTR		// Setup loop counter
-
-loop:
+	BC	12,2,LR		// BEQLR
 	MOVBZ	(R4)(R8), R14	// R14 = a[i]
 	MOVBZ	(R5)(R8), R15	// R15 = b[i]
 	XOR	R14, R15, R16	// R16 = a[i] ^ b[i]
 	MOVB	R16, (R3)(R8)	// Store to dst
-	ADD	$1, R8
-	BC	16, 0, loop	// bdnz loop
-
+#endif
 done:
 	RET