Skip to content

Commit 2015070

Browse files
laboger authored and pull[bot] committed
crypto/subtle: improve xorBytes assembler on PPC64
This makes some improvements to the xorBytes assembler implementation for PPC64 targets. The loops that process large streams of bytes have been changed to handle 64 bytes at a time. Other changes were made to prevent degradations for some of the common sizes, such as 8 and 16 bytes. The case for < 8 bytes on power10 has been modified to use the LXVL and STXVL instructions.

Change-Id: I7477d12d5375d484af8c274443d595ccdafbda7c
Reviewed-on: https://go-review.googlesource.com/c/go/+/530877
Reviewed-by: Paul Murphy <[email protected]>
TryBot-Result: Gopher Robot <[email protected]>
Reviewed-by: Jayanth Krishnamurthy <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: Michael Pratt <[email protected]>
Reviewed-by: Benny Siegert <[email protected]>
Run-TryBot: Lynn Boger <[email protected]>
1 parent f1215c5 commit 2015070

File tree

1 file changed

+113
-58
lines changed

1 file changed

+113
-58
lines changed

src/crypto/subtle/xor_ppc64x.s

+113-58
Original file line numberDiff line numberDiff line change
@@ -13,75 +13,130 @@ TEXT ·xorBytes(SB), NOSPLIT, $0
1313
MOVD b+16(FP), R5 // R5 = b
1414
MOVD n+24(FP), R6 // R6 = n
1515

16-
CMPU R6, $32, CR7 // Check if n ≥ 32 bytes
16+
CMPU R6, $64, CR7 // Check if n ≥ 64 bytes
1717
MOVD R0, R8 // R8 = index
18-
CMPU R6, $8, CR6 // Check if 8 ≤ n < 32 bytes
19-
BLT CR6, small // Smaller than 8
20-
BLT CR7, xor16 // Case for 16 ≤ n < 32 bytes
18+
CMPU R6, $8, CR6 // Check if 8 ≤ n < 64 bytes
19+
BLE CR6, small // <= 8
20+
BLT CR7, xor32 // Case for 32 ≤ n < 64 bytes
2121

22-
// Case for n ≥ 32 bytes
23-
preloop32:
24-
SRD $5, R6, R7 // Setup loop counter
22+
// Case for n ≥ 64 bytes
23+
preloop64:
24+
SRD $6, R6, R7 // Set up loop counter
2525
MOVD R7, CTR
2626
MOVD $16, R10
27-
ANDCC $31, R6, R9 // Check for tailing bytes for later
28-
loop32:
29-
LXVD2X (R4)(R8), VS32 // VS32 = a[i,...,i+15]
30-
LXVD2X (R4)(R10), VS34
31-
LXVD2X (R5)(R8), VS33 // VS33 = b[i,...,i+15]
32-
LXVD2X (R5)(R10), VS35
33-
XXLXOR VS32, VS33, VS32 // VS34 = a[] ^ b[]
34-
XXLXOR VS34, VS35, VS34
35-
STXVD2X VS32, (R3)(R8) // Store to dst
36-
STXVD2X VS34, (R3)(R10)
37-
ADD $32, R8 // Update index
38-
ADD $32, R10
39-
BC 16, 0, loop32 // bdnz loop16
40-
41-
BEQ CR0, done
42-
43-
MOVD R9, R6
44-
CMP R6, $8
45-
BLT small
27+
MOVD $32, R14
28+
MOVD $48, R15
29+
ANDCC $63, R6, R9 // Check for tailing bytes for later
30+
PCALIGN $16
31+
// Case for >= 64 bytes
32+
// Process 64 bytes per iteration
33+
// Load 4 vectors of a and b
34+
// XOR the corresponding vectors
35+
// from a and b and store the result
36+
loop64:
37+
LXVD2X (R4)(R8), VS32
38+
LXVD2X (R4)(R10), VS34
39+
LXVD2X (R4)(R14), VS36
40+
LXVD2X (R4)(R15), VS38
41+
LXVD2X (R5)(R8), VS33
42+
LXVD2X (R5)(R10), VS35
43+
LXVD2X (R5)(R14), VS37
44+
LXVD2X (R5)(R15), VS39
45+
XXLXOR VS32, VS33, VS32
46+
XXLXOR VS34, VS35, VS34
47+
XXLXOR VS36, VS37, VS36
48+
XXLXOR VS38, VS39, VS38
49+
STXVD2X VS32, (R3)(R8)
50+
STXVD2X VS34, (R3)(R10)
51+
STXVD2X VS36, (R3)(R14)
52+
STXVD2X VS38, (R3)(R15)
53+
ADD $64, R8
54+
ADD $64, R10
55+
ADD $64, R14
56+
ADD $64, R15
57+
BDNZ loop64
58+
BC 12,2,LR // BEQLR
59+
MOVD R9, R6
60+
CMP R6, $8
61+
BLE small
62+
// Case for 8 <= n < 64 bytes
63+
// Process 32 bytes if available
64+
xor32:
65+
CMP R6, $32
66+
BLT xor16
67+
ADD $16, R8, R9
68+
LXVD2X (R4)(R8), VS32
69+
LXVD2X (R4)(R9), VS33
70+
LXVD2X (R5)(R8), VS34
71+
LXVD2X (R5)(R9), VS35
72+
XXLXOR VS32, VS34, VS32
73+
XXLXOR VS33, VS35, VS33
74+
STXVD2X VS32, (R3)(R8)
75+
STXVD2X VS33, (R3)(R9)
76+
ADD $32, R8
77+
ADD $-32, R6
78+
CMP R6, $8
79+
BLE small
80+
// Case for 8 <= n < 32 bytes
81+
// Process 16 bytes if available
4682
xor16:
47-
CMP R6, $16
48-
BLT xor8
49-
LXVD2X (R4)(R8), VS32
50-
LXVD2X (R5)(R8), VS33
51-
XXLXOR VS32, VS33, VS32
52-
STXVD2X VS32, (R3)(R8)
53-
ADD $16, R8
54-
ADD $-16, R6
55-
CMP R6, $8
56-
BLT small
83+
CMP R6, $16
84+
BLT xor8
85+
LXVD2X (R4)(R8), VS32
86+
LXVD2X (R5)(R8), VS33
87+
XXLXOR VS32, VS33, VS32
88+
STXVD2X VS32, (R3)(R8)
89+
ADD $16, R8
90+
ADD $-16, R6
91+
small:
92+
CMP R6, R0
93+
BC 12,2,LR // BEQLR
5794
xor8:
58-
// Case for 8 ≤ n < 16 bytes
59-
MOVD (R4)(R8), R14 // R14 = a[i,...,i+7]
60-
MOVD (R5)(R8), R15 // R15 = b[i,...,i+7]
61-
XOR R14, R15, R16 // R16 = a[] ^ b[]
62-
SUB $8, R6 // n = n - 8
63-
MOVD R16, (R3)(R8) // Store to dst
64-
ADD $8, R8
65-
66-
// Check if we're finished
67-
CMP R6, R0
68-
BGT small
95+
#ifdef GOPPC64_power10
96+
SLD $56,R6,R17
97+
ADD R4,R8,R18
98+
ADD R5,R8,R19
99+
ADD R3,R8,R20
100+
LXVL R18,R17,V0
101+
LXVL R19,R17,V1
102+
VXOR V0,V1,V1
103+
STXVL V1,R20,R17
69104
RET
70-
71-
// Case for n < 8 bytes and tailing bytes from the
72-
// previous cases.
73-
small:
105+
#else
106+
CMP R6, $8
107+
BLT xor4
108+
// Case for 8 ≤ n < 16 bytes
109+
MOVD (R4)(R8), R14 // R14 = a[i,...,i+7]
110+
MOVD (R5)(R8), R15 // R15 = b[i,...,i+7]
111+
XOR R14, R15, R16 // R16 = a[] ^ b[]
112+
SUB $8, R6 // n = n - 8
113+
MOVD R16, (R3)(R8) // Store to dst
114+
ADD $8, R8
115+
xor4:
116+
CMP R6, $4
117+
BLT xor2
118+
MOVWZ (R4)(R8), R14
119+
MOVWZ (R5)(R8), R15
120+
XOR R14, R15, R16
121+
MOVW R16, (R3)(R8)
122+
ADD $4,R8
123+
ADD $-4,R6
124+
xor2:
125+
CMP R6, $2
126+
BLT xor1
127+
MOVHZ (R4)(R8), R14
128+
MOVHZ (R5)(R8), R15
129+
XOR R14, R15, R16
130+
MOVH R16, (R3)(R8)
131+
ADD $2,R8
132+
ADD $-2,R6
133+
xor1:
74134
CMP R6, R0
75-
BEQ done
76-
MOVD R6, CTR // Setup loop counter
77-
78-
loop:
135+
BC 12,2,LR // BEQLR
79136
MOVBZ (R4)(R8), R14 // R14 = a[i]
80137
MOVBZ (R5)(R8), R15 // R15 = b[i]
81138
XOR R14, R15, R16 // R16 = a[i] ^ b[i]
82139
MOVB R16, (R3)(R8) // Store to dst
83-
ADD $1, R8
84-
BC 16, 0, loop // bdnz loop
85-
140+
#endif
86141
done:
87142
RET

0 commit comments

Comments
 (0)