Skip to content

Commit 0f0c892

Browse files
committed
crypto/aes: merge ppc64le crypt key expansion
It is not necessary to expand the key twice, once for each direction; the decrypt key can be stored in reverse order at the same time. Likewise, there is no need to store the key length alongside the expanded keys; it is now inferred from the length of the expanded key slice. Notably, the key expansion benchmark assumes the key array size is the exact size of the expanded key. Now, the ppc64le aes asm interface is identical to the generic asm interface. Call sites and usage are updated to reflect this. Performance uplift on POWER9 is substantial: name old time/op new time/op delta Expand 167ns ± 0% 49ns ± 0% -70.55% Change-Id: I3fdaf9c27e8860e8150d4683eb4046d97a53293a Reviewed-on: https://go-review.googlesource.com/c/go/+/398894 Run-TryBot: Paul Murphy <[email protected]> TryBot-Result: Gopher Robot <[email protected]> Reviewed-by: Lynn Boger <[email protected]> Trust: Paul Murphy <[email protected]>
1 parent db576c9 commit 0f0c892

File tree

3 files changed

+105
-151
lines changed

3 files changed

+105
-151
lines changed

src/crypto/aes/asm_ppc64le.s

Lines changed: 78 additions & 129 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,14 @@
2222

2323
#include "textflag.h"
2424

25-
// For set{En,De}cryptKeyAsm
25+
// For expandKeyAsm
2626
#define INP R3
2727
#define BITS R4
28-
#define OUT R5
28+
#define OUTENC R5 // Pointer to next expanded encrypt key
2929
#define PTR R6
3030
#define CNT R7
3131
#define ROUNDS R8
32+
#define OUTDEC R9 // Pointer to next expanded decrypt key
3233
#define TEMP R19
3334
#define ZERO V0
3435
#define IN0 V1
@@ -87,31 +88,13 @@ GLOBL ·rcon(SB), RODATA, $80
8788
LXSDX (RA+RB), VT \
8889
VPERM VT, VT, ESPERM, VT
8990

90-
// func setEncryptKeyAsm(key *byte, keylen int, enc *uint32) int
91-
TEXT ·setEncryptKeyAsm(SB), NOSPLIT|NOFRAME, $0
91+
// func expandKeyAsm(nr int, key *byte, enc *uint32, dec *uint32)
92+
TEXT ·expandKeyAsm(SB), NOSPLIT|NOFRAME, $0
9293
// Load the arguments inside the registers
93-
MOVD key+0(FP), INP
94-
MOVD keylen+8(FP), BITS
95-
MOVD enc+16(FP), OUT
96-
JMP ·doEncryptKeyAsm(SB)
97-
98-
// This text is used both setEncryptKeyAsm and setDecryptKeyAsm
99-
TEXT ·doEncryptKeyAsm(SB), NOSPLIT|NOFRAME, $0
100-
// Do not change R10 since it's storing the LR value in setDecryptKeyAsm
101-
102-
// Check arguments
103-
MOVD $-1, PTR // li 6,-1 exit code to -1 (255)
104-
CMPU INP, $0 // cmpldi r3,0 input key pointer set?
105-
BC 0x0E, 2, enc_key_abort // beq- .Lenc_key_abort
106-
CMPU OUT, $0 // cmpldi r5,0 output key pointer set?
107-
BC 0x0E, 2, enc_key_abort // beq- .Lenc_key_abort
108-
MOVD $-2, PTR // li 6,-2 exit code to -2 (254)
109-
CMPW BITS, $128 // cmpwi 4,128 greater or equal to 128
110-
BC 0x0E, 0, enc_key_abort // blt- .Lenc_key_abort
111-
CMPW BITS, $256 // cmpwi 4,256 lesser or equal to 256
112-
BC 0x0E, 1, enc_key_abort // bgt- .Lenc_key_abort
113-
ANDCC $0x3f, BITS, TEMP // andi. 0,4,0x3f multiple of 64
114-
BC 0x06, 2, enc_key_abort // bne- .Lenc_key_abort
94+
MOVD nr+0(FP), ROUNDS
95+
MOVD key+8(FP), INP
96+
MOVD enc+16(FP), OUTENC
97+
MOVD dec+24(FP), OUTDEC
11598

11699
MOVD $·rcon(SB), PTR // PTR point to rcon addr
117100
LVX (PTR), ESPERM
@@ -120,27 +103,34 @@ TEXT ·doEncryptKeyAsm(SB), NOSPLIT|NOFRAME, $0
120103
// Get key from memory and write aligned into VR
121104
P8_LXVB16X(INP, R0, IN0)
122105
ADD $0x10, INP, INP
123-
MOVD $0x20, R8 // li 8,0x20 R8 = 32
106+
MOVD $0x20, TEMP
124107

125-
CMPW BITS, $192 // cmpwi 4,192 Key size == 192?
108+
CMPW ROUNDS, $12
126109
LVX (PTR)(R0), RCON // lvx 4,0,6 Load first 16 bytes into RCON
127-
LVX (PTR)(R8), MASK // lvx 5,8,6
110+
LVX (PTR)(TEMP), MASK
128111
ADD $0x10, PTR, PTR // addi 6,6,0x10 PTR to next 16 bytes of RCON
129112
MOVD $8, CNT // li 7,8 CNT = 8
130113
VXOR ZERO, ZERO, ZERO // vxor 0,0,0 Zero to be zero :)
131114
MOVD CNT, CTR // mtctr 7 Set the counter to 8 (rounds)
132115

133-
BLT loop128 // blt .Loop128
134-
BEQ l192 // beq .L192
135-
JMP l256 // b .L256
116+
// The expanded decrypt key is the expanded encrypt key stored in reverse order.
117+
// Move OUTDEC to the last key location, and store in descending order.
118+
ADD $160, OUTDEC, OUTDEC
119+
BLT loop128
120+
ADD $32, OUTDEC, OUTDEC
121+
BEQ l192
122+
ADD $32, OUTDEC, OUTDEC
123+
JMP l256
136124

137125
loop128:
138126
// Key schedule (Round 1 to 8)
139127
VPERM IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-splat
140128
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
141-
P8_STXV(IN0, R0, OUT)
129+
P8_STXV(IN0, R0, OUTENC)
130+
P8_STXV(IN0, R0, OUTDEC)
142131
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
143-
ADD $16, OUT, OUT // addi 5,5,16 Point to the next round
132+
ADD $16, OUTENC, OUTENC
133+
ADD $-16, OUTDEC, OUTDEC
144134

145135
VXOR IN0, TMP, IN0 // vxor 1,1,6
146136
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
@@ -156,9 +146,11 @@ loop128:
156146
// Key schedule (Round 9)
157147
VPERM IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-spat
158148
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
159-
P8_STXV(IN0, R0, OUT)
149+
P8_STXV(IN0, R0, OUTENC)
150+
P8_STXV(IN0, R0, OUTDEC)
160151
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
161-
ADD $16, OUT, OUT // addi 5,5,16
152+
ADD $16, OUTENC, OUTENC
153+
ADD $-16, OUTDEC, OUTDEC
162154

163155
// Key schedule (Round 10)
164156
VXOR IN0, TMP, IN0 // vxor 1,1,6
@@ -171,9 +163,11 @@ loop128:
171163

172164
VPERM IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-splat
173165
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
174-
P8_STXV(IN0, R0, OUT)
166+
P8_STXV(IN0, R0, OUTENC)
167+
P8_STXV(IN0, R0, OUTDEC)
175168
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
176-
ADD $16, OUT, OUT // addi 5,5,16
169+
ADD $16, OUTENC, OUTENC
170+
ADD $-16, OUTDEC, OUTDEC
177171

178172
// Key schedule (Round 11)
179173
VXOR IN0, TMP, IN0 // vxor 1,1,6
@@ -182,18 +176,18 @@ loop128:
182176
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
183177
VXOR IN0, TMP, IN0 // vxor 1,1,6
184178
VXOR IN0, KEY, IN0 // vxor 1,1,3
185-
P8_STXV(IN0, R0, OUT)
179+
P8_STXV(IN0, R0, OUTENC)
180+
P8_STXV(IN0, R0, OUTDEC)
186181

187-
ADD $0x50, OUT, OUT // addi 5,5,0x50
188-
189-
MOVD $10, ROUNDS // li 8,10
190-
JMP done // b .Ldone
182+
RET
191183

192184
l192:
193185
LXSDX_BE(INP, R0, IN1) // Load next 8 bytes into upper half of VSR in BE order.
194186
MOVD $4, CNT // li 7,4
195-
P8_STXV(IN0, R0, OUT)
196-
ADD $16, OUT, OUT // addi 5,5,16
187+
P8_STXV(IN0, R0, OUTENC)
188+
P8_STXV(IN0, R0, OUTDEC)
189+
ADD $16, OUTENC, OUTENC
190+
ADD $-16, OUTDEC, OUTDEC
197191
VSPLTISB $8, KEY // vspltisb 3,8
198192
MOVD CNT, CTR // mtctr 7
199193
VSUBUBM MASK, KEY, MASK // vsububm 5,5,3
@@ -221,18 +215,22 @@ loop192:
221215

222216
VPERM IN1, IN1, MASK, KEY // vperm 3,2,2,5
223217
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
224-
P8_STXV(STAGE, R0, OUT)
218+
P8_STXV(STAGE, R0, OUTENC)
219+
P8_STXV(STAGE, R0, OUTDEC)
225220
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
226-
ADD $16, OUT, OUT // addi 5,5,16
221+
ADD $16, OUTENC, OUTENC
222+
ADD $-16, OUTDEC, OUTDEC
227223

228224
VSLDOI $8, IN0, IN1, STAGE // vsldoi 7,1,2,8
229225
VXOR IN0, TMP, IN0 // vxor 1,1,6
230226
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
231-
P8_STXV(STAGE, R0, OUT)
227+
P8_STXV(STAGE, R0, OUTENC)
228+
P8_STXV(STAGE, R0, OUTDEC)
232229
VXOR IN0, TMP, IN0 // vxor 1,1,6
233230
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
234231
VXOR IN0, TMP, IN0 // vxor 1,1,6
235-
ADD $16, OUT, OUT // addi 5,5,16
232+
ADD $16, OUTENC, OUTENC
233+
ADD $-16, OUTDEC, OUTDEC
236234

237235
VSPLTW $3, IN0, TMP // vspltw 6,1,3
238236
VXOR TMP, IN1, TMP // vxor 6,6,2
@@ -241,28 +239,31 @@ loop192:
241239
VXOR IN1, TMP, IN1 // vxor 2,2,6
242240
VXOR IN0, KEY, IN0 // vxor 1,1,3
243241
VXOR IN1, KEY, IN1 // vxor 2,2,3
244-
P8_STXV(IN0, R0, OUT)
245-
ADD $16, OUT, OUT // addi 5,5,16
242+
P8_STXV(IN0, R0, OUTENC)
243+
P8_STXV(IN0, R0, OUTDEC)
244+
ADD $16, OUTENC, OUTENC
245+
ADD $-16, OUTDEC, OUTDEC
246246
BC 0x10, 0, loop192 // bdnz .Loop192
247247

248-
MOVD $12, ROUNDS // li 8,12
249-
ADD $0x20, OUT, OUT // addi 5,5,0x20
250-
BR done // b .Ldone
248+
RET
251249

252250
l256:
253251
P8_LXVB16X(INP, R0, IN1)
254252
MOVD $7, CNT // li 7,7
255-
MOVD $14, ROUNDS // li 8,14
256-
P8_STXV(IN0, R0, OUT)
257-
ADD $16, OUT, OUT // addi 5,5,16
253+
P8_STXV(IN0, R0, OUTENC)
254+
P8_STXV(IN0, R0, OUTDEC)
255+
ADD $16, OUTENC, OUTENC
256+
ADD $-16, OUTDEC, OUTDEC
258257
MOVD CNT, CTR // mtctr 7
259258

260259
loop256:
261260
VPERM IN1, IN1, MASK, KEY // vperm 3,2,2,5
262261
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
263-
P8_STXV(IN1, R0, OUT)
262+
P8_STXV(IN1, R0, OUTENC)
263+
P8_STXV(IN1, R0, OUTDEC)
264264
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
265-
ADD $16, OUT, OUT // addi 5,5,16
265+
ADD $16, OUTENC, OUTENC
266+
ADD $-16, OUTDEC, OUTDEC
266267

267268
VXOR IN0, TMP, IN0 // vxor 1,1,6
268269
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
@@ -271,8 +272,10 @@ loop256:
271272
VXOR IN0, TMP, IN0 // vxor 1,1,6
272273
VADDUWM RCON, RCON, RCON // vadduwm 4,4,4
273274
VXOR IN0, KEY, IN0 // vxor 1,1,3
274-
P8_STXV(IN0, R0, OUT)
275-
ADD $16, OUT, OUT // addi 5,5,16
275+
P8_STXV(IN0, R0, OUTENC)
276+
P8_STXV(IN0, R0, OUTDEC)
277+
ADD $16, OUTENC, OUTENC
278+
ADD $-16, OUTDEC, OUTDEC
276279
BC 0x12, 0, done // bdz .Ldone
277280

278281
VSPLTW $3, IN0, KEY // vspltw 3,1,3
@@ -289,71 +292,16 @@ loop256:
289292
JMP loop256 // b .Loop256
290293

291294
done:
292-
MOVD $0, PTR // li 6,0 set PTR to 0 (exit code 0)
293-
MOVW ROUNDS, 0(OUT) // stw 8,0(5)
294-
295-
enc_key_abort:
296-
MOVD PTR, INP // mr 3,6 set exit code with PTR value
297-
MOVD INP, ret+24(FP) // Put return value into the FP
298-
RET // blr
295+
RET
299296

300-
// func setDecryptKeyAsm(key *byte, keylen int, dec *uint32) int
301-
TEXT ·setDecryptKeyAsm(SB), NOSPLIT|NOFRAME, $0
302-
// Load the arguments inside the registers
303-
MOVD key+0(FP), INP
304-
MOVD keylen+8(FP), BITS
305-
MOVD dec+16(FP), OUT
306-
307-
MOVD LR, R10 // mflr 10
308-
CALL ·doEncryptKeyAsm(SB)
309-
MOVD R10, LR // mtlr 10
310-
311-
CMPW INP, $0 // cmpwi 3,0 exit 0 = ok
312-
BC 0x06, 2, dec_key_abort // bne- .Ldec_key_abort
313-
314-
// doEncryptKeyAsm set ROUNDS (R8) with the proper value for each mode
315-
SLW $4, ROUNDS, CNT // slwi 7,8,4
316-
SUB $240, OUT, INP // subi 3,5,240
317-
SRW $1, ROUNDS, ROUNDS // srwi 8,8,1
318-
ADD R7, INP, OUT // add 5,3,7
319-
MOVD ROUNDS, CTR // mtctr 8
320-
321-
// dec_key will invert the key sequence in order to be used for decrypt
322-
dec_key:
323-
MOVWZ 0(INP), TEMP // lwz 0, 0(3)
324-
MOVWZ 4(INP), R6 // lwz 6, 4(3)
325-
MOVWZ 8(INP), R7 // lwz 7, 8(3)
326-
MOVWZ 12(INP), R8 // lwz 8, 12(3)
327-
ADD $16, INP, INP // addi 3,3,16
328-
MOVWZ 0(OUT), R9 // lwz 9, 0(5)
329-
MOVWZ 4(OUT), R10 // lwz 10,4(5)
330-
MOVWZ 8(OUT), R11 // lwz 11,8(5)
331-
MOVWZ 12(OUT), R12 // lwz 12,12(5)
332-
MOVW TEMP, 0(OUT) // stw 0, 0(5)
333-
MOVW R6, 4(OUT) // stw 6, 4(5)
334-
MOVW R7, 8(OUT) // stw 7, 8(5)
335-
MOVW R8, 12(OUT) // stw 8, 12(5)
336-
SUB $16, OUT, OUT // subi 5,5,16
337-
MOVW R9, -16(INP) // stw 9, -16(3)
338-
MOVW R10, -12(INP) // stw 10,-12(3)
339-
MOVW R11, -8(INP) // stw 11,-8(3)
340-
MOVW R12, -4(INP) // stw 12,-4(3)
341-
BC 0x10, 0, dec_key // bdnz .Ldeckey
342-
343-
XOR R3, R3, R3 // xor 3,3,3 Clean R3
344-
345-
dec_key_abort:
346-
MOVD R3, ret+24(FP) // Put return value into the FP
347-
RET // blr
348-
349-
// func encryptBlockAsm(dst, src *byte, enc *uint32)
297+
// func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
350298
TEXT ·encryptBlockAsm(SB), NOSPLIT|NOFRAME, $0
351299
// Load the arguments inside the registers
352-
MOVD dst+0(FP), BLK_OUT
353-
MOVD src+8(FP), BLK_INP
354-
MOVD enc+16(FP), BLK_KEY
300+
MOVD nr+0(FP), BLK_ROUNDS
301+
MOVD xk+8(FP), BLK_KEY
302+
MOVD dst+16(FP), BLK_OUT
303+
MOVD src+24(FP), BLK_INP
355304

356-
MOVWZ 240(BLK_KEY), BLK_ROUNDS // lwz 6,240(5)
357305
MOVD $15, BLK_IDX // li 7,15
358306

359307
LVX (BLK_INP)(R0), ZERO // lvx 0,0,3
@@ -410,14 +358,14 @@ loop_enc:
410358

411359
RET // blr
412360

413-
// func decryptBlockAsm(dst, src *byte, dec *uint32)
361+
// func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)
414362
TEXT ·decryptBlockAsm(SB), NOSPLIT|NOFRAME, $0
415363
// Load the arguments inside the registers
416-
MOVD dst+0(FP), BLK_OUT
417-
MOVD src+8(FP), BLK_INP
418-
MOVD dec+16(FP), BLK_KEY
364+
MOVD nr+0(FP), BLK_ROUNDS
365+
MOVD xk+8(FP), BLK_KEY
366+
MOVD dst+16(FP), BLK_OUT
367+
MOVD src+24(FP), BLK_INP
419368

420-
MOVWZ 240(BLK_KEY), BLK_ROUNDS // lwz 6,240(5)
421369
MOVD $15, BLK_IDX // li 7,15
422370

423371
LVX (BLK_INP)(R0), ZERO // lvx 0,0,3
@@ -476,7 +424,7 @@ loop_dec:
476424

477425
// Remove defines from above so they can be defined here
478426
#undef INP
479-
#undef OUT
427+
#undef OUTENC
480428
#undef ROUNDS
481429
#undef KEY
482430
#undef TMP
@@ -545,13 +493,15 @@ loop_dec:
545493
// for decryption which was omitted to avoid the
546494
// complexity.
547495

496+
// func cryptBlocksChain(src, dst *byte, length int, key *uint32, iv *byte, enc int, nr int)
548497
TEXT ·cryptBlocksChain(SB), NOSPLIT|NOFRAME, $0
549498
MOVD src+0(FP), INP
550499
MOVD dst+8(FP), OUT
551500
MOVD length+16(FP), LEN
552501
MOVD key+24(FP), KEY
553502
MOVD iv+32(FP), IVP
554503
MOVD enc+40(FP), ENC
504+
MOVD nr+48(FP), ROUNDS
555505

556506
CMPU LEN, $16 // cmpldi r5,16
557507
BC 14, 0, LR // bltlr-
@@ -567,7 +517,6 @@ TEXT ·cryptBlocksChain(SB), NOSPLIT|NOFRAME, $0
567517
VPERM IVEC, INPTAIL, INPPERM, IVEC // vperm v4,v4,v5,v6
568518
NEG INP, R11 // neg r11,r3
569519
LVSR (KEY)(R0), KEYPERM // lvsr v10,r0,r6
570-
MOVWZ 240(KEY), ROUNDS // lwz r9,240(r6)
571520
LVSR (R11)(R0), V6 // lvsr v6,r0,r11
572521
LVX (INP)(R0), INPTAIL // lvx v5,r0,r3
573522
ADD $15, INP // addi r3,r3,15

src/crypto/aes/cbc_ppc64le.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ func (x *cbc) BlockSize() int { return BlockSize }
4242

4343
// cryptBlocksChain processes length bytes from src into dst using the expanded key and iv; enc selects encryption or decryption and nr is the number of rounds.
4444
//go:noescape
45-
func cryptBlocksChain(src, dst *byte, length int, key *uint32, iv *byte, enc int)
45+
func cryptBlocksChain(src, dst *byte, length int, key *uint32, iv *byte, enc int, nr int)
4646

4747
func (x *cbc) CryptBlocks(dst, src []byte) {
4848
if len(src)%BlockSize != 0 {
@@ -56,9 +56,9 @@ func (x *cbc) CryptBlocks(dst, src []byte) {
5656
}
5757
if len(src) > 0 {
5858
if x.enc == cbcEncrypt {
59-
cryptBlocksChain(&src[0], &dst[0], len(src), &x.b.enc[0], &x.iv[0], x.enc)
59+
cryptBlocksChain(&src[0], &dst[0], len(src), &x.b.enc[0], &x.iv[0], x.enc, len(x.b.enc)/4-1)
6060
} else {
61-
cryptBlocksChain(&src[0], &dst[0], len(src), &x.b.dec[0], &x.iv[0], x.enc)
61+
cryptBlocksChain(&src[0], &dst[0], len(src), &x.b.dec[0], &x.iv[0], x.enc, len(x.b.dec)/4-1)
6262
}
6363
}
6464
}

0 commit comments

Comments
 (0)