|
| 1 | +// Copyright 2018 The Go Authors. All rights reserved. |
| 2 | +// Use of this source code is governed by a BSD-style |
| 3 | +// license that can be found in the LICENSE file. |
| 4 | + |
| 5 | +// +build s390x,!gccgo,!appengine |
| 6 | + |
| 7 | +#include "go_asm.h" |
| 8 | +#include "textflag.h" |
| 9 | + |
| 10 | +// This is an implementation of the ChaCha20 encryption algorithm as |
| 11 | +// specified in RFC 7539. It uses vector instructions to compute |
| 12 | +// 4 keystream blocks in parallel (256 bytes) which are then XORed |
| 13 | +// with the bytes in the input slice. |
| 14 | + |
// constants is 32 bytes of read-only data that xorKeyStreamVX loads into the
// BSWAP and J0 vector registers with a single VLM. The first 16 bytes are a
// VPERM byte-selection mask; the last 16 bytes are the ASCII string
// "expand 32-byte k" expressed as four little-endian 32-bit integer values.
| 15 | +GLOBL ·constants<>(SB), RODATA|NOPTR, $32 |
| 16 | +// BSWAP: swap bytes in each 4-byte element |
| 17 | +DATA ·constants<>+0x00(SB)/4, $0x03020100 |
| 18 | +DATA ·constants<>+0x04(SB)/4, $0x07060504 |
| 19 | +DATA ·constants<>+0x08(SB)/4, $0x0b0a0908 |
| 20 | +DATA ·constants<>+0x0c(SB)/4, $0x0f0e0d0c |
| 21 | +// J0: [j0, j1, j2, j3] |
| 22 | +DATA ·constants<>+0x10(SB)/4, $0x61707865 |
| 23 | +DATA ·constants<>+0x14(SB)/4, $0x3320646e |
| 24 | +DATA ·constants<>+0x18(SB)/4, $0x79622d32 |
| 25 | +DATA ·constants<>+0x1c(SB)/4, $0x6b206574 |
| 26 | + |
| 27 | +// EXRL targets: |
// mvcSrcToBuf is not called as a function. xorKeyStreamVX runs its MVC via
// EXRL with R12 as the modifier register, copying from (R1) to (R8).
// At that point R1 addresses the trailing partial chunk of src and R8 is buf.
// NOTE(review): the effective copy length is presumably R12+1 bytes (EXRL
// merging the low byte of R12 into the MVC length field) - confirm against
// the z/Architecture definition of EXECUTE RELATIVE LONG.
| 28 | +TEXT ·mvcSrcToBuf(SB), NOFRAME|NOSPLIT, $0 |
| 29 | + MVC $1, (R1), (R8) |
| 30 | + RET |
| 31 | + |
// mvcBufToDst is not called as a function. xorKeyStreamVX runs its MVC via
// EXRL with R12 as the modifier register, copying from (R8) to (R9).
// At that point R8 is buf and R9 holds the dst pointer saved by the tail path.
// NOTE(review): the effective copy length is presumably R12+1 bytes (EXRL
// merging the low byte of R12 into the MVC length field) - confirm against
// the z/Architecture definition of EXECUTE RELATIVE LONG.
| 32 | +TEXT ·mvcBufToDst(SB), NOFRAME|NOSPLIT, $0 |
| 33 | + MVC $1, (R8), (R9) |
| 34 | + RET |
| 35 | + |
// Symbolic names for the vector registers used by xorKeyStreamVX.
//   BSWAP     VPERM mask loaded from constants (byte swap per 4-byte element)
//   J0        the four constant words loaded from constants
//   KEY0/KEY1 the two 16-byte halves of the key
//   NONCE     the nonce, shifted so its words sit in elements 1-3
//   CTR       the block counters for the four blocks computed in parallel
//   M0-M3     input data for XORV; also scratch for SHUFFLE and setup
//   INC       per-lane counter increment
//   X0-X15    working state; Xi holds state word i for each of the four blocks
| 36 | +#define BSWAP V5 |
| 37 | +#define J0 V6 |
| 38 | +#define KEY0 V7 |
| 39 | +#define KEY1 V8 |
| 40 | +#define NONCE V9 |
| 41 | +#define CTR V10 |
| 42 | +#define M0 V11 |
| 43 | +#define M1 V12 |
| 44 | +#define M2 V13 |
| 45 | +#define M3 V14 |
| 46 | +#define INC V15 |
| 47 | +#define X0 V16 |
| 48 | +#define X1 V17 |
| 49 | +#define X2 V18 |
| 50 | +#define X3 V19 |
| 51 | +#define X4 V20 |
| 52 | +#define X5 V21 |
| 53 | +#define X6 V22 |
| 54 | +#define X7 V23 |
| 55 | +#define X8 V24 |
| 56 | +#define X9 V25 |
| 57 | +#define X10 V26 |
| 58 | +#define X11 V27 |
| 59 | +#define X12 V28 |
| 60 | +#define X13 V29 |
| 61 | +#define X14 V30 |
| 62 | +#define X15 V31 |
| 63 | + |
// NUM_ROUNDS is the total number of ChaCha rounds. The main loop in
// xorKeyStreamVX iterates NUM_ROUNDS/2 times, with two ROUND4 invocations
// per iteration.
| 64 | +#define NUM_ROUNDS 20 |
| 65 | + |
// ROUND4 performs four independent quarter rounds, one on each register group
// (a0..a3), (b0..b3), (c0..c3) and (d0..d3), with the instructions of the four
// groups interleaved. Writing a group as (x0, x1, x2, x3), and operating on
// every 32-bit element of the vectors, each group computes:
//   x0 += x1; x2 ^= x0; x2 = rotl(x2, 16)
//   x3 += x2; x1 ^= x3; x1 = rotl(x1, 12)
//   x0 += x1; x2 ^= x0; x2 = rotl(x2, 8)
//   x3 += x2; x1 ^= x3; x1 = rotl(x1, 7)
// All sixteen argument registers are updated in place.
| 66 | +#define ROUND4(a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3) \ |
| 67 | + VAF a1, a0, a0 \ |
| 68 | + VAF b1, b0, b0 \ |
| 69 | + VAF c1, c0, c0 \ |
| 70 | + VAF d1, d0, d0 \ |
| 71 | + VX a0, a2, a2 \ |
| 72 | + VX b0, b2, b2 \ |
| 73 | + VX c0, c2, c2 \ |
| 74 | + VX d0, d2, d2 \ |
| 75 | + VERLLF $16, a2, a2 \ |
| 76 | + VERLLF $16, b2, b2 \ |
| 77 | + VERLLF $16, c2, c2 \ |
| 78 | + VERLLF $16, d2, d2 \ |
| 79 | + VAF a2, a3, a3 \ |
| 80 | + VAF b2, b3, b3 \ |
| 81 | + VAF c2, c3, c3 \ |
| 82 | + VAF d2, d3, d3 \ |
| 83 | + VX a3, a1, a1 \ |
| 84 | + VX b3, b1, b1 \ |
| 85 | + VX c3, c1, c1 \ |
| 86 | + VX d3, d1, d1 \ |
| 87 | + VERLLF $12, a1, a1 \ |
| 88 | + VERLLF $12, b1, b1 \ |
| 89 | + VERLLF $12, c1, c1 \ |
| 90 | + VERLLF $12, d1, d1 \ |
| 91 | + VAF a1, a0, a0 \ |
| 92 | + VAF b1, b0, b0 \ |
| 93 | + VAF c1, c0, c0 \ |
| 94 | + VAF d1, d0, d0 \ |
| 95 | + VX a0, a2, a2 \ |
| 96 | + VX b0, b2, b2 \ |
| 97 | + VX c0, c2, c2 \ |
| 98 | + VX d0, d2, d2 \ |
| 99 | + VERLLF $8, a2, a2 \ |
| 100 | + VERLLF $8, b2, b2 \ |
| 101 | + VERLLF $8, c2, c2 \ |
| 102 | + VERLLF $8, d2, d2 \ |
| 103 | + VAF a2, a3, a3 \ |
| 104 | + VAF b2, b3, b3 \ |
| 105 | + VAF c2, c3, c3 \ |
| 106 | + VAF d2, d3, d3 \ |
| 107 | + VX a3, a1, a1 \ |
| 108 | + VX b3, b1, b1 \ |
| 109 | + VX c3, c1, c1 \ |
| 110 | + VX d3, d1, d1 \ |
| 111 | + VERLLF $7, a1, a1 \ |
| 112 | + VERLLF $7, b1, b1 \ |
| 113 | + VERLLF $7, c1, c1 \ |
| 114 | + VERLLF $7, d1, d1 |
| 115 | + |
// PERMUTE applies the VPERM byte permutation described by mask to each of
// v0, v1, v2 and v3 in place (each register is both VPERM source operands
// and the destination).
| 116 | +#define PERMUTE(mask, v0, v1, v2, v3) \ |
| 117 | + VPERM v0, v0, mask, v0 \ |
| 118 | + VPERM v1, v1, mask, v1 \ |
| 119 | + VPERM v2, v2, mask, v2 \ |
| 120 | + VPERM v3, v3, mask, v3 |
| 121 | + |
// ADDV adds the vector x, as four 32-bit elements, to each of v0, v1, v2 and
// v3 in place. x itself is left unchanged.
| 122 | +#define ADDV(x, v0, v1, v2, v3) \ |
| 123 | + VAF x, v0, v0 \ |
| 124 | + VAF x, v1, v1 \ |
| 125 | + VAF x, v2, v2 \ |
| 126 | + VAF x, v3, v3 |
| 127 | + |
// XORV processes one 64-byte block: it loads 64 bytes from off(src) into
// M0-M3, byte-swaps each 4-byte element of the keystream vectors v0-v3 using
// the BSWAP mask, XORs them into M0-M3 and stores the 64-byte result at
// off(dst). Side effects: M0-M3 are overwritten and v0-v3 are left in their
// byte-swapped form.
| 128 | +#define XORV(off, dst, src, v0, v1, v2, v3) \ |
| 129 | + VLM off(src), M0, M3 \ |
| 130 | + PERMUTE(BSWAP, v0, v1, v2, v3) \ |
| 131 | + VX v0, M0, M0 \ |
| 132 | + VX v1, M1, M1 \ |
| 133 | + VX v2, M2, M2 \ |
| 134 | + VX v3, M3, M3 \ |
| 135 | + VSTM M0, M3, off(dst) |
| 136 | + |
// SHUFFLE transposes the 4x4 matrix of 32-bit elements whose rows are the
// vectors a, b, c and d. The result is written back to a, b, c and d, so that
// afterwards a holds element 0 of every input row, b element 1, and so on.
// t, u, v and w are scratch registers and are overwritten.
| 137 | +#define SHUFFLE(a, b, c, d, t, u, v, w) \ |
| 138 | + VMRHF a, c, t \ // t = {a[0], c[0], a[1], c[1]} |
| 139 | + VMRHF b, d, u \ // u = {b[0], d[0], b[1], d[1]} |
| 140 | + VMRLF a, c, v \ // v = {a[2], c[2], a[3], c[3]} |
| 141 | + VMRLF b, d, w \ // w = {b[2], d[2], b[3], d[3]} |
| 142 | + VMRHF t, u, a \ // a = {a[0], b[0], c[0], d[0]} |
| 143 | + VMRLF t, u, b \ // b = {a[1], b[1], c[1], d[1]} |
| 144 | + VMRHF v, w, c \ // c = {a[2], b[2], c[2], d[2]} |
| 145 | + VMRLF v, w, d // d = {a[3], b[3], c[3], d[3]} |
| 146 | + |
// xorKeyStreamVX XORs src with keystream and writes the result to dst,
// computing four 64-byte blocks (256 bytes) per iteration of the chacha loop.
//
// When len(src) is not a multiple of 256 the trailing partial chunk is staged
// through buf: it is copied from src into buf during setup, the final
// iteration XORs buf in place, and the same number of bytes is then copied
// from buf to dst. In that case *len is set to 256 minus the size of the
// partial chunk; otherwise *len is not written.
//
// On return *counter holds the counter value for the next block (the initial
// value plus 4 for every iteration executed).
//
// NOTE(review): with len(src) == 0 the code still executes one iteration via
// the tail path (XORing buf in place and advancing the counter by 4) -
// presumably callers never pass an empty slice; confirm.
| 147 | +// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32, buf *[256]byte, len *int) |
| 148 | +TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0 |
| 149 | + MOVD $·constants<>(SB), R1 |
| 150 | + MOVD dst+0(FP), R2 // R2=&dst[0] |
| 151 | + LMG src+24(FP), R3, R4 // R3=&src[0] R4=len(src) |
| 152 | + MOVD key+48(FP), R5 // R5=key |
| 153 | + MOVD nonce+56(FP), R6 // R6=nonce |
| 154 | + MOVD counter+64(FP), R7 // R7=counter |
| 155 | + MOVD buf+72(FP), R8 // R8=buf |
| 156 | + MOVD len+80(FP), R9 // R9=len |
| 157 | + |
| 158 | + // load BSWAP and J0 |
| 159 | + VLM (R1), BSWAP, J0 |
| 160 | + |
| 161 | + // set up tail buffer |
// R12 = (len(src)-1) & 255. A value of 255 means len(src) is a multiple of
// 256, so there is no partial chunk and the staging step is skipped.
| 162 | + ADD $-1, R4, R12 |
| 163 | + MOVBZ R12, R12 |
| 164 | + CMPUBEQ R12, $255, aligned |
// R1 = &src[len(src) &^ 255], the start of the trailing partial chunk.
| 165 | + MOVD R4, R1 |
| 166 | + AND $~255, R1 |
| 167 | + MOVD $(R3)(R1*1), R1 |
// copy the partial chunk from src (R1) into buf (R8), length taken from R12
| 168 | + EXRL $·mvcSrcToBuf(SB), R12 |
// *len = 255 - R12, i.e. 256 minus the size of the partial chunk
| 169 | + MOVD $255, R0 |
| 170 | + SUB R12, R0 |
| 171 | + MOVD R0, (R9) // update len |
| 172 | + |
| 173 | +aligned: |
| 174 | + // setup |
// Load the 32-byte key into KEY0/KEY1 and the nonce into NONCE, then shift
// NONCE right by the 32-bit count placed in M0 so that the three nonce words
// occupy elements 1-3 and element 0 is zero.
// NOTE(review): R0=95 is larger than the highest byte index of a vector (15),
// so VLL presumably loads a full 16 bytes even though nonce is declared as
// *[3]uint32 (12 bytes); the extra 4 bytes are discarded by the shift -
// confirm that reading them is acceptable.
| 175 | + MOVD $95, R0 |
| 176 | + VLM (R5), KEY0, KEY1 |
| 177 | + VLL R0, (R6), NONCE |
| 178 | + VZERO M0 |
| 179 | + VLEIB $7, $32, M0 |
| 180 | + VSRLB M0, NONCE, NONCE |
| 181 | + |
| 182 | + // initialize counter values |
// CTR = {c, c+1, c+2, c+3} where c = *counter; INC is first used as
// {0, 1, 2, 3} and then reset to {4, 4, 4, 4} for the per-iteration step.
| 183 | + VLREPF (R7), CTR |
| 184 | + VZERO INC |
| 185 | + VLEIF $1, $1, INC |
| 186 | + VLEIF $2, $2, INC |
| 187 | + VLEIF $3, $3, INC |
| 188 | + VAF INC, CTR, CTR |
| 189 | + VREPIF $4, INC |
| 190 | + |
| 191 | +chacha: |
// Build the initial working state. Each of X0-X11 and X13-X15 replicates one
// state word across all four lanes; X12 holds the four distinct counters.
| 192 | + VREPF $0, J0, X0 |
| 193 | + VREPF $1, J0, X1 |
| 194 | + VREPF $2, J0, X2 |
| 195 | + VREPF $3, J0, X3 |
| 196 | + VREPF $0, KEY0, X4 |
| 197 | + VREPF $1, KEY0, X5 |
| 198 | + VREPF $2, KEY0, X6 |
| 199 | + VREPF $3, KEY0, X7 |
| 200 | + VREPF $0, KEY1, X8 |
| 201 | + VREPF $1, KEY1, X9 |
| 202 | + VREPF $2, KEY1, X10 |
| 203 | + VREPF $3, KEY1, X11 |
| 204 | + VLR CTR, X12 |
| 205 | + VREPF $1, NONCE, X13 |
| 206 | + VREPF $2, NONCE, X14 |
| 207 | + VREPF $3, NONCE, X15 |
| 208 | + |
// R1 counts the remaining double rounds
| 209 | + MOVD $(NUM_ROUNDS/2), R1 |
| 210 | + |
| 211 | +loop: |
// first ROUND4: quarter rounds on state columns (0,4,8,12) .. (3,7,11,15);
// second ROUND4: quarter rounds on diagonals (0,5,10,15) .. (3,4,9,14)
| 212 | + ROUND4(X0, X4, X12, X8, X1, X5, X13, X9, X2, X6, X14, X10, X3, X7, X15, X11) |
| 213 | + ROUND4(X0, X5, X15, X10, X1, X6, X12, X11, X2, X7, X13, X8, X3, X4, X14, X9) |
| 214 | + |
| 215 | + ADD $-1, R1 |
| 216 | + BNE loop |
| 217 | + |
| 218 | + // decrement length |
// fewer than 256 bytes left: redirect this iteration to buf via tail
| 219 | + ADD $-256, R4 |
| 220 | + BLT tail |
| 221 | + |
| 222 | +continue: |
| 223 | + // rearrange vectors |
// Transpose each group of four state vectors and add the initial state
// (J0, KEY0, KEY1, NONCE). The per-block counters are added to X12 before its
// transpose because CTR differs per lane; NONCE element 0 is zero.
| 224 | + SHUFFLE(X0, X1, X2, X3, M0, M1, M2, M3) |
| 225 | + ADDV(J0, X0, X1, X2, X3) |
| 226 | + SHUFFLE(X4, X5, X6, X7, M0, M1, M2, M3) |
| 227 | + ADDV(KEY0, X4, X5, X6, X7) |
| 228 | + SHUFFLE(X8, X9, X10, X11, M0, M1, M2, M3) |
| 229 | + ADDV(KEY1, X8, X9, X10, X11) |
| 230 | + VAF CTR, X12, X12 |
| 231 | + SHUFFLE(X12, X13, X14, X15, M0, M1, M2, M3) |
| 232 | + ADDV(NONCE, X12, X13, X14, X15) |
| 233 | + |
| 234 | + // increment counters |
| 235 | + VAF INC, CTR, CTR |
| 236 | + |
| 237 | + // xor keystream with plaintext |
// one XORV per 64-byte block; R2 is the destination and R3 the source
| 238 | + XORV(0*64, R2, R3, X0, X4, X8, X12) |
| 239 | + XORV(1*64, R2, R3, X1, X5, X9, X13) |
| 240 | + XORV(2*64, R2, R3, X2, X6, X10, X14) |
| 241 | + XORV(3*64, R2, R3, X3, X7, X11, X15) |
| 242 | + |
| 243 | + // increment pointers |
| 244 | + MOVD $256(R2), R2 |
| 245 | + MOVD $256(R3), R3 |
| 246 | + |
// Loop while bytes remain. Once done, if a partial chunk was staged
// (R12 != 255), copy it from buf (R8) to the dst position saved in R9.
| 247 | + CMPBNE R4, $0, chacha |
| 248 | + CMPUBEQ R12, $255, return |
| 249 | + EXRL $·mvcBufToDst(SB), R12 // len was updated during setup |
| 250 | + |
| 251 | +return: |
// store element 0 of CTR back to *counter
| 252 | + VSTEF $0, CTR, (R7) |
| 253 | + RET |
| 254 | + |
| 255 | +tail: |
// Final partial iteration: remember the current dst pointer in R9 (its earlier
// use as the len pointer ended during setup), make both the source and the
// destination point at buf, and zero R4 so the chacha loop exits afterwards.
| 256 | + MOVD R2, R9 |
| 257 | + MOVD R8, R2 |
| 258 | + MOVD R8, R3 |
| 259 | + MOVD $0, R4 |
| 260 | + JMP continue |
| 261 | + |
// hasVectorFacility reports whether the vector instructions can be used.
// It stores the facility list with STFLE into a 24-byte stack area, tests
// mask 0x40 in the first byte of the third doubleword (facility bit 129, the
// vector facility) and, if that bit is set, executes two vector instructions
// and checks the value read back.
| 262 | +// func hasVectorFacility() bool |
| 263 | +TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1 |
| 264 | + MOVD $x-24(SP), R1 |
| 265 | + XC $24, 0(R1), 0(R1) // clear the storage |
| 266 | + MOVD $2, R0 // R0 is the number of double words stored -1 |
| 267 | + WORD $0xB2B01000 // STFLE 0(R1) |
| 268 | + XOR R0, R0 // reset the value of R0 |
// z-8(SP) is byte 16 of the 24-byte area, i.e. facility bits 128-135
| 269 | + MOVBZ z-8(SP), R1 |
| 270 | + AND $0x40, R1 |
| 271 | + BEQ novector |
| 272 | + |
| 273 | +vectorinstalled: |
| 274 | + // check if the vector instruction has been enabled |
// write 0xF into byte 0 of V16 and read it back through a general register
| 275 | + VLEIB $0, $0xF, V16 |
| 276 | + VLGVB $0, V16, R1 |
| 277 | + CMPBNE R1, $0xF, novector |
| 278 | + MOVB $1, ret+0(FP) // have vx |
| 279 | + RET |
| 280 | + |
| 281 | +novector: |
| 282 | + MOVB $0, ret+0(FP) // no vx |
| 283 | + RET |
0 commit comments