// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Based on the Linux Kernel with the following comment:
// Algorithm based on https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fb87127bcefc17efab757606e1b1e333fd614dd0
// Originally written by Ard Biesheuvel <[email protected]>

#include "textflag.h"

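// The macros below are built on the ARMv8 SHA-512 extension instructions:
// SHA512H and SHA512H2 together perform two rounds of the compression
// function, while SHA512SU0 and SHA512SU1 update the message schedule.
//
// SHA512TRANS is the shared prologue of every round macro: it adds the
// round-constant pair rc0 to the message words in0, swaps the halves of
// that sum, extracts the working-variable pairs needed by SHA512H into
// V6 and V7, and accumulates the sum into i3.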
#define SHA512TRANS(i0, i1, i2, i3, i4, rc0, in0) \
	VADD	in0.D2, rc0.D2, V5.D2 \
	VEXT	$8, i3.B16, i2.B16, V6.B16 \
	VEXT	$8, V5.B16, V5.B16, V5.B16 \
	VEXT	$8, i2.B16, i1.B16, V7.B16 \
	VADD	V5.D2, i3.D2, i3.D2 \

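// SHA512ROUND loads the next round-constant pair into rc1 (for use by a
// later invocation), then runs SHA512TRANS, the message-schedule update
// (SHA512SU0/SHA512SU1) and two rounds of compression (SHA512H/SHA512H2),
// leaving the updated state in i3 and i4.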
#define SHA512ROUND(i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \
	VLD1.P	16(R4), [rc1.D2] \
	SHA512TRANS(i0, i1, i2, i3, i4, rc0, in0) \
	VEXT	$8, in4.B16, in3.B16, V5.B16 \
	SHA512SU0	in1.D2, in0.D2 \
	SHA512H	V7.D2, V6, i3 \
	SHA512SU1	V5.D2, in2.D2, in0.D2 \
	VADD	i3.D2, i1.D2, i4.D2 \
	SHA512H2	i0.D2, i1, i3

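// SHA512ROUND_NO_UPDATE is SHA512ROUND without the schedule update, used
// once every message-schedule word for the block has been produced.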
#define SHA512ROUND_NO_UPDATE(i0, i1, i2, i3, i4, rc0, rc1, in0) \
	VLD1.P	16(R4), [rc1.D2] \
	SHA512TRANS(i0, i1, i2, i3, i4, rc0, in0) \
	SHA512H	V7.D2, V6, i3 \
	VADD	i3.D2, i1.D2, i4.D2 \
	SHA512H2	i0.D2, i1, i3

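// SHA512ROUND_LAST additionally skips the constant load: by the final four
// invocations all 40 round-constant pairs have already been read from the
// table, and the constants needed here were loaded by the preceding rounds.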
#define SHA512ROUND_LAST(i0, i1, i2, i3, i4, rc0, in0) \
	SHA512TRANS(i0, i1, i2, i3, i4, rc0, in0) \
	SHA512H	V7.D2, V6, i3 \
	VADD	i3.D2, i1.D2, i4.D2 \
	SHA512H2	i0.D2, i1, i3

// func blockAsm(dig *digest, p []byte)
TEXT ·blockAsm(SB),NOSPLIT,$0
	MOVD	dig+0(FP), R0
	MOVD	p_base+8(FP), R1
	MOVD	p_len+16(FP), R2
	MOVD	·_K+0(SB), R3

	// long enough to prefetch
	PRFM	(R3), PLDL3KEEP
	// load digest
	VLD1	(R0), [V8.D2, V9.D2, V10.D2, V11.D2]
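	// R0 points at the digest, R1/R2 at the message and its remaining
	// length, and R3 at the ·_K round-constant table. Each iteration of
	// the loop below consumes one 128-byte block, so the length passed
	// in is assumed to be a multiple of 128 (the SHA-512 block size).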
loop:
	// load digest in V0-V3 keeping original in V8-V11
	VMOV	V8.B16, V0.B16
	VMOV	V9.B16, V1.B16
	VMOV	V10.B16, V2.B16
	VMOV	V11.B16, V3.B16

	// load message data in V12-V19
	VLD1.P	64(R1), [V12.D2, V13.D2, V14.D2, V15.D2]
	VLD1.P	64(R1), [V16.D2, V17.D2, V18.D2, V19.D2]

	// convert message into big endian format
	VREV64	V12.B16, V12.B16
	VREV64	V13.B16, V13.B16
	VREV64	V14.B16, V14.B16
	VREV64	V15.B16, V15.B16
	VREV64	V16.B16, V16.B16
	VREV64	V17.B16, V17.B16
	VREV64	V18.B16, V18.B16
	VREV64	V19.B16, V19.B16

	MOVD	R3, R4
	// load first 4 round consts in V20-V23
	VLD1.P	64(R4), [V20.D2, V21.D2, V22.D2, V23.D2]

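	// Each of the 40 invocations below processes two of the 80 SHA-512
	// rounds: 32 with a schedule update, then 8 without. The hash state
	// rotates through V0-V4, the round constants cycle through V20-V31,
	// and the message schedule lives in V12-V19.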
	SHA512ROUND(V0, V1, V2, V3, V4, V20, V24, V12, V13, V19, V16, V17)
	SHA512ROUND(V3, V0, V4, V2, V1, V21, V25, V13, V14, V12, V17, V18)
	SHA512ROUND(V2, V3, V1, V4, V0, V22, V26, V14, V15, V13, V18, V19)
	SHA512ROUND(V4, V2, V0, V1, V3, V23, V27, V15, V16, V14, V19, V12)
	SHA512ROUND(V1, V4, V3, V0, V2, V24, V28, V16, V17, V15, V12, V13)

	SHA512ROUND(V0, V1, V2, V3, V4, V25, V29, V17, V18, V16, V13, V14)
	SHA512ROUND(V3, V0, V4, V2, V1, V26, V30, V18, V19, V17, V14, V15)
	SHA512ROUND(V2, V3, V1, V4, V0, V27, V31, V19, V12, V18, V15, V16)
	SHA512ROUND(V4, V2, V0, V1, V3, V28, V24, V12, V13, V19, V16, V17)
	SHA512ROUND(V1, V4, V3, V0, V2, V29, V25, V13, V14, V12, V17, V18)

	SHA512ROUND(V0, V1, V2, V3, V4, V30, V26, V14, V15, V13, V18, V19)
	SHA512ROUND(V3, V0, V4, V2, V1, V31, V27, V15, V16, V14, V19, V12)
	SHA512ROUND(V2, V3, V1, V4, V0, V24, V28, V16, V17, V15, V12, V13)
	SHA512ROUND(V4, V2, V0, V1, V3, V25, V29, V17, V18, V16, V13, V14)
	SHA512ROUND(V1, V4, V3, V0, V2, V26, V30, V18, V19, V17, V14, V15)

	SHA512ROUND(V0, V1, V2, V3, V4, V27, V31, V19, V12, V18, V15, V16)
	SHA512ROUND(V3, V0, V4, V2, V1, V28, V24, V12, V13, V19, V16, V17)
	SHA512ROUND(V2, V3, V1, V4, V0, V29, V25, V13, V14, V12, V17, V18)
	SHA512ROUND(V4, V2, V0, V1, V3, V30, V26, V14, V15, V13, V18, V19)
	SHA512ROUND(V1, V4, V3, V0, V2, V31, V27, V15, V16, V14, V19, V12)

	SHA512ROUND(V0, V1, V2, V3, V4, V24, V28, V16, V17, V15, V12, V13)
	SHA512ROUND(V3, V0, V4, V2, V1, V25, V29, V17, V18, V16, V13, V14)
	SHA512ROUND(V2, V3, V1, V4, V0, V26, V30, V18, V19, V17, V14, V15)
	SHA512ROUND(V4, V2, V0, V1, V3, V27, V31, V19, V12, V18, V15, V16)
	SHA512ROUND(V1, V4, V3, V0, V2, V28, V24, V12, V13, V19, V16, V17)

	SHA512ROUND(V0, V1, V2, V3, V4, V29, V25, V13, V14, V12, V17, V18)
	SHA512ROUND(V3, V0, V4, V2, V1, V30, V26, V14, V15, V13, V18, V19)
	SHA512ROUND(V2, V3, V1, V4, V0, V31, V27, V15, V16, V14, V19, V12)
	SHA512ROUND(V4, V2, V0, V1, V3, V24, V28, V16, V17, V15, V12, V13)
	SHA512ROUND(V1, V4, V3, V0, V2, V25, V29, V17, V18, V16, V13, V14)

	SHA512ROUND(V0, V1, V2, V3, V4, V26, V30, V18, V19, V17, V14, V15)
	SHA512ROUND(V3, V0, V4, V2, V1, V27, V31, V19, V12, V18, V15, V16)

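	// The message schedule is complete; the remaining rounds only
	// consume existing schedule words, so skip SHA512SU0/SHA512SU1.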
	SHA512ROUND_NO_UPDATE(V2, V3, V1, V4, V0, V28, V24, V12)
	SHA512ROUND_NO_UPDATE(V4, V2, V0, V1, V3, V29, V25, V13)
	SHA512ROUND_NO_UPDATE(V1, V4, V3, V0, V2, V30, V26, V14)
	SHA512ROUND_NO_UPDATE(V0, V1, V2, V3, V4, V31, V27, V15)

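	// The final four double-rounds reuse the constants just loaded into
	// V24-V27 above, so nothing more is read from the table.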
	SHA512ROUND_LAST(V3, V0, V4, V2, V1, V24, V16)
	SHA512ROUND_LAST(V2, V3, V1, V4, V0, V25, V17)
	SHA512ROUND_LAST(V4, V2, V0, V1, V3, V26, V18)
	SHA512ROUND_LAST(V1, V4, V3, V0, V2, V27, V19)

	// add result to digest
	VADD	V0.D2, V8.D2, V8.D2
	VADD	V1.D2, V9.D2, V9.D2
	VADD	V2.D2, V10.D2, V10.D2
	VADD	V3.D2, V11.D2, V11.D2
	SUB	$128, R2
	CBNZ	R2, loop

	VST1	[V8.D2, V9.D2, V10.D2, V11.D2], (R0)
	RET