Skip to content

Commit ccfcb04

Browse files
author
Palmer Cox
committed
Sha2: Re-write the Sha2 compression functions to improve performance.
The Sha2 compression functions were re-written to execute the message scheduling calculations in the same loop as the rest of the compression function. As a result, the compiler is able to generate much better code. Additionally, the innermost parts of the compression functions were turned into macros to reduce code duplication and to make the functions more concise.
1 parent e771aef commit ccfcb04

File tree

1 file changed

+89
-100
lines changed

1 file changed

+89
-100
lines changed

src/libextra/crypto/sha2.rs

Lines changed: 89 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,25 @@ use cryptoutil::{write_u64_be, write_u32_be, read_u64v_be, read_u32v_be, FixedBu
1515
use digest::Digest;
1616

1717

18+
// Sha-512 and Sha-256 use basically the same calculations which are implemented by these macros.
19+
// Inlining the calculations seems to result in better generated code.
20+
macro_rules! schedule_round( ($t:expr) => (
21+
W[$t] = sigma1(W[$t - 2]) + W[$t - 7] + sigma0(W[$t - 15]) + W[$t - 16];
22+
)
23+
)
24+
25+
macro_rules! sha2_round(
26+
($A:ident, $B:ident, $C:ident, $D:ident,
27+
$E:ident, $F:ident, $G:ident, $H:ident, $K:ident, $t:expr) => (
28+
{
29+
$H += sum1($E) + ch($E, $F, $G) + $K[$t] + W[$t];
30+
$D += $H;
31+
$H += sum0($A) + maj($A, $B, $C);
32+
}
33+
)
34+
)
35+
36+
1837
// BitCounter is a specialized structure intended simply for counting the
1938
// number of bits that have been processed by the SHA-2 512 family of functions.
2039
// It does very little overflow checking since such checking is not necessary
@@ -119,15 +138,6 @@ impl Engine512State {
119138
((x << 45) | (x >> 19)) ^ ((x << 3) | (x >> 61)) ^ (x >> 6)
120139
}
121140

122-
let mut W = [0u64, ..80];
123-
124-
read_u64v_be(W.mut_slice(0, 16), data);
125-
126-
for uint::range(16, 80) |t| {
127-
W[t] = sigma1(W[t - 2]) + W[t - 7] + sigma0(W[t - 15]) +
128-
W[t - 16];
129-
}
130-
131141
let mut a = self.H0;
132142
let mut b = self.H1;
133143
let mut c = self.H2;
@@ -137,47 +147,41 @@ impl Engine512State {
137147
let mut g = self.H6;
138148
let mut h = self.H7;
139149

140-
let mut t = 0;
141-
for uint::range(0, 10) |_| {
142-
h += sum1(e) + ch(e, f, g) + K64[t] + W[t];
143-
d += h;
144-
h += sum0(a) + maj(a, b, c);
145-
t += 1;
146-
147-
g += sum1(d) + ch(d, e, f) + K64[t] + W[t];
148-
c += g;
149-
g += sum0(h) + maj(h, a, b);
150-
t += 1;
151-
152-
f += sum1(c) + ch(c, d, e) + K64[t] + W[t];
153-
b += f;
154-
f += sum0(g) + maj(g, h, a);
155-
t += 1;
156-
157-
e += sum1(b) + ch(b, c, d) + K64[t] + W[t];
158-
a += e;
159-
e += sum0(f) + maj(f, g, h);
160-
t += 1;
161-
162-
d += sum1(a) + ch(a, b, c) + K64[t] + W[t];
163-
h += d;
164-
d += sum0(e) + maj(e, f, g);
165-
t += 1;
166-
167-
c += sum1(h) + ch(h, a, b) + K64[t] + W[t];
168-
g += c;
169-
c += sum0(d) + maj(d, e, f);
170-
t += 1;
171-
172-
b += sum1(g) + ch(g, h, a) + K64[t] + W[t];
173-
f += b;
174-
b += sum0(c) + maj(c, d, e);
175-
t += 1;
176-
177-
a += sum1(f) + ch(f, g, h) + K64[t] + W[t];
178-
e += a;
179-
a += sum0(b) + maj(b, c, d);
180-
t += 1;
150+
let mut W = [0u64, ..80];
151+
152+
read_u64v_be(W.mut_slice(0, 16), data);
153+
154+
// Putting the message schedule inside the same loop as the round calculations allows for
155+
// the compiler to generate better code.
156+
for uint::range_step(0, 64, 8) |t| {
157+
schedule_round!(t + 16);
158+
schedule_round!(t + 17);
159+
schedule_round!(t + 18);
160+
schedule_round!(t + 19);
161+
schedule_round!(t + 20);
162+
schedule_round!(t + 21);
163+
schedule_round!(t + 22);
164+
schedule_round!(t + 23);
165+
166+
sha2_round!(a, b, c, d, e, f, g, h, K64, t);
167+
sha2_round!(h, a, b, c, d, e, f, g, K64, t + 1);
168+
sha2_round!(g, h, a, b, c, d, e, f, K64, t + 2);
169+
sha2_round!(f, g, h, a, b, c, d, e, K64, t + 3);
170+
sha2_round!(e, f, g, h, a, b, c, d, K64, t + 4);
171+
sha2_round!(d, e, f, g, h, a, b, c, K64, t + 5);
172+
sha2_round!(c, d, e, f, g, h, a, b, K64, t + 6);
173+
sha2_round!(b, c, d, e, f, g, h, a, K64, t + 7);
174+
}
175+
176+
for uint::range_step(64, 80, 8) |t| {
177+
sha2_round!(a, b, c, d, e, f, g, h, K64, t);
178+
sha2_round!(h, a, b, c, d, e, f, g, K64, t + 1);
179+
sha2_round!(g, h, a, b, c, d, e, f, K64, t + 2);
180+
sha2_round!(f, g, h, a, b, c, d, e, K64, t + 3);
181+
sha2_round!(e, f, g, h, a, b, c, d, K64, t + 4);
182+
sha2_round!(d, e, f, g, h, a, b, c, K64, t + 5);
183+
sha2_round!(c, d, e, f, g, h, a, b, K64, t + 6);
184+
sha2_round!(b, c, d, e, f, g, h, a, K64, t + 7);
181185
}
182186

183187
self.H0 += a;
@@ -524,15 +528,6 @@ impl Engine256State {
524528
((x >> 17) | (x << 15)) ^ ((x >> 19) | (x << 13)) ^ (x >> 10)
525529
}
526530

527-
let mut W = [0u32, ..80];
528-
529-
read_u32v_be(W.mut_slice(0, 16), data);
530-
531-
for uint::range(16, 64) |t| {
532-
W[t] = sigma1(W[t - 2]) + W[t - 7] + sigma0(W[t - 15]) +
533-
W[t - 16];
534-
}
535-
536531
let mut a = self.H0;
537532
let mut b = self.H1;
538533
let mut c = self.H2;
@@ -542,47 +537,41 @@ impl Engine256State {
542537
let mut g = self.H6;
543538
let mut h = self.H7;
544539

545-
let mut t = 0;
546-
for uint::range(0, 8) |_| {
547-
h += sum1(e) + ch(e, f, g) + K32[t] + W[t];
548-
d += h;
549-
h += sum0(a) + maj(a, b, c);
550-
t += 1;
551-
552-
g += sum1(d) + ch(d, e, f) + K32[t] + W[t];
553-
c += g;
554-
g += sum0(h) + maj(h, a, b);
555-
t += 1;
556-
557-
f += sum1(c) + ch(c, d, e) + K32[t] + W[t];
558-
b += f;
559-
f += sum0(g) + maj(g, h, a);
560-
t += 1;
561-
562-
e += sum1(b) + ch(b, c, d) + K32[t] + W[t];
563-
a += e;
564-
e += sum0(f) + maj(f, g, h);
565-
t += 1;
566-
567-
d += sum1(a) + ch(a, b, c) + K32[t] + W[t];
568-
h += d;
569-
d += sum0(e) + maj(e, f, g);
570-
t += 1;
571-
572-
c += sum1(h) + ch(h, a, b) + K32[t] + W[t];
573-
g += c;
574-
c += sum0(d) + maj(d, e, f);
575-
t += 1;
576-
577-
b += sum1(g) + ch(g, h, a) + K32[t] + W[t];
578-
f += b;
579-
b += sum0(c) + maj(c, d, e);
580-
t += 1;
581-
582-
a += sum1(f) + ch(f, g, h) + K32[t] + W[t];
583-
e += a;
584-
a += sum0(b) + maj(b, c, d);
585-
t += 1;
540+
let mut W = [0u32, ..64];
541+
542+
read_u32v_be(W.mut_slice(0, 16), data);
543+
544+
// Putting the message schedule inside the same loop as the round calculations allows for
545+
// the compiler to generate better code.
546+
for uint::range_step(0, 48, 8) |t| {
547+
schedule_round!(t + 16);
548+
schedule_round!(t + 17);
549+
schedule_round!(t + 18);
550+
schedule_round!(t + 19);
551+
schedule_round!(t + 20);
552+
schedule_round!(t + 21);
553+
schedule_round!(t + 22);
554+
schedule_round!(t + 23);
555+
556+
sha2_round!(a, b, c, d, e, f, g, h, K32, t);
557+
sha2_round!(h, a, b, c, d, e, f, g, K32, t + 1);
558+
sha2_round!(g, h, a, b, c, d, e, f, K32, t + 2);
559+
sha2_round!(f, g, h, a, b, c, d, e, K32, t + 3);
560+
sha2_round!(e, f, g, h, a, b, c, d, K32, t + 4);
561+
sha2_round!(d, e, f, g, h, a, b, c, K32, t + 5);
562+
sha2_round!(c, d, e, f, g, h, a, b, K32, t + 6);
563+
sha2_round!(b, c, d, e, f, g, h, a, K32, t + 7);
564+
}
565+
566+
for uint::range_step(48, 64, 8) |t| {
567+
sha2_round!(a, b, c, d, e, f, g, h, K32, t);
568+
sha2_round!(h, a, b, c, d, e, f, g, K32, t + 1);
569+
sha2_round!(g, h, a, b, c, d, e, f, K32, t + 2);
570+
sha2_round!(f, g, h, a, b, c, d, e, K32, t + 3);
571+
sha2_round!(e, f, g, h, a, b, c, d, K32, t + 4);
572+
sha2_round!(d, e, f, g, h, a, b, c, K32, t + 5);
573+
sha2_round!(c, d, e, f, g, h, a, b, K32, t + 6);
574+
sha2_round!(b, c, d, e, f, g, h, a, K32, t + 7);
586575
}
587576

588577
self.H0 += a;

0 commit comments

Comments
 (0)