Skip to content

Commit 7f99845

Browse files
committed
crypto/md5: simplify generic implementation
This change uses library functions such as bits.RotateLeft32 to reduce the amount of code needed in the generic implementation. Since the code is now shorter I've also removed the option to generate a non-unrolled version of the code.

I've also tried to remove bounds checks where possible to make the new version performant, however that is not the primary goal of this change since most architectures have assembly implementations already.

Assembly performance:

name                 old speed      new speed      delta
Hash8Bytes           50.3MB/s ± 1%  59.1MB/s ± 0%  +17.63%  (p=0.000 n=9+8)
Hash1K                590MB/s ± 0%   597MB/s ± 0%   +1.25%  (p=0.000 n=9+9)
Hash8K                636MB/s ± 1%   638MB/s ± 1%     ~     (p=0.072 n=10+10)
Hash8BytesUnaligned  50.5MB/s ± 0%  59.1MB/s ± 1%  +17.09%  (p=0.000 n=10+10)
Hash1KUnaligned       589MB/s ± 1%   596MB/s ± 1%   +1.23%  (p=0.000 n=9+10)
Hash8KUnaligned       638MB/s ± 1%   640MB/s ± 0%   +0.35%  (p=0.002 n=10+10)

Pure Go performance:

name                 old speed      new speed      delta
Hash8Bytes           30.3MB/s ± 1%  42.8MB/s ± 0%  +41.20%  (p=0.000 n=9+9)
Hash1K                364MB/s ± 4%   394MB/s ± 1%   +8.27%  (p=0.000 n=10+10)
Hash8K                404MB/s ± 1%   420MB/s ± 0%   +4.17%  (p=0.000 n=10+9)
Hash8BytesUnaligned  30.3MB/s ± 1%  42.8MB/s ± 1%  +40.92%  (p=0.000 n=9+10)
Hash1KUnaligned       368MB/s ± 0%   394MB/s ± 0%   +7.07%  (p=0.000 n=9+9)
Hash8KUnaligned       404MB/s ± 1%   411MB/s ± 3%   +1.91%  (p=0.026 n=9+10)

Change-Id: I9a91fb52ea8d62964d5351bdf121e9fbc9282852
Reviewed-on: https://go-review.googlesource.com/c/137355
Run-TryBot: Michael Munday <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
Reviewed-by: Brad Fitzpatrick <[email protected]>
1 parent f108158 commit 7f99845

File tree

5 files changed

+213
-433
lines changed

5 files changed

+213
-433
lines changed

src/crypto/md5/gen.go

+62-133
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,7 @@
77
// This program generates md5block.go
88
// Invoke as
99
//
10-
// go run gen.go [-full] -output md5block.go
11-
//
12-
// The -full flag causes the generated code to do a full
13-
// (16x) unrolling instead of a 4x unrolling.
10+
// go run gen.go -output md5block.go
1411

1512
package main
1613

@@ -56,13 +53,14 @@ type Data struct {
5653
Table2 []uint32
5754
Table3 []uint32
5855
Table4 []uint32
59-
Full bool
6056
}
6157

6258
// funcs maps the helper names that template actions may call ("dup",
// "relabel", "rotate", "idx", "seq") to the Go functions implementing them.
var funcs = template.FuncMap{
6359
"dup": dup,
6460
"relabel": relabel,
6561
"rotate": rotate,
62+
"idx": idx,
63+
"seq": seq,
6664
}
6765

6866
func dup(count int, x []int) []int {
@@ -74,16 +72,35 @@ func dup(count int, x []int) []int {
7472
}
7573

7674
// relabel returns s with every placeholder operand name replaced by the
// variable name currently stored in data.a, data.b, data.c and data.d.
// The removed line used the bare letters a-d as placeholders; the added line
// uses arg0-arg3, so only those explicit tokens in s are rewritten.
func relabel(s string) string {
77-
return strings.NewReplacer("a", data.a, "b", data.b, "c", data.c, "d", data.d).Replace(s)
75+
return strings.NewReplacer("arg0", data.a, "arg1", data.b, "arg2", data.c, "arg3", data.d).Replace(s)
7876
}
7977

8078
// rotate cycles the variable names held in data by one position: a takes d's
// name, b takes a's, c takes b's and d takes c's. Because relabel reads these
// fields, the next relabel call substitutes the rotated names.
// It returns the empty string so that invoking it from the template emits
// no text.
func rotate() string {
8179
// Simultaneous assignment: all right-hand values are read before any write.
data.a, data.b, data.c, data.d = data.d, data.a, data.b, data.c
8280
return "" // no output
8381
}
8482

85-
func init() {
86-
flag.BoolVar(&data.Full, "full", false, "complete unrolling")
83+
// idx returns the number of the message word (the N in the generated xN
// variable) that step index of the given MD5 round reads.
//
// round is 1-based and must be 1, 2, 3 or 4; index is the 0-based step within
// the round. The word order per round is:
//
//	round 1: index
//	round 2: (1 + 5*index) mod 16
//	round 3: (5 + 3*index) mod 16
//	round 4: (7*index) mod 16
//
// For index in 0..15 the result is always in 0..15. Round 1 returns index
// unmasked, exactly as before.
//
// idx panics if round is not in 1..4. Previously such a call silently
// returned 0, which would make the generator emit x0 as the operand and
// produce an incorrect hash implementation without any diagnostic.
func idx(round, index int) int {
	switch round {
	case 1:
		return index
	case 2:
		return (1 + 5*index) & 15
	case 3:
		return (5 + 3*index) & 15
	case 4:
		return (7 * index) & 15
	}
	panic("idx: round must be between 1 and 4")
}
97+
98+
// seq returns the ascending sequence 0, 1, ..., i-1 as a slice, so that a
// template can range over a fixed number of iterations.
//
// A count of zero or less yields an empty (non-nil) slice. Previously a
// negative count crashed inside make with a runtime "len out of range" panic.
func seq(i int) []int {
	if i < 0 {
		i = 0
	}
	s := make([]int, i)
	// Use a distinct loop variable; the original shadowed the parameter i.
	for n := range s {
		s[n] = n
	}
	return s
}
88105

89106
var data = Data{
@@ -179,152 +196,64 @@ var program = `// Copyright 2013 The Go Authors. All rights reserved.
179196
// Use of this source code is governed by a BSD-style
180197
// license that can be found in the LICENSE file.
181198
182-
// Code generated by go run gen.go{{if .Full}} -full{{end}} -output md5block.go; DO NOT EDIT.
199+
// Code generated by go run gen.go -output md5block.go; DO NOT EDIT.
183200
184201
package md5
185202
186203
import (
187-
"unsafe"
188-
"runtime"
204+
"encoding/binary"
205+
"math/bits"
189206
)
190207
191-
{{if not .Full}}
192-
var t1 = [...]uint32{
193-
{{range .Table1}}{{printf "\t%#x,\n" .}}{{end}}
194-
}
195-
196-
var t2 = [...]uint32{
197-
{{range .Table2}}{{printf "\t%#x,\n" .}}{{end}}
198-
}
199-
200-
var t3 = [...]uint32{
201-
{{range .Table3}}{{printf "\t%#x,\n" .}}{{end}}
202-
}
203-
204-
var t4 = [...]uint32{
205-
{{range .Table4}}{{printf "\t%#x,\n" .}}{{end}}
206-
}
207-
{{end}}
208-
209-
const x86 = runtime.GOARCH == "amd64" || runtime.GOARCH == "386"
210-
211-
var littleEndian bool
208+
func blockGeneric(dig *digest, p []byte) {
209+
// load state
210+
a, b, c, d := dig.s[0], dig.s[1], dig.s[2], dig.s[3]
212211
213-
func init() {
214-
x := uint32(0x04030201)
215-
y := [4]byte{0x1, 0x2, 0x3, 0x4}
216-
littleEndian = *(*[4]byte)(unsafe.Pointer(&x)) == y
217-
}
212+
for i := 0; i <= len(p)-BlockSize; i += BlockSize {
213+
// eliminate bounds checks on p
214+
q := p[i:]
215+
q = q[:BlockSize:BlockSize]
218216
219-
func blockGeneric(dig *digest, p []byte) {
220-
a := dig.s[0]
221-
b := dig.s[1]
222-
c := dig.s[2]
223-
d := dig.s[3]
224-
var X *[16]uint32
225-
var xbuf [16]uint32
226-
for len(p) >= chunk {
217+
// save current state
227218
aa, bb, cc, dd := a, b, c, d
228219
229-
// This is a constant condition - it is not evaluated on each iteration.
230-
if x86 {
231-
// MD5 was designed so that x86 processors can just iterate
232-
// over the block data directly as uint32s, and we generate
233-
// less code and run 1.3x faster if we take advantage of that.
234-
// My apologies.
235-
X = (*[16]uint32)(unsafe.Pointer(&p[0]))
236-
} else if littleEndian && uintptr(unsafe.Pointer(&p[0]))&(unsafe.Alignof(uint32(0))-1) == 0 {
237-
X = (*[16]uint32)(unsafe.Pointer(&p[0]))
238-
} else {
239-
X = &xbuf
240-
j := 0
241-
for i := 0; i < 16; i++ {
242-
X[i&15] = uint32(p[j]) | uint32(p[j+1])<<8 | uint32(p[j+2])<<16 | uint32(p[j+3])<<24
243-
j += 4
244-
}
245-
}
220+
// load input block
221+
{{range $i := seq 16 -}}
222+
{{printf "x%x := binary.LittleEndian.Uint32(q[4*%#x:])" $i $i}}
223+
{{end}}
246224
247-
{{if .Full}}
248-
// Round 1.
249-
{{range $i, $s := dup 4 .Shift1}}
250-
{{index $.Table1 $i | printf "a += (((c^d)&b)^d) + X[%d] + %d" $i | relabel}}
251-
{{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
252-
{{rotate}}
253-
{{end}}
254-
255-
// Round 2.
256-
{{range $i, $s := dup 4 .Shift2}}
257-
{{index $.Table2 $i | printf "a += (((b^c)&d)^c) + X[(1+5*%d)&15] + %d" $i | relabel}}
258-
{{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
259-
{{rotate}}
260-
{{end}}
261-
262-
// Round 3.
263-
{{range $i, $s := dup 4 .Shift3}}
264-
{{index $.Table3 $i | printf "a += (b^c^d) + X[(5+3*%d)&15] + %d" $i | relabel}}
265-
{{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
266-
{{rotate}}
267-
{{end}}
268-
269-
// Round 4.
270-
{{range $i, $s := dup 4 .Shift4}}
271-
{{index $.Table4 $i | printf "a += (c^(b|^d)) + X[(7*%d)&15] + %d" $i | relabel}}
272-
{{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
273-
{{rotate}}
274-
{{end}}
275-
{{else}}
276-
// Round 1.
277-
for i := uint(0); i < 16; {
278-
{{range $s := .Shift1}}
279-
{{printf "a += (((c^d)&b)^d) + X[i&15] + t1[i&15]" | relabel}}
280-
{{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
281-
i++
282-
{{rotate}}
283-
{{end}}
284-
}
225+
// round 1
226+
{{range $i, $s := dup 4 .Shift1 -}}
227+
{{printf "arg0 = arg1 + bits.RotateLeft32((((arg2^arg3)&arg1)^arg3)+arg0+x%x+%#08x, %d)" (idx 1 $i) (index $.Table1 $i) $s | relabel}}
228+
{{rotate -}}
229+
{{end}}
285230
286-
// Round 2.
287-
for i := uint(0); i < 16; {
288-
{{range $s := .Shift2}}
289-
{{printf "a += (((b^c)&d)^c) + X[(1+5*i)&15] + t2[i&15]" | relabel}}
290-
{{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
291-
i++
292-
{{rotate}}
293-
{{end}}
294-
}
231+
// round 2
232+
{{range $i, $s := dup 4 .Shift2 -}}
233+
{{printf "arg0 = arg1 + bits.RotateLeft32((((arg1^arg2)&arg3)^arg2)+arg0+x%x+%#08x, %d)" (idx 2 $i) (index $.Table2 $i) $s | relabel}}
234+
{{rotate -}}
235+
{{end}}
295236
296-
// Round 3.
297-
for i := uint(0); i < 16; {
298-
{{range $s := .Shift3}}
299-
{{printf "a += (b^c^d) + X[(5+3*i)&15] + t3[i&15]" | relabel}}
300-
{{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
301-
i++
302-
{{rotate}}
303-
{{end}}
304-
}
237+
// round 3
238+
{{range $i, $s := dup 4 .Shift3 -}}
239+
{{printf "arg0 = arg1 + bits.RotateLeft32((arg1^arg2^arg3)+arg0+x%x+%#08x, %d)" (idx 3 $i) (index $.Table3 $i) $s | relabel}}
240+
{{rotate -}}
241+
{{end}}
305242
306-
// Round 4.
307-
for i := uint(0); i < 16; {
308-
{{range $s := .Shift4}}
309-
{{printf "a += (c^(b|^d)) + X[(7*i)&15] + t4[i&15]" | relabel}}
310-
{{printf "a = a<<%d | a>>(32-%d) + b" $s $s | relabel}}
311-
i++
312-
{{rotate}}
313-
{{end}}
314-
}
243+
// round 4
244+
{{range $i, $s := dup 4 .Shift4 -}}
245+
{{printf "arg0 = arg1 + bits.RotateLeft32((arg2^(arg1|^arg3))+arg0+x%x+%#08x, %d)" (idx 4 $i) (index $.Table4 $i) $s | relabel}}
246+
{{rotate -}}
315247
{{end}}
316248
249+
// add saved state
317250
a += aa
318251
b += bb
319252
c += cc
320253
d += dd
321-
322-
p = p[chunk:]
323254
}
324255
325-
dig.s[0] = a
326-
dig.s[1] = b
327-
dig.s[2] = c
328-
dig.s[3] = d
256+
// save state
257+
dig.s[0], dig.s[1], dig.s[2], dig.s[3] = a, b, c, d
329258
}
330259
`

0 commit comments

Comments
 (0)