Skip to content

Commit b212c68

Browse files
klauspost authored and bradfitz committed
hash/crc32: use slicing by 8 for Castagnoli and smaller sizes
This adds "slicing by 8" optimization to Castagnoli tables which will speed up CRC32 calculation on systems without assembler, which are all but AMD64. In my tests, it is faster to use "slicing by 8" for sizes all down to 16 bytes, so the switchover point has been adjusted. There are no benchmarks for small sizes, so I have added one for 40 bytes, as well as one for bigger sizes (32KB). Castagnoli, No assembler, 40 Byte payload: (before, after) BenchmarkCastagnoli40B-4 10000000 161 ns/op 246.94 MB/s BenchmarkCastagnoli40B-4 20000000 100 ns/op 398.01 MB/s Castagnoli, No assembler, 32KB payload: (before, after) BenchmarkCastagnoli32KB-4 10000 115426 ns/op 283.89 MB/s BenchmarkCastagnoli32KB-4 30000 45171 ns/op 725.41 MB/s IEEE, No assembler, 1KB payload: (before, after) BenchmarkCrc1KB-4 500000 3604 ns/op 284.10 MB/s BenchmarkCrc1KB-4 1000000 1463 ns/op 699.79 MB/s Compared: benchmark old ns/op new ns/op delta BenchmarkCastagnoli40B-4 161 100 -37.89% BenchmarkCastagnoli32KB-4 115426 45171 -60.87% BenchmarkCrc1KB-4 3604 1463 -59.41% benchmark old MB/s new MB/s speedup BenchmarkCastagnoli40B-4 246.94 398.01 1.61x BenchmarkCastagnoli32KB-4 283.89 725.41 2.56x BenchmarkCrc1KB-4 284.10 699.79 2.46x Change-Id: I303e4ec84e8d4dafd057d64c0e43deb2b498e968 Reviewed-on: https://go-review.googlesource.com/19335 Run-TryBot: Brad Fitzpatrick <[email protected]> TryBot-Result: Gobot Gobot <[email protected]> Reviewed-by: Brad Fitzpatrick <[email protected]>
1 parent 3921427 commit b212c68

File tree

5 files changed

+62
-39
lines changed

5 files changed

+62
-39
lines changed

src/hash/crc32/crc32.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ import (
2020
// The size of a CRC-32 checksum in bytes.
2121
const Size = 4
2222

23+
// Use "slice by 8" when payload >= this value.
24+
const sliceBy8Cutoff = 16
25+
2326
// Predefined polynomials.
2427
const (
2528
// IEEE is by far and away the most common CRC-32 polynomial.
@@ -45,10 +48,12 @@ type Table [256]uint32
4548
// Castagnoli table so we can compare against it to find when the caller is
4649
// using this polynomial.
4750
var castagnoliTable *Table
51+
var castagnoliTable8 *slicing8Table
4852
var castagnoliOnce sync.Once
4953

5054
func castagnoliInit() {
5155
castagnoliTable = makeTable(Castagnoli)
56+
castagnoliTable8 = makeTable8(Castagnoli)
5257
}
5358

5459
// IEEETable is the table for the IEEE polynomial.
@@ -146,6 +151,9 @@ func updateSlicingBy8(crc uint32, tab *slicing8Table, p []byte) uint32 {
146151
p = p[8:]
147152
}
148153
crc = ^crc
154+
if len(p) == 0 {
155+
return crc
156+
}
149157
return update(crc, &tab[0], p)
150158
}
151159

@@ -178,4 +186,4 @@ func Checksum(data []byte, tab *Table) uint32 { return Update(0, tab, data) }
178186

179187
// ChecksumIEEE returns the CRC-32 checksum of data
180188
// using the IEEE polynomial.
181-
func ChecksumIEEE(data []byte) uint32 { return Update(0, IEEETable, data) }
189+
func ChecksumIEEE(data []byte) uint32 { return updateIEEE(0, data) }

src/hash/crc32/crc32_amd64.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@ func updateCastagnoli(crc uint32, p []byte) uint32 {
3030
if sse42 {
3131
return castagnoliSSE42(crc, p)
3232
}
33+
// Use slicing-by-8 on larger inputs.
34+
if len(p) >= sliceBy8Cutoff {
35+
return updateSlicingBy8(crc, castagnoliTable8, p)
36+
}
3337
return update(crc, castagnoliTable, p)
3438
}
3539

@@ -44,8 +48,8 @@ func updateIEEE(crc uint32, p []byte) uint32 {
4448
return crc
4549
}
4650

47-
// only use slicing-by-8 when input is >= 4KB
48-
if len(p) >= 4096 {
51+
// Use slicing-by-8 on larger inputs.
52+
if len(p) >= sliceBy8Cutoff {
4953
ieeeTable8Once.Do(func() {
5054
ieeeTable8 = makeTable8(IEEE)
5155
})

src/hash/crc32/crc32_amd64p32.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,16 @@ func updateCastagnoli(crc uint32, p []byte) uint32 {
2222
if sse42 {
2323
return castagnoliSSE42(crc, p)
2424
}
25+
// Use slicing-by-8 on larger inputs.
26+
if len(p) >= sliceBy8Cutoff {
27+
return updateSlicingBy8(crc, castagnoliTable8, p)
28+
}
2529
return update(crc, castagnoliTable, p)
2630
}
2731

2832
func updateIEEE(crc uint32, p []byte) uint32 {
29-
// only use slicing-by-8 when input is >= 4KB
30-
if len(p) >= 4096 {
33+
// Use slicing-by-8 on larger inputs.
34+
if len(p) >= sliceBy8Cutoff {
3135
ieeeTable8Once.Do(func() {
3236
ieeeTable8 = makeTable8(IEEE)
3337
})

src/hash/crc32/crc32_generic.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,20 @@
66

77
package crc32
88

9-
// The file contains the generic version of updateCastagnoli which just calls
10-
// the software implementation.
9+
// This file contains the generic version of updateCastagnoli which does
10+
// slicing-by-8, or uses the fallback for very small sizes.
1111

1212
func updateCastagnoli(crc uint32, p []byte) uint32 {
13+
// Use slicing-by-8 on larger inputs.
14+
if len(p) >= sliceBy8Cutoff {
15+
return updateSlicingBy8(crc, castagnoliTable8, p)
16+
}
1317
return update(crc, castagnoliTable, p)
1418
}
1519

1620
func updateIEEE(crc uint32, p []byte) uint32 {
17-
// only use slicing-by-8 when input is >= 4KB
18-
if len(p) >= 4096 {
21+
// Use slicing-by-8 on larger inputs.
22+
if len(p) >= sliceBy8Cutoff {
1923
ieeeTable8Once.Do(func() {
2024
ieeeTable8 = makeTable8(IEEE)
2125
})

src/hash/crc32/crc32_test.go

Lines changed: 33 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
package crc32
66

77
import (
8+
"hash"
89
"io"
910
"testing"
1011
)
@@ -81,49 +82,51 @@ func TestGolden(t *testing.T) {
8182
}
8283
}
8384

84-
func BenchmarkIEEECrc1KB(b *testing.B) {
85-
b.SetBytes(1024)
86-
data := make([]byte, 1024)
87-
for i := range data {
88-
data[i] = byte(i)
89-
}
90-
h := NewIEEE()
91-
in := make([]byte, 0, h.Size())
85+
func BenchmarkIEEECrc40B(b *testing.B) {
86+
benchmark(b, NewIEEE(), 40)
87+
}
9288

93-
b.ResetTimer()
94-
for i := 0; i < b.N; i++ {
95-
h.Reset()
96-
h.Write(data)
97-
h.Sum(in)
98-
}
89+
func BenchmarkIEEECrc1KB(b *testing.B) {
90+
benchmark(b, NewIEEE(), 1<<10)
9991
}
10092

10193
func BenchmarkIEEECrc4KB(b *testing.B) {
102-
b.SetBytes(4096)
103-
data := make([]byte, 4096)
104-
for i := range data {
105-
data[i] = byte(i)
106-
}
107-
h := NewIEEE()
108-
in := make([]byte, 0, h.Size())
94+
benchmark(b, NewIEEE(), 4<<10)
95+
}
10996

110-
b.ResetTimer()
111-
for i := 0; i < b.N; i++ {
112-
h.Reset()
113-
h.Write(data)
114-
h.Sum(in)
115-
}
97+
func BenchmarkIEEECrc32KB(b *testing.B) {
98+
benchmark(b, NewIEEE(), 32<<10)
99+
}
100+
101+
func BenchmarkCastagnoliCrc40B(b *testing.B) {
102+
benchmark(b, New(MakeTable(Castagnoli)), 40)
116103
}
117104

118105
func BenchmarkCastagnoliCrc1KB(b *testing.B) {
119-
b.SetBytes(1024)
120-
data := make([]byte, 1024)
106+
benchmark(b, New(MakeTable(Castagnoli)), 1<<10)
107+
}
108+
109+
func BenchmarkCastagnoliCrc4KB(b *testing.B) {
110+
benchmark(b, New(MakeTable(Castagnoli)), 4<<10)
111+
}
112+
113+
func BenchmarkCastagnoliCrc32KB(b *testing.B) {
114+
benchmark(b, New(MakeTable(Castagnoli)), 32<<10)
115+
}
116+
117+
func benchmark(b *testing.B, h hash.Hash32, n int64) {
118+
b.SetBytes(n)
119+
data := make([]byte, n)
121120
for i := range data {
122121
data[i] = byte(i)
123122
}
124-
h := New(MakeTable(Castagnoli))
125123
in := make([]byte, 0, h.Size())
126124

125+
// Warm up
126+
h.Reset()
127+
h.Write(data)
128+
h.Sum(in)
129+
127130
b.ResetTimer()
128131
for i := 0; i < b.N; i++ {
129132
h.Reset()

0 commit comments

Comments
 (0)