4
4
5
5
package crc32
6
6
7
+ import "unsafe"
8
+
7
9
// This file contains the code to call the SSE 4.2 version of the Castagnoli
8
10
// and IEEE CRC.
9
11
@@ -13,11 +15,20 @@ func haveSSE41() bool
13
15
func haveSSE42 () bool
14
16
func haveCLMUL () bool
15
17
16
- // castagnoliSSE42 is defined in crc_amd64 .s and uses the SSE4.2 CRC32
18
+ // castagnoliSSE42 is defined in crc32_amd64.s and uses the SSE4.2 CRC32
17
19
// instruction. updateCastagnoli in this file inverts crc before calling it
// and inverts the final result afterwards, so it is used here on
// non-inverted CRC values.
18
20
//go:noescape
19
21
func castagnoliSSE42 (crc uint32 , p []byte ) uint32
20
22
23
+ // castagnoliSSE42Triple is defined in crc32_amd64.s and uses the SSE4.2 CRC32
24
+ // instruction. It takes three initial CRC values (crcA, crcB, crcC) paired
// with three buffers (a, b, c) and returns the three updated values as
// retA, retB and retC.
// NOTE(review): callers in this file pass rounds = K/24 for pieces of K
// bytes, so each round presumably consumes 24 bytes from each buffer --
// confirm against the assembly.
25
+ //go:noescape
26
+ func castagnoliSSE42Triple (
27
+ crcA , crcB , crcC uint32 ,
28
+ a , b , c []byte ,
29
+ rounds uint32 ,
30
+ ) (retA uint32 , retB uint32 , retC uint32 )
31
+
21
32
// ieeeCLMUL is defined in crc32_amd64.s and uses the PCLMULQDQ
22
33
// instruction as well as SSE 4.1.
23
34
//go:noescape
@@ -26,15 +37,160 @@ func ieeeCLMUL(crc uint32, p []byte) uint32
26
37
// sse42 holds the result of haveSSE42; it gates the SSE4.2 code paths in this
// file.
var sse42 = haveSSE42 ()
27
38
// useFastIEEE is true only when both haveCLMUL and haveSSE41 report true.
var useFastIEEE = haveCLMUL () && haveSSE41 ()
28
39
40
// castagnoliK1 and castagnoliK2 are the piece lengths, in bytes, used by
// updateCastagnoli when it processes three pieces per step: 3*168 = 504
// bytes and 3*1344 = 4032 bytes.
+ const castagnoliK1 = 168
41
+ const castagnoliK2 = 1344
42
+
43
// sse42Table is a group of four tables, indexed in this file as
// [byte position][byte value].
+ type sse42Table [4 ]Table
44
+
45
// castagnoliSSE42TableK1 and castagnoliSSE42TableK2 are allocated and filled
// by castagnoliInitArch (for castagnoliK1 and castagnoliK2 zero bytes
// respectively); they stay nil when sse42 is false.
+ var castagnoliSSE42TableK1 * sse42Table
46
+ var castagnoliSSE42TableK2 * sse42Table
47
+
48
// castagnoliInitArch builds the lookup tables used by the SSE4.2 path of
// updateCastagnoli. When sse42 is false it builds nothing and returns true
// (needGenericTables); otherwise it allocates and fills both
// castagnoliSSE42TableK1 and castagnoliSSE42TableK2 and returns false.
+ func castagnoliInitArch () (needGenericTables bool ) {
49
+ if ! sse42 {
50
+ return true
51
+ }
52
+ castagnoliSSE42TableK1 = new (sse42Table )
53
+ castagnoliSSE42TableK2 = new (sse42Table )
54
+ // See description in updateCastagnoli.
55
+ // t[0][i] = CRC(i000, O)
56
+ // t[1][i] = CRC(0i00, O)
57
+ // t[2][i] = CRC(00i0, O)
58
+ // t[3][i] = CRC(000i, O)
59
+ // where O is a sequence of K zeros.
60
// tmp is never written, so it is all zero bytes: the full array serves as
// O for K2 and its first castagnoliK1 bytes serve as O for K1.
+ var tmp [castagnoliK2 ]byte
61
+ for b := 0 ; b < 4 ; b ++ {
62
+ for i := 0 ; i < 256 ; i ++ {
63
// Place the byte value i at byte position b of the 32-bit initial CRC.
+ val := uint32 (i ) << uint32 (b * 8 )
64
+ castagnoliSSE42TableK1 [b ][i ] = castagnoliSSE42 (val , tmp [:castagnoliK1 ])
65
+ castagnoliSSE42TableK2 [b ][i ] = castagnoliSSE42 (val , tmp [:])
66
+ }
67
+ }
68
+ return false
69
+ }
70
+
71
+ // castagnoliShift computes the CRC32-C of K1 or K2 zeroes (depending on the
72
+ // table given) with the given initial crc value. This corresponds to
73
+ // CRC(crc, O) in the description in updateCastagnoli. Each of the four bytes
// of crc selects one entry from the table for its byte position, and the four
// entries are XORed together.
74
+ func castagnoliShift (table * sse42Table , crc uint32 ) uint32 {
75
+ return table [3 ][crc >> 24 ] ^
76
+ table [2 ][(crc >> 16 )& 0xFF ] ^
77
+ table [1 ][(crc >> 8 )& 0xFF ] ^
78
+ table [0 ][crc & 0xFF ]
79
+ }
80
+
29
81
// updateCastagnoli returns crc updated with the bytes of p.
//
// When sse42 is false it uses updateSlicingBy8 for len(p) >= sliceBy8Cutoff
// and the plain table-driven update otherwise. When sse42 is true it inverts
// crc, optionally consumes a few leading bytes so that p starts on an 8-byte
// boundary, processes p in steps of 3*castagnoliK2 and then 3*castagnoliK1
// bytes with castagnoliSSE42Triple, finishes the remainder with
// castagnoliSSE42 and returns the inverted result.
func updateCastagnoli (crc uint32 , p []byte ) uint32 {
30
- if sse42 {
31
- return castagnoliSSE42 (crc , p )
82
+ if ! sse42 {
83
+ // Use slicing-by-8 on larger inputs.
84
+ if len (p ) >= sliceBy8Cutoff {
85
+ return updateSlicingBy8 (crc , castagnoliTable8 , p )
86
+ }
87
+ return update (crc , castagnoliTable , p )
32
88
}
33
- // Use slicing-by-8 on larger inputs.
34
- if len (p ) >= sliceBy8Cutoff {
35
- return updateSlicingBy8 (crc , castagnoliTable8 , p )
89
+
90
+ // This method is inspired by the algorithm in Intel's white paper:
91
+ // "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction"
92
+ // The same strategy of splitting the buffer in three is used but the
93
+ // combining calculation is different; the complete derivation is explained
94
+ // below.
95
+ //
96
+ // -- The basic idea --
97
+ //
98
+ // The CRC32 instruction (available in SSE4.2) can process 8 bytes at a
99
+ // time. In recent Intel architectures the instruction takes 3 cycles;
100
+ // however the processor can pipeline up to three instructions if they
101
+ // don't depend on each other.
102
+ //
103
+ // Roughly this means that we can process three buffers in about the same
104
+ // time we can process one buffer.
105
+ //
106
+ // The idea is then to split the buffer in three, CRC the three pieces
107
+ // separately and then combine the results.
108
+ //
109
+ // Combining the results requires precomputed tables, so we must choose a
110
+ // fixed buffer length to optimize. The longer the length, the faster; but
111
+ // only buffers longer than this length will use the optimization. We choose
112
+ // two cutoffs and compute tables for both:
113
+ // - one around 512: 168*3=504
114
+ // - one around 4KB: 1344*3=4032
115
+ //
116
+ // -- The nitty gritty --
117
+ //
118
+ // Let CRC(I, X) be the non-inverted CRC32-C of the sequence X (with
119
+ // initial non-inverted CRC I). This function has the following properties:
120
+ // (a) CRC(I, AB) = CRC(CRC(I, A), B)
121
+ // (b) CRC(I, A xor B) = CRC(I, A) xor CRC(0, B)
122
+ //
123
+ // Say we want to compute CRC(I, ABC) where A, B, C are three sequences of
124
+ // K bytes each, where K is a fixed constant. Let O be the sequence of K zero
125
+ // bytes.
126
+ //
127
+ // CRC(I, ABC) = CRC(I, ABO xor C)
128
+ // = CRC(I, ABO) xor CRC(0, C)
129
+ // = CRC(CRC(I, AB), O) xor CRC(0, C)
130
+ // = CRC(CRC(I, AO xor B), O) xor CRC(0, C)
131
+ // = CRC(CRC(I, AO) xor CRC(0, B), O) xor CRC(0, C)
132
+ // = CRC(CRC(CRC(I, A), O) xor CRC(0, B), O) xor CRC(0, C)
133
+ //
134
+ // The castagnoliSSE42Triple function can compute CRC(I, A), CRC(0, B),
135
+ // and CRC(0, C) efficiently. We just need to find a way to quickly compute
136
+ // CRC(uvwx, O) given a 4-byte initial value uvwx. We can precompute these
137
+ // values; since a table indexed by a full 32-bit value is impractical, we
138
+ // break it up into four 8-bit tables:
139
+ //
140
+ // CRC(uvwx, O) = CRC(u000, O) xor
141
+ // CRC(0v00, O) xor
142
+ // CRC(00w0, O) xor
143
+ // CRC(000x, O)
144
+ //
145
+ // We can compute tables corresponding to the four terms for all 8-bit
146
+ // values.
147
+
148
// Switch to the non-inverted CRC used throughout the derivation above; the
// result is inverted back just before returning.
+ crc = ^ crc
149
+
150
+ // If a buffer is long enough to use the optimization, process the first few
151
+ // bytes to align the buffer to an 8 byte boundary (if necessary).
152
+ if len (p ) >= castagnoliK1 * 3 {
153
+ delta := int (uintptr (unsafe .Pointer (& p [0 ])) & 7 )
154
+ if delta != 0 {
155
+ delta = 8 - delta
156
+ crc = castagnoliSSE42 (crc , p [:delta ])
157
+ p = p [delta :]
158
+ }
36
159
}
37
- return update (crc , castagnoliTable , p )
160
+
161
+ // Process 3*K2 at a time.
162
+ for len (p ) >= castagnoliK2 * 3 {
163
+ // Compute CRC(I, A), CRC(0, B), and CRC(0, C).
// NOTE(review): rounds is K2/24, which suggests each round covers 24 bytes
// of every piece -- confirm against castagnoliSSE42Triple's assembly.
164
+ crcA , crcB , crcC := castagnoliSSE42Triple (
165
+ crc , 0 , 0 ,
166
+ p , p [castagnoliK2 :], p [castagnoliK2 * 2 :],
167
+ castagnoliK2 / 24 )
168
+
169
+ // CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B)
170
+ crcAB := castagnoliShift (castagnoliSSE42TableK2 , crcA ) ^ crcB
171
+ // CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C)
172
+ crc = castagnoliShift (castagnoliSSE42TableK2 , crcAB ) ^ crcC
173
+ p = p [castagnoliK2 * 3 :]
174
+ }
175
+
176
+ // Process 3*K1 at a time.
177
+ for len (p ) >= castagnoliK1 * 3 {
178
+ // Compute CRC(I, A), CRC(0, B), and CRC(0, C).
179
+ crcA , crcB , crcC := castagnoliSSE42Triple (
180
+ crc , 0 , 0 ,
181
+ p , p [castagnoliK1 :], p [castagnoliK1 * 2 :],
182
+ castagnoliK1 / 24 )
183
+
184
+ // CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B)
185
+ crcAB := castagnoliShift (castagnoliSSE42TableK1 , crcA ) ^ crcB
186
+ // CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C)
187
+ crc = castagnoliShift (castagnoliSSE42TableK1 , crcAB ) ^ crcC
188
+ p = p [castagnoliK1 * 3 :]
189
+ }
190
+
191
+ // Use the simple implementation for what's left.
192
+ crc = castagnoliSSE42 (crc , p )
193
+ return ^ crc
38
194
}
39
195
40
196
func updateIEEE (crc uint32 , p []byte ) uint32 {
0 commit comments