Skip to content

Commit eb0a88f

Browse files
committed
SipHasher128: fix platform-independence confusion
StableHasher is supposed to ensure platform independence by converting integers to little-endian and extending isize and usize to 64 bits as necessary, but in fact, much of that work is already handled by SipHasher128. In particular, SipHasher128 implements short_write in an endian-independent way, yet both StableHasher and SipHasher128 additionally attempt to achieve endian-independence by byte swapping on BE hardware before invoking short writes. This double swap has no effect, so let's remove it. Because short_write is endian-independent, SipHasher128 is already handling part of the platform-independence, and it would be somewhat difficult to make it *not* handle that part with the current implementation. As splitting platform-independence responsibilities between StableHasher and SipHasher128 would be confusing, let's make SipHasher128 handle all of it. Finally, update some incorrect comments and increase test coverage. Unit tests pass on both LE and BE systems.
1 parent fc2daaa commit eb0a88f

File tree

4 files changed

+183
-51
lines changed

4 files changed

+183
-51
lines changed

compiler/rustc_data_structures/src/sip128.rs

+46-24
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,13 @@ use std::ptr;
88
#[cfg(test)]
99
mod tests;
1010

11+
/// When hashing something that ends up affecting properties like symbol names,
12+
/// we want these symbol names to be calculated independently of other factors
13+
/// like what architecture you're compiling *from*.
14+
///
15+
/// To that end, we always convert integers to little-endian format or handle
16+
/// them in an endian-independent way, and extend the architecture-dependent
17+
/// `isize` and `usize` types to 64 bits if needed before hashing.
1118
#[derive(Debug, Clone)]
1219
pub struct SipHasher128 {
1320
k0: u64,
@@ -125,15 +132,17 @@ impl SipHasher128 {
125132

126133
// A specialized write function for values with size <= 8.
127134
//
128-
// The hashing of multi-byte integers depends on endianness. E.g.:
129-
// - little-endian: `write_u32(0xDDCCBBAA)` == `write([0xAA, 0xBB, 0xCC, 0xDD])`
130-
// - big-endian: `write_u32(0xDDCCBBAA)` == `write([0xDD, 0xCC, 0xBB, 0xAA])`
135+
// The input must be zero-extended to 64-bits by the caller. The extension
136+
// isn't hashed, but the implementation requires it for correctness.
137+
//
138+
// This function, given the same integer type and value, has the same effect
139+
// on both little- and big-endian hardware. It operates on values without
140+
// depending on their sequence in memory, so is independent of endianness.
131141
//
132-
// This function does the right thing for little-endian hardware. On
133-
// big-endian hardware `x` must be byte-swapped first to give the right
134-
// behaviour. After any byte-swapping, the input must be zero-extended to
135-
// 64-bits. The caller is responsible for the byte-swapping and
136-
// zero-extension.
142+
// The equivalent write() call *does* need the value's bytes converted to
143+
// little-endian (without zero-extension) for equivalent behavior on little-
144+
// and big-endian hardware, as write() *does* operate on byte sequences.
145+
// I.e. write_u32(0xDDCCBBAA) == write(&0xDDCCBBAA_u32.to_le_bytes()).
137146
#[inline]
138147
fn short_write<T>(&mut self, _x: T, x: u64) {
139148
let size = mem::size_of::<T>();
@@ -167,12 +176,9 @@ impl SipHasher128 {
167176
// left-shift it five bytes, giving 0xHHGG_FF00_0000_0000. We then
168177
// bitwise-OR that value into `self.tail`, resulting in
169178
// 0xHHGG_FFEE_DDCC_BBAA. `self.tail` is now full, and we can use it
170-
// to update `self.state`. (As mentioned above, this assumes a
171-
// little-endian machine; on a big-endian machine we would have
172-
// byte-swapped 0xIIHH_GGFF in the caller, giving 0xFFGG_HHII, and we
173-
// would then end up bitwise-ORing 0xGGHH_II00_0000_0000 into
174-
// `self.tail`).
175-
//
179+
// to update `self.state`. The analysis is the same whether we are on
180+
// a little-endian or big-endian machine, as the bitwise operations
181+
// are endian-independent.
176182
self.tail |= x << (8 * self.ntail);
177183
if size < needed {
178184
self.ntail += size;
@@ -186,8 +192,7 @@ impl SipHasher128 {
186192

187193
// Continuing scenario 2: we have one byte left over from the input. We
188194
// set `self.ntail` to 1 and `self.tail` to `0x0000_0000_IIHH_GGFF >>
189-
// 8*3`, which is 0x0000_0000_0000_00II. (Or on a big-endian machine
190-
// the prior byte-swapping would leave us with 0x0000_0000_0000_00FF.)
195+
// 8*3`, which is 0x0000_0000_0000_00II.
191196
//
192197
// The `if` is needed to avoid shifting by 64 bits, which Rust
193198
// complains about.
@@ -222,22 +227,30 @@ impl Hasher for SipHasher128 {
222227

223228
#[inline]
224229
fn write_u16(&mut self, i: u16) {
225-
self.short_write(i, i.to_le() as u64);
230+
self.short_write(i, i as u64);
226231
}
227232

228233
#[inline]
229234
fn write_u32(&mut self, i: u32) {
230-
self.short_write(i, i.to_le() as u64);
235+
self.short_write(i, i as u64);
231236
}
232237

233238
#[inline]
234239
fn write_u64(&mut self, i: u64) {
235-
self.short_write(i, i.to_le() as u64);
240+
self.short_write(i, i as u64);
241+
}
242+
243+
#[inline]
244+
fn write_u128(&mut self, i: u128) {
245+
self.write(&i.to_le_bytes());
236246
}
237247

238248
#[inline]
239249
fn write_usize(&mut self, i: usize) {
240-
self.short_write(i, i.to_le() as u64);
250+
// Always treat usize as u64 so we get the same results on 32 and 64 bit
251+
// platforms. This is important for symbol hashes when cross compiling,
252+
// for example.
253+
self.write_u64(i as u64);
241254
}
242255

243256
#[inline]
@@ -247,22 +260,31 @@ impl Hasher for SipHasher128 {
247260

248261
#[inline]
249262
fn write_i16(&mut self, i: i16) {
250-
self.short_write(i, (i as u16).to_le() as u64);
263+
self.short_write(i, i as u16 as u64);
251264
}
252265

253266
#[inline]
254267
fn write_i32(&mut self, i: i32) {
255-
self.short_write(i, (i as u32).to_le() as u64);
268+
self.short_write(i, i as u32 as u64);
256269
}
257270

258271
#[inline]
259272
fn write_i64(&mut self, i: i64) {
260-
self.short_write(i, (i as u64).to_le() as u64);
273+
self.short_write(i, i as u64);
274+
}
275+
276+
#[inline]
277+
fn write_i128(&mut self, i: i128) {
278+
self.write(&i.to_le_bytes());
261279
}
262280

263281
#[inline]
264282
fn write_isize(&mut self, i: isize) {
265-
self.short_write(i, (i as usize).to_le() as u64);
283+
// Always treat isize as i64 so we get the same results on 32 and 64 bit
284+
// platforms. This is important for symbol hashes when cross compiling,
285+
// for example. Sign extending here is preferable as it means that the
286+
// same negative number hashes the same on both 32 and 64 bit platforms.
287+
self.write_i64(i as i64);
266288
}
267289

268290
#[inline]

compiler/rustc_data_structures/src/sip128/tests.rs

+48-11
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
use super::*;
22

33
use std::hash::{Hash, Hasher};
4-
use std::{mem, slice};
54

65
// Hash just the bytes of the slice, without length prefix
76
struct Bytes<'a>(&'a [u8]);
@@ -399,20 +398,58 @@ fn test_hash_no_concat_alias() {
399398
}
400399

401400
#[test]
402-
fn test_write_short_works() {
403-
let test_usize = 0xd0c0b0a0usize;
401+
fn test_short_write_works() {
402+
let test_u8 = 0xFF_u8;
403+
let test_u16 = 0x1122_u16;
404+
let test_u32 = 0x22334455_u32;
405+
let test_u64 = 0x33445566_778899AA_u64;
406+
let test_u128 = 0x11223344_55667788_99AABBCC_DDEEFF77_u128;
407+
let test_usize = 0xD0C0B0A0_usize;
408+
409+
let test_i8 = -1_i8;
410+
let test_i16 = -2_i16;
411+
let test_i32 = -3_i32;
412+
let test_i64 = -4_i64;
413+
let test_i128 = -5_i128;
414+
let test_isize = -6_isize;
415+
404416
let mut h1 = SipHasher128::new_with_keys(0, 0);
405-
h1.write_usize(test_usize);
406417
h1.write(b"bytes");
407418
h1.write(b"string");
408-
h1.write_u8(0xFFu8);
409-
h1.write_u8(0x01u8);
419+
h1.write_u8(test_u8);
420+
h1.write_u16(test_u16);
421+
h1.write_u32(test_u32);
422+
h1.write_u64(test_u64);
423+
h1.write_u128(test_u128);
424+
h1.write_usize(test_usize);
425+
h1.write_i8(test_i8);
426+
h1.write_i16(test_i16);
427+
h1.write_i32(test_i32);
428+
h1.write_i64(test_i64);
429+
h1.write_i128(test_i128);
430+
h1.write_isize(test_isize);
431+
410432
let mut h2 = SipHasher128::new_with_keys(0, 0);
411-
h2.write(unsafe {
412-
slice::from_raw_parts(&test_usize as *const _ as *const u8, mem::size_of::<usize>())
413-
});
414433
h2.write(b"bytes");
415434
h2.write(b"string");
416-
h2.write(&[0xFFu8, 0x01u8]);
417-
assert_eq!(h1.finish128(), h2.finish128());
435+
h2.write(&test_u8.to_le_bytes());
436+
h2.write(&test_u16.to_le_bytes());
437+
h2.write(&test_u32.to_le_bytes());
438+
h2.write(&test_u64.to_le_bytes());
439+
h2.write(&test_u128.to_le_bytes());
440+
h2.write(&(test_usize as u64).to_le_bytes());
441+
h2.write(&test_i8.to_le_bytes());
442+
h2.write(&test_i16.to_le_bytes());
443+
h2.write(&test_i32.to_le_bytes());
444+
h2.write(&test_i64.to_le_bytes());
445+
h2.write(&test_i128.to_le_bytes());
446+
h2.write(&(test_isize as i64).to_le_bytes());
447+
448+
let h1_hash = h1.finish128();
449+
let h2_hash = h2.finish128();
450+
451+
let expected = (5926600258011434223, 10938367019217336666);
452+
453+
assert_eq!(h1_hash, expected);
454+
assert_eq!(h2_hash, expected);
418455
}

compiler/rustc_data_structures/src/stable_hasher.rs

+16-16
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@ use smallvec::SmallVec;
55
use std::hash::{BuildHasher, Hash, Hasher};
66
use std::mem;
77

8+
#[cfg(test)]
9+
mod tests;
10+
811
/// When hashing something that ends up affecting properties like symbol names,
912
/// we want these symbol names to be calculated independently of other factors
1013
/// like what architecture you're compiling *from*.
@@ -57,6 +60,9 @@ impl StableHasher {
5760
}
5861
}
5962

63+
// SipHasher128 currently handles ensuring platform-independent results with
64+
// respect to endianness and `isize` and `usize` differences (to the extent
65+
// possible). The write functions below don't need handle this at this time.
6066
impl Hasher for StableHasher {
6167
fn finish(&self) -> u64 {
6268
panic!("use StableHasher::finalize instead");
@@ -74,30 +80,27 @@ impl Hasher for StableHasher {
7480

7581
#[inline]
7682
fn write_u16(&mut self, i: u16) {
77-
self.state.write_u16(i.to_le());
83+
self.state.write_u16(i);
7884
}
7985

8086
#[inline]
8187
fn write_u32(&mut self, i: u32) {
82-
self.state.write_u32(i.to_le());
88+
self.state.write_u32(i);
8389
}
8490

8591
#[inline]
8692
fn write_u64(&mut self, i: u64) {
87-
self.state.write_u64(i.to_le());
93+
self.state.write_u64(i);
8894
}
8995

9096
#[inline]
9197
fn write_u128(&mut self, i: u128) {
92-
self.state.write_u128(i.to_le());
98+
self.state.write_u128(i);
9399
}
94100

95101
#[inline]
96102
fn write_usize(&mut self, i: usize) {
97-
// Always treat usize as u64 so we get the same results on 32 and 64 bit
98-
// platforms. This is important for symbol hashes when cross compiling,
99-
// for example.
100-
self.state.write_u64((i as u64).to_le());
103+
self.state.write_usize(i);
101104
}
102105

103106
#[inline]
@@ -107,30 +110,27 @@ impl Hasher for StableHasher {
107110

108111
#[inline]
109112
fn write_i16(&mut self, i: i16) {
110-
self.state.write_i16(i.to_le());
113+
self.state.write_i16(i);
111114
}
112115

113116
#[inline]
114117
fn write_i32(&mut self, i: i32) {
115-
self.state.write_i32(i.to_le());
118+
self.state.write_i32(i);
116119
}
117120

118121
#[inline]
119122
fn write_i64(&mut self, i: i64) {
120-
self.state.write_i64(i.to_le());
123+
self.state.write_i64(i);
121124
}
122125

123126
#[inline]
124127
fn write_i128(&mut self, i: i128) {
125-
self.state.write_i128(i.to_le());
128+
self.state.write_i128(i);
126129
}
127130

128131
#[inline]
129132
fn write_isize(&mut self, i: isize) {
130-
// Always treat isize as i64 so we get the same results on 32 and 64 bit
131-
// platforms. This is important for symbol hashes when cross compiling,
132-
// for example.
133-
self.state.write_i64((i as i64).to_le());
133+
self.state.write_isize(i);
134134
}
135135
}
136136

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
use super::*;
2+
3+
// The tests below compare the computed hashes to particular expected values
4+
// in order to test that we produce the same results on different platforms,
5+
// regardless of endianness and `usize` and `isize` size differences (this
6+
// of course assumes we run these tests on platforms that differ in those
7+
// ways). The expected values depend on the hashing algorithm used, so they
8+
// need to be updated whenever StableHasher changes its hashing algorithm.
9+
10+
#[test]
11+
fn test_hash_integers() {
12+
// Test that integers are handled consistently across platforms.
13+
let test_u8 = 0xAB_u8;
14+
let test_u16 = 0xFFEE_u16;
15+
let test_u32 = 0x445577AA_u32;
16+
let test_u64 = 0x01234567_13243546_u64;
17+
let test_u128 = 0x22114433_66557788_99AACCBB_EEDDFF77_u128;
18+
let test_usize = 0xD0C0B0A0_usize;
19+
20+
let test_i8 = -100_i8;
21+
let test_i16 = -200_i16;
22+
let test_i32 = -300_i32;
23+
let test_i64 = -400_i64;
24+
let test_i128 = -500_i128;
25+
let test_isize = -600_isize;
26+
27+
let mut h = StableHasher::new();
28+
test_u8.hash(&mut h);
29+
test_u16.hash(&mut h);
30+
test_u32.hash(&mut h);
31+
test_u64.hash(&mut h);
32+
test_u128.hash(&mut h);
33+
test_usize.hash(&mut h);
34+
test_i8.hash(&mut h);
35+
test_i16.hash(&mut h);
36+
test_i32.hash(&mut h);
37+
test_i64.hash(&mut h);
38+
test_i128.hash(&mut h);
39+
test_isize.hash(&mut h);
40+
41+
// This depends on the hashing algorithm. See note at top of file.
42+
let expected = (2736651863462566372, 8121090595289675650);
43+
44+
assert_eq!(h.finalize(), expected);
45+
}
46+
47+
#[test]
48+
fn test_hash_usize() {
49+
// Test that usize specifically is handled consistently across platforms.
50+
let test_usize = 0xABCDEF01_usize;
51+
52+
let mut h = StableHasher::new();
53+
test_usize.hash(&mut h);
54+
55+
// This depends on the hashing algorithm. See note at top of file.
56+
let expected = (5798740672699530587, 11186240177685111648);
57+
58+
assert_eq!(h.finalize(), expected);
59+
}
60+
61+
#[test]
62+
fn test_hash_isize() {
63+
// Test that isize specifically is handled consistently across platforms.
64+
let test_isize = -7_isize;
65+
66+
let mut h = StableHasher::new();
67+
test_isize.hash(&mut h);
68+
69+
// This depends on the hashing algorithm. See note at top of file.
70+
let expected = (14721296605626097289, 11385941877786388409);
71+
72+
assert_eq!(h.finalize(), expected);
73+
}

0 commit comments

Comments
 (0)