SipHasher128: fix platform-independence confusion

tgnottingham · tgnottingham · commit eb0a88f7c041 · 2020-09-29T14:36:48.000-07:00
StableHasher is supposed to ensure platform independence by converting
integers to little-endian and extending isize and usize to 64 bits as
necessary, but in fact, much of that work is already handled by
SipHasher128.

In particular, SipHasher128 implements short_write in an
endian-independent way, yet both StableHasher and SipHasher128
additionally attempt to achieve endian-independence by byte swapping on
BE hardware before invoking short writes. This double swap has no
effect, so let's remove it.

Because short_write is endian-independent, SipHasher128 is already
handling part of the platform-independence, and it would be somewhat
difficult to make it *not* handle that part with the current
implementation. As splitting platform-independence responsibilities
between StableHasher and SipHasher128 would be confusing, let's make
SipHasher128 handle all of it.

Finally, update some incorrect comments and increase test coverage.
Unit tests pass on both LE and BE systems.
diff --git a/compiler/rustc_data_structures/src/sip128.rs b/compiler/rustc_data_structures/src/sip128.rs
@@ -8,6 +8,13 @@ use std::ptr;
 #[cfg(test)]
 mod tests;
 
+/// When hashing something that ends up affecting properties like symbol names,
+/// we want these symbol names to be calculated independently of other factors
+/// like what architecture you're compiling *from*.
+///
+/// To that end, we always convert integers to little-endian format or handle
+/// them in an endian-independent way, and extend the architecture-dependent
+/// `isize` and `usize` types to 64 bits if needed before hashing.
 #[derive(Debug, Clone)]
 pub struct SipHasher128 {
     k0: u64,
@@ -125,15 +132,17 @@ impl SipHasher128 {
 
     // A specialized write function for values with size <= 8.
     //
-    // The hashing of multi-byte integers depends on endianness. E.g.:
-    // - little-endian: `write_u32(0xDDCCBBAA)` == `write([0xAA, 0xBB, 0xCC, 0xDD])`
-    // - big-endian:    `write_u32(0xDDCCBBAA)` == `write([0xDD, 0xCC, 0xBB, 0xAA])`
+    // The input must be zero-extended to 64 bits by the caller. The extension
+    // isn't hashed, but the implementation requires it for correctness.
+    //
+    // This function, given the same integer type and value, has the same effect
+    // on both little- and big-endian hardware. It operates on values without
+    // depending on their sequence in memory, so is independent of endianness.
     //
-    // This function does the right thing for little-endian hardware. On
-    // big-endian hardware `x` must be byte-swapped first to give the right
-    // behaviour. After any byte-swapping, the input must be zero-extended to
-    // 64-bits. The caller is responsible for the byte-swapping and
-    // zero-extension.
+    // The equivalent write() call *does* need the value's bytes converted to
+    // little-endian (without zero-extension) for equivalent behavior on little-
+    // and big-endian hardware, as write() *does* operate on byte sequences.
+    // I.e. write_u32(0xDDCCBBAA) == write(&0xDDCCBBAA_u32.to_le_bytes()).
     #[inline]
     fn short_write<T>(&mut self, _x: T, x: u64) {
         let size = mem::size_of::<T>();
@@ -167,12 +176,9 @@ impl SipHasher128 {
         //   left-shift it five bytes, giving 0xHHGG_FF00_0000_0000. We then
         //   bitwise-OR that value into `self.tail`, resulting in
         //   0xHHGG_FFEE_DDCC_BBAA. `self.tail` is now full, and we can use it
-        //   to update `self.state`. (As mentioned above, this assumes a
-        //   little-endian machine; on a big-endian machine we would have
-        //   byte-swapped 0xIIHH_GGFF in the caller, giving 0xFFGG_HHII, and we
-        //   would then end up bitwise-ORing 0xGGHH_II00_0000_0000 into
-        //   `self.tail`).
-        //
+        //   to update `self.state`. The analysis is the same whether we are on
+        //   a little-endian or big-endian machine, as the bitwise operations
+        //   are endian-independent.
         self.tail |= x << (8 * self.ntail);
         if size < needed {
             self.ntail += size;
@@ -186,8 +192,7 @@ impl SipHasher128 {
 
         // Continuing scenario 2: we have one byte left over from the input. We
         // set `self.ntail` to 1 and `self.tail` to `0x0000_0000_IIHH_GGFF >>
-        // 8*3`, which is 0x0000_0000_0000_00II. (Or on a big-endian machine
-        // the prior byte-swapping would leave us with 0x0000_0000_0000_00FF.)
+        // 8*3`, which is 0x0000_0000_0000_00II.
         //
         // The `if` is needed to avoid shifting by 64 bits, which Rust
         // complains about.
@@ -222,22 +227,30 @@ impl Hasher for SipHasher128 {
 
     #[inline]
     fn write_u16(&mut self, i: u16) {
-        self.short_write(i, i.to_le() as u64);
+        self.short_write(i, i as u64);
     }
 
     #[inline]
     fn write_u32(&mut self, i: u32) {
-        self.short_write(i, i.to_le() as u64);
+        self.short_write(i, i as u64);
     }
 
     #[inline]
     fn write_u64(&mut self, i: u64) {
-        self.short_write(i, i.to_le() as u64);
+        self.short_write(i, i as u64);
+    }
+
+    #[inline]
+    fn write_u128(&mut self, i: u128) {
+        self.write(&i.to_le_bytes());
     }
 
     #[inline]
     fn write_usize(&mut self, i: usize) {
-        self.short_write(i, i.to_le() as u64);
+        // Always treat usize as u64 so we get the same results on 32 and 64 bit
+        // platforms. This is important for symbol hashes when cross compiling,
+        // for example.
+        self.write_u64(i as u64);
     }
 
     #[inline]
@@ -247,22 +260,31 @@ impl Hasher for SipHasher128 {
 
     #[inline]
     fn write_i16(&mut self, i: i16) {
-        self.short_write(i, (i as u16).to_le() as u64);
+        self.short_write(i, i as u16 as u64);
     }
 
     #[inline]
     fn write_i32(&mut self, i: i32) {
-        self.short_write(i, (i as u32).to_le() as u64);
+        self.short_write(i, i as u32 as u64);
     }
 
     #[inline]
     fn write_i64(&mut self, i: i64) {
-        self.short_write(i, (i as u64).to_le() as u64);
+        self.short_write(i, i as u64);
+    }
+
+    #[inline]
+    fn write_i128(&mut self, i: i128) {
+        self.write(&i.to_le_bytes());
     }
 
     #[inline]
     fn write_isize(&mut self, i: isize) {
-        self.short_write(i, (i as usize).to_le() as u64);
+        // Always treat isize as i64 so we get the same results on 32 and 64 bit
+        // platforms. This is important for symbol hashes when cross compiling,
+        // for example. Sign extending here is preferable as it means that the
+        // same negative number hashes the same on both 32 and 64 bit platforms.
+        self.write_i64(i as i64);
     }
 
     #[inline]
diff --git a/compiler/rustc_data_structures/src/sip128/tests.rs b/compiler/rustc_data_structures/src/sip128/tests.rs
@@ -1,7 +1,6 @@
 use super::*;
 
 use std::hash::{Hash, Hasher};
-use std::{mem, slice};
 
 // Hash just the bytes of the slice, without length prefix
 struct Bytes<'a>(&'a [u8]);
@@ -399,20 +398,58 @@ fn test_hash_no_concat_alias() {
 }
 
 #[test]
-fn test_write_short_works() {
-    let test_usize = 0xd0c0b0a0usize;
+fn test_short_write_works() {
+    let test_u8 = 0xFF_u8;
+    let test_u16 = 0x1122_u16;
+    let test_u32 = 0x22334455_u32;
+    let test_u64 = 0x33445566_778899AA_u64;
+    let test_u128 = 0x11223344_55667788_99AABBCC_DDEEFF77_u128;
+    let test_usize = 0xD0C0B0A0_usize;
+
+    let test_i8 = -1_i8;
+    let test_i16 = -2_i16;
+    let test_i32 = -3_i32;
+    let test_i64 = -4_i64;
+    let test_i128 = -5_i128;
+    let test_isize = -6_isize;
+
     let mut h1 = SipHasher128::new_with_keys(0, 0);
-    h1.write_usize(test_usize);
     h1.write(b"bytes");
     h1.write(b"string");
-    h1.write_u8(0xFFu8);
-    h1.write_u8(0x01u8);
+    h1.write_u8(test_u8);
+    h1.write_u16(test_u16);
+    h1.write_u32(test_u32);
+    h1.write_u64(test_u64);
+    h1.write_u128(test_u128);
+    h1.write_usize(test_usize);
+    h1.write_i8(test_i8);
+    h1.write_i16(test_i16);
+    h1.write_i32(test_i32);
+    h1.write_i64(test_i64);
+    h1.write_i128(test_i128);
+    h1.write_isize(test_isize);
+
     let mut h2 = SipHasher128::new_with_keys(0, 0);
-    h2.write(unsafe {
-        slice::from_raw_parts(&test_usize as *const _ as *const u8, mem::size_of::<usize>())
-    });
     h2.write(b"bytes");
     h2.write(b"string");
-    h2.write(&[0xFFu8, 0x01u8]);
-    assert_eq!(h1.finish128(), h2.finish128());
+    h2.write(&test_u8.to_le_bytes());
+    h2.write(&test_u16.to_le_bytes());
+    h2.write(&test_u32.to_le_bytes());
+    h2.write(&test_u64.to_le_bytes());
+    h2.write(&test_u128.to_le_bytes());
+    h2.write(&(test_usize as u64).to_le_bytes());
+    h2.write(&test_i8.to_le_bytes());
+    h2.write(&test_i16.to_le_bytes());
+    h2.write(&test_i32.to_le_bytes());
+    h2.write(&test_i64.to_le_bytes());
+    h2.write(&test_i128.to_le_bytes());
+    h2.write(&(test_isize as i64).to_le_bytes());
+
+    let h1_hash = h1.finish128();
+    let h2_hash = h2.finish128();
+
+    let expected = (5926600258011434223, 10938367019217336666);
+
+    assert_eq!(h1_hash, expected);
+    assert_eq!(h2_hash, expected);
 }
diff --git a/compiler/rustc_data_structures/src/stable_hasher.rs b/compiler/rustc_data_structures/src/stable_hasher.rs
@@ -5,6 +5,9 @@ use smallvec::SmallVec;
 use std::hash::{BuildHasher, Hash, Hasher};
 use std::mem;
 
+#[cfg(test)]
+mod tests;
+
 /// When hashing something that ends up affecting properties like symbol names,
 /// we want these symbol names to be calculated independently of other factors
 /// like what architecture you're compiling *from*.
@@ -57,6 +60,9 @@ impl StableHasher {
     }
 }
 
+// SipHasher128 currently handles ensuring platform-independent results with
+// respect to endianness and `isize` and `usize` differences (to the extent
+// possible). The write functions below don't need to handle this at this time.
 impl Hasher for StableHasher {
     fn finish(&self) -> u64 {
         panic!("use StableHasher::finalize instead");
@@ -74,30 +80,27 @@ impl Hasher for StableHasher {
 
     #[inline]
     fn write_u16(&mut self, i: u16) {
-        self.state.write_u16(i.to_le());
+        self.state.write_u16(i);
     }
 
     #[inline]
     fn write_u32(&mut self, i: u32) {
-        self.state.write_u32(i.to_le());
+        self.state.write_u32(i);
     }
 
     #[inline]
     fn write_u64(&mut self, i: u64) {
-        self.state.write_u64(i.to_le());
+        self.state.write_u64(i);
     }
 
     #[inline]
     fn write_u128(&mut self, i: u128) {
-        self.state.write_u128(i.to_le());
+        self.state.write_u128(i);
     }
 
     #[inline]
     fn write_usize(&mut self, i: usize) {
-        // Always treat usize as u64 so we get the same results on 32 and 64 bit
-        // platforms. This is important for symbol hashes when cross compiling,
-        // for example.
-        self.state.write_u64((i as u64).to_le());
+        self.state.write_usize(i);
     }
 
     #[inline]
@@ -107,30 +110,27 @@ impl Hasher for StableHasher {
 
     #[inline]
     fn write_i16(&mut self, i: i16) {
-        self.state.write_i16(i.to_le());
+        self.state.write_i16(i);
     }
 
     #[inline]
     fn write_i32(&mut self, i: i32) {
-        self.state.write_i32(i.to_le());
+        self.state.write_i32(i);
     }
 
     #[inline]
     fn write_i64(&mut self, i: i64) {
-        self.state.write_i64(i.to_le());
+        self.state.write_i64(i);
     }
 
     #[inline]
     fn write_i128(&mut self, i: i128) {
-        self.state.write_i128(i.to_le());
+        self.state.write_i128(i);
     }
 
     #[inline]
     fn write_isize(&mut self, i: isize) {
-        // Always treat isize as i64 so we get the same results on 32 and 64 bit
-        // platforms. This is important for symbol hashes when cross compiling,
-        // for example.
-        self.state.write_i64((i as i64).to_le());
+        self.state.write_isize(i);
     }
 }
 
diff --git a/compiler/rustc_data_structures/src/stable_hasher/tests.rs b/compiler/rustc_data_structures/src/stable_hasher/tests.rs
@@ -0,0 +1,73 @@
+use super::*;
+
+// The tests below compare the computed hashes to particular expected values
+// in order to test that we produce the same results on different platforms,
+// regardless of endianness and `usize` and `isize` size differences (this
+// of course assumes we run these tests on platforms that differ in those
+// ways). The expected values depend on the hashing algorithm used, so they
+// need to be updated whenever StableHasher changes its hashing algorithm.
+
+#[test]
+fn test_hash_integers() {
+    // Test that integers are handled consistently across platforms.
+    let test_u8 = 0xAB_u8;
+    let test_u16 = 0xFFEE_u16;
+    let test_u32 = 0x445577AA_u32;
+    let test_u64 = 0x01234567_13243546_u64;
+    let test_u128 = 0x22114433_66557788_99AACCBB_EEDDFF77_u128;
+    let test_usize = 0xD0C0B0A0_usize;
+
+    let test_i8 = -100_i8;
+    let test_i16 = -200_i16;
+    let test_i32 = -300_i32;
+    let test_i64 = -400_i64;
+    let test_i128 = -500_i128;
+    let test_isize = -600_isize;
+
+    let mut h = StableHasher::new();
+    test_u8.hash(&mut h);
+    test_u16.hash(&mut h);
+    test_u32.hash(&mut h);
+    test_u64.hash(&mut h);
+    test_u128.hash(&mut h);
+    test_usize.hash(&mut h);
+    test_i8.hash(&mut h);
+    test_i16.hash(&mut h);
+    test_i32.hash(&mut h);
+    test_i64.hash(&mut h);
+    test_i128.hash(&mut h);
+    test_isize.hash(&mut h);
+
+    // This depends on the hashing algorithm. See note at top of file.
+    let expected = (2736651863462566372, 8121090595289675650);
+
+    assert_eq!(h.finalize(), expected);
+}
+
+#[test]
+fn test_hash_usize() {
+    // Test that usize specifically is handled consistently across platforms.
+    let test_usize = 0xABCDEF01_usize;
+
+    let mut h = StableHasher::new();
+    test_usize.hash(&mut h);
+
+    // This depends on the hashing algorithm. See note at top of file.
+    let expected = (5798740672699530587, 11186240177685111648);
+
+    assert_eq!(h.finalize(), expected);
+}
+
+#[test]
+fn test_hash_isize() {
+    // Test that isize specifically is handled consistently across platforms.
+    let test_isize = -7_isize;
+
+    let mut h = StableHasher::new();
+    test_isize.hash(&mut h);
+
+    // This depends on the hashing algorithm. See note at top of file.
+    let expected = (14721296605626097289, 11385941877786388409);
+
+    assert_eq!(h.finalize(), expected);
+}