hash/maphash: revise API to be more idiomatic

rsc · rsc · commit 5a7c571ea19a · 2019-11-04T21:30:29.000Z
This CL makes these changes to the hash/maphash API to make it fit a bit more into the standard library: - Move some of the package doc onto type Hash, so that `go doc maphash.Hash` shows it. - Instead of having identical AddBytes and Write methods, standardize on Write, the usual name for this function. Similarly, AddString -> WriteString, AddByte -> WriteByte. - Instead of having identical Hash and Sum64 methods, standardize on Sum64 (for hash.Hash64). Dropping the "Hash" method also helps because Hash is usually reserved to mean the state of a hash function (hash.Hash etc), not the hash value itself. - Make an uninitialized hash.Hash auto-seed with a random seed. It is critical that users not use the same seed for all hash functions in their program, at least not accidentally. So the Hash implementation must either panic if uninitialized or initialize itself. Initializing itself is less work for users and can be done lazily. - Now that the zero hash.Hash is useful, drop maphash.New in favor of new(maphash.Hash) or simply declaring a maphash.Hash. - Add a [0]func()-typed field to the Hash so that Hashes cannot be compared. (I considered doing the same for Seed but comparing seeds seems OK.) - Drop the integer argument from MakeSeed, to match the original design in golang.org/issue/28322. There is no point to giving users control over the specific seed bits, since we want the interpretation of those bits to be different in every different process. The only thing users need is to be able to create a new random seed at each call. (Fixes a TODO in MakeSeed's public doc comment.) This API is new in Go 1.14, so these changes do not violate the compatibility promise. Fixes #35060. Fixes #35348. Change-Id: Ie6fecc441f3f5ef66388c6ead92e875c0871f805 Reviewed-on: https://go-review.googlesource.com/c/go/+/205069 Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Alan Donovan <adonovan@google.com> Reviewed-by: Keith Randall <khr@golang.org>
diff --git a/src/hash/maphash/maphash.go b/src/hash/maphash/maphash.go
@@ -2,96 +2,128 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// Package hash/maphash provides hash functions on byte sequences. These
-// hash functions are intended to be used to implement hash tables or
+// Package maphash provides hash functions on byte sequences.
+// These hash functions are intended to be used to implement hash tables or
 // other data structures that need to map arbitrary strings or byte
-// sequences to a uniform distribution of integers. The hash functions
-// are collision-resistant but are not cryptographically secure (use
-// one of the hash functions in crypto/* if you need that).
+// sequences to a uniform distribution of integers.
 //
-// The produced hashes depend only on the sequence of bytes provided
-// to the Hash object, not on the way in which they are provided. For
-// example, the calls
-//     h.AddString("foo")
-//     h.AddBytes([]byte{'f','o','o'})
-//     h.AddByte('f'); h.AddByte('o'); h.AddByte('o')
-// will all have the same effect.
-//
-// Two Hash instances in the same process using the same seed
-// behave identically.
-//
-// Two Hash instances with the same seed in different processes are
-// not guaranteed to behave identically, even if the processes share
-// the same binary.
-//
-// Hashes are intended to be collision-resistant, even for situations
-// where an adversary controls the byte sequences being hashed.
-// All bits of the Hash result are close to uniformly and
-// independently distributed, so can be safely restricted to a range
-// using bit masking, shifting, or modular arithmetic.
+// The hash functions are collision-resistant but not cryptographically secure.
+// (See crypto/sha256 and crypto/sha512 for cryptographic use.)
 package maphash
 
-import (
-	"unsafe"
-)
+import "unsafe"
 
-// A Seed controls the behavior of a Hash.  Two Hash objects with the
-// same seed in the same process will behave identically.  Two Hash
-// objects with different seeds will very likely behave differently.
+// A Seed is a random value that selects the specific hash function
+// computed by a Hash. If two Hashes use the same Seeds, they
+// will compute the same hash values for any given input.
+// If two Hashes use different Seeds, they are very likely to compute
+// distinct hash values for any given input.
+//
+// A Seed must be initialized by calling MakeSeed.
+// The zero seed is uninitialized and not valid for use with Hash's SetSeed method.
+//
+// Each Seed value is local to a single process and cannot be serialized
+// or otherwise recreated in a different process.
 type Seed struct {
 	s uint64
 }
 
-// A Hash object is used to compute the hash of a byte sequence.
+// A Hash computes a seeded hash of a byte sequence.
+//
+// The zero Hash is a valid Hash ready to use.
+// A zero Hash chooses a random seed for itself during
+// the first call to a Reset, Write, Seed, Sum64, or Seed method.
+// For control over the seed, use SetSeed.
+//
+// The computed hash values depend only on the initial seed and
+// the sequence of bytes provided to the Hash object, not on the way
+// in which the bytes are provided. For example, the three sequences
+//
+//     h.Write([]byte{'f','o','o'})
+//     h.WriteByte('f'); h.WriteByte('o'); h.WriteByte('o')
+//     h.WriteString("foo")
+//
+// all have the same effect.
+//
+// Hashes are intended to be collision-resistant, even for situations
+// where an adversary controls the byte sequences being hashed.
+//
+// A Hash is not safe for concurrent use by multiple goroutines, but a Seed is.
+// If multiple goroutines must compute the same seeded hash,
+// each can declare its own Hash and call SetSeed with a common Seed.
 type Hash struct {
-	seed  Seed     // initial seed used for this hash
-	state Seed     // current hash of all flushed bytes
-	buf   [64]byte // unflushed byte buffer
-	n     int      // number of unflushed bytes
+	_     [0]func() // not comparable
+	seed  Seed      // initial seed used for this hash
+	state Seed      // current hash of all flushed bytes
+	buf   [64]byte  // unflushed byte buffer
+	n     int       // number of unflushed bytes
+}
+
+// initSeed seeds the hash if necessary.
+// initSeed is called lazily before any operation that actually uses h.seed/h.state.
+// Note that this does not include Write/WriteByte/WriteString in the case
+// where they only add to h.buf. (If they write too much, they call h.flush,
+// which does call h.initSeed.)
+func (h *Hash) initSeed() {
+	if h.seed.s == 0 {
+		h.SetSeed(MakeSeed())
+	}
 }
 
-// AddByte adds b to the sequence of bytes hashed by h.
-func (h *Hash) AddByte(b byte) {
+// WriteByte adds b to the sequence of bytes hashed by h.
+// It never fails; the error result is for implementing io.ByteWriter.
+func (h *Hash) WriteByte(b byte) error {
 	if h.n == len(h.buf) {
 		h.flush()
 	}
 	h.buf[h.n] = b
 	h.n++
+	return nil
 }
 
-// AddBytes adds b to the sequence of bytes hashed by h.
-func (h *Hash) AddBytes(b []byte) {
+// Write adds b to the sequence of bytes hashed by h.
+// It always writes all of b and never fails; the count and error result are for implementing io.Writer.
+func (h *Hash) Write(b []byte) (int, error) {
+	size := len(b)
 	for h.n+len(b) > len(h.buf) {
 		k := copy(h.buf[h.n:], b)
 		h.n = len(h.buf)
 		b = b[k:]
 		h.flush()
 	}
 	h.n += copy(h.buf[h.n:], b)
+	return size, nil
 }
 
-// AddString adds the bytes of s to the sequence of bytes hashed by h.
-func (h *Hash) AddString(s string) {
+// WriteString adds the bytes of s to the sequence of bytes hashed by h.
+// It always writes all of s and never fails; the count and error result are for implementing io.StringWriter.
+func (h *Hash) WriteString(s string) (int, error) {
+	size := len(s)
 	for h.n+len(s) > len(h.buf) {
 		k := copy(h.buf[h.n:], s)
 		h.n = len(h.buf)
 		s = s[k:]
 		h.flush()
 	}
 	h.n += copy(h.buf[h.n:], s)
+	return size, nil
 }
 
-// Seed returns the seed value specified in the most recent call to
-// SetSeed, or the initial seed if SetSeed was never called.
+// Seed returns h's seed value.
 func (h *Hash) Seed() Seed {
+	h.initSeed()
 	return h.seed
 }
 
-// SetSeed sets the seed used by h. Two Hash objects with the same
-// seed in the same process will behave identically.  Two Hash objects
-// with different seeds will very likely behave differently.  Any
-// bytes added to h previous to this call will be discarded.
+// SetSeed sets h to use seed, which must have been returned by MakeSeed
+// or by another Hash's Seed method.
+// Two Hash objects with the same seed behave identically.
+// Two Hash objects with different seeds will very likely behave differently.
+// Any bytes added to h before this call will be discarded.
 func (h *Hash) SetSeed(seed Seed) {
+	if seed.s == 0 {
+		panic("maphash: use of uninitialized Seed")
+	}
 	h.seed = seed
 	h.state = seed
 	h.n = 0
@@ -100,43 +132,46 @@ func (h *Hash) SetSeed(seed Seed) {
 // Reset discards all bytes added to h.
 // (The seed remains the same.)
 func (h *Hash) Reset() {
+	h.initSeed()
 	h.state = h.seed
 	h.n = 0
 }
 
 // precondition: buffer is full.
 func (h *Hash) flush() {
 	if h.n != len(h.buf) {
-		panic("flush of partially full buffer")
+		panic("maphash: flush of partially full buffer")
 	}
+	h.initSeed()
 	h.state.s = rthash(h.buf[:], h.state.s)
 	h.n = 0
 }
 
-// Hash returns a value which depends on h's seed and the sequence of
-// bytes added to h (since the last call to Reset or SetSeed).
-func (h *Hash) Hash() uint64 {
+// Sum64 returns h's current 64-bit value, which depends on
+// h's seed and the sequence of bytes added to h since the
+// last call to Reset or SetSeed.
+//
+// All bits of the Sum64 result are close to uniformly and
+// independently distributed, so it can be safely reduced
+// by using bit masking, shifting, or modular arithmetic.
+func (h *Hash) Sum64() uint64 {
+	h.initSeed()
 	return rthash(h.buf[:h.n], h.state.s)
 }
 
-// MakeSeed returns a Seed initialized using the bits in s.
-// Two seeds generated with the same s are guaranteed to be equal.
-// Two seeds generated with different s are very likely to be different.
-// TODO: disallow this? See Alan's comment in the issue.
-func MakeSeed(s uint64) Seed {
-	return Seed{s: s}
-}
-
-// New returns a new Hash object. Different hash objects allocated by
-// this function will very likely have different seeds.
-func New() *Hash {
-	s1 := uint64(runtime_fastrand())
-	s2 := uint64(runtime_fastrand())
-	seed := Seed{s: s1<<32 + s2}
-	return &Hash{
-		seed:  seed,
-		state: seed,
+// MakeSeed returns a new random seed.
+func MakeSeed() Seed {
+	var s1, s2 uint64
+	for {
+		s1 = uint64(runtime_fastrand())
+		s2 = uint64(runtime_fastrand())
+		// We use seed 0 to indicate an uninitialized seed/hash,
+		// so keep trying until we get a non-zero seed.
+		if s1|s2 != 0 {
+			break
+		}
 	}
+	return Seed{s: s1<<32 + s2}
 }
 
 //go:linkname runtime_fastrand runtime.fastrand
@@ -154,22 +189,17 @@ func rthash(b []byte, seed uint64) uint64 {
 	}
 	lo := runtime_memhash(unsafe.Pointer(&b[0]), uintptr(seed), uintptr(len(b)))
 	hi := runtime_memhash(unsafe.Pointer(&b[0]), uintptr(seed>>32), uintptr(len(b)))
-	// TODO: mix lo/hi? Get 64 bits some other way?
 	return uint64(hi)<<32 | uint64(lo)
 }
 
 //go:linkname runtime_memhash runtime.memhash
 func runtime_memhash(p unsafe.Pointer, seed, s uintptr) uintptr
 
-// Wrapper functions so that a hash/maphash.Hash implements
-// the hash.Hash and hash.Hash64 interfaces.
-
-func (h *Hash) Write(b []byte) (int, error) {
-	h.AddBytes(b)
-	return len(b), nil
-}
+// Sum appends the hash's current 64-bit value to b.
+// It exists for implementing hash.Hash.
+// For direct calls, it is more efficient to use Sum64.
 func (h *Hash) Sum(b []byte) []byte {
-	x := h.Hash()
+	x := h.Sum64()
 	return append(b,
 		byte(x>>0),
 		byte(x>>8),
@@ -180,8 +210,9 @@ func (h *Hash) Sum(b []byte) []byte {
 		byte(x>>48),
 		byte(x>>56))
 }
-func (h *Hash) Sum64() uint64 {
-	return h.Hash()
-}
-func (h *Hash) Size() int      { return 8 }
+
+// Size returns h's hash value size, 8 bytes.
+func (h *Hash) Size() int { return 8 }
+
+// BlockSize returns h's block size.
 func (h *Hash) BlockSize() int { return len(h.buf) }
diff --git a/src/hash/maphash/maphash_test.go b/src/hash/maphash/maphash_test.go
@@ -2,32 +2,31 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-package maphash_test
+package maphash
 
 import (
 	"hash"
-	"hash/maphash"
 	"testing"
 )
 
 func TestUnseededHash(t *testing.T) {
 	m := map[uint64]struct{}{}
 	for i := 0; i < 1000; i++ {
-		h := maphash.New()
-		m[h.Hash()] = struct{}{}
+		h := new(Hash)
+		m[h.Sum64()] = struct{}{}
 	}
 	if len(m) < 900 {
 		t.Errorf("empty hash not sufficiently random: got %d, want 1000", len(m))
 	}
 }
 
 func TestSeededHash(t *testing.T) {
-	s := maphash.MakeSeed(1234)
+	s := MakeSeed()
 	m := map[uint64]struct{}{}
 	for i := 0; i < 1000; i++ {
-		h := maphash.New()
+		h := new(Hash)
 		h.SetSeed(s)
-		m[h.Hash()] = struct{}{}
+		m[h.Sum64()] = struct{}{}
 	}
 	if len(m) != 1 {
 		t.Errorf("seeded hash is random: got %d, want 1", len(m))
@@ -36,28 +35,37 @@ func TestSeededHash(t *testing.T) {
 
 func TestHashGrouping(t *testing.T) {
 	b := []byte("foo")
-	h1 := maphash.New()
-	h2 := maphash.New()
+	h1 := new(Hash)
+	h2 := new(Hash)
 	h2.SetSeed(h1.Seed())
-	h1.AddBytes(b)
+	h1.Write(b)
 	for _, x := range b {
-		h2.AddByte(x)
+		err := h2.WriteByte(x)
+		if err != nil {
+			t.Fatalf("WriteByte: %v", err)
+		}
 	}
-	if h1.Hash() != h2.Hash() {
+	if h1.Sum64() != h2.Sum64() {
 		t.Errorf("hash of \"foo\" and \"f\",\"o\",\"o\" not identical")
 	}
 }
 
 func TestHashBytesVsString(t *testing.T) {
 	s := "foo"
 	b := []byte(s)
-	h1 := maphash.New()
-	h2 := maphash.New()
+	h1 := new(Hash)
+	h2 := new(Hash)
 	h2.SetSeed(h1.Seed())
-	h1.AddString(s)
-	h2.AddBytes(b)
-	if h1.Hash() != h2.Hash() {
-		t.Errorf("hash of string and byts not identical")
+	n1, err1 := h1.WriteString(s)
+	if n1 != len(s) || err1 != nil {
+		t.Fatalf("WriteString(s) = %d, %v, want %d, nil", n1, err1, len(s))
+	}
+	n2, err2 := h2.Write(b)
+	if n2 != len(b) || err2 != nil {
+		t.Fatalf("Write(b) = %d, %v, want %d, nil", n2, err2, len(b))
+	}
+	if h1.Sum64() != h2.Sum64() {
+		t.Errorf("hash of string and bytes not identical")
 	}
 }
 
@@ -66,15 +74,15 @@ func TestHashHighBytes(t *testing.T) {
 	const N = 10
 	m := map[uint64]struct{}{}
 	for i := 0; i < N; i++ {
-		h := maphash.New()
-		h.AddString("foo")
-		m[h.Hash()>>32] = struct{}{}
+		h := new(Hash)
+		h.WriteString("foo")
+		m[h.Sum64()>>32] = struct{}{}
 	}
 	if len(m) < N/2 {
 		t.Errorf("from %d seeds, wanted at least %d different hashes; got %d", N, N/2, len(m))
 	}
 }
 
 // Make sure a Hash implements the hash.Hash and hash.Hash64 interfaces.
-var _ hash.Hash = &maphash.Hash{}
-var _ hash.Hash64 = &maphash.Hash{}
+var _ hash.Hash = &Hash{}
+var _ hash.Hash64 = &Hash{}
diff --git a/src/hash/maphash/smhasher_test.go b/src/hash/maphash/smhasher_test.go