1// Copyright 2019 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Package maphash provides hash functions on byte sequences.
6// These hash functions are intended to be used to implement hash tables or
7// other data structures that need to map arbitrary strings or byte
8// sequences to a uniform distribution on unsigned 64-bit integers.
9// Each different instance of a hash table or data structure should use its own [Seed].
10//
11// The hash functions are not cryptographically secure.
12// (See crypto/sha256 and crypto/sha512 for cryptographic use.)
13package maphash
14
15// A Seed is a random value that selects the specific hash function
16// computed by a [Hash]. If two Hashes use the same Seeds, they
17// will compute the same hash values for any given input.
18// If two Hashes use different Seeds, they are very likely to compute
19// distinct hash values for any given input.
20//
21// A Seed must be initialized by calling [MakeSeed].
22// The zero seed is uninitialized and not valid for use with [Hash]'s SetSeed method.
23//
24// Each Seed value is local to a single process and cannot be serialized
25// or otherwise recreated in a different process.
26type Seed struct {
27	s uint64
28}
29
30// Bytes returns the hash of b with the given seed.
31//
32// Bytes is equivalent to, but more convenient and efficient than:
33//
34//	var h Hash
35//	h.SetSeed(seed)
36//	h.Write(b)
37//	return h.Sum64()
38func Bytes(seed Seed, b []byte) uint64 {
39	state := seed.s
40	if state == 0 {
41		panic("maphash: use of uninitialized Seed")
42	}
43
44	if len(b) > bufSize {
45		b = b[:len(b):len(b)] // merge len and cap calculations when reslicing
46		for len(b) > bufSize {
47			state = rthash(b[:bufSize], state)
48			b = b[bufSize:]
49		}
50	}
51	return rthash(b, state)
52}
53
54// String returns the hash of s with the given seed.
55//
56// String is equivalent to, but more convenient and efficient than:
57//
58//	var h Hash
59//	h.SetSeed(seed)
60//	h.WriteString(s)
61//	return h.Sum64()
62func String(seed Seed, s string) uint64 {
63	state := seed.s
64	if state == 0 {
65		panic("maphash: use of uninitialized Seed")
66	}
67	for len(s) > bufSize {
68		state = rthashString(s[:bufSize], state)
69		s = s[bufSize:]
70	}
71	return rthashString(s, state)
72}
73
74// A Hash computes a seeded hash of a byte sequence.
75//
76// The zero Hash is a valid Hash ready to use.
77// A zero Hash chooses a random seed for itself during
78// the first call to a Reset, Write, Seed, or Sum64 method.
79// For control over the seed, use SetSeed.
80//
81// The computed hash values depend only on the initial seed and
82// the sequence of bytes provided to the Hash object, not on the way
83// in which the bytes are provided. For example, the three sequences
84//
85//	h.Write([]byte{'f','o','o'})
86//	h.WriteByte('f'); h.WriteByte('o'); h.WriteByte('o')
87//	h.WriteString("foo")
88//
89// all have the same effect.
90//
91// Hashes are intended to be collision-resistant, even for situations
92// where an adversary controls the byte sequences being hashed.
93//
94// A Hash is not safe for concurrent use by multiple goroutines, but a Seed is.
95// If multiple goroutines must compute the same seeded hash,
96// each can declare its own Hash and call SetSeed with a common Seed.
97type Hash struct {
98	_     [0]func()     // not comparable
99	seed  Seed          // initial seed used for this hash
100	state Seed          // current hash of all flushed bytes
101	buf   [bufSize]byte // unflushed byte buffer
102	n     int           // number of unflushed bytes
103}
104
105// bufSize is the size of the Hash write buffer.
106// The buffer ensures that writes depend only on the sequence of bytes,
107// not the sequence of WriteByte/Write/WriteString calls,
108// by always calling rthash with a full buffer (except for the tail).
109const bufSize = 128
110
111// initSeed seeds the hash if necessary.
112// initSeed is called lazily before any operation that actually uses h.seed/h.state.
113// Note that this does not include Write/WriteByte/WriteString in the case
114// where they only add to h.buf. (If they write too much, they call h.flush,
115// which does call h.initSeed.)
116func (h *Hash) initSeed() {
117	if h.seed.s == 0 {
118		seed := MakeSeed()
119		h.seed = seed
120		h.state = seed
121	}
122}
123
124// WriteByte adds b to the sequence of bytes hashed by h.
125// It never fails; the error result is for implementing [io.ByteWriter].
126func (h *Hash) WriteByte(b byte) error {
127	if h.n == len(h.buf) {
128		h.flush()
129	}
130	h.buf[h.n] = b
131	h.n++
132	return nil
133}
134
135// Write adds b to the sequence of bytes hashed by h.
136// It always writes all of b and never fails; the count and error result are for implementing [io.Writer].
137func (h *Hash) Write(b []byte) (int, error) {
138	size := len(b)
139	// Deal with bytes left over in h.buf.
140	// h.n <= bufSize is always true.
141	// Checking it is ~free and it lets the compiler eliminate a bounds check.
142	if h.n > 0 && h.n <= bufSize {
143		k := copy(h.buf[h.n:], b)
144		h.n += k
145		if h.n < bufSize {
146			// Copied the entirety of b to h.buf.
147			return size, nil
148		}
149		b = b[k:]
150		h.flush()
151		// No need to set h.n = 0 here; it happens just before exit.
152	}
153	// Process as many full buffers as possible, without copying, and calling initSeed only once.
154	if len(b) > bufSize {
155		h.initSeed()
156		for len(b) > bufSize {
157			h.state.s = rthash(b[:bufSize], h.state.s)
158			b = b[bufSize:]
159		}
160	}
161	// Copy the tail.
162	copy(h.buf[:], b)
163	h.n = len(b)
164	return size, nil
165}
166
167// WriteString adds the bytes of s to the sequence of bytes hashed by h.
168// It always writes all of s and never fails; the count and error result are for implementing [io.StringWriter].
169func (h *Hash) WriteString(s string) (int, error) {
170	// WriteString mirrors Write. See Write for comments.
171	size := len(s)
172	if h.n > 0 && h.n <= bufSize {
173		k := copy(h.buf[h.n:], s)
174		h.n += k
175		if h.n < bufSize {
176			return size, nil
177		}
178		s = s[k:]
179		h.flush()
180	}
181	if len(s) > bufSize {
182		h.initSeed()
183		for len(s) > bufSize {
184			h.state.s = rthashString(s[:bufSize], h.state.s)
185			s = s[bufSize:]
186		}
187	}
188	copy(h.buf[:], s)
189	h.n = len(s)
190	return size, nil
191}
192
193// Seed returns h's seed value.
194func (h *Hash) Seed() Seed {
195	h.initSeed()
196	return h.seed
197}
198
199// SetSeed sets h to use seed, which must have been returned by [MakeSeed]
200// or by another [Hash.Seed] method.
201// Two [Hash] objects with the same seed behave identically.
202// Two [Hash] objects with different seeds will very likely behave differently.
203// Any bytes added to h before this call will be discarded.
204func (h *Hash) SetSeed(seed Seed) {
205	if seed.s == 0 {
206		panic("maphash: use of uninitialized Seed")
207	}
208	h.seed = seed
209	h.state = seed
210	h.n = 0
211}
212
213// Reset discards all bytes added to h.
214// (The seed remains the same.)
215func (h *Hash) Reset() {
216	h.initSeed()
217	h.state = h.seed
218	h.n = 0
219}
220
221// precondition: buffer is full.
222func (h *Hash) flush() {
223	if h.n != len(h.buf) {
224		panic("maphash: flush of partially full buffer")
225	}
226	h.initSeed()
227	h.state.s = rthash(h.buf[:h.n], h.state.s)
228	h.n = 0
229}
230
231// Sum64 returns h's current 64-bit value, which depends on
232// h's seed and the sequence of bytes added to h since the
233// last call to [Hash.Reset] or [Hash.SetSeed].
234//
235// All bits of the Sum64 result are close to uniformly and
236// independently distributed, so it can be safely reduced
237// by using bit masking, shifting, or modular arithmetic.
238func (h *Hash) Sum64() uint64 {
239	h.initSeed()
240	return rthash(h.buf[:h.n], h.state.s)
241}
242
243// MakeSeed returns a new random seed.
244func MakeSeed() Seed {
245	var s uint64
246	for {
247		s = randUint64()
248		// We use seed 0 to indicate an uninitialized seed/hash,
249		// so keep trying until we get a non-zero seed.
250		if s != 0 {
251			break
252		}
253	}
254	return Seed{s: s}
255}
256
257// Sum appends the hash's current 64-bit value to b.
258// It exists for implementing [hash.Hash].
259// For direct calls, it is more efficient to use [Hash.Sum64].
260func (h *Hash) Sum(b []byte) []byte {
261	x := h.Sum64()
262	return append(b,
263		byte(x>>0),
264		byte(x>>8),
265		byte(x>>16),
266		byte(x>>24),
267		byte(x>>32),
268		byte(x>>40),
269		byte(x>>48),
270		byte(x>>56))
271}
272
273// Size returns h's hash value size, 8 bytes.
274func (h *Hash) Size() int { return 8 }
275
276// BlockSize returns h's block size.
277func (h *Hash) BlockSize() int { return len(h.buf) }
278