util/deephash: require pointer in API (#5467)

The entry logic of Hash has extra complexity to ensure that
we always have an addressable value on hand.
If the input is not addressable, we heap-allocate a copy of it.
For this reason, we document that there are performance benefits
to always providing a pointer.
Rather than merely documenting this, enforce it through generics.
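
For illustration, a minimal caller-side sketch of the new requirement; the
Config type, its fields, and hashConfig are hypothetical, and only the
pointer-only signature comes from this change:

package example

import "tailscale.com/util/deephash"

// Config is a hypothetical caller-side type.
type Config struct {
    Name  string
    Addrs []string
}

func hashConfig(cfg *Config) deephash.Sum {
    // Before this change, Hash(v any) accepted any value, and a non-pointer
    // argument forced Hash to heap-allocate an addressable copy internally.
    // With Hash[T any](v *T), the compiler now rejects deephash.Hash(*cfg),
    // and no hidden copy is made.
    return deephash.Hash(cfg)
}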

Also, delete the unused HasherForType function.
It's an interesting use of generics, but not well tested.
We can resurrect it from code history if there's a need for it.

Signed-off-by: Joe Tsai <joetsai@digital-static.net>

@@ -6,7 +6,7 @@
// without looping. The hash is only valid within the lifetime of a program.
// Users should not store the hash on disk or send it over the network.
// The hash is sufficiently strong and unique such that
// Hash(x) == Hash(y) is an appropriate replacement for x == y.
// Hash(&x) == Hash(&y) is an appropriate replacement for x == y.
//
// The definition of equality is identical to reflect.DeepEqual except:
// - Floating-point values are compared based on the raw bits,
@@ -65,6 +65,33 @@ type hasher struct {
visitStack visitStack
}
var hasherPool = &sync.Pool{
New: func() any { return new(hasher) },
}
func (h *hasher) reset() {
if h.Block512.Hash == nil {
h.Block512.Hash = sha256.New()
}
h.Block512.Reset()
}
// hashType hashes a reflect.Type.
// The hash is only consistent within the lifetime of a program.
func (h *hasher) hashType(t reflect.Type) {
// This approach relies on reflect.Type always being backed by a unique
// *reflect.rtype pointer. A safer approach is to use a global sync.Map
// that maps reflect.Type to some arbitrary and unique index.
// While safer, it requires global state with memory that can never be GC'd.
rtypeAddr := reflect.ValueOf(t).Pointer() // address of *reflect.rtype
h.HashUint64(uint64(rtypeAddr))
}
func (h *hasher) sum() (s Sum) {
h.Sum(s.sum[:0])
return s
}
// Sum is an opaque checksum type that is comparable.
type Sum struct {
sum [sha256.Size]byte
@@ -89,97 +116,57 @@ func initSeed() {
seed = uint64(time.Now().UnixNano())
}
func (h *hasher) Reset() {
if h.Block512.Hash == nil {
h.Block512.Hash = sha256.New()
}
h.Block512.Reset()
}
func (h *hasher) sum() (s Sum) {
h.Sum(s.sum[:0])
return s
}
var hasherPool = &sync.Pool{
New: func() any { return new(hasher) },
}
// Hash returns the hash of v.
// For performance, this should be a non-nil pointer.
func Hash(v any) (s Sum) {
func Hash[T any](v *T) Sum {
h := hasherPool.Get().(*hasher)
defer hasherPool.Put(h)
h.Reset()
h.reset()
seedOnce.Do(initSeed)
h.HashUint64(seed)
rv := reflect.ValueOf(v)
if rv.IsValid() {
var t reflect.Type
var p pointer
if rv.Kind() == reflect.Pointer && !rv.IsNil() {
t = rv.Type().Elem()
p = pointerOf(rv)
} else {
t = rv.Type()
va := reflect.New(t).Elem()
va.Set(rv)
p = pointerOf(va.Addr())
}
// Always treat the Hash input as an interface (it is), including hashing
// its type, otherwise two Hash calls of different types could hash to the
// same bytes off the different types and get equivalent Sum values. This is
// the same thing that we do for reflect.Kind Interface in hashValue, but
// the initial reflect.ValueOf from an interface value effectively strips
// the interface box off so we have to do it at the top level by hand.
h.hashType(t)
ti := getTypeInfo(t)
ti.hasher()(h, p)
// Always treat the Hash input as if it were an interface by including
// a hash of the type. This ensures that hashing of two different types
// but with the same value structure produces different hashes.
t := reflect.TypeOf(v).Elem()
h.hashType(t)
if v == nil {
h.HashUint8(0) // indicates nil
} else {
h.HashUint8(1) // indicates visiting pointer element
p := pointerOf(reflect.ValueOf(v))
hash := getTypeInfo(t).hasher()
hash(h, p)
}
return h.sum()
}
// HasherForType is like Hash, but it returns a Hash func that's specialized for
// the provided reflect type, avoiding a map lookup per value.
func HasherForType[T any]() func(T) Sum {
var zeroT T
t := reflect.TypeOf(zeroT)
ti := getTypeInfo(t)
var tiElem *typeInfo
if t.Kind() == reflect.Pointer {
tiElem = getTypeInfo(t.Elem())
}
// HasherForType returns a hash that is specialized for the provided type.
func HasherForType[T any]() func(*T) Sum {
var v *T
seedOnce.Do(initSeed)
return func(v T) (s Sum) {
t := reflect.TypeOf(v).Elem()
hash := getTypeInfo(t).hasher()
return func(v *T) (s Sum) {
// This logic is identical to Hash, but with a few statements hoisted out of the closure so they run only once.
h := hasherPool.Get().(*hasher)
defer hasherPool.Put(h)
h.Reset()
h.reset()
h.HashUint64(seed)
rv := reflect.ValueOf(v)
if rv.IsValid() {
if rv.Kind() == reflect.Pointer && !rv.IsNil() {
p := pointerOf(rv)
h.hashType(t.Elem())
tiElem.hasher()(h, p)
} else {
va := reflect.New(t).Elem()
va.Set(rv)
p := pointerOf(va.Addr())
h.hashType(t)
ti.hasher()(h, p)
}
h.hashType(t)
if v == nil {
h.HashUint8(0) // indicates nil
} else {
h.HashUint8(1) // indicates visiting pointer element
p := pointerOf(reflect.ValueOf(v))
hash(h, p)
}
return h.sum()
}
}
// Update sets last to the hash of v and reports whether its value changed.
func Update(last *Sum, v any) (changed bool) {
func Update[T any](last *Sum, v *T) (changed bool) {
sum := Hash(v)
changed = sum != *last
if changed {
@@ -233,9 +220,9 @@ func genTypeHasher(ti *typeInfo) typeHasherFunc {
// Types with specific hashing.
switch t {
case timeTimeType:
return (*hasher).hashTimev
return hashTime
case netipAddrType:
return (*hasher).hashAddrv
return hashAddr
}
// Types that can have their memory representation directly hashed.
@@ -245,7 +232,7 @@ func genTypeHasher(ti *typeInfo) typeHasherFunc {
switch t.Kind() {
case reflect.String:
return (*hasher).hashString
return hashString
case reflect.Array:
return makeArrayHasher(t)
case reflect.Slice:
@@ -263,14 +250,7 @@ func genTypeHasher(ti *typeInfo) typeHasherFunc {
}
}
func (h *hasher) hashString(p pointer) {
s := *p.asString()
h.HashUint64(uint64(len(s)))
h.HashString(s)
}
// hashTimev hashes v, of kind time.Time.
func (h *hasher) hashTimev(p pointer) {
func hashTime(h *hasher, p pointer) {
// Include the zone offset (but not the name) to keep
// Hash(t1) == Hash(t2) being semantically equivalent to
// t1.Format(time.RFC3339Nano) == t2.Format(time.RFC3339Nano).
@@ -281,8 +261,7 @@ func (h *hasher) hashTimev(p pointer) {
h.HashUint32(uint32(offset))
}
// hashAddrv hashes v, of type netip.Addr.
func (h *hasher) hashAddrv(p pointer) {
func hashAddr(h *hasher, p pointer) {
// The formatting of netip.Addr covers the
// IP version, the address, and the optional zone name (for v6).
// This is equivalent to a1.MarshalBinary() == a2.MarshalBinary().
@@ -304,6 +283,12 @@ func (h *hasher) hashAddrv(p pointer) {
}
}
func hashString(h *hasher, p pointer) {
s := *p.asString()
h.HashUint64(uint64(len(s)))
h.HashString(s)
}
func makeMemHasher(n uintptr) typeHasherFunc {
return func(h *hasher, p pointer) {
h.HashBytes(p.asMemory(n))
@@ -448,7 +433,7 @@ func makeMapHasher(t reflect.Type) typeHasherFunc {
for iter := v.MapRange(); iter.Next(); {
k.SetIterKey(iter)
e.SetIterValue(iter)
mh.h.Reset()
mh.h.reset()
hashKey(&mh.h, pointerOf(k.Addr()))
hashValue(&mh.h, pointerOf(e.Addr()))
mh.sum.xor(mh.h.sum())
@@ -567,14 +552,3 @@ func (c *valueCache) get(t reflect.Type) reflect.Value {
}
return v
}
// hashType hashes a reflect.Type.
// The hash is only consistent within the lifetime of a program.
func (h *hasher) hashType(t reflect.Type) {
// This approach relies on reflect.Type always being backed by a unique
// *reflect.rtype pointer. A safer approach is to use a global sync.Map
// that maps reflect.Type to some arbitrary and unique index.
// While safer, it requires global state with memory that can never be GC'd.
rtypeAddr := reflect.ValueOf(t).Pointer() // address of *reflect.rtype
h.HashUint64(uint64(rtypeAddr))
}
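
As a usage note, here is a minimal caller-side sketch of the generic Update
helper shown above; routeState, routeWatcher, and changed are hypothetical
names:

package example

import "tailscale.com/util/deephash"

type routeState struct{ Routes []string } // hypothetical value being tracked

// routeWatcher caches the last observed hash of a routeState.
type routeWatcher struct {
    last deephash.Sum
}

// changed rehashes s and reports whether it differs from the previous call;
// Update stores Hash(s) into w.last as a side effect.
func (w *routeWatcher) changed(s *routeState) bool {
    return deephash.Update(&w.last, s)
}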

@@ -160,7 +160,7 @@ func TestHash(t *testing.T) {
}
for _, tt := range tests {
gotEq := Hash(tt.in[0]) == Hash(tt.in[1])
gotEq := Hash(&tt.in[0]) == Hash(&tt.in[1])
if gotEq != tt.wantEq {
t.Errorf("(Hash(%T %v) == Hash(%T %v)) = %v, want %v", tt.in[0], tt.in[0], tt.in[1], tt.in[1], gotEq, tt.wantEq)
}
@@ -171,11 +171,11 @@ func TestDeepHash(t *testing.T) {
// v contains the types of values we care about for our current callers.
// Mostly we're just testing that we don't panic on handled types.
v := getVal()
hash1 := Hash(v)
t.Logf("hash: %v", hash1)
for i := 0; i < 20; i++ {
hash2 := Hash(getVal())
v := getVal()
hash2 := Hash(v)
if hash1 != hash2 {
t.Error("second hash didn't match")
}
@@ -186,7 +186,7 @@ func TestDeepHash(t *testing.T) {
func TestIssue4868(t *testing.T) {
m1 := map[int]string{1: "foo"}
m2 := map[int]string{1: "bar"}
if Hash(m1) == Hash(m2) {
if Hash(&m1) == Hash(&m2) {
t.Error("bogus")
}
}
@@ -194,7 +194,7 @@ func TestIssue4868(t *testing.T) {
func TestIssue4871(t *testing.T) {
m1 := map[string]string{"": "", "x": "foo"}
m2 := map[string]string{}
if h1, h2 := Hash(m1), Hash(m2); h1 == h2 {
if h1, h2 := Hash(&m1), Hash(&m2); h1 == h2 {
t.Errorf("bogus: h1=%x, h2=%x", h1, h2)
}
}
@@ -202,7 +202,7 @@ func TestIssue4871(t *testing.T) {
func TestNilVsEmptymap(t *testing.T) {
m1 := map[string]string(nil)
m2 := map[string]string{}
if h1, h2 := Hash(m1), Hash(m2); h1 == h2 {
if h1, h2 := Hash(&m1), Hash(&m2); h1 == h2 {
t.Errorf("bogus: h1=%x, h2=%x", h1, h2)
}
}
@@ -210,7 +210,7 @@ func TestNilVsEmptymap(t *testing.T) {
func TestMapFraming(t *testing.T) {
m1 := map[string]string{"foo": "", "fo": "o"}
m2 := map[string]string{}
if h1, h2 := Hash(m1), Hash(m2); h1 == h2 {
if h1, h2 := Hash(&m1), Hash(&m2); h1 == h2 {
t.Errorf("bogus: h1=%x, h2=%x", h1, h2)
}
}
@@ -218,23 +218,25 @@ func TestMapFraming(t *testing.T) {
func TestQuick(t *testing.T) {
initSeed()
err := quick.Check(func(v, w map[string]string) bool {
return (Hash(v) == Hash(w)) == reflect.DeepEqual(v, w)
return (Hash(&v) == Hash(&w)) == reflect.DeepEqual(v, w)
}, &quick.Config{MaxCount: 1000, Rand: rand.New(rand.NewSource(int64(seed)))})
if err != nil {
t.Fatalf("seed=%v, err=%v", seed, err)
}
}
func getVal() any {
return &struct {
WGConfig *wgcfg.Config
RouterConfig *router.Config
MapFQDNAddrs map[dnsname.FQDN][]netip.Addr
MapFQDNAddrPorts map[dnsname.FQDN][]netip.AddrPort
MapDiscoPublics map[key.DiscoPublic]bool
MapResponse *tailcfg.MapResponse
FilterMatch filter.Match
}{
type tailscaleTypes struct {
WGConfig *wgcfg.Config
RouterConfig *router.Config
MapFQDNAddrs map[dnsname.FQDN][]netip.Addr
MapFQDNAddrPorts map[dnsname.FQDN][]netip.AddrPort
MapDiscoPublics map[key.DiscoPublic]bool
MapResponse *tailcfg.MapResponse
FilterMatch filter.Match
}
func getVal() *tailscaleTypes {
return &tailscaleTypes{
&wgcfg.Config{
Name: "foo",
Addresses: []netip.Prefix{netip.PrefixFrom(netip.AddrFrom16([16]byte{3: 3}).Unmap(), 5)},
@@ -600,23 +602,23 @@ func TestMapCycle(t *testing.T) {
a["self"] = a
b := make(M) // cylic graph of 1 node
b["self"] = b
ha := Hash(a)
hb := Hash(b)
ha := Hash(&a)
hb := Hash(&b)
c.Assert(ha, qt.Equals, hb)
c1 := make(M) // cyclic graph of 2 nodes
c2 := make(M) // cyclic graph of 2 nodes
c1["peer"] = c2
c2["peer"] = c1
hc1 := Hash(c1)
hc2 := Hash(c2)
hc1 := Hash(&c1)
hc2 := Hash(&c2)
c.Assert(hc1, qt.Equals, hc2)
c.Assert(ha, qt.Not(qt.Equals), hc1)
c.Assert(hb, qt.Not(qt.Equals), hc2)
c3 := make(M) // graph of 1 node pointing to cyclic graph of 2 nodes
c3["child"] = c1
hc3 := Hash(c3)
hc3 := Hash(&c3)
c.Assert(hc1, qt.Not(qt.Equals), hc3)
}
@@ -732,9 +734,8 @@ var filterRules = []tailcfg.FilterRule{
func BenchmarkHashPacketFilter(b *testing.B) {
b.ReportAllocs()
hash := HasherForType[*[]tailcfg.FilterRule]()
for i := 0; i < b.N; i++ {
sink = hash(&filterRules)
sink = Hash(&filterRules)
}
}
@@ -815,7 +816,7 @@ func BenchmarkTailcfgNode(b *testing.B) {
func TestExhaustive(t *testing.T) {
seen := make(map[Sum]bool)
for i := 0; i < 100000; i++ {
s := Hash(i)
s := Hash(&i)
if seen[s] {
t.Fatalf("hash collision %v", i)
}
