// Copyright (c) 2020 Tailscale Inc & AUTHORS All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package deephash hashes a Go value recursively, in a predictable order,
// without looping. The hash is only valid within the lifetime of a program.
// Users should not store the hash on disk or send it over the network.
// The hash is sufficiently strong and unique such that
// Hash(x) == Hash(y) is an appropriate replacement for x == y.
//
// The definition of equality is identical to reflect.DeepEqual except:
// - Floating-point values are compared based on the raw bits,
// which means that NaNs (with the same bit pattern) are treated as equal.
// - time.Time are compared based on whether they are the same instant in time
// and also in the same zone offset. Monotonic measurements and zone names
// are ignored as part of the hash.
// - netip.Addr are compared based on a shallow comparison of the struct.
//
// WARNING: This package, like most of the tailscale.com Go module,
// should be considered Tailscale-internal; we make no API promises.
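//
// A minimal usage sketch (Config is an illustrative type, not part of
// this package):
//
//	type Config struct {
//		Name string
//		Port int
//	}
//	a := Config{Name: "x", Port: 80}
//	b := Config{Name: "x", Port: 80}
//	_ = Hash(&a) == Hash(&b) // true: deeply equal values get equal Sums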
package deephash

import (
"crypto/sha256"
"encoding/binary"
"encoding/hex"
"reflect"
"sync"
"time"
"tailscale.com/util/hashx"
)
// There is much overlap between the theory of serialization and hashing.
// A hash (useful for determining equality) can be produced by printing a value
// and hashing the output. The format must:
// * be deterministic such that the same value hashes to the same output, and
// * be parsable such that the same value can be reproduced by the output.
//
// The logic below hashes a value by printing it to a hash.Hash.
// To be parsable, it assumes that we know the Go type of each value:
// * scalar types (e.g., bool or int32) are printed as fixed-width fields.
// * list types (e.g., strings, slices, and AppendTo buffers) are prefixed
// by a fixed-width length field, followed by the contents of the list.
// * slices, arrays, and structs print each element/field consecutively.
// * interfaces print with a 1-byte prefix indicating whether it is nil.
// If non-nil, it is followed by a fixed-width field of the type index,
// followed by the format of the underlying value.
// * pointers print with a 1-byte prefix indicating whether the pointer is
// 1) nil, 2) previously seen, or 3) newly seen. Previously seen pointers are
// followed by a fixed-width field with the index of the previous pointer.
// Newly seen pointers are followed by the format of the underlying value.
// * maps print with a 1-byte prefix indicating whether the map pointer is
// 1) nil, 2) previously seen, or 3) newly seen. Previously seen pointers
// are followed by a fixed-width field of the index of the previous pointer.
// Newly seen maps are printed as a fixed-width field with the XOR of the
// hash of every map entry. With a sufficiently strong hash, this value is
// theoretically "parsable" by looking up the hash in a magical map that
// returns the set of entries for that given hash.
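//
// As an illustrative sketch (field widths and byte order are those of
// hashx.Block512, not shown here), a struct{ N uint32; S string } value
// {N: 1, S: "hi"} would print roughly as:
//
//	uint32(1)   // fixed-width scalar field N
//	uint64(2)   // fixed-width length prefix for S
//	"hi"        // contents of S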

const scratchSize = 128
// hasher is reusable state for hashing a value.
// Get one via hasherPool.
type hasher struct {
hashx.Block512
scratch [scratchSize]byte
visitStack visitStack
}
// Sum is an opaque checksum type that is comparable.
type Sum struct {
sum [sha256.Size]byte
}
func (s1 *Sum) xor(s2 Sum) {
for i := 0; i < sha256.Size; i++ {
s1.sum[i] ^= s2.sum[i]
}
}
func (s Sum) String() string {
return hex.EncodeToString(s.sum[:])
}
var (
seedOnce sync.Once
seed uint64
)
func initSeed() {
seed = uint64(time.Now().UnixNano())
}
func (h *hasher) Reset() {
if h.Block512.Hash == nil {
h.Block512.Hash = sha256.New()
}
h.Block512.Reset()
}
func (h *hasher) sum() (s Sum) {
h.Sum(s.sum[:0])
return s
}
var hasherPool = &sync.Pool{
New: func() any { return new(hasher) },
}
// Hash returns the hash of v.
// For performance, v should be a non-nil pointer.
func Hash(v any) (s Sum) {
h := hasherPool.Get().(*hasher)
defer hasherPool.Put(h)
h.Reset()
seedOnce.Do(initSeed)
h.HashUint64(seed)
rv := reflect.ValueOf(v)
if rv.IsValid() {
var t reflect.Type
var p pointer
if rv.Kind() == reflect.Pointer && !rv.IsNil() {
t = rv.Type().Elem()
p = pointerOf(rv)
} else {
t = rv.Type()
va := reflect.New(t).Elem()
va.Set(rv)
p = pointerOf(va.Addr())
}
// Always treat the Hash input as an interface (it is), including hashing
// its type, otherwise two Hash calls with different types but identical
// memory representations could produce the same bytes and thus equal Sum
// values. This is the same thing that the reflect.Interface case in
// genTypeHasher does, but the initial reflect.ValueOf from an interface
// value effectively strips the interface box off, so we have to do it at
// the top level by hand.
h.hashType(t)
ti := getTypeInfo(t)
ti.hasher()(h, p)
}
return h.sum()
}
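
// An illustrative call (v is any caller-defined hashable value; passing
// a pointer lets Hash use the value's memory directly, whereas a
// non-pointer value is first copied into fresh addressable memory):
//
//	sum := Hash(&v)
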
// HasherForType is like Hash, but it returns a Hash func that's specialized for
// the provided reflect type, avoiding a map lookup per value.
func HasherForType[T any]() func(T) Sum {
var zeroT T
t := reflect.TypeOf(zeroT)
ti := getTypeInfo(t)
var tiElem *typeInfo
if t.Kind() == reflect.Pointer {
tiElem = getTypeInfo(t.Elem())
}
seedOnce.Do(initSeed)
return func(v T) (s Sum) {
h := hasherPool.Get().(*hasher)
defer hasherPool.Put(h)
h.Reset()
h.HashUint64(seed)
rv := reflect.ValueOf(v)
if rv.IsValid() {
if rv.Kind() == reflect.Pointer && !rv.IsNil() {
p := pointerOf(rv)
h.hashType(t.Elem())
tiElem.hasher()(h, p)
} else {
va := reflect.New(t).Elem()
va.Set(rv)
p := pointerOf(va.Addr())
h.hashType(t)
ti.hasher()(h, p)
}
}
return h.sum()
}
}
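
// An illustrative use, hoisting the per-value type lookup out of a hot
// loop (Config, configs, and sums are hypothetical):
//
//	hashConfig := HasherForType[*Config]()
//	for _, c := range configs {
//		sums = append(sums, hashConfig(c))
//	}
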
// Update sets *last to the hash of v and reports whether the hash changed.
func Update(last *Sum, v any) (changed bool) {
sum := Hash(v)
changed = sum != *last
if changed {
*last = sum
}
return changed
}
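
// An illustrative change-detection loop (poll and onChange are
// hypothetical):
//
//	var last Sum
//	for {
//		state := poll()
//		if Update(&last, &state) {
//			onChange(&state)
//		}
//	}
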
// typeInfo describes properties of a type.
//
// A non-nil typeInfo is populated into typeInfoMap
// when its type is first requested, before its hasher func is created.
// Its hashFuncLazy field is only populated once that func has been built.
// This is needed for recursive types.
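//
// For example, a recursive type such as
//
//	type Node struct {
//		Next *Node
//	}
//
// needs Node's typeInfo to be findable while the hasher funcs for its
// fields (here *Node) are still being built.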
type typeInfo struct {
rtype reflect.Type
isRecursive bool
// elemTypeInfo is the element type's typeInfo.
// It's set when rtype is of Kind Ptr, Slice, Array, Map.
elemTypeInfo *typeInfo
// keyTypeInfo is the map key type's typeInfo.
// It's set when rtype is of Kind Map.
keyTypeInfo *typeInfo
hashFuncOnce sync.Once
hashFuncLazy typeHasherFunc // nil until created
}
// typeHasherFunc hashes the value pointed at by p for a given type.
// For example, if t is a bool, then p is a *bool.
// The provided pointer must always be non-nil.
type typeHasherFunc func(h *hasher, p pointer)
var typeInfoMap sync.Map // map[reflect.Type]*typeInfo
var typeInfoMapPopulate sync.Mutex // just for adding to typeInfoMap
func (ti *typeInfo) hasher() typeHasherFunc {
ti.hashFuncOnce.Do(ti.buildHashFuncOnce)
return ti.hashFuncLazy
}
func (ti *typeInfo) buildHashFuncOnce() {
ti.hashFuncLazy = genTypeHasher(ti)
}
// fieldInfo describes a struct field.
type fieldInfo struct {
index int // index of field for reflect.Value.Field(n); -1 if invalid
typeInfo *typeInfo
canMemHash bool
offset uintptr // when we can memhash the field
size uintptr // when we can memhash the field
}
// mergeContiguousFieldsCopy returns a copy of in with contiguous memhashable
// fields merged together. A merged span gets a bogus index of -1 and a nil
// typeInfo.
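//
// For example, given fields laid out as
//
//	A uint32 // offset 0, memhashable
//	B uint32 // offset 4, memhashable
//	S string // not memhashable
//
// A and B merge into one 8-byte span hashed by a single HashBytes call,
// while S keeps its own typeInfo-driven hasher. (Illustrative layout;
// real offsets come from reflect.)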
func mergeContiguousFieldsCopy(in []fieldInfo) []fieldInfo {
ret := make([]fieldInfo, 0, len(in))
var last *fieldInfo
for _, f := range in {
// Combine two fields if they're both contiguous & memhash-able.
if f.canMemHash && last != nil && last.canMemHash && last.offset+last.size == f.offset {
last.size += f.size
last.index = -1
last.typeInfo = nil
} else {
ret = append(ret, f)
last = &ret[len(ret)-1]
}
}
return ret
}
// genHashStructFields generates a typeHasherFunc for t, which must be of kind Struct.
func genHashStructFields(t reflect.Type) typeHasherFunc {
fields := make([]fieldInfo, 0, t.NumField())
for i, n := 0, t.NumField(); i < n; i++ {
sf := t.Field(i)
if sf.Type.Size() == 0 {
continue
}
fields = append(fields, fieldInfo{
index: i,
typeInfo: getTypeInfo(sf.Type),
canMemHash: typeIsMemHashable(sf.Type),
offset: sf.Offset,
size: sf.Type.Size(),
})
}
fields = mergeContiguousFieldsCopy(fields)
return structHasher{fields}.hash
}
type structHasher struct {
fields []fieldInfo
}
func (sh structHasher) hash(h *hasher, p pointer) {
for _, f := range sh.fields {
pf := p.structField(f.index, f.offset, f.size)
if f.canMemHash {
h.HashBytes(pf.asMemory(f.size))
} else {
f.typeInfo.hasher()(h, pf)
}
}
}
func genTypeHasher(ti *typeInfo) typeHasherFunc {
t := ti.rtype
// Types with specific hashing.
switch t {
case timeTimeType:
return (*hasher).hashTimev
case netipAddrType:
return (*hasher).hashAddrv
}
// Types that can have their memory representation directly hashed.
if typeIsMemHashable(t) {
return makeMemHasher(t.Size())
}
switch t.Kind() {
case reflect.String:
return (*hasher).hashString
case reflect.Slice:
et := t.Elem()
if typeIsMemHashable(et) {
return func(h *hasher, p pointer) {
pa := p.sliceArray()
vLen := p.sliceLen()
h.HashUint64(uint64(vLen))
if vLen == 0 {
return
}
h.HashBytes(pa.asMemory(et.Size() * uintptr(vLen)))
}
}
eti := getTypeInfo(et)
return genHashSliceElements(eti)
case reflect.Array:
et := t.Elem()
eti := getTypeInfo(et)
return genHashArray(t, eti)
case reflect.Struct:
return genHashStructFields(t)
case reflect.Map:
return func(h *hasher, p pointer) {
v := p.asValue(t).Elem() // reflect.Map kind
if v.IsNil() {
h.HashUint8(0) // indicates nil
return
}
if ti.isRecursive {
pm := v.UnsafePointer() // underlying pointer of map
if idx, ok := h.visitStack.seen(pm); ok {
h.HashUint8(2) // indicates cycle
h.HashUint64(uint64(idx))
return
}
h.visitStack.push(pm)
defer h.visitStack.pop(pm)
}
h.HashUint8(1) // indicates visiting a map
h.hashMap(v, ti)
}
case reflect.Pointer:
et := t.Elem()
eti := getTypeInfo(et)
return func(h *hasher, p pointer) {
pe := p.pointerElem()
if pe.isNil() {
h.HashUint8(0) // indicates nil
return
}
if ti.isRecursive {
if idx, ok := h.visitStack.seen(pe.p); ok {
h.HashUint8(2) // indicates cycle
h.HashUint64(uint64(idx))
return
}
h.visitStack.push(pe.p)
defer h.visitStack.pop(pe.p)
}
h.HashUint8(1) // indicates visiting a pointer
eti.hasher()(h, pe)
}
case reflect.Interface:
return func(h *hasher, p pointer) {
v := p.asValue(t).Elem() // reflect.Interface kind
if v.IsNil() {
h.HashUint8(0) // indicates nil
return
}
h.HashUint8(1) // visiting interface
v = v.Elem()
t := v.Type()
h.hashType(t)
va := reflect.New(t).Elem()
va.Set(v)
ti := getTypeInfo(t)
ti.hasher()(h, pointerOf(va.Addr()))
}
default: // Func, Chan, UnsafePointer
return func(*hasher, pointer) {}
}
}
func (h *hasher) hashString(p pointer) {
s := *p.asString()
h.HashUint64(uint64(len(s)))
h.HashString(s)
}
// hashTimev hashes the time.Time pointed to by p.
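//
// For example, two times at the same instant with the same UTC offset
// hash identically even if their zone names differ (illustrative):
//
//	t1 := time.Date(2020, 1, 1, 0, 0, 0, 0, time.FixedZone("A", 3600))
//	t2 := time.Date(2020, 1, 1, 0, 0, 0, 0, time.FixedZone("B", 3600))
//	_ = Hash(&t1) == Hash(&t2) // true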
func (h *hasher) hashTimev(p pointer) {
// Include the zone offset (but not the name) to keep
// Hash(t1) == Hash(t2) being semantically equivalent to
// t1.Format(time.RFC3339Nano) == t2.Format(time.RFC3339Nano).
t := *p.asTime()
_, offset := t.Zone()
h.HashUint64(uint64(t.Unix()))
h.HashUint32(uint32(t.Nanosecond()))
h.HashUint32(uint32(offset))
}
// hashAddrv hashes the netip.Addr pointed to by p.
func (h *hasher) hashAddrv(p pointer) {
// The formatting of netip.Addr covers the
// IP version, the address, and the optional zone name (for v6).
// This is equivalent to a1.MarshalBinary() == a2.MarshalBinary().
ip := *p.asAddr()
switch {
case !ip.IsValid():
h.HashUint64(0)
case ip.Is4():
b := ip.As4()
h.HashUint64(4)
h.HashUint32(binary.LittleEndian.Uint32(b[:]))
case ip.Is6():
b := ip.As16()
z := ip.Zone()
h.HashUint64(16 + uint64(len(z)))
h.HashUint64(binary.LittleEndian.Uint64(b[:8]))
h.HashUint64(binary.LittleEndian.Uint64(b[8:]))
h.HashString(z)
}
}
func makeMemHasher(n uintptr) typeHasherFunc {
return func(h *hasher, p pointer) {
h.HashBytes(p.asMemory(n))
}
}
func genHashArrayElements(n int, eti *typeInfo) typeHasherFunc {
nb := eti.rtype.Size() // byte size of each array element
return func(h *hasher, p pointer) {
for i := 0; i < n; i++ {
pe := p.arrayIndex(i, nb)
eti.hasher()(h, pe)
}
}
}
func genHashArray(t reflect.Type, eti *typeInfo) typeHasherFunc {
n := t.Len()
return genHashArrayElements(n, eti)
}
func genHashSliceElements(eti *typeInfo) typeHasherFunc {
return sliceElementHasher{eti}.hash
}
type sliceElementHasher struct {
eti *typeInfo
}
func (seh sliceElementHasher) hash(h *hasher, p pointer) {
pa := p.sliceArray()
vLen := p.sliceLen()
h.HashUint64(uint64(vLen))
nb := seh.eti.rtype.Size()
for i := 0; i < vLen; i++ {
pe := pa.arrayIndex(i, nb)
seh.eti.hasher()(h, pe)
}
}
func getTypeInfo(t reflect.Type) *typeInfo {
if f, ok := typeInfoMap.Load(t); ok {
return f.(*typeInfo)
}
typeInfoMapPopulate.Lock()
defer typeInfoMapPopulate.Unlock()
newTypes := map[reflect.Type]*typeInfo{}
ti := getTypeInfoLocked(t, newTypes)
for t, ti := range newTypes {
typeInfoMap.Store(t, ti)
}
return ti
}
func getTypeInfoLocked(t reflect.Type, incomplete map[reflect.Type]*typeInfo) *typeInfo {
if v, ok := typeInfoMap.Load(t); ok {
return v.(*typeInfo)
}
if ti, ok := incomplete[t]; ok {
return ti
}
ti := &typeInfo{
rtype: t,
isRecursive: typeIsRecursive(t),
}
incomplete[t] = ti
switch t.Kind() {
case reflect.Map:
ti.keyTypeInfo = getTypeInfoLocked(t.Key(), incomplete)
fallthrough
case reflect.Ptr, reflect.Slice, reflect.Array:
ti.elemTypeInfo = getTypeInfoLocked(t.Elem(), incomplete)
}
return ti
}
type mapHasher struct {
h hasher
valKey, valElem valueCache // re-usable values for map iteration
}
var mapHasherPool = &sync.Pool{
New: func() any { return new(mapHasher) },
}
type valueCache map[reflect.Type]reflect.Value
func (c *valueCache) get(t reflect.Type) reflect.Value {
v, ok := (*c)[t]
if !ok {
v = reflect.New(t).Elem()
if *c == nil {
*c = make(valueCache)
}
(*c)[t] = v
}
return v
}
// hashMap hashes a map in a sort-free manner.
// It relies on a map being functionally an unordered set of KV entries.
// So long as we hash each KV entry together, we can XOR all
// of the individual hashes to produce a unique hash for the entire map.
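//
// For example, with two entries the map hash is
//
//	H(k1,v1) XOR H(k2,v2) == H(k2,v2) XOR H(k1,v1)
//
// so iteration order cannot affect the result.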
func (h *hasher) hashMap(v reflect.Value, ti *typeInfo) {
mh := mapHasherPool.Get().(*mapHasher)
defer mapHasherPool.Put(mh)
var sum Sum
if v.IsNil() {
sum.sum[0] = 1 // something non-zero
}
k := mh.valKey.get(v.Type().Key())
e := mh.valElem.get(v.Type().Elem())
mh.h.visitStack = h.visitStack // always use the parent's visit stack to avoid cycles
for iter := v.MapRange(); iter.Next(); {
k.SetIterKey(iter)
e.SetIterValue(iter)
mh.h.Reset()
ti.keyTypeInfo.hasher()(&mh.h, pointerOf(k.Addr()))
ti.elemTypeInfo.hasher()(&mh.h, pointerOf(e.Addr()))
sum.xor(mh.h.sum())
}
h.HashBytes(append(h.scratch[:0], sum.sum[:]...)) // append into scratch to avoid heap allocation
}
// hashType hashes a reflect.Type.
// The hash is only consistent within the lifetime of a program.
func (h *hasher) hashType(t reflect.Type) {
// This approach relies on reflect.Type always being backed by a unique
// *reflect.rtype pointer. A safer approach is to use a global sync.Map
// that maps reflect.Type to some arbitrary and unique index.
// While safer, it requires global state with memory that can never be GC'd.
rtypeAddr := reflect.ValueOf(t).Pointer() // address of *reflect.rtype
h.HashUint64(uint64(rtypeAddr))
}