mirror of https://github.com/tailscale/tailscale/
util/topk: add package containing a probabilistic top-K tracker
This package uses a count-min sketch and a heap to track the top K items in a stream of data. Tracking a new item and adding a count to an existing item both require no memory allocations and is at worst O(log(k)) complexity. Change-Id: I0553381be3fef2470897e2bd806d43396f2dbb36 Signed-off-by: Andrew Dunham <andrew@du.nham.ca>pull/11136/head
parent
7ad2bb87a6
commit
b7104cde4a
@ -0,0 +1,135 @@
|
|||||||
|
// Copyright (c) Tailscale Inc & AUTHORS
|
||||||
|
// SPDX-License-Identifier: BSD-3-Clause
|
||||||
|
|
||||||
|
package topk
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/binary"
|
||||||
|
"fmt"
|
||||||
|
"slices"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestCountMinSketch(t *testing.T) {
|
||||||
|
cms := NewCountMinSketch(4, 10)
|
||||||
|
items := []string{"foo", "bar", "baz", "asdf", "quux"}
|
||||||
|
for _, item := range items {
|
||||||
|
cms.Add([]byte(item))
|
||||||
|
}
|
||||||
|
for _, item := range items {
|
||||||
|
count := cms.Get([]byte(item))
|
||||||
|
if count < 1 {
|
||||||
|
t.Errorf("item %q should have count >= 1", item)
|
||||||
|
} else if count > 1 {
|
||||||
|
t.Logf("item %q has count > 1: %d", item, count)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test that an item that's *not* in the set has a value lower than the
|
||||||
|
// total number of items we inserted (in the case that all items
|
||||||
|
// collided).
|
||||||
|
noItemCount := cms.Get([]byte("doesn't exist"))
|
||||||
|
if noItemCount > uint64(len(items)) {
|
||||||
|
t.Errorf("expected nonexistent item to have value < %d; got %d", len(items), noItemCount)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTopK(t *testing.T) {
|
||||||
|
// This is probabilistic, so we're going to try 10 times to get the
|
||||||
|
// "right" value; the likelihood that we fail on all attempts is
|
||||||
|
// vanishingly small since the number of hash buckets is drastically
|
||||||
|
// larger than the number of items we're inserting.
|
||||||
|
var (
|
||||||
|
got []int
|
||||||
|
want = []int{5, 6, 7, 8, 9}
|
||||||
|
)
|
||||||
|
for try := 0; try < 10; try++ {
|
||||||
|
topk := NewWithParams[int](5, func(in []byte, val int) []byte {
|
||||||
|
return binary.LittleEndian.AppendUint64(in, uint64(val))
|
||||||
|
}, 4, 1000)
|
||||||
|
|
||||||
|
// Add the first 10 integers with counts equal to 2x their value
|
||||||
|
for i := 0; i < 10; i++ {
|
||||||
|
topk.AddN(i, uint64(i*2))
|
||||||
|
}
|
||||||
|
|
||||||
|
got = topk.Top()
|
||||||
|
t.Logf("top K items: %+v", got)
|
||||||
|
slices.Sort(got)
|
||||||
|
|
||||||
|
if slices.Equal(got, want) {
|
||||||
|
// All good!
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// continue and retry or fail
|
||||||
|
}
|
||||||
|
|
||||||
|
t.Errorf("top K mismatch\ngot: %v\nwant: %v", got, want)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestPickParams(t *testing.T) {
|
||||||
|
hashes, buckets := PickParams(
|
||||||
|
0.001, // 0.1% error rate
|
||||||
|
0.001, // 0.1% chance of having an error, or 99.9% chance of not having an error
|
||||||
|
)
|
||||||
|
t.Logf("hashes = %d, buckets = %d", hashes, buckets)
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkCountMinSketch(b *testing.B) {
|
||||||
|
cms := NewCountMinSketch(PickParams(0.001, 0.001))
|
||||||
|
b.ResetTimer()
|
||||||
|
b.ReportAllocs()
|
||||||
|
|
||||||
|
var enc [8]byte
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
binary.LittleEndian.PutUint64(enc[:], uint64(i))
|
||||||
|
cms.Add(enc[:])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkTopK(b *testing.B) {
|
||||||
|
for _, n := range []int{
|
||||||
|
10,
|
||||||
|
128,
|
||||||
|
256,
|
||||||
|
1024,
|
||||||
|
8192,
|
||||||
|
} {
|
||||||
|
b.Run(fmt.Sprintf("Top%d", n), func(b *testing.B) {
|
||||||
|
out := make([]int, 0, n)
|
||||||
|
topk := New[int](n, func(in []byte, val int) []byte {
|
||||||
|
return binary.LittleEndian.AppendUint64(in, uint64(val))
|
||||||
|
})
|
||||||
|
b.ResetTimer()
|
||||||
|
b.ReportAllocs()
|
||||||
|
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
topk.Add(i)
|
||||||
|
}
|
||||||
|
out = topk.AppendTop(out[:0]) // should not allocate
|
||||||
|
_ = out // appease linter
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMultiplyHigh64(t *testing.T) {
|
||||||
|
testCases := []struct {
|
||||||
|
x, y uint64
|
||||||
|
want uint64
|
||||||
|
}{
|
||||||
|
{0, 0, 0},
|
||||||
|
{0xffffffff, 0xffffffff, 0},
|
||||||
|
{0x2, 0xf000000000000000, 1},
|
||||||
|
{0x3, 0xf000000000000000, 2},
|
||||||
|
{0x3, 0xf000000000000001, 2},
|
||||||
|
{0x3, 0xffffffffffffffff, 2},
|
||||||
|
{0xffffffffffffffff, 0xffffffffffffffff, 0xfffffffffffffffe},
|
||||||
|
}
|
||||||
|
for _, tc := range testCases {
|
||||||
|
got := multiplyHigh64(tc.x, tc.y)
|
||||||
|
if got != tc.want {
|
||||||
|
t.Errorf("got multiplyHigh64(%x, %x) = %x, want %x", tc.x, tc.y, got, tc.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue