From a7c910e36118344894cf94ca30ce114a2293b2ca Mon Sep 17 00:00:00 2001
From: David Anderson
Date: Tue, 4 Apr 2023 09:00:51 -0700
Subject: [PATCH] net/art: implement the Table type, a multi-level art route table.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Updates #7781

                            │   sec/op    │
TableInsertion/ipv4/10        1.562µ ±   2%
TableInsertion/ipv4/100       2.398µ ±   5%
TableInsertion/ipv4/1000      2.097µ ±   3%
TableInsertion/ipv4/10000     2.756µ ±   4%
TableInsertion/ipv4/100000    2.473µ ±  13%
TableInsertion/ipv6/10        7.649µ ±   2%
TableInsertion/ipv6/100       12.09µ ±   3%
TableInsertion/ipv6/1000      14.84µ ±   5%
TableInsertion/ipv6/10000     14.72µ ±   8%
TableInsertion/ipv6/100000    13.23µ ±  41%
TableDelete/ipv4/10           378.4n ±   5%
TableDelete/ipv4/100          366.9n ±   3%
TableDelete/ipv4/1000         418.6n ±   3%
TableDelete/ipv4/10000        609.2n ±  11%
TableDelete/ipv4/100000       679.2n ±  28%
TableDelete/ipv6/10           504.2n ±   4%
TableDelete/ipv6/100          959.5n ±  12%
TableDelete/ipv6/1000         1.436µ ±   6%
TableDelete/ipv6/10000        1.772µ ±  15%
TableDelete/ipv6/100000       1.172µ ± 113%
TableGet/ipv4/10              32.14n ±  11%
TableGet/ipv4/100             38.58n ±   2%
TableGet/ipv4/1000            45.03n ±   2%
TableGet/ipv4/10000           52.90n ±   7%
TableGet/ipv4/100000          135.2n ±  11%
TableGet/ipv6/10              41.55n ±   1%
TableGet/ipv6/100             44.78n ±   2%
TableGet/ipv6/1000            49.03n ±   2%
TableGet/ipv6/10000           65.38n ±   5%
TableGet/ipv6/100000          525.0n ±  39%

                            │  avg-B/op   │
TableInsertion/ipv4/10        25.18Ki ± 0%
TableInsertion/ipv4/100       17.63Ki ± 0%
TableInsertion/ipv4/1000      14.14Ki ± 0%
TableInsertion/ipv4/10000     12.92Ki ± 0%
TableInsertion/ipv4/100000    11.13Ki ± 0%
TableInsertion/ipv6/10        76.87Ki ± 0%
TableInsertion/ipv6/100       98.33Ki ± 0%
TableInsertion/ipv6/1000      91.44Ki ± 0%
TableInsertion/ipv6/10000     90.39Ki ± 0%
TableInsertion/ipv6/100000    87.19Ki ± 0%
TableDelete/ipv4/10           3.230 ± 0%
TableDelete/ipv4/100          4.020 ± 0%
TableDelete/ipv4/1000         3.990 ± 0%
TableDelete/ipv4/10000        4.000 ± 0%
TableDelete/ipv4/100000       4.000 ± 0%
TableDelete/ipv6/10           16.00 ± 0%
TableDelete/ipv6/100          16.00 ± 0%
TableDelete/ipv6/1000         16.00 ± 0%
TableDelete/ipv6/10000        16.00 ± 0%
TableDelete/ipv6/100000       16.00 ± 0%

                            │ avg-allocs/op │
TableInsertion/ipv4/10        2.900 ± 0%
TableInsertion/ipv4/100       2.330 ± 0%
TableInsertion/ipv4/1000      2.070 ± 0%
TableInsertion/ipv4/10000     1.980 ± 0%
TableInsertion/ipv4/100000    1.840 ± 0%
TableInsertion/ipv6/10        6.800 ± 0%
TableInsertion/ipv6/100       8.420 ± 0%
TableInsertion/ipv6/1000      7.900 ± 0%
TableInsertion/ipv6/10000     7.820 ± 0%
TableInsertion/ipv6/100000    7.580 ± 0%
TableDelete/ipv4/10           1.000 ± 0%
TableDelete/ipv4/100          1.000 ± 0%
TableDelete/ipv4/1000         1.000 ± 0%
TableDelete/ipv4/10000        1.000 ± 0%
TableDelete/ipv4/100000       1.000 ± 0%
TableDelete/ipv6/10           1.000 ± 0%
TableDelete/ipv6/100          1.000 ± 0%
TableDelete/ipv6/1000         1.000 ± 0%
TableDelete/ipv6/10000        1.000 ± 0%
TableDelete/ipv6/100000       1.000 ± 0%

                            │  routes/s   │
TableInsertion/ipv4/10        640.3k ±  2%
TableInsertion/ipv4/100       417.1k ±  5%
TableInsertion/ipv4/1000      477.0k ±  3%
TableInsertion/ipv4/10000     362.8k ±  5%
TableInsertion/ipv4/100000    404.5k ± 15%
TableInsertion/ipv6/10        130.7k ±  1%
TableInsertion/ipv6/100       82.69k ±  3%
TableInsertion/ipv6/1000      67.37k ±  5%
TableInsertion/ipv6/10000     67.93k ±  9%
TableInsertion/ipv6/100000    75.63k ± 29%
TableDelete/ipv4/10           2.642M ±  6%
TableDelete/ipv4/100          2.726M ±  3%
TableDelete/ipv4/1000         2.389M ±  3%
TableDelete/ipv4/10000        1.641M ± 12%
TableDelete/ipv4/100000       1.472M ± 27%
TableDelete/ipv6/10           1.984M ±  4%
TableDelete/ipv6/100          1.042M ± 11%
TableDelete/ipv6/1000         696.5k ±  6%
TableDelete/ipv6/10000        564.4k ± 13%
TableDelete/ipv6/100000       853.6k ± 53%

                            │   addrs/s   │
TableGet/ipv4/10              31.11M ± 10%
TableGet/ipv4/100             25.92M ±  2%
TableGet/ipv4/1000            22.21M ±  2%
TableGet/ipv4/10000           18.91M ±  8%
TableGet/ipv4/100000          7.397M ± 12%
TableGet/ipv6/10              24.07M ±  1%
TableGet/ipv6/100             22.33M ±  2%
TableGet/ipv6/1000            20.40M ±  2%
TableGet/ipv6/10000           15.30M ±  5%
TableGet/ipv6/100000          1.905M ± 28%

                            │    B/op     │
TableGet/ipv4/10              4.000 ± 0%
TableGet/ipv4/100             4.000 ± 0%
TableGet/ipv4/1000            4.000 ± 0%
TableGet/ipv4/10000           4.000 ± 0%
TableGet/ipv4/100000          4.000 ± 0%
TableGet/ipv6/10              16.00 ± 0%
TableGet/ipv6/100             16.00 ± 0%
TableGet/ipv6/1000            16.00 ± 0%
TableGet/ipv6/10000           16.00 ± 0%
TableGet/ipv6/100000          16.00 ± 0%

                            │  allocs/op  │
TableGet/ipv4/10              1.000 ± 0%
TableGet/ipv4/100             1.000 ± 0%
TableGet/ipv4/1000            1.000 ± 0%
TableGet/ipv4/10000           1.000 ± 0%
TableGet/ipv4/100000          1.000 ± 0%
TableGet/ipv6/10              1.000 ± 0%
TableGet/ipv6/100             1.000 ± 0%
TableGet/ipv6/1000            1.000 ± 0%
TableGet/ipv6/10000           1.000 ± 0%
TableGet/ipv6/100000          1.000 ± 0%

Signed-off-by: David Anderson
---
 net/art/stride_table.go      |   5 +
 net/art/stride_table_test.go |  18 +-
 net/art/table.go             | 149 ++++++++++
 net/art/table_test.go        | 542 +++++++++++++++++++++++++++++++++++
 4 files changed, 708 insertions(+), 6 deletions(-)
 create mode 100644 net/art/table_test.go

diff --git a/net/art/stride_table.go b/net/art/stride_table.go
index 99a5731ea..f8bdb20c5 100644
--- a/net/art/stride_table.go
+++ b/net/art/stride_table.go
@@ -82,6 +82,11 @@ func (t *strideTable[T]) getOrCreateChild(addr uint8) *strideTable[T] {
 	return t.entries[idx].child
 }
 
+func (t *strideTable[T]) getValAndChild(addr uint8) (*T, *strideTable[T]) {
+	idx := hostIndex(addr)
+	return t.entries[idx].value, t.entries[idx].child
+}
+
 // allot updates entries whose stored prefixIndex matches oldPrefixIndex, in the
 // subtree rooted at idx. Matching entries have their stored prefixIndex set to
 // newPrefixIndex, and their value set to val.
diff --git a/net/art/stride_table_test.go b/net/art/stride_table_test.go
index 03fb518ac..dec39cb7a 100644
--- a/net/art/stride_table_test.go
+++ b/net/art/stride_table_test.go
@@ -16,6 +16,7 @@ import (
 )
 
 func TestInversePrefix(t *testing.T) {
+	t.Parallel()
 	for i := 0; i < 256; i++ {
 		for len := 0; len < 9; len++ {
 			addr := i & (0xFF << (8 - len))
@@ -29,6 +30,7 @@ func TestInversePrefix(t *testing.T) {
 }
 
 func TestHostIndex(t *testing.T) {
+	t.Parallel()
 	for i := 0; i < 256; i++ {
 		got := hostIndex(uint8(i))
 		want := prefixIndex(uint8(i), 8)
@@ -39,6 +41,7 @@ func TestHostIndex(t *testing.T) {
 }
 
 func TestStrideTableInsert(t *testing.T) {
+	t.Parallel()
 	// Verify that strideTable's lookup results after a bunch of inserts exactly
 	// match those of a naive implementation that just scans all prefixes on
 	// every lookup. The naive implementation is very slow, but its behavior is
@@ -66,6 +69,7 @@ func TestStrideTableInsert(t *testing.T) {
 }
 
 func TestStrideTableInsertShuffled(t *testing.T) {
+	t.Parallel()
 	// The order in which routes are inserted into a route table does not
 	// influence the final shape of the table, as long as the same set of
 	// prefixes is being inserted. This test verifies that strideTable behaves
@@ -111,6 +115,7 @@ func TestStrideTableInsertShuffled(t *testing.T) {
 }
 
 func TestStrideTableDelete(t *testing.T) {
+	t.Parallel()
 	// Compare route deletion to our reference slowTable.
 	pfxs := shufflePrefixes(allPrefixes())[:100]
 	slow := slowTable[int]{pfxs}
@@ -145,6 +150,7 @@ func TestStrideTableDelete(t *testing.T) {
 }
 
 func TestStrideTableDeleteShuffle(t *testing.T) {
+	t.Parallel()
 	// Same as TestStrideTableInsertShuffle, the order in which prefixes are
 	// deleted should not impact the final shape of the route table.
 
@@ -191,17 +197,17 @@ func TestStrideTableDeleteShuffle(t *testing.T) {
 	}
 }
 
-var benchRouteCount = []int{10, 50, 100, 200}
+var strideRouteCount = []int{10, 50, 100, 200}
 
 // forCountAndOrdering runs the benchmark fn with different sets of routes.
 //
 // fn is called once for each combination of {num_routes, order}, where
-// num_routes is the values in benchRouteCount, and order is the order of the
+// num_routes is one of the values in strideRouteCount, and order is the order of the
 // routes in the list: random, largest prefix first (/0 to /8), and smallest
 // prefix first (/8 to /0).
-func forCountAndOrdering(b *testing.B, fn func(b *testing.B, routes []slowEntry[int])) {
+func forStrideCountAndOrdering(b *testing.B, fn func(b *testing.B, routes []slowEntry[int])) {
 	routes := shufflePrefixes(allPrefixes())
-	for _, nroutes := range benchRouteCount {
+	for _, nroutes := range strideRouteCount {
 		b.Run(fmt.Sprint(nroutes), func(b *testing.B) {
 			routes := append([]slowEntry[int](nil), routes[:nroutes]...)
 			b.Run("random_order", func(b *testing.B) {
@@ -233,7 +239,7 @@ func TestStrideTableDeleteShuffle(t *testing.T) {
 }
 
 func BenchmarkStrideTableInsertion(b *testing.B) {
-	forCountAndOrdering(b, func(b *testing.B, routes []slowEntry[int]) {
+	forStrideCountAndOrdering(b, func(b *testing.B, routes []slowEntry[int]) {
 		val := 0
 		for i := 0; i < b.N; i++ {
 			var rt strideTable[int]
@@ -250,7 +256,7 @@ func BenchmarkStrideTableInsertion(b *testing.B) {
 }
 
 func BenchmarkStrideTableDeletion(b *testing.B) {
-	forCountAndOrdering(b, func(b *testing.B, routes []slowEntry[int]) {
+	forStrideCountAndOrdering(b, func(b *testing.B, routes []slowEntry[int]) {
 		val := 0
 		var rt strideTable[int]
 		for _, route := range routes {
diff --git a/net/art/table.go b/net/art/table.go
index 1d49f1566..90ae60f82 100644
--- a/net/art/table.go
+++ b/net/art/table.go
@@ -11,3 +11,152 @@
 // For more information, see Yoichi Hariguchi's paper:
 // https://cseweb.ucsd.edu//~varghese/TEACH/cs228/artlookup.pdf
 package art
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"net/netip"
+	"strings"
+)
+
+// Table is an IPv4 and IPv6 routing table.
+type Table[T any] struct {
+	v4 strideTable[T]
+	v6 strideTable[T]
+}
+
+// Get does a route lookup for addr and returns the associated value, or nil if
+// no route matched.
+func (t *Table[T]) Get(addr netip.Addr) *T {
+	st := &t.v4
+	if addr.Is6() {
+		st = &t.v6
+	}
+
+	var ret *T
+	for _, stride := range addr.AsSlice() {
+		rt, child := st.getValAndChild(stride)
+		if rt != nil {
+			// Found a more specific route than whatever we found previously;
+			// keep a note.
+			ret = rt
+		}
+		if child == nil {
+			// No sub-routes further down; whatever we have recorded in ret is
+			// the result.
+			return ret
+		}
+		st = child
+	}
+
+	// Unreachable because Insert/Delete won't allow the leaf strideTables to
+	// have children, so we must return via the nil check in the loop.
+	panic("unreachable")
+}
+
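+// Illustrative example of the walk above: with 10.0.0.0/8 and 10.1.0.0/16 in
+// the table, Get(10.1.2.3) examines one 8-bit stride at a time. The root
+// strideTable yields the /8's value and a child table for the 10.x.x.x space;
+// that child yields the /16's value and has no children of its own, so the
+// lookup stops there and returns the /16's value, the longest matching prefix.
+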
+// Insert adds pfx to the table, with value val.
+// If pfx is already present in the table, its value is set to val.
+func (t *Table[T]) Insert(pfx netip.Prefix, val *T) {
+	if val == nil {
+		panic("Table.Insert called with nil value")
+	}
+	st := &t.v4
+	if pfx.Addr().Is6() {
+		st = &t.v6
+	}
+	bs := pfx.Addr().AsSlice()
+	i := 0
+	numBits := pfx.Bits()
+
+	// The strideTable we want to insert into is potentially at the end of a
+	// chain of parent tables, each one encoding successive 8 bits of the
+	// prefix. Navigate downwards, allocating child tables as needed, until we
+	// find the one this prefix belongs in.
+	for numBits > 8 {
+		st = st.getOrCreateChild(bs[i])
+		i++
+		numBits -= 8
+	}
+	// Finally, insert the remaining 0-8 bits of the prefix into the child
+	// table.
+	st.insert(bs[i], numBits, val)
+}
+
+// Delete removes pfx from the table, if it is present.
+func (t *Table[T]) Delete(pfx netip.Prefix) {
+	st := &t.v4
+	if pfx.Addr().Is6() {
+		st = &t.v6
+	}
+	bs := pfx.Addr().AsSlice()
+	i := 0
+	numBits := pfx.Bits()
+
+	// Deletion may drive the refcount of some strideTables down to zero. We
+	// need to clean up these dangling tables, so we have to keep track of
+	// which tables we touch on the way down, and which strideEntry index each
+	// child is registered in.
+	strideTables := [16]*strideTable[T]{st}
+	var strideIndexes [16]int
+
+	// Similar to Insert, navigate down the tree of strideTables, looking for
+	// the one that houses the last 0-8 bits of the prefix to delete.
+	//
+	// The only difference is that here, we don't create missing child tables.
+	// If a child necessary to pfx is missing, then the pfx cannot exist in the
+	// Table, and we can exit early.
+	for numBits > 8 {
+		child, idx := st.getChild(bs[i])
+		if child == nil {
+			// Prefix can't exist in the table, one of the necessary
+			// strideTables doesn't exist.
+			return
+		}
+		// Note that the strideIndexes and strideTables entries are off by one.
+		// The child table pointer is recorded at i+1, but it is referenced by a
+		// particular index in the parent table, at index i.
+		strideIndexes[i] = idx
+		i++
+		strideTables[i] = child
+		numBits -= 8
+		st = child
+	}
+	if st.delete(bs[i], numBits) == nil {
+		// Prefix didn't exist in the expected strideTable, refcount hasn't
+		// changed, no need to run through cleanup.
+		return
+	}
+
+	// st.delete reduced st's refcount by one, so we may be hanging onto a
+	// chain of redundant strideTables. Walk back up the path we recorded in
+	// the descent loop, deleting tables until we encounter one that still has
+	// other refs (or we hit the root strideTable, which is never deleted).
+	for i > 0 && strideTables[i].refs == 0 {
+		strideTables[i-1].deleteChild(strideIndexes[i-1])
+		i--
+	}
+}
+
+// debugSummary returns a string summarizing the tree of allocated strideTables
+// in t, with each strideTable's refcount.
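+//
+// Each child strideTable appears on its own line as "addr/len: N refs",
+// indented a further two spaces for every level of the tree.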
+func (t *Table[T]) debugSummary() string {
+	var ret bytes.Buffer
+	fmt.Fprintf(&ret, "v4: ")
+	strideSummary(&ret, &t.v4, 0)
+	fmt.Fprintf(&ret, "v6: ")
+	strideSummary(&ret, &t.v6, 0)
+	return ret.String()
+}
+
+func strideSummary[T any](w io.Writer, st *strideTable[T], indent int) {
+	fmt.Fprintf(w, "%d refs\n", st.refs)
+	indent += 2
+	for i := firstHostIndex; i <= lastHostIndex; i++ {
+		if child := st.entries[i].child; child != nil {
+			addr, len := inversePrefixIndex(i)
+			fmt.Fprintf(w, "%s%d/%d: ", strings.Repeat(" ", indent), addr, len)
+			strideSummary(w, child, indent)
+		}
+	}
+}
diff --git a/net/art/table_test.go b/net/art/table_test.go
new file mode 100644
index 000000000..fc4c8312c
--- /dev/null
+++ b/net/art/table_test.go
@@ -0,0 +1,542 @@
+// Copyright (c) Tailscale Inc & AUTHORS
+// SPDX-License-Identifier: BSD-3-Clause
+
+package art
+
+import (
+	crand "crypto/rand"
+	"fmt"
+	"math/rand"
+	"net/netip"
+	"runtime"
+	"strconv"
+	"testing"
+	"time"
+
+	"tailscale.com/types/ptr"
+)
+
+func TestInsert(t *testing.T) {
+	t.Parallel()
+	pfxs := randomPrefixes(10_000)
+
+	slow := slowPrefixTable[int]{pfxs}
+	fast := Table[int]{}
+
+	for _, pfx := range pfxs {
+		fast.Insert(pfx.pfx, pfx.val)
+	}
+
+	t.Log(fast.debugSummary())
+
+	seenVals4 := map[*int]bool{}
+	seenVals6 := map[*int]bool{}
+	for i := 0; i < 10_000; i++ {
+		a := randomAddr()
+		slowVal := slow.get(a)
+		fastVal := fast.Get(a)
+		if a.Is6() {
+			seenVals6[fastVal] = true
+		} else {
+			seenVals4[fastVal] = true
+		}
+		if slowVal != fastVal {
+			t.Errorf("get(%q) = %p, want %p", a, fastVal, slowVal)
+		}
+	}
+	// Empirically, 10k probes into 5k v4 prefixes and 5k v6 prefixes results
+	// in ~1k distinct v4 routes and ~300 distinct v6 routes. This sanity
+	// check that we didn't just return a single route for everything should
+	// be very generous indeed.
+	if cnt := len(seenVals4); cnt < 10 {
+		t.Fatalf("saw %d distinct v4 route results, statistically expected ~1000", cnt)
+	}
+	if cnt := len(seenVals6); cnt < 10 {
+		t.Fatalf("saw %d distinct v6 route results, statistically expected ~300", cnt)
+	}
+}
+
+func TestInsertShuffled(t *testing.T) {
+	t.Parallel()
+	pfxs := randomPrefixes(10_000)
+
+	rt := Table[int]{}
+	for _, pfx := range pfxs {
+		rt.Insert(pfx.pfx, pfx.val)
+	}
+
+	for i := 0; i < 10; i++ {
+		pfxs2 := append([]slowPrefixEntry[int](nil), pfxs...)
+		rand.Shuffle(len(pfxs2), func(i, j int) { pfxs2[i], pfxs2[j] = pfxs2[j], pfxs2[i] })
+		rt2 := Table[int]{}
+		for _, pfx := range pfxs2 {
+			rt2.Insert(pfx.pfx, pfx.val)
+		}
+
+		// Diffing a deep tree of tables gives cmp.Diff a nervous breakdown, so
+		// test for equivalence statistically with random probes instead.
+		for i := 0; i < 10_000; i++ {
+			a := randomAddr()
+			val1 := rt.Get(a)
+			val2 := rt2.Get(a)
+			if val1 == nil && val2 == nil {
+				continue
+			}
+			if (val1 == nil && val2 != nil) || (val1 != nil && val2 == nil) || (*val1 != *val2) {
+				t.Errorf("get(%q) = %s, want %s", a, printIntPtr(val2), printIntPtr(val1))
+			}
+		}
+	}
+}
+
+func TestDelete(t *testing.T) {
+	t.Parallel()
+
+	const (
+		numPrefixes  = 10_000 // total prefixes to insert (test deletes 50% of them)
+		numPerFamily = numPrefixes / 2
+		deleteCut    = numPerFamily / 2
+		numProbes    = 10_000 // random addr lookups to do
+	)
+
+	// We have to do this little dance instead of just using allPrefixes,
+	// because we want pfxs and toDelete to be non-overlapping sets.
+	all4, all6 := randomPrefixes4(numPerFamily), randomPrefixes6(numPerFamily)
+	pfxs := append([]slowPrefixEntry[int](nil), all4[:deleteCut]...)
+	pfxs = append(pfxs, all6[:deleteCut]...)
+	toDelete := append([]slowPrefixEntry[int](nil), all4[deleteCut:]...)
+	toDelete = append(toDelete, all6[deleteCut:]...)
+
+	slow := slowPrefixTable[int]{pfxs}
+	fast := Table[int]{}
+
+	for _, pfx := range pfxs {
+		fast.Insert(pfx.pfx, pfx.val)
+	}
+
+	for _, pfx := range toDelete {
+		fast.Insert(pfx.pfx, pfx.val)
+	}
+	for _, pfx := range toDelete {
+		fast.Delete(pfx.pfx)
+	}
+
+	seenVals4 := map[*int]bool{}
+	seenVals6 := map[*int]bool{}
+	for i := 0; i < numProbes; i++ {
+		a := randomAddr()
+		slowVal := slow.get(a)
+		fastVal := fast.Get(a)
+		if a.Is6() {
+			seenVals6[fastVal] = true
+		} else {
+			seenVals4[fastVal] = true
+		}
+		if slowVal != fastVal {
+			t.Fatalf("get(%q) = %p, want %p", a, fastVal, slowVal)
+		}
+	}
+	// Empirically, 10k probes into 5k v4 prefixes and 5k v6 prefixes results
+	// in ~1k distinct v4 routes and ~300 distinct v6 routes. This sanity
+	// check that we didn't just return a single route for everything should
+	// be very generous indeed.
+	if cnt := len(seenVals4); cnt < 10 {
+		t.Fatalf("saw %d distinct v4 route results, statistically expected ~1000", cnt)
+	}
+	if cnt := len(seenVals6); cnt < 10 {
+		t.Fatalf("saw %d distinct v6 route results, statistically expected ~300", cnt)
+	}
+}
+
+func TestDeleteShuffled(t *testing.T) {
+	t.Parallel()
+
+	const (
+		numPrefixes  = 10_000 // prefixes to insert (test deletes 50% of them)
+		numPerFamily = numPrefixes / 2
+		deleteCut    = numPerFamily / 2
+		numProbes    = 10_000 // random addr lookups to do
+	)
+
+	// We have to do this little dance instead of just using allPrefixes,
+	// because we want pfxs and toDelete to be non-overlapping sets.
+	all4, all6 := randomPrefixes4(numPerFamily), randomPrefixes6(numPerFamily)
+	pfxs := append([]slowPrefixEntry[int](nil), all4[:deleteCut]...)
+	pfxs = append(pfxs, all6[:deleteCut]...)
+	toDelete := append([]slowPrefixEntry[int](nil), all4[deleteCut:]...)
+	toDelete = append(toDelete, all6[deleteCut:]...)
+
+	rt := Table[int]{}
+	for _, pfx := range pfxs {
+		rt.Insert(pfx.pfx, pfx.val)
+	}
+	for _, pfx := range toDelete {
+		rt.Insert(pfx.pfx, pfx.val)
+	}
+	for _, pfx := range toDelete {
+		rt.Delete(pfx.pfx)
+	}
+
+	for i := 0; i < 10; i++ {
+		pfxs2 := append([]slowPrefixEntry[int](nil), pfxs...)
+		toDelete2 := append([]slowPrefixEntry[int](nil), toDelete...)
+		rand.Shuffle(len(toDelete2), func(i, j int) { toDelete2[i], toDelete2[j] = toDelete2[j], toDelete2[i] })
+		rt2 := Table[int]{}
+		for _, pfx := range pfxs2 {
+			rt2.Insert(pfx.pfx, pfx.val)
+		}
+		for _, pfx := range toDelete2 {
+			rt2.Insert(pfx.pfx, pfx.val)
+		}
+		for _, pfx := range toDelete2 {
+			rt2.Delete(pfx.pfx)
+		}
+
+		// Diffing a deep tree of tables gives cmp.Diff a nervous breakdown, so
+		// test for equivalence statistically with random probes instead.
+		for i := 0; i < numProbes; i++ {
+			a := randomAddr()
+			val1 := rt.Get(a)
+			val2 := rt2.Get(a)
+			if val1 == nil && val2 == nil {
+				continue
+			}
+			if (val1 == nil && val2 != nil) || (val1 != nil && val2 == nil) || (*val1 != *val2) {
+				t.Errorf("get(%q) = %s, want %s", a, printIntPtr(val2), printIntPtr(val1))
+			}
+		}
+	}
+}
+
+var benchRouteCount = []int{10, 100, 1000, 10_000, 100_000}
+
+// forFamilyAndCount runs the benchmark fn with different sets of routes.
+//
+// fn is called once for each combination of {addr_family, num_routes}, where
+// addr_family is ipv4 or ipv6 and num_routes is one of the values in
+// benchRouteCount.
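+//
+// The resulting sub-benchmarks are named fam/num_routes, so a run shows up in
+// the test output as, for example, BenchmarkTableInsertion/ipv4/1000.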
+func forFamilyAndCount(b *testing.B, fn func(b *testing.B, routes []slowPrefixEntry[int])) {
+	for _, fam := range []string{"ipv4", "ipv6"} {
+		rng := randomPrefixes4
+		if fam == "ipv6" {
+			rng = randomPrefixes6
+		}
+		b.Run(fam, func(b *testing.B) {
+			for _, nroutes := range benchRouteCount {
+				routes := rng(nroutes)
+				b.Run(fmt.Sprint(nroutes), func(b *testing.B) {
+					fn(b, routes)
+				})
+			}
+		})
+	}
+}
+
+func BenchmarkTableInsertion(b *testing.B) {
+	forFamilyAndCount(b, func(b *testing.B, routes []slowPrefixEntry[int]) {
+		b.StopTimer()
+		b.ResetTimer()
+		var startMem, endMem runtime.MemStats
+		runtime.ReadMemStats(&startMem)
+		b.StartTimer()
+		for i := 0; i < b.N; i++ {
+			var rt Table[int]
+			for _, route := range routes {
+				rt.Insert(route.pfx, route.val)
+			}
+		}
+		b.StopTimer()
+		runtime.ReadMemStats(&endMem)
+		inserts := float64(b.N) * float64(len(routes))
+		allocs := float64(endMem.Mallocs - startMem.Mallocs)
+		bytes := float64(endMem.TotalAlloc - startMem.TotalAlloc)
+		elapsed := float64(b.Elapsed().Nanoseconds())
+		elapsedSec := b.Elapsed().Seconds()
+		b.ReportMetric(elapsed/inserts, "ns/op")
+		b.ReportMetric(inserts/elapsedSec, "routes/s")
+		b.ReportMetric(roundFloat64(allocs/inserts), "avg-allocs/op")
+		b.ReportMetric(roundFloat64(bytes/inserts), "avg-B/op")
+	})
+}
+
+func BenchmarkTableDelete(b *testing.B) {
+	forFamilyAndCount(b, func(b *testing.B, routes []slowPrefixEntry[int]) {
+		// Collect memstats for one round of insertions, so we can remove it
+		// from the total at the end and get only the deletion alloc count.
+		insertAllocs, insertBytes := getMemCost(func() {
+			var rt Table[int]
+			for _, route := range routes {
+				rt.Insert(route.pfx, route.val)
+			}
+		})
+		insertAllocs *= float64(b.N)
+		insertBytes *= float64(b.N)
+
+		var t runningTimer
+		allocs, bytes := getMemCost(func() {
+			for i := 0; i < b.N; i++ {
+				var rt Table[int]
+				for _, route := range routes {
+					rt.Insert(route.pfx, route.val)
+				}
+				t.Start()
+				for _, route := range routes {
+					rt.Delete(route.pfx)
+				}
+				t.Stop()
+			}
+		})
+		inserts := float64(b.N) * float64(len(routes))
+		allocs -= insertAllocs
+		bytes -= insertBytes
+		elapsed := float64(t.Elapsed().Nanoseconds())
+		elapsedSec := t.Elapsed().Seconds()
+		b.ReportMetric(elapsed/inserts, "ns/op")
+		b.ReportMetric(inserts/elapsedSec, "routes/s")
+		b.ReportMetric(roundFloat64(allocs/inserts), "avg-allocs/op")
+		b.ReportMetric(roundFloat64(bytes/inserts), "avg-B/op")
+	})
+}
+
+var addrSink netip.Addr
+
+func BenchmarkTableGet(b *testing.B) {
+	forFamilyAndCount(b, func(b *testing.B, routes []slowPrefixEntry[int]) {
+		genAddr := randomAddr4
+		if routes[0].pfx.Addr().Is6() {
+			genAddr = randomAddr6
+		}
+		var rt Table[int]
+		for _, route := range routes {
+			rt.Insert(route.pfx, route.val)
+		}
+		addrAllocs, addrBytes := getMemCost(func() {
+			// Have to run genAddr more than once, otherwise the reported
+			// cost is 16 bytes, presumably due to some amortized cost in
+			// the memory allocator. Either way, empirically 100 iterations
+			// reliably report the correct cost.
+			for i := 0; i < 100; i++ {
+				_ = genAddr()
+			}
+		})
+		addrAllocs /= 100
+		addrBytes /= 100
+		var t runningTimer
+		allocs, bytes := getMemCost(func() {
+			for i := 0; i < b.N; i++ {
+				addr := genAddr()
+				t.Start()
+				writeSink = rt.Get(addr)
+				t.Stop()
+			}
+		})
+		b.ReportAllocs() // Enables the output, but we report manually below
+		allocs -= (addrAllocs * float64(b.N))
+		bytes -= (addrBytes * float64(b.N))
+		lookups := float64(b.N)
+		elapsed := float64(t.Elapsed().Nanoseconds())
+		elapsedSec := t.Elapsed().Seconds()
+		b.ReportMetric(elapsed/lookups, "ns/op")
+		b.ReportMetric(lookups/elapsedSec, "addrs/s")
+		b.ReportMetric(allocs/lookups, "allocs/op")
+		b.ReportMetric(bytes/lookups, "B/op")
+	})
+}
+
+// getMemCost runs fn once and returns the number of allocations and bytes
+// allocated during that call.
+//
+// Note that if your fn allocates very little memory (less than ~16 bytes), you
+// should make fn run its workload ~100 times and divide the results of
+// getMemCost yourself. Otherwise, the byte count you get will be rounded up due
+// to the memory allocator's bucketing granularity.
+func getMemCost(fn func()) (allocs, bytes float64) {
+	var start, end runtime.MemStats
+	runtime.ReadMemStats(&start)
+	fn()
+	runtime.ReadMemStats(&end)
+	return float64(end.Mallocs - start.Mallocs), float64(end.TotalAlloc - start.TotalAlloc)
+}
+
+// runningTimer is a timer that keeps track of the cumulative time it's spent
+// running since creation. A newly created runningTimer is stopped.
+//
+// This timer exists because some of our benchmarks have to interleave costly
+// ancillary logic in each benchmark iteration, rather than being able to
+// front-load all the work before a single b.ResetTimer().
+//
+// As it turns out, b.StartTimer() and b.StopTimer() are expensive function
+// calls, because they do costly memory allocation accounting on every call.
+// Starting and stopping the benchmark timer in every b.N loop iteration slows
+// the benchmarks down by orders of magnitude.
+//
+// So, rather than rely on testing.B's timing facility, we use this very
+// lightweight timer combined with getMemCost to do our own accounting more
+// efficiently.
type runningTimer struct {
	cumulative time.Duration
	start      time.Time
}

func (t *runningTimer) Start() {
	t.Stop()
	t.start = time.Now()
}

func (t *runningTimer) Stop() {
	if t.start.IsZero() {
		return
	}
	t.cumulative += time.Since(t.start)
	t.start = time.Time{}
}

func (t *runningTimer) Elapsed() time.Duration {
	return t.cumulative
}

// slowPrefixTable is a routing table implemented as a set of prefixes that are
// explicitly scanned in full for every route lookup. It is very slow, but also
// reasonably easy to verify by inspection, and so a good correctness reference
// for Table.
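//
// Lookups do longest-prefix matching by a linear scan: for example, with both
// 192.168.0.0/16 and 192.168.1.0/24 present, get(192.168.1.5) returns the
// /24's value, since that is the containing prefix with the most bits.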
type slowPrefixTable[T any] struct {
	prefixes []slowPrefixEntry[T]
}

type slowPrefixEntry[T any] struct {
	pfx netip.Prefix
	val *T
}

func (t *slowPrefixTable[T]) delete(pfx netip.Prefix) {
	ret := make([]slowPrefixEntry[T], 0, len(t.prefixes))
	for _, ent := range t.prefixes {
		if ent.pfx == pfx {
			continue
		}
		ret = append(ret, ent)
	}
	t.prefixes = ret
}

func (t *slowPrefixTable[T]) insert(pfx netip.Prefix, val *T) {
	for i, ent := range t.prefixes {
		if ent.pfx == pfx {
			t.prefixes[i].val = val
			return
		}
	}
	t.prefixes = append(t.prefixes, slowPrefixEntry[T]{pfx, val})
}

func (t *slowPrefixTable[T]) get(addr netip.Addr) *T {
	var (
		ret     *T
		bestLen = -1
	)

	for _, pfx := range t.prefixes {
		if pfx.pfx.Contains(addr) && pfx.pfx.Bits() > bestLen {
			ret = pfx.val
			bestLen = pfx.pfx.Bits()
		}
	}
	return ret
}

// randomPrefixes returns n randomly generated prefixes and associated values,
// distributed equally between IPv4 and IPv6.
func randomPrefixes(n int) []slowPrefixEntry[int] {
	pfxs := randomPrefixes4(n / 2)
	pfxs = append(pfxs, randomPrefixes6(n-len(pfxs))...)
	return pfxs
}

// randomPrefixes4 returns n randomly generated IPv4 prefixes and associated values.
func randomPrefixes4(n int) []slowPrefixEntry[int] {
	pfxs := map[netip.Prefix]bool{}

	for len(pfxs) < n {
		len := rand.Intn(33)
		pfx, err := randomAddr4().Prefix(len)
		if err != nil {
			panic(err)
		}
		pfxs[pfx] = true
	}

	ret := make([]slowPrefixEntry[int], 0, len(pfxs))
	for pfx := range pfxs {
		ret = append(ret, slowPrefixEntry[int]{pfx, ptr.To(rand.Int())})
	}

	return ret
}

// randomPrefixes6 returns n randomly generated IPv6 prefixes and associated values.
func randomPrefixes6(n int) []slowPrefixEntry[int] {
	pfxs := map[netip.Prefix]bool{}

	for len(pfxs) < n {
		len := rand.Intn(129)
		pfx, err := randomAddr6().Prefix(len)
		if err != nil {
			panic(err)
		}
		pfxs[pfx] = true
	}

	ret := make([]slowPrefixEntry[int], 0, len(pfxs))
	for pfx := range pfxs {
		ret = append(ret, slowPrefixEntry[int]{pfx, ptr.To(rand.Int())})
	}

	return ret
}

// randomAddr returns a randomly generated IP address.
func randomAddr() netip.Addr {
	if rand.Intn(2) == 1 {
		return randomAddr6()
	} else {
		return randomAddr4()
	}
}

// randomAddr4 returns a randomly generated IPv4 address.
func randomAddr4() netip.Addr {
	var b [4]byte
	if _, err := crand.Read(b[:]); err != nil {
		panic(err)
	}
	return netip.AddrFrom4(b)
}

// randomAddr6 returns a randomly generated IPv6 address.
func randomAddr6() netip.Addr {
	var b [16]byte
	if _, err := crand.Read(b[:]); err != nil {
		panic(err)
	}
	return netip.AddrFrom16(b)
}

// printIntPtr returns *v as a string, or the literal "<nil>" if v is nil.
func printIntPtr(v *int) string {
	if v == nil {
		return "<nil>"
	}
	return fmt.Sprint(*v)
}

// roundFloat64 rounds f to 2 decimal places, for display.
//
// It round-trips through a float->string->float conversion, so should not be
// used in a performance critical setting.
func roundFloat64(f float64) float64 {
	s := fmt.Sprintf("%.2f", f)
	ret, err := strconv.ParseFloat(s, 64)
	if err != nil {
		panic(err)
	}
	return ret
}
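
A minimal usage sketch of the API added above, for anyone trying out the new
type. It assumes the package is importable as tailscale.com/net/art; the
prefixes and values are illustrative only, not part of the patch.

package main

import (
	"fmt"
	"net/netip"

	"tailscale.com/net/art"
)

func main() {
	// The zero value of Table is ready to use.
	var rt art.Table[string]

	// Values are stored by pointer; Insert panics on a nil value.
	defaultRoute := "default"
	lan := "lan"
	rt.Insert(netip.MustParsePrefix("0.0.0.0/0"), &defaultRoute)
	rt.Insert(netip.MustParsePrefix("192.168.0.0/16"), &lan)

	// Get returns the value of the longest matching prefix, or nil if
	// no route matches.
	fmt.Println(*rt.Get(netip.MustParseAddr("192.168.1.1"))) // lan
	fmt.Println(*rt.Get(netip.MustParseAddr("8.8.8.8")))     // default

	// After deleting the /16, its addresses fall back to the default route.
	rt.Delete(netip.MustParsePrefix("192.168.0.0/16"))
	fmt.Println(*rt.Get(netip.MustParseAddr("192.168.1.1"))) // default
}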