tailscale/wgengine/bench/trafficgen.go

wgengine/bench: speed test for channels, sockets, and wireguard-go.

This tries to generate traffic at a rate that will saturate the receiver,
without overdoing it, even in the event of packet loss. It's unrealistically
more aggressive than TCP (which will back off quickly in case of packet loss)
but less silly than a blind test that just generates packets as fast as it can
(which can cause all the CPU to be absorbed by the transmitter, giving an
incorrect impression of how much capacity the total system has).

Initial indications are that a syscall about every 10 packets (TCP bulk
delivery) is roughly the same speed as sending every packet through a channel.
A syscall per packet is about 5x-10x slower than that. The whole tailscale
wireguard-go + magicsock + packet filter combination is about 4x slower again,
which is better than I thought we'd do, but probably has room for improvement.

Note that in "full" tailscale, there is also a tundev read/write for every
packet, effectively doubling the syscall overhead per packet.

Given these numbers, it seems like read/write syscalls are only 25-40% of the
total CPU time used in tailscale proper, so we do have significant non-syscall
optimization work to do too.

Sample output:

    $ GOMAXPROCS=2 go test -bench . -benchtime 5s ./cmd/tailbench
    goos: linux
    goarch: amd64
    pkg: tailscale.com/cmd/tailbench
    cpu: Intel(R) Core(TM) i7-4785T CPU @ 2.20GHz
    BenchmarkTrivialNoAlloc/32-2        56340248     93.85 ns/op    340.98 MB/s        0 %lost      0 B/op   0 allocs/op
    BenchmarkTrivialNoAlloc/124-2       57527490     99.27 ns/op   1249.10 MB/s        0 %lost      0 B/op   0 allocs/op
    BenchmarkTrivialNoAlloc/1024-2      52537773    111.3  ns/op   9200.39 MB/s        0 %lost      0 B/op   0 allocs/op
    BenchmarkTrivial/32-2               41878063    135.6  ns/op    236.04 MB/s        0 %lost      0 B/op   0 allocs/op
    BenchmarkTrivial/124-2              41270439    138.4  ns/op    896.02 MB/s        0 %lost      0 B/op   0 allocs/op
    BenchmarkTrivial/1024-2             36337252    154.3  ns/op   6635.30 MB/s        0 %lost      0 B/op   0 allocs/op
    BenchmarkBlockingChannel/32-2       12171654    494.3  ns/op     64.74 MB/s        0 %lost   1791 B/op   0 allocs/op
    BenchmarkBlockingChannel/124-2      12149956    507.8  ns/op    244.17 MB/s        0 %lost   1792 B/op   1 allocs/op
    BenchmarkBlockingChannel/1024-2     11034754    528.8  ns/op   1936.42 MB/s        0 %lost   1792 B/op   1 allocs/op
    BenchmarkNonlockingChannel/32-2      8960622   2195    ns/op     14.58 MB/s    8.825 %lost   1792 B/op   1 allocs/op
    BenchmarkNonlockingChannel/124-2     3014614   2224    ns/op     55.75 MB/s    11.18 %lost   1792 B/op   1 allocs/op
    BenchmarkNonlockingChannel/1024-2    3234915   1688    ns/op    606.53 MB/s    3.765 %lost   1792 B/op   1 allocs/op
    BenchmarkDoubleChannel/32-2          8457559    764.1  ns/op     41.88 MB/s    5.945 %lost   1792 B/op   1 allocs/op
    BenchmarkDoubleChannel/124-2         5497726   1030    ns/op    120.38 MB/s    12.14 %lost   1792 B/op   1 allocs/op
    BenchmarkDoubleChannel/1024-2        7985656   1360    ns/op    752.86 MB/s    13.57 %lost   1792 B/op   1 allocs/op
    BenchmarkUDP/32-2                    1652134   3695    ns/op      8.66 MB/s        0 %lost    176 B/op   3 allocs/op
    BenchmarkUDP/124-2                   1621024   3765    ns/op     32.94 MB/s        0 %lost    176 B/op   3 allocs/op
    BenchmarkUDP/1024-2                  1553750   3825    ns/op    267.72 MB/s        0 %lost    176 B/op   3 allocs/op
    BenchmarkTCP/32-2                   11056336    503.2  ns/op     63.60 MB/s        0 %lost      0 B/op   0 allocs/op
    BenchmarkTCP/124-2                  11074869    533.7  ns/op    232.32 MB/s        0 %lost      0 B/op   0 allocs/op
    BenchmarkTCP/1024-2                  8934968    671.4  ns/op   1525.20 MB/s        0 %lost      0 B/op   0 allocs/op
    BenchmarkWireGuardTest/32-2          1403702   4547    ns/op      7.04 MB/s    14.37 %lost    467 B/op   3 allocs/op
    BenchmarkWireGuardTest/124-2          780645   7927    ns/op     15.64 MB/s    1.537 %lost    420 B/op   3 allocs/op
    BenchmarkWireGuardTest/1024-2         512671  11791    ns/op     86.85 MB/s   0.5206 %lost    411 B/op   3 allocs/op
    PASS
    ok      tailscale.com/wgengine/bench    195.724s

Updates #414.

Signed-off-by: Avery Pennarun <apenwarr@tailscale.com>
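The TrafficGen type below is transport-agnostic: each benchmark variant pulls
outgoing packets from Generate, feeds whatever arrives back into GotPacket,
and calls Adjust a few times per second to retune the transmit rate. A minimal
sketch of such a driver loop, using only the API in this file (the addresses,
packet size, and sendPacket are hypothetical stand-ins for a real benchmark
transport):

    tg := NewTrafficGen(func() { log.Printf("first packet received") })
    tg.Start(netaddr.MustParseIP("10.0.0.1"), netaddr.MustParseIP("10.0.0.2"), 1024, 1000000)

    // Retune the target rate a few times per second, as Adjust's doc
    // comment asks.
    go func() {
        for tg.Running() {
            tg.Adjust()
            time.Sleep(100 * time.Millisecond)
        }
    }()

    buf := make([]byte, 2048)
    for {
        n := tg.Generate(buf, 0) // writes the next packet into buf[0:n]
        if n == 0 {
            break // test done: maxPackets received
        }
        sendPacket(buf[:n]) // hypothetical: hand off to the transport under test
    }

    // Receive side, for each arriving packet rb:
    //     tg.GotPacket(rb, 0)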
// Copyright (c) 2021 Tailscale Inc & AUTHORS All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package main

import (
	"encoding/binary"
	"fmt"
	"log"
	"sync"
	"time"

	"inet.af/netaddr"
	"tailscale.com/net/packet"
	"tailscale.com/types/ipproto"
)
// Snapshot is a point-in-time copy of the TrafficGen counters.
type Snapshot struct {
	WhenNsec int64 // current time
	timeAcc  int64 // accumulated time (+nsPerPacket per transmit)

	LastSeqTx    int64 // last sequence number sent
	LastSeqRx    int64 // last sequence number received
	TotalLost    int64 // packets presumed lost so far (including any that later arrive out of order)
	TotalOOO     int64 // packets out-of-order so far
	TotalBytesRx int64 // total bytes received so far
}
// Delta is the difference between two Snapshots.
type Delta struct {
	DurationNsec int64
	TxPackets    int64
	RxPackets    int64
	LostPackets  int64
	OOOPackets   int64
	Bytes        int64
}
// Sub returns the Delta between an earlier Snapshot a and a later
// Snapshot b. Out-of-order arrivals are added back into RxPackets:
// GotPacket counts them in TotalLost when the sequence gap first
// appears, but they were in fact received.
func (b Snapshot) Sub(a Snapshot) Delta {
	return Delta{
		DurationNsec: b.WhenNsec - a.WhenNsec,
		TxPackets:    b.LastSeqTx - a.LastSeqTx,
		RxPackets: (b.LastSeqRx - a.LastSeqRx) -
			(b.TotalLost - a.TotalLost) +
			(b.TotalOOO - a.TotalOOO),
		LostPackets: b.TotalLost - a.TotalLost,
		OOOPackets:  b.TotalOOO - a.TotalOOO,
		Bytes:       b.TotalBytesRx - a.TotalBytesRx,
	}
}
// String formats a Delta as a one-line human-readable summary.
func (d Delta) String() string {
	return fmt.Sprintf("tx=%-6d rx=%-4d (%6d = %.1f%% loss) (%d OOO) (%4.1f Mbit/s)",
		d.TxPackets, d.RxPackets, d.LostPackets,
		float64(d.LostPackets)*100/float64(d.TxPackets),
		d.OOOPackets,
		float64(d.Bytes)*8*1e9/float64(d.DurationNsec)/1e6)
}
// TrafficGen is a rate-controlled generator and checker of test
// packets.
type TrafficGen struct {
	mu        sync.Mutex
	cur, prev Snapshot // snapshots used for rate control
	buf       []byte   // pre-generated packet buffer
	done      bool     // true if the test has completed

	onFirstPacket func() // function to call on first received packet

	// maxPackets is the max packets to receive (not send) before
	// ending the test. If it's zero, the test runs forever.
	maxPackets int64

	// nsPerPacket is the target average nanoseconds between packets.
	// It's initially zero, which means transmit as fast as the
	// caller wants to go.
	nsPerPacket int64

	// bestPPS is the "best observed packets-per-second" in recent
	// memory.
	bestPPS float64
}
// NewTrafficGen creates a new, initially locked, TrafficGen.
// Until Start() is called, Generate() will block forever.
func NewTrafficGen(onFirstPacket func()) *TrafficGen {
	t := TrafficGen{
		onFirstPacket: onFirstPacket,
	}

	// initially locked, until first Start()
	t.mu.Lock()

	return &t
}
// Start starts the traffic generator. It assumes mu is already locked,
// and unlocks it.
func (t *TrafficGen) Start(src, dst netaddr.IP, bytesPerPacket int, maxPackets int64) {
	h12 := packet.ICMP4Header{
		IP4Header: packet.IP4Header{
			IPProto: ipproto.ICMPv4,
			IPID:    0,
			Src:     src,
			Dst:     dst,
		},
		Type: packet.ICMP4EchoRequest,
		Code: packet.ICMP4NoCode,
	}

	// ensure there's room for ICMP header plus sequence number
	if bytesPerPacket < ICMPMinSize+8 {
		log.Fatalf("bytesPerPacket must be at least 24+8")
	}

	t.maxPackets = maxPackets
	payload := make([]byte, bytesPerPacket-ICMPMinSize)
	t.buf = packet.Generate(h12, payload)

	t.mu.Unlock()
}
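// Packet layout note: t.buf holds an ICMPv4 echo request whose payload
// begins at offset ICMPMinSize (24 bytes, per the "24+8" bounds check
// above). Generate later stamps an 8-byte big-endian sequence number
// into the first 8 payload bytes; the rest of the payload is zero
// padding out to bytesPerPacket.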
// Snap returns a copy of the current Snapshot, with WhenNsec set to
// the current time.
func (t *TrafficGen) Snap() Snapshot {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.cur.WhenNsec = time.Now().UnixNano()
	return t.cur
}
// Running reports whether the test is still running.
func (t *TrafficGen) Running() bool {
	t.mu.Lock()
	defer t.mu.Unlock()
	return !t.done
}
// Generate produces the next packet in the sequence. It sleeps if
// it's too soon for the next packet to be sent.
//
// The generated packet is placed into buf at offset ofs, for compatibility
// with the wireguard-go conventions.
//
// The return value is the number of bytes generated in the packet, or 0
// if the test has finished running.
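//
// Rate control works through t.cur.timeAcc, a virtual transmit
// deadline: each packet advances it by nsPerPacket, and when it runs
// ahead of the wall clock, Generate sleeps off the difference before
// producing the next packet.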
func (t *TrafficGen) Generate(b []byte, ofs int) int {
	t.mu.Lock()
	now := time.Now().UnixNano()
	if t.nsPerPacket == 0 || t.cur.timeAcc == 0 {
		t.cur.timeAcc = now - 1
	}
	if t.cur.timeAcc >= now {
		// too soon
		t.mu.Unlock()
		time.Sleep(time.Duration(t.cur.timeAcc-now) * time.Nanosecond)
		t.mu.Lock()
		now = t.cur.timeAcc
	}
	if t.done {
		t.mu.Unlock()
		return 0
	}

	t.cur.timeAcc += t.nsPerPacket
	t.cur.LastSeqTx++
	t.cur.WhenNsec = now
	seq := t.cur.LastSeqTx
	t.mu.Unlock()

	copy(b[ofs:], t.buf)
	binary.BigEndian.PutUint64(
		b[ofs+ICMPMinSize:ofs+ICMPMinSize+8],
		uint64(seq))

	return len(t.buf)
}
// GotPacket processes a packet that came back on the receive side.
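//
// Loss is counted by sequence-number gap. For example, if packets
// arrive in the order 1, 2, 5, 4: seeing 5 after 2 adds 2 to TotalLost
// (3 and 4 presumed lost), and the late arrival of 4 then adds 1 to
// TotalOOO. TotalLost is never decremented; instead, Snapshot.Sub adds
// the out-of-order count back into RxPackets.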
func (t *TrafficGen) GotPacket(b []byte, ofs int) {
	t.mu.Lock()

	s := &t.cur
	seq := int64(binary.BigEndian.Uint64(
		b[ofs+ICMPMinSize : ofs+ICMPMinSize+8]))
	if seq > s.LastSeqRx {
		if s.LastSeqRx > 0 {
			// only count lost packets after the very first
			// successful one.
			s.TotalLost += seq - s.LastSeqRx - 1
		}
		s.LastSeqRx = seq
	} else {
		s.TotalOOO++
	}

	// +1 packet since we only start counting after the first one
	if t.maxPackets > 0 && s.LastSeqRx >= t.maxPackets+1 {
		t.done = true
	}

	s.TotalBytesRx += int64(len(b) - ofs)

	f := t.onFirstPacket
	t.onFirstPacket = nil
	t.mu.Unlock()

	if f != nil {
		f()
	}
}
// Adjust tunes the transmit rate based on the received packets.
// The goal is to converge on the fastest transmit rate that still has
// minimal packet loss. Returns the new target rate in packets/sec.
//
// We need to play this guessing game in order to balance out tx and rx
// rates when there's a lossy network between them. Otherwise we can end
// up using 99% of the CPU to blast out transmitted packets and leaving only
// 1% to receive them, leading to a misleading throughput calculation.
//
// Call this function multiple times per second.
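//
// The rate estimate is a decaying maximum: bestPPS shrinks by 3% on
// each call, and any observed receive rate above it becomes the new
// best, so the target can drift back up after a transient stall. For
// example, with bestPPS converged at 250,000 packets/sec, Adjust sets
// nsPerPacket to 1e9/250000 = 4000ns between transmits.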
func (t *TrafficGen) Adjust() (pps float64) {
	t.mu.Lock()
	defer t.mu.Unlock()

	// don't adjust rate until the first full period *after* receiving
	// the first packet. This skips any handshake time in the underlying
	// transport.
	if t.prev.LastSeqRx == 0 {
		t.prev = t.cur
		return 0 // no estimate yet, continue at max speed
	}

	d := t.cur.Sub(t.prev)

	t.bestPPS *= 0.97
	pps = float64(d.RxPackets) * 1e9 / float64(d.DurationNsec)
	if pps > 0 && t.prev.WhenNsec > 0 {
		if pps > t.bestPPS {
			t.bestPPS = pps
		}
		t.nsPerPacket = int64(1e9 / t.bestPPS)
	}
	t.prev = t.cur

	return t.bestPPS
}