You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
tailscale/wgengine/bench/bench.go

399 lines
8.7 KiB
Go

// Copyright (c) 2021 Tailscale Inc & AUTHORS All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Create two wgengine instances and pass data through them, measuring
// throughput, latency, and packet loss.
package main
import (
"bufio"
"io"
"log"
"net"
"net/http"
"net/http/pprof"
"net/netip"
"os"
"strconv"
"time"
"tailscale.com/types/logger"
)
const PayloadSize = 1000
const ICMPMinSize = 24
var Addr1 = netip.MustParsePrefix("100.64.1.1/32")
var Addr2 = netip.MustParsePrefix("100.64.1.2/32")
func main() {
var logf logger.Logf = log.Printf
log.SetFlags(0)
debugMux := newDebugMux()
go runDebugServer(debugMux, "0.0.0.0:8999")
mode, err := strconv.Atoi(os.Args[1])
if err != nil {
log.Fatalf("%q: %v", os.Args[1], err)
}
traf := NewTrafficGen(nil)
// Sample test results below are using GOMAXPROCS=2 (for some
// tests, including wireguard-go, higher GOMAXPROCS goes slower)
// on apenwarr's old Linux box:
// Intel(R) Core(TM) i7-4785T CPU @ 2.20GHz
// My 2019 Mac Mini is about 20% faster on most tests.
switch mode {
// tx=8786325 rx=8786326 (0 = 0.00% loss) (70768.7 Mbits/sec)
case 1:
setupTrivialNoAllocTest(logf, traf)
// tx=6476293 rx=6476293 (0 = 0.00% loss) (52249.7 Mbits/sec)
case 2:
setupTrivialTest(logf, traf)
// tx=1957974 rx=1958379 (0 = 0.00% loss) (15939.8 Mbits/sec)
case 11:
setupBlockingChannelTest(logf, traf)
// tx=728621 rx=701825 (26620 = 3.65% loss) (5525.2 Mbits/sec)
// (much faster on macOS??)
case 12:
setupNonblockingChannelTest(logf, traf)
// tx=1024260 rx=941098 (83334 = 8.14% loss) (7516.6 Mbits/sec)
// (much faster on macOS??)
case 13:
setupDoubleChannelTest(logf, traf)
// tx=265468 rx=263189 (2279 = 0.86% loss) (2162.0 Mbits/sec)
case 21:
setupUDPTest(logf, traf)
// tx=1493580 rx=1493580 (0 = 0.00% loss) (12210.4 Mbits/sec)
case 31:
setupBatchTCPTest(logf, traf)
// tx=134236 rx=133166 (1070 = 0.80% loss) (1088.9 Mbits/sec)
case 101:
setupWGTest(nil, logf, traf, Addr1, Addr2)
default:
log.Fatalf("provide a valid test number (0..n)")
}
logf("initialized ok.")
traf.Start(Addr1.Addr(), Addr2.Addr(), PayloadSize+ICMPMinSize, 0)
var cur, prev Snapshot
var pps int64
i := 0
for {
i += 1
time.Sleep(10 * time.Millisecond)
if (i % 100) == 0 {
prev = cur
cur = traf.Snap()
d := cur.Sub(prev)
if prev.WhenNsec == 0 {
logf("tx=%-6d rx=%-6d", d.TxPackets, d.RxPackets)
} else {
logf("%v @%7d pkt/s", d, pps)
}
}
pps = traf.Adjust()
}
}
func newDebugMux() *http.ServeMux {
mux := http.NewServeMux()
mux.HandleFunc("/debug/pprof/", pprof.Index)
mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline)
mux.HandleFunc("/debug/pprof/profile", pprof.Profile)
mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
mux.HandleFunc("/debug/pprof/trace", pprof.Trace)
return mux
}
func runDebugServer(mux *http.ServeMux, addr string) {
srv := &http.Server{
Addr: addr,
Handler: mux,
}
if err := srv.ListenAndServe(); err != nil {
log.Fatal(err)
}
}
// The absolute minimal test of the traffic generator: have it fill
// a packet buffer, then absorb it again. Zero packet loss.
func setupTrivialNoAllocTest(logf logger.Logf, traf *TrafficGen) {
go func() {
b := make([]byte, 1600)
for {
n := traf.Generate(b, 16)
if n == 0 {
break
}
traf.GotPacket(b[0:n+16], 16)
}
}()
}
// Almost the same, but this time allocate a fresh buffer each time
// through the loop. Still zero packet loss. Runs about 2/3 as fast for me.
func setupTrivialTest(logf logger.Logf, traf *TrafficGen) {
go func() {
for {
b := make([]byte, 1600)
n := traf.Generate(b, 16)
if n == 0 {
break
}
traf.GotPacket(b[0:n+16], 16)
}
}()
}
// Pass packets through a blocking channel between sender and receiver.
// Still zero packet loss since the sender stops when the channel is full.
// Max speed depends on channel length (I'm not sure why).
func setupBlockingChannelTest(logf logger.Logf, traf *TrafficGen) {
ch := make(chan []byte, 1000)
go func() {
// transmitter
for {
b := make([]byte, 1600)
n := traf.Generate(b, 16)
if n == 0 {
close(ch)
break
}
ch <- b[0 : n+16]
}
}()
go func() {
// receiver
for b := range ch {
traf.GotPacket(b, 16)
}
}()
}
// Same as setupBlockingChannelTest, but now we drop packets whenever the
// channel is full. Max speed is about the same as the above test, but
// now with nonzero packet loss.
func setupNonblockingChannelTest(logf logger.Logf, traf *TrafficGen) {
ch := make(chan []byte, 1000)
go func() {
// transmitter
for {
b := make([]byte, 1600)
n := traf.Generate(b, 16)
if n == 0 {
close(ch)
break
}
select {
case ch <- b[0 : n+16]:
default:
}
}
}()
go func() {
// receiver
for b := range ch {
traf.GotPacket(b, 16)
}
}()
}
// Same as above, but at an intermediate blocking channel and goroutine
// to make things a little more like wireguard-go. Roughly 20% slower than
// the single-channel version.
func setupDoubleChannelTest(logf logger.Logf, traf *TrafficGen) {
ch := make(chan []byte, 1000)
ch2 := make(chan []byte, 1000)
go func() {
// transmitter
for {
b := make([]byte, 1600)
n := traf.Generate(b, 16)
if n == 0 {
close(ch)
break
}
select {
case ch <- b[0 : n+16]:
default:
}
}
}()
go func() {
// intermediary
for b := range ch {
ch2 <- b
}
close(ch2)
}()
go func() {
// receiver
for b := range ch2 {
traf.GotPacket(b, 16)
}
}()
}
// Instead of a channel, pass packets through a UDP socket.
func setupUDPTest(logf logger.Logf, traf *TrafficGen) {
la, err := net.ResolveUDPAddr("udp", ":0")
if err != nil {
log.Fatalf("resolve: %v", err)
}
s1, err := net.ListenUDP("udp", la)
if err != nil {
log.Fatalf("listen1: %v", err)
}
s2, err := net.ListenUDP("udp", la)
if err != nil {
log.Fatalf("listen2: %v", err)
}
a2 := s2.LocalAddr()
// On macOS (but not Linux), you can't transmit to 0.0.0.0:port,
// which is what returns from .LocalAddr() above. We have to
// force it to localhost instead.
a2.(*net.UDPAddr).IP = net.ParseIP("127.0.0.1")
s1.SetWriteBuffer(1024 * 1024)
s2.SetReadBuffer(1024 * 1024)
go func() {
// transmitter
b := make([]byte, 1600)
for {
n := traf.Generate(b, 16)
if n == 0 {
break
}
s1.WriteTo(b[16:n+16], a2)
}
}()
go func() {
// receiver
b := make([]byte, 1600)
for traf.Running() {
// Use ReadFrom instead of Read, to be more like
// how wireguard-go does it, even though we're not
// going to actually look at the address.
n, _, err := s2.ReadFrom(b)
if err != nil {
log.Fatalf("s2.Read: %v", err)
}
traf.GotPacket(b[:n], 0)
}
}()
}
// Instead of a channel, pass packets through a TCP socket.
// TCP is a single stream, so we can amortize one syscall across
// multiple packets. 10x amortization seems to make it go ~10x faster,
// as expected, getting us close to the speed of the channel tests above.
// There's also zero packet loss.
func setupBatchTCPTest(logf logger.Logf, traf *TrafficGen) {
sl, err := net.Listen("tcp", ":0")
if err != nil {
log.Fatalf("listen: %v", err)
}
s1, err := net.Dial("tcp", sl.Addr().String())
if err != nil {
log.Fatalf("dial: %v", err)
}
s2, err := sl.Accept()
if err != nil {
log.Fatalf("accept: %v", err)
}
s1.(*net.TCPConn).SetWriteBuffer(1024 * 1024)
s2.(*net.TCPConn).SetReadBuffer(1024 * 1024)
ch := make(chan int)
go func() {
// transmitter
bs1 := bufio.NewWriterSize(s1, 1024*1024)
b := make([]byte, 1600)
i := 0
for {
i += 1
n := traf.Generate(b, 16)
if n == 0 {
break
}
if i == 1 {
ch <- n
}
bs1.Write(b[16 : n+16])
// TODO: this is a pretty half-baked batching
// function, which we'd never want to employ in
// a real-life program.
//
// In real life, we'd probably want to flush
// immediately when there are no more packets to
// generate, and queue up only if we fall behind.
//
// In our case however, we just want to see the
// technical benefits of batching 10 syscalls
// into 1, so a fixed ratio makes more sense.
if (i % 10) == 0 {
bs1.Flush()
}
}
}()
go func() {
// receiver
bs2 := bufio.NewReaderSize(s2, 1024*1024)
// Find out the packet size (we happen to know they're
// all the same size)
packetSize := <-ch
b := make([]byte, packetSize)
for traf.Running() {
// TODO: can't use ReadFrom() here, which is
// unfair compared to UDP. (ReadFrom for UDP
// apparently allocates memory per packet, which
// this test does not.)
n, err := io.ReadFull(bs2, b)
if err != nil {
log.Fatalf("s2.Read: %v", err)
}
traf.GotPacket(b[:n], 0)
}
}()
}