You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
tailscale/wgengine/bench/bench_test.go

109 lines
2.1 KiB
Go

wgengine/bench: speed test for channels, sockets, and wireguard-go. This tries to generate traffic at a rate that will saturate the receiver, without overdoing it, even in the event of packet loss. It's unrealistically more aggressive than TCP (which will back off quickly in case of packet loss) but less silly than a blind test that just generates packets as fast as it can (which can cause all the CPU to be absorbed by the transmitter, giving an incorrect impression of how much capacity the total system has). Initial indications are that a syscall about every 10 packets (TCP bulk delivery) is roughly the same speed as sending every packet through a channel. A syscall per packet is about 5x-10x slower than that. The whole tailscale wireguard-go + magicsock + packet filter combination is about 4x slower again, which is better than I thought we'd do, but probably has room for improvement. Note that in "full" tailscale, there is also a tundev read/write for every packet, effectively doubling the syscall overhead per packet. Given these numbers, it seems like read/write syscalls are only 25-40% of the total CPU time used in tailscale proper, so we do have significant non-syscall optimization work to do too. Sample output: $ GOMAXPROCS=2 go test -bench . -benchtime 5s ./cmd/tailbench goos: linux goarch: amd64 pkg: tailscale.com/cmd/tailbench cpu: Intel(R) Core(TM) i7-4785T CPU @ 2.20GHz BenchmarkTrivialNoAlloc/32-2 56340248 93.85 ns/op 340.98 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkTrivialNoAlloc/124-2 57527490 99.27 ns/op 1249.10 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkTrivialNoAlloc/1024-2 52537773 111.3 ns/op 9200.39 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkTrivial/32-2 41878063 135.6 ns/op 236.04 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkTrivial/124-2 41270439 138.4 ns/op 896.02 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkTrivial/1024-2 36337252 154.3 ns/op 6635.30 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkBlockingChannel/32-2 12171654 494.3 ns/op 64.74 MB/s 0 %lost 1791 B/op 0 allocs/op BenchmarkBlockingChannel/124-2 12149956 507.8 ns/op 244.17 MB/s 0 %lost 1792 B/op 1 allocs/op BenchmarkBlockingChannel/1024-2 11034754 528.8 ns/op 1936.42 MB/s 0 %lost 1792 B/op 1 allocs/op BenchmarkNonlockingChannel/32-2 8960622 2195 ns/op 14.58 MB/s 8.825 %lost 1792 B/op 1 allocs/op BenchmarkNonlockingChannel/124-2 3014614 2224 ns/op 55.75 MB/s 11.18 %lost 1792 B/op 1 allocs/op BenchmarkNonlockingChannel/1024-2 3234915 1688 ns/op 606.53 MB/s 3.765 %lost 1792 B/op 1 allocs/op BenchmarkDoubleChannel/32-2 8457559 764.1 ns/op 41.88 MB/s 5.945 %lost 1792 B/op 1 allocs/op BenchmarkDoubleChannel/124-2 5497726 1030 ns/op 120.38 MB/s 12.14 %lost 1792 B/op 1 allocs/op BenchmarkDoubleChannel/1024-2 7985656 1360 ns/op 752.86 MB/s 13.57 %lost 1792 B/op 1 allocs/op BenchmarkUDP/32-2 1652134 3695 ns/op 8.66 MB/s 0 %lost 176 B/op 3 allocs/op BenchmarkUDP/124-2 1621024 3765 ns/op 32.94 MB/s 0 %lost 176 B/op 3 allocs/op BenchmarkUDP/1024-2 1553750 3825 ns/op 267.72 MB/s 0 %lost 176 B/op 3 allocs/op BenchmarkTCP/32-2 11056336 503.2 ns/op 63.60 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkTCP/124-2 11074869 533.7 ns/op 232.32 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkTCP/1024-2 8934968 671.4 ns/op 1525.20 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkWireGuardTest/32-2 1403702 4547 ns/op 7.04 MB/s 14.37 %lost 467 B/op 3 allocs/op BenchmarkWireGuardTest/124-2 780645 7927 ns/op 15.64 MB/s 1.537 %lost 420 B/op 3 allocs/op BenchmarkWireGuardTest/1024-2 512671 11791 ns/op 86.85 MB/s 0.5206 %lost 411 B/op 3 allocs/op PASS ok tailscale.com/wgengine/bench 195.724s Updates #414. Signed-off-by: Avery Pennarun <apenwarr@tailscale.com>
4 years ago
// Copyright (c) 2021 Tailscale Inc & AUTHORS All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Create two wgengine instances and pass data through them, measuring
// throughput, latency, and packet loss.
package main
import (
"fmt"
"testing"
"time"
"tailscale.com/types/logger"
)
func BenchmarkTrivialNoAlloc(b *testing.B) {
run(b, setupTrivialNoAllocTest)
}
func BenchmarkTrivial(b *testing.B) {
run(b, setupTrivialTest)
}
func BenchmarkBlockingChannel(b *testing.B) {
run(b, setupBlockingChannelTest)
}
func BenchmarkNonblockingChannel(b *testing.B) {
run(b, setupNonblockingChannelTest)
}
func BenchmarkDoubleChannel(b *testing.B) {
run(b, setupDoubleChannelTest)
}
func BenchmarkUDP(b *testing.B) {
run(b, setupUDPTest)
}
func BenchmarkBatchTCP(b *testing.B) {
run(b, setupBatchTCPTest)
}
func BenchmarkWireGuardTest(b *testing.B) {
run(b, func(logf logger.Logf, traf *TrafficGen) {
setupWGTest(logf, traf, Addr1, Addr2)
})
}
type SetupFunc func(logger.Logf, *TrafficGen)
func run(b *testing.B, setup SetupFunc) {
sizes := []int{
ICMPMinSize + 8,
ICMPMinSize + 100,
ICMPMinSize + 1000,
}
for _, size := range sizes {
b.Run(fmt.Sprintf("%d", size), func(b *testing.B) {
runOnce(b, setup, size)
})
}
}
func runOnce(b *testing.B, setup SetupFunc, payload int) {
b.StopTimer()
b.ReportAllocs()
var logf logger.Logf = b.Logf
if !testing.Verbose() {
logf = logger.Discard
}
traf := NewTrafficGen(b.StartTimer)
setup(logf, traf)
logf("initialized. (n=%v)", b.N)
b.SetBytes(int64(payload))
traf.Start(Addr1.IP, Addr2.IP, payload, int64(b.N))
var cur, prev Snapshot
var pps int64
wgengine/bench: speed test for channels, sockets, and wireguard-go. This tries to generate traffic at a rate that will saturate the receiver, without overdoing it, even in the event of packet loss. It's unrealistically more aggressive than TCP (which will back off quickly in case of packet loss) but less silly than a blind test that just generates packets as fast as it can (which can cause all the CPU to be absorbed by the transmitter, giving an incorrect impression of how much capacity the total system has). Initial indications are that a syscall about every 10 packets (TCP bulk delivery) is roughly the same speed as sending every packet through a channel. A syscall per packet is about 5x-10x slower than that. The whole tailscale wireguard-go + magicsock + packet filter combination is about 4x slower again, which is better than I thought we'd do, but probably has room for improvement. Note that in "full" tailscale, there is also a tundev read/write for every packet, effectively doubling the syscall overhead per packet. Given these numbers, it seems like read/write syscalls are only 25-40% of the total CPU time used in tailscale proper, so we do have significant non-syscall optimization work to do too. Sample output: $ GOMAXPROCS=2 go test -bench . -benchtime 5s ./cmd/tailbench goos: linux goarch: amd64 pkg: tailscale.com/cmd/tailbench cpu: Intel(R) Core(TM) i7-4785T CPU @ 2.20GHz BenchmarkTrivialNoAlloc/32-2 56340248 93.85 ns/op 340.98 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkTrivialNoAlloc/124-2 57527490 99.27 ns/op 1249.10 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkTrivialNoAlloc/1024-2 52537773 111.3 ns/op 9200.39 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkTrivial/32-2 41878063 135.6 ns/op 236.04 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkTrivial/124-2 41270439 138.4 ns/op 896.02 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkTrivial/1024-2 36337252 154.3 ns/op 6635.30 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkBlockingChannel/32-2 12171654 494.3 ns/op 64.74 MB/s 0 %lost 1791 B/op 0 allocs/op BenchmarkBlockingChannel/124-2 12149956 507.8 ns/op 244.17 MB/s 0 %lost 1792 B/op 1 allocs/op BenchmarkBlockingChannel/1024-2 11034754 528.8 ns/op 1936.42 MB/s 0 %lost 1792 B/op 1 allocs/op BenchmarkNonlockingChannel/32-2 8960622 2195 ns/op 14.58 MB/s 8.825 %lost 1792 B/op 1 allocs/op BenchmarkNonlockingChannel/124-2 3014614 2224 ns/op 55.75 MB/s 11.18 %lost 1792 B/op 1 allocs/op BenchmarkNonlockingChannel/1024-2 3234915 1688 ns/op 606.53 MB/s 3.765 %lost 1792 B/op 1 allocs/op BenchmarkDoubleChannel/32-2 8457559 764.1 ns/op 41.88 MB/s 5.945 %lost 1792 B/op 1 allocs/op BenchmarkDoubleChannel/124-2 5497726 1030 ns/op 120.38 MB/s 12.14 %lost 1792 B/op 1 allocs/op BenchmarkDoubleChannel/1024-2 7985656 1360 ns/op 752.86 MB/s 13.57 %lost 1792 B/op 1 allocs/op BenchmarkUDP/32-2 1652134 3695 ns/op 8.66 MB/s 0 %lost 176 B/op 3 allocs/op BenchmarkUDP/124-2 1621024 3765 ns/op 32.94 MB/s 0 %lost 176 B/op 3 allocs/op BenchmarkUDP/1024-2 1553750 3825 ns/op 267.72 MB/s 0 %lost 176 B/op 3 allocs/op BenchmarkTCP/32-2 11056336 503.2 ns/op 63.60 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkTCP/124-2 11074869 533.7 ns/op 232.32 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkTCP/1024-2 8934968 671.4 ns/op 1525.20 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkWireGuardTest/32-2 1403702 4547 ns/op 7.04 MB/s 14.37 %lost 467 B/op 3 allocs/op BenchmarkWireGuardTest/124-2 780645 7927 ns/op 15.64 MB/s 1.537 %lost 420 B/op 3 allocs/op BenchmarkWireGuardTest/1024-2 512671 11791 ns/op 86.85 MB/s 0.5206 %lost 411 B/op 3 allocs/op PASS ok tailscale.com/wgengine/bench 195.724s Updates #414. Signed-off-by: Avery Pennarun <apenwarr@tailscale.com>
4 years ago
i := 0
for traf.Running() {
i += 1
time.Sleep(10 * time.Millisecond)
if (i % 100) == 0 {
prev = cur
cur = traf.Snap()
d := cur.Sub(prev)
if prev.WhenNsec != 0 {
logf("%v @%7d pkt/sec", d, pps)
wgengine/bench: speed test for channels, sockets, and wireguard-go. This tries to generate traffic at a rate that will saturate the receiver, without overdoing it, even in the event of packet loss. It's unrealistically more aggressive than TCP (which will back off quickly in case of packet loss) but less silly than a blind test that just generates packets as fast as it can (which can cause all the CPU to be absorbed by the transmitter, giving an incorrect impression of how much capacity the total system has). Initial indications are that a syscall about every 10 packets (TCP bulk delivery) is roughly the same speed as sending every packet through a channel. A syscall per packet is about 5x-10x slower than that. The whole tailscale wireguard-go + magicsock + packet filter combination is about 4x slower again, which is better than I thought we'd do, but probably has room for improvement. Note that in "full" tailscale, there is also a tundev read/write for every packet, effectively doubling the syscall overhead per packet. Given these numbers, it seems like read/write syscalls are only 25-40% of the total CPU time used in tailscale proper, so we do have significant non-syscall optimization work to do too. Sample output: $ GOMAXPROCS=2 go test -bench . -benchtime 5s ./cmd/tailbench goos: linux goarch: amd64 pkg: tailscale.com/cmd/tailbench cpu: Intel(R) Core(TM) i7-4785T CPU @ 2.20GHz BenchmarkTrivialNoAlloc/32-2 56340248 93.85 ns/op 340.98 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkTrivialNoAlloc/124-2 57527490 99.27 ns/op 1249.10 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkTrivialNoAlloc/1024-2 52537773 111.3 ns/op 9200.39 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkTrivial/32-2 41878063 135.6 ns/op 236.04 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkTrivial/124-2 41270439 138.4 ns/op 896.02 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkTrivial/1024-2 36337252 154.3 ns/op 6635.30 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkBlockingChannel/32-2 12171654 494.3 ns/op 64.74 MB/s 0 %lost 1791 B/op 0 allocs/op BenchmarkBlockingChannel/124-2 12149956 507.8 ns/op 244.17 MB/s 0 %lost 1792 B/op 1 allocs/op BenchmarkBlockingChannel/1024-2 11034754 528.8 ns/op 1936.42 MB/s 0 %lost 1792 B/op 1 allocs/op BenchmarkNonlockingChannel/32-2 8960622 2195 ns/op 14.58 MB/s 8.825 %lost 1792 B/op 1 allocs/op BenchmarkNonlockingChannel/124-2 3014614 2224 ns/op 55.75 MB/s 11.18 %lost 1792 B/op 1 allocs/op BenchmarkNonlockingChannel/1024-2 3234915 1688 ns/op 606.53 MB/s 3.765 %lost 1792 B/op 1 allocs/op BenchmarkDoubleChannel/32-2 8457559 764.1 ns/op 41.88 MB/s 5.945 %lost 1792 B/op 1 allocs/op BenchmarkDoubleChannel/124-2 5497726 1030 ns/op 120.38 MB/s 12.14 %lost 1792 B/op 1 allocs/op BenchmarkDoubleChannel/1024-2 7985656 1360 ns/op 752.86 MB/s 13.57 %lost 1792 B/op 1 allocs/op BenchmarkUDP/32-2 1652134 3695 ns/op 8.66 MB/s 0 %lost 176 B/op 3 allocs/op BenchmarkUDP/124-2 1621024 3765 ns/op 32.94 MB/s 0 %lost 176 B/op 3 allocs/op BenchmarkUDP/1024-2 1553750 3825 ns/op 267.72 MB/s 0 %lost 176 B/op 3 allocs/op BenchmarkTCP/32-2 11056336 503.2 ns/op 63.60 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkTCP/124-2 11074869 533.7 ns/op 232.32 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkTCP/1024-2 8934968 671.4 ns/op 1525.20 MB/s 0 %lost 0 B/op 0 allocs/op BenchmarkWireGuardTest/32-2 1403702 4547 ns/op 7.04 MB/s 14.37 %lost 467 B/op 3 allocs/op BenchmarkWireGuardTest/124-2 780645 7927 ns/op 15.64 MB/s 1.537 %lost 420 B/op 3 allocs/op BenchmarkWireGuardTest/1024-2 512671 11791 ns/op 86.85 MB/s 0.5206 %lost 411 B/op 3 allocs/op PASS ok tailscale.com/wgengine/bench 195.724s Updates #414. Signed-off-by: Avery Pennarun <apenwarr@tailscale.com>
4 years ago
}
}
pps = traf.Adjust()
}
cur = traf.Snap()
d := cur.Sub(prev)
loss := float64(d.LostPackets) / float64(d.RxPackets)
b.ReportMetric(loss*100, "%lost")
}