You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
tailscale/wgengine/userspace.go

1240 lines
35 KiB
Go

// Copyright (c) 2020 Tailscale Inc & AUTHORS All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package wgengine
import (
"bufio"
"bytes"
"context"
"encoding/binary"
"errors"
"fmt"
"io"
"log"
"net"
"os"
"os/exec"
"runtime"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/tailscale/wireguard-go/device"
"github.com/tailscale/wireguard-go/tun"
"github.com/tailscale/wireguard-go/wgcfg"
"go4.org/mem"
"inet.af/netaddr"
"tailscale.com/control/controlclient"
"tailscale.com/internal/deepprint"
"tailscale.com/ipn/ipnstate"
"tailscale.com/net/interfaces"
"tailscale.com/net/tsaddr"
"tailscale.com/tailcfg"
"tailscale.com/types/key"
"tailscale.com/types/logger"
"tailscale.com/version"
"tailscale.com/wgengine/filter"
"tailscale.com/wgengine/magicsock"
"tailscale.com/wgengine/monitor"
"tailscale.com/wgengine/packet"
"tailscale.com/wgengine/router"
"tailscale.com/wgengine/tsdns"
wgengine: wrap tun.Device to support filtering and packet injection (#358) Right now, filtering and packet injection in wgengine depend on a patch to wireguard-go that probably isn't suitable for upstreaming. This need not be the case: wireguard-go/tun.Device is an interface. For example, faketun.go implements it to mock a TUN device for testing. This patch implements the same interface to provide filtering and packet injection at the tunnel device level, at which point the wireguard-go patch should no longer be necessary. This patch has the following performance impact on i7-7500U @ 2.70GHz, tested in the following namespace configuration: ┌────────────────┐ ┌─────────────────────────────────┐ ┌────────────────┐ │ $ns1 │ │ $ns0 │ │ $ns2 │ │ client0 │ │ tailcontrol, logcatcher │ │ client1 │ │ ┌─────┐ │ │ ┌──────┐ ┌──────┐ │ │ ┌─────┐ │ │ │vethc│───────┼────┼──│vethrc│ │vethrs│──────┼─────┼──│veths│ │ │ ├─────┴─────┐ │ │ ├──────┴────┐ ├──────┴────┐ │ │ ├─────┴─────┐ │ │ │10.0.0.2/24│ │ │ │10.0.0.1/24│ │10.0.1.1/24│ │ │ │10.0.1.2/24│ │ │ └───────────┘ │ │ └───────────┘ └───────────┘ │ │ └───────────┘ │ └────────────────┘ └─────────────────────────────────┘ └────────────────┘ Before: --------------------------------------------------- | TCP send | UDP send | |------------------------|------------------------| | 557.0 (±8.5) Mbits/sec | 3.03 (±0.02) Gbits/sec | --------------------------------------------------- After: --------------------------------------------------- | TCP send | UDP send | |------------------------|------------------------| | 544.8 (±1.6) Mbits/sec | 3.13 (±0.02) Gbits/sec | --------------------------------------------------- The impact on receive performance is similar. Signed-off-by: Dmytro Shynkevych <dmytro@tailscale.com>
4 years ago
"tailscale.com/wgengine/tstun"
)
// minimalMTU is the MTU we set on tailscale's tuntap
// interface. wireguard-go defaults to 1420 bytes, which only works if
// the "outer" MTU is 1500 bytes. This breaks on DSL connections
// (typically 1492 MTU) and on GCE (1460 MTU?!).
//
// 1280 is the smallest MTU allowed for IPv6, which is a sensible
// "probably works everywhere" setting until we develop proper PMTU
// discovery.
const minimalMTU = 1280
const (
magicDNSIP = 0x64646464 // 100.100.100.100
magicDNSPort = 53
)
// magicDNSDomain is the parent domain for Tailscale nodes.
const magicDNSDomain = "b.tailscale.net."
// Lazy wireguard-go configuration parameters.
const (
// lazyPeerIdleThreshold is the idle duration after
// which we remove a peer from the wireguard configuration.
// (This includes peers that have never been idle, which
// effectively have infinite idleness)
lazyPeerIdleThreshold = 5 * time.Minute
// packetSendTimeUpdateFrequency controls how often we record
// the time that we wrote a packet to an IP address.
packetSendTimeUpdateFrequency = 10 * time.Second
// packetSendRecheckWireguardThreshold controls how long we can go
// between packet sends to an IP before checking to see
// whether this IP address needs to be added back to the
// Wireguard peer oconfig.
packetSendRecheckWireguardThreshold = 1 * time.Minute
)
type userspaceEngine struct {
logf logger.Logf
reqCh chan struct{}
waitCh chan struct{} // chan is closed when first Close call completes; contrast with closing bool
timeNow func() time.Time
tundev *tstun.TUN
wgdev *device.Device
router router.Router
resolver *tsdns.Resolver
magicConn *magicsock.Conn
linkMon *monitor.Mon
testMaybeReconfigHook func() // for tests; if non-nil, fires if maybeReconfigWireguardLocked called
// localAddrs is the set of IP addresses assigned to the local
// tunnel interface. It's used to reflect local packets
// incorrectly sent to us.
localAddrs atomic.Value // of map[packet.IP]bool
wgLock sync.Mutex // serializes all wgdev operations; see lock order comment below
lastCfgFull wgcfg.Config
lastRouterSig string // of router.Config
lastEngineSigFull string // of full wireguard config
lastEngineSigTrim string // of trimmed wireguard config
recvActivityAt map[tailcfg.DiscoKey]time.Time
sentActivityAt map[packet.IP]*int64 // value is atomic int64 of unixtime
destIPActivityFuncs map[packet.IP]func()
mu sync.Mutex // guards following; see lock order comment below
closing bool // Close was called (even if we're still closing)
statusCallback StatusCallback
peerSequence []wgcfg.Key
endpoints []string
pingers map[wgcfg.Key]*pinger // legacy pingers for pre-discovery peers
linkState *interfaces.State
// Lock ordering: magicsock.Conn.mu, wgLock, then mu.
}
// RouterGen is the signature for a function that creates a
// router.Router.
type RouterGen func(logf logger.Logf, wgdev *device.Device, tundev tun.Device) (router.Router, error)
type EngineConfig struct {
// Logf is the logging function used by the engine.
Logf logger.Logf
// TUN is the tun device used by the engine.
TUN tun.Device
// RouterGen is the function used to instantiate the router.
RouterGen RouterGen
// ListenPort is the port on which the engine will listen.
ListenPort uint16
// Fake determines whether this engine is running in fake mode,
// which disables such features as DNS configuration and unrestricted ICMP Echo responses.
Fake bool
}
type Loggify struct {
f logger.Logf
}
func (l *Loggify) Write(b []byte) (int, error) {
l.f(string(b))
return len(b), nil
}
func NewFakeUserspaceEngine(logf logger.Logf, listenPort uint16) (Engine, error) {
logf("Starting userspace wireguard engine (FAKE tuntap device).")
conf := EngineConfig{
Logf: logf,
TUN: tstun.NewFakeTUN(),
RouterGen: router.NewFake,
ListenPort: listenPort,
Fake: true,
}
return NewUserspaceEngineAdvanced(conf)
}
// NewUserspaceEngine creates the named tun device and returns a
// Tailscale Engine running on it.
func NewUserspaceEngine(logf logger.Logf, tunname string, listenPort uint16) (Engine, error) {
if tunname == "" {
return nil, fmt.Errorf("--tun name must not be blank")
}
logf("Starting userspace wireguard engine with tun device %q", tunname)
wgengine: wrap tun.Device to support filtering and packet injection (#358) Right now, filtering and packet injection in wgengine depend on a patch to wireguard-go that probably isn't suitable for upstreaming. This need not be the case: wireguard-go/tun.Device is an interface. For example, faketun.go implements it to mock a TUN device for testing. This patch implements the same interface to provide filtering and packet injection at the tunnel device level, at which point the wireguard-go patch should no longer be necessary. This patch has the following performance impact on i7-7500U @ 2.70GHz, tested in the following namespace configuration: ┌────────────────┐ ┌─────────────────────────────────┐ ┌────────────────┐ │ $ns1 │ │ $ns0 │ │ $ns2 │ │ client0 │ │ tailcontrol, logcatcher │ │ client1 │ │ ┌─────┐ │ │ ┌──────┐ ┌──────┐ │ │ ┌─────┐ │ │ │vethc│───────┼────┼──│vethrc│ │vethrs│──────┼─────┼──│veths│ │ │ ├─────┴─────┐ │ │ ├──────┴────┐ ├──────┴────┐ │ │ ├─────┴─────┐ │ │ │10.0.0.2/24│ │ │ │10.0.0.1/24│ │10.0.1.1/24│ │ │ │10.0.1.2/24│ │ │ └───────────┘ │ │ └───────────┘ └───────────┘ │ │ └───────────┘ │ └────────────────┘ └─────────────────────────────────┘ └────────────────┘ Before: --------------------------------------------------- | TCP send | UDP send | |------------------------|------------------------| | 557.0 (±8.5) Mbits/sec | 3.03 (±0.02) Gbits/sec | --------------------------------------------------- After: --------------------------------------------------- | TCP send | UDP send | |------------------------|------------------------| | 544.8 (±1.6) Mbits/sec | 3.13 (±0.02) Gbits/sec | --------------------------------------------------- The impact on receive performance is similar. Signed-off-by: Dmytro Shynkevych <dmytro@tailscale.com>
4 years ago
tun, err := tun.CreateTUN(tunname, minimalMTU)
if err != nil {
diagnoseTUNFailure(logf)
logf("CreateTUN: %v", err)
return nil, err
}
logf("CreateTUN ok.")
conf := EngineConfig{
Logf: logf,
TUN: tun,
RouterGen: router.New,
ListenPort: listenPort,
}
e, err := NewUserspaceEngineAdvanced(conf)
if err != nil {
return nil, err
}
return e, err
}
// NewUserspaceEngineAdvanced is like NewUserspaceEngine
// but provides control over all config fields.
func NewUserspaceEngineAdvanced(conf EngineConfig) (Engine, error) {
return newUserspaceEngineAdvanced(conf)
}
func newUserspaceEngineAdvanced(conf EngineConfig) (_ Engine, reterr error) {
logf := conf.Logf
e := &userspaceEngine{
timeNow: time.Now,
logf: logf,
reqCh: make(chan struct{}, 1),
waitCh: make(chan struct{}),
tundev: tstun.WrapTUN(logf, conf.TUN),
resolver: tsdns.NewResolver(logf, magicDNSDomain),
pingers: make(map[wgcfg.Key]*pinger),
}
e.localAddrs.Store(map[packet.IP]bool{})
e.linkState, _ = getLinkState()
// Respond to all pings only in fake mode.
if conf.Fake {
e.tundev.PostFilterIn = echoRespondToAll
}
e.tundev.PreFilterOut = e.handleLocalPackets
mon, err := monitor.New(logf, func() { e.LinkChange(false) })
if err != nil {
e.tundev.Close()
return nil, err
}
e.linkMon = mon
endpointsFn := func(endpoints []string) {
e.mu.Lock()
e.endpoints = append(e.endpoints[:0], endpoints...)
e.mu.Unlock()
e.RequestStatus()
}
magicsockOpts := magicsock.Options{
Logf: logf,
Port: conf.ListenPort,
EndpointsFunc: endpointsFn,
IdleFunc: e.tundev.IdleDuration,
NoteRecvActivity: e.noteReceiveActivity,
}
e.magicConn, err = magicsock.NewConn(magicsockOpts)
if err != nil {
e.tundev.Close()
return nil, fmt.Errorf("wgengine: %v", err)
}
// flags==0 because logf is already nested in another logger.
// The outer one can display the preferred log prefixes, etc.
dlog := log.New(&Loggify{logf}, "", 0)
logger := device.Logger{
Debug: dlog,
Info: dlog,
Error: dlog,
}
opts := &device.DeviceOptions{
wgengine: wrap tun.Device to support filtering and packet injection (#358) Right now, filtering and packet injection in wgengine depend on a patch to wireguard-go that probably isn't suitable for upstreaming. This need not be the case: wireguard-go/tun.Device is an interface. For example, faketun.go implements it to mock a TUN device for testing. This patch implements the same interface to provide filtering and packet injection at the tunnel device level, at which point the wireguard-go patch should no longer be necessary. This patch has the following performance impact on i7-7500U @ 2.70GHz, tested in the following namespace configuration: ┌────────────────┐ ┌─────────────────────────────────┐ ┌────────────────┐ │ $ns1 │ │ $ns0 │ │ $ns2 │ │ client0 │ │ tailcontrol, logcatcher │ │ client1 │ │ ┌─────┐ │ │ ┌──────┐ ┌──────┐ │ │ ┌─────┐ │ │ │vethc│───────┼────┼──│vethrc│ │vethrs│──────┼─────┼──│veths│ │ │ ├─────┴─────┐ │ │ ├──────┴────┐ ├──────┴────┐ │ │ ├─────┴─────┐ │ │ │10.0.0.2/24│ │ │ │10.0.0.1/24│ │10.0.1.1/24│ │ │ │10.0.1.2/24│ │ │ └───────────┘ │ │ └───────────┘ └───────────┘ │ │ └───────────┘ │ └────────────────┘ └─────────────────────────────────┘ └────────────────┘ Before: --------------------------------------------------- | TCP send | UDP send | |------------------------|------------------------| | 557.0 (±8.5) Mbits/sec | 3.03 (±0.02) Gbits/sec | --------------------------------------------------- After: --------------------------------------------------- | TCP send | UDP send | |------------------------|------------------------| | 544.8 (±1.6) Mbits/sec | 3.13 (±0.02) Gbits/sec | --------------------------------------------------- The impact on receive performance is similar. Signed-off-by: Dmytro Shynkevych <dmytro@tailscale.com>
4 years ago
Logger: &logger,
HandshakeDone: func(peerKey wgcfg.Key, peer *device.Peer, deviceAllowedIPs *device.AllowedIPs) {
// Send an unsolicited status event every time a
// handshake completes. This makes sure our UI can
// update quickly as soon as it connects to a peer.
//
// We use a goroutine here to avoid deadlocking
// wireguard, since RequestStatus() will call back
// into it, and wireguard is what called us to get
// here.
go e.RequestStatus()
if e.magicConn.PeerHasDiscoKey(tailcfg.NodeKey(peerKey)) {
e.logf("wireguard handshake complete for %v", peerKey.ShortString())
// This is a modern peer with discovery support. No need to send pings.
return
}
e.logf("wireguard handshake complete for %v; sending legacy pings", peerKey.ShortString())
// Ping every single-IP that peer routes.
// These synthetic packets are used to traverse NATs.
var ips []wgcfg.IP
allowedIPs := deviceAllowedIPs.EntriesForPeer(peer)
for _, ipNet := range allowedIPs {
if ones, bits := ipNet.Mask.Size(); ones == bits && ones != 0 {
var ip wgcfg.IP
copy(ip.Addr[:], ipNet.IP.To16())
ips = append(ips, ip)
}
}
if len(ips) > 0 {
go e.pinger(peerKey, ips)
} else {
logf("[unexpected] peer %s has no single-IP routes: %v", peerKey.ShortString(), allowedIPs)
}
},
CreateBind: e.magicConn.CreateBind,
CreateEndpoint: e.magicConn.CreateEndpoint,
SkipBindUpdate: true,
}
// wgdev takes ownership of tundev, will close it when closed.
e.wgdev = device.NewDevice(e.tundev, opts)
defer func() {
if reterr != nil {
e.wgdev.Close()
}
}()
// Pass the underlying tun.(*NativeDevice) to the router:
// routers do not Read or Write, but do access native interfaces.
e.router, err = conf.RouterGen(logf, e.wgdev, e.tundev.Unwrap())
if err != nil {
e.magicConn.Close()
return nil, err
}
go func() {
up := false
for event := range e.tundev.Events() {
if event&tun.EventMTUUpdate != 0 {
mtu, err := e.tundev.MTU()
e.logf("external route MTU: %d (%v)", mtu, err)
}
if event&tun.EventUp != 0 && !up {
e.logf("external route: up")
e.RequestStatus()
up = true
}
if event&tun.EventDown != 0 && up {
e.logf("external route: down")
e.RequestStatus()
up = false
}
}
}()
e.wgdev.Up()
if err := e.router.Up(); err != nil {
e.magicConn.Close()
e.wgdev.Close()
return nil, err
}
// TODO(danderson): we should delete this. It's pointless to apply
// a no-op settings here.
if err := e.router.Set(nil); err != nil {
e.magicConn.Close()
e.wgdev.Close()
return nil, err
}
e.linkMon.Start()
e.magicConn.Start()
e.resolver.Start()
go e.pollResolver()
return e, nil
}
// echoRespondToAll is an inbound post-filter responding to all echo requests.
func echoRespondToAll(p *packet.ParsedPacket, t *tstun.TUN) filter.Response {
if p.IsEchoRequest() {
header := p.ICMPHeader()
header.ToResponse()
packet := packet.Generate(&header, p.Payload())
t.InjectOutbound(packet)
// We already handled it, stop.
return filter.Drop
}
return filter.Accept
}
// handleLocalPackets inspects packets coming from the local network
// stack, and intercepts any packets that should be handled by
// tailscaled directly. Other packets are allowed to proceed into the
// main ACL filter.
func (e *userspaceEngine) handleLocalPackets(p *packet.ParsedPacket, t *tstun.TUN) filter.Response {
if verdict := e.handleDNS(p, t); verdict == filter.Drop {
// local DNS handled the packet.
return filter.Drop
}
if runtime.GOOS == "darwin" && e.isLocalAddr(p.DstIP) {
// macOS NetworkExtension directs packets destined to the
// tunnel's local IP address into the tunnel, instead of
// looping back within the kernel network stack. We have to
// notice that an outbound packet is actually destined for
// ourselves, and loop it back into macOS.
t.InjectInboundCopy(p.Buffer())
return filter.Drop
}
return filter.Accept
}
func (e *userspaceEngine) isLocalAddr(ip packet.IP) bool {
localAddrs, ok := e.localAddrs.Load().(map[packet.IP]bool)
if !ok {
e.logf("[unexpected] e.localAddrs was nil, can't check for loopback packet")
return false
}
return localAddrs[ip]
}
// handleDNS is an outbound pre-filter resolving Tailscale domains.
func (e *userspaceEngine) handleDNS(p *packet.ParsedPacket, t *tstun.TUN) filter.Response {
if p.DstIP == magicDNSIP && p.DstPort == magicDNSPort && p.IPProto == packet.UDP {
request := tsdns.Packet{
Payload: append([]byte(nil), p.Payload()...),
Addr: netaddr.IPPort{IP: p.SrcIP.Netaddr(), Port: p.SrcPort},
}
err := e.resolver.EnqueueRequest(request)
if err != nil {
e.logf("tsdns: enqueue: %v", err)
}
return filter.Drop
}
return filter.Accept
}
// pollResolver reads responses from the DNS resolver and injects them inbound.
func (e *userspaceEngine) pollResolver() {
for {
resp, err := e.resolver.NextResponse()
if err == tsdns.ErrClosed {
return
}
if err != nil {
e.logf("tsdns: error: %v", err)
continue
}
h := packet.UDPHeader{
IPHeader: packet.IPHeader{
SrcIP: packet.IP(magicDNSIP),
DstIP: packet.IPFromNetaddr(resp.Addr.IP),
},
SrcPort: magicDNSPort,
DstPort: resp.Addr.Port,
}
hlen := h.Len()
// TODO(dmytro): avoid this allocation without importing tstun quirks into tsdns.
const offset = tstun.PacketStartOffset
buf := make([]byte, offset+hlen+len(resp.Payload))
copy(buf[offset+hlen:], resp.Payload)
h.Marshal(buf[offset:])
e.tundev.InjectInboundDirect(buf, offset)
}
}
// pinger sends ping packets for a few seconds.
//
// These generated packets are used to ensure we trigger the spray logic in
// the magicsock package for NAT traversal.
//
// These are only used with legacy peers (before 0.100.0) that don't
// have advertised discovery keys.
type pinger struct {
e *userspaceEngine
done chan struct{} // closed after shutdown (not the ctx.Done() chan)
cancel context.CancelFunc
}
// close cleans up pinger and removes it from the userspaceEngine.pingers map.
// It cannot be called while p.e.mu is held.
func (p *pinger) close() {
p.cancel()
<-p.done
}
func (p *pinger) run(ctx context.Context, peerKey wgcfg.Key, ips []wgcfg.IP, srcIP packet.IP) {
defer func() {
p.e.mu.Lock()
if p.e.pingers[peerKey] == p {
delete(p.e.pingers, peerKey)
}
p.e.mu.Unlock()
close(p.done)
}()
header := packet.ICMPHeader{
IPHeader: packet.IPHeader{
SrcIP: srcIP,
},
Type: packet.ICMPEchoRequest,
Code: packet.ICMPNoCode,
}
// sendFreq is slightly longer than sprayFreq in magicsock to ensure
// that if these ping packets are the only source of early packets
// sent to the peer, that each one will be sprayed.
const sendFreq = 300 * time.Millisecond
const stopAfter = 3 * time.Second
start := time.Now()
var dstIPs []packet.IP
for _, ip := range ips {
dstIPs = append(dstIPs, packet.NewIP(ip.IP()))
}
payload := []byte("magicsock_spray") // no meaning
header.IPID = 1
t := time.NewTicker(sendFreq)
defer t.Stop()
for {
select {
case <-ctx.Done():
return
case <-t.C:
}
if time.Since(start) > stopAfter {
return
}
for _, dstIP := range dstIPs {
header.DstIP = dstIP
// InjectOutbound take ownership of the packet, so we allocate.
b := packet.Generate(&header, payload)
p.e.tundev.InjectOutbound(b)
}
header.IPID++
}
}
// pinger sends ping packets for a few seconds.
//
// These generated packets are used to ensure we trigger the spray logic in
// the magicsock package for NAT traversal.
//
// This is only used with legacy peers (before 0.100.0) that don't
// have advertised discovery keys.
func (e *userspaceEngine) pinger(peerKey wgcfg.Key, ips []wgcfg.IP) {
e.logf("generating initial ping traffic to %s (%v)", peerKey.ShortString(), ips)
var srcIP packet.IP
e.wgLock.Lock()
if len(e.lastCfgFull.Addresses) > 0 {
srcIP = packet.NewIP(e.lastCfgFull.Addresses[0].IP.IP())
}
e.wgLock.Unlock()
if srcIP == 0 {
e.logf("generating initial ping traffic: no source IP")
return
}
ctx, cancel := context.WithCancel(context.Background())
p := &pinger{
e: e,
done: make(chan struct{}),
cancel: cancel,
}
e.mu.Lock()
if e.closing {
e.mu.Unlock()
return
}
oldPinger := e.pingers[peerKey]
e.pingers[peerKey] = p
e.mu.Unlock()
if oldPinger != nil {
oldPinger.close()
}
p.run(ctx, peerKey, ips, srcIP)
}
var debugTrimWireguard, _ = strconv.ParseBool(os.Getenv("TS_DEBUG_TRIM_WIREGUARD"))
// forceFullWireguardConfig reports whether we should give wireguard
// our full network map, even for inactive peers
//
// TODO(bradfitz): remove this after our 1.0 launch; we don't want to
// enable wireguard config trimming quite yet because it just landed
// and we haven't got enough time testing it.
func forceFullWireguardConfig(numPeers int) bool {
// Did the user explicitly enable trimmming via the environment variable knob?
if debugTrimWireguard {
return false
}
// On iOS with large networks, it's critical, so turn on trimming.
// Otherwise we run out of memory from wireguard-go goroutine stacks+buffers.
// This will be the default later for all platforms and network sizes.
iOS := runtime.GOOS == "darwin" && version.IsMobile()
if iOS && numPeers > 50 {
return false
}
return true
}
// isTrimmablePeer reports whether p is a peer that we can trim out of the
// network map.
//
// We can only trim peers that both a) support discovery (because we
// know who they are when we receive their data and don't need to rely
// on wireguard-go figuring it out) and b) for implementation
// simplicity, have only one IP address (an IPv4 /32), which is the
// common case for most peers. Subnet router nodes will just always be
// created in the wireguard-go config.
func isTrimmablePeer(p *wgcfg.Peer, numPeers int) bool {
if forceFullWireguardConfig(numPeers) {
return false
}
if len(p.AllowedIPs) != 1 || len(p.Endpoints) != 1 {
return false
}
if !strings.HasSuffix(p.Endpoints[0].Host, ".disco.tailscale") {
return false
}
aip := p.AllowedIPs[0]
// TODO: IPv6 support, once we support IPv6 within the tunnel. In that case,
// len(p.AllowedIPs) probably will be more than 1.
if aip.Mask != 32 || !aip.IP.Is4() {
return false
}
return true
}
// noteReceiveActivity is called by magicsock when a packet has been received
// by the peer using discovery key dk. Magicsock calls this no more than
// every 10 seconds for a given peer.
func (e *userspaceEngine) noteReceiveActivity(dk tailcfg.DiscoKey) {
e.wgLock.Lock()
defer e.wgLock.Unlock()
was, ok := e.recvActivityAt[dk]
if !ok {
// Not a trimmable peer we care about tracking. (See isTrimmablePeer)
return
}
now := e.timeNow()
e.recvActivityAt[dk] = now
// If the last activity time jumped a bunch (say, at least
// half the idle timeout) then see if we need to reprogram
// Wireguard. This could probably be just
// lazyPeerIdleThreshold without the divide by 2, but
// maybeReconfigWireguardLocked is cheap enough to call every
// couple minutes (just not on every packet).
if was.IsZero() || now.Sub(was) > lazyPeerIdleThreshold/2 {
e.maybeReconfigWireguardLocked()
}
}
// isActiveSince reports whether the peer identified by (dk, ip) has
// had a packet sent to or received from it since t.
//
// e.wgLock must be held.
func (e *userspaceEngine) isActiveSince(dk tailcfg.DiscoKey, ip wgcfg.IP, t time.Time) bool {
if e.recvActivityAt[dk].After(t) {
return true
}
pip := packet.IP(binary.BigEndian.Uint32(ip.Addr[12:]))
timePtr, ok := e.sentActivityAt[pip]
if !ok {
return false
}
unixTime := atomic.LoadInt64(timePtr)
return unixTime >= t.Unix()
}
// discoKeyFromPeer returns the DiscoKey for a wireguard config's Peer.
//
// Invariant: isTrimmablePeer(p) == true, so it should have 1 endpoint with
// Host of form "<64-hex-digits>.disco.tailscale". If invariant is violated,
// we return the zero value.
func discoKeyFromPeer(p *wgcfg.Peer) tailcfg.DiscoKey {
host := p.Endpoints[0].Host
if len(host) < 64 {
return tailcfg.DiscoKey{}
}
k, err := key.NewPublicFromHexMem(mem.S(host[:64]))
if err != nil {
return tailcfg.DiscoKey{}
}
return tailcfg.DiscoKey(k)
}
// e.wgLock must be held.
func (e *userspaceEngine) maybeReconfigWireguardLocked() error {
if hook := e.testMaybeReconfigHook; hook != nil {
hook()
return nil
}
full := e.lastCfgFull
// Compute a minimal config to pass to wireguard-go
// based on the full config. Prune off all the peers
// and only add the active ones back.
min := full
min.Peers = nil
// We'll only keep a peer around if it's been active in
// the past 5 minutes. That's more than WireGuard's key
// rotation time anyway so it's no harm if we remove it
// later if it's been inactive.
activeCutoff := e.timeNow().Add(-lazyPeerIdleThreshold)
// Not all peers can be trimmed from the network map (see
// isTrimmablePeer). For those are are trimmable, keep track
// of their DiscoKey and Tailscale IPs. These are the ones
// we'll need to install tracking hooks for to watch their
// send/receive activity.
trackDisco := make([]tailcfg.DiscoKey, 0, len(full.Peers))
trackIPs := make([]wgcfg.IP, 0, len(full.Peers))
for i := range full.Peers {
p := &full.Peers[i]
if !isTrimmablePeer(p, len(full.Peers)) {
min.Peers = append(min.Peers, *p)
continue
}
tsIP := p.AllowedIPs[0].IP
dk := discoKeyFromPeer(p)
trackDisco = append(trackDisco, dk)
trackIPs = append(trackIPs, tsIP)
if e.isActiveSince(dk, tsIP, activeCutoff) {
min.Peers = append(min.Peers, *p)
}
}
if !deepprint.UpdateHash(&e.lastEngineSigTrim, min) {
// No changes
return nil
}
e.updateActivityMapsLocked(trackDisco, trackIPs)
e.logf("wgengine: Reconfig: configuring userspace wireguard config (with %d/%d peers)", len(min.Peers), len(full.Peers))
if err := e.wgdev.Reconfig(&min); err != nil {
e.logf("wgdev.Reconfig: %v", err)
return err
}
return nil
}
// updateActivityMapsLocked updates the data structures used for tracking the activity
// of wireguard peers that we might add/remove dynamically from the real config
// as given to wireguard-go.
//
// e.wgLock must be held.
func (e *userspaceEngine) updateActivityMapsLocked(trackDisco []tailcfg.DiscoKey, trackIPs []wgcfg.IP) {
// Generate the new map of which discokeys we want to track
// receive times for.
mr := map[tailcfg.DiscoKey]time.Time{} // TODO: only recreate this if set of keys changed
for _, dk := range trackDisco {
// Preserve old times in the new map, but also
// populate map entries for new trackDisco values with
// time.Time{} zero values. (Only entries in this map
// are tracked, so the Time zero values allow it to be
// tracked later)
mr[dk] = e.recvActivityAt[dk]
}
e.recvActivityAt = mr
oldTime := e.sentActivityAt
e.sentActivityAt = make(map[packet.IP]*int64, len(oldTime))
oldFunc := e.destIPActivityFuncs
e.destIPActivityFuncs = make(map[packet.IP]func(), len(oldFunc))
for _, wip := range trackIPs {
pip := packet.IP(binary.BigEndian.Uint32(wip.Addr[12:]))
timePtr := oldTime[pip]
if timePtr == nil {
timePtr = new(int64)
}
e.sentActivityAt[pip] = timePtr
fn := oldFunc[pip]
if fn == nil {
// This is the func that gets run on every outgoing packet for tracked IPs:
fn = func() {
now := e.timeNow().Unix()
old := atomic.LoadInt64(timePtr)
// How long's it been since we last sent a packet?
// For our first packet, old is Unix epoch time 0 (1970).
elapsedSec := now - old
if elapsedSec >= int64(packetSendTimeUpdateFrequency/time.Second) {
atomic.StoreInt64(timePtr, now)
}
// On a big jump, assume we might no longer be in the wireguard
// config and go check.
if elapsedSec >= int64(packetSendRecheckWireguardThreshold/time.Second) {
e.wgLock.Lock()
defer e.wgLock.Unlock()
e.maybeReconfigWireguardLocked()
}
}
}
e.destIPActivityFuncs[pip] = fn
}
e.tundev.SetDestIPActivityFuncs(e.destIPActivityFuncs)
}
func (e *userspaceEngine) Reconfig(cfg *wgcfg.Config, routerCfg *router.Config) error {
if routerCfg == nil {
panic("routerCfg must not be nil")
}
localAddrs := map[packet.IP]bool{}
for _, addr := range routerCfg.LocalAddrs {
// TODO: ipv6
if !addr.IP.Is4() {
continue
}
localAddrs[packet.IPFromNetaddr(addr.IP)] = true
}
e.localAddrs.Store(localAddrs)
e.wgLock.Lock()
defer e.wgLock.Unlock()
peerSet := make(map[key.Public]struct{}, len(cfg.Peers))
e.mu.Lock()
e.peerSequence = e.peerSequence[:0]
for _, p := range cfg.Peers {
e.peerSequence = append(e.peerSequence, p.PublicKey)
peerSet[key.Public(p.PublicKey)] = struct{}{}
}
e.mu.Unlock()
engineChanged := deepprint.UpdateHash(&e.lastEngineSigFull, cfg)
routerChanged := deepprint.UpdateHash(&e.lastRouterSig, routerCfg)
if !engineChanged && !routerChanged {
return ErrNoChanges
}
e.lastCfgFull = cfg.Copy()
// Tell magicsock about the new (or initial) private key
// (which is needed by DERP) before wgdev gets it, as wgdev
// will start trying to handshake, which we want to be able to
// go over DERP.
if err := e.magicConn.SetPrivateKey(cfg.PrivateKey); err != nil {
e.logf("wgengine: Reconfig: SetPrivateKey: %v", err)
}
e.magicConn.UpdatePeers(peerSet)
if err := e.maybeReconfigWireguardLocked(); err != nil {
return err
}
if routerChanged {
if routerCfg.DNS.Proxied {
ips := routerCfg.DNS.Nameservers
nameservers := make([]string, len(ips))
for i, ip := range ips {
nameservers[i] = net.JoinHostPort(ip.String(), "53")
}
e.resolver.SetNameservers(nameservers)
routerCfg.DNS.Nameservers = []netaddr.IP{tsaddr.TailscaleServiceIP()}
}
e.logf("wgengine: Reconfig: configuring router")
if err := e.router.Set(routerCfg); err != nil {
return err
}
}
e.logf("wgengine: Reconfig done")
return nil
}
func (e *userspaceEngine) GetFilter() *filter.Filter {
wgengine: wrap tun.Device to support filtering and packet injection (#358) Right now, filtering and packet injection in wgengine depend on a patch to wireguard-go that probably isn't suitable for upstreaming. This need not be the case: wireguard-go/tun.Device is an interface. For example, faketun.go implements it to mock a TUN device for testing. This patch implements the same interface to provide filtering and packet injection at the tunnel device level, at which point the wireguard-go patch should no longer be necessary. This patch has the following performance impact on i7-7500U @ 2.70GHz, tested in the following namespace configuration: ┌────────────────┐ ┌─────────────────────────────────┐ ┌────────────────┐ │ $ns1 │ │ $ns0 │ │ $ns2 │ │ client0 │ │ tailcontrol, logcatcher │ │ client1 │ │ ┌─────┐ │ │ ┌──────┐ ┌──────┐ │ │ ┌─────┐ │ │ │vethc│───────┼────┼──│vethrc│ │vethrs│──────┼─────┼──│veths│ │ │ ├─────┴─────┐ │ │ ├──────┴────┐ ├──────┴────┐ │ │ ├─────┴─────┐ │ │ │10.0.0.2/24│ │ │ │10.0.0.1/24│ │10.0.1.1/24│ │ │ │10.0.1.2/24│ │ │ └───────────┘ │ │ └───────────┘ └───────────┘ │ │ └───────────┘ │ └────────────────┘ └─────────────────────────────────┘ └────────────────┘ Before: --------------------------------------------------- | TCP send | UDP send | |------------------------|------------------------| | 557.0 (±8.5) Mbits/sec | 3.03 (±0.02) Gbits/sec | --------------------------------------------------- After: --------------------------------------------------- | TCP send | UDP send | |------------------------|------------------------| | 544.8 (±1.6) Mbits/sec | 3.13 (±0.02) Gbits/sec | --------------------------------------------------- The impact on receive performance is similar. Signed-off-by: Dmytro Shynkevych <dmytro@tailscale.com>
4 years ago
return e.tundev.GetFilter()
}
func (e *userspaceEngine) SetFilter(filt *filter.Filter) {
wgengine: wrap tun.Device to support filtering and packet injection (#358) Right now, filtering and packet injection in wgengine depend on a patch to wireguard-go that probably isn't suitable for upstreaming. This need not be the case: wireguard-go/tun.Device is an interface. For example, faketun.go implements it to mock a TUN device for testing. This patch implements the same interface to provide filtering and packet injection at the tunnel device level, at which point the wireguard-go patch should no longer be necessary. This patch has the following performance impact on i7-7500U @ 2.70GHz, tested in the following namespace configuration: ┌────────────────┐ ┌─────────────────────────────────┐ ┌────────────────┐ │ $ns1 │ │ $ns0 │ │ $ns2 │ │ client0 │ │ tailcontrol, logcatcher │ │ client1 │ │ ┌─────┐ │ │ ┌──────┐ ┌──────┐ │ │ ┌─────┐ │ │ │vethc│───────┼────┼──│vethrc│ │vethrs│──────┼─────┼──│veths│ │ │ ├─────┴─────┐ │ │ ├──────┴────┐ ├──────┴────┐ │ │ ├─────┴─────┐ │ │ │10.0.0.2/24│ │ │ │10.0.0.1/24│ │10.0.1.1/24│ │ │ │10.0.1.2/24│ │ │ └───────────┘ │ │ └───────────┘ └───────────┘ │ │ └───────────┘ │ └────────────────┘ └─────────────────────────────────┘ └────────────────┘ Before: --------------------------------------------------- | TCP send | UDP send | |------------------------|------------------------| | 557.0 (±8.5) Mbits/sec | 3.03 (±0.02) Gbits/sec | --------------------------------------------------- After: --------------------------------------------------- | TCP send | UDP send | |------------------------|------------------------| | 544.8 (±1.6) Mbits/sec | 3.13 (±0.02) Gbits/sec | --------------------------------------------------- The impact on receive performance is similar. Signed-off-by: Dmytro Shynkevych <dmytro@tailscale.com>
4 years ago
e.tundev.SetFilter(filt)
}
func (e *userspaceEngine) SetDNSMap(dm *tsdns.Map) {
e.resolver.SetMap(dm)
}
func (e *userspaceEngine) SetStatusCallback(cb StatusCallback) {
e.mu.Lock()
defer e.mu.Unlock()
e.statusCallback = cb
}
func (e *userspaceEngine) getStatusCallback() StatusCallback {
e.mu.Lock()
defer e.mu.Unlock()
return e.statusCallback
}
// TODO: this function returns an error but it's always nil, and when
// there's actually a problem it just calls log.Fatal. Why?
func (e *userspaceEngine) getStatus() (*Status, error) {
// Grab derpConns before acquiring wgLock to not violate lock ordering;
// the DERPs method acquires magicsock.Conn.mu.
// (See comment in userspaceEngine's declaration.)
derpConns := e.magicConn.DERPs()
e.wgLock.Lock()
defer e.wgLock.Unlock()
e.mu.Lock()
closing := e.closing
e.mu.Unlock()
if closing {
return nil, errors.New("engine closing; no status")
}
if e.wgdev == nil {
// RequestStatus was invoked before the wgengine has
// finished initializing. This can happen when wgegine
// provides a callback to magicsock for endpoint
// updates that calls RequestStatus.
return nil, nil
}
// lineLen is the max UAPI line we expect. The longest I see is
// len("preshared_key=")+64 hex+"\n" == 79. Add some slop.
const lineLen = 100
pr, pw := io.Pipe()
errc := make(chan error, 1)
go func() {
defer pw.Close()
bw := bufio.NewWriterSize(pw, lineLen)
// TODO(apenwarr): get rid of silly uapi stuff for in-process comms
// FIXME: get notified of status changes instead of polling.
filter := device.IPCGetFilter{
// The allowed_ips are somewhat expensive to compute and they're
// unused below; request that they not be sent instead.
FilterAllowedIPs: true,
}
if err := e.wgdev.IpcGetOperationFiltered(bw, filter); err != nil {
errc <- fmt.Errorf("IpcGetOperation: %w", err)
return
}
errc <- bw.Flush()
}()
pp := make(map[wgcfg.Key]*PeerStatus)
p := &PeerStatus{}
var hst1, hst2, n int64
var err error
bs := bufio.NewScanner(pr)
bs.Buffer(make([]byte, lineLen), lineLen)
for bs.Scan() {
line := bs.Bytes()
k := line
var v mem.RO
if i := bytes.IndexByte(line, '='); i != -1 {
k = line[:i]
v = mem.B(line[i+1:])
}
switch string(k) {
case "public_key":
pk, err := key.NewPublicFromHexMem(v)
if err != nil {
log.Fatalf("IpcGetOperation: invalid key %#v", v)
}
p = &PeerStatus{}
pp[wgcfg.Key(pk)] = p
key := tailcfg.NodeKey(pk)
p.NodeKey = key
case "rx_bytes":
n, err = mem.ParseInt(v, 10, 64)
p.RxBytes = ByteCount(n)
if err != nil {
log.Fatalf("IpcGetOperation: rx_bytes invalid: %#v", line)
}
case "tx_bytes":
n, err = mem.ParseInt(v, 10, 64)
p.TxBytes = ByteCount(n)
if err != nil {
log.Fatalf("IpcGetOperation: tx_bytes invalid: %#v", line)
}
case "last_handshake_time_sec":
hst1, err = mem.ParseInt(v, 10, 64)
if err != nil {
log.Fatalf("IpcGetOperation: hst1 invalid: %#v", line)
}
case "last_handshake_time_nsec":
hst2, err = mem.ParseInt(v, 10, 64)
if err != nil {
log.Fatalf("IpcGetOperation: hst2 invalid: %#v", line)
}
if hst1 != 0 || hst2 != 0 {
p.LastHandshake = time.Unix(hst1, hst2)
} // else leave at time.IsZero()
}
}
if err := bs.Err(); err != nil {
log.Fatalf("reading IpcGetOperation output: %v", err)
}
if err := <-errc; err != nil {
log.Fatalf("IpcGetOperation: %v", err)
}
e.mu.Lock()
defer e.mu.Unlock()
var peers []PeerStatus
for _, pk := range e.peerSequence {
if p, ok := pp[pk]; ok { // ignore idle ones not in wireguard-go's config
peers = append(peers, *p)
}
}
return &Status{
LocalAddrs: append([]string(nil), e.endpoints...),
Peers: peers,
DERPs: derpConns,
}, nil
}
func (e *userspaceEngine) RequestStatus() {
// This is slightly tricky. e.getStatus() can theoretically get
// blocked inside wireguard for a while, and RequestStatus() is
// sometimes called from a goroutine, so we don't want a lot of
// them hanging around. On the other hand, requesting multiple
// status updates simultaneously is pointless anyway; they will
// all say the same thing.
// Enqueue at most one request. If one is in progress already, this
// adds one more to the queue. If one has been requested but not
// started, it is a no-op.
select {
case e.reqCh <- struct{}{}:
default:
}
// Dequeue at most one request. Another thread may have already
// dequeued the request we enqueued above, which is fine, since the
// information is guaranteed to be at least as recent as the current
// call to RequestStatus().
select {
case <-e.reqCh:
s, err := e.getStatus()
if s == nil && err == nil {
e.logf("RequestStatus: weird: both s and err are nil")
return
}
if cb := e.getStatusCallback(); cb != nil {
cb(s, err)
}
default:
}
}
func (e *userspaceEngine) Close() {
var pingers []*pinger
e.mu.Lock()
if e.closing {
e.mu.Unlock()
return
}
e.closing = true
for _, pinger := range e.pingers {
pingers = append(pingers, pinger)
}
e.mu.Unlock()
r := bufio.NewReader(strings.NewReader(""))
e.wgdev.IpcSetOperation(r)
e.resolver.Close()
e.magicConn.Close()
e.linkMon.Close()
e.router.Close()
e.wgdev.Close()
// Shut down pingers after tundev is closed (by e.wgdev.Close) so the
// synchronous close does not get stuck on InjectOutbound.
for _, pinger := range pingers {
pinger.close()
}
close(e.waitCh)
}
func (e *userspaceEngine) Wait() {
<-e.waitCh
}
func (e *userspaceEngine) setLinkState(st *interfaces.State) (changed bool) {
if st == nil {
return false
}
e.mu.Lock()
defer e.mu.Unlock()
changed = e.linkState == nil || !st.Equal(e.linkState)
e.linkState = st
return changed
}
func (e *userspaceEngine) LinkChange(isExpensive bool) {
cur, err := getLinkState()
if err != nil {
e.logf("LinkChange: interfaces.GetState: %v", err)
return
}
cur.IsExpensive = isExpensive
needRebind := e.setLinkState(cur)
e.logf("LinkChange(isExpensive=%v); needsRebind=%v", isExpensive, needRebind)
why := "link-change-minor"
if needRebind {
why = "link-change-major"
e.magicConn.Rebind()
}
e.magicConn.ReSTUN(why)
}
func getLinkState() (*interfaces.State, error) {
s, err := interfaces.GetState()
if s != nil {
s.RemoveTailscaleInterfaces()
}
return s, err
}
func (e *userspaceEngine) SetNetInfoCallback(cb NetInfoCallback) {
e.magicConn.SetNetInfoCallback(cb)
}
func (e *userspaceEngine) SetDERPMap(dm *tailcfg.DERPMap) {
e.magicConn.SetDERPMap(dm)
}
func (e *userspaceEngine) SetNetworkMap(nm *controlclient.NetworkMap) {
e.magicConn.SetNetworkMap(nm)
}
func (e *userspaceEngine) DiscoPublicKey() tailcfg.DiscoKey {
return e.magicConn.DiscoPublicKey()
}
func (e *userspaceEngine) UpdateStatus(sb *ipnstate.StatusBuilder) {
st, err := e.getStatus()
if err != nil {
e.logf("wgengine: getStatus: %v", err)
return
}
for _, ps := range st.Peers {
sb.AddPeer(key.Public(ps.NodeKey), &ipnstate.PeerStatus{
RxBytes: int64(ps.RxBytes),
TxBytes: int64(ps.TxBytes),
LastHandshake: ps.LastHandshake,
InEngine: true,
})
}
e.magicConn.UpdateStatus(sb)
}
func (e *userspaceEngine) Ping(ip netaddr.IP, cb func(*ipnstate.PingResult)) {
e.magicConn.Ping(ip, cb)
}
// diagnoseTUNFailure is called if tun.CreateTUN fails, to poke around
// the system and log some diagnostic info that might help debug why
// TUN failed. Because TUN's already failed and things the program's
// about to end, we might as well log a lot.
func diagnoseTUNFailure(logf logger.Logf) {
switch runtime.GOOS {
case "linux":
diagnoseLinuxTUNFailure(logf)
default:
logf("no TUN failure diagnostics for OS %q", runtime.GOOS)
}
}
func diagnoseLinuxTUNFailure(logf logger.Logf) {
kernel, err := exec.Command("uname", "-r").Output()
kernel = bytes.TrimSpace(kernel)
if err != nil {
logf("no TUN, and failed to look up kernel version: %v", err)
return
}
logf("Linux kernel version: %s", kernel)
modprobeOut, err := exec.Command("/sbin/modprobe", "tun").CombinedOutput()
if err == nil {
logf("'modprobe tun' successful")
// Either tun is currently loaded, or it's statically
// compiled into the kernel (which modprobe checks
// with /lib/modules/$(uname -r)/modules.builtin)
//
// So if there's a problem at this point, it's
// probably because /dev/net/tun doesn't exist.
const dev = "/dev/net/tun"
if fi, err := os.Stat(dev); err != nil {
logf("tun module loaded in kernel, but %s does not exist", dev)
} else {
logf("%s: %v", dev, fi.Mode())
}
// We failed to find why it failed. Just let our
// caller report the error it got from wireguard-go.
return
}
logf("is CONFIG_TUN enabled in your kernel? `modprobe tun` failed with: %s", modprobeOut)
distro := linuxDistro()
switch distro {
case "debian":
dpkgOut, err := exec.Command("dpkg", "-S", "kernel/drivers/net/tun.ko").CombinedOutput()
if len(bytes.TrimSpace(dpkgOut)) == 0 || err != nil {
logf("tun module not loaded nor found on disk")
return
}
if !bytes.Contains(dpkgOut, kernel) {
logf("kernel/drivers/net/tun.ko found on disk, but not for current kernel; are you in middle of a system update and haven't rebooted? found: %s", dpkgOut)
}
case "arch":
findOut, err := exec.Command("find", "/lib/modules/", "-path", "*/net/tun.ko*").CombinedOutput()
if len(bytes.TrimSpace(findOut)) == 0 || err != nil {
logf("tun module not loaded nor found on disk")
return
}
if !bytes.Contains(findOut, kernel) {
logf("kernel/drivers/net/tun.ko found on disk, but not for current kernel; are you in middle of a system update and haven't rebooted? found: %s", findOut)
}
}
}
func linuxDistro() string {
if _, err := os.Stat("/etc/debian_version"); err == nil {
return "debian"
}
if _, err := os.Stat("/etc/arch-release"); err == nil {
return "arch"
}
return ""
}