From c72caa6672c83a981e7b9ffcaa2f12fd9351d17e Mon Sep 17 00:00:00 2001 From: Andrew Dunham Date: Mon, 29 Aug 2022 10:57:54 -0400 Subject: [PATCH] wgengine/magicsock: use AF_PACKET socket + BPF to read disco messages This is entirely optional (i.e. failing in this code is non-fatal) and only enabled on Linux for now. Additionally, this new behaviour can be disabled by setting the TS_DEBUG_DISABLE_RAW_DISCO environment variable. Updates #3824 Replaces #5474 Co-authored-by: Andrew Dunham Signed-off-by: David Anderson --- cmd/tailscaled/depaware.txt | 2 +- wgengine/magicsock/magicsock.go | 54 ++++- wgengine/magicsock/magicsock_default.go | 17 ++ wgengine/magicsock/magicsock_linux.go | 260 ++++++++++++++++++++++++ wgengine/magicsock/magicsock_test.go | 19 ++ 5 files changed, 347 insertions(+), 5 deletions(-) create mode 100644 wgengine/magicsock/magicsock_default.go create mode 100644 wgengine/magicsock/magicsock_linux.go diff --git a/cmd/tailscaled/depaware.txt b/cmd/tailscaled/depaware.txt index 2e0e99e2b..f046e4d73 100644 --- a/cmd/tailscaled/depaware.txt +++ b/cmd/tailscaled/depaware.txt @@ -290,7 +290,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de W tailscale.com/wf from tailscale.com/cmd/tailscaled tailscale.com/wgengine from tailscale.com/ipn/ipnlocal+ tailscale.com/wgengine/filter from tailscale.com/control/controlclient+ - tailscale.com/wgengine/magicsock from tailscale.com/ipn/ipnlocal+ + 💣 tailscale.com/wgengine/magicsock from tailscale.com/ipn/ipnlocal+ tailscale.com/wgengine/monitor from tailscale.com/control/controlclient+ tailscale.com/wgengine/netstack from tailscale.com/cmd/tailscaled+ tailscale.com/wgengine/router from tailscale.com/ipn/ipnlocal+ diff --git a/wgengine/magicsock/magicsock.go b/wgengine/magicsock/magicsock.go index 11f6f7ad4..288839b21 100644 --- a/wgengine/magicsock/magicsock.go +++ b/wgengine/magicsock/magicsock.go @@ -14,6 +14,7 @@ import ( "errors" "fmt" "hash/fnv" + "io" "math" "math/rand" 
"net" @@ -60,6 +61,16 @@ import ( "tailscale.com/wgengine/monitor" ) +const ( + // These are disco.Magic in big-endian form, 4 then 2 bytes. The + // BPF filters need the magic in this format to match on it. Used + // only in magicsock_linux.go, but defined here so that the test + // which verifies this is the correct magic doesn't also need a + // _linux variant. + discoMagic1 = 0x5453f09f + discoMagic2 = 0x92ac +) + // useDerpRoute reports whether magicsock should enable the DERP // return path optimization (Issue 150). func useDerpRoute() bool { @@ -254,6 +265,12 @@ type Conn struct { pconn4 *RebindingUDPConn pconn6 *RebindingUDPConn + // closeDisco4 and closeDisco6 are io.Closers to shut down the raw + // disco packet receivers. If nil, no raw disco receiver is + // running for the given family. + closeDisco4 io.Closer + closeDisco6 io.Closer + // netChecker is the prober that discovers local network // conditions, including the closest DERP relay and NAT mappings. netChecker *netcheck.Client @@ -572,6 +589,19 @@ func NewConn(opts Options) (*Conn, error) { c.ignoreSTUNPackets() + if d4, err := c.listenRawDisco("ip4"); err == nil { + c.logf("[v1] using BPF disco receiver for IPv4") + c.closeDisco4 = d4 + } else { + c.logf("[v1] couldn't create raw v4 disco listener, using regular listener instead: %v", err) + } + if d6, err := c.listenRawDisco("ip6"); err == nil { + c.logf("[v1] using BPF disco receiver for IPv6") + c.closeDisco6 = d6 + } else { + c.logf("[v1] couldn't create raw v6 disco listener, using regular listener instead: %v", err) + } + return c, nil } @@ -1638,7 +1668,7 @@ func (c *Conn) receiveIPv6(b []byte) (int, conn.Endpoint, error) { if err != nil { return 0, nil, err } - if ep, ok := c.receiveIP(b[:n], ipp, &c.ippEndpoint6); ok { + if ep, ok := c.receiveIP(b[:n], ipp, &c.ippEndpoint6, c.closeDisco6 == nil); ok { metricRecvDataIPv6.Add(1) return n, ep, nil } @@ -1654,7 +1684,7 @@ func (c *Conn) receiveIPv4(b []byte) (n int, ep conn.Endpoint, err 
error) { if err != nil { return 0, nil, err } - if ep, ok := c.receiveIP(b[:n], ipp, &c.ippEndpoint4); ok { + if ep, ok := c.receiveIP(b[:n], ipp, &c.ippEndpoint4, c.closeDisco4 == nil); ok { metricRecvDataIPv4.Add(1) return n, ep, nil } @@ -1665,12 +1695,18 @@ func (c *Conn) receiveIPv4(b []byte) (n int, ep conn.Endpoint, err error) { // // ok is whether this read should be reported up to wireguard-go (our // caller). -func (c *Conn) receiveIP(b []byte, ipp netip.AddrPort, cache *ippEndpointCache) (ep *endpoint, ok bool) { +func (c *Conn) receiveIP(b []byte, ipp netip.AddrPort, cache *ippEndpointCache, checkDisco bool) (ep *endpoint, ok bool) { if stun.Is(b) { c.stunReceiveFunc.Load()(b, ipp) return nil, false } - if c.handleDiscoMessage(b, ipp, key.NodePublic{}) { + if checkDisco { + if c.handleDiscoMessage(b, ipp, key.NodePublic{}) { + return nil, false + } + } else if disco.LooksLikeDiscoWrapper(b) { + // Caller told us to ignore disco traffic, don't let it fall + // through to wireguard-go. return nil, false } if !c.havePrivateKey.Load() { @@ -2632,6 +2668,12 @@ func (c *connBind) Close() error { if c.pconn6 != nil { c.pconn6.Close() } + if c.closeDisco4 != nil { + c.closeDisco4.Close() + } + if c.closeDisco6 != nil { + c.closeDisco6.Close() + } // Send an empty read result to unblock receiveDERP, // which will then check connBind.Closed. // connBind.Closed takes c.mu, but c.derpRecvCh is buffered. @@ -4192,4 +4234,8 @@ var ( // metricDERPHomeChange is how many times our DERP home region DI has // changed from non-zero to a different non-zero. 
metricDERPHomeChange = clientmetric.NewCounter("derp_home_change") + + // Disco packets received bpf read path + metricRecvDiscoPacketIPv4 = clientmetric.NewCounter("magicsock_disco_recv_bpf_ipv4") + metricRecvDiscoPacketIPv6 = clientmetric.NewCounter("magicsock_disco_recv_bpf_ipv6") ) diff --git a/wgengine/magicsock/magicsock_default.go b/wgengine/magicsock/magicsock_default.go new file mode 100644 index 000000000..530fe95fa --- /dev/null +++ b/wgengine/magicsock/magicsock_default.go @@ -0,0 +1,17 @@ +// Copyright (c) 2022 Tailscale Inc & AUTHORS All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !linux +// +build !linux + +package magicsock + +import ( + "errors" + "io" +) + +func (c *Conn) listenRawDisco(family string) (io.Closer, error) { + return nil, errors.New("raw disco listening not supported on this OS") +} diff --git a/wgengine/magicsock/magicsock_linux.go b/wgengine/magicsock/magicsock_linux.go new file mode 100644 index 000000000..796682de7 --- /dev/null +++ b/wgengine/magicsock/magicsock_linux.go @@ -0,0 +1,260 @@ +// Copyright (c) 2022 Tailscale Inc & AUTHORS All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package magicsock + +import ( + "bytes" + "encoding/binary" + "errors" + "fmt" + "io" + "net" + "net/netip" + "time" + "unsafe" + + "golang.org/x/net/bpf" + "golang.org/x/sys/unix" + "tailscale.com/envknob" + "tailscale.com/types/key" +) + +const ( + udpHeaderSize = 8 + ipv6FragmentHeaderSize = 8 +) + +// Enable/disable using raw sockets to receive disco traffic. +var debugDisableRawDisco = envknob.Bool("TS_DEBUG_DISABLE_RAW_DISCO") + +// These are our BPF filters that we use for testing packets. +var ( + magicsockFilterV4 = []bpf.Instruction{ + // For raw UDPv4 sockets, BPF receives the entire IP packet to + // inspect. 
+ + // Disco packets are so small they should never get + // fragmented, and we don't want to handle reassembly. + bpf.LoadAbsolute{Off: 6, Size: 2}, + // More Fragments bit set means this is part of a fragmented packet. + bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 0x2000, SkipTrue: 7, SkipFalse: 0}, + // Non-zero fragment offset with MF=0 means this is the last + // fragment of packet. + bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 0x1fff, SkipTrue: 6, SkipFalse: 0}, + + // Load IP header length into X register. + bpf.LoadMemShift{Off: 0}, + + // Get the first 4 bytes of the UDP packet, compare with our magic number + bpf.LoadIndirect{Off: udpHeaderSize, Size: 4}, + bpf.JumpIf{Cond: bpf.JumpEqual, Val: discoMagic1, SkipTrue: 0, SkipFalse: 3}, + + // Compare the next 2 bytes + bpf.LoadIndirect{Off: udpHeaderSize + 4, Size: 2}, + bpf.JumpIf{Cond: bpf.JumpEqual, Val: uint32(discoMagic2), SkipTrue: 0, SkipFalse: 1}, + + // Accept the whole packet + bpf.RetConstant{Val: 0xFFFFFFFF}, + + // Skip the packet + bpf.RetConstant{Val: 0x0}, + } + + // IPv6 is more complicated to filter, since we can have 0-to-N + // extension headers following the IPv6 header. Since BPF can't + // loop, we can't really parse these in a general way; instead, we + // simply handle the case where we have no extension headers; any + // packets with headers will be skipped. IPv6 extension headers + // are sufficiently uncommon that we're willing to accept false + // negatives here. + // + // The "proper" way to handle this would be to do minimal parsing in + // BPF and more in-depth parsing of all IPv6 packets in userspace, but + // on systems with a high volume of UDP that would be unacceptably slow + // and thus we'd rather be conservative here and possibly not receive + // disco packets rather than slow down the system. + magicsockFilterV6 = []bpf.Instruction{ + // For raw UDPv6 sockets, BPF receives _only_ the UDP header onwards, not an entire IP packet. 
+ // + // https://stackoverflow.com/questions/24514333/using-bpf-with-sock-dgram-on-linux-machine + // https://blog.cloudflare.com/epbf_sockets_hop_distance/ + // + // This is especially confusing because this *isn't* true for + // IPv4; see the following code from the 'ping' utility that + // corroborates this: + // + // https://github.com/iputils/iputils/blob/1ab5fa/ping/ping.c#L1667-L1676 + // https://github.com/iputils/iputils/blob/1ab5fa/ping/ping6_common.c#L933-L941 + + // Compare with our magic number. Start by loading and + // comparing the first 4 bytes of the UDP payload. + bpf.LoadAbsolute{Off: udpHeaderSize, Size: 4}, + bpf.JumpIf{Cond: bpf.JumpEqual, Val: discoMagic1, SkipTrue: 0, SkipFalse: 3}, + + // Compare the next 2 bytes + bpf.LoadAbsolute{Off: udpHeaderSize + 4, Size: 2}, + bpf.JumpIf{Cond: bpf.JumpEqual, Val: discoMagic2, SkipTrue: 0, SkipFalse: 1}, + + // Accept the whole packet + bpf.RetConstant{Val: 0xFFFFFFFF}, + + // Skip the packet + bpf.RetConstant{Val: 0x0}, + } + + testDiscoPacket = []byte{ + // Disco magic + 0x54, 0x53, 0xf0, 0x9f, 0x92, 0xac, + // Sender key + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + // Nonce + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + } +) + +// listenRawDisco starts listening for disco packets on the given +// address family, which must be "ip4" or "ip6", using a raw socket +// and BPF filter. 
+// https://github.com/tailscale/tailscale/issues/3824 +func (c *Conn) listenRawDisco(family string) (io.Closer, error) { + if debugDisableRawDisco { + return nil, errors.New("raw disco listening disabled by debug flag") + } + + var ( + network string + addr string + testAddr string + prog []bpf.Instruction + ) + switch family { + case "ip4": + network = "ip4:17" + addr = "0.0.0.0" + testAddr = "127.0.0.1:1" + prog = magicsockFilterV4 + case "ip6": + network = "ip6:17" + addr = "::" + testAddr = "[::1]:1" + prog = magicsockFilterV6 + default: + return nil, fmt.Errorf("unsupported address family %q", family) + } + + asm, err := bpf.Assemble(prog) + if err != nil { + return nil, fmt.Errorf("assembling filter: %w", err) + } + + pc, err := net.ListenPacket(network, addr) + if err != nil { + return nil, fmt.Errorf("creating packet conn: %w", err) + } + + if err := setBPF(pc, asm); err != nil { + pc.Close() + return nil, fmt.Errorf("installing BPF filter: %w", err) + } + + // If all the above succeeds, we should be ready to receive. Just + // out of paranoia, check that we do receive a well-formed disco + // packet. 
+ tc, err := net.ListenPacket("udp", net.JoinHostPort(addr, "0")) + if err != nil { + pc.Close() + return nil, fmt.Errorf("creating disco test socket: %w", err) + } + defer tc.Close() + if _, err := tc.(*net.UDPConn).WriteToUDPAddrPort(testDiscoPacket, netip.MustParseAddrPort(testAddr)); err != nil { + pc.Close() + return nil, fmt.Errorf("writing disco test packet: %w", err) + } + pc.SetReadDeadline(time.Now().Add(100 * time.Millisecond)) + var buf [1500]byte + for { + n, _, err := pc.ReadFrom(buf[:]) + if err != nil { + pc.Close() + return nil, fmt.Errorf("reading during raw disco self-test: %w", err) + } + if n < udpHeaderSize { + continue + } + if !bytes.Equal(buf[udpHeaderSize:n], testDiscoPacket) { + continue + } + break + } + pc.SetReadDeadline(time.Time{}) + + go c.receiveDisco(pc) + return pc, nil +} + +func (c *Conn) receiveDisco(pc net.PacketConn) { + var buf [1500]byte + for { + n, src, err := pc.ReadFrom(buf[:]) + if errors.Is(err, net.ErrClosed) { + return + } else if err != nil { + c.logf("disco raw reader failed: %v", err) + return + } + if n < udpHeaderSize { + // Too small to be a valid UDP datagram, drop. + continue + } + srcIP, ok := netip.AddrFromSlice(src.(*net.IPAddr).IP) + if !ok { + c.logf("[unexpected] PacketConn.ReadFrom returned not-an-IP %v in from", src) + continue + } + srcPort := binary.BigEndian.Uint16(buf[:2]) + + if srcIP.Is4() { + metricRecvDiscoPacketIPv4.Add(1) + } else { + metricRecvDiscoPacketIPv6.Add(1) + } + + c.handleDiscoMessage(buf[udpHeaderSize:n], netip.AddrPortFrom(srcIP, srcPort), key.NodePublic{}) + } +} + +// setBPF installs filter as the BPF filter on conn. +// Ideally we would just use SetBPF as implemented in x/net/ipv4, +// but x/net/ipv6 doesn't implement it. And once you've written +// this code once, it turns out to be address family agnostic, so +// we might as well use it on both and get to use a net.PacketConn +// directly for both families instead of being stuck with +// different types. 
+func setBPF(conn net.PacketConn, filter []bpf.RawInstruction) error { + sc, err := conn.(*net.IPConn).SyscallConn() + if err != nil { + return err + } + prog := &unix.SockFprog{ + Len: uint16(len(filter)), + Filter: (*unix.SockFilter)(unsafe.Pointer(&filter[0])), + } + var setErr error + err = sc.Control(func(fd uintptr) { + setErr = unix.SetsockoptSockFprog(int(fd), unix.SOL_SOCKET, unix.SO_ATTACH_FILTER, prog) + }) + if err != nil { + return err + } + if setErr != nil { + return setErr // err is nil here; the setsockopt failure lives in setErr + } + return nil +} diff --git a/wgengine/magicsock/magicsock_test.go b/wgengine/magicsock/magicsock_test.go index 9551dc7fb..590442dd4 100644 --- a/wgengine/magicsock/magicsock_test.go +++ b/wgengine/magicsock/magicsock_test.go @@ -32,6 +32,7 @@ import ( "golang.zx2c4.com/wireguard/tun/tuntest" "tailscale.com/derp" "tailscale.com/derp/derphttp" + "tailscale.com/disco" "tailscale.com/ipn/ipnstate" "tailscale.com/net/netaddr" "tailscale.com/net/stun/stuntest" @@ -1799,3 +1800,21 @@ func TestBlockForeverConnUnblocks(t *testing.T) { t.Fatal("timeout") } } + +func TestDiscoMagicMatches(t *testing.T) { + // Convert our disco magic number into a uint32 and uint16 to test + // against. We panic on an incorrect length here rather than try to be + // generic with our BPF instructions below. + // + // Note that BPF uses network byte order (big-endian) when loading data + // from a packet, so that is what we use to generate our magic numbers. + if len(disco.Magic) != 6 { + t.Fatalf("expected disco.Magic to be of length 6") + } + if m1 := binary.BigEndian.Uint32([]byte(disco.Magic[:4])); m1 != discoMagic1 { + t.Errorf("first 4 bytes of disco magic don't match, got %v want %v", discoMagic1, m1) + } + if m2 := binary.BigEndian.Uint16([]byte(disco.Magic[4:6])); m2 != discoMagic2 { + t.Errorf("last 2 bytes of disco magic don't match, got %v want %v", discoMagic2, m2) + } +}