From c72caa6672c83a981e7b9ffcaa2f12fd9351d17e Mon Sep 17 00:00:00 2001
From: Andrew Dunham <andrew@du.nham.ca>
Date: Mon, 29 Aug 2022 10:57:54 -0400
Subject: [PATCH] wgengine/magicsock: use raw IP socket + BPF to read disco
 messages

This is entirely optional (i.e. failing in this code is non-fatal) and
only enabled on Linux for now. Additionally, this new behaviour can be
disabled by setting the TS_DEBUG_DISABLE_RAW_DISCO environment variable.

Updates #3824
Replaces #5474

Co-authored-by: Andrew Dunham <andrew@du.nham.ca>
Signed-off-by: David Anderson <danderson@tailscale.com>
---
 cmd/tailscaled/depaware.txt             |   2 +-
 wgengine/magicsock/magicsock.go         |  54 ++++-
 wgengine/magicsock/magicsock_default.go |  17 ++
 wgengine/magicsock/magicsock_linux.go   | 260 ++++++++++++++++++++++++
 wgengine/magicsock/magicsock_test.go    |  19 ++
 5 files changed, 347 insertions(+), 5 deletions(-)
 create mode 100644 wgengine/magicsock/magicsock_default.go
 create mode 100644 wgengine/magicsock/magicsock_linux.go

diff --git a/cmd/tailscaled/depaware.txt b/cmd/tailscaled/depaware.txt
index 2e0e99e2b..f046e4d73 100644
--- a/cmd/tailscaled/depaware.txt
+++ b/cmd/tailscaled/depaware.txt
@@ -290,7 +290,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de
    W    tailscale.com/wf                                             from tailscale.com/cmd/tailscaled
         tailscale.com/wgengine                                       from tailscale.com/ipn/ipnlocal+
         tailscale.com/wgengine/filter                                from tailscale.com/control/controlclient+
-        tailscale.com/wgengine/magicsock                             from tailscale.com/ipn/ipnlocal+
+     💣 tailscale.com/wgengine/magicsock                             from tailscale.com/ipn/ipnlocal+
         tailscale.com/wgengine/monitor                               from tailscale.com/control/controlclient+
         tailscale.com/wgengine/netstack                              from tailscale.com/cmd/tailscaled+
         tailscale.com/wgengine/router                                from tailscale.com/ipn/ipnlocal+
diff --git a/wgengine/magicsock/magicsock.go b/wgengine/magicsock/magicsock.go
index 11f6f7ad4..288839b21 100644
--- a/wgengine/magicsock/magicsock.go
+++ b/wgengine/magicsock/magicsock.go
@@ -14,6 +14,7 @@ import (
 	"errors"
 	"fmt"
 	"hash/fnv"
+	"io"
 	"math"
 	"math/rand"
 	"net"
@@ -60,6 +61,16 @@ import (
 	"tailscale.com/wgengine/monitor"
 )
 
+const (
+	// These are disco.Magic in big-endian form, 4 then 2 bytes. The
+	// BPF filters need the magic in this format to match on it. Used
+	// only in magicsock_linux.go, but defined here so that the test
+	// which verifies this is the correct magic doesn't also need a
+	// _linux variant.
+	discoMagic1 = 0x5453f09f
+	discoMagic2 = 0x92ac
+)
+
 // useDerpRoute reports whether magicsock should enable the DERP
 // return path optimization (Issue 150).
 func useDerpRoute() bool {
@@ -254,6 +265,12 @@ type Conn struct {
 	pconn4 *RebindingUDPConn
 	pconn6 *RebindingUDPConn
 
+	// closeDisco4 and closeDisco6 are io.Closers to shut down the raw
+	// disco packet receivers. If nil, no raw disco receiver is
+	// running for the given family.
+	closeDisco4 io.Closer
+	closeDisco6 io.Closer
+
 	// netChecker is the prober that discovers local network
 	// conditions, including the closest DERP relay and NAT mappings.
 	netChecker *netcheck.Client
@@ -572,6 +589,19 @@ func NewConn(opts Options) (*Conn, error) {
 
 	c.ignoreSTUNPackets()
 
+	if d4, err := c.listenRawDisco("ip4"); err == nil {
+		c.logf("[v1] using BPF disco receiver for IPv4")
+		c.closeDisco4 = d4
+	} else {
+		c.logf("[v1] couldn't create raw v4 disco listener, using regular listener instead: %v", err)
+	}
+	if d6, err := c.listenRawDisco("ip6"); err == nil {
+		c.logf("[v1] using BPF disco receiver for IPv6")
+		c.closeDisco6 = d6
+	} else {
+		c.logf("[v1] couldn't create raw v6 disco listener, using regular listener instead: %v", err)
+	}
+
 	return c, nil
 }
 
@@ -1638,7 +1668,7 @@ func (c *Conn) receiveIPv6(b []byte) (int, conn.Endpoint, error) {
 		if err != nil {
 			return 0, nil, err
 		}
-		if ep, ok := c.receiveIP(b[:n], ipp, &c.ippEndpoint6); ok {
+		if ep, ok := c.receiveIP(b[:n], ipp, &c.ippEndpoint6, c.closeDisco6 == nil); ok {
 			metricRecvDataIPv6.Add(1)
 			return n, ep, nil
 		}
@@ -1654,7 +1684,7 @@ func (c *Conn) receiveIPv4(b []byte) (n int, ep conn.Endpoint, err error) {
 		if err != nil {
 			return 0, nil, err
 		}
-		if ep, ok := c.receiveIP(b[:n], ipp, &c.ippEndpoint4); ok {
+		if ep, ok := c.receiveIP(b[:n], ipp, &c.ippEndpoint4, c.closeDisco4 == nil); ok {
 			metricRecvDataIPv4.Add(1)
 			return n, ep, nil
 		}
@@ -1665,12 +1695,18 @@ func (c *Conn) receiveIPv4(b []byte) (n int, ep conn.Endpoint, err error) {
 //
 // ok is whether this read should be reported up to wireguard-go (our
 // caller).
-func (c *Conn) receiveIP(b []byte, ipp netip.AddrPort, cache *ippEndpointCache) (ep *endpoint, ok bool) {
+func (c *Conn) receiveIP(b []byte, ipp netip.AddrPort, cache *ippEndpointCache, checkDisco bool) (ep *endpoint, ok bool) {
 	if stun.Is(b) {
 		c.stunReceiveFunc.Load()(b, ipp)
 		return nil, false
 	}
-	if c.handleDiscoMessage(b, ipp, key.NodePublic{}) {
+	if checkDisco {
+		if c.handleDiscoMessage(b, ipp, key.NodePublic{}) {
+			return nil, false
+		}
+	} else if disco.LooksLikeDiscoWrapper(b) {
+		// Caller told us to ignore disco traffic, don't let it fall
+		// through to wireguard-go.
 		return nil, false
 	}
 	if !c.havePrivateKey.Load() {
@@ -2632,6 +2668,12 @@ func (c *connBind) Close() error {
 	if c.pconn6 != nil {
 		c.pconn6.Close()
 	}
+	if c.closeDisco4 != nil {
+		c.closeDisco4.Close()
+	}
+	if c.closeDisco6 != nil {
+		c.closeDisco6.Close()
+	}
 	// Send an empty read result to unblock receiveDERP,
 	// which will then check connBind.Closed.
 	// connBind.Closed takes c.mu, but c.derpRecvCh is buffered.
@@ -4192,4 +4234,8 @@ var (
 	// metricDERPHomeChange is how many times our DERP home region DI has
 	// changed from non-zero to a different non-zero.
 	metricDERPHomeChange = clientmetric.NewCounter("derp_home_change")
+
+	// Counters for disco packets received via the raw-socket BPF read path.
+	metricRecvDiscoPacketIPv4 = clientmetric.NewCounter("magicsock_disco_recv_bpf_ipv4")
+	metricRecvDiscoPacketIPv6 = clientmetric.NewCounter("magicsock_disco_recv_bpf_ipv6")
 )
diff --git a/wgengine/magicsock/magicsock_default.go b/wgengine/magicsock/magicsock_default.go
new file mode 100644
index 000000000..530fe95fa
--- /dev/null
+++ b/wgengine/magicsock/magicsock_default.go
@@ -0,0 +1,17 @@
+// Copyright (c) 2022 Tailscale Inc & AUTHORS All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !linux
+// +build !linux
+
+package magicsock
+
+import (
+	"errors"
+	"io"
+)
+
+func (c *Conn) listenRawDisco(family string) (io.Closer, error) {
+	return nil, errors.New("raw disco listening not supported on this OS")
+}
diff --git a/wgengine/magicsock/magicsock_linux.go b/wgengine/magicsock/magicsock_linux.go
new file mode 100644
index 000000000..796682de7
--- /dev/null
+++ b/wgengine/magicsock/magicsock_linux.go
@@ -0,0 +1,260 @@
+// Copyright (c) 2022 Tailscale Inc & AUTHORS All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package magicsock
+
+import (
+	"bytes"
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"io"
+	"net"
+	"net/netip"
+	"time"
+	"unsafe"
+
+	"golang.org/x/net/bpf"
+	"golang.org/x/sys/unix"
+	"tailscale.com/envknob"
+	"tailscale.com/types/key"
+)
+
+const (
+	udpHeaderSize          = 8
+	ipv6FragmentHeaderSize = 8
+)
+
+// Enable/disable using raw sockets to receive disco traffic.
+var debugDisableRawDisco = envknob.Bool("TS_DEBUG_DISABLE_RAW_DISCO")
+
+// These are our BPF filters that we use for testing packets.
+var (
+	magicsockFilterV4 = []bpf.Instruction{
+		// For raw UDPv4 sockets, BPF receives the entire IP packet to
+		// inspect.
+
+		// Disco packets are so small they should never get
+		// fragmented, and we don't want to handle reassembly.
+		bpf.LoadAbsolute{Off: 6, Size: 2},
+		// More Fragments bit set means this is part of a fragmented packet.
+		bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 0x2000, SkipTrue: 7, SkipFalse: 0},
+		// Non-zero fragment offset with MF=0 means this is the last
+		// fragment of a packet.
+		bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 0x1fff, SkipTrue: 6, SkipFalse: 0},
+
+		// Load IP header length into X register.
+		bpf.LoadMemShift{Off: 0},
+
+		// Get the first 4 bytes of the UDP packet, compare with our magic number
+		bpf.LoadIndirect{Off: udpHeaderSize, Size: 4},
+		bpf.JumpIf{Cond: bpf.JumpEqual, Val: discoMagic1, SkipTrue: 0, SkipFalse: 3},
+
+		// Compare the next 2 bytes
+		bpf.LoadIndirect{Off: udpHeaderSize + 4, Size: 2},
+		bpf.JumpIf{Cond: bpf.JumpEqual, Val: uint32(discoMagic2), SkipTrue: 0, SkipFalse: 1},
+
+		// Accept the whole packet
+		bpf.RetConstant{Val: 0xFFFFFFFF},
+
+		// Skip the packet
+		bpf.RetConstant{Val: 0x0},
+	}
+
+	// IPv6 is more complicated to filter, since we can have 0-to-N
+	// extension headers following the IPv6 header. Since BPF can't
+	// loop, we can't really parse these in a general way; instead, we
+	// simply handle the case where we have no extension headers; any
+	// packets with headers will be skipped. IPv6 extension headers
+	// are sufficiently uncommon that we're willing to accept false
+	// negatives here.
+	//
+	// The "proper" way to handle this would be to do minimal parsing in
+	// BPF and more in-depth parsing of all IPv6 packets in userspace, but
+	// on systems with a high volume of UDP that would be unacceptably slow
+	// and thus we'd rather be conservative here and possibly not receive
+	// disco packets rather than slow down the system.
+	magicsockFilterV6 = []bpf.Instruction{
+		// For raw UDPv6 sockets, BPF receives _only_ the UDP header onwards, not an entire IP packet.
+		//
+		//    https://stackoverflow.com/questions/24514333/using-bpf-with-sock-dgram-on-linux-machine
+		//    https://blog.cloudflare.com/epbf_sockets_hop_distance/
+		//
+		// This is especially confusing because this *isn't* true for
+		// IPv4; see the following code from the 'ping' utility that
+		// corroborates this:
+		//
+		//    https://github.com/iputils/iputils/blob/1ab5fa/ping/ping.c#L1667-L1676
+		//    https://github.com/iputils/iputils/blob/1ab5fa/ping/ping6_common.c#L933-L941
+
+		// Compare with our magic number. Start by loading and
+		// comparing the first 4 bytes of the UDP payload.
+		bpf.LoadAbsolute{Off: udpHeaderSize, Size: 4},
+		bpf.JumpIf{Cond: bpf.JumpEqual, Val: discoMagic1, SkipTrue: 0, SkipFalse: 3},
+
+		// Compare the next 2 bytes
+		bpf.LoadAbsolute{Off: udpHeaderSize + 4, Size: 2},
+		bpf.JumpIf{Cond: bpf.JumpEqual, Val: discoMagic2, SkipTrue: 0, SkipFalse: 1},
+
+		// Accept the whole packet
+		bpf.RetConstant{Val: 0xFFFFFFFF},
+
+		// Skip the packet
+		bpf.RetConstant{Val: 0x0},
+	}
+
+	testDiscoPacket = []byte{
+		// Disco magic
+		0x54, 0x53, 0xf0, 0x9f, 0x92, 0xac,
+		// Sender key
+		0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+		0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+		0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+		0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+		// Nonce
+		0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+		0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+		0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+	}
+)
+
+// listenRawDisco starts listening for disco packets on the given
+// address family, which must be "ip4" or "ip6", using a raw socket
+// and BPF filter.
+// https://github.com/tailscale/tailscale/issues/3824
+func (c *Conn) listenRawDisco(family string) (io.Closer, error) {
+	if debugDisableRawDisco {
+		return nil, errors.New("raw disco listening disabled by debug flag")
+	}
+
+	var (
+		network  string
+		addr     string
+		testAddr string
+		prog     []bpf.Instruction
+	)
+	switch family {
+	case "ip4":
+		network = "ip4:17"
+		addr = "0.0.0.0"
+		testAddr = "127.0.0.1:1"
+		prog = magicsockFilterV4
+	case "ip6":
+		network = "ip6:17"
+		addr = "::"
+		testAddr = "[::1]:1"
+		prog = magicsockFilterV6
+	default:
+		return nil, fmt.Errorf("unsupported address family %q", family)
+	}
+
+	asm, err := bpf.Assemble(prog)
+	if err != nil {
+		return nil, fmt.Errorf("assembling filter: %w", err)
+	}
+
+	pc, err := net.ListenPacket(network, addr)
+	if err != nil {
+		return nil, fmt.Errorf("creating packet conn: %w", err)
+	}
+
+	if err := setBPF(pc, asm); err != nil {
+		pc.Close()
+		return nil, fmt.Errorf("installing BPF filter: %w", err)
+	}
+
+	// If all the above succeeds, we should be ready to receive. Just
+	// out of paranoia, check that we do receive a well-formed disco
+	// packet.
+	tc, err := net.ListenPacket("udp", net.JoinHostPort(addr, "0"))
+	if err != nil {
+		pc.Close()
+		return nil, fmt.Errorf("creating disco test socket: %w", err)
+	}
+	defer tc.Close()
+	if _, err := tc.(*net.UDPConn).WriteToUDPAddrPort(testDiscoPacket, netip.MustParseAddrPort(testAddr)); err != nil {
+		pc.Close()
+		return nil, fmt.Errorf("writing disco test packet: %w", err)
+	}
+	pc.SetReadDeadline(time.Now().Add(100 * time.Millisecond))
+	var buf [1500]byte
+	for {
+		n, _, err := pc.ReadFrom(buf[:])
+		if err != nil {
+			pc.Close()
+			return nil, fmt.Errorf("reading during raw disco self-test: %w", err)
+		}
+		if n < udpHeaderSize {
+			continue
+		}
+		if !bytes.Equal(buf[udpHeaderSize:n], testDiscoPacket) {
+			continue
+		}
+		break
+	}
+	pc.SetReadDeadline(time.Time{})
+
+	go c.receiveDisco(pc)
+	return pc, nil
+}
+
+func (c *Conn) receiveDisco(pc net.PacketConn) {
+	var buf [1500]byte
+	for {
+		n, src, err := pc.ReadFrom(buf[:])
+		if errors.Is(err, net.ErrClosed) {
+			return
+		} else if err != nil {
+			c.logf("disco raw reader failed: %v", err)
+			return
+		}
+		if n < udpHeaderSize {
+			// Too small to be a valid UDP datagram, drop.
+			continue
+		}
+		srcIP, ok := netip.AddrFromSlice(src.(*net.IPAddr).IP)
+		if !ok {
+			c.logf("[unexpected] PacketConn.ReadFrom returned not-an-IP %v in from", src)
+			continue
+		}
+		srcPort := binary.BigEndian.Uint16(buf[:2])
+
+		if srcIP.Is4() {
+			metricRecvDiscoPacketIPv4.Add(1)
+		} else {
+			metricRecvDiscoPacketIPv6.Add(1)
+		}
+
+		c.handleDiscoMessage(buf[udpHeaderSize:n], netip.AddrPortFrom(srcIP, srcPort), key.NodePublic{})
+	}
+}
+
+// setBPF installs filter as the BPF filter on conn.
+// Ideally we would just use SetBPF as implemented in x/net/ipv4,
+// but x/net/ipv6 doesn't implement it. And once you've written
+// this code once, it turns out to be address family agnostic, so
+// we might as well use it on both and get to use a net.PacketConn
+// directly for both families instead of being stuck with
+// different types.
+func setBPF(conn net.PacketConn, filter []bpf.RawInstruction) error {
+	sc, err := conn.(*net.IPConn).SyscallConn()
+	if err != nil {
+		return err
+	}
+	prog := &unix.SockFprog{
+		Len:    uint16(len(filter)),
+		Filter: (*unix.SockFilter)(unsafe.Pointer(&filter[0])),
+	}
+	var setErr error
+	err = sc.Control(func(fd uintptr) {
+		setErr = unix.SetsockoptSockFprog(int(fd), unix.SOL_SOCKET, unix.SO_ATTACH_FILTER, prog)
+	})
+	if err != nil {
+		return err
+	}
+	// Control returning nil only means the callback ran. The result
+	// of the setsockopt call itself is carried in setErr, so that is
+	// what must be reported (it is nil on success).
+	return setErr
+}
diff --git a/wgengine/magicsock/magicsock_test.go b/wgengine/magicsock/magicsock_test.go
index 9551dc7fb..590442dd4 100644
--- a/wgengine/magicsock/magicsock_test.go
+++ b/wgengine/magicsock/magicsock_test.go
@@ -32,6 +32,7 @@ import (
 	"golang.zx2c4.com/wireguard/tun/tuntest"
 	"tailscale.com/derp"
 	"tailscale.com/derp/derphttp"
+	"tailscale.com/disco"
 	"tailscale.com/ipn/ipnstate"
 	"tailscale.com/net/netaddr"
 	"tailscale.com/net/stun/stuntest"
@@ -1799,3 +1800,21 @@ func TestBlockForeverConnUnblocks(t *testing.T) {
 		t.Fatal("timeout")
 	}
 }
+
+func TestDiscoMagicMatches(t *testing.T) {
+	// Convert our disco magic number into a uint32 and uint16 to test
+	// against. We fail the test on an incorrect length here rather than
+	// try to be generic with the BPF instructions in magicsock_linux.go.
+	//
+	// Note that BPF uses network byte order (big-endian) when loading data
+	// from a packet, so that is what we use to generate our magic numbers.
+	if len(disco.Magic) != 6 {
+		t.Fatalf("expected disco.Magic to be of length 6")
+	}
+	if m1 := binary.BigEndian.Uint32([]byte(disco.Magic[:4])); m1 != discoMagic1 {
+		t.Errorf("first 4 bytes of disco magic don't match, got %v want %v", discoMagic1, m1)
+	}
+	if m2 := binary.BigEndian.Uint16([]byte(disco.Magic[4:6])); m2 != discoMagic2 {
+		t.Errorf("last 2 bytes of disco magic don't match, got %v want %v", discoMagic2, m2)
+	}
+}