You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
tailscale/net/netmon/netmon.go

699 lines
20 KiB
Go

// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
// Package monitor provides facilities for monitoring network
// interface and route changes. It primarily exists to know when
// portable devices move between different networks.
package netmon
import (
"errors"
"fmt"
"net/netip"
"runtime"
"slices"
"sync"
"time"
"tailscale.com/feature/buildfeatures"
"tailscale.com/syncs"
"tailscale.com/types/logger"
"tailscale.com/util/clientmetric"
"tailscale.com/util/eventbus"
"tailscale.com/util/set"
)
// pollWallTimeInterval is how often we check the time to check
// for big jumps in wall (non-monotonic) time as a backup mechanism
// to get notified of a sleeping device waking back up.
// Usually there are also minor network change events on wake that let
// us check the wall time sooner than this.
const pollWallTimeInterval = 15 * time.Second
// message represents a message returned from an osMon.
type message interface {
// Ignore is whether we should ignore this message.
ignore() bool
}
// osMon is the interface that each operating system-specific
// implementation of the link monitor must implement.
type osMon interface {
Close() error
// Receive returns a new network interface change message. It
// should block until there's either something to return, or
// until the osMon is closed. After a Close, the returned
// error is ignored.
Receive() (message, error)
}
// IsInterestingInterface is the function used to determine whether
// a given interface name is interesting enough to pay attention to
// for network change monitoring purposes.
//
// If nil, all interfaces are considered interesting.
var IsInterestingInterface func(Interface, []netip.Prefix) bool
// Monitor represents a monitoring instance.
type Monitor struct {
logf logger.Logf
b *eventbus.Client
changed *eventbus.Publisher[ChangeDelta]
om osMon // nil means not supported on this platform
change chan bool // send false to wake poller, true to also force ChangeDeltas be sent
stop chan struct{} // closed on Stop
static bool // static Monitor that doesn't actually monitor
mu syncs.Mutex // guards all following fields
cbs set.HandleSet[ChangeFunc]
ifState *State
gwValid bool // whether gw and gwSelfIP are valid
gw netip.Addr // our gateway's IP
gwSelfIP netip.Addr // our own IP address (that corresponds to gw)
started bool
closed bool
goroutines sync.WaitGroup
wallTimer *time.Timer // nil until Started; re-armed AfterFunc per tick
lastWall time.Time
timeJumped bool // whether we need to send a changed=true after a big time jump
tsIfName string // tailscale interface name, if known/set ("tailscale0", "utun3", ...)
}
// ChangeFunc is a callback function registered with Monitor that's called when the
// network changed.
type ChangeFunc func(*ChangeDelta)
// ChangeDelta describes the difference between two network states.
//
// Use NewChangeDelta to construct a delta and compute the cached fields.
//
// TODO (barnstar): make new and old (and netmon.State) private once all consumers are updated
// to use the accessor methods.
type ChangeDelta struct {
// Old is the old interface state, if known.
// It's nil if the old state is unknown.
// Do not mutate it.
Old *State
// New is the new network state.
// It is always non-nil.
// Do not mutate it.
New *State
// TimeJumped is whether there was a big jump in wall time since the last
// time we checked. This is a hint that a sleeping device might have
// come out of sleep.
TimeJumped bool
// The tailscale interface name, e.g. "tailscale0", "utun3", etc. Not all
// platforms know this or set it. Copied from netmon.Monitor.tsIfName.
TailscaleIfaceName string
DefaultRouteInterface string
// Computed Fields
DefaultInterfaceChanged bool // whether default route interface changed
IsLessExpensive bool // whether new state's default interface is less expensive than old.
HasPACOrProxyConfigChanged bool // whether PAC/HTTP proxy config changed
InterfaceIPsChanged bool // whether any interface IPs changed in a meaningful way
AvailableProtocolsChanged bool // whether we have seen a change in available IPv4/IPv6
DefaultInterfaceMaybeViable bool // whether the default interface is potentially viable (has usable IPs, is up and is not the tunnel itself)
IsInitialState bool // whether this is the initial state (old == nil, new != nil)
// RebindLikelyRequired combines the various fields above to report whether this change likely requires us
// to rebind sockets. This is a very conservative estimate and covers a number ofcases where a rebind
// may not be strictly necessary. Consumers of the ChangeDelta should consider checking the individual fields
// above or the state of their sockets.
RebindLikelyRequired bool
}
// NewChangeDelta builds a ChangeDelta and eagerly computes the cached fields.
// forceViability, if true, forces DefaultInterfaceMaybeViable to be true regardless of the
// actual state of the default interface. This is useful in testing.
func NewChangeDelta(old, new *State, timeJumped bool, tsIfName string, forceViability bool) ChangeDelta {
cd := ChangeDelta{
Old: old,
New: new,
TimeJumped: timeJumped,
TailscaleIfaceName: tsIfName,
}
if cd.New == nil {
return cd
} else if cd.Old == nil {
cd.DefaultInterfaceChanged = cd.New.DefaultRouteInterface != ""
cd.IsLessExpensive = false
cd.HasPACOrProxyConfigChanged = true
cd.InterfaceIPsChanged = true
cd.IsInitialState = true
} else {
cd.AvailableProtocolsChanged = (cd.Old.HaveV4 != cd.New.HaveV4) || (cd.Old.HaveV6 != cd.New.HaveV6)
cd.DefaultInterfaceChanged = cd.Old.DefaultRouteInterface != cd.New.DefaultRouteInterface
cd.IsLessExpensive = cd.Old.IsExpensive && !cd.New.IsExpensive
cd.HasPACOrProxyConfigChanged = (cd.Old.PAC != cd.New.PAC) || (cd.Old.HTTPProxy != cd.New.HTTPProxy)
cd.InterfaceIPsChanged = cd.isInterestingIntefaceChange()
}
// If the default route interface is populated, but it's not up this event signifies that we're in
// the process of tearing it down. Rebinds are going to fail so it's flappy to try.
cd.DefaultRouteInterface = new.DefaultRouteInterface
defIf := new.Interface[cd.DefaultRouteInterface]
// The default interface is not viable if is down or is the Tailscale interface itself.
if !forceViability && (!defIf.IsUp() || cd.DefaultRouteInterface == tsIfName) {
cd.DefaultInterfaceMaybeViable = false
} else {
cd.DefaultInterfaceMaybeViable = true
}
// Compute rebind requirement. The default interface needs to be viable and
// one of the other conditions needs to be true.
cd.RebindLikelyRequired = (cd.Old == nil ||
cd.TimeJumped ||
cd.DefaultInterfaceChanged ||
cd.InterfaceIPsChanged ||
cd.IsLessExpensive ||
cd.HasPACOrProxyConfigChanged ||
cd.AvailableProtocolsChanged) &&
cd.DefaultInterfaceMaybeViable
return cd
}
// StateDesc returns a description of the old and new states for logging
func (cd *ChangeDelta) StateDesc() string {
return fmt.Sprintf("old: %v new: %v", cd.Old, cd.New)
}
// InterfaceIPAppeared reports whether the given IP address exists on any interface
// in the old state, but not in the new state.
func (cd *ChangeDelta) InterfaceIPDisppeared(ip netip.Addr) bool {
if cd.Old == nil {
return false
}
if cd.New == nil && cd.Old.HasIP(ip) {
return true
}
return cd.New.HasIP(ip) && !cd.Old.HasIP(ip)
}
// InterfaceIPDisappeared reports whether the given IP address existed on any interface
// in the old state, but not in the new state.
func (cd *ChangeDelta) InterfaceIPDisappeared(ip netip.Addr) bool {
return !cd.New.HasIP(ip) && cd.Old.HasIP(ip)
}
// AnyInterfaceUp reports whether any interfaces are up in the new state.
func (cd *ChangeDelta) AnyInterfaceUp() bool {
if cd.New == nil {
return false
}
for _, ifi := range cd.New.Interface {
if ifi.IsUp() {
return true
}
}
return false
}
// isInterestingIntefaceChange reports whether any interfaces have changed in a meaninful way.
// This excludes interfaces that are not interesting per IsInterestingInterface and
// filters out changes to interface IPs that that are uninteresting (e.g. link-local addresses).
func (cd *ChangeDelta) isInterestingIntefaceChange() bool {
if cd.Old == nil && cd.New == nil {
return false
}
// If either side is nil treat as changed.
if cd.Old == nil || cd.New == nil {
return true
}
// Compare interfaces in both directions. Old to new and new to old.
for iname, oldInterface := range cd.Old.Interface {
if iname == cd.TailscaleIfaceName {
// Ignore changes in the Tailscale interface itself.
continue
}
oldIps := filterRoutableIPs(cd.Old.InterfaceIPs[iname])
if IsInterestingInterface != nil && !IsInterestingInterface(oldInterface, oldIps) {
continue
}
// Old interfaces with no routable addresses are not interesting
if len(oldIps) == 0 {
continue
}
// The old interface doesn't exist in the new interface set and it has
// a global unicast IP. That's considered a change from the perspective
// of anything that may have been bound to it. If it didn't have a global
// unicast IP, it's not interesting.
newInterface, ok := cd.New.Interface[iname]
if !ok {
return true
}
newIps, ok := cd.New.InterfaceIPs[iname]
if !ok {
return true
}
newIps = filterRoutableIPs(newIps)
if !oldInterface.Equal(newInterface) || !prefixesEqual(oldIps, newIps) {
return true
}
}
for iname, newInterface := range cd.New.Interface {
if iname == cd.TailscaleIfaceName {
continue
}
newIps := filterRoutableIPs(cd.New.InterfaceIPs[iname])
if IsInterestingInterface != nil && !IsInterestingInterface(newInterface, newIps) {
continue
}
// New interfaces with no routable addresses are not interesting
if len(newIps) == 0 {
continue
}
oldInterface, ok := cd.Old.Interface[iname]
if !ok {
return true
}
oldIps, ok := cd.Old.InterfaceIPs[iname]
if !ok {
// Redundant but we can't dig up the "old" IPs for this interface.
return true
}
oldIps = filterRoutableIPs(oldIps)
// The interface's IPs, Name, MTU, etc have changed. This is definitely interesting.
if !newInterface.Equal(oldInterface) || !prefixesEqual(oldIps, newIps) {
return true
}
}
return false
}
func filterRoutableIPs(addrs []netip.Prefix) []netip.Prefix {
var filtered []netip.Prefix
for _, pfx := range addrs {
a := pfx.Addr()
// Skip link-local multicast addresses.
if a.IsLinkLocalMulticast() {
continue
}
if isUsableV4(a) || isUsableV6(a) {
filtered = append(filtered, pfx)
}
}
fmt.Printf("Filtered: %v\n", filtered)
return filtered
}
// New instantiates and starts a monitoring instance.
// The returned monitor is inactive until it's started by the Start method.
// Use RegisterChangeCallback to get notified of network changes.
func New(bus *eventbus.Bus, logf logger.Logf) (*Monitor, error) {
logf = logger.WithPrefix(logf, "monitor: ")
m := &Monitor{
logf: logf,
b: bus.Client("netmon"),
change: make(chan bool, 1),
stop: make(chan struct{}),
lastWall: wallTime(),
}
m.changed = eventbus.Publish[ChangeDelta](m.b)
st, err := m.interfaceStateUncached()
if err != nil {
return nil, err
}
m.ifState = st
m.om, err = newOSMon(bus, logf, m)
if err != nil {
return nil, err
}
if m.om == nil {
return nil, errors.New("newOSMon returned nil, nil")
}
return m, nil
}
// NewStatic returns a Monitor that's a one-time snapshot of the network state
// but doesn't actually monitor for changes. It should only be used in tests
// and situations like cleanups or short-lived CLI programs.
func NewStatic() *Monitor {
m := &Monitor{static: true}
if st, err := m.interfaceStateUncached(); err == nil {
m.ifState = st
}
return m
}
// InterfaceState returns the latest snapshot of the machine's network
// interfaces.
//
// The returned value is owned by Mon; it must not be modified.
func (m *Monitor) InterfaceState() *State {
m.mu.Lock()
defer m.mu.Unlock()
return m.ifState
}
func (m *Monitor) interfaceStateUncached() (*State, error) {
return getState(m.tsIfName)
}
// SetTailscaleInterfaceName sets the name of the Tailscale interface. For
// example, "tailscale0", "tun0", "utun3", etc.
//
// This must be called only early in tailscaled startup before the monitor is
// used.
func (m *Monitor) SetTailscaleInterfaceName(ifName string) {
m.mu.Lock()
defer m.mu.Unlock()
m.tsIfName = ifName
}
func (m *Monitor) TailscaleInterfaceName() string {
m.mu.Lock()
defer m.mu.Unlock()
return m.tsIfName
}
// GatewayAndSelfIP returns the current network's default gateway, and
// the machine's default IP for that gateway.
//
// It's the same as interfaces.LikelyHomeRouterIP, but it caches the
// result until the monitor detects a network change.
func (m *Monitor) GatewayAndSelfIP() (gw, myIP netip.Addr, ok bool) {
if !buildfeatures.HasPortMapper {
return
}
if m.static {
return
}
m.mu.Lock()
defer m.mu.Unlock()
if m.gwValid {
return m.gw, m.gwSelfIP, true
}
gw, myIP, ok = LikelyHomeRouterIP()
changed := false
if ok {
changed = m.gw != gw || m.gwSelfIP != myIP
m.gw, m.gwSelfIP = gw, myIP
m.gwValid = true
}
if changed {
m.logf("gateway and self IP changed: gw=%v self=%v", m.gw, m.gwSelfIP)
}
return gw, myIP, ok
}
// RegisterChangeCallback adds callback to the set of parties to be
// notified (in their own goroutine) when the network state changes.
// To remove this callback, call unregister (or close the monitor).
func (m *Monitor) RegisterChangeCallback(callback ChangeFunc) (unregister func()) {
if m.static {
return func() {}
}
m.mu.Lock()
defer m.mu.Unlock()
handle := m.cbs.Add(callback)
return func() {
m.mu.Lock()
defer m.mu.Unlock()
delete(m.cbs, handle)
}
}
// Start starts the monitor.
// A monitor can only be started & closed once.
func (m *Monitor) Start() {
if m.static {
return
}
m.mu.Lock()
defer m.mu.Unlock()
if m.started || m.closed {
return
}
m.started = true
if shouldMonitorTimeJump {
m.wallTimer = time.AfterFunc(pollWallTimeInterval, m.pollWallTime)
}
if m.om == nil {
return
}
m.goroutines.Add(2)
go m.pump()
go m.debounce()
}
// Close closes the monitor.
func (m *Monitor) Close() error {
if m.static {
return nil
}
m.mu.Lock()
if m.closed {
m.mu.Unlock()
return nil
}
m.closed = true
close(m.stop)
if m.wallTimer != nil {
m.wallTimer.Stop()
}
var err error
if m.om != nil {
err = m.om.Close()
}
started := m.started
m.mu.Unlock()
if started {
m.goroutines.Wait()
}
return err
}
// InjectEvent forces the monitor to pretend there was a network
// change and re-check the state of the network. Any registered
// ChangeFunc callbacks will be called within the event coalescing
// period (under a fraction of a second).
func (m *Monitor) InjectEvent() {
if m.static {
return
}
select {
case m.change <- true:
default:
// Another change signal is already
// buffered. Debounce will wake up soon
// enough.
}
}
// Poll forces the monitor to pretend there was a network
// change and re-check the state of the network.
//
// This is like InjectEvent but only fires ChangeFunc callbacks
// if the network state differed at all.
func (m *Monitor) Poll() {
if m.static {
return
}
select {
case m.change <- false:
default:
}
}
func (m *Monitor) stopped() bool {
select {
case <-m.stop:
return true
default:
return false
}
}
// pump continuously retrieves messages from the connection, notifying
// the change channel of changes, and stopping when a stop is issued.
func (m *Monitor) pump() {
defer m.goroutines.Done()
for !m.stopped() {
msg, err := m.om.Receive()
if err != nil {
if m.stopped() {
return
}
// Keep retrying while we're not closed.
m.logf("error from link monitor: %v", err)
time.Sleep(time.Second)
continue
}
if msg.ignore() {
continue
}
m.Poll()
}
}
// / debounce calls the callback function with a delay between events
// and exits when a stop is issued.
func (m *Monitor) debounce() {
defer m.goroutines.Done()
for {
var forceCallbacks bool
select {
case <-m.stop:
return
case forceCallbacks = <-m.change:
}
if newState, err := m.interfaceStateUncached(); err != nil {
m.logf("interfaces.State: %v", err)
} else {
m.handlePotentialChange(newState, forceCallbacks)
}
select {
case <-m.stop:
return
// 1s is reasonable debounce time for network changes. Events such as undocking a laptop
// or roaming onto wifi will often generate multiple events in quick succession as interfaces
// flap. We want to avoid spamming consumers of these events.
case <-time.After(1000 * time.Millisecond):
}
}
}
var (
metricChangeEq = clientmetric.NewCounter("netmon_link_change_eq")
metricChange = clientmetric.NewCounter("netmon_link_change")
metricChangeTimeJump = clientmetric.NewCounter("netmon_link_change_timejump")
metricChangeMajor = clientmetric.NewCounter("netmon_link_change_major")
)
// handlePotentialChange considers whether newState is different enough to wake
// up callers and updates the monitor's state if so.
//
// If forceCallbacks is true, they're always notified.
func (m *Monitor) handlePotentialChange(newState *State, forceCallbacks bool) {
m.mu.Lock()
defer m.mu.Unlock()
oldState := m.ifState
timeJumped := shouldMonitorTimeJump && m.checkWallTimeAdvanceLocked()
if !timeJumped && !forceCallbacks && oldState.Equal(newState) {
// Exactly equal. Nothing to do.
metricChangeEq.Add(1)
return
}
delta := NewChangeDelta(oldState, newState, timeJumped, m.tsIfName, false)
if delta.RebindLikelyRequired {
m.gwValid = false
}
m.ifState = newState
// See if we have a queued or new time jump signal.
if timeJumped {
m.resetTimeJumpedLocked()
}
metricChange.Add(1)
if delta.RebindLikelyRequired {
metricChangeMajor.Add(1)
}
if delta.TimeJumped {
metricChangeTimeJump.Add(1)
}
m.changed.Publish(delta)
for _, cb := range m.cbs {
go cb(&delta)
}
}
// reports whether a and b contain the same set of prefixes regardless of order.
func prefixesEqual(a, b []netip.Prefix) bool {
if len(a) != len(b) {
return false
}
aa := make([]netip.Prefix, len(a))
bb := make([]netip.Prefix, len(b))
copy(aa, a)
copy(bb, b)
less := func(x, y netip.Prefix) int {
return x.Addr().Compare(y.Addr())
}
slices.SortFunc(aa, less)
slices.SortFunc(bb, less)
return slices.Equal(aa, bb)
}
func wallTime() time.Time {
// From time package's docs: "The canonical way to strip a
// monotonic clock reading is to use t = t.Round(0)."
return time.Now().Round(0)
}
func (m *Monitor) pollWallTime() {
m.mu.Lock()
defer m.mu.Unlock()
if m.closed {
return
}
if m.checkWallTimeAdvanceLocked() {
m.InjectEvent()
}
m.wallTimer.Reset(pollWallTimeInterval)
}
// shouldMonitorTimeJump is whether we keep a regular periodic timer running in
// the background watching for jumps in wall time.
//
// We don't do this on mobile platforms for battery reasons, and because these
// platforms don't really sleep in the same way.
const shouldMonitorTimeJump = runtime.GOOS != "android" && runtime.GOOS != "ios" && runtime.GOOS != "plan9"
// checkWallTimeAdvanceLocked reports whether wall time jumped more than 150% of
// pollWallTimeInterval, indicating we probably just came out of sleep. Once a
// time jump is detected it must be reset by calling resetTimeJumpedLocked.
func (m *Monitor) checkWallTimeAdvanceLocked() bool {
if !shouldMonitorTimeJump {
panic("unreachable") // if callers are correct
}
now := wallTime()
if now.Sub(m.lastWall) > pollWallTimeInterval*3/2 {
m.timeJumped = true // it is reset by debounce.
}
m.lastWall = now
return m.timeJumped
}
// resetTimeJumpedLocked consumes the signal set by checkWallTimeAdvanceLocked.
func (m *Monitor) resetTimeJumpedLocked() {
m.timeJumped = false
}