diff --git a/health/health.go b/health/health.go index a9d29b7ba..d545e885a 100644 --- a/health/health.go +++ b/health/health.go @@ -37,12 +37,9 @@ var ( ipnWantRunning bool anyInterfaceUp = true // until told otherwise - receiveIPv4Started bool - receiveIPv4Running bool - receiveIPv6Started bool - receiveIPv6Running bool - receiveDERPStarted bool - receiveDERPRunning bool + ReceiveIPv4 = ReceiveFuncState{name: "IPv4"} + ReceiveIPv6 = ReceiveFuncState{name: "IPv6"} + ReceiveDERP = ReceiveFuncState{name: "DERP"} ) // Subsystem is the name of a subsystem whose health can be monitored. @@ -220,17 +217,65 @@ func SetAnyInterfaceUp(up bool) { selfCheckLocked() } -func SetReceiveIPv4Started(running bool) { setHealthBool(&receiveIPv4Started, running) } -func SetReceiveIPv4Running(running bool) { setHealthBool(&receiveIPv4Running, running) } -func SetReceiveIPv6Started(running bool) { setHealthBool(&receiveIPv6Started, running) } -func SetReceiveIPv6Running(running bool) { setHealthBool(&receiveIPv6Running, running) } -func SetReceiveDERPStarted(running bool) { setHealthBool(&receiveDERPStarted, running) } -func SetReceiveDERPRunning(running bool) { setHealthBool(&receiveDERPRunning, running) } +// ReceiveFuncState tracks the state of a wireguard-go conn.ReceiveFunc. +type ReceiveFuncState struct { + // name is a mnemonic for the receive func, used in error messages. + name string + // started indicates whether magicsock.connBind.Open + // has requested that wireguard-go start its receive func + // goroutine (without a corresponding connBind.Close). + started bool + // running models whether wireguard-go's receive func + // goroutine is actually running. We cannot easily introspect that, + // so it is based on our knowledge of wireguard-go's internals. + running bool +} + +// err returns the error state (if any) that s represents. +func (s ReceiveFuncState) err() error { + // Possible states: + // | started | running | notes + // | ------- | ------- | ----- + // | true | true | normal operation + // | true | false | we prematurely returned a permanent error from this receive func + // | false | true | we have told package health that we're closing the bind, but the receive funcs haven't closed yet (transient) + // | false | false | not running + + // The problematic case is started && !running. + // If that happens, wireguard-go will no longer request packets, + // and we'll lose an entire communication channel. + if s.started && !s.running { + return fmt.Errorf("receive%s started but not running", s.name) + } + return nil +} + +// Open tells r that connBind.Open has requested wireguard-go open a conn.Bind that includes r. +func (r *ReceiveFuncState) Open() { + mu.Lock() + defer mu.Unlock() + r.started = true + r.running = true + selfCheckLocked() +} -func setHealthBool(dst *bool, b bool) { +// Stop tells r that we have returned a permanent error to wireguard-go. +// wireguard-go's receive func goroutine for r will soon stop. +func (r *ReceiveFuncState) Stop() { mu.Lock() defer mu.Unlock() - *dst = b + r.running = false + selfCheckLocked() +} + +// Close tells r that connBind.Close has requested wireguard-go close the bind for r. +// This will stop the corresponding receive func goroutine. +// Close must be called before actually closing the underlying connection, +// to avoid a small window of false positives. +func (r *ReceiveFuncState) Close() { + mu.Lock() + defer mu.Unlock() + r.started = false selfCheckLocked() } @@ -284,14 +329,10 @@ func overallErrorLocked() error { _ = lastMapRequestHeard var errs []error - if receiveIPv4Started && !receiveIPv4Running { - errs = append(errs, errors.New("receiveIPv4 not running")) - } - if receiveDERPStarted && !receiveDERPRunning { - errs = append(errs, errors.New("receiveDERP not running")) - } - if receiveIPv6Started && !receiveIPv6Running { - errs = append(errs, errors.New("receiveIPv6 not running")) + for _, recv := range []ReceiveFuncState{ReceiveIPv4, ReceiveIPv6, ReceiveDERP} { + if err := recv.err(); err != nil { + errs = append(errs, err) + } } for sys, err := range sysErr { if err == nil || sys == SysOverall { diff --git a/wgengine/magicsock/magicsock.go b/wgengine/magicsock/magicsock.go index b2c873624..8b90f2fbb 100644 --- a/wgengine/magicsock/magicsock.go +++ b/wgengine/magicsock/magicsock.go @@ -1581,12 +1581,12 @@ func (c *Conn) noteRecvActivityFromEndpoint(e conn.Endpoint) { // receiveIPv6 receives a UDP IPv6 packet. It is called by wireguard-go. func (c *Conn) receiveIPv6(b []byte) (int, conn.Endpoint, error) { - health.SetReceiveIPv6Running(true) - health.SetReceiveIPv6Started(true) - defer health.SetReceiveIPv6Running(false) for { n, ipp, err := c.pconn6.ReadFromNetaddr(b) if err != nil { + if isPermanentNetError(err) { + health.ReceiveIPv6.Stop() + } return 0, nil, err } if ep, ok := c.receiveIP(b[:n], ipp, &c.ippEndpoint6); ok { @@ -1597,12 +1597,12 @@ func (c *Conn) receiveIPv6(b []byte) (int, conn.Endpoint, error) { // receiveIPv4 receives a UDP IPv4 packet. It is called by wireguard-go. func (c *Conn) receiveIPv4(b []byte) (n int, ep conn.Endpoint, err error) { - health.SetReceiveIPv4Running(true) - health.SetReceiveIPv4Started(true) - defer health.SetReceiveIPv4Running(false) for { n, ipp, err := c.pconn4.ReadFromNetaddr(b) if err != nil { + if isPermanentNetError(err) { + health.ReceiveIPv4.Stop() + } return 0, nil, err } if ep, ok := c.receiveIP(b[:n], ipp, &c.ippEndpoint4); ok { @@ -1652,9 +1652,6 @@ func (c *Conn) receiveIP(b []byte, ipp netaddr.IPPort, cache *ippEndpointCache) // If the packet was a disco message or the peer endpoint wasn't // found, the returned error is errLoopAgain. func (c *connBind) receiveDERP(b []byte) (n int, ep conn.Endpoint, err error) { - health.SetReceiveDERPRunning(true) - health.SetReceiveDERPStarted(true) - defer health.SetReceiveDERPRunning(false) for dm := range c.derpRecvCh { if c.Closed() { break @@ -1666,6 +1663,7 @@ func (c *connBind) receiveDERP(b []byte) (n int, ep conn.Endpoint, err error) { } return n, ep, nil } + health.ReceiveDERP.Stop() return 0, nil, net.ErrClosed } @@ -1736,6 +1734,18 @@ func (c *Conn) processDERPReadResult(dm derpReadResult, b []byte) (n int, ep con return n, ep } +// isPermanentNetError reports whether err is permanent. +// It matches an equivalent check in wireguard-go's RoutineReceiveIncoming. +func isPermanentNetError(err error) bool { + // Once this module requires Go 1.17, the comparison to net.ErrClosed can be removed. + // See https://github.com/golang/go/issues/45357. + if errors.Is(err, net.ErrClosed) { + return true + } + neterr, ok := err.(net.Error) + return ok && !neterr.Temporary() +} + // discoLogLevel controls the verbosity of discovery log messages. type discoLogLevel int @@ -2420,8 +2430,11 @@ func (c *connBind) Open(ignoredPort uint16) ([]conn.ReceiveFunc, uint16, error) } c.closed = false fns := []conn.ReceiveFunc{c.receiveIPv4, c.receiveDERP} + health.ReceiveIPv4.Open() + health.ReceiveDERP.Open() if c.pconn6 != nil { fns = append(fns, c.receiveIPv6) + health.ReceiveIPv6.Open() } // TODO: Combine receiveIPv4 and receiveIPv6 and receiveIP into a single // closure that closes over a *RebindingUDPConn? @@ -2443,17 +2456,15 @@ func (c *connBind) Close() error { } c.closed = true // Unblock all outstanding receives. - // Tell the health checker that we're closing the connections - // before actually closing them to avoid false positives. - health.SetReceiveIPv4Started(false) + health.ReceiveIPv4.Close() c.pconn4.Close() if c.pconn6 != nil { - health.SetReceiveIPv6Started(false) + health.ReceiveIPv6.Close() c.pconn6.Close() } // Send an empty read result to unblock receiveDERP, // which will then check connBind.Closed. - health.SetReceiveDERPStarted(false) + health.ReceiveDERP.Close() c.derpRecvCh <- derpReadResult{} return nil }