From fd8e070d01299eb0f0b1d5d0e89747d7f512a22d Mon Sep 17 00:00:00 2001 From: Brad Fitzpatrick Date: Thu, 18 Feb 2021 08:58:13 -0800 Subject: [PATCH] health, control/controlclient, wgengine: report when router unhealthy Updates tailscale/corp#1338 Signed-off-by: Brad Fitzpatrick --- cmd/tailscaled/depaware.txt | 1 + control/controlclient/auto.go | 11 +++++ control/controlclient/direct.go | 10 ++++- health/health.go | 71 +++++++++++++++++++++++++++++++++ tailcfg/tailcfg.go | 2 + wgengine/userspace.go | 5 ++- 6 files changed, 98 insertions(+), 2 deletions(-) create mode 100644 health/health.go diff --git a/cmd/tailscaled/depaware.txt b/cmd/tailscaled/depaware.txt index 414734d43..0fd74fe8d 100644 --- a/cmd/tailscaled/depaware.txt +++ b/cmd/tailscaled/depaware.txt @@ -71,6 +71,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de tailscale.com/derp/derphttp from tailscale.com/net/netcheck+ tailscale.com/derp/derpmap from tailscale.com/cmd/tailscaled tailscale.com/disco from tailscale.com/derp+ + tailscale.com/health from tailscale.com/control/controlclient+ tailscale.com/internal/deepprint from tailscale.com/ipn/ipnlocal+ tailscale.com/ipn from tailscale.com/ipn/ipnserver+ tailscale.com/ipn/ipnlocal from tailscale.com/ipn/ipnserver+ diff --git a/control/controlclient/auto.go b/control/controlclient/auto.go index 128ead9a7..b9360a287 100644 --- a/control/controlclient/auto.go +++ b/control/controlclient/auto.go @@ -18,6 +18,7 @@ import ( "time" "golang.org/x/oauth2" + "tailscale.com/health" "tailscale.com/logtail/backoff" "tailscale.com/tailcfg" "tailscale.com/types/empty" @@ -116,6 +117,8 @@ type Client struct { closed bool newMapCh chan struct{} // readable when we must restart a map request + unregisterHealthWatch func() + mu sync.Mutex // mutex guards the following fields statusFunc func(Status) // called to update Client status @@ -171,7 +174,14 @@ func NewNoStart(opts Options) (*Client, error) { } c.authCtx, c.authCancel = context.WithCancel(context.Background()) c.mapCtx, c.mapCancel = context.WithCancel(context.Background()) + c.unregisterHealthWatch = health.RegisterWatcher(c.onHealthChange) return c, nil + +} + +func (c *Client) onHealthChange(key string, err error) { + c.logf("controlclient: restarting map request for %q health change to new state: %v", key, err) + c.cancelMapSafely() } // SetPaused controls whether HTTP activity should be paused. @@ -700,6 +710,7 @@ func (c *Client) Shutdown() { c.logf("client.Shutdown: inSendStatus=%v", inSendStatus) if !closed { + c.unregisterHealthWatch() close(c.quit) c.cancelAuth() <-c.authDone diff --git a/control/controlclient/direct.go b/control/controlclient/direct.go index 1328af09e..be04e143c 100644 --- a/control/controlclient/direct.go +++ b/control/controlclient/direct.go @@ -33,6 +33,7 @@ import ( "golang.org/x/crypto/nacl/box" "golang.org/x/oauth2" "inet.af/netaddr" + "tailscale.com/health" "tailscale.com/log/logheap" "tailscale.com/net/dnscache" "tailscale.com/net/netns" @@ -537,9 +538,16 @@ func (c *Direct) sendMapRequest(ctx context.Context, maxPolls int, cb func(*netm DebugFlags: c.debugFlags, OmitPeers: cb == nil, } + var extraDebugFlags []string if hostinfo != nil && ipForwardingBroken(hostinfo.RoutableIPs) { + extraDebugFlags = append(extraDebugFlags, "warn-ip-forwarding-off") + } + if health.RouterHealth() != nil { + extraDebugFlags = append(extraDebugFlags, "warn-router-unhealthy") + } + if len(extraDebugFlags) > 0 { old := request.DebugFlags - request.DebugFlags = append(old[:len(old):len(old)], "warn-ip-forwarding-off") + request.DebugFlags = append(old[:len(old):len(old)], extraDebugFlags...) } if c.newDecompressor != nil { request.Compress = "zstd" diff --git a/health/health.go b/health/health.go new file mode 100644 index 000000000..a199ead62 --- /dev/null +++ b/health/health.go @@ -0,0 +1,71 @@ +// Copyright (c) 2021 Tailscale Inc & AUTHORS All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package health is a registry for other packages to report & check +// overall health status of the node. +package health + +import ( + "sync" +) + +var ( + mu sync.Mutex + m = map[string]error{} // error key => err (or nil for no error) + watchers = map[*watchHandle]func(string, error){} // opt func to run if error state changes +) + +type watchHandle byte + +// RegisterWatcher adds a function that will be called if an +// error changes state either to unhealthy or from unhealthy. It is +// not called on transition from unknown to healthy. It must be non-nil +// and is run in its own goroutine. The returned func unregisters it. +func RegisterWatcher(cb func(errKey string, err error)) (unregister func()) { + mu.Lock() + defer mu.Unlock() + handle := new(watchHandle) + watchers[handle] = cb + return func() { + mu.Lock() + defer mu.Unlock() + delete(watchers, handle) + } +} + +// SetRouter sets the state of the wgengine/router.Router. +func SetRouterHealth(err error) { set("router", err) } + +// RouterHealth returns the wgengine/router.Router error state. +func RouterHealth() error { return get("router") } + +func get(key string) error { + mu.Lock() + defer mu.Unlock() + return m[key] +} + +func set(key string, err error) { + mu.Lock() + defer mu.Unlock() + old, ok := m[key] + if !ok && err == nil { + // Initial happy path. + m[key] = nil + return + } + if ok && (old == nil) == (err == nil) { + // No change in overall error status (nil-vs-not), so + // don't run callbacks, but exact error might've + // changed, so note it. + if err != nil { + m[key] = err + } + return + } + m[key] = err + for _, cb := range watchers { + go cb(key, err) + } +} diff --git a/tailcfg/tailcfg.go b/tailcfg/tailcfg.go index f687c9b04..61a63c7b4 100644 --- a/tailcfg/tailcfg.go +++ b/tailcfg/tailcfg.go @@ -635,6 +635,8 @@ type MapRequest struct { // Current DebugFlags values are: // * "warn-ip-forwarding-off": client is trying to be a subnet // router but their IP forwarding is broken. + // * "warn-router-unhealthy": client's Router implementation is + // having problems. // * "v6-overlay": IPv6 development flag to have control send // v6 node addrs // * "minimize-netmap": have control minimize the netmap, removing diff --git a/wgengine/userspace.go b/wgengine/userspace.go index f3ce131c9..352a490cd 100644 --- a/wgengine/userspace.go +++ b/wgengine/userspace.go @@ -26,6 +26,7 @@ import ( "go4.org/mem" "inet.af/netaddr" "tailscale.com/control/controlclient" + "tailscale.com/health" "tailscale.com/internal/deepprint" "tailscale.com/ipn/ipnstate" "tailscale.com/net/flowtrack" @@ -1003,7 +1004,9 @@ func (e *userspaceEngine) Reconfig(cfg *wgcfg.Config, routerCfg *router.Config) routerCfg.DNS.Nameservers = []netaddr.IP{tsaddr.TailscaleServiceIP()} } e.logf("wgengine: Reconfig: configuring router") - if err := e.router.Set(routerCfg); err != nil { + err := e.router.Set(routerCfg) + health.SetRouterHealth(err) + if err != nil { return err } }