health, ipn/ipnlocal: track, log overall health

Updates #1505

Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
bradfitz/ipv6_link_local_strip
Brad Fitzpatrick 3 years ago committed by Brad Fitzpatrick
parent 6fbc9b3a98
commit 9eb65601ef

@ -179,8 +179,11 @@ func NewNoStart(opts Options) (*Client, error) {
} }
func (c *Client) onHealthChange(key string, err error) { func (c *Client) onHealthChange(sys health.Subsystem, err error) {
c.logf("controlclient: restarting map request for %q health change to new state: %v", key, err) if sys == health.SysOverall {
return
}
c.logf("controlclient: restarting map request for %q health change to new state: %v", sys, err)
c.cancelMapSafely() c.cancelMapSafely()
} }

@ -7,9 +7,13 @@
package health package health
import ( import (
"errors"
"fmt"
"sort"
"sync" "sync"
"time" "time"
"github.com/go-multierror/multierror"
"tailscale.com/tailcfg" "tailscale.com/tailcfg"
) )
@ -17,8 +21,9 @@ var (
// mu guards everything in this var block. // mu guards everything in this var block.
mu sync.Mutex mu sync.Mutex
m = map[string]error{} // error key => err (or nil for no error) sysErr = map[Subsystem]error{} // error key => err (or nil for no error)
watchers = map[*watchHandle]func(string, error){} // opt func to run if error state changes watchers = map[*watchHandle]func(Subsystem, error){} // opt func to run if error state changes
timer *time.Timer
inMapPoll bool inMapPoll bool
inMapPollSince time.Time inMapPollSince time.Time
@ -32,49 +37,77 @@ var (
ipnWantRunning bool ipnWantRunning bool
) )
// Subsystem is the name of a subsystem whose health can be monitored.
type Subsystem string
const (
// SysOverall is the name representing the overall health of
// the system, rather than one particular subsystem.
SysOverall = Subsystem("overall")
// SysRouter is the name the wgengine/router subsystem.
SysRouter = Subsystem("router")
// SysNetworkCategory is the name of the subsystem that sets
// the Windows network adapter's "category" (public, private, domain).
// If it's unhealthy, the Windows firewall rules won't match.
SysNetworkCategory = Subsystem("network-category")
)
type watchHandle byte type watchHandle byte
// RegisterWatcher adds a function that will be called if an // RegisterWatcher adds a function that will be called if an
// error changes state either to unhealthy or from unhealthy. It is // error changes state either to unhealthy or from unhealthy. It is
// not called on transition from unknown to healthy. It must be non-nil // not called on transition from unknown to healthy. It must be non-nil
// and is run in its own goroutine. The returned func unregisters it. // and is run in its own goroutine. The returned func unregisters it.
func RegisterWatcher(cb func(errKey string, err error)) (unregister func()) { func RegisterWatcher(cb func(key Subsystem, err error)) (unregister func()) {
mu.Lock() mu.Lock()
defer mu.Unlock() defer mu.Unlock()
handle := new(watchHandle) handle := new(watchHandle)
watchers[handle] = cb watchers[handle] = cb
if timer == nil {
timer = time.AfterFunc(time.Minute, timerSelfCheck)
}
return func() { return func() {
mu.Lock() mu.Lock()
defer mu.Unlock() defer mu.Unlock()
delete(watchers, handle) delete(watchers, handle)
if len(watchers) == 0 && timer != nil {
timer.Stop()
timer = nil
}
} }
} }
// SetRouter sets the state of the wgengine/router.Router. // SetRouter sets the state of the wgengine/router.Router.
func SetRouterHealth(err error) { set("router", err) } func SetRouterHealth(err error) { set(SysRouter, err) }
// RouterHealth returns the wgengine/router.Router error state. // RouterHealth returns the wgengine/router.Router error state.
func RouterHealth() error { return get("router") } func RouterHealth() error { return get(SysRouter) }
// SetNetworkCategoryHealth sets the state of setting the network adaptor's category. // SetNetworkCategoryHealth sets the state of setting the network adaptor's category.
// This only applies on Windows. // This only applies on Windows.
func SetNetworkCategoryHealth(err error) { set("network-category", err) } func SetNetworkCategoryHealth(err error) { set(SysNetworkCategory, err) }
func NetworkCategoryHealth() error { return get("network-category") } func NetworkCategoryHealth() error { return get(SysNetworkCategory) }
func get(key string) error { func get(key Subsystem) error {
mu.Lock() mu.Lock()
defer mu.Unlock() defer mu.Unlock()
return m[key] return sysErr[key]
} }
func set(key string, err error) { func set(key Subsystem, err error) {
mu.Lock() mu.Lock()
defer mu.Unlock() defer mu.Unlock()
old, ok := m[key] setLocked(key, err)
}
func setLocked(key Subsystem, err error) {
old, ok := sysErr[key]
if !ok && err == nil { if !ok && err == nil {
// Initial happy path. // Initial happy path.
m[key] = nil sysErr[key] = nil
selfCheckLocked() selfCheckLocked()
return return
} }
@ -83,11 +116,11 @@ func set(key string, err error) {
// don't run callbacks, but exact error might've // don't run callbacks, but exact error might've
// changed, so note it. // changed, so note it.
if err != nil { if err != nil {
m[key] = err sysErr[key] = err
} }
return return
} }
m[key] = err sysErr[key] = err
selfCheckLocked() selfCheckLocked()
for _, cb := range watchers { for _, cb := range watchers {
go cb(key, err) go cb(key, err)
@ -162,14 +195,62 @@ func SetIPNState(state string, wantRunning bool) {
selfCheckLocked() selfCheckLocked()
} }
func timerSelfCheck() {
mu.Lock()
defer mu.Unlock()
selfCheckLocked()
if timer != nil {
timer.Reset(time.Minute)
}
}
func selfCheckLocked() { func selfCheckLocked() {
// TODO: check states against each other. if ipnState == "" {
// For staticcheck for now: // Don't check yet.
return
}
setLocked(SysOverall, overallErrorLocked())
}
func overallErrorLocked() error {
if ipnState != "Running" || !ipnWantRunning {
return fmt.Errorf("state=%v, wantRunning=%v", ipnState, ipnWantRunning)
}
now := time.Now()
if !inMapPoll && (lastMapPollEndedAt.IsZero() || now.Sub(lastMapPollEndedAt) > 10*time.Second) {
return errors.New("not in map poll")
}
const tooIdle = 2*time.Minute + 5*time.Second
if d := now.Sub(lastStreamedMapResponse).Round(time.Second); d > tooIdle {
return fmt.Errorf("no map response in %v", d)
}
rid := derpHomeRegion
if rid == 0 {
return errors.New("no DERP home")
}
if !derpRegionConnected[rid] {
return fmt.Errorf("not connected to home DERP region %v", rid)
}
if d := now.Sub(derpRegionLastFrame[rid]).Round(time.Second); d > tooIdle {
return fmt.Errorf("haven't heard from home DERP region %v in %v", rid, d)
}
// TODO: use
_ = inMapPollSince _ = inMapPollSince
_ = lastMapPollEndedAt _ = lastMapPollEndedAt
_ = lastStreamedMapResponse _ = lastStreamedMapResponse
_ = derpHomeRegion
_ = lastMapRequestHeard _ = lastMapRequestHeard
_ = ipnState
_ = ipnWantRunning var errs []error
for sys, err := range sysErr {
if err == nil || sys == SysOverall {
continue
}
errs = append(errs, fmt.Errorf("%v: %w", sys, err))
}
sort.Slice(errs, func(i, j int) bool {
// Not super efficient (stringifying these in a sort), but probably max 2 or 3 items.
return errs[i].Error() < errs[j].Error()
})
return multierror.New(errs)
} }

@ -65,20 +65,21 @@ func getControlDebugFlags() []string {
// state machine generates events back out to zero or more components. // state machine generates events back out to zero or more components.
type LocalBackend struct { type LocalBackend struct {
// Elements that are thread-safe or constant after construction. // Elements that are thread-safe or constant after construction.
ctx context.Context // canceled by Close ctx context.Context // canceled by Close
ctxCancel context.CancelFunc // cancels ctx ctxCancel context.CancelFunc // cancels ctx
logf logger.Logf // general logging logf logger.Logf // general logging
keyLogf logger.Logf // for printing list of peers on change keyLogf logger.Logf // for printing list of peers on change
statsLogf logger.Logf // for printing peers stats on change statsLogf logger.Logf // for printing peers stats on change
e wgengine.Engine e wgengine.Engine
store ipn.StateStore store ipn.StateStore
backendLogID string backendLogID string
unregisterLinkMon func() unregisterLinkMon func()
portpoll *portlist.Poller // may be nil unregisterHealthWatch func()
portpollOnce sync.Once // guards starting readPoller portpoll *portlist.Poller // may be nil
gotPortPollRes chan struct{} // closed upon first readPoller result portpollOnce sync.Once // guards starting readPoller
serverURL string // tailcontrol URL gotPortPollRes chan struct{} // closed upon first readPoller result
newDecompressor func() (controlclient.Decompressor, error) serverURL string // tailcontrol URL
newDecompressor func() (controlclient.Decompressor, error)
filterHash string filterHash string
@ -148,6 +149,8 @@ func NewLocalBackend(logf logger.Logf, logid string, store ipn.StateStore, e wge
b.linkChange(false, linkMon.InterfaceState()) b.linkChange(false, linkMon.InterfaceState())
b.unregisterLinkMon = linkMon.RegisterChangeCallback(b.linkChange) b.unregisterLinkMon = linkMon.RegisterChangeCallback(b.linkChange)
b.unregisterHealthWatch = health.RegisterWatcher(b.onHealthChange)
return b, nil return b, nil
} }
@ -182,6 +185,14 @@ func (b *LocalBackend) linkChange(major bool, ifst *interfaces.State) {
b.updateFilter(b.netMap, b.prefs) b.updateFilter(b.netMap, b.prefs)
} }
func (b *LocalBackend) onHealthChange(sys health.Subsystem, err error) {
if err == nil {
b.logf("health(%q): ok", sys)
} else {
b.logf("health(%q): error: %v", sys, err)
}
}
// Shutdown halts the backend and all its sub-components. The backend // Shutdown halts the backend and all its sub-components. The backend
// can no longer be used after Shutdown returns. // can no longer be used after Shutdown returns.
func (b *LocalBackend) Shutdown() { func (b *LocalBackend) Shutdown() {
@ -190,6 +201,7 @@ func (b *LocalBackend) Shutdown() {
b.mu.Unlock() b.mu.Unlock()
b.unregisterLinkMon() b.unregisterLinkMon()
b.unregisterHealthWatch()
if cli != nil { if cli != nil {
cli.Shutdown() cli.Shutdown()
} }

Loading…
Cancel
Save