diff --git a/control/controlclient/direct.go b/control/controlclient/direct.go index 49879710e..b305a302e 100644 --- a/control/controlclient/direct.go +++ b/control/controlclient/direct.go @@ -1581,9 +1581,9 @@ func postPingResult(start time.Time, logf logger.Logf, c *http.Client, pr *tailc } // ReportHealthChange reports to the control plane a change to this node's -// health. -func (c *Direct) ReportHealthChange(sys health.Subsystem, sysErr error) { - if sys == health.SysOverall { +// health. w must be non-nil. us can be nil to indicate a healthy state for w. +func (c *Direct) ReportHealthChange(w *health.Warnable, us *health.UnhealthyState) { + if w == health.NetworkStatusWarnable || w == health.IPNStateWarnable || w == health.LoginStateWarnable { // We don't report these. These include things like the network is down // (in which case we can't report anyway) or the user wanted things // stopped, as opposed to the more unexpected failure types in the other @@ -1602,12 +1602,13 @@ func (c *Direct) ReportHealthChange(sys health.Subsystem, sysErr error) { if c.panicOnUse { panic("tainted client") } + // TODO(angott): at some point, update `Subsys` in the request to be `Warnable` req := &tailcfg.HealthChangeRequest{ - Subsys: string(sys), + Subsys: string(w.Code), NodeKey: nodeKey, } - if sysErr != nil { - req.Error = sysErr.Error() + if us != nil { + req.Error = us.Text } // Best effort, no logging: diff --git a/health/args.go b/health/args.go new file mode 100644 index 000000000..18fe4365a --- /dev/null +++ b/health/args.go @@ -0,0 +1,30 @@ +// Copyright (c) Tailscale Inc & AUTHORS +// SPDX-License-Identifier: BSD-3-Clause + +package health + +// Arg is a type for the key to be used in the Args of a Warnable. +type Arg string + +const ( + // ArgAvailableVersion provides an update notification Warnable with the available version of the Tailscale client. + ArgAvailableVersion Arg = "available-version" + + // ArgCurrentVersion provides an update notification Warnable with the current version of the Tailscale client. + ArgCurrentVersion Arg = "current-version" + + // ArgDuration provides a Warnable with how long the Warnable has been in an unhealthy state. + ArgDuration Arg = "duration" + + // ArgError provides a Warnable with the underlying error behind an unhealthy state. + ArgError Arg = "error" + + // ArgMagicsockFunctionName provides a Warnable with the name of the Magicsock function that caused the unhealthy state. + ArgMagicsockFunctionName Arg = "magicsock-function-name" + + // ArgRegionID provides a Warnable with the ID of a DERP server involved in the unhealthy state. + ArgRegionID Arg = "region-id" + + // ArgServerName provides a Warnable with the hostname of a server involved in the unhealthy state. + ArgServerName Arg = "server-name" +) diff --git a/health/health.go b/health/health.go index 08badf04e..db3bc8e90 100644 --- a/health/health.go +++ b/health/health.go @@ -8,6 +8,7 @@ package health import ( "errors" "fmt" + "maps" "net/http" "os" "runtime" @@ -67,10 +68,12 @@ type Tracker struct { mu sync.Mutex warnables []*Warnable // keys ever set - warnableVal map[*Warnable]error + warnableVal map[*Warnable]*warningState - sysErr map[Subsystem]error // subsystem => err (or nil for no error) - watchers set.HandleSet[func(Subsystem, error)] // opt func to run if error state changes + // sysErr maps subsystems to their current error (or nil if the subsystem is healthy) + // Deprecated: using Warnables should be preferred + sysErr map[Subsystem]error + watchers set.HandleSet[func(*Warnable, *UnhealthyState)] // opt func to run if error state changes timer *time.Timer latestVersion *tailcfg.ClientVersion // or nil @@ -97,13 +100,12 @@ type Tracker struct { } // Subsystem is the name of a subsystem whose health can be monitored. +// +// Deprecated: Registering a Warnable using Register() and updating its health state +// with SetUnhealthy() and SetHealthy() should be preferred. type Subsystem string const ( - // SysOverall is the name representing the overall health of - // the system, rather than one particular subsystem. - SysOverall = Subsystem("overall") - // SysRouter is the name of the wgengine/router subsystem. SysRouter = Subsystem("router") @@ -120,55 +122,98 @@ const ( SysTKA = Subsystem("tailnet-lock") ) -// NewWarnable returns a new warnable item that the caller can mark as health or -// in warning state via Tracker.SetWarnable. -// -// NewWarnable is generally called in init and stored in a package global. It -// can be used by multiple Trackers. -func NewWarnable(opts ...WarnableOpt) *Warnable { - w := new(Warnable) - for _, o := range opts { - o.mod(w) +var subsystemsWarnables = map[Subsystem]*Warnable{} + +const legacyErrorArgKey = "LegacyError" + +// Warnable() returns a Warnable representing a legacy Subsystem. This is used +// *temporarily* while we migrate the old health infrastructure based on +// Subsystems to the new Warnables architecture. +func (s Subsystem) Warnable() *Warnable { + if w, ok := subsystemsWarnables[s]; ok { + return w + } else { + w := Register(&Warnable{ + Code: WarnableCode(s), + Severity: SeverityMedium, + Text: func(args Args) string { + return args[legacyErrorArgKey] + }, + }) + subsystemsWarnables[s] = w + return w } - return w } -// WarnableOpt is an option passed to NewWarnable. -type WarnableOpt interface { - mod(*Warnable) -} +var registeredWarnables = map[WarnableCode]*Warnable{} -// WithMapDebugFlag returns a WarnableOpt for NewWarnable that makes the returned -// Warnable report itself to the coordination server as broken with this -// string in MapRequest.DebugFlag when Set to a non-nil value. -func WithMapDebugFlag(name string) WarnableOpt { - return warnOptFunc(func(w *Warnable) { - w.debugFlag = name - }) +// Register registers a new Warnable with the health package and returns it. +// Register panics if the Warnable was already registered, because Warnables +// should be unique across the program. +func Register(w *Warnable) *Warnable { + if registeredWarnables[w.Code] != nil { + panic(fmt.Sprintf("health: a Warnable with code %q was already registered", w.Code)) + } + mak.Set(®isteredWarnables, w.Code, w) + return w } -// WithConnectivityImpact returns an option which makes a Warnable annotated as -// something that could be breaking external network connectivity on the -// machine. This will make the warnable returned by OverallError alongside -// network connectivity errors. -func WithConnectivityImpact() WarnableOpt { - return warnOptFunc(func(w *Warnable) { - w.hasConnectivityImpact = true - }) +// unregister removes a Warnable from the health package. It should only be used +// for testing purposes. +func unregister(w *Warnable) { + if registeredWarnables[w.Code] == nil { + panic(fmt.Sprintf("health: attempting to unregister Warnable %q that was not registered", w.Code)) + } + delete(registeredWarnables, w.Code) } -type warnOptFunc func(*Warnable) - -func (f warnOptFunc) mod(w *Warnable) { f(w) } - -// Warnable is a health check item that may or may not be in a bad warning state. -// The caller of NewWarnable is responsible for calling Tracker.SetWarnable to update the state. +// WarnableCode is a string that distinguishes each Warnable from others. It is globally unique within +// the program. +type WarnableCode string + +// A Warnable is something that we might want to warn the user about, or not. A Warnable is either +// in an healthy or unhealth state. A Warnable is unhealthy if the Tracker knows about a WarningState +// affecting the Warnable. +// In most cases, Warnables are components of the backend (for instance, "DNS" or "Magicsock"). +// Warnables are similar to the Subsystem type previously used in this package, but they provide +// a unique identifying code for each Warnable, along with more metadata that makes it easier for +// a GUI to display the Warnable in a user-friendly way. type Warnable struct { - debugFlag string // optional MapRequest.DebugFlag to send when unhealthy - - // If true, this warning is related to configuration of networking stack + // Code is a string that uniquely identifies this Warnable across the entire Tailscale backend, + // and can be mapped to a user-displayable localized string. + Code WarnableCode + // Title is a string that the GUI uses as title for any message involving this Warnable. The title + // should be short and fit in a single line. + Title string + // Text is a function that generates an extended string that the GUI will display to the user when + // this Warnable is in an unhealthy state. The function can use the Args map to provide dynamic + // information to the user. + Text func(args Args) string + // Severity is the severity of the Warnable, which the GUI can use to determine how to display it. + // For instance, a Warnable with SeverityHigh could trigger a modal view, while a Warnable with + // SeverityLow could be displayed in a less intrusive way. + // TODO(angott): turn this into a SeverityFunc, which allows the Warnable to change its severity based on + // the Args of the unhappy state, just like we do in the Text function. + Severity Severity + // DependsOn is a set of Warnables that this Warnable depends, on and need to be healthy + // before this Warnable can also be healthy again. The GUI can use this information to ignore + // this Warnable if one of its dependencies is unhealthy. + DependsOn []*Warnable + + // MapDebugFlag is a MapRequest.DebugFlag that is sent to control when this Warnable is unhealthy + // + // Deprecated: this is only used in one case, and will be removed in a future PR + MapDebugFlag string + + // If true, this warnable is related to configuration of networking stack // on the machine that impacts connectivity. - hasConnectivityImpact bool + ImpactsConnectivity bool +} + +// StaticMessage returns a function that always returns the input string, to be used in +// simple Warnables that do not use the Args map to generate their Text. +func StaticMessage(s string) func(Args) string { + return func(Args) string { return s } } // nil reports whether t is nil. @@ -180,6 +225,7 @@ func (t *Tracker) nil() bool { if t != nil { return false } + if cibuild.On() { stack := make([]byte, 1<<10) stack = stack[:runtime.Stack(stack, false)] @@ -191,19 +237,104 @@ func (t *Tracker) nil() bool { return true } -// Set updates the Warnable's state. -// If non-nil, it's considered unhealthy. -func (t *Tracker) SetWarnable(w *Warnable, err error) { +// Severity represents how serious an error is. Each GUI interprets this severity value in different ways, +// to surface the error in a more or less visible way. For instance, the macOS GUI could change its menubar +// icon to display an exclamation mark and present a modal notification for SeverityHigh warnings, but not +// for SeverityLow messages, which would only appear in the Settings window. +type Severity string + +const ( + SeverityHigh Severity = "high" + SeverityMedium Severity = "medium" + SeverityLow Severity = "low" +) + +// Args is a map of Args to string values that can be used to provide parameters regarding +// the unhealthy state of a Warnable. +// For instance, if you have a Warnable to track the health of DNS lookups, here you can include +// the hostname that failed to resolve, or the IP address of the DNS server that has been failing +// to respond. You can then use these parameters in the Text function of the Warnable to provide a detailed +// error message to the user. +type Args map[Arg]string + +// A warningState is a condition affecting a Warnable. For each Warnable known to the Tracker, a Warnable +// is in an unhappy state if there is a warningState associated with the Warnable. +type warningState struct { + BrokenSince time.Time // when the Warnable became unhealthy + Args Args // args can be used to provide parameters to the function that generates the Text in the Warnable +} + +func (ws *warningState) Equal(other *warningState) bool { + if ws == nil && other == nil { + return true + } + if ws == nil || other == nil { + return false + } + return ws.BrokenSince.Equal(other.BrokenSince) && maps.Equal(ws.Args, other.Args) +} + +// SetUnhealthy sets a warningState for the given Warnable with the provided Args, and should be +// called when a Warnable becomes unhealthy, or its unhealthy status needs to be updated. +// SetUnhealthy takes ownership of args. The args can be nil if no additional information is +// needed for the unhealthy state. +func (t *Tracker) SetUnhealthy(w *Warnable, args Args) { if t.nil() { return } t.mu.Lock() defer t.mu.Unlock() - l0 := len(t.warnableVal) - mak.Set(&t.warnableVal, w, err) - if len(t.warnableVal) != l0 { + t.setUnhealthyLocked(w, args) +} + +func (t *Tracker) setUnhealthyLocked(w *Warnable, args Args) { + if w == nil { + return + } + + // If we already have a warningState for this Warnable with an earlier BrokenSince time, keep that + // BrokenSince time. + brokenSince := time.Now() + if existingWS := t.warnableVal[w]; existingWS != nil { + brokenSince = existingWS.BrokenSince + } + + if t.warnableVal[w] == nil { t.warnables = append(t.warnables, w) } + ws := &warningState{ + BrokenSince: brokenSince, + Args: args, + } + prevWs := t.warnableVal[w] + mak.Set(&t.warnableVal, w, ws) + if !ws.Equal(prevWs) { + for _, cb := range t.watchers { + go cb(w, w.unhealthyState(ws)) + } + } +} + +// SetHealthy removes any warningState for the given Warnable. +func (t *Tracker) SetHealthy(w *Warnable) { + if t.nil() { + return + } + t.mu.Lock() + defer t.mu.Unlock() + t.setHealthyLocked(w) +} + +func (t *Tracker) setHealthyLocked(w *Warnable) { + if t.warnableVal[w] == nil { + // Nothing to remove + return + } + + delete(t.warnableVal, w) + for _, cb := range t.watchers { + go cb(w, nil) + } } // AppendWarnableDebugFlags appends to base any health items that are currently in failed @@ -218,29 +349,31 @@ func (t *Tracker) AppendWarnableDebugFlags(base []string) []string { t.mu.Lock() defer t.mu.Unlock() for w, err := range t.warnableVal { - if w.debugFlag == "" { + if w.MapDebugFlag == "" { continue } if err != nil { - ret = append(ret, w.debugFlag) + ret = append(ret, w.MapDebugFlag) } } sort.Strings(ret[len(base):]) // sort the new ones return ret } -// RegisterWatcher adds a function that will be called if an -// error changes state either to unhealthy or from unhealthy. It is -// not called on transition from unknown to healthy. It must be non-nil -// and is run in its own goroutine. The returned func unregisters it. -func (t *Tracker) RegisterWatcher(cb func(key Subsystem, err error)) (unregister func()) { +// RegisterWatcher adds a function that will be called whenever the health state of any Warnable changes. +// If a Warnable becomes unhealthy or its unhealthy state is updated, the callback will be called with its +// current Representation. +// If a Warnable becomes healthy, the callback will be called with ws set to nil. +// The provided callback function will be executed in its own goroutine. The returned function can be used +// to unregister the callback. +func (t *Tracker) RegisterWatcher(cb func(w *Warnable, r *UnhealthyState)) (unregister func()) { if t.nil() { return func() {} } t.mu.Lock() defer t.mu.Unlock() if t.watchers == nil { - t.watchers = set.HandleSet[func(Subsystem, error)]{} + t.watchers = set.HandleSet[func(*Warnable, *UnhealthyState)]{} } handle := t.watchers.Add(cb) if t.timer == nil { @@ -258,31 +391,49 @@ func (t *Tracker) RegisterWatcher(cb func(key Subsystem, err error)) (unregister } // SetRouterHealth sets the state of the wgengine/router.Router. +// +// Deprecated: Warnables should be preferred over Subsystem errors. func (t *Tracker) SetRouterHealth(err error) { t.setErr(SysRouter, err) } // RouterHealth returns the wgengine/router.Router error state. +// +// Deprecated: Warnables should be preferred over Subsystem errors. func (t *Tracker) RouterHealth() error { return t.get(SysRouter) } // SetDNSHealth sets the state of the net/dns.Manager +// +// Deprecated: Warnables should be preferred over Subsystem errors. func (t *Tracker) SetDNSHealth(err error) { t.setErr(SysDNS, err) } // DNSHealth returns the net/dns.Manager error state. +// +// Deprecated: Warnables should be preferred over Subsystem errors. func (t *Tracker) DNSHealth() error { return t.get(SysDNS) } // SetDNSOSHealth sets the state of the net/dns.OSConfigurator +// +// Deprecated: Warnables should be preferred over Subsystem errors. func (t *Tracker) SetDNSOSHealth(err error) { t.setErr(SysDNSOS, err) } // SetDNSManagerHealth sets the state of the Linux net/dns manager's // discovery of the /etc/resolv.conf situation. +// +// Deprecated: Warnables should be preferred over Subsystem errors. func (t *Tracker) SetDNSManagerHealth(err error) { t.setErr(SysDNSManager, err) } // DNSOSHealth returns the net/dns.OSConfigurator error state. +// +// Deprecated: Warnables should be preferred over Subsystem errors. func (t *Tracker) DNSOSHealth() error { return t.get(SysDNSOS) } // SetTKAHealth sets the health of the tailnet key authority. +// +// Deprecated: Warnables should be preferred over Subsystem errors. func (t *Tracker) SetTKAHealth(err error) { t.setErr(SysTKA, err) } // TKAHealth returns the tailnet key authority error state. +// +// Deprecated: Warnables should be preferred over Subsystem errors. func (t *Tracker) TKAHealth() error { return t.get(SysTKA) } // SetLocalLogConfigHealth sets the error state of this client's local log configuration. @@ -362,8 +513,20 @@ func (t *Tracker) setLocked(key Subsystem, err error) { } t.sysErr[key] = err t.selfCheckLocked() - for _, cb := range t.watchers { - go cb(key, err) +} + +// updateLegacyErrorWarnableLocked takes a legacy Subsystem and an optional error, and +// updates the WarningState for that legacy Subsystem, setting it to healthy or unhealthy. +// It is used temporarily while we migrate from Subsystems to Warnables. +// +// Deprecated: this function will be removed after migrating all subsystem errors to use +// Warnables instead. +func (t *Tracker) updateLegacyErrorWarnableLocked(key Subsystem, err error) { + w := key.Warnable() + if err != nil { + t.setUnhealthyLocked(key.Warnable(), Args{legacyErrorArgKey: err.Error()}) + } else { + t.setHealthyLocked(w) } } @@ -598,24 +761,7 @@ func (t *Tracker) selfCheckLocked() { // Don't check yet. return } - t.setLocked(SysOverall, t.overallErrorLocked()) -} - -// AppendWarnings appends all current health warnings to dst and returns the -// result. -func (t *Tracker) AppendWarnings(dst []string) []string { - err := t.OverallError() - if err == nil { - return dst - } - if me, ok := err.(multierr.Error); ok { - for _, err := range me.Errors() { - dst = append(dst, err.Error()) - } - } else { - dst = append(dst, err.Error()) - } - return dst + t.updateBuiltinWarnablesLocked() } // OverallError returns a summary of the health state. @@ -628,108 +774,161 @@ func (t *Tracker) OverallError() error { } t.mu.Lock() defer t.mu.Unlock() - return t.overallErrorLocked() + t.updateBuiltinWarnablesLocked() + return t.multiErrLocked() } -var fakeErrForTesting = envknob.RegisterString("TS_DEBUG_FAKE_HEALTH_ERROR") - -// networkErrorfLocked creates an error that indicates issues with outgoing network -// connectivity. Any active warnings related to network connectivity will -// automatically be appended to it. -// -// t.mu must be held. -func (t *Tracker) networkErrorfLocked(format string, a ...any) error { - errs := []error{ - fmt.Errorf(format, a...), +// Strings() returns a string array containing the Text of all Warnings +// currently known to the Tracker. These strings can be presented to the +// user, although ideally you would use the Code property on each Warning +// to show a localized version of them instead. +// This function is here for legacy compatibility purposes and is deprecated. +func (t *Tracker) Strings() []string { + if t.nil() { + return nil } - for _, w := range t.warnables { - if !w.hasConnectivityImpact { - continue - } - if err := t.warnableVal[w]; err != nil { - errs = append(errs, err) + t.mu.Lock() + defer t.mu.Unlock() + return t.stringsLocked() +} + +func (t *Tracker) stringsLocked() []string { + result := []string{} + for w, ws := range t.warnableVal { + if ws.Args == nil { + result = append(result, w.Text(Args{})) + } else { + result = append(result, w.Text(ws.Args)) } } - if len(errs) == 1 { - return errs[0] + return result +} + +// errorsLocked returns an array of errors where each error is the Text +// of a Warning known to the Tracker. +// This function is here for legacy compatibility purposes and is deprecated. +func (t *Tracker) errorsLocked() []error { + strs := t.stringsLocked() + errs := []error{} + for _, str := range strs { + errs = append(errs, errors.New(str)) } - return multierr.New(errs...) + return errs } -var errNetworkDown = errors.New("network down") -var errNotInMapPoll = errors.New("not in map poll") -var errNoDERPHome = errors.New("no DERP home") -var errNoUDP4Bind = errors.New("no udp4 bind") -var errUnstable = errors.New("This is an unstable (development) version of Tailscale; frequent updates and bugs are likely") +// multiErrLocked returns an error listing all errors known to the Tracker. +// This function is here for legacy compatibility purposes and is deprecated. +func (t *Tracker) multiErrLocked() error { + errs := t.errorsLocked() + return multierr.New(errs...) +} -func (t *Tracker) overallErrorLocked() error { - var errs []error - add := func(err error) { - if err != nil { - errs = append(errs, err) - } - } - merged := func() error { - return multierr.New(errs...) - } +var fakeErrForTesting = envknob.RegisterString("TS_DEBUG_FAKE_HEALTH_ERROR") +// updateBuiltinWarnablesLocked performs a number of checks on the state of the backend, +// and adds/removes Warnings from the Tracker as needed. +func (t *Tracker) updateBuiltinWarnablesLocked() { if t.checkForUpdates { if cv := t.latestVersion; cv != nil && !cv.RunningLatest && cv.LatestVersion != "" { if cv.UrgentSecurityUpdate { - add(fmt.Errorf("Security update available: %v -> %v, run `tailscale update` or `tailscale set --auto-update` to update", version.Short(), cv.LatestVersion)) + t.setUnhealthyLocked(securityUpdateAvailableWarnable, Args{ + ArgCurrentVersion: version.Short(), + ArgAvailableVersion: cv.LatestVersion, + }) } else { - add(fmt.Errorf("Update available: %v -> %v, run `tailscale update` or `tailscale set --auto-update` to update", version.Short(), cv.LatestVersion)) + t.setUnhealthyLocked(updateAvailableWarnable, Args{ + ArgCurrentVersion: version.Short(), + ArgAvailableVersion: cv.LatestVersion, + }) } } } + if version.IsUnstableBuild() { - add(errUnstable) + t.setUnhealthyLocked(unstableWarnable, Args{ + ArgCurrentVersion: version.Short(), + }) } if v, ok := t.anyInterfaceUp.Get(); ok && !v { - add(errNetworkDown) - return merged() + t.setUnhealthyLocked(NetworkStatusWarnable, nil) + return + } else { + t.setHealthyLocked(NetworkStatusWarnable) } + if t.localLogConfigErr != nil { - add(t.localLogConfigErr) - return merged() + t.setUnhealthyLocked(localLogWarnable, Args{ + ArgError: t.localLogConfigErr.Error(), + }) + return + } else { + t.setHealthyLocked(localLogWarnable) } + if !t.ipnWantRunning { - add(fmt.Errorf("state=%v, wantRunning=%v", t.ipnState, t.ipnWantRunning)) - return merged() + t.setUnhealthyLocked(IPNStateWarnable, Args{ + "State": t.ipnState, + }) + return + } else { + t.setHealthyLocked(IPNStateWarnable) } + if t.lastLoginErr != nil { - add(fmt.Errorf("not logged in, last login error=%v", t.lastLoginErr)) - return merged() + t.setUnhealthyLocked(LoginStateWarnable, Args{ + ArgError: t.lastLoginErr.Error(), + }) + return + } else { + t.setHealthyLocked(LoginStateWarnable) } + now := time.Now() if !t.inMapPoll && (t.lastMapPollEndedAt.IsZero() || now.Sub(t.lastMapPollEndedAt) > 10*time.Second) { - add(errNotInMapPoll) - return merged() + t.setUnhealthyLocked(notInMapPollWarnable, nil) + return + } else { + t.setHealthyLocked(notInMapPollWarnable) } + const tooIdle = 2*time.Minute + 5*time.Second if d := now.Sub(t.lastStreamedMapResponse).Round(time.Second); d > tooIdle { - add(t.networkErrorfLocked("no map response in %v", d)) - return merged() + t.setUnhealthyLocked(mapResponseTimeoutWarnable, Args{ + ArgDuration: d.String(), + }) + return + } else { + t.setHealthyLocked(mapResponseTimeoutWarnable) } + if !t.derpHomeless { rid := t.derpHomeRegion if rid == 0 { - add(errNoDERPHome) - return merged() - } - if !t.derpRegionConnected[rid] { - add(t.networkErrorfLocked("not connected to home DERP region %v", rid)) - return merged() - } - if d := now.Sub(t.derpRegionLastFrame[rid]).Round(time.Second); d > tooIdle { - add(t.networkErrorfLocked("haven't heard from home DERP region %v in %v", rid, d)) - return merged() + t.setUnhealthyLocked(noDERPHomeWarnable, nil) + return + } else if !t.derpRegionConnected[rid] { + t.setUnhealthyLocked(noDERPConnectionWarnable, Args{ + ArgRegionID: fmt.Sprint(rid), + }) + return + } else if d := now.Sub(t.derpRegionLastFrame[rid]).Round(time.Second); d > tooIdle { + t.setUnhealthyLocked(derpTimeoutWarnable, Args{ + ArgRegionID: fmt.Sprint(rid), + ArgDuration: d.String(), + }) + return } } + t.setHealthyLocked(noDERPHomeWarnable) + t.setHealthyLocked(noDERPConnectionWarnable) + t.setHealthyLocked(derpTimeoutWarnable) + if t.udp4Unbound { - add(errNoUDP4Bind) - return merged() + t.setUnhealthyLocked(noUDP4BindWarnable, nil) + return + } else { + t.setHealthyLocked(noUDP4BindWarnable) } // TODO: use @@ -738,43 +937,72 @@ func (t *Tracker) overallErrorLocked() error { _ = t.lastStreamedMapResponse _ = t.lastMapRequestHeard + shouldClearMagicsockWarnings := false for i := range t.MagicSockReceiveFuncs { f := &t.MagicSockReceiveFuncs[i] if f.missing { - errs = append(errs, fmt.Errorf("%s is not running", f.name)) + t.setUnhealthyLocked(magicsockReceiveFuncWarnable, Args{ + ArgMagicsockFunctionName: f.name, + }) + shouldClearMagicsockWarnings = false } } + if shouldClearMagicsockWarnings { + t.setHealthyLocked(magicsockReceiveFuncWarnable) + } + + // Iterates over the legacy subsystems and their error, and turns them into structured errors for sys, err := range t.sysErr { - if err == nil || sys == SysOverall { - continue - } - errs = append(errs, fmt.Errorf("%v: %w", sys, err)) + t.updateLegacyErrorWarnableLocked(sys, err) } - for _, w := range t.warnables { - if err := t.warnableVal[w]; err != nil { - errs = append(errs, err) + + if len(t.derpRegionHealthProblem) > 0 { + for regionID, problem := range t.derpRegionHealthProblem { + t.setUnhealthyLocked(derpRegionErrorWarnable, Args{ + ArgRegionID: fmt.Sprint(regionID), + ArgError: problem, + }) } + } else { + t.setHealthyLocked(derpRegionErrorWarnable) } - for regionID, problem := range t.derpRegionHealthProblem { - errs = append(errs, fmt.Errorf("derp%d: %v", regionID, problem)) - } - for _, s := range t.controlHealth { - errs = append(errs, errors.New(s)) + + if len(t.controlHealth) > 0 { + for _, s := range t.controlHealth { + t.setUnhealthyLocked(controlHealthWarnable, Args{ + ArgError: s, + }) + } + } else { + t.setHealthyLocked(controlHealthWarnable) } + if err := envknob.ApplyDiskConfigError(); err != nil { - errs = append(errs, err) + t.setUnhealthyLocked(applyDiskConfigWarnable, Args{ + ArgError: err.Error(), + }) + } else { + t.setHealthyLocked(applyDiskConfigWarnable) } - for serverName, err := range t.tlsConnectionErrors { - errs = append(errs, fmt.Errorf("TLS connection error for %q: %w", serverName, err)) + + if len(t.tlsConnectionErrors) > 0 { + for serverName, err := range t.tlsConnectionErrors { + t.setUnhealthyLocked(tlsConnectionFailedWarnable, Args{ + ArgServerName: serverName, + ArgError: err.Error(), + }) + } + } else { + t.setHealthyLocked(tlsConnectionFailedWarnable) } - if e := fakeErrForTesting(); len(errs) == 0 && e != "" { - return errors.New(e) + + if e := fakeErrForTesting(); len(t.warnables) == 0 && e != "" { + t.setUnhealthyLocked(testWarnable, Args{ + ArgError: e, + }) + } else { + t.setHealthyLocked(testWarnable) } - sort.Slice(errs, func(i, j int) bool { - // Not super efficient (stringifying these in a sort), but probably max 2 or 3 items. - return errs[i].Error() < errs[j].Error() - }) - return multierr.New(errs...) } // ReceiveFuncStats tracks the calls made to a wireguard-go receive func. diff --git a/health/health_test.go b/health/health_test.go index f6673d324..ff4cddb70 100644 --- a/health/health_test.go +++ b/health/health_test.go @@ -4,19 +4,23 @@ package health import ( - "errors" "fmt" "reflect" "testing" + "time" ) func TestAppendWarnableDebugFlags(t *testing.T) { var tr Tracker for i := range 10 { - w := NewWarnable(WithMapDebugFlag(fmt.Sprint(i))) + w := Register(&Warnable{ + Code: WarnableCode(fmt.Sprintf("warnable-code-%d", i)), + MapDebugFlag: fmt.Sprint(i), + }) + defer unregister(w) if i%2 == 0 { - tr.SetWarnable(w, errors.New("boom")) + tr.SetUnhealthy(w, Args{"test-arg": fmt.Sprint(i)}) } } @@ -49,3 +53,126 @@ func TestNilMethodsDontCrash(t *testing.T) { rv.Method(i).Call(args) } } + +func TestSetUnhealthyWithDuplicateThenHealthyAgain(t *testing.T) { + ht := Tracker{} + if len(ht.Strings()) != 0 { + t.Fatalf("before first insertion, len(newTracker.Strings) = %d; want = 0", len(ht.Strings())) + } + + ht.SetUnhealthy(testWarnable, Args{ArgError: "Hello world 1"}) + want := []string{"Hello world 1"} + if !reflect.DeepEqual(ht.Strings(), want) { + t.Fatalf("after calling SetUnhealthy, newTracker.Strings() = %v; want = %v", ht.Strings(), want) + } + + // Adding a second warning state with the same WarningCode overwrites the existing warning state, + // the count shouldn't have changed. + ht.SetUnhealthy(testWarnable, Args{ArgError: "Hello world 2"}) + want = []string{"Hello world 2"} + if !reflect.DeepEqual(ht.Strings(), want) { + t.Fatalf("after insertion of same WarningCode, newTracker.Strings() = %v; want = %v", ht.Strings(), want) + } + + ht.SetHealthy(testWarnable) + want = []string{} + if !reflect.DeepEqual(ht.Strings(), want) { + t.Fatalf("after setting the healthy, newTracker.Strings() = %v; want = %v", ht.Strings(), want) + } +} + +func TestRemoveAllWarnings(t *testing.T) { + ht := Tracker{} + if len(ht.Strings()) != 0 { + t.Fatalf("before first insertion, len(newTracker.Strings) = %d; want = 0", len(ht.Strings())) + } + + ht.SetUnhealthy(testWarnable, Args{"Text": "Hello world 1"}) + if len(ht.Strings()) != 1 { + t.Fatalf("after first insertion, len(newTracker.Strings) = %d; want = %d", len(ht.Strings()), 1) + } + + ht.SetHealthy(testWarnable) + if len(ht.Strings()) != 0 { + t.Fatalf("after RemoveAll, len(newTracker.Strings) = %d; want = 0", len(ht.Strings())) + } +} + +// TestWatcher tests that a registered watcher function gets called with the correct +// Warnable and non-nil/nil UnhealthyState upon setting a Warnable to unhealthy/healthy. +func TestWatcher(t *testing.T) { + ht := Tracker{} + wantText := "Hello world" + becameUnhealthy := make(chan struct{}) + becameHealthy := make(chan struct{}) + + watcherFunc := func(w *Warnable, us *UnhealthyState) { + if w != testWarnable { + t.Fatalf("watcherFunc was called, but with an unexpected Warnable: %v, want: %v", w, testWarnable) + } + + if us != nil { + if us.Text != wantText { + t.Fatalf("unexpected us.Text: %s, want: %s", us.Text, wantText) + } + if us.Args[ArgError] != wantText { + t.Fatalf("unexpected us.Args[ArgError]: %s, want: %s", us.Args[ArgError], wantText) + } + becameUnhealthy <- struct{}{} + } else { + becameHealthy <- struct{}{} + } + } + + unregisterFunc := ht.RegisterWatcher(watcherFunc) + if len(ht.watchers) != 1 { + t.Fatalf("after RegisterWatcher, len(newTracker.watchers) = %d; want = 1", len(ht.watchers)) + } + ht.SetUnhealthy(testWarnable, Args{ArgError: wantText}) + + select { + case <-becameUnhealthy: + // Test passed because the watcher got notified of an unhealthy state + case <-becameHealthy: + // Test failed because the watcher got of a healthy state instead of an unhealthy one + t.Fatalf("watcherFunc was called with a healthy state") + case <-time.After(1 * time.Second): + t.Fatalf("watcherFunc didn't get called upon calling SetUnhealthy") + } + + ht.SetHealthy(testWarnable) + + select { + case <-becameUnhealthy: + // Test failed because the watcher got of an unhealthy state instead of a healthy one + t.Fatalf("watcherFunc was called with an unhealthy state") + case <-becameHealthy: + // Test passed because the watcher got notified of a healthy state + case <-time.After(1 * time.Second): + t.Fatalf("watcherFunc didn't get called upon calling SetUnhealthy") + } + + unregisterFunc() + if len(ht.watchers) != 0 { + t.Fatalf("after unregisterFunc, len(newTracker.watchers) = %d; want = 0", len(ht.watchers)) + } +} + +func TestRegisterWarnablePanicsWithDuplicate(t *testing.T) { + w := &Warnable{ + Code: "test-warnable-1", + } + + Register(w) + defer unregister(w) + if registeredWarnables[w.Code] != w { + t.Fatalf("after Register, registeredWarnables[%s] = %v; want = %v", w.Code, registeredWarnables[w.Code], w) + } + + defer func() { + if r := recover(); r == nil { + t.Fatalf("Registering the same Warnable twice didn't panic") + } + }() + Register(w) +} diff --git a/health/state.go b/health/state.go new file mode 100644 index 000000000..1b12898eb --- /dev/null +++ b/health/state.go @@ -0,0 +1,79 @@ +// Copyright (c) Tailscale Inc & AUTHORS +// SPDX-License-Identifier: BSD-3-Clause + +package health + +import ( + "time" +) + +// State contains the health status of the backend, and is +// provided to the client UI via LocalAPI through ipn.Notify. +type State struct { + // Each key-value pair in Warnings represents a Warnable that is currently + // unhealthy. If a Warnable is healthy, it will not be present in this map. + // When a Warnable is unhealthy and becomes healthy, its key-value pair + // disappears in the next issued State. Observers should treat the absence of + // a WarnableCode in this map as an indication that the Warnable became healthy, + // and may use that to clear any notifications that were previously shown to the user. + // If Warnings is nil, all Warnables are healthy and the backend is overall healthy. + Warnings map[WarnableCode]UnhealthyState +} + +// Representation contains information to be shown to the user to inform them +// that a Warnable is currently unhealthy. +type UnhealthyState struct { + WarnableCode WarnableCode + Severity Severity + Title string + Text string + BrokenSince *time.Time `json:",omitempty"` + Args Args `json:",omitempty"` +} + +// unhealthyState returns a unhealthyState of the Warnable given its current warningState. +func (w *Warnable) unhealthyState(ws *warningState) *UnhealthyState { + var text string + if ws.Args != nil { + text = w.Text(ws.Args) + } else { + text = w.Text(Args{}) + } + + return &UnhealthyState{ + WarnableCode: w.Code, + Severity: w.Severity, + Title: w.Title, + Text: text, + BrokenSince: &ws.BrokenSince, + Args: ws.Args, + } +} + +// CurrentState returns a snapshot of the current health status of the backend. +// It returns a State with nil Warnings if the backend is healthy (all Warnables +// have no issues). +// The returned State is a snapshot of shared memory, and the caller should not +// mutate the returned value. +func (t *Tracker) CurrentState() *State { + if t.nil() { + return &State{} + } + + t.mu.Lock() + defer t.mu.Unlock() + + if t.warnableVal == nil || len(t.warnableVal) == 0 { + return &State{} + } + + wm := map[WarnableCode]UnhealthyState{} + + for w, ws := range t.warnableVal { + wm[w.Code] = *w.unhealthyState(ws) + } + + return &State{ + Warnings: wm, + } +} diff --git a/health/warnings.go b/health/warnings.go new file mode 100644 index 000000000..fc769e686 --- /dev/null +++ b/health/warnings.go @@ -0,0 +1,206 @@ +// Copyright (c) Tailscale Inc & AUTHORS +// SPDX-License-Identifier: BSD-3-Clause + +package health + +import ( + "fmt" +) + +/** +This file contains definitions for the Warnables maintained within this `health` package. +*/ + +// updateAvailableWarnable is a Warnable that warns the user that an update is available. +var updateAvailableWarnable = Register(&Warnable{ + Code: "update-available", + Title: "Update available", + Severity: SeverityLow, + Text: func(args Args) string { + return fmt.Sprintf("An update from version %s to %s is available. Run `tailscale update` or `tailscale set --auto-update` to update.", args[ArgCurrentVersion], args[ArgAvailableVersion]) + }, +}) + +// securityUpdateAvailableWarnable is a Warnable that warns the user that an important security update is available. +var securityUpdateAvailableWarnable = Register(&Warnable{ + Code: "security-update-available", + Title: "Security update available", + Severity: SeverityHigh, + Text: func(args Args) string { + return fmt.Sprintf("An urgent security update from version %s to %s is available. Run `tailscale update` or `tailscale set --auto-update` to update now.", args[ArgCurrentVersion], args[ArgAvailableVersion]) + }, +}) + +// unstableWarnable is a Warnable that warns the user that they are using an unstable version of Tailscale +// so they won't be surprised by all the issues that may arise. +var unstableWarnable = Register(&Warnable{ + Code: "is-using-unstable-version", + Title: "Using an unstable version", + Severity: SeverityLow, + Text: StaticMessage("This is an unstable version of Tailscale meant for testing and development purposes: please report any bugs to Tailscale."), +}) + +// NetworkStatusWarnable is a Warnable that warns the user that the network is down. +var NetworkStatusWarnable = Register(&Warnable{ + Code: "network-status", + Title: "Network down", + Severity: SeverityHigh, + Text: StaticMessage("Tailscale cannot connect because the network is down. (No network interface is up.)"), + ImpactsConnectivity: true, +}) + +// IPNStateWarnable is a Warnable that warns the user that Tailscale is stopped. +var IPNStateWarnable = Register(&Warnable{ + Code: "wantrunning-false", + Title: "Not connected to Tailscale", + Severity: SeverityLow, + Text: StaticMessage("Tailscale is stopped."), +}) + +// localLogWarnable is a Warnable that warns the user that the local log is misconfigured. +var localLogWarnable = Register(&Warnable{ + Code: "local-log-config-error", + Title: "Local log misconfiguration", + Severity: SeverityLow, + Text: func(args Args) string { + return fmt.Sprintf("The local log is misconfigured: %v", args[ArgError]) + }, +}) + +// LoginStateWarnable is a Warnable that warns the user that they are logged out, +// and provides the last login error if available. +var LoginStateWarnable = Register(&Warnable{ + Code: "login-state", + Title: "Logged out", + Severity: SeverityMedium, + Text: func(args Args) string { + if args[ArgError] != "" { + return fmt.Sprintf("You are logged out. The last login error was: %v", args[ArgError]) + } else { + return "You are logged out." + } + }, +}) + +// notInMapPollWarnable is a Warnable that warns the user that they cannot connect to the control server. +var notInMapPollWarnable = Register(&Warnable{ + Code: "not-in-map-poll", + Title: "Cannot connect to control server", + Severity: SeverityMedium, + DependsOn: []*Warnable{NetworkStatusWarnable}, + Text: StaticMessage("Cannot connect to the control server (not in map poll). Check your Internet connection."), +}) + +// noDERPHomeWarnable is a Warnable that warns the user that Tailscale doesn't have a home DERP. +var noDERPHomeWarnable = Register(&Warnable{ + Code: "no-derp-home", + Title: "No home relay server", + Severity: SeverityHigh, + DependsOn: []*Warnable{NetworkStatusWarnable}, + Text: StaticMessage("Tailscale could not connect to any relay server. Check your Internet connection."), +}) + +// noDERPConnectionWarnable is a Warnable that warns the user that Tailscale couldn't connect to a specific DERP server. +var noDERPConnectionWarnable = Register(&Warnable{ + Code: "no-derp-connection", + Title: "No relay server connection", + Severity: SeverityHigh, + DependsOn: []*Warnable{NetworkStatusWarnable}, + Text: func(args Args) string { + return fmt.Sprintf("Tailscale could not connect to the relay server '%s'. The server might be temporarily unavailable, or your Internet connection might be down.", args[ArgRegionID]) + }, +}) + +// derpTimeoutWarnable is a Warnable that warns the user that Tailscale hasn't heard from the home DERP region for a while. +var derpTimeoutWarnable = Register(&Warnable{ + Code: "derp-timed-out", + Title: "Relay server timed out", + Severity: SeverityMedium, + DependsOn: []*Warnable{NetworkStatusWarnable}, + Text: func(args Args) string { + return fmt.Sprintf("Tailscale hasn't heard from the home relay server (region %v) in %v. The server might be temporarily unavailable, or your Internet connection might be down.", args[ArgRegionID], args[ArgDuration]) + }, +}) + +// derpRegionErrorWarnable is a Warnable that warns the user that a DERP region is reporting an issue. +var derpRegionErrorWarnable = Register(&Warnable{ + Code: "derp-region-error", + Title: "Relay server error", + Severity: SeverityMedium, + DependsOn: []*Warnable{NetworkStatusWarnable}, + Text: func(args Args) string { + return fmt.Sprintf("The relay server #%v is reporting an issue: %v", args[ArgRegionID], args[ArgError]) + }, +}) + +// noUDP4BindWarnable is a Warnable that warns the user that Tailscale couldn't listen for incoming UDP connections. +var noUDP4BindWarnable = Register(&Warnable{ + Code: "no-udp4-bind", + Title: "Incoming connections may fail", + Severity: SeverityHigh, + DependsOn: []*Warnable{NetworkStatusWarnable}, + Text: StaticMessage("Tailscale couldn't listen for incoming UDP connections."), + ImpactsConnectivity: true, +}) + +// mapResponseTimeoutWarnable is a Warnable that warns the user that Tailscale hasn't received a network map from the coordination server in a while. +var mapResponseTimeoutWarnable = Register(&Warnable{ + Code: "mapresponse-timeout", + Title: "Network map response timeout", + Severity: SeverityMedium, + DependsOn: []*Warnable{NetworkStatusWarnable}, + Text: func(args Args) string { + return fmt.Sprintf("Tailscale hasn't received a network map from the coordination server in %s.", args[ArgDuration]) + }, +}) + +// tlsConnectionFailedWarnable is a Warnable that warns the user that Tailscale could not establish an encrypted connection with a server. +var tlsConnectionFailedWarnable = Register(&Warnable{ + Code: "tls-connection-failed", + Title: "Encrypted connection failed", + Severity: SeverityMedium, + DependsOn: []*Warnable{NetworkStatusWarnable}, + Text: func(args Args) string { + return fmt.Sprintf("Tailscale could not establish an encrypted connection with '%q': %v", args[ArgServerName], args[ArgError]) + }, +}) + +// magicsockReceiveFuncWarnable is a Warnable that warns the user that one of the Magicsock functions is not running. +var magicsockReceiveFuncWarnable = Register(&Warnable{ + Code: "magicsock-receive-func-error", + Title: "MagicSock function not running", + Severity: SeverityMedium, + Text: func(args Args) string { + return fmt.Sprintf("The MagicSock function %s is not running. You might experience connectivity issues.", args[ArgMagicsockFunctionName]) + }, +}) + +// testWarnable is a Warnable that is used within this package for testing purposes only. +var testWarnable = Register(&Warnable{ + Code: "test-warnable", + Title: "Test warnable", + Severity: SeverityLow, + Text: func(args Args) string { + return args[ArgError] + }, +}) + +// applyDiskConfigWarnable is a Warnable that warns the user that there was an error applying the envknob config stored on disk. +var applyDiskConfigWarnable = Register(&Warnable{ + Code: "apply-disk-config", + Title: "Could not apply configuration", + Severity: SeverityMedium, + Text: func(args Args) string { + return fmt.Sprintf("An error occurred applying the Tailscale envknob configuration stored on disk: %v", args[ArgError]) + }, +}) + +// controlHealthWarnable is a Warnable that warns the user that the coordination server is reporting an health issue. +var controlHealthWarnable = Register(&Warnable{ + Code: "control-health", + Title: "Coordination server reports an issue", + Severity: SeverityMedium, + Text: func(args Args) string { + return fmt.Sprintf("The coordination server is reporting an health issue: %v", args[ArgError]) + }, +}) diff --git a/ipn/backend.go b/ipn/backend.go index 6408d7e98..d6ba95408 100644 --- a/ipn/backend.go +++ b/ipn/backend.go @@ -9,6 +9,7 @@ import ( "time" "tailscale.com/drive" + "tailscale.com/health" "tailscale.com/ipn/ipnstate" "tailscale.com/tailcfg" "tailscale.com/types/empty" @@ -70,6 +71,8 @@ const ( NotifyNoPrivateKeys // if set, private keys that would normally be sent in updates are zeroed out NotifyInitialDriveShares // if set, the first Notify message (sent immediately) will contain the current Taildrive Shares NotifyInitialOutgoingFiles // if set, the first Notify message (sent immediately) will contain the current Taildrop OutgoingFiles + + NotifyInitialHealthState // if set, the first Notify message (sent immediately) will contain the current health.State of the client ) // Notify is a communication from a backend (e.g. tailscaled) to a frontend @@ -138,6 +141,11 @@ type Notify struct { // empty value means that there are no shares. DriveShares views.SliceView[*drive.Share, drive.ShareView] + // Health is the last-known health state of the backend. When this field is + // non-nil, a change in health verified, and the API client should surface + // any changes to the user in the UI. + Health *health.State `json:",omitempty"` + // type is mirrored in xcode/Shared/IPN.swift } @@ -177,6 +185,9 @@ func (n Notify) String() string { if n.LocalTCPPort != nil { fmt.Fprintf(&sb, "tcpport=%v ", n.LocalTCPPort) } + if n.Health != nil { + sb.WriteString("Health{...} ") + } s := sb.String() return s[0:len(s)-1] + "}" } diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index 213a00e21..c219a391e 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -666,12 +666,18 @@ func (b *LocalBackend) linkChange(delta *netmon.ChangeDelta) { } } -func (b *LocalBackend) onHealthChange(sys health.Subsystem, err error) { - if err == nil { - b.logf("health(%q): ok", sys) +func (b *LocalBackend) onHealthChange(w *health.Warnable, us *health.UnhealthyState) { + if us == nil { + b.logf("health(warnable=%s): ok", w.Code) } else { - b.logf("health(%q): error: %v", sys, err) + b.logf("health(warnable=%s): error: %s", w.Code, us.Text) } + + // Whenever health changes, send the current health state to the frontend. + state := b.health.CurrentState() + b.send(ipn.Notify{ + Health: state, + }) } // Shutdown halts the backend and all its sub-components. The backend @@ -788,7 +794,7 @@ func (b *LocalBackend) UpdateStatus(sb *ipnstate.StatusBuilder) { if prefs := b.pm.CurrentPrefs(); prefs.Valid() && prefs.AutoUpdate().Check { s.ClientVersion = b.lastClientVersion } - s.Health = b.health.AppendWarnings(s.Health) + s.Health = b.health.Strings() s.HaveNodeKey = b.hasNodeKeyLocked() // TODO(bradfitz): move this health check into a health.Warnable @@ -1870,7 +1876,13 @@ func (b *LocalBackend) Start(opts ipn.Options) error { return nil } -var warnInvalidUnsignedNodes = health.NewWarnable() +// invalidPacketFilterWarnable is a Warnable to warn the user that the control server sent an invalid packet filter. +var invalidPacketFilterWarnable = health.Register(&health.Warnable{ + Code: "invalid-packet-filter", + Title: "Invalid packet filter", + Severity: health.SeverityHigh, + Text: health.StaticMessage("The coordination server sent an invalid packet filter permitting traffic to unlocked nodes; rejecting all packets for safety"), +}) // updateFilterLocked updates the packet filter in wgengine based on the // given netMap and user preferences. @@ -1902,11 +1914,10 @@ func (b *LocalBackend) updateFilterLocked(netMap *netmap.NetworkMap, prefs ipn.P packetFilter = netMap.PacketFilter if packetFilterPermitsUnlockedNodes(b.peers, packetFilter) { - err := errors.New("server sent invalid packet filter permitting traffic to unlocked nodes; rejecting all packets for safety") - b.health.SetWarnable(warnInvalidUnsignedNodes, err) + b.health.SetUnhealthy(invalidPacketFilterWarnable, nil) packetFilter = nil } else { - b.health.SetWarnable(warnInvalidUnsignedNodes, nil) + b.health.SetHealthy(invalidPacketFilterWarnable) } } if prefs.Valid() { @@ -2309,6 +2320,9 @@ func (b *LocalBackend) WatchNotifications(ctx context.Context, mask ipn.NotifyWa if mask&ipn.NotifyInitialDriveShares != 0 && b.driveSharingEnabledLocked() { ini.DriveShares = b.pm.prefs.DriveShares() } + if mask&ipn.NotifyInitialHealthState != 0 { + ini.Health = b.HealthTracker().CurrentState() + } } mak.Set(&b.notifyWatchers, sessionID, &watchSession{ch, sessionID}) @@ -3120,20 +3134,31 @@ func (b *LocalBackend) isDefaultServerLocked() bool { return prefs.ControlURLOrDefault() == ipn.DefaultControlURL } -var warnExitNodeUsage = health.NewWarnable(health.WithConnectivityImpact()) +var exitNodeMisconfigurationWarnable = health.Register(&health.Warnable{ + Code: "exit-node-misconfiguration", + Title: "Exit node misconfiguration", + Severity: health.SeverityMedium, + Text: func(args health.Args) string { + return "Exit node misconfiguration: " + args[health.ArgError] + }, +}) // updateExitNodeUsageWarning updates a warnable meant to notify users of // configuration issues that could break exit node usage. -func updateExitNodeUsageWarning(p ipn.PrefsView, state *netmon.State, health *health.Tracker) { - var result error +func updateExitNodeUsageWarning(p ipn.PrefsView, state *netmon.State, healthTracker *health.Tracker) { + var msg string if p.ExitNodeIP().IsValid() || p.ExitNodeID() != "" { warn, _ := netutil.CheckReversePathFiltering(state) const comment = "please set rp_filter=2 instead of rp_filter=1; see https://github.com/tailscale/tailscale/issues/3310" if len(warn) > 0 { - result = fmt.Errorf("%s: %v, %s", healthmsg.WarnExitNodeUsage, warn, comment) + msg = fmt.Sprintf("%s: %v, %s", healthmsg.WarnExitNodeUsage, warn, comment) } } - health.SetWarnable(warnExitNodeUsage, result) + if len(msg) > 0 { + healthTracker.SetUnhealthy(exitNodeMisconfigurationWarnable, health.Args{health.ArgError: msg}) + } else { + healthTracker.SetHealthy(exitNodeMisconfigurationWarnable) + } } func (b *LocalBackend) checkExitNodePrefsLocked(p *ipn.Prefs) error { @@ -5841,13 +5866,18 @@ func (b *LocalBackend) sshServerOrInit() (_ SSHServer, err error) { return b.sshServer, nil } -var warnSSHSELinux = health.NewWarnable() +var warnSSHSELinuxWarnable = health.Register(&health.Warnable{ + Code: "ssh-unavailable-selinux-enabled", + Title: "Tailscale SSH and SELinux", + Severity: health.SeverityLow, + Text: health.StaticMessage("SELinux is enabled; Tailscale SSH may not work. See https://tailscale.com/s/ssh-selinux"), +}) func (b *LocalBackend) updateSELinuxHealthWarning() { if hostinfo.IsSELinuxEnforcing() { - b.health.SetWarnable(warnSSHSELinux, errors.New("SELinux is enabled; Tailscale SSH may not work. See https://tailscale.com/s/ssh-selinux")) + b.health.SetUnhealthy(warnSSHSELinuxWarnable, nil) } else { - b.health.SetWarnable(warnSSHSELinux, nil) + b.health.SetHealthy(warnSSHSELinuxWarnable) } } diff --git a/net/dns/direct_linux.go b/net/dns/direct_linux.go index 1de22e1b9..319462170 100644 --- a/net/dns/direct_linux.go +++ b/net/dns/direct_linux.go @@ -6,7 +6,6 @@ package dns import ( "bytes" "context" - "errors" "github.com/illarion/gonotify" "tailscale.com/health" @@ -58,7 +57,12 @@ func (m *directManager) runFileWatcher() { } } -var warnTrample = health.NewWarnable() +var resolvTrampleWarnable = health.Register(&health.Warnable{ + Code: "resolv-conf-overwritten", + Severity: health.SeverityMedium, + Title: "Linux DNS configuration issue", + Text: health.StaticMessage("Linux DNS config not ideal. /etc/resolv.conf overwritten. See https://tailscale.com/s/dns-fight"), +}) // checkForFileTrample checks whether /etc/resolv.conf has been trampled // by another program on the system. (e.g. a DHCP client) @@ -78,7 +82,7 @@ func (m *directManager) checkForFileTrample() { return } if bytes.Equal(cur, want) { - m.health.SetWarnable(warnTrample, nil) + m.health.SetHealthy(resolvTrampleWarnable) if lastWarn != nil { m.mu.Lock() m.lastWarnContents = nil @@ -101,7 +105,7 @@ func (m *directManager) checkForFileTrample() { show = show[:1024] } m.logf("trample: resolv.conf changed from what we expected. did some other program interfere? current contents: %q", show) - m.health.SetWarnable(warnTrample, errors.New("Linux DNS config not ideal. /etc/resolv.conf overwritten. See https://tailscale.com/s/dns-fight")) + m.health.SetUnhealthy(resolvTrampleWarnable, nil) } func (m *directManager) closeInotifyOnDone(ctx context.Context, in *gonotify.Inotify) { diff --git a/wgengine/router/ifconfig_windows.go b/wgengine/router/ifconfig_windows.go index b9b1873b8..779acc881 100644 --- a/wgengine/router/ifconfig_windows.go +++ b/wgengine/router/ifconfig_windows.go @@ -14,17 +14,18 @@ import ( "sort" "time" - ole "github.com/go-ole/go-ole" - "github.com/tailscale/wireguard-go/tun" - "go4.org/netipx" - "golang.org/x/sys/windows" - "golang.zx2c4.com/wireguard/windows/tunnel/winipcfg" "tailscale.com/health" "tailscale.com/net/netmon" "tailscale.com/net/tsaddr" "tailscale.com/net/tstun" "tailscale.com/util/multierr" "tailscale.com/wgengine/winnet" + + ole "github.com/go-ole/go-ole" + "github.com/tailscale/wireguard-go/tun" + "go4.org/netipx" + "golang.org/x/sys/windows" + "golang.zx2c4.com/wireguard/windows/tunnel/winipcfg" ) // monitorDefaultRoutes subscribes to route change events and updates @@ -235,9 +236,17 @@ func interfaceFromLUID(luid winipcfg.LUID, flags winipcfg.GAAFlags) (*winipcfg.I return nil, fmt.Errorf("interfaceFromLUID: interface with LUID %v not found", luid) } -var networkCategoryWarning = health.NewWarnable(health.WithMapDebugFlag("warn-network-category-unhealthy")) - -func configureInterface(cfg *Config, tun *tun.NativeTun, health *health.Tracker) (retErr error) { +var networkCategoryWarnable = health.Register(&health.Warnable{ + Code: "set-network-category-failed", + Severity: health.SeverityMedium, + Title: "Windows network configuration failed", + Text: func(args health.Args) string { + return fmt.Sprintf("Failed to set the network category to private on the Tailscale adapter. This may prevent Tailscale from working correctly. Error: %s", args[health.ArgError]) + }, + MapDebugFlag: "warn-network-category-unhealthy", +}) + +func configureInterface(cfg *Config, tun *tun.NativeTun, ht *health.Tracker) (retErr error) { var mtu = tstun.DefaultTUNMTU() luid := winipcfg.LUID(tun.LUID()) iface, err := interfaceFromLUID(luid, @@ -268,10 +277,10 @@ func configureInterface(cfg *Config, tun *tun.NativeTun, health *health.Tracker) for i := range tries { found, err := setPrivateNetwork(luid) if err != nil { - health.SetWarnable(networkCategoryWarning, fmt.Errorf("set-network-category: %w", err)) + ht.SetUnhealthy(networkCategoryWarnable, health.Args{health.ArgError: err.Error()}) log.Printf("setPrivateNetwork(try=%d): %v", i, err) } else { - health.SetWarnable(networkCategoryWarning, nil) + ht.SetHealthy(networkCategoryWarnable) if found { if i > 0 { log.Printf("setPrivateNetwork(try=%d): success", i) diff --git a/wgengine/router/router_linux.go b/wgengine/router/router_linux.go index a328075d9..2af73e26d 100644 --- a/wgengine/router/router_linux.go +++ b/wgengine/router/router_linux.go @@ -445,12 +445,17 @@ func (r *linuxRouter) Set(cfg *Config) error { return multierr.New(errs...) } -var warnStatefulFilteringWithDocker = health.NewWarnable() +var dockerStatefulFilteringWarnable = health.Register(&health.Warnable{ + Code: "docker-stateful-filtering", + Title: "Docker with stateful filtering", + Severity: health.SeverityMedium, + Text: health.StaticMessage("Stateful filtering is enabled and Docker was detected; this may prevent Docker containers on this host from resolving DNS and connecting to Tailscale nodes. See https://tailscale.com/s/stateful-docker"), +}) func (r *linuxRouter) updateStatefulFilteringWithDockerWarning(cfg *Config) { // If stateful filtering is disabled, clear the warning. if !r.statefulFiltering { - r.health.SetWarnable(warnStatefulFilteringWithDocker, nil) + r.health.SetHealthy(dockerStatefulFilteringWarnable) return } @@ -479,17 +484,13 @@ func (r *linuxRouter) updateStatefulFilteringWithDockerWarning(cfg *Config) { // socket/daemon/etc. ifstate := r.netMon.InterfaceState() if _, found := ifstate.Interface["docker0"]; found { - r.health.SetWarnable(warnStatefulFilteringWithDocker, fmt.Errorf(""+ - "Stateful filtering is enabled and Docker was detected; this may prevent Docker containers "+ - "on this host from resolving DNS and connecting to Tailscale nodes. "+ - "See https://tailscale.com/s/stateful-docker", - )) + r.health.SetUnhealthy(dockerStatefulFilteringWarnable, nil) return } } // If we get here, then we have no warnings; clear anything existing. - r.health.SetWarnable(warnStatefulFilteringWithDocker, nil) + r.health.SetHealthy(dockerStatefulFilteringWarnable) } // UpdateMagicsockPort implements the Router interface.