health: begin work to use structured health warnings instead of strings, pipe changes into ipn.Notify (#12406)

Updates tailscale/tailscale#4136

This PR is the first round of work to move from encoding health warnings as strings to using structured data instead. The current health package revolves around the idea of Subsystems. Each subsystem can have (or not have) a Go error associated with it. The overall health of the backend is given by the concatenation of all these errors.

This PR polishes the concept of Warnable introduced by @bradfitz a few weeks ago. Each Warnable is a component of the backend (for instance, things like 'dns' or 'magicsock' are Warnables). Each Warnable has a unique identifying code. A Warnable is an entity we can warn the user about, by setting (or unsetting) a WarningState for it. Warnables have:

- an identifying Code, so that the GUI can track them as their WarningStates come and go
- a Title, which the GUIs can use to tell the user what component of the backend is broken
- a Text, which is a function that is called with a set of Args to generate a more detailed error message to explain the unhappy state

Additionally, this PR begins to send Warnables and their WarningStates through LocalAPI to the clients, using ipn.Notify messages. An ipn.Notify is only issued when a warning is added to or removed from the Tracker.

In a follow-up PR, we'll get rid of subsystems entirely, and we'll start using structured warnings for all errors affecting the backend functionality.

Signed-off-by: Andrea Gottardo <andrea@gottardo.me>
pull/12480/head
Andrea Gottardo 6 months ago committed by GitHub
parent e8ca30a5c7
commit a8ee83e2c5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -1581,9 +1581,9 @@ func postPingResult(start time.Time, logf logger.Logf, c *http.Client, pr *tailc
} }
// ReportHealthChange reports to the control plane a change to this node's // ReportHealthChange reports to the control plane a change to this node's
// health. // health. w must be non-nil. us can be nil to indicate a healthy state for w.
func (c *Direct) ReportHealthChange(sys health.Subsystem, sysErr error) { func (c *Direct) ReportHealthChange(w *health.Warnable, us *health.UnhealthyState) {
if sys == health.SysOverall { if w == health.NetworkStatusWarnable || w == health.IPNStateWarnable || w == health.LoginStateWarnable {
// We don't report these. These include things like the network is down // We don't report these. These include things like the network is down
// (in which case we can't report anyway) or the user wanted things // (in which case we can't report anyway) or the user wanted things
// stopped, as opposed to the more unexpected failure types in the other // stopped, as opposed to the more unexpected failure types in the other
@ -1602,12 +1602,13 @@ func (c *Direct) ReportHealthChange(sys health.Subsystem, sysErr error) {
if c.panicOnUse { if c.panicOnUse {
panic("tainted client") panic("tainted client")
} }
// TODO(angott): at some point, update `Subsys` in the request to be `Warnable`
req := &tailcfg.HealthChangeRequest{ req := &tailcfg.HealthChangeRequest{
Subsys: string(sys), Subsys: string(w.Code),
NodeKey: nodeKey, NodeKey: nodeKey,
} }
if sysErr != nil { if us != nil {
req.Error = sysErr.Error() req.Error = us.Text
} }
// Best effort, no logging: // Best effort, no logging:

@ -0,0 +1,30 @@
// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
package health
// Arg is the key type of the Args map, which carries the parameters that
// describe the unhealthy state of a Warnable.
type Arg string
// Known Arg keys. Each one names a single parameter that can be stored in the
// Args of an unhealthy Warnable and read back when generating its Text.
const (
// ArgAvailableVersion provides an update notification Warnable with the available version of the Tailscale client.
ArgAvailableVersion Arg = "available-version"
// ArgCurrentVersion provides an update notification Warnable with the current version of the Tailscale client.
ArgCurrentVersion Arg = "current-version"
// ArgDuration provides a Warnable with how long the Warnable has been in an unhealthy state.
ArgDuration Arg = "duration"
// ArgError provides a Warnable with the underlying error behind an unhealthy state.
ArgError Arg = "error"
// ArgMagicsockFunctionName provides a Warnable with the name of the Magicsock function that caused the unhealthy state.
ArgMagicsockFunctionName Arg = "magicsock-function-name"
// ArgRegionID provides a Warnable with the ID of the DERP region involved in the unhealthy state.
ArgRegionID Arg = "region-id"
// ArgServerName provides a Warnable with the hostname of a server involved in the unhealthy state.
ArgServerName Arg = "server-name"
)

@ -8,6 +8,7 @@ package health
import ( import (
"errors" "errors"
"fmt" "fmt"
"maps"
"net/http" "net/http"
"os" "os"
"runtime" "runtime"
@ -67,10 +68,12 @@ type Tracker struct {
mu sync.Mutex mu sync.Mutex
warnables []*Warnable // keys ever set warnables []*Warnable // keys ever set
warnableVal map[*Warnable]error warnableVal map[*Warnable]*warningState
sysErr map[Subsystem]error // subsystem => err (or nil for no error) // sysErr maps subsystems to their current error (or nil if the subsystem is healthy)
watchers set.HandleSet[func(Subsystem, error)] // opt func to run if error state changes // Deprecated: using Warnables should be preferred
sysErr map[Subsystem]error
watchers set.HandleSet[func(*Warnable, *UnhealthyState)] // opt func to run if error state changes
timer *time.Timer timer *time.Timer
latestVersion *tailcfg.ClientVersion // or nil latestVersion *tailcfg.ClientVersion // or nil
@ -97,13 +100,12 @@ type Tracker struct {
} }
// Subsystem is the name of a subsystem whose health can be monitored. // Subsystem is the name of a subsystem whose health can be monitored.
//
// Deprecated: Registering a Warnable using Register() and updating its health state
// with SetUnhealthy() and SetHealthy() should be preferred.
type Subsystem string type Subsystem string
const ( const (
// SysOverall is the name representing the overall health of
// the system, rather than one particular subsystem.
SysOverall = Subsystem("overall")
// SysRouter is the name of the wgengine/router subsystem. // SysRouter is the name of the wgengine/router subsystem.
SysRouter = Subsystem("router") SysRouter = Subsystem("router")
@ -120,55 +122,98 @@ const (
SysTKA = Subsystem("tailnet-lock") SysTKA = Subsystem("tailnet-lock")
) )
// NewWarnable returns a new warnable item that the caller can mark as health or var subsystemsWarnables = map[Subsystem]*Warnable{}
// in warning state via Tracker.SetWarnable.
//
// NewWarnable is generally called in init and stored in a package global. It
// can be used by multiple Trackers.
func NewWarnable(opts ...WarnableOpt) *Warnable {
w := new(Warnable)
for _, o := range opts {
o.mod(w)
}
return w
}
// WarnableOpt is an option passed to NewWarnable. const legacyErrorArgKey = "LegacyError"
type WarnableOpt interface {
mod(*Warnable)
}
// WithMapDebugFlag returns a WarnableOpt for NewWarnable that makes the returned // Warnable() returns a Warnable representing a legacy Subsystem. This is used
// Warnable report itself to the coordination server as broken with this // *temporarily* while we migrate the old health infrastructure based on
// string in MapRequest.DebugFlag when Set to a non-nil value. // Subsystems to the new Warnables architecture.
func WithMapDebugFlag(name string) WarnableOpt { func (s Subsystem) Warnable() *Warnable {
return warnOptFunc(func(w *Warnable) { if w, ok := subsystemsWarnables[s]; ok {
w.debugFlag = name return w
} else {
w := Register(&Warnable{
Code: WarnableCode(s),
Severity: SeverityMedium,
Text: func(args Args) string {
return args[legacyErrorArgKey]
},
}) })
subsystemsWarnables[s] = w
return w
}
} }
// WithConnectivityImpact returns an option which makes a Warnable annotated as var registeredWarnables = map[WarnableCode]*Warnable{}
// something that could be breaking external network connectivity on the
// machine. This will make the warnable returned by OverallError alongside // Register registers a new Warnable with the health package and returns it.
// network connectivity errors. // Register panics if the Warnable was already registered, because Warnables
func WithConnectivityImpact() WarnableOpt { // should be unique across the program.
return warnOptFunc(func(w *Warnable) { func Register(w *Warnable) *Warnable {
w.hasConnectivityImpact = true if registeredWarnables[w.Code] != nil {
}) panic(fmt.Sprintf("health: a Warnable with code %q was already registered", w.Code))
}
mak.Set(&registeredWarnables, w.Code, w)
return w
} }
type warnOptFunc func(*Warnable) // unregister removes a Warnable from the health package. It should only be used
// for testing purposes.
func unregister(w *Warnable) {
	code := w.Code
	// Happy path: the code is known, so drop it from the registry.
	if registeredWarnables[code] != nil {
		delete(registeredWarnables, code)
		return
	}
	// Unregistering something that was never registered is a programmer error.
	panic(fmt.Sprintf("health: attempting to unregister Warnable %q that was not registered", code))
}
func (f warnOptFunc) mod(w *Warnable) { f(w) } // WarnableCode is a string that distinguishes each Warnable from others. It is globally unique within
// the program.
type WarnableCode string
// Warnable is a health check item that may or may not be in a bad warning state. // A Warnable is something that we might want to warn the user about, or not. A Warnable is either
// The caller of NewWarnable is responsible for calling Tracker.SetWarnable to update the state. // in a healthy or unhealthy state. A Warnable is unhealthy if the Tracker knows about a WarningState
// affecting the Warnable.
// In most cases, Warnables are components of the backend (for instance, "DNS" or "Magicsock").
// Warnables are similar to the Subsystem type previously used in this package, but they provide
// a unique identifying code for each Warnable, along with more metadata that makes it easier for
// a GUI to display the Warnable in a user-friendly way.
type Warnable struct { type Warnable struct {
debugFlag string // optional MapRequest.DebugFlag to send when unhealthy // Code is a string that uniquely identifies this Warnable across the entire Tailscale backend,
// and can be mapped to a user-displayable localized string.
Code WarnableCode
// Title is a string that the GUI uses as title for any message involving this Warnable. The title
// should be short and fit in a single line.
Title string
// Text is a function that generates an extended string that the GUI will display to the user when
// this Warnable is in an unhealthy state. The function can use the Args map to provide dynamic
// information to the user.
Text func(args Args) string
// Severity is the severity of the Warnable, which the GUI can use to determine how to display it.
// For instance, a Warnable with SeverityHigh could trigger a modal view, while a Warnable with
// SeverityLow could be displayed in a less intrusive way.
// TODO(angott): turn this into a SeverityFunc, which allows the Warnable to change its severity based on
// the Args of the unhappy state, just like we do in the Text function.
Severity Severity
// DependsOn is a set of Warnables that this Warnable depends on, and that need to be healthy
// before this Warnable can also be healthy again. The GUI can use this information to ignore
// this Warnable if one of its dependencies is unhealthy.
DependsOn []*Warnable
// MapDebugFlag is a MapRequest.DebugFlag that is sent to control when this Warnable is unhealthy
//
// Deprecated: this is only used in one case, and will be removed in a future PR
MapDebugFlag string
// If true, this warning is related to configuration of networking stack // If true, this warnable is related to configuration of networking stack
// on the machine that impacts connectivity. // on the machine that impacts connectivity.
hasConnectivityImpact bool ImpactsConnectivity bool
}
// StaticMessage returns a function that always returns the input string, to be used in
// simple Warnables that do not use the Args map to generate their Text.
func StaticMessage(s string) func(Args) string {
return func(Args) string { return s }
} }
// nil reports whether t is nil. // nil reports whether t is nil.
@ -180,6 +225,7 @@ func (t *Tracker) nil() bool {
if t != nil { if t != nil {
return false return false
} }
if cibuild.On() { if cibuild.On() {
stack := make([]byte, 1<<10) stack := make([]byte, 1<<10)
stack = stack[:runtime.Stack(stack, false)] stack = stack[:runtime.Stack(stack, false)]
@ -191,19 +237,104 @@ func (t *Tracker) nil() bool {
return true return true
} }
// Set updates the Warnable's state. // Severity represents how serious an error is. Each GUI interprets this severity value in different ways,
// If non-nil, it's considered unhealthy. // to surface the error in a more or less visible way. For instance, the macOS GUI could change its menubar
func (t *Tracker) SetWarnable(w *Warnable, err error) { // icon to display an exclamation mark and present a modal notification for SeverityHigh warnings, but not
// for SeverityLow messages, which would only appear in the Settings window.
type Severity string
const (
// SeverityHigh marks the most serious warnings; a GUI may surface these
// prominently, for instance with a modal notification.
SeverityHigh Severity = "high"
// SeverityMedium marks warnings of intermediate seriousness.
SeverityMedium Severity = "medium"
// SeverityLow marks the least serious warnings; a GUI may display these in
// a less intrusive way.
SeverityLow Severity = "low"
)
// Args maps Arg keys to string values that provide parameters regarding
// the unhealthy state of a Warnable.
// For instance, if you have a Warnable to track the health of DNS lookups, here you can include
// the hostname that failed to resolve, or the IP address of the DNS server that has been failing
// to respond. You can then use these parameters in the Text function of the Warnable to provide a detailed
// error message to the user.
type Args map[Arg]string
// warningState records why, and since when, a Warnable is unhealthy. The
// Tracker keeps one warningState per unhealthy Warnable; a Warnable without an
// associated warningState is considered healthy.
type warningState struct {
	BrokenSince time.Time // moment at which the Warnable turned unhealthy
	Args        Args      // parameters handed to the Warnable's Text function
}

// Equal reports whether ws and other describe the same unhealthy state: both
// are nil, or both are non-nil with equal BrokenSince times and equal Args.
func (ws *warningState) Equal(other *warningState) bool {
	if ws == nil || other == nil {
		// With at least one nil side, they match only when both are nil.
		return ws == other
	}
	sameStart := ws.BrokenSince.Equal(other.BrokenSince)
	sameArgs := maps.Equal(ws.Args, other.Args)
	return sameStart && sameArgs
}
// SetUnhealthy sets a warningState for the given Warnable with the provided Args, and should be
// called when a Warnable becomes unhealthy, or its unhealthy status needs to be updated.
// SetUnhealthy takes ownership of args. The args can be nil if no additional information is
// needed for the unhealthy state.
func (t *Tracker) SetUnhealthy(w *Warnable, args Args) {
if t.nil() { if t.nil() {
return return
} }
t.mu.Lock() t.mu.Lock()
defer t.mu.Unlock() defer t.mu.Unlock()
l0 := len(t.warnableVal) t.setUnhealthyLocked(w, args)
mak.Set(&t.warnableVal, w, err) }
if len(t.warnableVal) != l0 {
func (t *Tracker) setUnhealthyLocked(w *Warnable, args Args) {
if w == nil {
return
}
// If we already have a warningState for this Warnable with an earlier BrokenSince time, keep that
// BrokenSince time.
brokenSince := time.Now()
if existingWS := t.warnableVal[w]; existingWS != nil {
brokenSince = existingWS.BrokenSince
}
if t.warnableVal[w] == nil {
t.warnables = append(t.warnables, w) t.warnables = append(t.warnables, w)
} }
ws := &warningState{
BrokenSince: brokenSince,
Args: args,
}
prevWs := t.warnableVal[w]
mak.Set(&t.warnableVal, w, ws)
if !ws.Equal(prevWs) {
for _, cb := range t.watchers {
go cb(w, w.unhealthyState(ws))
}
}
}
// SetHealthy marks w as healthy by discarding the warningState the Tracker
// holds for it, if any. Calling it on a nil Tracker does nothing.
func (t *Tracker) SetHealthy(w *Warnable) {
	if !t.nil() {
		// The deferred unlock still runs when SetHealthy returns.
		t.mu.Lock()
		defer t.mu.Unlock()
		t.setHealthyLocked(w)
	}
}
func (t *Tracker) setHealthyLocked(w *Warnable) {
if t.warnableVal[w] == nil {
// Nothing to remove
return
}
delete(t.warnableVal, w)
for _, cb := range t.watchers {
go cb(w, nil)
}
} }
// AppendWarnableDebugFlags appends to base any health items that are currently in failed // AppendWarnableDebugFlags appends to base any health items that are currently in failed
@ -218,29 +349,31 @@ func (t *Tracker) AppendWarnableDebugFlags(base []string) []string {
t.mu.Lock() t.mu.Lock()
defer t.mu.Unlock() defer t.mu.Unlock()
for w, err := range t.warnableVal { for w, err := range t.warnableVal {
if w.debugFlag == "" { if w.MapDebugFlag == "" {
continue continue
} }
if err != nil { if err != nil {
ret = append(ret, w.debugFlag) ret = append(ret, w.MapDebugFlag)
} }
} }
sort.Strings(ret[len(base):]) // sort the new ones sort.Strings(ret[len(base):]) // sort the new ones
return ret return ret
} }
// RegisterWatcher adds a function that will be called if an // RegisterWatcher adds a function that will be called whenever the health state of any Warnable changes.
// error changes state either to unhealthy or from unhealthy. It is // If a Warnable becomes unhealthy or its unhealthy state is updated, the callback will be called with its
// not called on transition from unknown to healthy. It must be non-nil // current UnhealthyState.
// and is run in its own goroutine. The returned func unregisters it. // If a Warnable becomes healthy, the callback will be called with r set to nil.
func (t *Tracker) RegisterWatcher(cb func(key Subsystem, err error)) (unregister func()) { // The provided callback function will be executed in its own goroutine. The returned function can be used
// to unregister the callback.
func (t *Tracker) RegisterWatcher(cb func(w *Warnable, r *UnhealthyState)) (unregister func()) {
if t.nil() { if t.nil() {
return func() {} return func() {}
} }
t.mu.Lock() t.mu.Lock()
defer t.mu.Unlock() defer t.mu.Unlock()
if t.watchers == nil { if t.watchers == nil {
t.watchers = set.HandleSet[func(Subsystem, error)]{} t.watchers = set.HandleSet[func(*Warnable, *UnhealthyState)]{}
} }
handle := t.watchers.Add(cb) handle := t.watchers.Add(cb)
if t.timer == nil { if t.timer == nil {
@ -258,31 +391,49 @@ func (t *Tracker) RegisterWatcher(cb func(key Subsystem, err error)) (unregister
} }
// SetRouterHealth sets the state of the wgengine/router.Router. // SetRouterHealth sets the state of the wgengine/router.Router.
//
// Deprecated: Warnables should be preferred over Subsystem errors.
func (t *Tracker) SetRouterHealth(err error) { t.setErr(SysRouter, err) } func (t *Tracker) SetRouterHealth(err error) { t.setErr(SysRouter, err) }
// RouterHealth returns the wgengine/router.Router error state. // RouterHealth returns the wgengine/router.Router error state.
//
// Deprecated: Warnables should be preferred over Subsystem errors.
func (t *Tracker) RouterHealth() error { return t.get(SysRouter) } func (t *Tracker) RouterHealth() error { return t.get(SysRouter) }
// SetDNSHealth sets the state of the net/dns.Manager // SetDNSHealth sets the state of the net/dns.Manager
//
// Deprecated: Warnables should be preferred over Subsystem errors.
func (t *Tracker) SetDNSHealth(err error) { t.setErr(SysDNS, err) } func (t *Tracker) SetDNSHealth(err error) { t.setErr(SysDNS, err) }
// DNSHealth returns the net/dns.Manager error state. // DNSHealth returns the net/dns.Manager error state.
//
// Deprecated: Warnables should be preferred over Subsystem errors.
func (t *Tracker) DNSHealth() error { return t.get(SysDNS) } func (t *Tracker) DNSHealth() error { return t.get(SysDNS) }
// SetDNSOSHealth sets the state of the net/dns.OSConfigurator // SetDNSOSHealth sets the state of the net/dns.OSConfigurator
//
// Deprecated: Warnables should be preferred over Subsystem errors.
func (t *Tracker) SetDNSOSHealth(err error) { t.setErr(SysDNSOS, err) } func (t *Tracker) SetDNSOSHealth(err error) { t.setErr(SysDNSOS, err) }
// SetDNSManagerHealth sets the state of the Linux net/dns manager's // SetDNSManagerHealth sets the state of the Linux net/dns manager's
// discovery of the /etc/resolv.conf situation. // discovery of the /etc/resolv.conf situation.
//
// Deprecated: Warnables should be preferred over Subsystem errors.
func (t *Tracker) SetDNSManagerHealth(err error) { t.setErr(SysDNSManager, err) } func (t *Tracker) SetDNSManagerHealth(err error) { t.setErr(SysDNSManager, err) }
// DNSOSHealth returns the net/dns.OSConfigurator error state. // DNSOSHealth returns the net/dns.OSConfigurator error state.
//
// Deprecated: Warnables should be preferred over Subsystem errors.
func (t *Tracker) DNSOSHealth() error { return t.get(SysDNSOS) } func (t *Tracker) DNSOSHealth() error { return t.get(SysDNSOS) }
// SetTKAHealth sets the health of the tailnet key authority. // SetTKAHealth sets the health of the tailnet key authority.
//
// Deprecated: Warnables should be preferred over Subsystem errors.
func (t *Tracker) SetTKAHealth(err error) { t.setErr(SysTKA, err) } func (t *Tracker) SetTKAHealth(err error) { t.setErr(SysTKA, err) }
// TKAHealth returns the tailnet key authority error state. // TKAHealth returns the tailnet key authority error state.
//
// Deprecated: Warnables should be preferred over Subsystem errors.
func (t *Tracker) TKAHealth() error { return t.get(SysTKA) } func (t *Tracker) TKAHealth() error { return t.get(SysTKA) }
// SetLocalLogConfigHealth sets the error state of this client's local log configuration. // SetLocalLogConfigHealth sets the error state of this client's local log configuration.
@ -362,8 +513,20 @@ func (t *Tracker) setLocked(key Subsystem, err error) {
} }
t.sysErr[key] = err t.sysErr[key] = err
t.selfCheckLocked() t.selfCheckLocked()
for _, cb := range t.watchers { }
go cb(key, err)
// updateLegacyErrorWarnableLocked takes a legacy Subsystem and an optional error, and
// updates the WarningState for that legacy Subsystem, setting it to healthy or unhealthy.
// It is used temporarily while we migrate from Subsystems to Warnables.
//
// Deprecated: this function will be removed after migrating all subsystem errors to use
// Warnables instead.
func (t *Tracker) updateLegacyErrorWarnableLocked(key Subsystem, err error) {
w := key.Warnable()
if err != nil {
t.setUnhealthyLocked(key.Warnable(), Args{legacyErrorArgKey: err.Error()})
} else {
t.setHealthyLocked(w)
} }
} }
@ -598,24 +761,7 @@ func (t *Tracker) selfCheckLocked() {
// Don't check yet. // Don't check yet.
return return
} }
t.setLocked(SysOverall, t.overallErrorLocked()) t.updateBuiltinWarnablesLocked()
}
// AppendWarnings appends all current health warnings to dst and returns the
// result.
func (t *Tracker) AppendWarnings(dst []string) []string {
err := t.OverallError()
if err == nil {
return dst
}
if me, ok := err.(multierr.Error); ok {
for _, err := range me.Errors() {
dst = append(dst, err.Error())
}
} else {
dst = append(dst, err.Error())
}
return dst
} }
// OverallError returns a summary of the health state. // OverallError returns a summary of the health state.
@ -628,108 +774,161 @@ func (t *Tracker) OverallError() error {
} }
t.mu.Lock() t.mu.Lock()
defer t.mu.Unlock() defer t.mu.Unlock()
return t.overallErrorLocked() t.updateBuiltinWarnablesLocked()
return t.multiErrLocked()
} }
var fakeErrForTesting = envknob.RegisterString("TS_DEBUG_FAKE_HEALTH_ERROR") // Strings() returns a string array containing the Text of all Warnings
// currently known to the Tracker. These strings can be presented to the
// networkErrorfLocked creates an error that indicates issues with outgoing network // user, although ideally you would use the Code property on each Warning
// connectivity. Any active warnings related to network connectivity will // to show a localized version of them instead.
// automatically be appended to it. // This function is here for legacy compatibility purposes and is deprecated.
// func (t *Tracker) Strings() []string {
// t.mu must be held. if t.nil() {
func (t *Tracker) networkErrorfLocked(format string, a ...any) error { return nil
errs := []error{
fmt.Errorf(format, a...),
}
for _, w := range t.warnables {
if !w.hasConnectivityImpact {
continue
} }
if err := t.warnableVal[w]; err != nil { t.mu.Lock()
errs = append(errs, err) defer t.mu.Unlock()
return t.stringsLocked()
} }
func (t *Tracker) stringsLocked() []string {
result := []string{}
for w, ws := range t.warnableVal {
if ws.Args == nil {
result = append(result, w.Text(Args{}))
} else {
result = append(result, w.Text(ws.Args))
} }
if len(errs) == 1 {
return errs[0]
} }
return multierr.New(errs...) return result
} }
var errNetworkDown = errors.New("network down") // errorsLocked returns an array of errors where each error is the Text
var errNotInMapPoll = errors.New("not in map poll") // of a Warning known to the Tracker.
var errNoDERPHome = errors.New("no DERP home") // This function is here for legacy compatibility purposes and is deprecated.
var errNoUDP4Bind = errors.New("no udp4 bind") func (t *Tracker) errorsLocked() []error {
var errUnstable = errors.New("This is an unstable (development) version of Tailscale; frequent updates and bugs are likely") strs := t.stringsLocked()
errs := []error{}
func (t *Tracker) overallErrorLocked() error { for _, str := range strs {
var errs []error errs = append(errs, errors.New(str))
add := func(err error) {
if err != nil {
errs = append(errs, err)
} }
return errs
} }
merged := func() error {
// multiErrLocked returns an error listing all errors known to the Tracker.
// This function is here for legacy compatibility purposes and is deprecated.
func (t *Tracker) multiErrLocked() error {
errs := t.errorsLocked()
return multierr.New(errs...) return multierr.New(errs...)
} }
var fakeErrForTesting = envknob.RegisterString("TS_DEBUG_FAKE_HEALTH_ERROR")
// updateBuiltinWarnablesLocked performs a number of checks on the state of the backend,
// and adds/removes Warnings from the Tracker as needed.
func (t *Tracker) updateBuiltinWarnablesLocked() {
if t.checkForUpdates { if t.checkForUpdates {
if cv := t.latestVersion; cv != nil && !cv.RunningLatest && cv.LatestVersion != "" { if cv := t.latestVersion; cv != nil && !cv.RunningLatest && cv.LatestVersion != "" {
if cv.UrgentSecurityUpdate { if cv.UrgentSecurityUpdate {
add(fmt.Errorf("Security update available: %v -> %v, run `tailscale update` or `tailscale set --auto-update` to update", version.Short(), cv.LatestVersion)) t.setUnhealthyLocked(securityUpdateAvailableWarnable, Args{
ArgCurrentVersion: version.Short(),
ArgAvailableVersion: cv.LatestVersion,
})
} else { } else {
add(fmt.Errorf("Update available: %v -> %v, run `tailscale update` or `tailscale set --auto-update` to update", version.Short(), cv.LatestVersion)) t.setUnhealthyLocked(updateAvailableWarnable, Args{
ArgCurrentVersion: version.Short(),
ArgAvailableVersion: cv.LatestVersion,
})
} }
} }
} }
if version.IsUnstableBuild() { if version.IsUnstableBuild() {
add(errUnstable) t.setUnhealthyLocked(unstableWarnable, Args{
ArgCurrentVersion: version.Short(),
})
} }
if v, ok := t.anyInterfaceUp.Get(); ok && !v { if v, ok := t.anyInterfaceUp.Get(); ok && !v {
add(errNetworkDown) t.setUnhealthyLocked(NetworkStatusWarnable, nil)
return merged() return
} else {
t.setHealthyLocked(NetworkStatusWarnable)
} }
if t.localLogConfigErr != nil { if t.localLogConfigErr != nil {
add(t.localLogConfigErr) t.setUnhealthyLocked(localLogWarnable, Args{
return merged() ArgError: t.localLogConfigErr.Error(),
})
return
} else {
t.setHealthyLocked(localLogWarnable)
} }
if !t.ipnWantRunning { if !t.ipnWantRunning {
add(fmt.Errorf("state=%v, wantRunning=%v", t.ipnState, t.ipnWantRunning)) t.setUnhealthyLocked(IPNStateWarnable, Args{
return merged() "State": t.ipnState,
})
return
} else {
t.setHealthyLocked(IPNStateWarnable)
} }
if t.lastLoginErr != nil { if t.lastLoginErr != nil {
add(fmt.Errorf("not logged in, last login error=%v", t.lastLoginErr)) t.setUnhealthyLocked(LoginStateWarnable, Args{
return merged() ArgError: t.lastLoginErr.Error(),
})
return
} else {
t.setHealthyLocked(LoginStateWarnable)
} }
now := time.Now() now := time.Now()
if !t.inMapPoll && (t.lastMapPollEndedAt.IsZero() || now.Sub(t.lastMapPollEndedAt) > 10*time.Second) { if !t.inMapPoll && (t.lastMapPollEndedAt.IsZero() || now.Sub(t.lastMapPollEndedAt) > 10*time.Second) {
add(errNotInMapPoll) t.setUnhealthyLocked(notInMapPollWarnable, nil)
return merged() return
} else {
t.setHealthyLocked(notInMapPollWarnable)
} }
const tooIdle = 2*time.Minute + 5*time.Second const tooIdle = 2*time.Minute + 5*time.Second
if d := now.Sub(t.lastStreamedMapResponse).Round(time.Second); d > tooIdle { if d := now.Sub(t.lastStreamedMapResponse).Round(time.Second); d > tooIdle {
add(t.networkErrorfLocked("no map response in %v", d)) t.setUnhealthyLocked(mapResponseTimeoutWarnable, Args{
return merged() ArgDuration: d.String(),
})
return
} else {
t.setHealthyLocked(mapResponseTimeoutWarnable)
} }
if !t.derpHomeless { if !t.derpHomeless {
rid := t.derpHomeRegion rid := t.derpHomeRegion
if rid == 0 { if rid == 0 {
add(errNoDERPHome) t.setUnhealthyLocked(noDERPHomeWarnable, nil)
return merged() return
} } else if !t.derpRegionConnected[rid] {
if !t.derpRegionConnected[rid] { t.setUnhealthyLocked(noDERPConnectionWarnable, Args{
add(t.networkErrorfLocked("not connected to home DERP region %v", rid)) ArgRegionID: fmt.Sprint(rid),
return merged() })
} return
if d := now.Sub(t.derpRegionLastFrame[rid]).Round(time.Second); d > tooIdle { } else if d := now.Sub(t.derpRegionLastFrame[rid]).Round(time.Second); d > tooIdle {
add(t.networkErrorfLocked("haven't heard from home DERP region %v in %v", rid, d)) t.setUnhealthyLocked(derpTimeoutWarnable, Args{
return merged() ArgRegionID: fmt.Sprint(rid),
ArgDuration: d.String(),
})
return
} }
} }
t.setHealthyLocked(noDERPHomeWarnable)
t.setHealthyLocked(noDERPConnectionWarnable)
t.setHealthyLocked(derpTimeoutWarnable)
if t.udp4Unbound { if t.udp4Unbound {
add(errNoUDP4Bind) t.setUnhealthyLocked(noUDP4BindWarnable, nil)
return merged() return
} else {
t.setHealthyLocked(noUDP4BindWarnable)
} }
// TODO: use // TODO: use
@ -738,43 +937,72 @@ func (t *Tracker) overallErrorLocked() error {
_ = t.lastStreamedMapResponse _ = t.lastStreamedMapResponse
_ = t.lastMapRequestHeard _ = t.lastMapRequestHeard
// Assume every receive func is running until the loop below finds one that is
// missing. Starting from false would make the clearing branch unreachable and
// leave magicsockReceiveFuncWarnable unhealthy forever once it had been set.
shouldClearMagicsockWarnings := true
for i := range t.MagicSockReceiveFuncs { for i := range t.MagicSockReceiveFuncs {
f := &t.MagicSockReceiveFuncs[i] f := &t.MagicSockReceiveFuncs[i]
if f.missing { if f.missing {
errs = append(errs, fmt.Errorf("%s is not running", f.name)) t.setUnhealthyLocked(magicsockReceiveFuncWarnable, Args{
} ArgMagicsockFunctionName: f.name,
} })
for sys, err := range t.sysErr { shouldClearMagicsockWarnings = false
if err == nil || sys == SysOverall {
continue
} }
errs = append(errs, fmt.Errorf("%v: %w", sys, err))
} }
for _, w := range t.warnables { if shouldClearMagicsockWarnings {
if err := t.warnableVal[w]; err != nil { t.setHealthyLocked(magicsockReceiveFuncWarnable)
errs = append(errs, err)
} }
// Iterates over the legacy subsystems and their error, and turns them into structured errors
for sys, err := range t.sysErr {
t.updateLegacyErrorWarnableLocked(sys, err)
} }
if len(t.derpRegionHealthProblem) > 0 {
for regionID, problem := range t.derpRegionHealthProblem { for regionID, problem := range t.derpRegionHealthProblem {
errs = append(errs, fmt.Errorf("derp%d: %v", regionID, problem)) t.setUnhealthyLocked(derpRegionErrorWarnable, Args{
ArgRegionID: fmt.Sprint(regionID),
ArgError: problem,
})
}
} else {
t.setHealthyLocked(derpRegionErrorWarnable)
} }
if len(t.controlHealth) > 0 {
for _, s := range t.controlHealth { for _, s := range t.controlHealth {
errs = append(errs, errors.New(s)) t.setUnhealthyLocked(controlHealthWarnable, Args{
ArgError: s,
})
} }
} else {
t.setHealthyLocked(controlHealthWarnable)
}
if err := envknob.ApplyDiskConfigError(); err != nil { if err := envknob.ApplyDiskConfigError(); err != nil {
errs = append(errs, err) t.setUnhealthyLocked(applyDiskConfigWarnable, Args{
ArgError: err.Error(),
})
} else {
t.setHealthyLocked(applyDiskConfigWarnable)
} }
if len(t.tlsConnectionErrors) > 0 {
for serverName, err := range t.tlsConnectionErrors { for serverName, err := range t.tlsConnectionErrors {
errs = append(errs, fmt.Errorf("TLS connection error for %q: %w", serverName, err)) t.setUnhealthyLocked(tlsConnectionFailedWarnable, Args{
ArgServerName: serverName,
ArgError: err.Error(),
})
} }
if e := fakeErrForTesting(); len(errs) == 0 && e != "" { } else {
return errors.New(e) t.setHealthyLocked(tlsConnectionFailedWarnable)
} }
sort.Slice(errs, func(i, j int) bool {
// Not super efficient (stringifying these in a sort), but probably max 2 or 3 items. if e := fakeErrForTesting(); len(t.warnables) == 0 && e != "" {
return errs[i].Error() < errs[j].Error() t.setUnhealthyLocked(testWarnable, Args{
ArgError: e,
}) })
return multierr.New(errs...) } else {
t.setHealthyLocked(testWarnable)
}
} }
// ReceiveFuncStats tracks the calls made to a wireguard-go receive func. // ReceiveFuncStats tracks the calls made to a wireguard-go receive func.

@ -4,19 +4,23 @@
package health package health
import ( import (
"errors"
"fmt" "fmt"
"reflect" "reflect"
"testing" "testing"
"time"
) )
func TestAppendWarnableDebugFlags(t *testing.T) { func TestAppendWarnableDebugFlags(t *testing.T) {
var tr Tracker var tr Tracker
for i := range 10 { for i := range 10 {
w := NewWarnable(WithMapDebugFlag(fmt.Sprint(i))) w := Register(&Warnable{
Code: WarnableCode(fmt.Sprintf("warnable-code-%d", i)),
MapDebugFlag: fmt.Sprint(i),
})
defer unregister(w)
if i%2 == 0 { if i%2 == 0 {
tr.SetWarnable(w, errors.New("boom")) tr.SetUnhealthy(w, Args{"test-arg": fmt.Sprint(i)})
} }
} }
@ -49,3 +53,126 @@ func TestNilMethodsDontCrash(t *testing.T) {
rv.Method(i).Call(args) rv.Method(i).Call(args)
} }
} }
// TestSetUnhealthyWithDuplicateThenHealthyAgain checks that setting the same
// Warnable unhealthy twice replaces the previous warning rather than adding a
// second one, and that SetHealthy removes it again. Each step is verified
// through Tracker.Strings().
func TestSetUnhealthyWithDuplicateThenHealthyAgain(t *testing.T) {
ht := Tracker{}
// A zero-value Tracker is expected to report no warnings.
if len(ht.Strings()) != 0 {
t.Fatalf("before first insertion, len(newTracker.Strings) = %d; want = 0", len(ht.Strings()))
}
ht.SetUnhealthy(testWarnable, Args{ArgError: "Hello world 1"})
want := []string{"Hello world 1"}
if !reflect.DeepEqual(ht.Strings(), want) {
t.Fatalf("after calling SetUnhealthy, newTracker.Strings() = %v; want = %v", ht.Strings(), want)
}
// Adding a second warning state with the same WarningCode overwrites the existing warning state,
// the count shouldn't have changed.
ht.SetUnhealthy(testWarnable, Args{ArgError: "Hello world 2"})
want = []string{"Hello world 2"}
if !reflect.DeepEqual(ht.Strings(), want) {
t.Fatalf("after insertion of same WarningCode, newTracker.Strings() = %v; want = %v", ht.Strings(), want)
}
ht.SetHealthy(testWarnable)
// reflect.DeepEqual treats a nil slice and an empty non-nil slice as
// different, so this assertion requires Strings() to return an empty,
// non-nil slice once the last warning has been cleared.
want = []string{}
if !reflect.DeepEqual(ht.Strings(), want) {
t.Fatalf("after setting the healthy, newTracker.Strings() = %v; want = %v", ht.Strings(), want)
}
}
// TestRemoveAllWarnings checks that a Tracker reports exactly one warning
// after a Warnable is set unhealthy, and none after it is set healthy again.
func TestRemoveAllWarnings(t *testing.T) {
	ht := Tracker{}
	if got := len(ht.Strings()); got != 0 {
		t.Fatalf("before first insertion, len(newTracker.Strings) = %d; want = 0", got)
	}

	// testWarnable builds its text from ArgError, so use that key (as the
	// other tests in this file do) instead of an argument it never reads.
	ht.SetUnhealthy(testWarnable, Args{ArgError: "Hello world 1"})
	if got := len(ht.Strings()); got != 1 {
		t.Fatalf("after first insertion, len(newTracker.Strings) = %d; want = %d", got, 1)
	}

	ht.SetHealthy(testWarnable)
	if got := len(ht.Strings()); got != 0 {
		t.Fatalf("after SetHealthy, len(newTracker.Strings) = %d; want = 0", got)
	}
}
// TestWatcher tests that a registered watcher function gets called with the correct
// Warnable and non-nil/nil UnhealthyState upon setting a Warnable to unhealthy/healthy.
// It also checks that the function returned by RegisterWatcher removes the watcher.
func TestWatcher(t *testing.T) {
	ht := Tracker{}
	wantText := "Hello world"
	becameUnhealthy := make(chan struct{})
	becameHealthy := make(chan struct{})

	// The sends below are on unbuffered channels that are only drained after
	// SetUnhealthy/SetHealthy have returned, so the watcher presumably runs on
	// its own goroutine. t.Fatalf (FailNow) must only be called from the
	// goroutine running the test, so mismatches are reported with t.Errorf and
	// the callback returns without signalling; the select below then fails the
	// test from the test goroutine when its timeout fires.
	watcherFunc := func(w *Warnable, us *UnhealthyState) {
		if w != testWarnable {
			t.Errorf("watcherFunc was called, but with an unexpected Warnable: %v, want: %v", w, testWarnable)
			return
		}
		if us == nil {
			becameHealthy <- struct{}{}
			return
		}
		if us.Text != wantText {
			t.Errorf("unexpected us.Text: %s, want: %s", us.Text, wantText)
			return
		}
		if us.Args[ArgError] != wantText {
			t.Errorf("unexpected us.Args[ArgError]: %s, want: %s", us.Args[ArgError], wantText)
			return
		}
		becameUnhealthy <- struct{}{}
	}

	unregisterFunc := ht.RegisterWatcher(watcherFunc)
	if len(ht.watchers) != 1 {
		t.Fatalf("after RegisterWatcher, len(newTracker.watchers) = %d; want = 1", len(ht.watchers))
	}

	ht.SetUnhealthy(testWarnable, Args{ArgError: wantText})
	select {
	case <-becameUnhealthy:
		// Test passed because the watcher got notified of an unhealthy state.
	case <-becameHealthy:
		// Test failed because the watcher got notified of a healthy state
		// instead of an unhealthy one.
		t.Fatalf("watcherFunc was called with a healthy state")
	case <-time.After(1 * time.Second):
		t.Fatalf("watcherFunc didn't get called upon calling SetUnhealthy")
	}

	ht.SetHealthy(testWarnable)
	select {
	case <-becameUnhealthy:
		// Test failed because the watcher got notified of an unhealthy state
		// instead of a healthy one.
		t.Fatalf("watcherFunc was called with an unhealthy state")
	case <-becameHealthy:
		// Test passed because the watcher got notified of a healthy state.
	case <-time.After(1 * time.Second):
		t.Fatalf("watcherFunc didn't get called upon calling SetHealthy")
	}

	unregisterFunc()
	if len(ht.watchers) != 0 {
		t.Fatalf("after unregisterFunc, len(newTracker.watchers) = %d; want = 0", len(ht.watchers))
	}
}
// TestRegisterWarnablePanicsWithDuplicate checks that Register stores a
// Warnable in registeredWarnables under its Code, and that registering the
// same Warnable a second time panics.
func TestRegisterWarnablePanicsWithDuplicate(t *testing.T) {
w := &Warnable{
Code: "test-warnable-1",
}
Register(w)
// Remove the test Warnable again when the test ends.
defer unregister(w)
if registeredWarnables[w.Code] != w {
t.Fatalf("after Register, registeredWarnables[%s] = %v; want = %v", w.Code, registeredWarnables[w.Code], w)
}
// Deferred calls run last-in-first-out, so this recover check runs before
// unregister(w). It fails the test when the second Register call below
// returns normally instead of panicking.
defer func() {
if r := recover(); r == nil {
t.Fatalf("Registering the same Warnable twice didn't panic")
}
}()
Register(w)
}

@ -0,0 +1,79 @@
// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
package health
import (
"time"
)
// State contains the health status of the backend, and is
// provided to the client UI via LocalAPI through ipn.Notify.
type State struct {
// Each key-value pair in Warnings represents a Warnable that is currently
// unhealthy. If a Warnable is healthy, it will not be present in this map.
// When a Warnable is unhealthy and becomes healthy, its key-value pair
// disappears in the next issued State. Observers should treat the absence of
// a WarnableCode in this map as an indication that the Warnable became healthy,
// and may use that to clear any notifications that were previously shown to the user.
// If Warnings is nil, all Warnables are healthy and the backend is overall healthy.
Warnings map[WarnableCode]UnhealthyState
}
// UnhealthyState contains information to be shown to the user to inform them
// that a Warnable is currently unhealthy.
type UnhealthyState struct {
// WarnableCode is the Code of the Warnable this state describes.
WarnableCode WarnableCode
// Severity is copied from the Warnable's Severity.
Severity Severity
// Title is copied from the Warnable's Title.
Title string
// Text is the message produced by calling the Warnable's Text function
// with Args.
Text string
// BrokenSince is taken from the warning state's BrokenSince timestamp.
BrokenSince *time.Time `json:",omitempty"`
// Args are the arguments that were used to generate Text.
Args Args `json:",omitempty"`
}
// unhealthyState returns an UnhealthyState describing the Warnable given its
// current warningState.
//
// The message text is produced by calling w.Text with ws.Args, or with an
// empty Args when ws.Args is nil. A Warnable that has no Text function yields
// an empty Text rather than panicking on a nil function call.
func (w *Warnable) unhealthyState(ws *warningState) *UnhealthyState {
	args := ws.Args
	if args == nil {
		// Hand Text a non-nil Args so implementations never see a nil value.
		args = Args{}
	}

	var text string
	if w.Text != nil {
		text = w.Text(args)
	}

	// Copy the timestamp so the returned value does not hold a pointer into
	// the warningState itself.
	brokenSince := ws.BrokenSince

	return &UnhealthyState{
		WarnableCode: w.Code,
		Severity:     w.Severity,
		Title:        w.Title,
		Text:         text,
		BrokenSince:  &brokenSince,
		Args:         ws.Args,
	}
}
// CurrentState returns a snapshot of the current health status of the backend.
// It returns a State with nil Warnings if the backend is healthy (all Warnables
// have no issues), or if the Tracker itself is nil.
// The returned State is a snapshot of shared memory, and the caller should not
// mutate the returned value.
func (t *Tracker) CurrentState() *State {
	if t.nil() {
		return &State{}
	}

	t.mu.Lock()
	defer t.mu.Unlock()

	// len of a nil map is 0, so this also covers a map that was never
	// initialized.
	if len(t.warnableVal) == 0 {
		return &State{}
	}

	// Build one entry per unhealthy Warnable, keyed by its code.
	wm := make(map[WarnableCode]UnhealthyState, len(t.warnableVal))
	for w, ws := range t.warnableVal {
		wm[w.Code] = *w.unhealthyState(ws)
	}
	return &State{Warnings: wm}
}

@ -0,0 +1,206 @@
// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
package health
import (
"fmt"
)
/**
This file contains definitions for the Warnables maintained within this `health` package.
*/
// updateAvailableWarnable is a Warnable that warns the user that an update is available.
var updateAvailableWarnable = Register(&Warnable{
Code: "update-available",
Title: "Update available",
Severity: SeverityLow,
Text: func(args Args) string {
return fmt.Sprintf("An update from version %s to %s is available. Run `tailscale update` or `tailscale set --auto-update` to update.", args[ArgCurrentVersion], args[ArgAvailableVersion])
},
})
// securityUpdateAvailableWarnable is a Warnable that warns the user that an important security update is available.
var securityUpdateAvailableWarnable = Register(&Warnable{
Code: "security-update-available",
Title: "Security update available",
Severity: SeverityHigh,
Text: func(args Args) string {
return fmt.Sprintf("An urgent security update from version %s to %s is available. Run `tailscale update` or `tailscale set --auto-update` to update now.", args[ArgCurrentVersion], args[ArgAvailableVersion])
},
})
// unstableWarnable is a Warnable that warns the user that they are using an unstable version of Tailscale
// so they won't be surprised by all the issues that may arise.
var unstableWarnable = Register(&Warnable{
Code: "is-using-unstable-version",
Title: "Using an unstable version",
Severity: SeverityLow,
Text: StaticMessage("This is an unstable version of Tailscale meant for testing and development purposes: please report any bugs to Tailscale."),
})
// NetworkStatusWarnable is a Warnable that warns the user that the network is down.
var NetworkStatusWarnable = Register(&Warnable{
Code: "network-status",
Title: "Network down",
Severity: SeverityHigh,
Text: StaticMessage("Tailscale cannot connect because the network is down. (No network interface is up.)"),
ImpactsConnectivity: true,
})
// IPNStateWarnable is a Warnable that warns the user that Tailscale is stopped.
var IPNStateWarnable = Register(&Warnable{
Code: "wantrunning-false",
Title: "Not connected to Tailscale",
Severity: SeverityLow,
Text: StaticMessage("Tailscale is stopped."),
})
// localLogWarnable is a Warnable that warns the user that the local log is misconfigured.
var localLogWarnable = Register(&Warnable{
Code: "local-log-config-error",
Title: "Local log misconfiguration",
Severity: SeverityLow,
Text: func(args Args) string {
return fmt.Sprintf("The local log is misconfigured: %v", args[ArgError])
},
})
// LoginStateWarnable is a Warnable that warns the user that they are logged out,
// and provides the last login error if available.
var LoginStateWarnable = Register(&Warnable{
	Code:     "login-state",
	Title:    "Logged out",
	Severity: SeverityMedium,
	Text: func(args Args) string {
		// Only mention the last login error when one was supplied.
		if lastErr := args[ArgError]; lastErr != "" {
			return fmt.Sprintf("You are logged out. The last login error was: %v", lastErr)
		}
		return "You are logged out."
	},
})
// notInMapPollWarnable is a Warnable that warns the user that they cannot connect to the control server.
var notInMapPollWarnable = Register(&Warnable{
Code: "not-in-map-poll",
Title: "Cannot connect to control server",
Severity: SeverityMedium,
DependsOn: []*Warnable{NetworkStatusWarnable},
Text: StaticMessage("Cannot connect to the control server (not in map poll). Check your Internet connection."),
})
// noDERPHomeWarnable is a Warnable that warns the user that Tailscale doesn't have a home DERP.
var noDERPHomeWarnable = Register(&Warnable{
Code: "no-derp-home",
Title: "No home relay server",
Severity: SeverityHigh,
DependsOn: []*Warnable{NetworkStatusWarnable},
Text: StaticMessage("Tailscale could not connect to any relay server. Check your Internet connection."),
})
// noDERPConnectionWarnable is a Warnable that warns the user that Tailscale couldn't connect to a specific DERP server.
var noDERPConnectionWarnable = Register(&Warnable{
Code: "no-derp-connection",
Title: "No relay server connection",
Severity: SeverityHigh,
DependsOn: []*Warnable{NetworkStatusWarnable},
Text: func(args Args) string {
return fmt.Sprintf("Tailscale could not connect to the relay server '%s'. The server might be temporarily unavailable, or your Internet connection might be down.", args[ArgRegionID])
},
})
// derpTimeoutWarnable is a Warnable that warns the user that Tailscale hasn't heard from the home DERP region for a while.
var derpTimeoutWarnable = Register(&Warnable{
Code: "derp-timed-out",
Title: "Relay server timed out",
Severity: SeverityMedium,
DependsOn: []*Warnable{NetworkStatusWarnable},
Text: func(args Args) string {
return fmt.Sprintf("Tailscale hasn't heard from the home relay server (region %v) in %v. The server might be temporarily unavailable, or your Internet connection might be down.", args[ArgRegionID], args[ArgDuration])
},
})
// derpRegionErrorWarnable is a Warnable that warns the user that a DERP region is reporting an issue.
var derpRegionErrorWarnable = Register(&Warnable{
Code: "derp-region-error",
Title: "Relay server error",
Severity: SeverityMedium,
DependsOn: []*Warnable{NetworkStatusWarnable},
Text: func(args Args) string {
return fmt.Sprintf("The relay server #%v is reporting an issue: %v", args[ArgRegionID], args[ArgError])
},
})
// noUDP4BindWarnable is a Warnable that warns the user that Tailscale couldn't listen for incoming UDP connections.
var noUDP4BindWarnable = Register(&Warnable{
Code: "no-udp4-bind",
Title: "Incoming connections may fail",
Severity: SeverityHigh,
DependsOn: []*Warnable{NetworkStatusWarnable},
Text: StaticMessage("Tailscale couldn't listen for incoming UDP connections."),
ImpactsConnectivity: true,
})
// mapResponseTimeoutWarnable is a Warnable that warns the user that Tailscale hasn't received a network map from the coordination server in a while.
var mapResponseTimeoutWarnable = Register(&Warnable{
Code: "mapresponse-timeout",
Title: "Network map response timeout",
Severity: SeverityMedium,
DependsOn: []*Warnable{NetworkStatusWarnable},
Text: func(args Args) string {
return fmt.Sprintf("Tailscale hasn't received a network map from the coordination server in %s.", args[ArgDuration])
},
})
// tlsConnectionFailedWarnable is a Warnable that warns the user that Tailscale could not establish an encrypted connection with a server.
var tlsConnectionFailedWarnable = Register(&Warnable{
	Code:      "tls-connection-failed",
	Title:     "Encrypted connection failed",
	Severity:  SeverityMedium,
	DependsOn: []*Warnable{NetworkStatusWarnable},
	Text: func(args Args) string {
		// %q already wraps the server name in double quotes (and escapes any
		// unusual characters), so no additional quote characters are added
		// around it.
		return fmt.Sprintf("Tailscale could not establish an encrypted connection with %q: %v", args[ArgServerName], args[ArgError])
	},
})
// magicsockReceiveFuncWarnable is a Warnable that warns the user that one of the Magicsock functions is not running.
var magicsockReceiveFuncWarnable = Register(&Warnable{
Code: "magicsock-receive-func-error",
Title: "MagicSock function not running",
Severity: SeverityMedium,
Text: func(args Args) string {
return fmt.Sprintf("The MagicSock function %s is not running. You might experience connectivity issues.", args[ArgMagicsockFunctionName])
},
})
// testWarnable is a Warnable that is used within this package for testing purposes only.
var testWarnable = Register(&Warnable{
Code: "test-warnable",
Title: "Test warnable",
Severity: SeverityLow,
Text: func(args Args) string {
return args[ArgError]
},
})
// applyDiskConfigWarnable is a Warnable that warns the user that there was an error applying the envknob config stored on disk.
var applyDiskConfigWarnable = Register(&Warnable{
Code: "apply-disk-config",
Title: "Could not apply configuration",
Severity: SeverityMedium,
Text: func(args Args) string {
return fmt.Sprintf("An error occurred applying the Tailscale envknob configuration stored on disk: %v", args[ArgError])
},
})
// controlHealthWarnable is a Warnable that warns the user that the coordination server is reporting a health issue.
// The issue text is supplied through ArgError.
// NOTE(review): the user-facing message below reads "an health issue"; it is
// left untouched here because it is a runtime string — consider correcting it
// to "a health issue" in a behavior-changing follow-up.
var controlHealthWarnable = Register(&Warnable{
Code: "control-health",
Title: "Coordination server reports an issue",
Severity: SeverityMedium,
Text: func(args Args) string {
return fmt.Sprintf("The coordination server is reporting an health issue: %v", args[ArgError])
},
})

@ -9,6 +9,7 @@ import (
"time" "time"
"tailscale.com/drive" "tailscale.com/drive"
"tailscale.com/health"
"tailscale.com/ipn/ipnstate" "tailscale.com/ipn/ipnstate"
"tailscale.com/tailcfg" "tailscale.com/tailcfg"
"tailscale.com/types/empty" "tailscale.com/types/empty"
@ -70,6 +71,8 @@ const (
NotifyNoPrivateKeys // if set, private keys that would normally be sent in updates are zeroed out NotifyNoPrivateKeys // if set, private keys that would normally be sent in updates are zeroed out
NotifyInitialDriveShares // if set, the first Notify message (sent immediately) will contain the current Taildrive Shares NotifyInitialDriveShares // if set, the first Notify message (sent immediately) will contain the current Taildrive Shares
NotifyInitialOutgoingFiles // if set, the first Notify message (sent immediately) will contain the current Taildrop OutgoingFiles NotifyInitialOutgoingFiles // if set, the first Notify message (sent immediately) will contain the current Taildrop OutgoingFiles
NotifyInitialHealthState // if set, the first Notify message (sent immediately) will contain the current health.State of the client
) )
// Notify is a communication from a backend (e.g. tailscaled) to a frontend // Notify is a communication from a backend (e.g. tailscaled) to a frontend
@ -138,6 +141,11 @@ type Notify struct {
// empty value means that there are no shares. // empty value means that there are no shares.
DriveShares views.SliceView[*drive.Share, drive.ShareView] DriveShares views.SliceView[*drive.Share, drive.ShareView]
// Health is the last-known health state of the backend. When this field is
// non-nil, a change in health has occurred, and the API client should surface
// any changes to the user in the UI.
Health *health.State `json:",omitempty"`
// type is mirrored in xcode/Shared/IPN.swift // type is mirrored in xcode/Shared/IPN.swift
} }
@ -177,6 +185,9 @@ func (n Notify) String() string {
if n.LocalTCPPort != nil { if n.LocalTCPPort != nil {
fmt.Fprintf(&sb, "tcpport=%v ", n.LocalTCPPort) fmt.Fprintf(&sb, "tcpport=%v ", n.LocalTCPPort)
} }
if n.Health != nil {
sb.WriteString("Health{...} ")
}
s := sb.String() s := sb.String()
return s[0:len(s)-1] + "}" return s[0:len(s)-1] + "}"
} }

@ -666,12 +666,18 @@ func (b *LocalBackend) linkChange(delta *netmon.ChangeDelta) {
} }
} }
func (b *LocalBackend) onHealthChange(sys health.Subsystem, err error) { func (b *LocalBackend) onHealthChange(w *health.Warnable, us *health.UnhealthyState) {
if err == nil { if us == nil {
b.logf("health(%q): ok", sys) b.logf("health(warnable=%s): ok", w.Code)
} else { } else {
b.logf("health(%q): error: %v", sys, err) b.logf("health(warnable=%s): error: %s", w.Code, us.Text)
} }
// Whenever health changes, send the current health state to the frontend.
state := b.health.CurrentState()
b.send(ipn.Notify{
Health: state,
})
} }
// Shutdown halts the backend and all its sub-components. The backend // Shutdown halts the backend and all its sub-components. The backend
@ -788,7 +794,7 @@ func (b *LocalBackend) UpdateStatus(sb *ipnstate.StatusBuilder) {
if prefs := b.pm.CurrentPrefs(); prefs.Valid() && prefs.AutoUpdate().Check { if prefs := b.pm.CurrentPrefs(); prefs.Valid() && prefs.AutoUpdate().Check {
s.ClientVersion = b.lastClientVersion s.ClientVersion = b.lastClientVersion
} }
s.Health = b.health.AppendWarnings(s.Health) s.Health = b.health.Strings()
s.HaveNodeKey = b.hasNodeKeyLocked() s.HaveNodeKey = b.hasNodeKeyLocked()
// TODO(bradfitz): move this health check into a health.Warnable // TODO(bradfitz): move this health check into a health.Warnable
@ -1870,7 +1876,13 @@ func (b *LocalBackend) Start(opts ipn.Options) error {
return nil return nil
} }
var warnInvalidUnsignedNodes = health.NewWarnable() // invalidPacketFilterWarnable is a Warnable to warn the user that the control server sent an invalid packet filter.
var invalidPacketFilterWarnable = health.Register(&health.Warnable{
Code: "invalid-packet-filter",
Title: "Invalid packet filter",
Severity: health.SeverityHigh,
Text: health.StaticMessage("The coordination server sent an invalid packet filter permitting traffic to unlocked nodes; rejecting all packets for safety"),
})
// updateFilterLocked updates the packet filter in wgengine based on the // updateFilterLocked updates the packet filter in wgengine based on the
// given netMap and user preferences. // given netMap and user preferences.
@ -1902,11 +1914,10 @@ func (b *LocalBackend) updateFilterLocked(netMap *netmap.NetworkMap, prefs ipn.P
packetFilter = netMap.PacketFilter packetFilter = netMap.PacketFilter
if packetFilterPermitsUnlockedNodes(b.peers, packetFilter) { if packetFilterPermitsUnlockedNodes(b.peers, packetFilter) {
err := errors.New("server sent invalid packet filter permitting traffic to unlocked nodes; rejecting all packets for safety") b.health.SetUnhealthy(invalidPacketFilterWarnable, nil)
b.health.SetWarnable(warnInvalidUnsignedNodes, err)
packetFilter = nil packetFilter = nil
} else { } else {
b.health.SetWarnable(warnInvalidUnsignedNodes, nil) b.health.SetHealthy(invalidPacketFilterWarnable)
} }
} }
if prefs.Valid() { if prefs.Valid() {
@ -2309,6 +2320,9 @@ func (b *LocalBackend) WatchNotifications(ctx context.Context, mask ipn.NotifyWa
if mask&ipn.NotifyInitialDriveShares != 0 && b.driveSharingEnabledLocked() { if mask&ipn.NotifyInitialDriveShares != 0 && b.driveSharingEnabledLocked() {
ini.DriveShares = b.pm.prefs.DriveShares() ini.DriveShares = b.pm.prefs.DriveShares()
} }
if mask&ipn.NotifyInitialHealthState != 0 {
ini.Health = b.HealthTracker().CurrentState()
}
} }
mak.Set(&b.notifyWatchers, sessionID, &watchSession{ch, sessionID}) mak.Set(&b.notifyWatchers, sessionID, &watchSession{ch, sessionID})
@ -3120,20 +3134,31 @@ func (b *LocalBackend) isDefaultServerLocked() bool {
return prefs.ControlURLOrDefault() == ipn.DefaultControlURL return prefs.ControlURLOrDefault() == ipn.DefaultControlURL
} }
var warnExitNodeUsage = health.NewWarnable(health.WithConnectivityImpact()) var exitNodeMisconfigurationWarnable = health.Register(&health.Warnable{
Code: "exit-node-misconfiguration",
Title: "Exit node misconfiguration",
Severity: health.SeverityMedium,
Text: func(args health.Args) string {
return "Exit node misconfiguration: " + args[health.ArgError]
},
})
// updateExitNodeUsageWarning updates a warnable meant to notify users of // updateExitNodeUsageWarning updates a warnable meant to notify users of
// configuration issues that could break exit node usage. // configuration issues that could break exit node usage.
func updateExitNodeUsageWarning(p ipn.PrefsView, state *netmon.State, health *health.Tracker) { func updateExitNodeUsageWarning(p ipn.PrefsView, state *netmon.State, healthTracker *health.Tracker) {
var result error var msg string
if p.ExitNodeIP().IsValid() || p.ExitNodeID() != "" { if p.ExitNodeIP().IsValid() || p.ExitNodeID() != "" {
warn, _ := netutil.CheckReversePathFiltering(state) warn, _ := netutil.CheckReversePathFiltering(state)
const comment = "please set rp_filter=2 instead of rp_filter=1; see https://github.com/tailscale/tailscale/issues/3310" const comment = "please set rp_filter=2 instead of rp_filter=1; see https://github.com/tailscale/tailscale/issues/3310"
if len(warn) > 0 { if len(warn) > 0 {
result = fmt.Errorf("%s: %v, %s", healthmsg.WarnExitNodeUsage, warn, comment) msg = fmt.Sprintf("%s: %v, %s", healthmsg.WarnExitNodeUsage, warn, comment)
} }
} }
health.SetWarnable(warnExitNodeUsage, result) if len(msg) > 0 {
healthTracker.SetUnhealthy(exitNodeMisconfigurationWarnable, health.Args{health.ArgError: msg})
} else {
healthTracker.SetHealthy(exitNodeMisconfigurationWarnable)
}
} }
func (b *LocalBackend) checkExitNodePrefsLocked(p *ipn.Prefs) error { func (b *LocalBackend) checkExitNodePrefsLocked(p *ipn.Prefs) error {
@ -5841,13 +5866,18 @@ func (b *LocalBackend) sshServerOrInit() (_ SSHServer, err error) {
return b.sshServer, nil return b.sshServer, nil
} }
var warnSSHSELinux = health.NewWarnable() var warnSSHSELinuxWarnable = health.Register(&health.Warnable{
Code: "ssh-unavailable-selinux-enabled",
Title: "Tailscale SSH and SELinux",
Severity: health.SeverityLow,
Text: health.StaticMessage("SELinux is enabled; Tailscale SSH may not work. See https://tailscale.com/s/ssh-selinux"),
})
func (b *LocalBackend) updateSELinuxHealthWarning() { func (b *LocalBackend) updateSELinuxHealthWarning() {
if hostinfo.IsSELinuxEnforcing() { if hostinfo.IsSELinuxEnforcing() {
b.health.SetWarnable(warnSSHSELinux, errors.New("SELinux is enabled; Tailscale SSH may not work. See https://tailscale.com/s/ssh-selinux")) b.health.SetUnhealthy(warnSSHSELinuxWarnable, nil)
} else { } else {
b.health.SetWarnable(warnSSHSELinux, nil) b.health.SetHealthy(warnSSHSELinuxWarnable)
} }
} }

@ -6,7 +6,6 @@ package dns
import ( import (
"bytes" "bytes"
"context" "context"
"errors"
"github.com/illarion/gonotify" "github.com/illarion/gonotify"
"tailscale.com/health" "tailscale.com/health"
@ -58,7 +57,12 @@ func (m *directManager) runFileWatcher() {
} }
} }
var warnTrample = health.NewWarnable() var resolvTrampleWarnable = health.Register(&health.Warnable{
Code: "resolv-conf-overwritten",
Severity: health.SeverityMedium,
Title: "Linux DNS configuration issue",
Text: health.StaticMessage("Linux DNS config not ideal. /etc/resolv.conf overwritten. See https://tailscale.com/s/dns-fight"),
})
// checkForFileTrample checks whether /etc/resolv.conf has been trampled // checkForFileTrample checks whether /etc/resolv.conf has been trampled
// by another program on the system. (e.g. a DHCP client) // by another program on the system. (e.g. a DHCP client)
@ -78,7 +82,7 @@ func (m *directManager) checkForFileTrample() {
return return
} }
if bytes.Equal(cur, want) { if bytes.Equal(cur, want) {
m.health.SetWarnable(warnTrample, nil) m.health.SetHealthy(resolvTrampleWarnable)
if lastWarn != nil { if lastWarn != nil {
m.mu.Lock() m.mu.Lock()
m.lastWarnContents = nil m.lastWarnContents = nil
@ -101,7 +105,7 @@ func (m *directManager) checkForFileTrample() {
show = show[:1024] show = show[:1024]
} }
m.logf("trample: resolv.conf changed from what we expected. did some other program interfere? current contents: %q", show) m.logf("trample: resolv.conf changed from what we expected. did some other program interfere? current contents: %q", show)
m.health.SetWarnable(warnTrample, errors.New("Linux DNS config not ideal. /etc/resolv.conf overwritten. See https://tailscale.com/s/dns-fight")) m.health.SetUnhealthy(resolvTrampleWarnable, nil)
} }
func (m *directManager) closeInotifyOnDone(ctx context.Context, in *gonotify.Inotify) { func (m *directManager) closeInotifyOnDone(ctx context.Context, in *gonotify.Inotify) {

@ -14,17 +14,18 @@ import (
"sort" "sort"
"time" "time"
ole "github.com/go-ole/go-ole"
"github.com/tailscale/wireguard-go/tun"
"go4.org/netipx"
"golang.org/x/sys/windows"
"golang.zx2c4.com/wireguard/windows/tunnel/winipcfg"
"tailscale.com/health" "tailscale.com/health"
"tailscale.com/net/netmon" "tailscale.com/net/netmon"
"tailscale.com/net/tsaddr" "tailscale.com/net/tsaddr"
"tailscale.com/net/tstun" "tailscale.com/net/tstun"
"tailscale.com/util/multierr" "tailscale.com/util/multierr"
"tailscale.com/wgengine/winnet" "tailscale.com/wgengine/winnet"
ole "github.com/go-ole/go-ole"
"github.com/tailscale/wireguard-go/tun"
"go4.org/netipx"
"golang.org/x/sys/windows"
"golang.zx2c4.com/wireguard/windows/tunnel/winipcfg"
) )
// monitorDefaultRoutes subscribes to route change events and updates // monitorDefaultRoutes subscribes to route change events and updates
@ -235,9 +236,17 @@ func interfaceFromLUID(luid winipcfg.LUID, flags winipcfg.GAAFlags) (*winipcfg.I
return nil, fmt.Errorf("interfaceFromLUID: interface with LUID %v not found", luid) return nil, fmt.Errorf("interfaceFromLUID: interface with LUID %v not found", luid)
} }
var networkCategoryWarning = health.NewWarnable(health.WithMapDebugFlag("warn-network-category-unhealthy")) var networkCategoryWarnable = health.Register(&health.Warnable{
Code: "set-network-category-failed",
Severity: health.SeverityMedium,
Title: "Windows network configuration failed",
Text: func(args health.Args) string {
return fmt.Sprintf("Failed to set the network category to private on the Tailscale adapter. This may prevent Tailscale from working correctly. Error: %s", args[health.ArgError])
},
MapDebugFlag: "warn-network-category-unhealthy",
})
func configureInterface(cfg *Config, tun *tun.NativeTun, health *health.Tracker) (retErr error) { func configureInterface(cfg *Config, tun *tun.NativeTun, ht *health.Tracker) (retErr error) {
var mtu = tstun.DefaultTUNMTU() var mtu = tstun.DefaultTUNMTU()
luid := winipcfg.LUID(tun.LUID()) luid := winipcfg.LUID(tun.LUID())
iface, err := interfaceFromLUID(luid, iface, err := interfaceFromLUID(luid,
@ -268,10 +277,10 @@ func configureInterface(cfg *Config, tun *tun.NativeTun, health *health.Tracker)
for i := range tries { for i := range tries {
found, err := setPrivateNetwork(luid) found, err := setPrivateNetwork(luid)
if err != nil { if err != nil {
health.SetWarnable(networkCategoryWarning, fmt.Errorf("set-network-category: %w", err)) ht.SetUnhealthy(networkCategoryWarnable, health.Args{health.ArgError: err.Error()})
log.Printf("setPrivateNetwork(try=%d): %v", i, err) log.Printf("setPrivateNetwork(try=%d): %v", i, err)
} else { } else {
health.SetWarnable(networkCategoryWarning, nil) ht.SetHealthy(networkCategoryWarnable)
if found { if found {
if i > 0 { if i > 0 {
log.Printf("setPrivateNetwork(try=%d): success", i) log.Printf("setPrivateNetwork(try=%d): success", i)

@ -445,12 +445,17 @@ func (r *linuxRouter) Set(cfg *Config) error {
return multierr.New(errs...) return multierr.New(errs...)
} }
var warnStatefulFilteringWithDocker = health.NewWarnable() var dockerStatefulFilteringWarnable = health.Register(&health.Warnable{
Code: "docker-stateful-filtering",
Title: "Docker with stateful filtering",
Severity: health.SeverityMedium,
Text: health.StaticMessage("Stateful filtering is enabled and Docker was detected; this may prevent Docker containers on this host from resolving DNS and connecting to Tailscale nodes. See https://tailscale.com/s/stateful-docker"),
})
func (r *linuxRouter) updateStatefulFilteringWithDockerWarning(cfg *Config) { func (r *linuxRouter) updateStatefulFilteringWithDockerWarning(cfg *Config) {
// If stateful filtering is disabled, clear the warning. // If stateful filtering is disabled, clear the warning.
if !r.statefulFiltering { if !r.statefulFiltering {
r.health.SetWarnable(warnStatefulFilteringWithDocker, nil) r.health.SetHealthy(dockerStatefulFilteringWarnable)
return return
} }
@ -479,17 +484,13 @@ func (r *linuxRouter) updateStatefulFilteringWithDockerWarning(cfg *Config) {
// socket/daemon/etc. // socket/daemon/etc.
ifstate := r.netMon.InterfaceState() ifstate := r.netMon.InterfaceState()
if _, found := ifstate.Interface["docker0"]; found { if _, found := ifstate.Interface["docker0"]; found {
r.health.SetWarnable(warnStatefulFilteringWithDocker, fmt.Errorf(""+ r.health.SetUnhealthy(dockerStatefulFilteringWarnable, nil)
"Stateful filtering is enabled and Docker was detected; this may prevent Docker containers "+
"on this host from resolving DNS and connecting to Tailscale nodes. "+
"See https://tailscale.com/s/stateful-docker",
))
return return
} }
} }
// If we get here, then we have no warnings; clear anything existing. // If we get here, then we have no warnings; clear anything existing.
r.health.SetWarnable(warnStatefulFilteringWithDocker, nil) r.health.SetHealthy(dockerStatefulFilteringWarnable)
} }
// UpdateMagicsockPort implements the Router interface. // UpdateMagicsockPort implements the Router interface.

Loading…
Cancel
Save