tailcfg, health: add way for control plane to add problems to health check

So if the control plane knows that something's broken about the node, it can
include problem(s) in MapResponse and "tailscale status" will show it.
(and GUIs in the future, as it's in ipnstate.Status/JSON)

This also bumps the MapRequest.Version, though it's not strictly
required. Doesn't hurt.

Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
pull/2885/head
Brad Fitzpatrick 3 years ago committed by Brad Fitzpatrick
parent b14db5d943
commit aae622314e

@ -44,6 +44,7 @@ type mapSession struct {
collectServices bool
previousPeers []*tailcfg.Node // for delta-purposes
lastDomain string
lastHealth []string
// netMapBuilding is non-nil during a netmapForResponse call,
// containing the value to be returned, once fully populated.
@ -105,6 +106,9 @@ func (ms *mapSession) netmapForResponse(resp *tailcfg.MapResponse) *netmap.Netwo
if resp.Domain != "" {
ms.lastDomain = resp.Domain
}
if resp.Health != nil {
ms.lastHealth = resp.Health
}
nm := &netmap.NetworkMap{
NodeKey: tailcfg.NodeKey(ms.privateNodeKey.Public()),
@ -118,6 +122,7 @@ func (ms *mapSession) netmapForResponse(resp *tailcfg.MapResponse) *netmap.Netwo
CollectServices: ms.collectServices,
DERPMap: ms.lastDERPMap,
Debug: resp.Debug,
ControlHealth: ms.lastHealth,
}
ms.netMapBuilding = nm

@ -40,6 +40,7 @@ var (
ipnWantRunning bool
anyInterfaceUp = true // until told otherwise
udp4Unbound bool
controlHealth []string
)
// Subsystem is the name of a subsystem whose health can be monitored.
@ -141,6 +142,13 @@ func setLocked(key Subsystem, err error) {
}
}
func SetControlHealth(problems []string) {
mu.Lock()
defer mu.Unlock()
controlHealth = problems
selfCheckLocked()
}
// GotStreamedMapResponse notes that we got a tailcfg.MapResponse
// message in streaming mode, even if it's just a keep-alive message.
func GotStreamedMapResponse() {
@ -318,6 +326,9 @@ func overallErrorLocked() error {
for regionID, problem := range derpRegionHealthProblem {
errs = append(errs, fmt.Errorf("derp%d: %v", regionID, problem))
}
for _, s := range controlHealth {
errs = append(errs, errors.New(s))
}
if e := fakeErrForTesting; len(errs) == 0 && e != "" {
return errors.New(e)
}

@ -2548,6 +2548,12 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) {
}
b.maybePauseControlClientLocked()
if nm != nil {
health.SetControlHealth(nm.ControlHealth)
} else {
health.SetControlHealth(nil)
}
// Determine if file sharing is enabled
fs := hasCapability(nm, tailcfg.CapabilityFileSharing)
if fs != b.capFileSharing {

@ -47,7 +47,8 @@ import (
// 21: 2021-06-15: added MapResponse.DNSConfig.CertDomains
// 22: 2021-06-16: added MapResponse.DNSConfig.ExtraRecords
// 23: 2021-08-25: DNSConfig.Routes values may be empty (for ExtraRecords support in 1.14.1+)
const CurrentMapRequestVersion = 23
// 24: 2021-09-18: MapResponse.Health from control to node; node shows in "tailscale status"
const CurrentMapRequestVersion = 24
type StableID string
@ -1028,6 +1029,14 @@ type MapResponse struct {
// user profiles only.
UserProfiles []UserProfile `json:",omitempty"`
// Health, if non-nil, sets the health state
// of the node from the control plane's perspective.
// A nil value means no change from the previous MapResponse.
// A non-nil 0-length slice restores the health to good (no known problems).
// A non-zero length slice are the list of problems that the control place
// sees.
Health []string `json:",omitempty"`
// Debug is normally nil, except for when the control server
// is setting debug settings on a node.
Debug *Debug `json:",omitempty"`

@ -54,6 +54,13 @@ type NetworkMap struct {
// Debug knobs from control server for debug or feature gating.
Debug *tailcfg.Debug
// ControlHealth are the list of health check problems for this
// node from the perspective of the control plane.
// If empty, there are no known problems from the control plane's
// point of view, but the node might know about its own health
// check problems.
ControlHealth []string
// ACLs
User tailcfg.UserID

Loading…
Cancel
Save