health: support delayed Warnable visibility (#12783)

Updates tailscale/tailscale#4136

To reduce the likelihood of presenting spurious warnings, add the ability to delay the visibility of certain Warnables, based on a TimeToVisible time.Duration field on each Warnable. The default is zero, meaning that a Warnable is immediately visible to the user when it enters an unhealthy state.

Signed-off-by: Andrea Gottardo <andrea@gottardo.me>
pull/12791/head
Andrea Gottardo 4 months ago committed by GitHub
parent 8d7b78f3f7
commit b7c3cfe049
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -69,6 +69,9 @@ type Tracker struct {
warnables []*Warnable // keys ever set
warnableVal map[*Warnable]*warningState
// pendingVisibleTimers contains timers for Warnables that are unhealthy, but are
// not visible to the user yet, because they haven't been unhealthy for TimeToVisible
pendingVisibleTimers map[*Warnable]*time.Timer
// sysErr maps subsystems to their current error (or nil if the subsystem is healthy)
// Deprecated: using Warnables should be preferred
@ -162,6 +165,7 @@ func Register(w *Warnable) *Warnable {
if registeredWarnables[w.Code] != nil {
panic(fmt.Sprintf("health: a Warnable with code %q was already registered", w.Code))
}
mak.Set(&registeredWarnables, w.Code, w)
return w
}
@ -218,6 +222,11 @@ type Warnable struct {
// the client GUI supports a tray icon, the client will display an exclamation mark
// on the tray icon when ImpactsConnectivity is set to true and the Warnable is unhealthy.
ImpactsConnectivity bool
// TimeToVisible is the Duration that the Warnable has to be in an unhealthy state before it
// should be surfaced as unhealthy to the user. This is used to prevent transient errors from being
// displayed to the user.
TimeToVisible time.Duration
}
// StaticMessage returns a function that always returns the input string, to be used in
@ -291,6 +300,15 @@ func (ws *warningState) Equal(other *warningState) bool {
return ws.BrokenSince.Equal(other.BrokenSince) && maps.Equal(ws.Args, other.Args)
}
// IsVisible returns whether the Warnable should be visible to the user, based on the TimeToVisible
// field of the Warnable and the BrokenSince time when the Warnable became unhealthy.
func (w *Warnable) IsVisible(ws *warningState) bool {
if ws == nil || w.TimeToVisible == 0 {
return true
}
return time.Since(ws.BrokenSince) >= w.TimeToVisible
}
// SetUnhealthy sets a warningState for the given Warnable with the provided Args, and should be
// called when a Warnable becomes unhealthy, or its unhealthy status needs to be updated.
// SetUnhealthy takes ownership of args. The args can be nil if no additional information is
@ -327,7 +345,27 @@ func (t *Tracker) setUnhealthyLocked(w *Warnable, args Args) {
mak.Set(&t.warnableVal, w, ws)
if !ws.Equal(prevWs) {
for _, cb := range t.watchers {
// If the Warnable has been unhealthy for more than its TimeToVisible, the callback should be
// executed immediately. Otherwise, the callback should be enqueued to run once the Warnable
// becomes visible.
if w.IsVisible(ws) {
go cb(w, w.unhealthyState(ws))
continue
}
// The time remaining until the Warnable will be visible to the user is the TimeToVisible
// minus the time that has already passed since the Warnable became unhealthy.
visibleIn := w.TimeToVisible - time.Since(brokenSince)
mak.Set(&t.pendingVisibleTimers, w, time.AfterFunc(visibleIn, func() {
t.mu.Lock()
defer t.mu.Unlock()
// Check if the Warnable is still unhealthy, as it could have become healthy between the time
// the timer was set for and the time it was executed.
if t.warnableVal[w] != nil {
go cb(w, w.unhealthyState(ws))
delete(t.pendingVisibleTimers, w)
}
}))
}
}
}
@ -349,6 +387,13 @@ func (t *Tracker) setHealthyLocked(w *Warnable) {
}
delete(t.warnableVal, w)
// Stop any pending visiblity timers for this Warnable
if canc, ok := t.pendingVisibleTimers[w]; ok {
canc.Stop()
delete(t.pendingVisibleTimers, w)
}
for _, cb := range t.watchers {
go cb(w, nil)
}
@ -861,6 +906,10 @@ func (t *Tracker) Strings() []string {
func (t *Tracker) stringsLocked() []string {
result := []string{}
for w, ws := range t.warnableVal {
if !w.IsVisible(ws) {
// Do not append invisible warnings.
continue
}
if ws.Args == nil {
result = append(result, w.Text(Args{}))
} else {

@ -162,6 +162,53 @@ func TestWatcher(t *testing.T) {
}
}
// TestWatcherWithTimeToVisible tests that a registered watcher function gets called with the correct
// Warnable and non-nil/nil UnhealthyState upon setting a Warnable to unhealthy/healthy, but the Warnable
// has a TimeToVisible set, which means that a watcher should only be notified of an unhealthy state after
// the TimeToVisible duration has passed.
func TestSetUnhealthyWithTimeToVisible(t *testing.T) {
ht := Tracker{}
mw := Register(&Warnable{
Code: "test-warnable-3-secs-to-visible",
Title: "Test Warnable with 3 seconds to visible",
Text: StaticMessage("Hello world"),
TimeToVisible: 2 * time.Second,
ImpactsConnectivity: true,
})
defer unregister(mw)
becameUnhealthy := make(chan struct{})
becameHealthy := make(chan struct{})
watchFunc := func(w *Warnable, us *UnhealthyState) {
if w != mw {
t.Fatalf("watcherFunc was called, but with an unexpected Warnable: %v, want: %v", w, w)
}
if us != nil {
t.Logf("watcherFunc was called with an UnhealthyState: %v", us)
becameUnhealthy <- struct{}{}
} else {
t.Logf("watcherFunc was called with an healthy state: %v", us)
becameHealthy <- struct{}{}
}
}
ht.RegisterWatcher(watchFunc)
ht.SetUnhealthy(mw, Args{ArgError: "Hello world"})
select {
case <-becameUnhealthy:
// Test failed because the watcher got notified of an unhealthy state
t.Fatalf("watcherFunc was called with an unhealthy state")
case <-becameHealthy:
// Test failed because the watcher got of a healthy state
t.Fatalf("watcherFunc was called with a healthy state")
case <-time.After(1 * time.Second):
// As expected, watcherFunc still had not been called after 1 second
}
}
func TestRegisterWarnablePanicsWithDuplicate(t *testing.T) {
w := &Warnable{
Code: "test-warnable-1",

@ -20,7 +20,7 @@ type State struct {
Warnings map[WarnableCode]UnhealthyState
}
// Representation contains information to be shown to the user to inform them
// UnhealthyState contains information to be shown to the user to inform them
// that a Warnable is currently unhealthy.
type UnhealthyState struct {
WarnableCode WarnableCode
@ -86,6 +86,10 @@ func (t *Tracker) CurrentState() *State {
wm := map[WarnableCode]UnhealthyState{}
for w, ws := range t.warnableVal {
if !w.IsVisible(ws) {
// Skip invisible Warnables.
continue
}
wm[w.Code] = *w.unhealthyState(ws)
}

@ -59,6 +59,7 @@ var NetworkStatusWarnable = Register(&Warnable{
Severity: SeverityMedium,
Text: StaticMessage("Tailscale cannot connect because the network is down. Check your Internet connection."),
ImpactsConnectivity: true,
TimeToVisible: 5 * time.Second,
})
// IPNStateWarnable is a Warnable that warns the user that Tailscale is stopped.
@ -101,6 +102,8 @@ var notInMapPollWarnable = Register(&Warnable{
Severity: SeverityMedium,
DependsOn: []*Warnable{NetworkStatusWarnable},
Text: StaticMessage("Unable to connect to the Tailscale coordination server to synchronize the state of your tailnet. Peer reachability might degrade over time."),
// 8 minutes reflects a maximum maintenance window for the coordination server.
TimeToVisible: 8 * time.Minute,
})
// noDERPHomeWarnable is a Warnable that warns the user that Tailscale doesn't have a home DERP.
@ -111,6 +114,7 @@ var noDERPHomeWarnable = Register(&Warnable{
DependsOn: []*Warnable{NetworkStatusWarnable},
Text: StaticMessage("Tailscale could not connect to any relay server. Check your Internet connection."),
ImpactsConnectivity: true,
TimeToVisible: 10 * time.Second,
})
// noDERPConnectionWarnable is a Warnable that warns the user that Tailscale couldn't connect to a specific DERP server.
@ -127,6 +131,7 @@ var noDERPConnectionWarnable = Register(&Warnable{
}
},
ImpactsConnectivity: true,
TimeToVisible: 10 * time.Second,
})
// derpTimeoutWarnable is a Warnable that warns the user that Tailscale hasn't heard from the home DERP region for a while.

Loading…
Cancel
Save