From b7c3cfe04983d5a8961ed9d798a1cf7482d9bcfd Mon Sep 17 00:00:00 2001 From: Andrea Gottardo Date: Thu, 11 Jul 2024 11:51:47 -0700 Subject: [PATCH] health: support delayed Warnable visibility (#12783) Updates tailscale/tailscale#4136 To reduce the likelihood of presenting spurious warnings, add the ability to delay the visibility of certain Warnables, based on a TimeToVisible time.Duration field on each Warnable. The default is zero, meaning that a Warnable is immediately visible to the user when it enters an unhealthy state. Signed-off-by: Andrea Gottardo --- health/health.go | 51 ++++++++++++++++++++++++++++++++++++++++++- health/health_test.go | 47 +++++++++++++++++++++++++++++++++++++++ health/state.go | 6 ++++- health/warnings.go | 5 +++++ 4 files changed, 107 insertions(+), 2 deletions(-) diff --git a/health/health.go b/health/health.go index 472f6b4e2..f1fe8eccb 100644 --- a/health/health.go +++ b/health/health.go @@ -69,6 +69,9 @@ type Tracker struct { warnables []*Warnable // keys ever set warnableVal map[*Warnable]*warningState + // pendingVisibleTimers contains timers for Warnables that are unhealthy, but are + // not visible to the user yet, because they haven't been unhealthy for TimeToVisible + pendingVisibleTimers map[*Warnable]*time.Timer // sysErr maps subsystems to their current error (or nil if the subsystem is healthy) // Deprecated: using Warnables should be preferred @@ -162,6 +165,7 @@ func Register(w *Warnable) *Warnable { if registeredWarnables[w.Code] != nil { panic(fmt.Sprintf("health: a Warnable with code %q was already registered", w.Code)) } + mak.Set(®isteredWarnables, w.Code, w) return w } @@ -218,6 +222,11 @@ type Warnable struct { // the client GUI supports a tray icon, the client will display an exclamation mark // on the tray icon when ImpactsConnectivity is set to true and the Warnable is unhealthy. ImpactsConnectivity bool + + // TimeToVisible is the Duration that the Warnable has to be in an unhealthy state before it + // should be surfaced as unhealthy to the user. This is used to prevent transient errors from being + // displayed to the user. + TimeToVisible time.Duration } // StaticMessage returns a function that always returns the input string, to be used in @@ -291,6 +300,15 @@ func (ws *warningState) Equal(other *warningState) bool { return ws.BrokenSince.Equal(other.BrokenSince) && maps.Equal(ws.Args, other.Args) } +// IsVisible returns whether the Warnable should be visible to the user, based on the TimeToVisible +// field of the Warnable and the BrokenSince time when the Warnable became unhealthy. +func (w *Warnable) IsVisible(ws *warningState) bool { + if ws == nil || w.TimeToVisible == 0 { + return true + } + return time.Since(ws.BrokenSince) >= w.TimeToVisible +} + // SetUnhealthy sets a warningState for the given Warnable with the provided Args, and should be // called when a Warnable becomes unhealthy, or its unhealthy status needs to be updated. // SetUnhealthy takes ownership of args. The args can be nil if no additional information is @@ -327,7 +345,27 @@ func (t *Tracker) setUnhealthyLocked(w *Warnable, args Args) { mak.Set(&t.warnableVal, w, ws) if !ws.Equal(prevWs) { for _, cb := range t.watchers { - go cb(w, w.unhealthyState(ws)) + // If the Warnable has been unhealthy for more than its TimeToVisible, the callback should be + // executed immediately. Otherwise, the callback should be enqueued to run once the Warnable + // becomes visible. + if w.IsVisible(ws) { + go cb(w, w.unhealthyState(ws)) + continue + } + + // The time remaining until the Warnable will be visible to the user is the TimeToVisible + // minus the time that has already passed since the Warnable became unhealthy. + visibleIn := w.TimeToVisible - time.Since(brokenSince) + mak.Set(&t.pendingVisibleTimers, w, time.AfterFunc(visibleIn, func() { + t.mu.Lock() + defer t.mu.Unlock() + // Check if the Warnable is still unhealthy, as it could have become healthy between the time + // the timer was set for and the time it was executed. + if t.warnableVal[w] != nil { + go cb(w, w.unhealthyState(ws)) + delete(t.pendingVisibleTimers, w) + } + })) } } } @@ -349,6 +387,13 @@ func (t *Tracker) setHealthyLocked(w *Warnable) { } delete(t.warnableVal, w) + + // Stop any pending visiblity timers for this Warnable + if canc, ok := t.pendingVisibleTimers[w]; ok { + canc.Stop() + delete(t.pendingVisibleTimers, w) + } + for _, cb := range t.watchers { go cb(w, nil) } @@ -861,6 +906,10 @@ func (t *Tracker) Strings() []string { func (t *Tracker) stringsLocked() []string { result := []string{} for w, ws := range t.warnableVal { + if !w.IsVisible(ws) { + // Do not append invisible warnings. + continue + } if ws.Args == nil { result = append(result, w.Text(Args{})) } else { diff --git a/health/health_test.go b/health/health_test.go index c5b7d865a..65a5d79ce 100644 --- a/health/health_test.go +++ b/health/health_test.go @@ -162,6 +162,53 @@ func TestWatcher(t *testing.T) { } } +// TestWatcherWithTimeToVisible tests that a registered watcher function gets called with the correct +// Warnable and non-nil/nil UnhealthyState upon setting a Warnable to unhealthy/healthy, but the Warnable +// has a TimeToVisible set, which means that a watcher should only be notified of an unhealthy state after +// the TimeToVisible duration has passed. +func TestSetUnhealthyWithTimeToVisible(t *testing.T) { + ht := Tracker{} + mw := Register(&Warnable{ + Code: "test-warnable-3-secs-to-visible", + Title: "Test Warnable with 3 seconds to visible", + Text: StaticMessage("Hello world"), + TimeToVisible: 2 * time.Second, + ImpactsConnectivity: true, + }) + defer unregister(mw) + + becameUnhealthy := make(chan struct{}) + becameHealthy := make(chan struct{}) + + watchFunc := func(w *Warnable, us *UnhealthyState) { + if w != mw { + t.Fatalf("watcherFunc was called, but with an unexpected Warnable: %v, want: %v", w, w) + } + + if us != nil { + t.Logf("watcherFunc was called with an UnhealthyState: %v", us) + becameUnhealthy <- struct{}{} + } else { + t.Logf("watcherFunc was called with an healthy state: %v", us) + becameHealthy <- struct{}{} + } + } + + ht.RegisterWatcher(watchFunc) + ht.SetUnhealthy(mw, Args{ArgError: "Hello world"}) + + select { + case <-becameUnhealthy: + // Test failed because the watcher got notified of an unhealthy state + t.Fatalf("watcherFunc was called with an unhealthy state") + case <-becameHealthy: + // Test failed because the watcher got of a healthy state + t.Fatalf("watcherFunc was called with a healthy state") + case <-time.After(1 * time.Second): + // As expected, watcherFunc still had not been called after 1 second + } +} + func TestRegisterWarnablePanicsWithDuplicate(t *testing.T) { w := &Warnable{ Code: "test-warnable-1", diff --git a/health/state.go b/health/state.go index 07258c75e..17a646794 100644 --- a/health/state.go +++ b/health/state.go @@ -20,7 +20,7 @@ type State struct { Warnings map[WarnableCode]UnhealthyState } -// Representation contains information to be shown to the user to inform them +// UnhealthyState contains information to be shown to the user to inform them // that a Warnable is currently unhealthy. type UnhealthyState struct { WarnableCode WarnableCode @@ -86,6 +86,10 @@ func (t *Tracker) CurrentState() *State { wm := map[WarnableCode]UnhealthyState{} for w, ws := range t.warnableVal { + if !w.IsVisible(ws) { + // Skip invisible Warnables. + continue + } wm[w.Code] = *w.unhealthyState(ws) } diff --git a/health/warnings.go b/health/warnings.go index 73de0f1b9..e84043341 100644 --- a/health/warnings.go +++ b/health/warnings.go @@ -59,6 +59,7 @@ var NetworkStatusWarnable = Register(&Warnable{ Severity: SeverityMedium, Text: StaticMessage("Tailscale cannot connect because the network is down. Check your Internet connection."), ImpactsConnectivity: true, + TimeToVisible: 5 * time.Second, }) // IPNStateWarnable is a Warnable that warns the user that Tailscale is stopped. @@ -101,6 +102,8 @@ var notInMapPollWarnable = Register(&Warnable{ Severity: SeverityMedium, DependsOn: []*Warnable{NetworkStatusWarnable}, Text: StaticMessage("Unable to connect to the Tailscale coordination server to synchronize the state of your tailnet. Peer reachability might degrade over time."), + // 8 minutes reflects a maximum maintenance window for the coordination server. + TimeToVisible: 8 * time.Minute, }) // noDERPHomeWarnable is a Warnable that warns the user that Tailscale doesn't have a home DERP. @@ -111,6 +114,7 @@ var noDERPHomeWarnable = Register(&Warnable{ DependsOn: []*Warnable{NetworkStatusWarnable}, Text: StaticMessage("Tailscale could not connect to any relay server. Check your Internet connection."), ImpactsConnectivity: true, + TimeToVisible: 10 * time.Second, }) // noDERPConnectionWarnable is a Warnable that warns the user that Tailscale couldn't connect to a specific DERP server. @@ -127,6 +131,7 @@ var noDERPConnectionWarnable = Register(&Warnable{ } }, ImpactsConnectivity: true, + TimeToVisible: 10 * time.Second, }) // derpTimeoutWarnable is a Warnable that warns the user that Tailscale hasn't heard from the home DERP region for a while.