wgengine: print in-flight operations on watchdog trigger (#5447)

In addition to printing goroutine stacks, explicitly track all in-flight
operations and print them when the watchdog triggers (along with the
time they were started at). This should make debugging watchdog failures
easier, since we can look at the longest-running operation(s) first.

Signed-off-by: Andrew Dunham <andrew@tailscale.com>

Signed-off-by: Andrew Dunham <andrew@tailscale.com>
pull/5478/head
Andrew Dunham 2 years ago committed by GitHub
parent e1738ea78e
commit 9b77ac128a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -8,10 +8,12 @@
package wgengine
import (
"fmt"
"log"
"net/netip"
"runtime/pprof"
"strings"
"sync"
"time"
"tailscale.com/envknob"
@ -38,21 +40,49 @@ func NewWatchdog(e Engine) Engine {
return e
}
return &watchdogEngine{
wrap: e,
logf: log.Printf,
fatalf: log.Fatalf,
maxWait: 45 * time.Second,
wrap: e,
logf: log.Printf,
fatalf: log.Fatalf,
maxWait: 45 * time.Second,
inFlight: make(map[inFlightKey]time.Time),
}
}
type inFlightKey struct {
op string
ctr uint64
}
type watchdogEngine struct {
wrap Engine
logf func(format string, args ...any)
fatalf func(format string, args ...any)
maxWait time.Duration
// Track the start time(s) of in-flight operations
inFlightMu sync.Mutex
inFlight map[inFlightKey]time.Time
inFlightCtr uint64
}
func (e *watchdogEngine) watchdogErr(name string, fn func() error) error {
// Track all in-flight operations so we can print more useful error
// messages on watchdog failure
e.inFlightMu.Lock()
key := inFlightKey{
op: name,
ctr: e.inFlightCtr,
}
e.inFlightCtr++
e.inFlight[key] = time.Now()
e.inFlightMu.Unlock()
defer func() {
e.inFlightMu.Lock()
defer e.inFlightMu.Unlock()
delete(e.inFlight, key)
}()
errCh := make(chan error)
go func() {
errCh <- fn()
@ -66,6 +96,22 @@ func (e *watchdogEngine) watchdogErr(name string, fn func() error) error {
buf := new(strings.Builder)
pprof.Lookup("goroutine").WriteTo(buf, 1)
e.logf("wgengine watchdog stacks:\n%s", buf.String())
// Collect the list of in-flight operations for debugging.
var (
b []byte
now = time.Now()
)
e.inFlightMu.Lock()
for k, t := range e.inFlight {
dur := now.Sub(t).Round(time.Millisecond)
b = fmt.Appendf(b, "in-flight[%d]: name=%s duration=%v start=%s\n", k.ctr, k.op, dur, t.Format(time.RFC3339Nano))
}
e.inFlightMu.Unlock()
// Print everything as a single string to avoid log
// rate limits.
e.logf("wgengine watchdog in-flight:\n%s", b)
e.fatalf("wgengine: watchdog timeout on %s", name)
return nil
}

@ -6,6 +6,7 @@ package wgengine
import (
"fmt"
"regexp"
"runtime"
"strings"
"testing"
@ -68,9 +69,15 @@ func TestWatchdog(t *testing.T) {
select {
case <-fatalCalled:
if !strings.Contains(logBuf.String(), "goroutine profile: total ") {
t.Errorf("fatal called without watchdog stacks, got: %s", logBuf.String())
logs := logBuf.String()
if !strings.Contains(logs, "goroutine profile: total ") {
t.Errorf("fatal called without watchdog stacks, got: %s", logs)
}
re := regexp.MustCompile(`(?m)^\s*in-flight\[\d+\]: name=RequestStatus duration=.* start=.*$`)
if !re.MatchString(logs) {
t.Errorf("fatal called without in-flight list, got: %s", logs)
}
// expected
case <-time.After(3 * time.Second):
t.Fatalf("watchdog failed to fire")

Loading…
Cancel
Save