prober: add an HTTP endpoint for triggering a probe

- Keep track of the last 10 probe results and successful probe latencies; - Add an HTTP handler that triggers a given probe by name and returns it result as a plaintext HTML page, showing recent probe results as a baseline Updates tailscale/corp#20583 Signed-off-by: Anton Tolchanov <anton@tailscale.com>
4 months ago · 153a476957
parent 227509547f
commit 153a476957
2 changed files with 311 additions and 40 deletions
--- a/prober/prober.go
+++ b/prober/prober.go
@ -7,19 +7,25 @@
 package prober
 import (
 	"container/ring"
 	"context"
 	"errors"
 	"fmt"
 	"hash/fnv"
 	"log"
 	"maps"
 	"math/rand"
 	"net/http"
 	"sync"
 	"time"
 	"github.com/prometheus/client_golang/prometheus"
 	"tailscale.com/tsweb"
 )
 // recentHistSize is the number of recent probe results and latencies to keep
 // in memory.
 const recentHistSize = 10
 // ProbeClass defines a probe of a specific type: a probing function that will
 // be regularly ran, and metric labels that will be added automatically to all
 // probes using this class.
@ -106,6 +112,14 @@ func (p *Prober) Run(name string, interval time.Duration, labels Labels, pc Prob
 		l[k] = v
 	}
 	probe := newProbe(p, name, interval, l, pc)
 	p.probes[name] = probe
 	go probe.loop()
 	return probe
 }
 // newProbe creates a new Probe with the given parameters, but does not start it.
 func newProbe(p *Prober, name string, interval time.Duration, l prometheus.Labels, pc ProbeClass) *Probe {
 	ctx, cancel := context.WithCancel(context.Background())
 	probe := &Probe{
 		prober:  p,
@ -117,6 +131,9 @@ func (p *Prober) Run(name string, interval time.Duration, labels Labels, pc Prob
 		probeClass:   pc,
 		interval:     interval,
 		initialDelay: initialDelay(name, interval),
 		successHist:  ring.New(recentHistSize),
 		latencyHist:  ring.New(recentHistSize),
 		metrics:      prometheus.NewRegistry(),
 		metricLabels: l,
 		mInterval:    prometheus.NewDesc("interval_secs", "Probe interval in seconds", nil, l),
@ -131,15 +148,14 @@ func (p *Prober) Run(name string, interval time.Duration, labels Labels, pc Prob
 			Name: "seconds_total", Help: "Total amount of time spent executing the probe", ConstLabels: l,
 		}, []string{"status"}),
 	}
-
+	if p.metrics != nil {
-	prometheus.WrapRegistererWithPrefix(p.namespace+"_", p.metrics).MustRegister(probe.metrics)
+		prometheus.WrapRegistererWithPrefix(p.namespace+"_", p.metrics).MustRegister(probe.metrics)
 	}
 	probe.metrics.MustRegister(probe)
 	p.probes[name] = probe
 	go probe.loop()
 	return probe
 }
 // unregister removes a probe from the prober's internal state.
 func (p *Prober) unregister(probe *Probe) {
 	p.mu.Lock()
 	defer p.mu.Unlock()
@ -206,6 +222,7 @@ type Probe struct {
 	ctx     context.Context
 	cancel  context.CancelFunc // run to initiate shutdown
 	stopped chan struct{}      // closed when shutdown is complete
 	runMu   sync.Mutex         // ensures only one probe runs at a time
 	name         string
 	probeClass   ProbeClass
@ -232,6 +249,10 @@ type Probe struct {
 	latency   time.Duration // last successful probe latency
 	succeeded bool          // whether the last doProbe call succeeded
 	lastErr   error
 	// History of recent probe results and latencies.
 	successHist *ring.Ring
 	latencyHist *ring.Ring
 }
 // Close shuts down the Probe and unregisters it from its Prober.
@ -278,13 +299,17 @@ func (p *Probe) loop() {
 	}
 }
-// run invokes fun and records the results.
+// run invokes the probe function and records the result. It returns the probe
 // result and an error if the probe failed.
 //
-// fun is invoked with a timeout slightly less than interval, so that
+// The probe function is invoked with a timeout slightly less than interval, so
-// the probe either succeeds or fails before the next cycle is
+// that the probe either succeeds or fails before the next cycle is scheduled to
-// scheduled to start.
+// start.
-func (p *Probe) run() {
+func (p *Probe) run() (pi ProbeInfo, err error) {
-	start := p.recordStart()
+	p.runMu.Lock()
 	defer p.runMu.Unlock()
 	p.recordStart()
 	defer func() {
 		// Prevent a panic within one probe function from killing the
 		// entire prober, so that a single buggy probe doesn't destroy
@ -293,29 +318,30 @@ func (p *Probe) run() {
 		// alert for debugging.
 		if r := recover(); r != nil {
 			log.Printf("probe %s panicked: %v", p.name, r)
-			p.recordEnd(start, errors.New("panic"))
+			err = fmt.Errorf("panic: %v", r)
 			p.recordEnd(err)
 		}
 	}()
 	timeout := time.Duration(float64(p.interval) * 0.8)
 	ctx, cancel := context.WithTimeout(p.ctx, timeout)
 	defer cancel()
-	err := p.probeClass.Probe(ctx)
+	err = p.probeClass.Probe(ctx)
-	p.recordEnd(start, err)
+	p.recordEnd(err)
 	if err != nil {
 		log.Printf("probe %s: %v", p.name, err)
 	}
 	pi = p.probeInfoLocked()
 	return
 }
-func (p *Probe) recordStart() time.Time {
+func (p *Probe) recordStart() {
 	st := p.prober.now()
 	p.mu.Lock()
-	defer p.mu.Unlock()
+	p.start = p.prober.now()
-	p.start = st
+	p.mu.Unlock()
 	return st
 }
-func (p *Probe) recordEnd(start time.Time, err error) {
+func (p *Probe) recordEnd(err error) {
 	end := p.prober.now()
 	p.mu.Lock()
 	defer p.mu.Unlock()
@ -327,22 +353,55 @@ func (p *Probe) recordEnd(start time.Time, err error) {
 		p.latency = latency
 		p.mAttempts.WithLabelValues("ok").Inc()
 		p.mSeconds.WithLabelValues("ok").Add(latency.Seconds())
 		p.latencyHist.Value = latency
 		p.latencyHist = p.latencyHist.Next()
 	} else {
 		p.latency = 0
 		p.mAttempts.WithLabelValues("fail").Inc()
 		p.mSeconds.WithLabelValues("fail").Add(latency.Seconds())
 	}
 	p.successHist.Value = p.succeeded
 	p.successHist = p.successHist.Next()
 }
-// ProbeInfo is the state of a Probe.
+// ProbeInfo is a snapshot of the configuration and state of a Probe.
 type ProbeInfo struct {
-	Start   time.Time
+	Name            string
-	End     time.Time
+	Class           string
-	Latency string
+	Interval        time.Duration
-	Result  bool
+	Labels          map[string]string
-	Error   string
+	Start           time.Time
 	End             time.Time
 	Latency         time.Duration
 	Result          bool
 	Error           string
 	RecentResults   []bool
 	RecentLatencies []time.Duration
 }
 // RecentSuccessRatio returns the success ratio of the probe in the recent history.
 func (pb ProbeInfo) RecentSuccessRatio() float64 {
 	if len(pb.RecentResults) == 0 {
 		return 0
 	}
 	var sum int
 	for _, r := range pb.RecentResults {
 		if r {
 			sum++
 		}
 	}
 	return float64(sum) / float64(len(pb.RecentResults))
 }
 // RecentMedianLatency returns the median latency of the probe in the recent history.
 func (pb ProbeInfo) RecentMedianLatency() time.Duration {
 	if len(pb.RecentLatencies) == 0 {
 		return 0
 	}
 	return pb.RecentLatencies[len(pb.RecentLatencies)/2]
 }
 // ProbeInfo returns the state of all probes.
 func (p *Prober) ProbeInfo() map[string]ProbeInfo {
 	out := map[string]ProbeInfo{}
@ -352,26 +411,69 @@ func (p *Prober) ProbeInfo() map[string]ProbeInfo {
 		probes = append(probes, probe)
 	}
 	p.mu.Unlock()
 	for _, probe := range probes {
 		probe.mu.Lock()
-		inf := ProbeInfo{
+		out[probe.name] = probe.probeInfoLocked()
 			Start:  probe.start,
 			End:    probe.end,
 			Result: probe.succeeded,
 		}
 		if probe.lastErr != nil {
 			inf.Error = probe.lastErr.Error()
 		}
 		if probe.latency > 0 {
 			inf.Latency = probe.latency.String()
 		}
 		out[probe.name] = inf
 		probe.mu.Unlock()
 	}
 	return out
 }
 // probeInfoLocked returns the state of the probe.
 func (probe *Probe) probeInfoLocked() ProbeInfo {
 	inf := ProbeInfo{
 		Name:     probe.name,
 		Class:    probe.probeClass.Class,
 		Interval: probe.interval,
 		Labels:   probe.metricLabels,
 		Start:    probe.start,
 		End:      probe.end,
 		Result:   probe.succeeded,
 	}
 	if probe.lastErr != nil {
 		inf.Error = probe.lastErr.Error()
 	}
 	if probe.latency > 0 {
 		inf.Latency = probe.latency
 	}
 	probe.latencyHist.Do(func(v any) {
 		if l, ok := v.(time.Duration); ok {
 			inf.RecentLatencies = append(inf.RecentLatencies, l)
 		}
 	})
 	probe.successHist.Do(func(v any) {
 		if r, ok := v.(bool); ok {
 			inf.RecentResults = append(inf.RecentResults, r)
 		}
 	})
 	return inf
 }
 // RunHandler runs a probe by name and returns the result as an HTTP response.
 func (p *Prober) RunHandler(w http.ResponseWriter, r *http.Request) error {
 	// Look up prober by name.
 	name := r.FormValue("name")
 	if name == "" {
 		return tsweb.Error(http.StatusBadRequest, "missing name parameter", nil)
 	}
 	p.mu.Lock()
 	probe, ok := p.probes[name]
 	prevInfo := probe.probeInfoLocked()
 	p.mu.Unlock()
 	if !ok {
 		return tsweb.Error(http.StatusNotFound, fmt.Sprintf("unknown probe %q", name), nil)
 	}
 	info, err := probe.run()
 	stats := fmt.Sprintf("Previous runs: success rate %d%%, median latency %v",
 		int(prevInfo.RecentSuccessRatio()*100), prevInfo.RecentMedianLatency())
 	if err != nil {
 		return tsweb.Error(http.StatusFailedDependency, fmt.Sprintf("Probe failed: %s\n%s", err.Error(), stats), err)
 	}
 	w.WriteHeader(respStatus)
 	w.Write([]byte(fmt.Sprintf("Probe succeeded in %v\n%s", info.Latency, stats)))
 	return nil
 }
 // Describe implements prometheus.Collector.
 func (p *Probe) Describe(ch chan<- *prometheus.Desc) {
 	ch <- p.mInterval
--- a/prober/prober_test.go
+++ b/prober/prober_test.go
@ -13,6 +13,8 @@ import (
 	"testing"
 	"time"
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
 	"github.com/prometheus/client_golang/prometheus/testutil"
 	"tailscale.com/tstest"
 )
@ -292,6 +294,173 @@ func TestOnceMode(t *testing.T) {
 	}
 }
 func TestProberProbeInfo(t *testing.T) {
 	clk := newFakeTime()
 	p := newForTest(clk.Now, clk.NewTicker).WithOnce(true)
 	p.Run("probe1", probeInterval, nil, FuncProbe(func(context.Context) error {
 		clk.Advance(500 * time.Millisecond)
 		return nil
 	}))
 	p.Run("probe2", probeInterval, nil, FuncProbe(func(context.Context) error { return fmt.Errorf("error2") }))
 	p.Wait()
 	info := p.ProbeInfo()
 	wantInfo := map[string]ProbeInfo{
 		"probe1": {
 			Name:            "probe1",
 			Interval:        probeInterval,
 			Labels:          map[string]string{"class": "", "name": "probe1"},
 			Latency:         500 * time.Millisecond,
 			Result:          true,
 			RecentResults:   []bool{true},
 			RecentLatencies: []time.Duration{500 * time.Millisecond},
 		},
 		"probe2": {
 			Name:            "probe2",
 			Interval:        probeInterval,
 			Labels:          map[string]string{"class": "", "name": "probe2"},
 			Error:           "error2",
 			RecentResults:   []bool{false},
 			RecentLatencies: nil, // no latency for failed probes
 		},
 	}
 	if diff := cmp.Diff(wantInfo, info, cmpopts.IgnoreFields(ProbeInfo{}, "Start", "End")); diff != "" {
 		t.Fatalf("unexpected ProbeInfo (-want +got):\n%s", diff)
 	}
 }
 func TestProbeInfoRecent(t *testing.T) {
 	type probeResult struct {
 		latency time.Duration
 		err     error
 	}
 	tests := []struct {
 		name                    string
 		results                 []probeResult
 		wantProbeInfo           ProbeInfo
 		wantRecentSuccessRatio  float64
 		wantRecentMedianLatency time.Duration
 	}{
 		{
 			name:                    "no_runs",
 			wantProbeInfo:           ProbeInfo{},
 			wantRecentSuccessRatio:  0,
 			wantRecentMedianLatency: 0,
 		},
 		{
 			name:    "single_success",
 			results: []probeResult{{latency: 100 * time.Millisecond, err: nil}},
 			wantProbeInfo: ProbeInfo{
 				Latency:         100 * time.Millisecond,
 				Result:          true,
 				RecentResults:   []bool{true},
 				RecentLatencies: []time.Duration{100 * time.Millisecond},
 			},
 			wantRecentSuccessRatio:  1,
 			wantRecentMedianLatency: 100 * time.Millisecond,
 		},
 		{
 			name:    "single_failure",
 			results: []probeResult{{latency: 100 * time.Millisecond, err: errors.New("error123")}},
 			wantProbeInfo: ProbeInfo{
 				Result:          false,
 				RecentResults:   []bool{false},
 				RecentLatencies: nil,
 				Error:           "error123",
 			},
 			wantRecentSuccessRatio:  0,
 			wantRecentMedianLatency: 0,
 		},
 		{
 			name: "recent_mix",
 			results: []probeResult{
 				{latency: 10 * time.Millisecond, err: errors.New("error1")},
 				{latency: 20 * time.Millisecond, err: nil},
 				{latency: 30 * time.Millisecond, err: nil},
 				{latency: 40 * time.Millisecond, err: errors.New("error4")},
 				{latency: 50 * time.Millisecond, err: nil},
 				{latency: 60 * time.Millisecond, err: nil},
 				{latency: 70 * time.Millisecond, err: errors.New("error7")},
 				{latency: 80 * time.Millisecond, err: nil},
 			},
 			wantProbeInfo: ProbeInfo{
 				Result:        true,
 				Latency:       80 * time.Millisecond,
 				RecentResults: []bool{false, true, true, false, true, true, false, true},
 				RecentLatencies: []time.Duration{
 					20 * time.Millisecond,
 					30 * time.Millisecond,
 					50 * time.Millisecond,
 					60 * time.Millisecond,
 					80 * time.Millisecond,
 				},
 			},
 			wantRecentSuccessRatio:  0.625,
 			wantRecentMedianLatency: 50 * time.Millisecond,
 		},
 		{
 			name: "only_last_10",
 			results: []probeResult{
 				{latency: 10 * time.Millisecond, err: errors.New("old_error")},
 				{latency: 20 * time.Millisecond, err: nil},
 				{latency: 30 * time.Millisecond, err: nil},
 				{latency: 40 * time.Millisecond, err: nil},
 				{latency: 50 * time.Millisecond, err: nil},
 				{latency: 60 * time.Millisecond, err: nil},
 				{latency: 70 * time.Millisecond, err: nil},
 				{latency: 80 * time.Millisecond, err: nil},
 				{latency: 90 * time.Millisecond, err: nil},
 				{latency: 100 * time.Millisecond, err: nil},
 				{latency: 110 * time.Millisecond, err: nil},
 			},
 			wantProbeInfo: ProbeInfo{
 				Result:        true,
 				Latency:       110 * time.Millisecond,
 				RecentResults: []bool{true, true, true, true, true, true, true, true, true, true},
 				RecentLatencies: []time.Duration{
 					20 * time.Millisecond,
 					30 * time.Millisecond,
 					40 * time.Millisecond,
 					50 * time.Millisecond,
 					60 * time.Millisecond,
 					70 * time.Millisecond,
 					80 * time.Millisecond,
 					90 * time.Millisecond,
 					100 * time.Millisecond,
 					110 * time.Millisecond,
 				},
 			},
 			wantRecentSuccessRatio:  1,
 			wantRecentMedianLatency: 70 * time.Millisecond,
 		},
 	}
 	clk := newFakeTime()
 	p := newForTest(clk.Now, clk.NewTicker).WithOnce(true)
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			probe := newProbe(p, "", probeInterval, nil, FuncProbe(func(context.Context) error { return nil }))
 			for _, r := range tt.results {
 				probe.recordStart()
 				clk.Advance(r.latency)
 				probe.recordEnd(r.err)
 			}
 			info := probe.probeInfoLocked()
 			if diff := cmp.Diff(tt.wantProbeInfo, info, cmpopts.IgnoreFields(ProbeInfo{}, "Start", "End", "Interval")); diff != "" {
 				t.Fatalf("unexpected ProbeInfo (-want +got):\n%s", diff)
 			}
 			if got := info.RecentSuccessRatio(); got != tt.wantRecentSuccessRatio {
 				t.Errorf("recentSuccessRatio() = %v, want %v", got, tt.wantRecentSuccessRatio)
 			}
 			if got := info.RecentMedianLatency(); got != tt.wantRecentMedianLatency {
 				t.Errorf("recentMedianLatency() = %v, want %v", got, tt.wantRecentMedianLatency)
 			}
 		})
 	}
 }
 type fakeTicker struct {
 	ch       chan time.Time
 	interval time.Duration