util/usermetrics: make usermetrics non-global

this commit changes usermetrics to be non-global, this is a building
block for correct metrics if a go process runs multiple tsnets or
in tests.

Updates #13420
Updates tailscale/corp#22075

Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>
pull/8070/merge
Kristoffer Dalby 2 months ago committed by Kristoffer Dalby
parent e1bbe1bf45
commit 0e0e53d3b3

@ -680,12 +680,15 @@ func tryEngine(logf logger.Logf, sys *tsd.System, name string) (onlyNetstack boo
ListenPort: args.port,
NetMon: sys.NetMon.Get(),
HealthTracker: sys.HealthTracker(),
Metrics: sys.UserMetricsRegistry(),
Dialer: sys.Dialer.Get(),
SetSubsystem: sys.Set,
ControlKnobs: sys.ControlKnobs(),
DriveForLocal: driveimpl.NewFileSystemForLocal(logf),
}
sys.HealthTracker().SetMetricsRegistry(sys.UserMetricsRegistry())
onlyNetstack = name == "userspace-networking"
netstackSubnetRouter := onlyNetstack // but mutated later on some platforms
netns.SetEnabled(!onlyNetstack)

@ -320,7 +320,7 @@ require (
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/polyfloyd/go-errorlint v1.4.1 // indirect
github.com/prometheus/client_model v0.5.0 // indirect
github.com/prometheus/client_model v0.5.0
github.com/prometheus/procfs v0.12.0 // indirect
github.com/quasilyte/go-ruleguard v0.3.19 // indirect
github.com/quasilyte/gogrep v0.5.0 // indirect

@ -20,6 +20,7 @@ import (
"time"
"tailscale.com/envknob"
"tailscale.com/metrics"
"tailscale.com/tailcfg"
"tailscale.com/types/opt"
"tailscale.com/util/cibuild"
@ -111,6 +112,7 @@ type Tracker struct {
lastLoginErr error
localLogConfigErr error
tlsConnectionErrors map[string]error // map[ServerName]error
metricHealthMessage *metrics.MultiLabelMap[metricHealthMessageLabel]
}
// Subsystem is the name of a subsystem whose health can be monitored.
@ -317,6 +319,33 @@ func (w *Warnable) IsVisible(ws *warningState) bool {
return time.Since(ws.BrokenSince) >= w.TimeToVisible
}
// SetMetricsRegistry sets up the metrics for the Tracker. It takes
// a usermetric.Registry and registers the metrics there.
func (t *Tracker) SetMetricsRegistry(reg *usermetric.Registry) {
if reg == nil || t.metricHealthMessage != nil {
return
}
t.metricHealthMessage = usermetric.NewMultiLabelMapWithRegistry[metricHealthMessageLabel](
reg,
"tailscaled_health_messages",
"gauge",
"Number of health messages broken down by type.",
)
t.metricHealthMessage.Set(metricHealthMessageLabel{
Type: "warning",
}, expvar.Func(func() any {
if t.nil() {
return 0
}
t.mu.Lock()
defer t.mu.Unlock()
t.updateBuiltinWarnablesLocked()
return int64(len(t.stringsLocked()))
}))
}
// SetUnhealthy sets a warningState for the given Warnable with the provided Args, and should be
// called when a Warnable becomes unhealthy, or its unhealthy status needs to be updated.
// SetUnhealthy takes ownership of args. The args can be nil if no additional information is
@ -1205,18 +1234,6 @@ func (t *Tracker) ReceiveFuncStats(which ReceiveFunc) *ReceiveFuncStats {
}
func (t *Tracker) doOnceInit() {
metricHealthMessage.Set(metricHealthMessageLabel{
Type: "warning",
}, expvar.Func(func() any {
if t.nil() {
return 0
}
t.mu.Lock()
defer t.mu.Unlock()
t.updateBuiltinWarnablesLocked()
return int64(len(t.stringsLocked()))
}))
for i := range t.MagicSockReceiveFuncs {
f := &t.MagicSockReceiveFuncs[i]
f.name = (ReceiveFunc(i)).String()
@ -1252,9 +1269,3 @@ type metricHealthMessageLabel struct {
// TODO: break down by warnable.severity as well?
Type string
}
var metricHealthMessage = usermetric.NewMultiLabelMap[metricHealthMessageLabel](
"tailscaled_health_messages",
"gauge",
"Number of health messages broken down by type.",
)

@ -119,9 +119,6 @@ import (
"tailscale.com/wgengine/wgcfg/nmcfg"
)
var metricAdvertisedRoutes = usermetric.NewGauge(
"tailscaled_advertised_routes", "Number of advertised network routes (e.g. by a subnet router)")
var controlDebugFlags = getControlDebugFlags()
func getControlDebugFlags() []string {
@ -184,6 +181,7 @@ type LocalBackend struct {
statsLogf logger.Logf // for printing peers stats on change
sys *tsd.System
health *health.Tracker // always non-nil
metrics metrics
e wgengine.Engine // non-nil; TODO(bradfitz): remove; use sys
store ipn.StateStore // non-nil; TODO(bradfitz): remove; use sys
dialer *tsdial.Dialer // non-nil; TODO(bradfitz): remove; use sys
@ -377,6 +375,11 @@ func (b *LocalBackend) HealthTracker() *health.Tracker {
return b.health
}
// UserMetricsRegistry returns the usermetrics registry for the backend
func (b *LocalBackend) UserMetricsRegistry() *usermetric.Registry {
return b.sys.UserMetricsRegistry()
}
// NetMon returns the network monitor for the backend.
func (b *LocalBackend) NetMon() *netmon.Monitor {
return b.sys.NetMon.Get()
@ -386,6 +389,12 @@ type updateStatus struct {
started bool
}
type metrics struct {
// advertisedRoutes is a metric that counts the number of network routes that are advertised by the local node.
// This informs the user of how many routes are being advertised by the local node, excluding exit routes.
advertisedRoutes *usermetric.Gauge
}
// clientGen is a func that creates a control plane client.
// It's the type used by LocalBackend.SetControlClientGetterForTesting.
type clientGen func(controlclient.Options) (controlclient.Client, error)
@ -429,6 +438,11 @@ func NewLocalBackend(logf logger.Logf, logID logid.PublicID, sys *tsd.System, lo
captiveCtx, captiveCancel := context.WithCancel(ctx)
captiveCancel()
m := metrics{
advertisedRoutes: sys.UserMetricsRegistry().NewGauge(
"tailscaled_advertised_routes", "Number of advertised network routes (e.g. by a subnet router)"),
}
b := &LocalBackend{
ctx: ctx,
ctxCancel: cancel,
@ -437,6 +451,7 @@ func NewLocalBackend(logf logger.Logf, logID logid.PublicID, sys *tsd.System, lo
statsLogf: logger.LogOnChange(logf, 5*time.Minute, clock.Now),
sys: sys,
health: sys.HealthTracker(),
metrics: m,
e: e,
dialer: dialer,
store: store,
@ -4760,7 +4775,7 @@ func (b *LocalBackend) applyPrefsToHostinfoLocked(hi *tailcfg.Hostinfo, prefs ip
routes++
}
}
metricAdvertisedRoutes.Set(float64(routes))
b.metrics.advertisedRoutes.Set(float64(routes))
var sshHostKeys []string
if prefs.RunSSH() && envknob.CanSSHD() {

@ -432,7 +432,7 @@ func newTestLocalBackend(t testing.TB) *LocalBackend {
sys := new(tsd.System)
store := new(mem.Store)
sys.Set(store)
eng, err := wgengine.NewFakeUserspaceEngine(logf, sys.Set, sys.HealthTracker())
eng, err := wgengine.NewFakeUserspaceEngine(logf, sys.Set, sys.HealthTracker(), sys.UserMetricsRegistry())
if err != nil {
t.Fatalf("NewFakeUserspaceEngine: %v", err)
}

@ -50,7 +50,7 @@ func TestLocalLogLines(t *testing.T) {
sys := new(tsd.System)
store := new(mem.Store)
sys.Set(store)
e, err := wgengine.NewFakeUserspaceEngine(logf, sys.Set, sys.HealthTracker())
e, err := wgengine.NewFakeUserspaceEngine(logf, sys.Set, sys.HealthTracker(), sys.UserMetricsRegistry())
if err != nil {
t.Fatal(err)
}

@ -35,6 +35,7 @@ import (
"tailscale.com/types/logger"
"tailscale.com/types/netmap"
"tailscale.com/util/must"
"tailscale.com/util/usermetric"
"tailscale.com/wgengine"
"tailscale.com/wgengine/filter"
)
@ -643,7 +644,8 @@ func TestPeerAPIReplyToDNSQueries(t *testing.T) {
h.remoteAddr = netip.MustParseAddrPort("100.150.151.152:12345")
ht := new(health.Tracker)
eng, _ := wgengine.NewFakeUserspaceEngine(logger.Discard, 0, ht)
reg := new(usermetric.Registry)
eng, _ := wgengine.NewFakeUserspaceEngine(logger.Discard, 0, ht, reg)
pm := must.Get(newProfileManager(new(mem.Store), t.Logf, ht))
h.ps = &peerAPIServer{
b: &LocalBackend{
@ -694,7 +696,8 @@ func TestPeerAPIPrettyReplyCNAME(t *testing.T) {
h.remoteAddr = netip.MustParseAddrPort("100.150.151.152:12345")
ht := new(health.Tracker)
eng, _ := wgengine.NewFakeUserspaceEngine(logger.Discard, 0, ht)
reg := new(usermetric.Registry)
eng, _ := wgengine.NewFakeUserspaceEngine(logger.Discard, 0, ht, reg)
pm := must.Get(newProfileManager(new(mem.Store), t.Logf, ht))
var a *appc.AppConnector
if shouldStore {
@ -767,7 +770,8 @@ func TestPeerAPIReplyToDNSQueriesAreObserved(t *testing.T) {
rc := &appctest.RouteCollector{}
ht := new(health.Tracker)
eng, _ := wgengine.NewFakeUserspaceEngine(logger.Discard, 0, ht)
reg := new(usermetric.Registry)
eng, _ := wgengine.NewFakeUserspaceEngine(logger.Discard, 0, ht, reg)
pm := must.Get(newProfileManager(new(mem.Store), t.Logf, ht))
var a *appc.AppConnector
if shouldStore {
@ -830,8 +834,9 @@ func TestPeerAPIReplyToDNSQueriesAreObservedWithCNAMEFlattening(t *testing.T) {
h.remoteAddr = netip.MustParseAddrPort("100.150.151.152:12345")
ht := new(health.Tracker)
reg := new(usermetric.Registry)
rc := &appctest.RouteCollector{}
eng, _ := wgengine.NewFakeUserspaceEngine(logger.Discard, 0, ht)
eng, _ := wgengine.NewFakeUserspaceEngine(logger.Discard, 0, ht, reg)
pm := must.Get(newProfileManager(new(mem.Store), t.Logf, ht))
var a *appc.AppConnector
if shouldStore {

@ -684,6 +684,7 @@ func newTestBackend(t *testing.T) *LocalBackend {
e, err := wgengine.NewUserspaceEngine(logf, wgengine.Config{
SetSubsystem: sys.Set,
HealthTracker: sys.HealthTracker(),
Metrics: sys.UserMetricsRegistry(),
})
if err != nil {
t.Fatal(err)

@ -298,7 +298,7 @@ func TestStateMachine(t *testing.T) {
sys := new(tsd.System)
store := new(testStateStorage)
sys.Set(store)
e, err := wgengine.NewFakeUserspaceEngine(logf, sys.Set, sys.HealthTracker())
e, err := wgengine.NewFakeUserspaceEngine(logf, sys.Set, sys.HealthTracker(), sys.UserMetricsRegistry())
if err != nil {
t.Fatalf("NewFakeUserspaceEngine: %v", err)
}
@ -931,7 +931,7 @@ func TestEditPrefsHasNoKeys(t *testing.T) {
logf := tstest.WhileTestRunningLogger(t)
sys := new(tsd.System)
sys.Set(new(mem.Store))
e, err := wgengine.NewFakeUserspaceEngine(logf, sys.Set, sys.HealthTracker())
e, err := wgengine.NewFakeUserspaceEngine(logf, sys.Set, sys.HealthTracker(), sys.UserMetricsRegistry())
if err != nil {
t.Fatalf("NewFakeUserspaceEngine: %v", err)
}

@ -64,7 +64,6 @@ import (
"tailscale.com/util/progresstracking"
"tailscale.com/util/rands"
"tailscale.com/util/testenv"
"tailscale.com/util/usermetric"
"tailscale.com/version"
"tailscale.com/wgengine/magicsock"
)
@ -581,7 +580,7 @@ func (h *Handler) serveUserMetrics(w http.ResponseWriter, r *http.Request) {
http.Error(w, "usermetrics debug flag not enabled", http.StatusForbidden)
return
}
usermetric.Handler(w, r)
h.b.UserMetricsRegistry().Handler(w, r)
}
func (h *Handler) serveDebug(w http.ResponseWriter, r *http.Request) {

@ -356,7 +356,7 @@ func newTestLocalBackend(t testing.TB) *ipnlocal.LocalBackend {
sys := new(tsd.System)
store := new(mem.Store)
sys.Set(store)
eng, err := wgengine.NewFakeUserspaceEngine(logf, sys.Set, sys.HealthTracker())
eng, err := wgengine.NewFakeUserspaceEngine(logf, sys.Set, sys.HealthTracker(), sys.UserMetricsRegistry())
if err != nil {
t.Fatalf("NewFakeUserspaceEngine: %v", err)
}

@ -97,7 +97,12 @@ type KeyValue[T comparable] struct {
}
func (v *MultiLabelMap[T]) String() string {
return `"MultiLabelMap"`
var sb strings.Builder
sb.WriteString("MultiLabelMap:\n")
v.Do(func(kv KeyValue[T]) {
fmt.Fprintf(&sb, "\t%v: %v\n", kv.Key, kv.Value)
})
return sb.String()
}
// WritePrometheus writes v to w in Prometheus exposition format.
@ -281,3 +286,16 @@ func (v *MultiLabelMap[T]) Do(f func(KeyValue[T])) {
f(KeyValue[T]{e.key, e.val})
}
}
// ResetAllForTest resets all values for metrics to zero.
// Should only be used in tests.
func (v *MultiLabelMap[T]) ResetAllForTest() {
v.Do(func(kv KeyValue[T]) {
switch v := kv.Value.(type) {
case *expvar.Int:
v.Set(0)
case *expvar.Float:
v.Set(0)
}
})
}

@ -24,6 +24,7 @@ import (
"go4.org/mem"
"gvisor.dev/gvisor/pkg/tcpip/stack"
"tailscale.com/disco"
tsmetrics "tailscale.com/metrics"
"tailscale.com/net/connstats"
"tailscale.com/net/packet"
"tailscale.com/net/packet/checksum"
@ -209,6 +210,30 @@ type Wrapper struct {
stats atomic.Pointer[connstats.Statistics]
captureHook syncs.AtomicValue[capture.Callback]
metrics *metrics
}
type metrics struct {
inboundDroppedPacketsTotal *tsmetrics.MultiLabelMap[dropPacketLabel]
outboundDroppedPacketsTotal *tsmetrics.MultiLabelMap[dropPacketLabel]
}
func registerMetrics(reg *usermetric.Registry) *metrics {
return &metrics{
inboundDroppedPacketsTotal: usermetric.NewMultiLabelMapWithRegistry[dropPacketLabel](
reg,
"tailscaled_inbound_dropped_packets_total",
"counter",
"Counts the number of dropped packets received by the node from other peers",
),
outboundDroppedPacketsTotal: usermetric.NewMultiLabelMapWithRegistry[dropPacketLabel](
reg,
"tailscaled_outbound_dropped_packets_total",
"counter",
"Counts the number of packets dropped while being sent to other peers",
),
}
}
// tunInjectedRead is an injected packet pretending to be a tun.Read().
@ -248,15 +273,15 @@ func (w *Wrapper) Start() {
close(w.startCh)
}
func WrapTAP(logf logger.Logf, tdev tun.Device) *Wrapper {
return wrap(logf, tdev, true)
func WrapTAP(logf logger.Logf, tdev tun.Device, m *usermetric.Registry) *Wrapper {
return wrap(logf, tdev, true, m)
}
func Wrap(logf logger.Logf, tdev tun.Device) *Wrapper {
return wrap(logf, tdev, false)
func Wrap(logf logger.Logf, tdev tun.Device, m *usermetric.Registry) *Wrapper {
return wrap(logf, tdev, false, m)
}
func wrap(logf logger.Logf, tdev tun.Device, isTAP bool) *Wrapper {
func wrap(logf logger.Logf, tdev tun.Device, isTAP bool, m *usermetric.Registry) *Wrapper {
logf = logger.WithPrefix(logf, "tstun: ")
w := &Wrapper{
logf: logf,
@ -274,6 +299,7 @@ func wrap(logf logger.Logf, tdev tun.Device, isTAP bool) *Wrapper {
// TODO(dmytro): (highly rate-limited) hexdumps should happen on unknown packets.
filterFlags: filter.LogAccepts | filter.LogDrops,
startCh: make(chan struct{}),
metrics: registerMetrics(m),
}
w.vectorBuffer = make([][]byte, tdev.BatchSize())
@ -872,7 +898,7 @@ func (t *Wrapper) filterPacketOutboundToWireGuard(p *packet.Parsed, pc *peerConf
if filt.RunOut(p, t.filterFlags) != filter.Accept {
metricPacketOutDropFilter.Add(1)
metricOutboundDroppedPacketsTotal.Add(dropPacketLabel{
t.metrics.outboundDroppedPacketsTotal.Add(dropPacketLabel{
Reason: DropReasonACL,
}, 1)
return filter.Drop, gro
@ -1144,7 +1170,7 @@ func (t *Wrapper) filterPacketInboundFromWireGuard(p *packet.Parsed, captHook ca
if outcome != filter.Accept {
metricPacketInDropFilter.Add(1)
metricInboundDroppedPacketsTotal.Add(dropPacketLabel{
t.metrics.inboundDroppedPacketsTotal.Add(dropPacketLabel{
Reason: DropReasonACL,
}, 1)
@ -1225,7 +1251,7 @@ func (t *Wrapper) Write(buffs [][]byte, offset int) (int, error) {
t.noteActivity()
_, err := t.tdevWrite(buffs, offset)
if err != nil {
metricInboundDroppedPacketsTotal.Add(dropPacketLabel{
t.metrics.inboundDroppedPacketsTotal.Add(dropPacketLabel{
Reason: DropReasonError,
}, int64(len(buffs)))
}
@ -1482,19 +1508,6 @@ type dropPacketLabel struct {
Reason DropReason
}
var (
metricInboundDroppedPacketsTotal = usermetric.NewMultiLabelMap[dropPacketLabel](
"tailscaled_inbound_dropped_packets_total",
"counter",
"Counts the number of dropped packets received by the node from other peers",
)
metricOutboundDroppedPacketsTotal = usermetric.NewMultiLabelMap[dropPacketLabel](
"tailscaled_outbound_dropped_packets_total",
"counter",
"Counts the number of packets dropped while being sent to other peers",
)
)
func (t *Wrapper) InstallCaptureHook(cb capture.Callback) {
t.captureHook.Store(cb)
}

@ -8,6 +8,7 @@ import (
"context"
"encoding/binary"
"encoding/hex"
"expvar"
"fmt"
"net/netip"
"reflect"
@ -38,6 +39,7 @@ import (
"tailscale.com/types/ptr"
"tailscale.com/types/views"
"tailscale.com/util/must"
"tailscale.com/util/usermetric"
"tailscale.com/wgengine/capture"
"tailscale.com/wgengine/filter"
"tailscale.com/wgengine/wgcfg"
@ -173,7 +175,8 @@ func setfilter(logf logger.Logf, tun *Wrapper) {
func newChannelTUN(logf logger.Logf, secure bool) (*tuntest.ChannelTUN, *Wrapper) {
chtun := tuntest.NewChannelTUN()
tun := Wrap(logf, chtun.TUN())
reg := new(usermetric.Registry)
tun := Wrap(logf, chtun.TUN(), reg)
if secure {
setfilter(logf, tun)
} else {
@ -185,7 +188,8 @@ func newChannelTUN(logf logger.Logf, secure bool) (*tuntest.ChannelTUN, *Wrapper
func newFakeTUN(logf logger.Logf, secure bool) (*fakeTUN, *Wrapper) {
ftun := NewFake()
tun := Wrap(logf, ftun)
reg := new(usermetric.Registry)
tun := Wrap(logf, ftun, reg)
if secure {
setfilter(logf, tun)
} else {
@ -315,15 +319,15 @@ func mustHexDecode(s string) []byte {
}
func TestFilter(t *testing.T) {
// Reset the metrics before test. These are global
// so the different tests might have affected them.
metricInboundDroppedPacketsTotal.SetInt(dropPacketLabel{Reason: DropReasonACL}, 0)
metricInboundDroppedPacketsTotal.SetInt(dropPacketLabel{Reason: DropReasonError}, 0)
metricOutboundDroppedPacketsTotal.SetInt(dropPacketLabel{Reason: DropReasonACL}, 0)
chtun, tun := newChannelTUN(t.Logf, true)
defer tun.Close()
// Reset the metrics before test. These are global
// so the different tests might have affected them.
tun.metrics.inboundDroppedPacketsTotal.ResetAllForTest()
tun.metrics.outboundDroppedPacketsTotal.ResetAllForTest()
type direction int
const (
@ -436,20 +440,26 @@ func TestFilter(t *testing.T) {
})
}
inACL := metricInboundDroppedPacketsTotal.Get(dropPacketLabel{Reason: DropReasonACL})
inError := metricInboundDroppedPacketsTotal.Get(dropPacketLabel{Reason: DropReasonError})
outACL := metricOutboundDroppedPacketsTotal.Get(dropPacketLabel{Reason: DropReasonACL})
assertMetricPackets(t, "inACL", "3", inACL.String())
assertMetricPackets(t, "inError", "0", inError.String())
assertMetricPackets(t, "outACL", "1", outACL.String())
var metricInboundDroppedPacketsACL, metricInboundDroppedPacketsErr, metricOutboundDroppedPacketsACL int64
if m, ok := tun.metrics.inboundDroppedPacketsTotal.Get(dropPacketLabel{Reason: DropReasonACL}).(*expvar.Int); ok {
metricInboundDroppedPacketsACL = m.Value()
}
if m, ok := tun.metrics.inboundDroppedPacketsTotal.Get(dropPacketLabel{Reason: DropReasonError}).(*expvar.Int); ok {
metricInboundDroppedPacketsErr = m.Value()
}
if m, ok := tun.metrics.outboundDroppedPacketsTotal.Get(dropPacketLabel{Reason: DropReasonACL}).(*expvar.Int); ok {
metricOutboundDroppedPacketsACL = m.Value()
}
assertMetricPackets(t, "inACL", 3, metricInboundDroppedPacketsACL)
assertMetricPackets(t, "inError", 0, metricInboundDroppedPacketsErr)
assertMetricPackets(t, "outACL", 1, metricOutboundDroppedPacketsACL)
}
func assertMetricPackets(t *testing.T, metricName, want, got string) {
func assertMetricPackets(t *testing.T, metricName string, want, got int64) {
t.Helper()
if want != got {
t.Errorf("%s got unexpected value, got %s, want %s", metricName, got, want)
t.Errorf("%s got unexpected value, got %d, want %d", metricName, got, want)
}
}
@ -512,6 +522,7 @@ func TestAtomic64Alignment(t *testing.T) {
}
func TestPeerAPIBypass(t *testing.T) {
reg := new(usermetric.Registry)
wrapperWithPeerAPI := &Wrapper{
PeerAPIPort: func(ip netip.Addr) (port uint16, ok bool) {
if ip == netip.MustParseAddr("100.64.1.2") {
@ -519,6 +530,7 @@ func TestPeerAPIBypass(t *testing.T) {
}
return
},
metrics: registerMetrics(reg),
}
tests := []struct {
@ -534,13 +546,16 @@ func TestPeerAPIBypass(t *testing.T) {
PeerAPIPort: func(netip.Addr) (port uint16, ok bool) {
return 60000, true
},
metrics: registerMetrics(reg),
},
pkt: tcp4syn("1.2.3.4", "100.64.1.2", 1234, 60000),
want: filter.Drop,
},
{
name: "reject_with_filter",
w: &Wrapper{},
w: &Wrapper{
metrics: registerMetrics(reg),
},
filter: filter.NewAllowNone(logger.Discard, new(netipx.IPSet)),
pkt: tcp4syn("1.2.3.4", "100.64.1.2", 1234, 60000),
want: filter.Drop,

@ -826,7 +826,7 @@ func TestSSHAuthFlow(t *testing.T) {
func TestSSH(t *testing.T) {
var logf logger.Logf = t.Logf
sys := &tsd.System{}
eng, err := wgengine.NewFakeUserspaceEngine(logf, sys.Set, sys.HealthTracker())
eng, err := wgengine.NewFakeUserspaceEngine(logf, sys.Set, sys.HealthTracker(), sys.UserMetricsRegistry())
if err != nil {
t.Fatal(err)
}

@ -32,6 +32,7 @@ import (
"tailscale.com/net/tstun"
"tailscale.com/proxymap"
"tailscale.com/types/netmap"
"tailscale.com/util/usermetric"
"tailscale.com/wgengine"
"tailscale.com/wgengine/magicsock"
"tailscale.com/wgengine/router"
@ -66,6 +67,7 @@ type System struct {
proxyMap proxymap.Mapper
healthTracker health.Tracker
userMetricsRegistry usermetric.Registry
}
// NetstackImpl is the interface that *netstack.Impl implements.
@ -142,6 +144,11 @@ func (s *System) HealthTracker() *health.Tracker {
return &s.healthTracker
}
// UserMetricsRegistry returns the system usermetrics.
func (s *System) UserMetricsRegistry() *usermetric.Registry {
return &s.userMetricsRegistry
}
// SubSystem represents some subsystem of the Tailscale node daemon.
//
// A subsystem can be set to a value, and then later retrieved. A subsystem

@ -536,12 +536,14 @@ func (s *Server) start() (reterr error) {
SetSubsystem: sys.Set,
ControlKnobs: sys.ControlKnobs(),
HealthTracker: sys.HealthTracker(),
Metrics: sys.UserMetricsRegistry(),
})
if err != nil {
return err
}
closePool.add(s.dialer)
sys.Set(eng)
sys.HealthTracker().SetMetricsRegistry(sys.UserMetricsRegistry())
// TODO(oxtoacart): do we need to support Taildrive on tsnet, and if so, how?
ns, err := netstack.Create(tsLogf, sys.Tun.Get(), eng, sys.MagicSock.Get(), s.dialer, sys.DNSManager.Get(), sys.ProxyMapper(), nil)

@ -5,6 +5,7 @@ package tsnet
import (
"bufio"
"bytes"
"context"
"crypto/ecdsa"
"crypto/elliptic"
@ -31,7 +32,8 @@ import (
"testing"
"time"
"github.com/google/go-cmp/cmp"
dto "github.com/prometheus/client_model/go"
"github.com/prometheus/common/expfmt"
"golang.org/x/net/proxy"
"tailscale.com/cmd/testwrapper/flakytest"
"tailscale.com/health"
@ -818,11 +820,6 @@ func TestUDPConn(t *testing.T) {
}
}
func TestUserMetrics(t *testing.T) {
tstest.ResourceCheck(t)
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
// testWarnable is a Warnable that is used within this package for testing purposes only.
var testWarnable = health.Register(&health.Warnable{
Code: "test-warnable-tsnet",
@ -833,50 +830,156 @@ func TestUserMetrics(t *testing.T) {
},
})
func parseMetrics(m []byte) (map[string]float64, error) {
metrics := make(map[string]float64)
var parser expfmt.TextParser
mf, err := parser.TextToMetricFamilies(bytes.NewReader(m))
if err != nil {
return nil, err
}
for _, f := range mf {
for _, ff := range f.Metric {
val := float64(0)
switch f.GetType() {
case dto.MetricType_COUNTER:
val = ff.GetCounter().GetValue()
case dto.MetricType_GAUGE:
val = ff.GetGauge().GetValue()
}
metrics[f.GetName()+promMetricLabelsStr(ff.GetLabel())] = val
}
}
return metrics, nil
}
func promMetricLabelsStr(labels []*dto.LabelPair) string {
if len(labels) == 0 {
return ""
}
var b strings.Builder
b.WriteString("{")
for i, l := range labels {
if i > 0 {
b.WriteString(",")
}
b.WriteString(fmt.Sprintf("%s=%q", l.GetName(), l.GetValue()))
}
b.WriteString("}")
return b.String()
}
func TestUserMetrics(t *testing.T) {
flakytest.Mark(t, "https://github.com/tailscale/tailscale/issues/13420")
tstest.ResourceCheck(t)
ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second)
defer cancel()
controlURL, c := startControl(t)
s1, _, s1PubKey := startServer(t, ctx, controlURL, "s1")
s1, s1ip, s1PubKey := startServer(t, ctx, controlURL, "s1")
s2, _, _ := startServer(t, ctx, controlURL, "s2")
s1.lb.EditPrefs(&ipn.MaskedPrefs{
Prefs: ipn.Prefs{
AdvertiseRoutes: []netip.Prefix{
netip.MustParsePrefix("192.0.2.0/24"),
netip.MustParsePrefix("192.0.3.0/24"),
netip.MustParsePrefix("192.0.5.1/32"),
netip.MustParsePrefix("0.0.0.0/0"),
},
},
AdvertiseRoutesSet: true,
})
c.SetSubnetRoutes(s1PubKey, []netip.Prefix{netip.MustParsePrefix("192.0.2.0/24")})
c.SetSubnetRoutes(s1PubKey, []netip.Prefix{
netip.MustParsePrefix("192.0.2.0/24"),
netip.MustParsePrefix("192.0.5.1/32"),
netip.MustParsePrefix("0.0.0.0/0"),
})
lc1, err := s1.LocalClient()
if err != nil {
t.Fatal(err)
}
lc2, err := s2.LocalClient()
if err != nil {
t.Fatal(err)
}
// ping to make sure the connection is up.
res, err := lc2.Ping(ctx, s1ip, tailcfg.PingICMP)
if err != nil {
t.Fatalf("pinging: %s", err)
}
t.Logf("ping success: %#+v", res)
ht := s1.lb.HealthTracker()
ht.SetUnhealthy(testWarnable, health.Args{"Text": "Hello world 1"})
metrics1, err := lc1.UserMetrics(ctx)
// Force an update to the netmap to ensure that the metrics are up-to-date.
s1.lb.DebugForceNetmapUpdate()
s2.lb.DebugForceNetmapUpdate()
ctxLc, cancelLc := context.WithTimeout(context.Background(), 5*time.Second)
defer cancelLc()
metrics1, err := lc1.UserMetrics(ctxLc)
if err != nil {
t.Fatal(err)
}
status1, err := lc1.Status(ctxLc)
if err != nil {
t.Fatal(err)
}
// Note that this test will check for two warnings because the health
// tracker will have two warnings: one from the testWarnable, added in
// this test, and one because we are running the dev/unstable version
// of tailscale.
want := `# TYPE tailscaled_advertised_routes gauge
# HELP tailscaled_advertised_routes Number of advertised network routes (e.g. by a subnet router)
tailscaled_advertised_routes 2
# TYPE tailscaled_health_messages gauge
# HELP tailscaled_health_messages Number of health messages broken down by type.
tailscaled_health_messages{type="warning"} 2
# TYPE tailscaled_inbound_dropped_packets_total counter
# HELP tailscaled_inbound_dropped_packets_total Counts the number of dropped packets received by the node from other peers
# TYPE tailscaled_outbound_dropped_packets_total counter
# HELP tailscaled_outbound_dropped_packets_total Counts the number of packets dropped while being sent to other peers
`
parsedMetrics1, err := parseMetrics(metrics1)
if err != nil {
t.Fatal(err)
}
t.Logf("Metrics1:\n%s\n", metrics1)
// The node is advertising 4 routes:
// - 192.0.2.0/24
// - 192.0.3.0/24
// - 192.0.5.1/32
if got, want := parsedMetrics1["tailscaled_advertised_routes"], 3.0; got != want {
t.Errorf("metrics1, tailscaled_advertised_routes: got %v, want %v", got, want)
}
// Validate the health counter metric against the status of the node
if got, want := parsedMetrics1[`tailscaled_health_messages{type="warning"}`], float64(len(status1.Health)); got != want {
t.Errorf("metrics1, tailscaled_health_messages: got %v, want %v", got, want)
}
metrics2, err := lc2.UserMetrics(ctx)
if err != nil {
t.Fatal(err)
}
status2, err := lc2.Status(ctx)
if err != nil {
t.Fatal(err)
}
parsedMetrics2, err := parseMetrics(metrics2)
if err != nil {
t.Fatal(err)
}
t.Logf("Metrics2:\n%s\n", metrics2)
// The node is advertising 0 routes
if got, want := parsedMetrics2["tailscaled_advertised_routes"], 0.0; got != want {
t.Errorf("metrics2, tailscaled_advertised_routes: got %v, want %v", got, want)
}
if diff := cmp.Diff(want, string(metrics1)); diff != "" {
t.Fatalf("unexpected metrics (-want +got):\n%s", diff)
// Validate the health counter metric against the status of the node
if got, want := parsedMetrics2[`tailscaled_health_messages{type="warning"}`], float64(len(status2.Health)); got != want {
t.Errorf("metrics2, tailscaled_health_messages: got %v, want %v", got, want)
}
}

@ -10,29 +10,33 @@ import (
"fmt"
"io"
"net/http"
"strings"
"tailscale.com/metrics"
"tailscale.com/tsweb/varz"
)
var vars expvar.Map
// Registry tracks user-facing metrics of various Tailscale subsystems.
type Registry struct {
vars expvar.Map
}
// NewMultiLabelMap creates and register a new
// NewMultiLabelMapWithRegistry creates and register a new
// MultiLabelMap[T] variable with the given name and returns it.
// The variable is registered with the userfacing metrics package.
//
// Note that usermetric are not protected against duplicate
// metrics name. It is the caller's responsibility to ensure that
// the name is unique.
func NewMultiLabelMap[T comparable](name string, promType, helpText string) *metrics.MultiLabelMap[T] {
m := &metrics.MultiLabelMap[T]{
func NewMultiLabelMapWithRegistry[T comparable](m *Registry, name string, promType, helpText string) *metrics.MultiLabelMap[T] {
ml := &metrics.MultiLabelMap[T]{
Type: promType,
Help: helpText,
}
var zero T
_ = metrics.LabelString(zero) // panic early if T is invalid
vars.Set(name, m)
return m
m.vars.Set(name, ml)
return ml
}
// Gauge is a gauge metric with no labels.
@ -42,20 +46,26 @@ type Gauge struct {
}
// NewGauge creates and register a new gauge metric with the given name and help text.
func NewGauge(name, help string) *Gauge {
func (r *Registry) NewGauge(name, help string) *Gauge {
g := &Gauge{&expvar.Float{}, help}
vars.Set(name, g)
r.vars.Set(name, g)
return g
}
// Set sets the gauge to the given value.
func (g *Gauge) Set(v float64) {
if g == nil {
return
}
g.m.Set(v)
}
// String returns the string of the underlying expvar.Float.
// This satisfies the expvar.Var interface.
func (g *Gauge) String() string {
if g == nil {
return ""
}
return g.m.String()
}
@ -79,6 +89,17 @@ func (g *Gauge) WritePrometheus(w io.Writer, name string) {
// Handler returns a varz.Handler that serves the userfacing expvar contained
// in this package.
func Handler(w http.ResponseWriter, r *http.Request) {
varz.ExpvarDoHandler(vars.Do)(w, r)
func (r *Registry) Handler(w http.ResponseWriter, req *http.Request) {
varz.ExpvarDoHandler(r.vars.Do)(w, req)
}
// String returns the string representation of all the metrics and their
// values in the registry. It is useful for debugging.
func (r *Registry) String() string {
var sb strings.Builder
r.vars.Do(func(kv expvar.KeyValue) {
fmt.Fprintf(&sb, "%s: %v\n", kv.Key, kv.Value)
})
return sb.String()
}

@ -9,7 +9,8 @@ import (
)
func TestGauge(t *testing.T) {
g := NewGauge("test_gauge", "This is a test gauge")
var reg Registry
g := reg.NewGauge("test_gauge", "This is a test gauge")
g.Set(15)
var buf bytes.Buffer

@ -60,6 +60,7 @@ import (
"tailscale.com/util/set"
"tailscale.com/util/testenv"
"tailscale.com/util/uniq"
"tailscale.com/util/usermetric"
"tailscale.com/wgengine/capture"
"tailscale.com/wgengine/wgint"
)
@ -386,6 +387,9 @@ type Options struct {
// report errors and warnings to.
HealthTracker *health.Tracker
// Metrics specifies the metrics registry to record metrics to.
Metrics *usermetric.Registry
// ControlKnobs are the set of control knobs to use.
// If nil, they're ignored and not updated.
ControlKnobs *controlknobs.Knobs

@ -64,6 +64,7 @@ import (
"tailscale.com/util/cibuild"
"tailscale.com/util/racebuild"
"tailscale.com/util/set"
"tailscale.com/util/usermetric"
"tailscale.com/wgengine/filter"
"tailscale.com/wgengine/wgcfg"
"tailscale.com/wgengine/wgcfg/nmcfg"
@ -156,6 +157,7 @@ type magicStack struct {
dev *device.Device // the wireguard-go Device that connects the previous things
wgLogger *wglog.Logger // wireguard-go log wrapper
netMon *netmon.Monitor // always non-nil
metrics *usermetric.Registry
}
// newMagicStack builds and initializes an idle magicsock and
@ -174,9 +176,11 @@ func newMagicStackWithKey(t testing.TB, logf logger.Logf, l nettype.PacketListen
t.Fatalf("netmon.New: %v", err)
}
var reg usermetric.Registry
epCh := make(chan []tailcfg.Endpoint, 100) // arbitrary
conn, err := NewConn(Options{
NetMon: netMon,
Metrics: &reg,
Logf: logf,
DisablePortMapper: true,
TestOnlyPacketListener: l,
@ -193,7 +197,7 @@ func newMagicStackWithKey(t testing.TB, logf logger.Logf, l nettype.PacketListen
}
tun := tuntest.NewChannelTUN()
tsTun := tstun.Wrap(logf, tun.TUN())
tsTun := tstun.Wrap(logf, tun.TUN(), &reg)
tsTun.SetFilter(filter.NewAllowAllForTest(logf))
tsTun.Start()
@ -219,6 +223,7 @@ func newMagicStackWithKey(t testing.TB, logf logger.Logf, l nettype.PacketListen
dev: dev,
wgLogger: wgLogger,
netMon: netMon,
metrics: &reg,
}
}
@ -397,6 +402,7 @@ func TestNewConn(t *testing.T) {
EndpointsFunc: epFunc,
Logf: t.Logf,
NetMon: netMon,
Metrics: new(usermetric.Registry),
})
if err != nil {
t.Fatal(err)
@ -523,6 +529,7 @@ func TestDeviceStartStop(t *testing.T) {
EndpointsFunc: func(eps []tailcfg.Endpoint) {},
Logf: t.Logf,
NetMon: netMon,
Metrics: new(usermetric.Registry),
})
if err != nil {
t.Fatal(err)
@ -1275,6 +1282,7 @@ func newTestConn(t testing.TB) *Conn {
conn, err := NewConn(Options{
NetMon: netMon,
HealthTracker: new(health.Tracker),
Metrics: new(usermetric.Registry),
DisablePortMapper: true,
Logf: t.Logf,
Port: port,

@ -50,6 +50,7 @@ func TestInjectInboundLeak(t *testing.T) {
Dialer: dialer,
SetSubsystem: sys.Set,
HealthTracker: sys.HealthTracker(),
Metrics: sys.UserMetricsRegistry(),
})
if err != nil {
t.Fatal(err)
@ -107,6 +108,7 @@ func makeNetstack(tb testing.TB, config func(*Impl)) *Impl {
Dialer: dialer,
SetSubsystem: sys.Set,
HealthTracker: sys.HealthTracker(),
Metrics: sys.UserMetricsRegistry(),
})
if err != nil {
tb.Fatal(err)

@ -49,6 +49,7 @@ import (
"tailscale.com/util/mak"
"tailscale.com/util/set"
"tailscale.com/util/testenv"
"tailscale.com/util/usermetric"
"tailscale.com/version"
"tailscale.com/wgengine/capture"
"tailscale.com/wgengine/filter"
@ -195,6 +196,9 @@ type Config struct {
// HealthTracker, if non-nil, is the health tracker to use.
HealthTracker *health.Tracker
// Metrics, if non-nil, is the usermetrics registry to use.
Metrics *usermetric.Registry
// Dialer is the dialer to use for outbound connections.
// If nil, a new Dialer is created.
Dialer *tsdial.Dialer
@ -249,6 +253,8 @@ func NewFakeUserspaceEngine(logf logger.Logf, opts ...any) (Engine, error) {
conf.ControlKnobs = v
case *health.Tracker:
conf.HealthTracker = v
case *usermetric.Registry:
conf.Metrics = v
default:
return nil, fmt.Errorf("unknown option type %T", v)
}
@ -289,9 +295,9 @@ func NewUserspaceEngine(logf logger.Logf, conf Config) (_ Engine, reterr error)
var tsTUNDev *tstun.Wrapper
if conf.IsTAP {
tsTUNDev = tstun.WrapTAP(logf, conf.Tun)
tsTUNDev = tstun.WrapTAP(logf, conf.Tun, conf.Metrics)
} else {
tsTUNDev = tstun.Wrap(logf, conf.Tun)
tsTUNDev = tstun.Wrap(logf, conf.Tun, conf.Metrics)
}
closePool.add(tsTUNDev)
@ -387,6 +393,7 @@ func NewUserspaceEngine(logf logger.Logf, conf Config) (_ Engine, reterr error)
NoteRecvActivity: e.noteRecvActivity,
NetMon: e.netMon,
HealthTracker: e.health,
Metrics: conf.Metrics,
ControlKnobs: conf.ControlKnobs,
OnPortUpdate: onPortUpdate,
PeerByKeyFunc: e.PeerByKey,

@ -22,6 +22,7 @@ func TestIsNetstack(t *testing.T) {
wgengine.Config{
SetSubsystem: sys.Set,
HealthTracker: sys.HealthTracker(),
Metrics: sys.UserMetricsRegistry(),
},
)
if err != nil {
@ -72,6 +73,7 @@ func TestIsNetstackRouter(t *testing.T) {
conf := tt.conf
conf.SetSubsystem = sys.Set
conf.HealthTracker = sys.HealthTracker()
conf.Metrics = sys.UserMetricsRegistry()
e, err := wgengine.NewUserspaceEngine(logger.Discard, conf)
if err != nil {
t.Fatal(err)

@ -25,6 +25,7 @@ import (
"tailscale.com/types/key"
"tailscale.com/types/netmap"
"tailscale.com/types/opt"
"tailscale.com/util/usermetric"
"tailscale.com/wgengine/router"
"tailscale.com/wgengine/wgcfg"
)
@ -100,7 +101,8 @@ func nodeViews(v []*tailcfg.Node) []tailcfg.NodeView {
func TestUserspaceEngineReconfig(t *testing.T) {
ht := new(health.Tracker)
e, err := NewFakeUserspaceEngine(t.Logf, 0, ht)
reg := new(usermetric.Registry)
e, err := NewFakeUserspaceEngine(t.Logf, 0, ht, reg)
if err != nil {
t.Fatal(err)
}
@ -167,9 +169,10 @@ func TestUserspaceEnginePortReconfig(t *testing.T) {
// Keep making a wgengine until we find an unused port
var ue *userspaceEngine
ht := new(health.Tracker)
reg := new(usermetric.Registry)
for i := range 100 {
attempt := uint16(defaultPort + i)
e, err := NewFakeUserspaceEngine(t.Logf, attempt, &knobs, ht)
e, err := NewFakeUserspaceEngine(t.Logf, attempt, &knobs, ht, reg)
if err != nil {
t.Fatal(err)
}
@ -249,7 +252,8 @@ func TestUserspaceEnginePeerMTUReconfig(t *testing.T) {
var knobs controlknobs.Knobs
ht := new(health.Tracker)
e, err := NewFakeUserspaceEngine(t.Logf, 0, &knobs, ht)
reg := new(usermetric.Registry)
e, err := NewFakeUserspaceEngine(t.Logf, 0, &knobs, ht, reg)
if err != nil {
t.Fatal(err)
}

@ -9,6 +9,7 @@ import (
"time"
"tailscale.com/health"
"tailscale.com/util/usermetric"
)
func TestWatchdog(t *testing.T) {
@ -24,7 +25,8 @@ func TestWatchdog(t *testing.T) {
t.Run("default watchdog does not fire", func(t *testing.T) {
t.Parallel()
ht := new(health.Tracker)
e, err := NewFakeUserspaceEngine(t.Logf, 0, ht)
reg := new(usermetric.Registry)
e, err := NewFakeUserspaceEngine(t.Logf, 0, ht, reg)
if err != nil {
t.Fatal(err)
}

Loading…
Cancel
Save