From 77832553e538618ad94c6fc33f32204c3dcb552d Mon Sep 17 00:00:00 2001 From: Kristoffer Dalby Date: Wed, 25 Sep 2024 16:50:34 +0200 Subject: [PATCH] ipn/ipnlocal: add advertised and primary route metrics Updates tailscale/corp#22075 Signed-off-by: Kristoffer Dalby --- ipn/ipnlocal/local.go | 29 ++++++++- tsnet/tsnet_test.go | 61 +++++++++++++++++++ tstest/integration/testcontrol/testcontrol.go | 2 + 3 files changed, 91 insertions(+), 1 deletion(-) diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index d9f36797d..6505edf1f 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -390,9 +390,18 @@ type updateStatus struct { } type metrics struct { - // advertisedRoutes is a metric that counts the number of network routes that are advertised by the local node. + // advertisedRoutes is a metric that reports the number of network routes that are advertised by the local node. // This informs the user of how many routes are being advertised by the local node, excluding exit routes. advertisedRoutes *usermetric.Gauge + + // approvedRoutes is a metric that reports the number of network routes served by the local node and approved + // by the control server. + approvedRoutes *usermetric.Gauge + + // primaryRoutes is a metric that reports the number of primary network routes served by the local node. + // A route being a primary route implies that the route is currently served by this node, and not by another + // subnet router in a high availability configuration. + primaryRoutes *usermetric.Gauge } // clientGen is a func that creates a control plane client. @@ -441,6 +450,10 @@ func NewLocalBackend(logf logger.Logf, logID logid.PublicID, sys *tsd.System, lo m := metrics{ advertisedRoutes: sys.UserMetricsRegistry().NewGauge( "tailscaled_advertised_routes", "Number of advertised network routes (e.g. by a subnet router)"), + approvedRoutes: sys.UserMetricsRegistry().NewGauge( + "tailscaled_approved_routes", "Number of approved network routes (e.g. 
by a subnet router)"), + primaryRoutes: sys.UserMetricsRegistry().NewGauge( + "tailscaled_primary_routes", "Number of network routes for which this node is a primary router (in high availability configuration)"), } b := &LocalBackend{ @@ -5388,6 +5401,11 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) { b.setTCPPortsInterceptedFromNetmapAndPrefsLocked(b.pm.CurrentPrefs()) if nm == nil { b.nodeByAddr = nil + + // If there is no netmap, the client is going into a "turned off" + // state so reset the metrics. + b.metrics.approvedRoutes.Set(0) + b.metrics.primaryRoutes.Set(0) return } @@ -5408,6 +5426,15 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) { } if nm.SelfNode.Valid() { addNode(nm.SelfNode) + + var approved float64 + for _, route := range nm.SelfNode.AllowedIPs().All() { + if !views.SliceContains(nm.SelfNode.Addresses(), route) && !tsaddr.IsExitRoute(route) { + approved++ + } + } + b.metrics.approvedRoutes.Set(approved) + b.metrics.primaryRoutes.Set(float64(tsaddr.WithoutExitRoute(nm.SelfNode.PrimaryRoutes()).Len())) } for _, p := range nm.Peers { addNode(p) diff --git a/tsnet/tsnet_test.go b/tsnet/tsnet_test.go index 96c60de47..b95061d38 100644 --- a/tsnet/tsnet_test.go +++ b/tsnet/tsnet_test.go @@ -26,6 +26,7 @@ import ( "os" "path/filepath" "reflect" + "runtime" "strings" "sync" "sync/atomic" @@ -924,6 +925,32 @@ func TestUserMetrics(t *testing.T) { s1.lb.DebugForceNetmapUpdate() s2.lb.DebugForceNetmapUpdate() + wantRoutes := float64(2) + if runtime.GOOS == "windows" { + wantRoutes = 0 + } + + // Wait for the routes to be propagated to node 1 to ensure + // that the metrics are up-to-date. 
+ waitForCondition(t, "primary routes available for node1", 90*time.Second, func() bool { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + status1, err := lc1.Status(ctx) + if err != nil { + t.Logf("getting status: %s", err) + return false + } + if runtime.GOOS == "windows" { + // Windows does not seem to support or report back routes when running in + // userspace via tsnet. So, we skip this check on Windows. + // TODO(kradalby): Figure out if this is correct. + return true + } + // Wait for the primary routes to reach our desired routes, which is wantRoutes + 1, because + // the PrimaryRoutes list will contain an exit node route, which the metric does not count. + return status1.Self.PrimaryRoutes != nil && status1.Self.PrimaryRoutes.Len() == int(wantRoutes)+1 + }) + ctxLc, cancelLc := context.WithTimeout(context.Background(), 5*time.Second) defer cancelLc() metrics1, err := lc1.UserMetrics(ctxLc) @@ -951,11 +978,25 @@ func TestUserMetrics(t *testing.T) { t.Errorf("metrics1, tailscaled_advertised_routes: got %v, want %v", got, want) } + // The control has approved 2 routes: + // - 192.0.2.0/24 + // - 192.0.5.1/32 + if got, want := parsedMetrics1["tailscaled_approved_routes"], wantRoutes; got != want { + t.Errorf("metrics1, tailscaled_approved_routes: got %v, want %v", got, want) + } + // Validate the health counter metric against the status of the node if got, want := parsedMetrics1[`tailscaled_health_messages{type="warning"}`], float64(len(status1.Health)); got != want { t.Errorf("metrics1, tailscaled_health_messages: got %v, want %v", got, want) } + // The node is the primary subnet router for 2 routes: + // - 192.0.2.0/24 + // - 192.0.5.1/32 + if got, want := parsedMetrics1["tailscaled_primary_routes"], wantRoutes; got != want { + t.Errorf("metrics1, tailscaled_primary_routes: got %v, want %v", got, want) + } + metrics2, err := lc2.UserMetrics(ctx) if err != nil { t.Fatal(err) @@ -978,8 +1019,28 @@ func 
TestUserMetrics(t *testing.T) { t.Errorf("metrics2, tailscaled_advertised_routes: got %v, want %v", got, want) } + // The control has approved 0 routes + if got, want := parsedMetrics2["tailscaled_approved_routes"], 0.0; got != want { + t.Errorf("metrics2, tailscaled_approved_routes: got %v, want %v", got, want) + } + // Validate the health counter metric against the status of the node if got, want := parsedMetrics2[`tailscaled_health_messages{type="warning"}`], float64(len(status2.Health)); got != want { t.Errorf("metrics2, tailscaled_health_messages: got %v, want %v", got, want) } + + // The node is the primary subnet router for 0 routes + if got, want := parsedMetrics2["tailscaled_primary_routes"], 0.0; got != want { + t.Errorf("metrics2, tailscaled_primary_routes: got %v, want %v", got, want) + } +} + +func waitForCondition(t *testing.T, msg string, waitTime time.Duration, f func() bool) { + t.Helper() + for deadline := time.Now().Add(waitTime); time.Now().Before(deadline); time.Sleep(1 * time.Second) { + if f() { + return + } + } + t.Fatalf("waiting for condition: %s", msg) } diff --git a/tstest/integration/testcontrol/testcontrol.go b/tstest/integration/testcontrol/testcontrol.go index 44ed2da06..bbcf277d1 100644 --- a/tstest/integration/testcontrol/testcontrol.go +++ b/tstest/integration/testcontrol/testcontrol.go @@ -366,6 +366,7 @@ func (s *Server) serveMachine(w http.ResponseWriter, r *http.Request) { func (s *Server) SetSubnetRoutes(nodeKey key.NodePublic, routes []netip.Prefix) { s.mu.Lock() defer s.mu.Unlock() + s.logf("Setting subnet routes for %s: %v", nodeKey.ShortString(), routes) mak.Set(&s.nodeSubnetRoutes, nodeKey, routes) } @@ -1018,6 +1019,7 @@ func (s *Server) MapResponse(req *tailcfg.MapRequest) (res *tailcfg.MapResponse, s.mu.Lock() defer s.mu.Unlock() + res.Node.PrimaryRoutes = s.nodeSubnetRoutes[nk] res.Node.AllowedIPs = append(res.Node.Addresses, s.nodeSubnetRoutes[nk]...) // Consume a PingRequest while protected by mutex if it exists