From c76a6e5167d4f669a91818d502a642c1634251e7 Mon Sep 17 00:00:00 2001 From: Brad Fitzpatrick Date: Sun, 20 Oct 2024 13:22:31 -0700 Subject: [PATCH] derp: track client-advertised non-ideal DERP connections in more places In f77821fd63 (released in v1.72.0), we made the client tell a DERP server when the connection was not its ideal choice (the first node in its region). But we didn't do anything with that information until now. This adds a metric about how many such connections are on a given derper, and also adds a bit to the PeerPresentFlags bitmask so watchers can identify (and rebalance) them. Updates tailscale/corp#372 Change-Id: Ief8af448750aa6d598e5939a57c062f4e55962be Signed-off-by: Brad Fitzpatrick --- cmd/tailscale/depaware.txt | 2 +- derp/derp.go | 1 + derp/derp_server.go | 30 ++++++++++++++++++++++++++---- derp/derphttp/derphttp_client.go | 2 +- derp/derphttp/derphttp_server.go | 8 +++++++- 5 files changed, 36 insertions(+), 7 deletions(-) diff --git a/cmd/tailscale/depaware.txt b/cmd/tailscale/depaware.txt index de534df8d..765bbc483 100644 --- a/cmd/tailscale/depaware.txt +++ b/cmd/tailscale/depaware.txt @@ -155,7 +155,7 @@ tailscale.com/cmd/tailscale dependencies: (generated by github.com/tailscale/dep tailscale.com/util/clientmetric from tailscale.com/net/netcheck+ tailscale.com/util/cloudenv from tailscale.com/net/dnscache+ tailscale.com/util/cmpver from tailscale.com/net/tshttpproxy+ - tailscale.com/util/ctxkey from tailscale.com/types/logger + tailscale.com/util/ctxkey from tailscale.com/types/logger+ 💣 tailscale.com/util/deephash from tailscale.com/util/syspolicy/setting L 💣 tailscale.com/util/dirwalk from tailscale.com/metrics tailscale.com/util/dnsname from tailscale.com/cmd/tailscale/cli+ diff --git a/derp/derp.go b/derp/derp.go index f9b070647..878188cd2 100644 --- a/derp/derp.go +++ b/derp/derp.go @@ -147,6 +147,7 @@ const ( PeerPresentIsRegular = 1 << 0 PeerPresentIsMeshPeer = 1 << 1 PeerPresentIsProber = 1 << 2 + PeerPresentNotIdeal = 1 << 3 // client said derp server is not its Region.Nodes[0] ideal node ) var bin = binary.BigEndian diff --git a/derp/derp_server.go b/derp/derp_server.go index 2a0f1aa2a..ab0ab0a90 100644 --- a/derp/derp_server.go +++ b/derp/derp_server.go @@ -47,6 +47,7 @@ import ( "tailscale.com/tstime/rate" "tailscale.com/types/key" "tailscale.com/types/logger" + "tailscale.com/util/ctxkey" "tailscale.com/util/mak" "tailscale.com/util/set" "tailscale.com/util/slicesx" @@ -57,6 +58,16 @@ import ( // verbosely log whenever DERP drops a packet. var verboseDropKeys = map[key.NodePublic]bool{} +// IdealNodeHeader is the HTTP request header sent on DERP HTTP client requests +// to indicate that they're connecting to their ideal (Region.Nodes[0]) node. +// The HTTP header value is the name of the node they wish they were connected +// to. This is an optional header. +const IdealNodeHeader = "Ideal-Node" + +// IdealNodeContextKey is the context key used to pass the IdealNodeHeader value +// from the HTTP handler to the DERP server's Accept method. +var IdealNodeContextKey = ctxkey.New[string]("ideal-node", "") + func init() { keys := envknob.String("TS_DEBUG_VERBOSE_DROPS") if keys == "" { @@ -133,6 +144,7 @@ type Server struct { sentPong expvar.Int // number of pong frames enqueued to client accepts expvar.Int curClients expvar.Int + curClientsNotIdeal expvar.Int curHomeClients expvar.Int // ones with preferred dupClientKeys expvar.Int // current number of public keys we have 2+ connections for dupClientConns expvar.Int // current number of connections sharing a public key @@ -603,6 +615,9 @@ func (s *Server) registerClient(c *sclient) { } s.keyOfAddr[c.remoteIPPort] = c.key s.curClients.Add(1) + if c.isNotIdealConn { + s.curClientsNotIdeal.Add(1) + } s.broadcastPeerStateChangeLocked(c.key, c.remoteIPPort, c.presentFlags(), true) } @@ -693,6 +708,9 @@ func (s *Server) unregisterClient(c *sclient) { if c.preferred { s.curHomeClients.Add(-1) } + if c.isNotIdealConn { + s.curClientsNotIdeal.Add(-1) + } } // addPeerGoneFromRegionWatcher adds a function to be called when peer is gone @@ -809,8 +827,8 @@ func (s *Server) accept(ctx context.Context, nc Conn, brw *bufio.ReadWriter, rem return fmt.Errorf("receive client key: %v", err) } - clientAP, _ := netip.ParseAddrPort(remoteAddr) - if err := s.verifyClient(ctx, clientKey, clientInfo, clientAP.Addr()); err != nil { + remoteIPPort, _ := netip.ParseAddrPort(remoteAddr) + if err := s.verifyClient(ctx, clientKey, clientInfo, remoteIPPort.Addr()); err != nil { return fmt.Errorf("client %v rejected: %v", clientKey, err) } @@ -820,8 +838,6 @@ func (s *Server) accept(ctx context.Context, nc Conn, brw *bufio.ReadWriter, rem ctx, cancel := context.WithCancel(ctx) defer cancel() - remoteIPPort, _ := netip.ParseAddrPort(remoteAddr) - c := &sclient{ connNum: connNum, s: s, @@ -838,6 +854,7 @@ func (s *Server) accept(ctx context.Context, nc Conn, brw *bufio.ReadWriter, rem sendPongCh: make(chan [8]byte, 1), peerGone: make(chan peerGoneMsg), canMesh: s.isMeshPeer(clientInfo), + isNotIdealConn: IdealNodeContextKey.Value(ctx) != "", peerGoneLim: rate.NewLimiter(rate.Every(time.Second), 3), } @@ -1511,6 +1528,7 @@ type sclient struct { peerGone chan peerGoneMsg // write request that a peer is not at this server (not used by mesh peers) meshUpdate chan struct{} // write request to write peerStateChange canMesh bool // clientInfo had correct mesh token for inter-region routing + isNotIdealConn bool // client indicated it is not its ideal node in the region isDup atomic.Bool // whether more than 1 sclient for key is connected isDisabled atomic.Bool // whether sends to this peer are disabled due to active/active dups debug bool // turn on for verbose logging @@ -1546,6 +1564,9 @@ func (c *sclient) presentFlags() PeerPresentFlags { if c.canMesh { f |= PeerPresentIsMeshPeer } + if c.isNotIdealConn { + f |= PeerPresentNotIdeal + } if f == 0 { return PeerPresentIsRegular } @@ -2051,6 +2072,7 @@ func (s *Server) ExpVar() expvar.Var { m.Set("gauge_current_file_descriptors", expvar.Func(func() any { return metrics.CurrentFDs() })) m.Set("gauge_current_connections", &s.curClients) m.Set("gauge_current_home_connections", &s.curHomeClients) + m.Set("gauge_current_notideal_connections", &s.curClientsNotIdeal) m.Set("gauge_clients_total", expvar.Func(func() any { return len(s.clientsMesh) })) m.Set("gauge_clients_local", expvar.Func(func() any { return len(s.clients) })) m.Set("gauge_clients_remote", expvar.Func(func() any { return len(s.clientsMesh) - len(s.clients) })) diff --git a/derp/derphttp/derphttp_client.go b/derp/derphttp/derphttp_client.go index b8cce8cdc..b695a52a8 100644 --- a/derp/derphttp/derphttp_client.go +++ b/derp/derphttp/derphttp_client.go @@ -498,7 +498,7 @@ func (c *Client) connect(ctx context.Context, caller string) (client *derp.Clien req.Header.Set("Connection", "Upgrade") if !idealNodeInRegion && reg != nil { // This is purely informative for now (2024-07-06) for stats: - req.Header.Set("Ideal-Node", reg.Nodes[0].Name) + req.Header.Set(derp.IdealNodeHeader, reg.Nodes[0].Name) // TODO(bradfitz,raggi): start a time.AfterFunc for 30m-1h or so to // dialNode(reg.Nodes[0]) and see if we can even TCP connect to it. If // so, TLS handshake it as well (which is mixed up in this massive diff --git a/derp/derphttp/derphttp_server.go b/derp/derphttp/derphttp_server.go index 41ce86764..ed7d3d707 100644 --- a/derp/derphttp/derphttp_server.go +++ b/derp/derphttp/derphttp_server.go @@ -21,6 +21,8 @@ const fastStartHeader = "Derp-Fast-Start" // Handler returns an http.Handler to be mounted at /derp, serving s. func Handler(s *derp.Server) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + // These are installed both here and in cmd/derper. The check here // catches both cmd/derper run with DERP disabled (STUN only mode) as // well as DERP being run in tests with derphttp.Handler directly, @@ -66,7 +68,11 @@ func Handler(s *derp.Server) http.Handler { pubKey.UntypedHexString()) } - s.Accept(r.Context(), netConn, conn, netConn.RemoteAddr().String()) + if v := r.Header.Get(derp.IdealNodeHeader); v != "" { + ctx = derp.IdealNodeContextKey.WithValue(ctx, v) + } + + s.Accept(ctx, netConn, conn, netConn.RemoteAddr().String()) }) }