From bb60da276468a18b5159598f09649289ad5471c3 Mon Sep 17 00:00:00 2001
From: Jordan Whited
Date: Fri, 18 Oct 2024 10:53:49 -0700
Subject: [PATCH] derp: add sclient write deadline timeout metric (#13831)

Write timeouts can be indicative of stalled TCP streams. Understanding
changes in the rate of such events can be helpful in an ops context.

Updates tailscale/corp#23668

Signed-off-by: Jordan Whited
---
 derp/derp_server.go | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/derp/derp_server.go b/derp/derp_server.go
index 94d2263f4..2a0f1aa2a 100644
--- a/derp/derp_server.go
+++ b/derp/derp_server.go
@@ -26,6 +26,7 @@ import (
 	"net"
 	"net/http"
 	"net/netip"
+	"os"
 	"os/exec"
 	"runtime"
 	"strconv"
@@ -142,6 +143,7 @@ type Server struct {
 	multiForwarderCreated expvar.Int
 	multiForwarderDeleted expvar.Int
 	removePktForwardOther expvar.Int
+	sclientWriteTimeouts expvar.Int
 	avgQueueDuration *uint64 // In milliseconds; accessed atomically
 	tcpRtt metrics.LabelMap // histogram
 	meshUpdateBatchSize *metrics.Histogram
@@ -882,6 +884,9 @@ func (c *sclient) run(ctx context.Context) error {
 			if errors.Is(err, context.Canceled) {
 				c.debugLogf("sender canceled by reader exiting")
 			} else {
+				if errors.Is(err, os.ErrDeadlineExceeded) {
+					c.s.sclientWriteTimeouts.Add(1)
+				}
 				c.logf("sender failed: %v", err)
 			}
 		}
@@ -2073,6 +2078,7 @@ func (s *Server) ExpVar() expvar.Var {
 	m.Set("multiforwarder_created", &s.multiForwarderCreated)
 	m.Set("multiforwarder_deleted", &s.multiForwarderDeleted)
 	m.Set("packet_forwarder_delete_other_value", &s.removePktForwardOther)
+	m.Set("sclient_write_timeouts", &s.sclientWriteTimeouts)
 	m.Set("average_queue_duration_ms", expvar.Func(func() any {
 		return math.Float64frombits(atomic.LoadUint64(s.avgQueueDuration))
 	}))