From 1841d0bf981f9c387d272477b3f6fd54fbc2187a Mon Sep 17 00:00:00 2001 From: Brad Fitzpatrick Date: Mon, 3 Oct 2022 20:39:45 -0700 Subject: [PATCH] wgengine/magicsock: make debug-level stuff not logged by default And add a CLI/localapi and c2n mechanism to enable it for a fixed amount of time. Updates #1548 Change-Id: I71674aaf959a9c6761ff33bbf4a417ffd42195a7 Signed-off-by: Brad Fitzpatrick --- client/tailscale/localclient.go | 22 +++++++ cmd/tailscale/cli/debug.go | 33 +++++++++++ ipn/backend.go | 5 ++ ipn/ipnlocal/c2n.go | 17 ++++++ ipn/ipnlocal/local.go | 85 +++++++++++++++++++++++++++ ipn/localapi/localapi.go | 20 +++++++ ipn/store.go | 18 +++++- tailcfg/tailcfg.go | 3 +- wgengine/magicsock/magicsock.go | 55 +++++++++++------ wgengine/magicsock/magicsock_linux.go | 4 +- 10 files changed, 241 insertions(+), 21 deletions(-) diff --git a/client/tailscale/localclient.go b/client/tailscale/localclient.go index da64dcbb9..01b7a08fd 100644 --- a/client/tailscale/localclient.go +++ b/client/tailscale/localclient.go @@ -318,6 +318,28 @@ func (lc *LocalClient) DebugAction(ctx context.Context, action string) error { return nil } +// SetComponentDebugLogging sets component's debug logging enabled for +// the provided duration. If the duration is in the past, the debug logging +// is disabled. +func (lc *LocalClient) SetComponentDebugLogging(ctx context.Context, component string, d time.Duration) error { + body, err := lc.send(ctx, "POST", + fmt.Sprintf("/localapi/v0/component-debug-logging?component=%s&secs=%d", + url.QueryEscape(component), int64(d.Seconds())), 200, nil) + if err != nil { + return fmt.Errorf("error %w: %s", err, body) + } + var res struct { + Error string + } + if err := json.Unmarshal(body, &res); err != nil { + return err + } + if res.Error != "" { + return errors.New(res.Error) + } + return nil +} + // Status returns the Tailscale daemon's status. func Status(ctx context.Context) (*ipnstate.Status, error) { return defaultLocalClient.Status(ctx) diff --git a/cmd/tailscale/cli/debug.go b/cmd/tailscale/cli/debug.go index 0d4518efe..0ad9f034b 100644 --- a/cmd/tailscale/cli/debug.go +++ b/cmd/tailscale/cli/debug.go @@ -53,6 +53,16 @@ var debugCmd = &ffcli.Command{ Exec: runDERPMap, ShortHelp: "print DERP map", }, + { + Name: "component-logs", + Exec: runDebugComponentLogs, + ShortHelp: "enable/disable debug logs for a component", + FlagSet: (func() *flag.FlagSet { + fs := newFlagSet("component-logs") + fs.DurationVar(&debugComponentLogsArgs.forDur, "for", time.Hour, "how long to enable debug logs for; zero or negative means to disable") + return fs + })(), + }, { Name: "daemon-goroutines", Exec: runDaemonGoroutines, @@ -513,3 +523,26 @@ func runTS2021(ctx context.Context, args []string) error { log.Printf("final underlying conn: %v / %v", conn.LocalAddr(), conn.RemoteAddr()) return nil } + +var debugComponentLogsArgs struct { + forDur time.Duration +} + +func runDebugComponentLogs(ctx context.Context, args []string) error { + if len(args) != 1 { + return errors.New("usage: debug component-logs ") + } + component := args[0] + dur := debugComponentLogsArgs.forDur + + err := localClient.SetComponentDebugLogging(ctx, component, dur) + if err != nil { + return err + } + if debugComponentLogsArgs.forDur <= 0 { + fmt.Printf("Disabled debug logs for component %q\n", component) + } else { + fmt.Printf("Enabled debug logs for component %q for %v\n", component, dur) + } + return nil +} diff --git a/ipn/backend.go b/ipn/backend.go index 917123b3a..5e907d250 100644 --- a/ipn/backend.go +++ b/ipn/backend.go @@ -168,6 +168,11 @@ type PartialFile struct { // LocalBackend.userID, a string like "user-$USER_ID" (used in // server mode). // - on Linux/etc, it's always "_daemon" (ipn.GlobalDaemonStateKey) +// +// Additionally, the StateKey can be debug setting name: +// +// - "_debug_magicsock_until" with value being a unix timestamp stringified +// - "_debug__until" with value being a unix timestamp stringified type StateKey string type Options struct { diff --git a/ipn/ipnlocal/c2n.go b/ipn/ipnlocal/c2n.go index 596f1ad04..0302ff156 100644 --- a/ipn/ipnlocal/c2n.go +++ b/ipn/ipnlocal/c2n.go @@ -8,6 +8,8 @@ import ( "encoding/json" "io" "net/http" + "strconv" + "time" "tailscale.com/tailcfg" "tailscale.com/util/clientmetric" @@ -32,6 +34,21 @@ func (b *LocalBackend) handleC2N(w http.ResponseWriter, r *http.Request) { case "/debug/metrics": w.Header().Set("Content-Type", "text/plain") clientmetric.WritePrometheusExpositionFormat(w) + case "/debug/component-logging": + component := r.FormValue("component") + secs, _ := strconv.Atoi(r.FormValue("secs")) + if secs == 0 { + secs -= 1 + } + until := time.Now().Add(time.Duration(secs) * time.Second) + err := b.SetComponentDebugLogging(component, until) + var res struct { + Error string `json:",omitempty"` + } + if err != nil { + res.Error = err.Error() + } + writeJSON(res) case "/ssh/usernames": var req tailcfg.C2NSSHUsernamesRequest if r.Method == "POST" { diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index f0436615e..51bbf32b8 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -24,6 +24,7 @@ import ( "time" "go4.org/netipx" + "golang.org/x/exp/slices" "tailscale.com/client/tailscale/apitype" "tailscale.com/control/controlclient" "tailscale.com/doctor" @@ -55,6 +56,7 @@ import ( "tailscale.com/types/views" "tailscale.com/util/deephash" "tailscale.com/util/dnsname" + "tailscale.com/util/mak" "tailscale.com/util/multierr" "tailscale.com/util/osshare" "tailscale.com/util/systemd" @@ -186,6 +188,7 @@ type LocalBackend struct { // *.partial file to its final name on completion. directFileRoot string directFileDoFinalRename bool // false on macOS, true on several NAS platforms + componentLogUntil map[string]componentLogState // statusLock must be held before calling statusChanged.Wait() or // statusChanged.Broadcast(). @@ -267,9 +270,91 @@ func NewLocalBackend(logf logger.Logf, logid string, store ipn.StateStore, diale b.logf("[unexpected] failed to wire up peer API port for engine %T", e) } + for _, component := range debuggableComponents { + key := componentStateKey(component) + if ut, err := ipn.ReadStoreInt(store, key); err == nil { + if until := time.Unix(ut, 0); until.After(time.Now()) { + // conditional to avoid log spam at start when off + b.SetComponentDebugLogging(component, until) + } + } + } + return b, nil } +type componentLogState struct { + until time.Time + timer *time.Timer // if non-nil, the AfterFunc to disable it +} + +var debuggableComponents = []string{ + "magicsock", +} + +func componentStateKey(component string) ipn.StateKey { + return ipn.StateKey("_debug_" + component + "_until") +} + +// SetComponentDebugLogging sets component's debug logging enabled until the until time. +// If until is in the past, the component's debug logging is disabled. +// +// The following components are recognized: +// +// - magicsock +func (b *LocalBackend) SetComponentDebugLogging(component string, until time.Time) error { + b.mu.Lock() + defer b.mu.Unlock() + + var setEnabled func(bool) + switch component { + case "magicsock": + mc, err := b.magicConn() + if err != nil { + return err + } + setEnabled = mc.SetDebugLoggingEnabled + } + if setEnabled == nil || !slices.Contains(debuggableComponents, component) { + return fmt.Errorf("unknown component %q", component) + } + timeUnixOrZero := func(t time.Time) int64 { + if t.IsZero() { + return 0 + } + return t.Unix() + } + ipn.PutStoreInt(b.store, componentStateKey(component), timeUnixOrZero(until)) + now := time.Now() + on := now.Before(until) + setEnabled(on) + var onFor time.Duration + if on { + onFor = until.Sub(now) + b.logf("debugging logging for component %q enabled for %v (until %v)", component, onFor.Round(time.Second), until.UTC().Format(time.RFC3339)) + } else { + b.logf("debugging logging for component %q disabled", component) + } + if oldSt, ok := b.componentLogUntil[component]; ok && oldSt.timer != nil { + oldSt.timer.Stop() + } + newSt := componentLogState{until: until} + if on { + newSt.timer = time.AfterFunc(onFor, func() { + // Turn off logging after the timer fires, as long as the state is + // unchanged when the timer actually fires. + b.mu.Lock() + defer b.mu.Unlock() + if ls := b.componentLogUntil[component]; ls.until == until { + setEnabled(false) + b.logf("debugging logging for component %q disabled (by timer)", component) + } + }) + } + mak.Set(&b.componentLogUntil, component, newSt) + return nil +} + // Dialer returns the backend's dialer. func (b *LocalBackend) Dialer() *tsdial.Dialer { return b.dialer diff --git a/ipn/localapi/localapi.go b/ipn/localapi/localapi.go index a29e18270..446b1df40 100644 --- a/ipn/localapi/localapi.go +++ b/ipn/localapi/localapi.go @@ -146,6 +146,8 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) { h.serveMetrics(w, r) case "/localapi/v0/debug": h.serveDebug(w, r) + case "/localapi/v0/component-debug-logging": + h.serveComponentDebugLogging(w, r) case "/localapi/v0/set-expiry-sooner": h.serveSetExpirySooner(w, r) case "/localapi/v0/dial": @@ -329,6 +331,24 @@ func (h *Handler) serveDebug(w http.ResponseWriter, r *http.Request) { io.WriteString(w, "done\n") } +func (h *Handler) serveComponentDebugLogging(w http.ResponseWriter, r *http.Request) { + if !h.PermitWrite { + http.Error(w, "debug access denied", http.StatusForbidden) + return + } + component := r.FormValue("component") + secs, _ := strconv.Atoi(r.FormValue("secs")) + err := h.b.SetComponentDebugLogging(component, time.Now().Add(time.Duration(secs)*time.Second)) + var res struct { + Error string + } + if err != nil { + res.Error = err.Error() + } + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(res) +} + // serveProfileFunc is the implementation of Handler.serveProfile, after auth, // for platforms where we want to link it in. var serveProfileFunc func(http.ResponseWriter, *http.Request) diff --git a/ipn/store.go b/ipn/store.go index 46df80d09..b2f9f82d2 100644 --- a/ipn/store.go +++ b/ipn/store.go @@ -6,6 +6,8 @@ package ipn import ( "errors" + "fmt" + "strconv" ) // ErrStateNotExist is returned by StateStore.ReadState when the @@ -35,7 +37,7 @@ const ( // StateKey "user-1234". ServerModeStartKey = StateKey("server-mode-start-key") - // NLKeyStateKey is the key under which we store the nodes' + // NLKeyStateKey is the key under which we store the node's // network-lock node key, in its key.NLPrivate.MarshalText representation. NLKeyStateKey = StateKey("_nl-node-key") ) @@ -48,3 +50,17 @@ type StateStore interface { // WriteState saves bs as the state associated with ID. WriteState(id StateKey, bs []byte) error } + +// ReadStoreInt reads an integer from a StateStore. +func ReadStoreInt(store StateStore, id StateKey) (int64, error) { + v, err := store.ReadState(id) + if err != nil { + return 0, err + } + return strconv.ParseInt(string(v), 10, 64) +} + +// PutStoreInt puts an integer into a StateStore. +func PutStoreInt(store StateStore, id StateKey, val int64) error { + return store.WriteState(id, fmt.Appendf(nil, "%d", val)) +} diff --git a/tailcfg/tailcfg.go b/tailcfg/tailcfg.go index 67720a66f..dc4df564a 100644 --- a/tailcfg/tailcfg.go +++ b/tailcfg/tailcfg.go @@ -82,7 +82,8 @@ type CapabilityVersion int // - 43: 2022-09-21: clients can return usernames for SSH // - 44: 2022-09-22: MapResponse.ControlDialPlan // - 45: 2022-09-26: c2n /debug/{goroutines,prefs,metrics} -const CurrentCapabilityVersion CapabilityVersion = 45 +// - 46: 2022-10-04: c2n /debug/component-logging +const CurrentCapabilityVersion CapabilityVersion = 46 type StableID string diff --git a/wgengine/magicsock/magicsock.go b/wgengine/magicsock/magicsock.go index 69bd430ed..f5826e7f2 100644 --- a/wgengine/magicsock/magicsock.go +++ b/wgengine/magicsock/magicsock.go @@ -315,6 +315,9 @@ type Conn struct { // new connection that'll fail. networkUp atomic.Bool + // Whether debugging logging is enabled. + debugLogging atomic.Bool + // havePrivateKey is whether privateKey is non-zero. havePrivateKey atomic.Bool publicKeyAtomic syncs.AtomicValue[key.NodePublic] // or NodeKey zero value if !havePrivateKey @@ -430,6 +433,24 @@ type Conn struct { peerLastDerp map[key.NodePublic]int } +// SetDebugLoggingEnabled controls whether spammy debug logging is enabled. +// +// Note that this is currently independent from the log levels, even though +// they're pretty correlated: debugging logs should be [v1] (or higher), but +// some non-debug logs may also still have a [vN] annotation. The [vN] level +// controls which gets shown in stderr. The dlogf method, on the other hand, +// controls which gets even printed or uploaded at any level. +func (c *Conn) SetDebugLoggingEnabled(v bool) { + c.debugLogging.Store(v) +} + +// dlogf logs a debug message if debug logging is enabled via SetDebugLoggingEnabled. +func (c *Conn) dlogf(format string, a ...any) { + if c.debugLogging.Load() { + c.logf(format, a...) + } +} + // derpRoute is a route entry for a public key, saying that a certain // peer should be available at DERP node derpID, as long as the // current connection for that derpID is dc. (but dc should not be @@ -658,7 +679,7 @@ func (c *Conn) updateEndpoints(why string) { c.endpointsUpdateActive = false c.muCond.Broadcast() }() - c.logf("[v1] magicsock: starting endpoint update (%s)", why) + c.dlogf("[v1] magicsock: starting endpoint update (%s)", why) if c.noV4Send.Load() && runtime.GOOS != "js" { c.mu.Lock() closed := c.closed @@ -706,7 +727,7 @@ func (c *Conn) setEndpoints(endpoints []tailcfg.Endpoint) (changed bool) { // skipped during the e2e tests because they depend // too much on the exact sequence of updates. Fix the // tests. But a protocol rewrite might happen first. - c.logf("[v1] magicsock: ignoring pre-DERP map, STUN-less endpoint update: %v", endpoints) + c.dlogf("[v1] magicsock: ignoring pre-DERP map, STUN-less endpoint update: %v", endpoints) return false } @@ -860,7 +881,7 @@ func (c *Conn) callNetInfoCallback(ni *tailcfg.NetInfo) { func (c *Conn) callNetInfoCallbackLocked(ni *tailcfg.NetInfo) { c.netInfoLast = ni if c.netInfoFunc != nil { - c.logf("[v1] magicsock: netInfo update: %+v", ni) + c.dlogf("[v1] magicsock: netInfo update: %+v", ni) go c.netInfoFunc(ni) } } @@ -1831,7 +1852,7 @@ func (c *Conn) sendDiscoMessage(dst netip.AddrPort, dstKey key.NodePublic, dstDi if !dstKey.IsZero() { node = dstKey.ShortString() } - c.logf("[v1] magicsock: disco: %v->%v (%v, %v) sent %v", c.discoShort, dstDisco.ShortString(), node, derpStr(dst.String()), disco.MessageSummary(m)) + c.dlogf("[v1] magicsock: disco: %v->%v (%v, %v) sent %v", c.discoShort, dstDisco.ShortString(), node, derpStr(dst.String()), disco.MessageSummary(m)) } if isDERP { metricSentDiscoDERP.Add(1) @@ -1997,7 +2018,7 @@ func (c *Conn) handleDiscoMessage(msg []byte, src netip.AddrPort, derpNodeSrc ke return } di.setNodeKey(nodeKey) - c.logf("[v1] magicsock: disco: %v<-%v (%v, %v) got call-me-maybe, %d endpoints", + c.dlogf("[v1] magicsock: disco: %v<-%v (%v, %v) got call-me-maybe, %d endpoints", c.discoShort, ep.discoShort, ep.publicKey.ShortString(), derpStr(src.String()), len(dm.MyNumber)) @@ -2099,7 +2120,7 @@ func (c *Conn) handlePingLocked(dm *disco.Ping, src netip.AddrPort, di *discoInf if numNodes > 1 { pingNodeSrcStr = "[one-of-multi]" } - c.logf("[v1] magicsock: disco: %v<-%v (%v, %v) got ping tx=%x", c.discoShort, di.discoShort, pingNodeSrcStr, src, dm.TxID[:6]) + c.dlogf("[v1] magicsock: disco: %v<-%v (%v, %v) got ping tx=%x", c.discoShort, di.discoShort, pingNodeSrcStr, src, dm.TxID[:6]) } ipDst := src @@ -2122,10 +2143,10 @@ func (c *Conn) enqueueCallMeMaybe(derpAddr netip.AddrPort, de *endpoint) { defer c.mu.Unlock() if !c.lastEndpointsTime.After(time.Now().Add(-endpointsFreshEnoughDuration)) { - c.logf("[v1] magicsock: want call-me-maybe but endpoints stale; restunning") + c.dlogf("[v1] magicsock: want call-me-maybe but endpoints stale; restunning") mak.Set(&c.onEndpointRefreshed, de, func() { - c.logf("[v1] magicsock: STUN done; sending call-me-maybe to %v %v", de.discoShort, de.publicKey.ShortString()) + c.dlogf("[v1] magicsock: STUN done; sending call-me-maybe to %v %v", de.discoShort, de.publicKey.ShortString()) c.enqueueCallMeMaybe(derpAddr, de) }) // TODO(bradfitz): make a new 'reSTUNQuickly' method @@ -2352,7 +2373,7 @@ func (c *Conn) SetNetworkMap(nm *netmap.NetworkMap) { c.logf("[v1] magicsock: got updated network map; %d peers", len(nm.Peers)) if numNoDisco != 0 { - c.logf("[v1] magicsock: %d DERP-only peers (no discokey)", numNoDisco) + c.logf("magicsock: %d DERP-only peers (no discokey)", numNoDisco) } c.netMap = nm @@ -2800,7 +2821,7 @@ func (c *Conn) ReSTUN(why string) { if c.endpointsUpdateActive { if c.wantEndpointsUpdate != why { - c.logf("[v1] magicsock: ReSTUN: endpoint update active, need another later (%q)", why) + c.dlogf("[v1] magicsock: ReSTUN: endpoint update active, need another later (%q)", why) c.wantEndpointsUpdate = why } } else { @@ -3524,7 +3545,7 @@ func (de *endpoint) heartbeat() { if mono.Since(de.lastSend) > sessionActiveTimeout { // Session's idle. Stop heartbeating. - de.c.logf("[v1] magicsock: disco: ending heartbeats for idle session to %v (%v)", de.publicKey.ShortString(), de.discoShort) + de.c.dlogf("[v1] magicsock: disco: ending heartbeats for idle session to %v (%v)", de.publicKey.ShortString(), de.discoShort) return } @@ -3637,7 +3658,7 @@ func (de *endpoint) pingTimeout(txid stun.TxID) { return } if debugDisco() || !de.bestAddr.IsValid() || mono.Now().After(de.trustBestAddrUntil) { - de.c.logf("[v1] magicsock: disco: timeout waiting for pong %x from %v (%v, %v)", txid[:6], sp.to, de.publicKey.ShortString(), de.discoShort) + de.c.dlogf("[v1] magicsock: disco: timeout waiting for pong %x from %v (%v, %v)", txid[:6], sp.to, de.publicKey.ShortString(), de.discoShort) } de.removeSentPingLocked(txid, sp) } @@ -3745,7 +3766,7 @@ func (de *endpoint) sendPingsLocked(now mono.Time, sendCallMeMaybe bool) { sentAny = true if firstPing && sendCallMeMaybe { - de.c.logf("[v1] magicsock: disco: send, starting discovery for %v (%v)", de.publicKey.ShortString(), de.discoShort) + de.c.dlogf("[v1] magicsock: disco: send, starting discovery for %v (%v)", de.publicKey.ShortString(), de.discoShort) } de.startPingLocked(ep, now, pingDiscovery) @@ -3830,7 +3851,7 @@ func (de *endpoint) addCandidateEndpoint(ep netip.AddrPort) { } // Newly discovered endpoint. Exciting! - de.c.logf("[v1] magicsock: disco: adding %v as candidate endpoint for %v (%s)", ep, de.discoShort, de.publicKey.ShortString()) + de.c.dlogf("[v1] magicsock: disco: adding %v as candidate endpoint for %v (%s)", ep, de.discoShort, de.publicKey.ShortString()) de.endpointState[ep] = &endpointState{ lastGotPing: time.Now(), } @@ -3843,7 +3864,7 @@ func (de *endpoint) addCandidateEndpoint(ep netip.AddrPort) { } } size2 := len(de.endpointState) - de.c.logf("[v1] magicsock: disco: addCandidateEndpoint pruned %v candidate set from %v to %v entries", size, size2) + de.c.dlogf("[v1] magicsock: disco: addCandidateEndpoint pruned %v candidate set from %v to %v entries", size, size2) } } @@ -3897,7 +3918,7 @@ func (de *endpoint) handlePongConnLocked(m *disco.Pong, di *discoInfo, src netip } if sp.purpose != pingHeartbeat { - de.c.logf("[v1] magicsock: disco: %v<-%v (%v, %v) got pong tx=%x latency=%v pong.src=%v%v", de.c.discoShort, de.discoShort, de.publicKey.ShortString(), src, m.TxID[:6], latency.Round(time.Millisecond), m.Src, logger.ArgWriter(func(bw *bufio.Writer) { + de.c.dlogf("[v1] magicsock: disco: %v<-%v (%v, %v) got pong tx=%x latency=%v pong.src=%v%v", de.c.discoShort, de.discoShort, de.publicKey.ShortString(), src, m.TxID[:6], latency.Round(time.Millisecond), m.Src, logger.ArgWriter(func(bw *bufio.Writer) { if sp.to != src { fmt.Fprintf(bw, " ping.to=%v", sp.to) } @@ -4010,7 +4031,7 @@ func (de *endpoint) handleCallMeMaybe(m *disco.CallMeMaybe) { } } if len(newEPs) > 0 { - de.c.logf("[v1] magicsock: disco: call-me-maybe from %v %v added new endpoints: %v", + de.c.dlogf("[v1] magicsock: disco: call-me-maybe from %v %v added new endpoints: %v", de.publicKey.ShortString(), de.discoShort, logger.ArgWriter(func(w *bufio.Writer) { for i, ep := range newEPs { diff --git a/wgengine/magicsock/magicsock_linux.go b/wgengine/magicsock/magicsock_linux.go index fa0bb167e..40b3f07c5 100644 --- a/wgengine/magicsock/magicsock_linux.go +++ b/wgengine/magicsock/magicsock_linux.go @@ -234,12 +234,12 @@ func (c *Conn) receiveDisco(pc net.PacketConn, isIPV6 bool) { if acceptPort == 0 { // This should only typically happen if the receiving address family // was recently disabled. - c.logf("[v1] disco raw: dropping packet for port %d as acceptPort=0", dstPort) + c.dlogf("[v1] disco raw: dropping packet for port %d as acceptPort=0", dstPort) continue } if dstPort != acceptPort { - c.logf("[v1] disco raw: dropping packet for port %d", dstPort) + c.dlogf("[v1] disco raw: dropping packet for port %d", dstPort) continue }