From 27033c62778f19a899204d82d4fc1436f7d55d2f Mon Sep 17 00:00:00 2001 From: Jonathan Nobels Date: Tue, 25 Jun 2024 14:56:13 -0400 Subject: [PATCH] net/dns: recheck DNS config on SERVFAIL errors (#12547) Fixes tailscale/corp#20677 Replaces the original attempt to rectify this (by injecting a netMon event) which was both heavy handed, and missed cases where the netMon event was "minor". On apple platforms, the fetching the interface's nameservers can and does return an empty list in certain situations. Apple's API in particular is very limiting here. The header hints at notifications for dns changes which would let us react ahead of time, but it's all private APIs. To avoid remaining in the state where we end up with no nameservers but we absolutely need them, we'll react to a lack of upstream nameservers by attempting to re-query the OS. We'll rate limit this to space out the attempts. It seems relatively harmless to attempt a reconfig every 5 seconds (triggered by an incoming query) if the network is in this broken state. Missing nameservers might possibly be a persistent condition (vs a transient error), but that would also imply that something out of our control is badly misconfigured. Tested by randomly returning [] for the nameservers. When switching between Wifi networks, or cell->wifi, this will randomly trigger the bug, and we appear to reliably heal the DNS state. Signed-off-by: Jonathan Nobels --- net/dns/manager.go | 44 +++++++++++++++++++++++++++++++++++ net/dns/resolver/forwarder.go | 25 +++++++------------- net/dns/resolver/tsdns.go | 7 ++++++ 3 files changed, 59 insertions(+), 17 deletions(-) diff --git a/net/dns/manager.go b/net/dns/manager.go index dbfcb1def..88fe94fac 100644 --- a/net/dns/manager.go +++ b/net/dns/manager.go @@ -14,6 +14,7 @@ import ( "runtime" "slices" "strings" + "sync" "sync/atomic" "time" @@ -23,6 +24,8 @@ import ( "tailscale.com/net/dns/resolver" "tailscale.com/net/netmon" "tailscale.com/net/tsdial" + "tailscale.com/syncs" + "tailscale.com/tstime/rate" "tailscale.com/types/dnstype" "tailscale.com/types/logger" "tailscale.com/util/clientmetric" @@ -55,6 +58,12 @@ type Manager struct { os OSConfigurator knobs *controlknobs.Knobs // or nil goos string // if empty, gets set to runtime.GOOS + + // The last configuration we successfully compiled. Set to nil if + // there was any failure applying the last configuration + config *Config + // Must be held when accessing/setting config. + mu sync.Mutex } // NewManagers created a new manager from the given config. @@ -80,6 +89,26 @@ func NewManager(logf logger.Logf, oscfg OSConfigurator, health *health.Tracker, knobs: knobs, goos: goos, } + + // Rate limit our attempts to correct our DNS configuration. + limiter := rate.NewLimiter(1.0/5.0, 1) + + // This will recompile the DNS config, which in turn will requery the system + // DNS settings. The recovery func should triggered only when we are missing + // upstream nameservers and require them to forward a query. + m.resolver.SetMissingUpstreamRecovery(func() { + m.mu.Lock() + defer m.mu.Unlock() + if m.config == nil { + return + } + + if limiter.Allow() { + m.logf("DNS resolution failed due to missing upstream nameservers. Recompiling DNS configuration.") + m.setLocked(*m.config) + } + }) + m.ctx, m.ctxCancel = context.WithCancel(context.Background()) m.logf("using %T", m.os) return m @@ -89,6 +118,19 @@ func NewManager(logf logger.Logf, oscfg OSConfigurator, health *health.Tracker, func (m *Manager) Resolver() *resolver.Resolver { return m.resolver } func (m *Manager) Set(cfg Config) error { + m.mu.Lock() + defer m.mu.Unlock() + return m.setLocked(cfg) +} + +// Sets the DNS configuration. +// m.mu must be held +func (m *Manager) setLocked(cfg Config) error { + syncs.AssertLocked(&m.mu) + + // On errors, the 'set' config is cleared. + m.config = nil + m.logf("Set: %v", logger.ArgWriter(func(w *bufio.Writer) { cfg.WriteToBufioWriter(w) })) @@ -112,7 +154,9 @@ func (m *Manager) Set(cfg Config) error { m.health.SetDNSOSHealth(err) return err } + m.health.SetDNSOSHealth(nil) + m.config = &cfg return nil } diff --git a/net/dns/resolver/forwarder.go b/net/dns/resolver/forwarder.go index cfecc38ed..ecc3ce5ea 100644 --- a/net/dns/resolver/forwarder.go +++ b/net/dns/resolver/forwarder.go @@ -14,7 +14,6 @@ import ( "net/http" "net/netip" "net/url" - "runtime" "sort" "strings" "sync" @@ -212,6 +211,10 @@ type forwarder struct { // /etc/resolv.conf is missing/corrupt, and the peerapi ExitDNS stub // resolver lookup. cloudHostFallback []resolverAndDelay + + // To be called when a SERVFAIL is returned due to missing upstream resolvers. + // This should attempt to properly (re)set the upstream resolvers. + missingUpstreamRecovery func() } func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, knobs *controlknobs.Knobs) *forwarder { @@ -883,22 +886,10 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo metricDNSFwdErrorNoUpstream.Add(1) f.logf("no upstream resolvers set, returning SERVFAIL") - if runtime.GOOS == "darwin" || runtime.GOOS == "ios" { - // On apple, having no upstream resolvers here is the result a race condition where - // we've tried a reconfig after a major link change but the system has not yet set - // the resolvers for the new link. We use SystemConfiguration to query nameservers, and - // the timing of when that will give us the "right" answer is non-deterministic. - // - // This will typically happen on sleep-wake cycles with a Wifi interface where - // it takes some random amount of time (after telling us that the interface exists) - // for the system to configure the dns servers. - // - // Repolling the network monitor here is a bit odd, but if we're - // seeing DNS queries, it's likely that the network is now fully configured, and it's - // an ideal time to to requery for the nameservers. - f.logf("injecting network monitor event to attempt to refresh the resolvers") - f.netMon.InjectEvent() - } + // Attempt to recompile the DNS configuration + // If we are being asked to forward queries and we have no + // nameservers, the network is in a bad state. + f.missingUpstreamRecovery() res, err := servfailResponse(query) if err != nil { diff --git a/net/dns/resolver/tsdns.go b/net/dns/resolver/tsdns.go index a2837e0ec..a140c7e93 100644 --- a/net/dns/resolver/tsdns.go +++ b/net/dns/resolver/tsdns.go @@ -244,6 +244,13 @@ func New(logf logger.Logf, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, k return r } +// Called by the forwarder on SERVFAIL due to missing upstream resolvers +// The func passed in here should attempt to re-query for those resolvers, +// repair, or recover +func (r *Resolver) SetMissingUpstreamRecovery(f func()) { + r.forwarder.missingUpstreamRecovery = f +} + func (r *Resolver) TestOnlySetHook(hook func(Config)) { r.saveConfigForTests = hook } func (r *Resolver) SetConfig(cfg Config) error {