diff --git a/net/dns/manager.go b/net/dns/manager.go index dbfcb1def..88fe94fac 100644 --- a/net/dns/manager.go +++ b/net/dns/manager.go @@ -14,6 +14,7 @@ import ( "runtime" "slices" "strings" + "sync" "sync/atomic" "time" @@ -23,6 +24,8 @@ import ( "tailscale.com/net/dns/resolver" "tailscale.com/net/netmon" "tailscale.com/net/tsdial" + "tailscale.com/syncs" + "tailscale.com/tstime/rate" "tailscale.com/types/dnstype" "tailscale.com/types/logger" "tailscale.com/util/clientmetric" @@ -55,6 +58,12 @@ type Manager struct { os OSConfigurator knobs *controlknobs.Knobs // or nil goos string // if empty, gets set to runtime.GOOS + + // The last configuration we successfully compiled. Set to nil if + // there was any failure applying the last configuration + config *Config + // Must be held when accessing/setting config. + mu sync.Mutex } // NewManagers created a new manager from the given config. @@ -80,6 +89,26 @@ func NewManager(logf logger.Logf, oscfg OSConfigurator, health *health.Tracker, knobs: knobs, goos: goos, } + + // Rate limit our attempts to correct our DNS configuration. + limiter := rate.NewLimiter(1.0/5.0, 1) + + // This will recompile the DNS config, which in turn will requery the system + // DNS settings. The recovery func should triggered only when we are missing + // upstream nameservers and require them to forward a query. + m.resolver.SetMissingUpstreamRecovery(func() { + m.mu.Lock() + defer m.mu.Unlock() + if m.config == nil { + return + } + + if limiter.Allow() { + m.logf("DNS resolution failed due to missing upstream nameservers. Recompiling DNS configuration.") + m.setLocked(*m.config) + } + }) + m.ctx, m.ctxCancel = context.WithCancel(context.Background()) m.logf("using %T", m.os) return m @@ -89,6 +118,19 @@ func NewManager(logf logger.Logf, oscfg OSConfigurator, health *health.Tracker, func (m *Manager) Resolver() *resolver.Resolver { return m.resolver } func (m *Manager) Set(cfg Config) error { + m.mu.Lock() + defer m.mu.Unlock() + return m.setLocked(cfg) +} + +// Sets the DNS configuration. +// m.mu must be held +func (m *Manager) setLocked(cfg Config) error { + syncs.AssertLocked(&m.mu) + + // On errors, the 'set' config is cleared. + m.config = nil + m.logf("Set: %v", logger.ArgWriter(func(w *bufio.Writer) { cfg.WriteToBufioWriter(w) })) @@ -112,7 +154,9 @@ func (m *Manager) Set(cfg Config) error { m.health.SetDNSOSHealth(err) return err } + m.health.SetDNSOSHealth(nil) + m.config = &cfg return nil } diff --git a/net/dns/resolver/forwarder.go b/net/dns/resolver/forwarder.go index cfecc38ed..ecc3ce5ea 100644 --- a/net/dns/resolver/forwarder.go +++ b/net/dns/resolver/forwarder.go @@ -14,7 +14,6 @@ import ( "net/http" "net/netip" "net/url" - "runtime" "sort" "strings" "sync" @@ -212,6 +211,10 @@ type forwarder struct { // /etc/resolv.conf is missing/corrupt, and the peerapi ExitDNS stub // resolver lookup. cloudHostFallback []resolverAndDelay + + // To be called when a SERVFAIL is returned due to missing upstream resolvers. + // This should attempt to properly (re)set the upstream resolvers. + missingUpstreamRecovery func() } func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, knobs *controlknobs.Knobs) *forwarder { @@ -883,22 +886,10 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo metricDNSFwdErrorNoUpstream.Add(1) f.logf("no upstream resolvers set, returning SERVFAIL") - if runtime.GOOS == "darwin" || runtime.GOOS == "ios" { - // On apple, having no upstream resolvers here is the result a race condition where - // we've tried a reconfig after a major link change but the system has not yet set - // the resolvers for the new link. We use SystemConfiguration to query nameservers, and - // the timing of when that will give us the "right" answer is non-deterministic. - // - // This will typically happen on sleep-wake cycles with a Wifi interface where - // it takes some random amount of time (after telling us that the interface exists) - // for the system to configure the dns servers. - // - // Repolling the network monitor here is a bit odd, but if we're - // seeing DNS queries, it's likely that the network is now fully configured, and it's - // an ideal time to to requery for the nameservers. - f.logf("injecting network monitor event to attempt to refresh the resolvers") - f.netMon.InjectEvent() - } + // Attempt to recompile the DNS configuration + // If we are being asked to forward queries and we have no + // nameservers, the network is in a bad state. + f.missingUpstreamRecovery() res, err := servfailResponse(query) if err != nil { diff --git a/net/dns/resolver/tsdns.go b/net/dns/resolver/tsdns.go index a2837e0ec..a140c7e93 100644 --- a/net/dns/resolver/tsdns.go +++ b/net/dns/resolver/tsdns.go @@ -244,6 +244,13 @@ func New(logf logger.Logf, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, k return r } +// Called by the forwarder on SERVFAIL due to missing upstream resolvers +// The func passed in here should attempt to re-query for those resolvers, +// repair, or recover +func (r *Resolver) SetMissingUpstreamRecovery(f func()) { + r.forwarder.missingUpstreamRecovery = f +} + func (r *Resolver) TestOnlySetHook(hook func(Config)) { r.saveConfigForTests = hook } func (r *Resolver) SetConfig(cfg Config) error {