net/dns: recheck DNS config on SERVFAIL errors (#12547)

Fixes tailscale/corp#20677

Replaces the original attempt to rectify this (by injecting a netMon
event) which was both heavy handed, and missed cases where the
netMon event was "minor".

On apple platforms, the fetching the interface's nameservers can
and does return an empty list in certain situations.   Apple's API
in particular is very limiting here.  The header hints at notifications
for dns changes which would let us react ahead of time, but it's all
private APIs.

To avoid remaining in the state where we end up with no
nameservers but we absolutely need them, we'll react
to a lack of upstream nameservers by attempting to re-query
the OS.

We'll rate limit this to space out the attempts.   It seems relatively
harmless to attempt a reconfig every 5 seconds (triggered
by an incoming query) if the network is in this broken state.

Missing nameservers might possibly be a persistent condition
(vs a transient error), but that would  also imply that something
out of our control is badly misconfigured.

Tested by randomly returning [] for the nameservers.   When switching
between Wifi networks, or cell->wifi, this will randomly trigger
the bug, and we appear to reliably heal the DNS state.

Signed-off-by: Jonathan Nobels <jonathan@tailscale.com>
pull/12610/head
Jonathan Nobels 5 months ago committed by GitHub
parent d5e692f7e7
commit 27033c6277
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -14,6 +14,7 @@ import (
"runtime" "runtime"
"slices" "slices"
"strings" "strings"
"sync"
"sync/atomic" "sync/atomic"
"time" "time"
@ -23,6 +24,8 @@ import (
"tailscale.com/net/dns/resolver" "tailscale.com/net/dns/resolver"
"tailscale.com/net/netmon" "tailscale.com/net/netmon"
"tailscale.com/net/tsdial" "tailscale.com/net/tsdial"
"tailscale.com/syncs"
"tailscale.com/tstime/rate"
"tailscale.com/types/dnstype" "tailscale.com/types/dnstype"
"tailscale.com/types/logger" "tailscale.com/types/logger"
"tailscale.com/util/clientmetric" "tailscale.com/util/clientmetric"
@ -55,6 +58,12 @@ type Manager struct {
os OSConfigurator os OSConfigurator
knobs *controlknobs.Knobs // or nil knobs *controlknobs.Knobs // or nil
goos string // if empty, gets set to runtime.GOOS goos string // if empty, gets set to runtime.GOOS
// The last configuration we successfully compiled. Set to nil if
// there was any failure applying the last configuration
config *Config
// Must be held when accessing/setting config.
mu sync.Mutex
} }
// NewManagers created a new manager from the given config. // NewManagers created a new manager from the given config.
@ -80,6 +89,26 @@ func NewManager(logf logger.Logf, oscfg OSConfigurator, health *health.Tracker,
knobs: knobs, knobs: knobs,
goos: goos, goos: goos,
} }
// Rate limit our attempts to correct our DNS configuration.
limiter := rate.NewLimiter(1.0/5.0, 1)
// This will recompile the DNS config, which in turn will requery the system
// DNS settings. The recovery func should triggered only when we are missing
// upstream nameservers and require them to forward a query.
m.resolver.SetMissingUpstreamRecovery(func() {
m.mu.Lock()
defer m.mu.Unlock()
if m.config == nil {
return
}
if limiter.Allow() {
m.logf("DNS resolution failed due to missing upstream nameservers. Recompiling DNS configuration.")
m.setLocked(*m.config)
}
})
m.ctx, m.ctxCancel = context.WithCancel(context.Background()) m.ctx, m.ctxCancel = context.WithCancel(context.Background())
m.logf("using %T", m.os) m.logf("using %T", m.os)
return m return m
@ -89,6 +118,19 @@ func NewManager(logf logger.Logf, oscfg OSConfigurator, health *health.Tracker,
func (m *Manager) Resolver() *resolver.Resolver { return m.resolver } func (m *Manager) Resolver() *resolver.Resolver { return m.resolver }
func (m *Manager) Set(cfg Config) error { func (m *Manager) Set(cfg Config) error {
m.mu.Lock()
defer m.mu.Unlock()
return m.setLocked(cfg)
}
// Sets the DNS configuration.
// m.mu must be held
func (m *Manager) setLocked(cfg Config) error {
syncs.AssertLocked(&m.mu)
// On errors, the 'set' config is cleared.
m.config = nil
m.logf("Set: %v", logger.ArgWriter(func(w *bufio.Writer) { m.logf("Set: %v", logger.ArgWriter(func(w *bufio.Writer) {
cfg.WriteToBufioWriter(w) cfg.WriteToBufioWriter(w)
})) }))
@ -112,7 +154,9 @@ func (m *Manager) Set(cfg Config) error {
m.health.SetDNSOSHealth(err) m.health.SetDNSOSHealth(err)
return err return err
} }
m.health.SetDNSOSHealth(nil) m.health.SetDNSOSHealth(nil)
m.config = &cfg
return nil return nil
} }

@ -14,7 +14,6 @@ import (
"net/http" "net/http"
"net/netip" "net/netip"
"net/url" "net/url"
"runtime"
"sort" "sort"
"strings" "strings"
"sync" "sync"
@ -212,6 +211,10 @@ type forwarder struct {
// /etc/resolv.conf is missing/corrupt, and the peerapi ExitDNS stub // /etc/resolv.conf is missing/corrupt, and the peerapi ExitDNS stub
// resolver lookup. // resolver lookup.
cloudHostFallback []resolverAndDelay cloudHostFallback []resolverAndDelay
// To be called when a SERVFAIL is returned due to missing upstream resolvers.
// This should attempt to properly (re)set the upstream resolvers.
missingUpstreamRecovery func()
} }
func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, knobs *controlknobs.Knobs) *forwarder { func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, knobs *controlknobs.Knobs) *forwarder {
@ -883,22 +886,10 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
metricDNSFwdErrorNoUpstream.Add(1) metricDNSFwdErrorNoUpstream.Add(1)
f.logf("no upstream resolvers set, returning SERVFAIL") f.logf("no upstream resolvers set, returning SERVFAIL")
if runtime.GOOS == "darwin" || runtime.GOOS == "ios" { // Attempt to recompile the DNS configuration
// On apple, having no upstream resolvers here is the result a race condition where // If we are being asked to forward queries and we have no
// we've tried a reconfig after a major link change but the system has not yet set // nameservers, the network is in a bad state.
// the resolvers for the new link. We use SystemConfiguration to query nameservers, and f.missingUpstreamRecovery()
// the timing of when that will give us the "right" answer is non-deterministic.
//
// This will typically happen on sleep-wake cycles with a Wifi interface where
// it takes some random amount of time (after telling us that the interface exists)
// for the system to configure the dns servers.
//
// Repolling the network monitor here is a bit odd, but if we're
// seeing DNS queries, it's likely that the network is now fully configured, and it's
// an ideal time to to requery for the nameservers.
f.logf("injecting network monitor event to attempt to refresh the resolvers")
f.netMon.InjectEvent()
}
res, err := servfailResponse(query) res, err := servfailResponse(query)
if err != nil { if err != nil {

@ -244,6 +244,13 @@ func New(logf logger.Logf, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, k
return r return r
} }
// Called by the forwarder on SERVFAIL due to missing upstream resolvers
// The func passed in here should attempt to re-query for those resolvers,
// repair, or recover
func (r *Resolver) SetMissingUpstreamRecovery(f func()) {
r.forwarder.missingUpstreamRecovery = f
}
func (r *Resolver) TestOnlySetHook(hook func(Config)) { r.saveConfigForTests = hook } func (r *Resolver) TestOnlySetHook(hook func(Config)) { r.saveConfigForTests = hook }
func (r *Resolver) SetConfig(cfg Config) error { func (r *Resolver) SetConfig(cfg Config) error {

Loading…
Cancel
Save