net/dns/resolver: fix dns-sd NXDOMAIN responses from quad-100

mdnsResponder at least as of macOS Sequoia does not find NXDOMAIN
responses to these dns-sd PTR queries acceptable unless they include the
question section in the response. This was found debugging #13511, once
we turned on additional diagnostic reporting from mdnsResponder we
witnessed:

```
Received unacceptable 12-byte response from 100.100.100.100 over UDP via utun6/27 -- id: 0x7F41 (32577), flags: 0x8183 (R/Query, RD, RA, NXDomain), counts: 0/0/0/0,
```

If the response includes a question section, the resposnes are
acceptable, e.g.:

```
Received acceptable 59-byte response from 8.8.8.8 over UDP via en0/17 -- id: 0x2E55 (11861), flags: 0x8183 (R/Query, RD, RA, NXDomain), counts: 1/0/0/0,
```

This may be contributing to an issue under diagnosis in #13511 wherein
some combination of conditions results in mdnsResponder no longer
answering DNS queries correctly to applications on the system for
extended periods of time (multiple minutes), while dig against quad-100
provides correct responses for those same domains. If additional debug
logging is enabled in mdnsResponder we see it reporting:

```
Penalizing server 100.100.100.100 for 60 seconds
```

It is also possible that the reason that macOS & iOS never "stopped
spamming" these queries is that they have never been replied to with
acceptable responses. It is not clear if this special case handling of
dns-sd PTR queries was ever beneficial, and given this evidence may have
always been harmful. If we subsequently observe that the queries settle
down now that they have acceptable responses, we should remove these
special cases - making upstream queries very occasionally isn't a lot of
battery, so we should be better off having to maintain less special
cases and avoid bugs of this class.

Updates #2442
Updates #3025
Updates #3363
Updates #3594
Updates #13511

Signed-off-by: James Tucker <james@tailscale.com>
adrian/vip
James Tucker 2 months ago committed by James Tucker
parent 3a467b66b6
commit af5a845a87

@ -1093,6 +1093,8 @@ func nxDomainResponse(req packet) (res packet, err error) {
// TODO(bradfitz): should we add an SOA record in the Authority
// section too? (for the nxdomain negative caching TTL)
// For which zone? Does iOS care?
b.StartQuestions()
b.Question(p.Question)
res.bs, err = b.Finish()
res.addr = req.addr
return res, err

@ -15,6 +15,7 @@ import (
"net/netip"
"os"
"reflect"
"slices"
"strings"
"sync"
"sync/atomic"
@ -406,7 +407,7 @@ func enableDebug(tb testing.TB) {
func makeLargeResponse(tb testing.TB, domain string) (request, response []byte) {
name := dns.MustNewName(domain)
builder := dns.NewBuilder(nil, dns.Header{})
builder := dns.NewBuilder(nil, dns.Header{Response: true})
builder.StartQuestions()
builder.Question(dns.Question{
Name: name,
@ -463,19 +464,26 @@ func runTestQuery(tb testing.TB, port uint16, request []byte, modify func(*forwa
modify(fwd)
}
fq := &forwardQuery{
txid: getTxID(request),
packet: request,
closeOnCtxDone: new(closePool),
family: "tcp",
}
defer fq.closeOnCtxDone.Close()
rr := resolverAndDelay{
name: &dnstype.Resolver{Addr: fmt.Sprintf("127.0.0.1:%d", port)},
}
return fwd.send(context.Background(), fq, rr)
rpkt := packet{
bs: request,
family: "tcp",
addr: netip.MustParseAddrPort("127.0.0.1:12345"),
}
rchan := make(chan packet, 1)
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
tb.Cleanup(cancel)
err = fwd.forwardWithDestChan(ctx, rpkt, rchan, rr)
select {
case res := <-rchan:
return res.bs, err
case <-ctx.Done():
return nil, ctx.Err()
}
}
func mustRunTestQuery(tb testing.TB, port uint16, request []byte, modify func(*forwarder)) []byte {
@ -609,7 +617,8 @@ func TestForwarderTCPFallbackError(t *testing.T) {
name := dns.MustNewName(domain)
builder := dns.NewBuilder(nil, dns.Header{
RCode: dns.RCodeServerFailure,
Response: true,
RCode: dns.RCodeServerFailure,
})
builder.StartQuestions()
builder.Question(dns.Question{
@ -658,3 +667,58 @@ func TestForwarderTCPFallbackError(t *testing.T) {
t.Errorf("wanted errServerFailure, got: %v", err)
}
}
// mdnsResponder at minimum has an expectation that NXDOMAIN must include the
// question, otherwise it will penalize our server (#13511).
func TestNXDOMAINIncludesQuestion(t *testing.T) {
var domain = "lb._dns-sd._udp.example.org."
// Our response is a NXDOMAIN
response := func() []byte {
name := dns.MustNewName(domain)
builder := dns.NewBuilder(nil, dns.Header{
Response: true,
RCode: dns.RCodeNameError,
})
builder.StartQuestions()
builder.Question(dns.Question{
Name: name,
Type: dns.TypePTR,
Class: dns.ClassINET,
})
response, err := builder.Finish()
if err != nil {
t.Fatal(err)
}
return response
}()
// Our request is a single PTR query for the domain in the answer, above.
request := func() []byte {
builder := dns.NewBuilder(nil, dns.Header{})
builder.StartQuestions()
builder.Question(dns.Question{
Name: dns.MustNewName(domain),
Type: dns.TypePTR,
Class: dns.ClassINET,
})
request, err := builder.Finish()
if err != nil {
t.Fatal(err)
}
return request
}()
port := runDNSServer(t, nil, response, func(isTCP bool, gotRequest []byte) {
})
res, err := runTestQuery(t, port, request, nil)
if err != nil {
t.Fatal(err)
}
if !slices.Equal(res, response) {
t.Errorf("invalid response\ngot: %+v\nwant: %+v", res, response)
}
}

Loading…
Cancel
Save