net/dns/resolver: add clientmetrics for DNS

Fixes tailscale/corp#1811

Change-Id: I864d11e0332a177e8c5ff403591bff6fec548f5a
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
pull/3426/head
Brad Fitzpatrick 3 years ago committed by Brad Fitzpatrick
parent 097602b3ca
commit 78b0bd2957

@ -385,6 +385,7 @@ func (f *forwarder) sendDoH(ctx context.Context, urlBase string, c *http.Client,
} }
defer f.releaseDoHSem() defer f.releaseDoHSem()
metricDNSFwdDoH.Add(1)
req, err := http.NewRequestWithContext(ctx, "POST", urlBase, bytes.NewReader(packet)) req, err := http.NewRequestWithContext(ctx, "POST", urlBase, bytes.NewReader(packet))
if err != nil { if err != nil {
return nil, err return nil, err
@ -398,16 +399,23 @@ func (f *forwarder) sendDoH(ctx context.Context, urlBase string, c *http.Client,
hres, err := c.Do(req) hres, err := c.Do(req)
if err != nil { if err != nil {
metricDNSFwdDoHErrorTransport.Add(1)
return nil, err return nil, err
} }
defer hres.Body.Close() defer hres.Body.Close()
if hres.StatusCode != 200 { if hres.StatusCode != 200 {
metricDNSFwdDoHErrorStatus.Add(1)
return nil, errors.New(hres.Status) return nil, errors.New(hres.Status)
} }
if ct := hres.Header.Get("Content-Type"); ct != dohType { if ct := hres.Header.Get("Content-Type"); ct != dohType {
metricDNSFwdDoHErrorCT.Add(1)
return nil, fmt.Errorf("unexpected response Content-Type %q", ct) return nil, fmt.Errorf("unexpected response Content-Type %q", ct)
} }
return ioutil.ReadAll(hres.Body) res, err := ioutil.ReadAll(hres.Body)
if err != nil {
metricDNSFwdDoHErrorBody.Add(1)
}
return res, err
} }
// send sends packet to dst. It is best effort. // send sends packet to dst. It is best effort.
@ -415,12 +423,15 @@ func (f *forwarder) sendDoH(ctx context.Context, urlBase string, c *http.Client,
// send expects the reply to have the same txid as txidOut. // send expects the reply to have the same txid as txidOut.
func (f *forwarder) send(ctx context.Context, fq *forwardQuery, rr resolverAndDelay) ([]byte, error) { func (f *forwarder) send(ctx context.Context, fq *forwardQuery, rr resolverAndDelay) ([]byte, error) {
if strings.HasPrefix(rr.name.Addr, "http://") { if strings.HasPrefix(rr.name.Addr, "http://") {
metricDNSFwdErrorType.Add(1)
return nil, fmt.Errorf("http:// resolvers not supported yet") return nil, fmt.Errorf("http:// resolvers not supported yet")
} }
if strings.HasPrefix(rr.name.Addr, "https://") { if strings.HasPrefix(rr.name.Addr, "https://") {
metricDNSFwdErrorType.Add(1)
return nil, fmt.Errorf("https:// resolvers not supported yet") return nil, fmt.Errorf("https:// resolvers not supported yet")
} }
if strings.HasPrefix(rr.name.Addr, "tls://") { if strings.HasPrefix(rr.name.Addr, "tls://") {
metricDNSFwdErrorType.Add(1)
return nil, fmt.Errorf("tls:// resolvers not supported yet") return nil, fmt.Errorf("tls:// resolvers not supported yet")
} }
ipp, err := netaddr.ParseIPPort(rr.name.Addr) ipp, err := netaddr.ParseIPPort(rr.name.Addr)
@ -438,6 +449,7 @@ func (f *forwarder) send(ctx context.Context, fq *forwardQuery, rr resolverAndDe
f.logf("DoH error from %v: %v", ipp.IP(), err) f.logf("DoH error from %v: %v", ipp.IP(), err)
} }
metricDNSFwdUDP.Add(1)
ln, err := f.packetListener(ipp.IP()) ln, err := f.packetListener(ipp.IP())
if err != nil { if err != nil {
return nil, err return nil, err
@ -453,11 +465,13 @@ func (f *forwarder) send(ctx context.Context, fq *forwardQuery, rr resolverAndDe
defer fq.closeOnCtxDone.Remove(conn) defer fq.closeOnCtxDone.Remove(conn)
if _, err := conn.WriteTo(fq.packet, ipp.UDPAddr()); err != nil { if _, err := conn.WriteTo(fq.packet, ipp.UDPAddr()); err != nil {
metricDNSFwdUDPErrorWrite.Add(1)
if err := ctx.Err(); err != nil { if err := ctx.Err(); err != nil {
return nil, err return nil, err
} }
return nil, err return nil, err
} }
metricDNSFwdUDPWrote.Add(1)
// The 1 extra byte is to detect packet truncation. // The 1 extra byte is to detect packet truncation.
out := make([]byte, maxResponseBytes+1) out := make([]byte, maxResponseBytes+1)
@ -469,6 +483,7 @@ func (f *forwarder) send(ctx context.Context, fq *forwardQuery, rr resolverAndDe
if packetWasTruncated(err) { if packetWasTruncated(err) {
err = nil err = nil
} else { } else {
metricDNSFwdUDPErrorRead.Add(1)
return nil, err return nil, err
} }
} }
@ -482,12 +497,14 @@ func (f *forwarder) send(ctx context.Context, fq *forwardQuery, rr resolverAndDe
out = out[:n] out = out[:n]
txid := getTxID(out) txid := getTxID(out)
if txid != fq.txid { if txid != fq.txid {
metricDNSFwdUDPErrorTxID.Add(1)
return nil, errors.New("txid doesn't match") return nil, errors.New("txid doesn't match")
} }
rcode := getRCode(out) rcode := getRCode(out)
// don't forward transient errors back to the client when the server fails // don't forward transient errors back to the client when the server fails
if rcode == dns.RCodeServerFailure { if rcode == dns.RCodeServerFailure {
f.logf("recv: response code indicating server failure: %d", rcode) f.logf("recv: response code indicating server failure: %d", rcode)
metricDNSFwdUDPErrorServer.Add(1)
return nil, errors.New("response code indicates server issue") return nil, errors.New("response code indicates server issue")
} }
@ -505,7 +522,7 @@ func (f *forwarder) send(ctx context.Context, fq *forwardQuery, rr resolverAndDe
} }
clampEDNSSize(out, maxResponseBytes) clampEDNSSize(out, maxResponseBytes)
metricDNSFwdUDPSuccess.Add(1)
return out, nil return out, nil
} }
@ -566,8 +583,10 @@ func (f *forwarder) forward(query packet) error {
// If backupResolvers are specified, they're used in the case that no // If backupResolvers are specified, they're used in the case that no
// upstreams are available. // upstreams are available.
func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, responseChan chan<- packet, backupResolvers ...resolverAndDelay) error { func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, responseChan chan<- packet, backupResolvers ...resolverAndDelay) error {
metricDNSFwd.Add(1)
domain, err := nameFromQuery(query.bs) domain, err := nameFromQuery(query.bs)
if err != nil { if err != nil {
metricDNSFwdErrorName.Add(1)
return err return err
} }
@ -576,6 +595,7 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
// when browsing for LAN devices. But even when filtering this // when browsing for LAN devices. But even when filtering this
// out, playing on Sonos still works. // out, playing on Sonos still works.
if hasRDNSBonjourPrefix(domain) { if hasRDNSBonjourPrefix(domain) {
metricDNSFwdDropBonjour.Add(1)
return nil return nil
} }
@ -586,6 +606,7 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
resolvers = backupResolvers resolvers = backupResolvers
} }
if len(resolvers) == 0 { if len(resolvers) == 0 {
metricDNSFwdErrorNoUpstream.Add(1)
return errNoUpstreams return errNoUpstreams
} }
@ -633,14 +654,18 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
case v := <-resc: case v := <-resc:
select { select {
case <-ctx.Done(): case <-ctx.Done():
metricDNSFwdErrorContext.Add(1)
return ctx.Err() return ctx.Err()
case responseChan <- packet{v, query.addr}: case responseChan <- packet{v, query.addr}:
metricDNSFwdSuccess.Add(1)
return nil return nil
} }
case <-ctx.Done(): case <-ctx.Done():
mu.Lock() mu.Lock()
defer mu.Unlock() defer mu.Unlock()
metricDNSFwdErrorContext.Add(1)
if firstErr != nil { if firstErr != nil {
metricDNSFwdErrorContextGotError.Add(1)
return firstErr return firstErr
} }
return ctx.Err() return ctx.Err()

@ -24,6 +24,7 @@ import (
"inet.af/netaddr" "inet.af/netaddr"
"tailscale.com/types/dnstype" "tailscale.com/types/dnstype"
"tailscale.com/types/logger" "tailscale.com/types/logger"
"tailscale.com/util/clientmetric"
"tailscale.com/util/dnsname" "tailscale.com/util/dnsname"
"tailscale.com/wgengine/monitor" "tailscale.com/wgengine/monitor"
) )
@ -273,13 +274,16 @@ func (r *Resolver) Close() {
// It takes ownership of the payload and does not block. // It takes ownership of the payload and does not block.
// If the queue is full, the request will be dropped and an error will be returned. // If the queue is full, the request will be dropped and an error will be returned.
func (r *Resolver) EnqueueRequest(bs []byte, from netaddr.IPPort) error { func (r *Resolver) EnqueueRequest(bs []byte, from netaddr.IPPort) error {
metricDNSQueryLocal.Add(1)
select { select {
case <-r.closed: case <-r.closed:
metricDNSQueryErrorClosed.Add(1)
return ErrClosed return ErrClosed
default: default:
} }
if n := atomic.AddInt32(&r.activeQueriesAtomic, 1); n > maxActiveQueries() { if n := atomic.AddInt32(&r.activeQueriesAtomic, 1); n > maxActiveQueries() {
atomic.AddInt32(&r.activeQueriesAtomic, -1) atomic.AddInt32(&r.activeQueriesAtomic, -1)
metricDNSQueryErrorQueue.Add(1)
return errFullQueue return errFullQueue
} }
go r.handleQuery(packet{bs, from}) go r.handleQuery(packet{bs, from})
@ -303,6 +307,7 @@ func (r *Resolver) NextResponse() (packet []byte, to netaddr.IPPort, err error)
// via the peerapi's DoH server. This is only used when the local // via the peerapi's DoH server. This is only used when the local
// node is being an exit node. // node is being an exit node.
func (r *Resolver) HandleExitNodeDNSQuery(ctx context.Context, q []byte, from netaddr.IPPort) (res []byte, err error) { func (r *Resolver) HandleExitNodeDNSQuery(ctx context.Context, q []byte, from netaddr.IPPort) (res []byte, err error) {
metricDNSQueryForPeer.Add(1)
ch := make(chan packet, 1) ch := make(chan packet, 1)
err = r.forwarder.forwardWithDestChan(ctx, packet{q, from}, ch) err = r.forwarder.forwardWithDestChan(ctx, packet{q, from}, ch)
@ -338,6 +343,7 @@ func (r *Resolver) HandleExitNodeDNSQuery(ctx context.Context, q []byte, from ne
} }
} }
if err != nil { if err != nil {
metricDNSQueryForPeerError.Add(1)
return nil, err return nil, err
} }
select { select {
@ -357,8 +363,10 @@ func (r *Resolver) HandleExitNodeDNSQuery(ctx context.Context, q []byte, from ne
// Returns dns.RCodeRefused to indicate that the local map is not // Returns dns.RCodeRefused to indicate that the local map is not
// authoritative for domain. // authoritative for domain.
func (r *Resolver) resolveLocal(domain dnsname.FQDN, typ dns.Type) (netaddr.IP, dns.RCode) { func (r *Resolver) resolveLocal(domain dnsname.FQDN, typ dns.Type) (netaddr.IP, dns.RCode) {
metricDNSResolveLocal.Add(1)
// Reject .onion domains per RFC 7686. // Reject .onion domains per RFC 7686.
if dnsname.HasSuffix(domain.WithoutTrailingDot(), ".onion") { if dnsname.HasSuffix(domain.WithoutTrailingDot(), ".onion") {
metricDNSResolveLocalErrorOnion.Add(1)
return netaddr.IP{}, dns.RCodeNameError return netaddr.IP{}, dns.RCodeNameError
} }
@ -372,6 +380,7 @@ func (r *Resolver) resolveLocal(domain dnsname.FQDN, typ dns.Type) (netaddr.IP,
for _, suffix := range localDomains { for _, suffix := range localDomains {
if suffix.Contains(domain) { if suffix.Contains(domain) {
// We are authoritative for the queried domain. // We are authoritative for the queried domain.
metricDNSResolveLocalErrorMissing.Add(1)
return netaddr.IP{}, dns.RCodeNameError return netaddr.IP{}, dns.RCodeNameError
} }
} }
@ -389,30 +398,37 @@ func (r *Resolver) resolveLocal(domain dnsname.FQDN, typ dns.Type) (netaddr.IP,
case dns.TypeA: case dns.TypeA:
for _, ip := range addrs { for _, ip := range addrs {
if ip.Is4() { if ip.Is4() {
metricDNSResolveLocalOKA.Add(1)
return ip, dns.RCodeSuccess return ip, dns.RCodeSuccess
} }
} }
metricDNSResolveLocalNoA.Add(1)
return netaddr.IP{}, dns.RCodeSuccess return netaddr.IP{}, dns.RCodeSuccess
case dns.TypeAAAA: case dns.TypeAAAA:
for _, ip := range addrs { for _, ip := range addrs {
if ip.Is6() { if ip.Is6() {
metricDNSResolveLocalOKAAAA.Add(1)
return ip, dns.RCodeSuccess return ip, dns.RCodeSuccess
} }
} }
metricDNSResolveLocalNoAAAA.Add(1)
return netaddr.IP{}, dns.RCodeSuccess return netaddr.IP{}, dns.RCodeSuccess
case dns.TypeALL: case dns.TypeALL:
// Answer with whatever we've got. // Answer with whatever we've got.
// It could be IPv4, IPv6, or a zero addr. // It could be IPv4, IPv6, or a zero addr.
// TODO: Return all available resolutions (A and AAAA, if we have them). // TODO: Return all available resolutions (A and AAAA, if we have them).
if len(addrs) == 0 { if len(addrs) == 0 {
metricDNSResolveLocalNoAll.Add(1)
return netaddr.IP{}, dns.RCodeSuccess return netaddr.IP{}, dns.RCodeSuccess
} }
metricDNSResolveLocalOKAll.Add(1)
return addrs[0], dns.RCodeSuccess return addrs[0], dns.RCodeSuccess
// Leave some some record types explicitly unimplemented. // Leave some some record types explicitly unimplemented.
// These types relate to recursive resolution or special // These types relate to recursive resolution or special
// DNS semantics and might be implemented in the future. // DNS semantics and might be implemented in the future.
case dns.TypeNS, dns.TypeSOA, dns.TypeAXFR, dns.TypeHINFO: case dns.TypeNS, dns.TypeSOA, dns.TypeAXFR, dns.TypeHINFO:
metricDNSResolveNotImplType.Add(1)
return netaddr.IP{}, dns.RCodeNotImplemented return netaddr.IP{}, dns.RCodeNotImplemented
// For everything except for the few types above that are explicitly not implemented, return no records. // For everything except for the few types above that are explicitly not implemented, return no records.
@ -422,6 +438,7 @@ func (r *Resolver) resolveLocal(domain dnsname.FQDN, typ dns.Type) (netaddr.IP,
// dig -t TYPE9824 example.com // dig -t TYPE9824 example.com
// and note that NOERROR is returned, despite that record type being made up. // and note that NOERROR is returned, despite that record type being made up.
default: default:
metricDNSResolveNoRecordType.Add(1)
// The name exists, but no records exist of the requested type. // The name exists, but no records exist of the requested type.
return netaddr.IP{}, dns.RCodeSuccess return netaddr.IP{}, dns.RCodeSuccess
} }
@ -753,6 +770,7 @@ func (r *Resolver) respondReverse(query []byte, name dnsname.FQDN, resp *respons
return nil, errNotOurName return nil, errNotOurName
} }
metricDNSMagicDNSSuccessReverse.Add(1)
return marshalResponse(resp) return marshalResponse(resp)
} }
@ -769,8 +787,10 @@ func (r *Resolver) respond(query []byte) ([]byte, error) {
// We will not return this error: it is the sender's fault. // We will not return this error: it is the sender's fault.
if err != nil { if err != nil {
if errors.Is(err, dns.ErrSectionDone) { if errors.Is(err, dns.ErrSectionDone) {
metricDNSErrorParseNoQ.Add(1)
r.logf("parseQuery(%02x): no DNS questions", query) r.logf("parseQuery(%02x): no DNS questions", query)
} else { } else {
metricDNSErrorParseQuery.Add(1)
r.logf("parseQuery(%02x): %v", query, err) r.logf("parseQuery(%02x): %v", query, err)
} }
resp := parser.response() resp := parser.response()
@ -780,6 +800,7 @@ func (r *Resolver) respond(query []byte) ([]byte, error) {
rawName := parser.Question.Name.Data[:parser.Question.Name.Length] rawName := parser.Question.Name.Data[:parser.Question.Name.Length]
name, err := dnsname.ToFQDN(rawNameToLower(rawName)) name, err := dnsname.ToFQDN(rawNameToLower(rawName))
if err != nil { if err != nil {
metricDNSErrorNotFQDN.Add(1)
// DNS packet unexpectedly contains an invalid FQDN. // DNS packet unexpectedly contains an invalid FQDN.
resp := parser.response() resp := parser.response()
resp.Header.RCode = dns.RCodeFormatError resp.Header.RCode = dns.RCodeFormatError
@ -803,3 +824,57 @@ func (r *Resolver) respond(query []byte) ([]byte, error) {
resp.IP = ip resp.IP = ip
return marshalResponse(resp) return marshalResponse(resp)
} }
var (
metricDNSQueryLocal = clientmetric.NewCounter("dns_query_local")
metricDNSQueryErrorClosed = clientmetric.NewCounter("dns_query_local_error_closed")
metricDNSQueryErrorQueue = clientmetric.NewCounter("dns_query_local_error_queue")
metricDNSErrorParseNoQ = clientmetric.NewCounter("dns_query_respond_error_no_question")
metricDNSErrorParseQuery = clientmetric.NewCounter("dns_query_respond_error_parse")
metricDNSErrorNotFQDN = clientmetric.NewCounter("dns_query_respond_error_not_fqdn")
metricDNSMagicDNSSuccessName = clientmetric.NewCounter("dns_query_magic_success_name")
metricDNSMagicDNSSuccessReverse = clientmetric.NewCounter("dns_query_magic_success_reverse")
metricDNSQueryForPeer = clientmetric.NewCounter("dns_query_peerapi")
metricDNSQueryForPeerError = clientmetric.NewCounter("dns_query_peerapi_error")
metricDNSFwd = clientmetric.NewCounter("dns_query_fwd")
metricDNSFwdDropBonjour = clientmetric.NewCounter("dns_query_fwd_drop_bonjour")
metricDNSFwdErrorName = clientmetric.NewCounter("dns_query_fwd_error_name")
metricDNSFwdErrorNoUpstream = clientmetric.NewCounter("dns_query_fwd_error_no_upstream")
metricDNSFwdSuccess = clientmetric.NewCounter("dns_query_fwd_success")
metricDNSFwdErrorContext = clientmetric.NewCounter("dns_query_fwd_error_context")
metricDNSFwdErrorContextGotError = clientmetric.NewCounter("dns_query_fwd_error_context_got_error")
metricDNSFwdErrorType = clientmetric.NewCounter("dns_query_fwd_error_type")
metricDNSFwdErrorParseAddr = clientmetric.NewCounter("dns_query_fwd_error_parse_addr")
metricDNSFwdUDP = clientmetric.NewCounter("dns_query_fwd_udp") // on entry
metricDNSFwdUDPWrote = clientmetric.NewCounter("dns_query_fwd_udp_wrote") // sent UDP packet
metricDNSFwdUDPErrorWrite = clientmetric.NewCounter("dns_query_fwd_udp_error_write")
metricDNSFwdUDPErrorServer = clientmetric.NewCounter("dns_query_fwd_udp_error_server")
metricDNSFwdUDPErrorTxID = clientmetric.NewCounter("dns_query_fwd_udp_error_txid")
metricDNSFwdUDPErrorRead = clientmetric.NewCounter("dns_query_fwd_udp_error_read")
metricDNSFwdUDPSuccess = clientmetric.NewCounter("dns_query_fwd_udp_success")
metricDNSFwdDoH = clientmetric.NewCounter("dns_query_fwd_doh")
metricDNSFwdDoHErrorStatus = clientmetric.NewCounter("dns_query_fwd_doh_error_status")
metricDNSFwdDoHErrorCT = clientmetric.NewCounter("dns_query_fwd_doh_error_content_type")
metricDNSFwdDoHErrorTransport = clientmetric.NewCounter("dns_query_fwd_doh_error_transport")
metricDNSFwdDoHErrorBody = clientmetric.NewCounter("dns_query_fwd_doh_error_body")
metricDNSResolveLocal = clientmetric.NewCounter("dns_resolve_local")
metricDNSResolveLocalErrorOnion = clientmetric.NewCounter("dns_resolve_local_error_onion")
metricDNSResolveLocalErrorMissing = clientmetric.NewCounter("dns_resolve_local_error_missing")
metricDNSResolveLocalErrorRefused = clientmetric.NewCounter("dns_resolve_local_error_refused")
metricDNSResolveLocalOKA = clientmetric.NewCounter("dns_resolve_local_ok_a")
metricDNSResolveLocalOKAAAA = clientmetric.NewCounter("dns_resolve_local_ok_aaaa")
metricDNSResolveLocalOKAll = clientmetric.NewCounter("dns_resolve_local_ok_all")
metricDNSResolveLocalNoA = clientmetric.NewCounter("dns_resolve_local_no_a")
metricDNSResolveLocalNoAAAA = clientmetric.NewCounter("dns_resolve_local_no_aaaa")
metricDNSResolveLocalNoAll = clientmetric.NewCounter("dns_resolve_local_no_all")
metricDNSResolveNotImplType = clientmetric.NewCounter("dns_resolve_local_not_impl_type")
metricDNSResolveNoRecordType = clientmetric.NewCounter("dns_resolve_local_no_record_type")
)

Loading…
Cancel
Save