From 95d78e1314438531916978f3d36d3f25bd1a5e20 Mon Sep 17 00:00:00 2001 From: chaosinthecrd Date: Wed, 17 Dec 2025 13:37:33 +0000 Subject: [PATCH] cmd/containerboot: switch to tsclient QueryDNS to convert FQDN to Tailscale IPs Fixes #18262 Signed-off-by: chaosinthecrd (cherry picked from commit 39881c0187d3c71529f3f04431282a746fafd616) --- cmd/containerboot/egressservices.go | 17 ++-- cmd/containerboot/main.go | 70 +++++++++++-- cmd/containerboot/main_test.go | 152 +++++++++++++++++++++++++++- 3 files changed, 217 insertions(+), 22 deletions(-) diff --git a/cmd/containerboot/egressservices.go b/cmd/containerboot/egressservices.go index 21d9f0bcb..a8d295b61 100644 --- a/cmd/containerboot/egressservices.go +++ b/cmd/containerboot/egressservices.go @@ -173,7 +173,7 @@ func (ep *egressProxy) sync(ctx context.Context, n ipn.Notify) error { if err != nil { return fmt.Errorf("error retrieving current egress proxy status: %w", err) } - newStatus, err := ep.syncEgressConfigs(cfgs, status, n) + newStatus, err := ep.syncEgressConfigs(ctx, cfgs, status, n) if err != nil { return fmt.Errorf("error syncing egress service configs: %w", err) } @@ -194,7 +194,7 @@ func (ep *egressProxy) addrsHaveChanged(n ipn.Notify) bool { // syncEgressConfigs adds and deletes firewall rules to match the desired // configuration. It uses the provided status to determine what is currently // applied and updates the status after a successful sync. -func (ep *egressProxy) syncEgressConfigs(cfgs *egressservices.Configs, status *egressservices.Status, n ipn.Notify) (*egressservices.Status, error) { +func (ep *egressProxy) syncEgressConfigs(ctx context.Context, cfgs *egressservices.Configs, status *egressservices.Status, n ipn.Notify) (*egressservices.Status, error) { if !(wantsServicesConfigured(cfgs) || hasServicesConfigured(status)) { return nil, nil } @@ -202,7 +202,6 @@ func (ep *egressProxy) syncEgressConfigs(cfgs *egressservices.Configs, status *e // Delete unnecessary services. if err := ep.deleteUnnecessaryServices(cfgs, status); err != nil { return nil, fmt.Errorf("error deleting services: %w", err) - } newStatus := &egressservices.Status{} if !wantsServicesConfigured(cfgs) { @@ -213,7 +212,7 @@ func (ep *egressProxy) syncEgressConfigs(cfgs *egressservices.Configs, status *e rulesPerSvcToAdd := make(map[string][]rule, 0) rulesPerSvcToDelete := make(map[string][]rule, 0) for svcName, cfg := range *cfgs { - tailnetTargetIPs, err := ep.tailnetTargetIPsForSvc(cfg, n) + tailnetTargetIPs, err := ep.tailnetTargetIPsForSvc(ctx, cfg, n) if err != nil { return nil, fmt.Errorf("error determining tailnet target IPs: %w", err) } @@ -456,7 +455,7 @@ func (ep *egressProxy) setStatus(ctx context.Context, status *egressservices.Sta // FQDN, resolve the FQDN and return the resolved IPs. It checks if the // netfilter runner supports IPv6 NAT and skips any IPv6 addresses if it // doesn't. -func (ep *egressProxy) tailnetTargetIPsForSvc(svc egressservices.Config, n ipn.Notify) (addrs []netip.Addr, err error) { +func (ep *egressProxy) tailnetTargetIPsForSvc(ctx context.Context, svc egressservices.Config, n ipn.Notify) (addrs []netip.Addr, err error) { if svc.TailnetTarget.IP != "" { addr, err := netip.ParseAddr(svc.TailnetTarget.IP) if err != nil { @@ -476,11 +475,8 @@ func (ep *egressProxy) tailnetTargetIPsForSvc(svc egressservices.Config, n ipn.N log.Printf("netmap is not available, unable to determine backend addresses for %s", svc.TailnetTarget.FQDN) return addrs, nil } - egressAddrs, err := resolveTailnetFQDN(n.NetMap, svc.TailnetTarget.FQDN) - if err != nil { - return nil, fmt.Errorf("error fetching backend addresses for %q: %w", svc.TailnetTarget.FQDN, err) - } - if len(egressAddrs) == 0 { + egressAddrs, err := resolveTailnetFQDN(ctx, ep.tsClient, svc.TailnetTarget.FQDN) + if err != nil || len(egressAddrs) == 0 { log.Printf("tailnet target %q does not have any backend addresses, skipping", svc.TailnetTarget.FQDN) return addrs, nil } @@ -532,7 +528,6 @@ func (ep *egressProxy) shouldResync(n ipn.Notify) bool { // ensureServiceDeleted ensures that any rules for an egress service are removed // from the firewall configuration. func ensureServiceDeleted(svcName string, svc *egressservices.ServiceStatus, nfr linuxfw.NetfilterRunner) error { - // Note that the portmap is needed for iptables based firewall only. // Nftables group rules for a service in a chain, so there is no need to // specify individual portmapping based rules. diff --git a/cmd/containerboot/main.go b/cmd/containerboot/main.go index 011c1830a..26cd6ab4e 100644 --- a/cmd/containerboot/main.go +++ b/cmd/containerboot/main.go @@ -127,7 +127,9 @@ import ( "syscall" "time" + "golang.org/x/net/dns/dnsmessage" "golang.org/x/sys/unix" + "tailscale.com/client/local" "tailscale.com/client/tailscale" "tailscale.com/ipn" kubeutils "tailscale.com/k8s-operator" @@ -538,7 +540,7 @@ runLoop: } } if cfg.TailnetTargetFQDN != "" { - egressAddrs, err := resolveTailnetFQDN(n.NetMap, cfg.TailnetTargetFQDN) + egressAddrs, err := resolveTailnetFQDN(ctx, client, cfg.TailnetTargetFQDN) if err != nil { log.Print(err.Error()) break @@ -894,27 +896,75 @@ func runHTTPServer(mux *http.ServeMux, addr string) (close func() error) { // resolveTailnetFQDN resolves a tailnet FQDN to a list of IP prefixes, which // can be either a peer device or a Tailscale Service. -func resolveTailnetFQDN(nm *netmap.NetworkMap, fqdn string) ([]netip.Prefix, error) { +func resolveTailnetFQDN(ctx context.Context, c *local.Client, fqdn string) ([]netip.Prefix, error) { dnsFQDN, err := dnsname.ToFQDN(fqdn) if err != nil { return nil, fmt.Errorf("error parsing %q as FQDN: %w", fqdn, err) } - // Check all peer devices first. - for _, p := range nm.Peers { - if strings.EqualFold(p.Name(), dnsFQDN.WithTrailingDot()) { - return p.Addresses().AsSlice(), nil - } + bytes, _, err := c.QueryDNS(ctx, dnsFQDN.WithTrailingDot(), "ALL") + if err != nil { + return nil, fmt.Errorf("error querying tailscale DNS: %w", err) } - // If not found yet, check for a matching Tailscale Service. - if svcIPs := serviceIPsFromNetMap(nm, dnsFQDN); len(svcIPs) != 0 { - return svcIPs, nil + addrs, err := processDNSAnswers(bytes) + if err != nil { + return nil, fmt.Errorf("failed to process answers from dns response: %w", err) + } + + if len(addrs) > 0 { + return addrs, nil } return nil, fmt.Errorf("could not find Tailscale node or service %q; it either does not exist, or not reachable because of ACLs", fqdn) } +func processDNSAnswers(bytes []byte) ([]netip.Prefix, error) { + var p dnsmessage.Parser + header, err := p.Start(bytes) + if err != nil { + return nil, fmt.Errorf("failed to parse DNS response: %w", err) + } + + p.SkipAllQuestions() + if header.RCode != dnsmessage.RCodeSuccess { + return nil, fmt.Errorf("no answers in response") + } + + answers, err := p.AllAnswers() + if err != nil { + return nil, fmt.Errorf("failed to parse DNS answers: %w", err) + } + + addrs := []netip.Prefix{} + for _, a := range answers { + switch body := a.Body.(type) { + + case *dnsmessage.AResource: + addr := netip.AddrFrom4(body.A) + if !addr.IsValid() { + continue + } + // IPv4 uses /32 + addrs = append(addrs, netip.PrefixFrom(addr, 32)) + + case *dnsmessage.AAAAResource: + addr := netip.AddrFrom16(body.AAAA) + if !addr.IsValid() { + continue + } + // IPv6 uses /128 + addrs = append(addrs, netip.PrefixFrom(addr, 128)) + + default: + // Ignore other record types (CNAME, TXT, etc.) + continue + } + } + + return addrs, nil +} + // serviceIPsFromNetMap returns all IPs of a Tailscale Service if its FQDN is // found in the netmap. Note that Tailscale Services are not a first-class // object in the netmap, so we guess based on DNS ExtraRecords and AllowedIPs. diff --git a/cmd/containerboot/main_test.go b/cmd/containerboot/main_test.go index 7007cc152..58d95829f 100644 --- a/cmd/containerboot/main_test.go +++ b/cmd/containerboot/main_test.go @@ -30,6 +30,7 @@ import ( "time" "github.com/google/go-cmp/cmp" + "golang.org/x/net/dns/dnsmessage" "golang.org/x/sys/unix" "tailscale.com/ipn" "tailscale.com/kube/egressservices" @@ -39,6 +40,7 @@ import ( "tailscale.com/tstest" "tailscale.com/types/netmap" "tailscale.com/types/ptr" + "tailscale.com/util/dnsname" ) func TestContainerBoot(t *testing.T) { @@ -66,6 +68,8 @@ func TestContainerBoot(t *testing.T) { // Waits below to be true before proceeding to the next phase. Notify *ipn.Notify + DnsBuilder func(b *dnsmessage.Builder) + // WantCmds is the commands that containerboot should run in this phase. WantCmds []string @@ -391,6 +395,20 @@ func TestContainerBoot(t *testing.T) { }, WantLog: "no forwarding rules for egress addresses [::1/128], host supports IPv6: false", WantExitCode: ptr.To(1), + DnsBuilder: func(b *dnsmessage.Builder) { + b.AAAAResource( + dnsmessage.ResourceHeader{ + Name: dnsmessage.MustNewName("ipv6-node.test.ts.net."), + Type: dnsmessage.TypeAAAA, + Class: dnsmessage.ClassINET, + TTL: 0, + }, + dnsmessage.AAAAResource{ + // ::1 + AAAA: [16]byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}, + }, + ) + }, }, }, } @@ -980,6 +998,19 @@ func TestContainerBoot(t *testing.T) { }, }, }, + DnsBuilder: func(b *dnsmessage.Builder) { + b.AResource( + dnsmessage.ResourceHeader{ + Name: dnsmessage.MustNewName("foo.tailnetxyz.ts.net."), + Type: dnsmessage.TypeA, + Class: dnsmessage.ClassINET, + TTL: 0, + }, + dnsmessage.AResource{ + A: [4]byte{100, 64, 0, 2}, + }, + ) + }, WantKubeSecret: map[string]string{ "egress-services": string(mustJSON(t, egressStatus)), "authkey": "tskey-key", @@ -1112,6 +1143,8 @@ func TestContainerBoot(t *testing.T) { var wantCmds []string for i, p := range tc.Phases { + env.lapi.build = p.DnsBuilder + for k, v := range p.UpdateKubeSecret { env.kube.SetSecret(k, v) } @@ -1298,9 +1331,11 @@ type localAPI struct { srv *http.Server + build func(*dnsmessage.Builder) sync.Mutex cond *sync.Cond notify *ipn.Notify + t *testing.T } func (lc *localAPI) Start() error { @@ -1344,6 +1379,43 @@ func (lc *localAPI) ServeHTTP(w http.ResponseWriter, r *http.Request) { panic(fmt.Sprintf("unsupported method %q", r.Method)) } return + case "/localapi/v0/dns-query": + if r.Method != "GET" { + panic(fmt.Sprintf("unsupported method %q", r.Method)) + } + + name, err := dnsname.ToFQDN(r.URL.Query().Get("name")) + if err != nil { + panic(fmt.Sprintf("failed to parse dns name path value %q as fqdn", r.URL.Query().Get("name"))) + } + + tp := dnsmessage.TypeA + queryType := r.URL.Query().Get("type") + if queryType != "" { + t, err := dnsMessageTypeForString(queryType) + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + tp = t + } + + ans, err := lc.handlePeerDNSQuery(name, tp) + if err != nil { + panic(fmt.Sprintf("failed to query dns: %s", err.Error())) + } + + resp := struct { + Bytes []byte `json:"bytes"` + }{ + Bytes: ans, + } + + w.Header().Set("Content-Type", "application/json") + if err := json.NewEncoder(w).Encode(resp); err != nil { + http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError) + } + return case "/localapi/v0/watch-ipn-bus": if r.Method != "GET" { panic(fmt.Sprintf("unsupported method %q", r.Method)) @@ -1385,6 +1457,47 @@ func (lc *localAPI) ServeHTTP(w http.ResponseWriter, r *http.Request) { } } +func (lc *localAPI) handlePeerDNSQuery(domain dnsname.FQDN, tp dnsmessage.Type) (res []byte, err error) { + if lc.build == nil { + lc.t.Logf("lc.build is empty, returning") + return []byte{}, nil + } + + var dnsHeader dnsmessage.Header + dnsHeader.Response = true + dnsHeader.Authoritative = true + + question := dnsmessage.Question{ + Name: dnsmessage.MustNewName(domain.WithTrailingDot()), + Type: tp, + Class: dnsmessage.ClassINET, + } + + b := dnsmessage.NewBuilder(nil, dnsHeader) + b.EnableCompression() + + if err := b.StartQuestions(); err != nil { + return nil, err + } + + if err := b.Question(question); err != nil { + return nil, err + } + + if err := b.StartAnswers(); err != nil { + return nil, err + } + + lc.build(&b) + + ans, err := b.Finish() + if err != nil { + return nil, err + } + + return ans, nil +} + // kubeServer is a minimal fake Kubernetes server that presents just // enough functionality for containerboot to function correctly. In // practice this means it only supports reading and modifying a single @@ -1623,7 +1736,7 @@ type testEnv struct { func newTestEnv(t *testing.T) testEnv { d := t.TempDir() - lapi := localAPI{FSRoot: d} + lapi := localAPI{FSRoot: d, t: t} if err := lapi.Start(); err != nil { t.Fatal(err) } @@ -1698,3 +1811,40 @@ func newTestEnv(t *testing.T) testEnv { healthAddrPort: healthAddrPort, } } + +// dnsMessageTypeForString returns the dnsmessage.Type for the given string. +// For example, DNSMessageTypeForString("A") returns dnsmessage.TypeA. +func dnsMessageTypeForString(s string) (t dnsmessage.Type, err error) { + s = strings.TrimSpace(strings.ToUpper(s)) + switch s { + case "AAAA": + return dnsmessage.TypeAAAA, nil + case "ALL": + return dnsmessage.TypeALL, nil + case "A": + return dnsmessage.TypeA, nil + case "CNAME": + return dnsmessage.TypeCNAME, nil + case "HINFO": + return dnsmessage.TypeHINFO, nil + case "MINFO": + return dnsmessage.TypeMINFO, nil + case "MX": + return dnsmessage.TypeMX, nil + case "NS": + return dnsmessage.TypeNS, nil + case "OPT": + return dnsmessage.TypeOPT, nil + case "PTR": + return dnsmessage.TypePTR, nil + case "SOA": + return dnsmessage.TypeSOA, nil + case "SRV": + return dnsmessage.TypeSRV, nil + case "TXT": + return dnsmessage.TypeTXT, nil + case "WKS": + return dnsmessage.TypeWKS, nil + } + return 0, errors.New("unknown DNS message type: " + s) +}