From 8d1249550a924d028de0844c0d101f29308e69b8 Mon Sep 17 00:00:00 2001 From: James Tucker Date: Thu, 16 May 2024 15:58:19 -0700 Subject: [PATCH] net/netcheck,wgengine/magicsock: add potential workaround for Palo Alto DIPP misbehavior Palo Alto firewalls typically have a hard NAT, but also have a mode called Persistent DIPP that is supposed to provide consistent port mapping suitable for STUN resolution of public ports. Persistent DIPP works initially on most Palo Alto firewalls, but some models/software versions have a bug that this patch works around. The bug symptom presents as follows: - STUN sessions resolve a consistent public IP:port to start with - Much later netchecks report the same IP:port for a subset of sessions, most often the user's active DERP, and/or the port related to sustained traffic. - The broader set of DERPs in a full netcheck will now consistently observe a new IP:port. - After this point of observation, new inbound connections will only succeed to the new IP:port observed, and existing/old sessions will only work to the old binding. In this patch we now advertise the lowest-latency global endpoint discovered, as we always have, and, in addition, any global endpoints that are observed more than once in a single netcheck report. This should provide viable endpoints for potential connection establishment across a NAT with this behavior. 
Updates tailscale/corp#19106 Signed-off-by: James Tucker --- cmd/tailscale/cli/netcheck.go | 8 ++-- net/netcheck/netcheck.go | 75 +++++++++++++++++++++++++-------- net/netcheck/netcheck_test.go | 41 +++++++++++++++++- wgengine/magicsock/magicsock.go | 17 ++++---- 4 files changed, 111 insertions(+), 30 deletions(-) diff --git a/cmd/tailscale/cli/netcheck.go b/cmd/tailscale/cli/netcheck.go index e642baec8..1a38870c7 100644 --- a/cmd/tailscale/cli/netcheck.go +++ b/cmd/tailscale/cli/netcheck.go @@ -127,13 +127,13 @@ func printReport(dm *tailcfg.DERPMap, report *netcheck.Report) error { printf("\nReport:\n") printf("\t* UDP: %v\n", report.UDP) - if report.GlobalV4 != "" { - printf("\t* IPv4: yes, %v\n", report.GlobalV4) + if report.GlobalV4.IsValid() { + printf("\t* IPv4: yes, %s\n", report.GlobalV4) } else { printf("\t* IPv4: (no addr found)\n") } - if report.GlobalV6 != "" { - printf("\t* IPv6: yes, %v\n", report.GlobalV6) + if report.GlobalV6.IsValid() { + printf("\t* IPv6: yes, %s\n", report.GlobalV6) } else if report.IPv6 { printf("\t* IPv6: (no addr found)\n") } else if report.OSHasIPv6 { diff --git a/net/netcheck/netcheck.go b/net/netcheck/netcheck.go index 45523c7ae..396081739 100644 --- a/net/netcheck/netcheck.go +++ b/net/netcheck/netcheck.go @@ -13,6 +13,7 @@ import ( "fmt" "io" "log" + "maps" "math/rand" "net" "net/http" @@ -115,8 +116,11 @@ type Report struct { RegionV4Latency map[int]time.Duration // keyed by DERP Region ID RegionV6Latency map[int]time.Duration // keyed by DERP Region ID - GlobalV4 string // ip:port of global IPv4 - GlobalV6 string // [ip]:port of global IPv6 + GlobalV4Counters map[netip.AddrPort]int // keyed by IP:port, number of times observed + GlobalV6Counters map[netip.AddrPort]int // keyed by [IP]:port, number of times observed + + GlobalV4 netip.AddrPort // ip:port of global IPv4 + GlobalV6 netip.AddrPort // [ip]:port of global IPv6 // CaptivePortal is set when we think there's a captive portal that is // intercepting HTTP traffic. 
@@ -125,6 +129,44 @@ type Report struct { // TODO: update Clone when adding new fields } +// GetGlobalAddrs returns the v4 and v6 global addresses observed during the +// netcheck, which includes the best latency endpoint first, followed by any +// other endpoints that were observed repeatedly. It excludes singular endpoints +// that are likely only the result of a hard NAT. +func (r *Report) GetGlobalAddrs() ([]netip.AddrPort, []netip.AddrPort) { + var v4, v6 []netip.AddrPort + // Always add the best latency entries first. + if r.GlobalV4.IsValid() { + v4 = append(v4, r.GlobalV4) + } + if r.GlobalV6.IsValid() { + v6 = append(v6, r.GlobalV6) + } + // Add any other entries for which we have multiple observations. + // This covers a case of bad NATs that start to provide new mappings for new + // STUN sessions mid-expiration, even while a live mapping for the best + // latency endpoint still exists. This has been observed on some Palo Alto + // Networks firewalls, wherein new traffic to the old endpoint will not + // succeed, but new traffic to the newly discovered endpoints does succeed. + for ipp, count := range r.GlobalV4Counters { + if ipp == r.GlobalV4 { + continue + } + if count > 1 { + v4 = append(v4, ipp) + } + } + for ipp, count := range r.GlobalV6Counters { + if ipp == r.GlobalV6 { + continue + } + if count > 1 { + v6 = append(v6, ipp) + } + } + return v4, v6 +} + // AnyPortMappingChecked reports whether any of UPnP, PMP, or PCP are non-empty. 
func (r *Report) AnyPortMappingChecked() bool { return r.UPnP != "" || r.PMP != "" || r.PCP != "" @@ -138,6 +180,8 @@ func (r *Report) Clone() *Report { r2.RegionLatency = cloneDurationMap(r2.RegionLatency) r2.RegionV4Latency = cloneDurationMap(r2.RegionV4Latency) r2.RegionV6Latency = cloneDurationMap(r2.RegionV6Latency) + r2.GlobalV4Counters = maps.Clone(r2.GlobalV4Counters) + r2.GlobalV6Counters = maps.Clone(r2.GlobalV6Counters) return &r2 } @@ -533,7 +577,7 @@ type reportState struct { sentHairCheck bool report *Report // to be returned by GetReport inFlight map[stun.TxID]func(netip.AddrPort) // called without c.mu held - gotEP4 string + gotEP4 netip.AddrPort timers []*time.Timer } @@ -640,11 +684,6 @@ func (rs *reportState) stopTimers() { // is non-zero (for all but HTTPS replies), it's recorded as our UDP // IP:port. func (rs *reportState) addNodeLatency(node *tailcfg.DERPNode, ipp netip.AddrPort, d time.Duration) { - var ipPortStr string - if ipp != (netip.AddrPort{}) { - ipPortStr = net.JoinHostPort(ipp.Addr().String(), fmt.Sprint(ipp.Port())) - } - rs.mu.Lock() defer rs.mu.Unlock() ret := rs.report @@ -670,18 +709,20 @@ func (rs *reportState) addNodeLatency(node *tailcfg.DERPNode, ipp netip.AddrPort case ipp.Addr().Is6(): updateLatency(ret.RegionV6Latency, node.RegionID, d) ret.IPv6 = true - ret.GlobalV6 = ipPortStr + ret.GlobalV6 = ipp + mak.Set(&ret.GlobalV6Counters, ipp, ret.GlobalV6Counters[ipp]+1) // TODO: track MappingVariesByDestIP for IPv6 // too? Would be sad if so, but who knows. 
case ipp.Addr().Is4(): updateLatency(ret.RegionV4Latency, node.RegionID, d) ret.IPv4 = true - if rs.gotEP4 == "" { - rs.gotEP4 = ipPortStr - ret.GlobalV4 = ipPortStr + mak.Set(&ret.GlobalV4Counters, ipp, ret.GlobalV4Counters[ipp]+1) + if !rs.gotEP4.IsValid() { + rs.gotEP4 = ipp + ret.GlobalV4 = ipp rs.startHairCheckLocked(ipp) } else { - if rs.gotEP4 != ipPortStr { + if rs.gotEP4 != ipp { ret.MappingVariesByDestIP.Set(true) } else if ret.MappingVariesByDestIP == "" { ret.MappingVariesByDestIP.Set(false) @@ -1334,11 +1375,11 @@ func (c *Client) logConciseReport(r *Report, dm *tailcfg.DERPMap) { } else { fmt.Fprintf(w, " portmap=?") } - if r.GlobalV4 != "" { - fmt.Fprintf(w, " v4a=%v", r.GlobalV4) + if r.GlobalV4.IsValid() { + fmt.Fprintf(w, " v4a=%s", r.GlobalV4) } - if r.GlobalV6 != "" { - fmt.Fprintf(w, " v6a=%v", r.GlobalV6) + if r.GlobalV6.IsValid() { + fmt.Fprintf(w, " v6a=%s", r.GlobalV6) } if r.CaptivePortal != "" { fmt.Fprintf(w, " captiveportal=%v", r.CaptivePortal) diff --git a/net/netcheck/netcheck_test.go b/net/netcheck/netcheck_test.go index 3652c2e55..4cbb06be2 100644 --- a/net/netcheck/netcheck_test.go +++ b/net/netcheck/netcheck_test.go @@ -11,6 +11,7 @@ import ( "net/http" "net/netip" "reflect" + "slices" "sort" "strconv" "strings" @@ -189,12 +190,50 @@ func TestBasic(t *testing.T) { if _, ok := r.RegionLatency[1]; !ok { t.Errorf("expected key 1 in DERPLatency; got %+v", r.RegionLatency) } - if r.GlobalV4 == "" { + if !r.GlobalV4.IsValid() { t.Error("expected GlobalV4 set") } if r.PreferredDERP != 1 { t.Errorf("PreferredDERP = %v; want 1", r.PreferredDERP) } + v4Addrs, _ := r.GetGlobalAddrs() + if len(v4Addrs) != 1 { + t.Error("expected one global IPv4 address") + } + if got, want := v4Addrs[0], r.GlobalV4; got != want { + t.Errorf("got %v; want %v", got, want) + } +} + +func TestMultiGlobalAddressMapping(t *testing.T) { + c := &Client{ + Logf: t.Logf, + } + + rs := &reportState{ + c: c, + start: time.Now(), + report: newReport(), + sentHairCheck: 
true, // prevent hair check start, not relevant here + } + derpNode := &tailcfg.DERPNode{} + port1 := netip.MustParseAddrPort("127.0.0.1:1234") + port2 := netip.MustParseAddrPort("127.0.0.1:2345") + port3 := netip.MustParseAddrPort("127.0.0.1:3456") + // First report for port1 + rs.addNodeLatency(derpNode, port1, 10*time.Millisecond) + // Singular report for port2 + rs.addNodeLatency(derpNode, port2, 11*time.Millisecond) + // Duplicate reports for port3 + rs.addNodeLatency(derpNode, port3, 12*time.Millisecond) + rs.addNodeLatency(derpNode, port3, 13*time.Millisecond) + + r := rs.report + v4Addrs, _ := r.GetGlobalAddrs() + wantV4Addrs := []netip.AddrPort{port1, port3} + if !slices.Equal(v4Addrs, wantV4Addrs) { + t.Errorf("got global addresses: %v, want %v", v4Addrs, wantV4Addrs) + } } func TestWorksWhenUDPBlocked(t *testing.T) { diff --git a/wgengine/magicsock/magicsock.go b/wgengine/magicsock/magicsock.go index 397796030..4f1617140 100644 --- a/wgengine/magicsock/magicsock.go +++ b/wgengine/magicsock/magicsock.go @@ -900,23 +900,24 @@ func (c *Conn) determineEndpoints(ctx context.Context) ([]tailcfg.Endpoint, erro c.setNetInfoHavePortMap() } - if nr.GlobalV4 != "" { - addAddr(ipp(nr.GlobalV4), tailcfg.EndpointSTUN) + v4Addrs, v6Addrs := nr.GetGlobalAddrs() + for _, addr := range v4Addrs { + addAddr(addr, tailcfg.EndpointSTUN) + } + for _, addr := range v6Addrs { + addAddr(addr, tailcfg.EndpointSTUN) + } + if len(v4Addrs) >= 1 { // If they're behind a hard NAT and are using a fixed // port locally, assume they might've added a static // port mapping on their router to the same explicit // port that tailscaled is running with. Worst case // it's an invalid candidate mapping. 
if port := c.port.Load(); nr.MappingVariesByDestIP.EqualBool(true) && port != 0 { - if ip, _, err := net.SplitHostPort(nr.GlobalV4); err == nil { - addAddr(ipp(net.JoinHostPort(ip, strconv.Itoa(int(port)))), tailcfg.EndpointSTUN4LocalPort) - } + addAddr(netip.AddrPortFrom(v4Addrs[0].Addr(), uint16(port)), tailcfg.EndpointSTUN4LocalPort) } } - if nr.GlobalV6 != "" { - addAddr(ipp(nr.GlobalV6), tailcfg.EndpointSTUN) - } // Update our set of endpoints by adding any endpoints that we // previously found but haven't expired yet. This also updates the