net/netcheck,wgengine/magicsock: add potential workaround for Palo Alto DIPP misbehavior

Palo Alto firewalls typically have a hard NAT, but they also offer a mode
called Persistent DIPP that is supposed to provide consistent port
mapping suitable for STUN resolution of public ports. Persistent DIPP
works initially on most Palo Alto firewalls, but some models/software
versions have a bug that this change works around.

The bug presents with the following symptoms:

- STUN sessions initially resolve a consistent public IP:port.
- Much later, netchecks report the same IP:port only for a subset of
  sessions, most often the user's active DERP and/or ports carrying
  sustained traffic.
- The broader set of DERPs in a full netcheck now consistently
  observe a new IP:port.
- From this point on, new inbound connections only succeed toward the
  newly observed IP:port, while existing/old sessions only work toward
  the old binding.

With this patch we still advertise the lowest-latency global endpoint
discovered, as we always have, but we now also advertise any global
endpoints that were observed more than once in a single netcheck
report. This should provide viable endpoints for connection
establishment across a NAT with this behavior.
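
As a rough illustration, here is a minimal sketch of how a caller might
consume the new Report.GetGlobalAddrs added below. The addCandidate
callback is a hypothetical stand-in for the caller's own endpoint
bookkeeping (in magicsock the real counterpart is addAddr):

package example

import (
	"net/netip"

	"tailscale.com/net/netcheck"
)

// collectSTUNEndpoints feeds the global addresses from a netcheck report
// into a caller-supplied candidate function. GetGlobalAddrs returns the
// best-latency endpoint for each address family first, followed by any
// other endpoint observed more than once during the netcheck, so every
// returned address is worth advertising.
func collectSTUNEndpoints(r *netcheck.Report, addCandidate func(netip.AddrPort)) {
	v4, v6 := r.GetGlobalAddrs()
	for _, ap := range v4 {
		addCandidate(ap) // hypothetical callback, e.g. magicsock's addAddr
	}
	for _, ap := range v6 {
		addCandidate(ap)
	}
}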

Updates tailscale/corp#19106

Signed-off-by: James Tucker <james@tailscale.com>

@@ -127,13 +127,13 @@ func printReport(dm *tailcfg.DERPMap, report *netcheck.Report) error {
 	printf("\nReport:\n")
 	printf("\t* UDP: %v\n", report.UDP)
-	if report.GlobalV4 != "" {
-		printf("\t* IPv4: yes, %v\n", report.GlobalV4)
+	if report.GlobalV4.IsValid() {
+		printf("\t* IPv4: yes, %s\n", report.GlobalV4)
 	} else {
 		printf("\t* IPv4: (no addr found)\n")
 	}
-	if report.GlobalV6 != "" {
-		printf("\t* IPv6: yes, %v\n", report.GlobalV6)
+	if report.GlobalV6.IsValid() {
+		printf("\t* IPv6: yes, %s\n", report.GlobalV6)
 	} else if report.IPv6 {
 		printf("\t* IPv6: (no addr found)\n")
 	} else if report.OSHasIPv6 {

@@ -13,6 +13,7 @@ import (
 	"fmt"
 	"io"
 	"log"
+	"maps"
 	"math/rand"
 	"net"
 	"net/http"
@@ -115,8 +116,11 @@ type Report struct {
 	RegionV4Latency map[int]time.Duration // keyed by DERP Region ID
 	RegionV6Latency map[int]time.Duration // keyed by DERP Region ID
-	GlobalV4 string // ip:port of global IPv4
-	GlobalV6 string // [ip]:port of global IPv6
+	GlobalV4Counters map[netip.AddrPort]int // keyed by IP:port, number of times observed
+	GlobalV6Counters map[netip.AddrPort]int // keyed by [IP]:port, number of times observed
+	GlobalV4 netip.AddrPort // ip:port of global IPv4
+	GlobalV6 netip.AddrPort // [ip]:port of global IPv6
 	// CaptivePortal is set when we think there's a captive portal that is
 	// intercepting HTTP traffic.
@@ -125,6 +129,44 @@ type Report struct {
 	// TODO: update Clone when adding new fields
 }
 
+// GetGlobalAddrs returns the v4 and v6 global addresses observed during the
+// netcheck, which includes the best latency endpoint first, followed by any
+// other endpoints that were observed repeatedly. It excludes singular endpoints
+// that are likely only the result of a hard NAT.
+func (r *Report) GetGlobalAddrs() ([]netip.AddrPort, []netip.AddrPort) {
+	var v4, v6 []netip.AddrPort
+	// Always add the best latency entries first.
+	if r.GlobalV4.IsValid() {
+		v4 = append(v4, r.GlobalV4)
+	}
+	if r.GlobalV6.IsValid() {
+		v6 = append(v6, r.GlobalV6)
+	}
+	// Add any other entries for which we have multiple observations.
+	// This covers a case of bad NATs that start to provide new mappings for new
+	// STUN sessions mid-expiration, even while a live mapping for the best
+	// latency endpoint still exists. This has been observed on some Palo Alto
+	// Networks firewalls, wherein new traffic to the old endpoint will not
+	// succeed, but new traffic to the newly discovered endpoints does succeed.
+	for ipp, count := range r.GlobalV4Counters {
+		if ipp == r.GlobalV4 {
+			continue
+		}
+		if count > 1 {
+			v4 = append(v4, ipp)
+		}
+	}
+	for ipp, count := range r.GlobalV6Counters {
+		if ipp == r.GlobalV6 {
+			continue
+		}
+		if count > 1 {
+			v6 = append(v6, ipp)
+		}
+	}
+	return v4, v6
+}
+
 // AnyPortMappingChecked reports whether any of UPnP, PMP, or PCP are non-empty.
 func (r *Report) AnyPortMappingChecked() bool {
 	return r.UPnP != "" || r.PMP != "" || r.PCP != ""
@@ -138,6 +180,8 @@ func (r *Report) Clone() *Report {
 	r2.RegionLatency = cloneDurationMap(r2.RegionLatency)
 	r2.RegionV4Latency = cloneDurationMap(r2.RegionV4Latency)
 	r2.RegionV6Latency = cloneDurationMap(r2.RegionV6Latency)
+	r2.GlobalV4Counters = maps.Clone(r2.GlobalV4Counters)
+	r2.GlobalV6Counters = maps.Clone(r2.GlobalV6Counters)
 	return &r2
 }
@@ -533,7 +577,7 @@ type reportState struct {
 	sentHairCheck bool
 	report *Report // to be returned by GetReport
 	inFlight map[stun.TxID]func(netip.AddrPort) // called without c.mu held
-	gotEP4 string
+	gotEP4 netip.AddrPort
 	timers []*time.Timer
 }
@@ -640,11 +684,6 @@ func (rs *reportState) stopTimers() {
 // is non-zero (for all but HTTPS replies), it's recorded as our UDP
 // IP:port.
 func (rs *reportState) addNodeLatency(node *tailcfg.DERPNode, ipp netip.AddrPort, d time.Duration) {
-	var ipPortStr string
-	if ipp != (netip.AddrPort{}) {
-		ipPortStr = net.JoinHostPort(ipp.Addr().String(), fmt.Sprint(ipp.Port()))
-	}
 	rs.mu.Lock()
 	defer rs.mu.Unlock()
 	ret := rs.report
@@ -670,18 +709,20 @@ func (rs *reportState) addNodeLatency(node *tailcfg.DERPNode, ipp netip.AddrPort
 	case ipp.Addr().Is6():
 		updateLatency(ret.RegionV6Latency, node.RegionID, d)
 		ret.IPv6 = true
-		ret.GlobalV6 = ipPortStr
+		ret.GlobalV6 = ipp
+		mak.Set(&ret.GlobalV6Counters, ipp, ret.GlobalV6Counters[ipp]+1)
 		// TODO: track MappingVariesByDestIP for IPv6
 		// too? Would be sad if so, but who knows.
 	case ipp.Addr().Is4():
 		updateLatency(ret.RegionV4Latency, node.RegionID, d)
 		ret.IPv4 = true
-		if rs.gotEP4 == "" {
-			rs.gotEP4 = ipPortStr
-			ret.GlobalV4 = ipPortStr
+		mak.Set(&ret.GlobalV4Counters, ipp, ret.GlobalV4Counters[ipp]+1)
+		if !rs.gotEP4.IsValid() {
+			rs.gotEP4 = ipp
+			ret.GlobalV4 = ipp
 			rs.startHairCheckLocked(ipp)
 		} else {
-			if rs.gotEP4 != ipPortStr {
+			if rs.gotEP4 != ipp {
 				ret.MappingVariesByDestIP.Set(true)
 			} else if ret.MappingVariesByDestIP == "" {
 				ret.MappingVariesByDestIP.Set(false)
@@ -1334,11 +1375,11 @@ func (c *Client) logConciseReport(r *Report, dm *tailcfg.DERPMap) {
 		} else {
 			fmt.Fprintf(w, " portmap=?")
 		}
-		if r.GlobalV4 != "" {
-			fmt.Fprintf(w, " v4a=%v", r.GlobalV4)
+		if r.GlobalV4.IsValid() {
+			fmt.Fprintf(w, " v4a=%s", r.GlobalV4)
 		}
-		if r.GlobalV6 != "" {
-			fmt.Fprintf(w, " v6a=%v", r.GlobalV6)
+		if r.GlobalV6.IsValid() {
+			fmt.Fprintf(w, " v6a=%s", r.GlobalV6)
 		}
 		if r.CaptivePortal != "" {
 			fmt.Fprintf(w, " captiveportal=%v", r.CaptivePortal)

@@ -11,6 +11,7 @@ import (
 	"net/http"
 	"net/netip"
 	"reflect"
+	"slices"
 	"sort"
 	"strconv"
 	"strings"
@@ -189,12 +190,50 @@ func TestBasic(t *testing.T) {
 	if _, ok := r.RegionLatency[1]; !ok {
 		t.Errorf("expected key 1 in DERPLatency; got %+v", r.RegionLatency)
 	}
-	if r.GlobalV4 == "" {
+	if !r.GlobalV4.IsValid() {
 		t.Error("expected GlobalV4 set")
 	}
 	if r.PreferredDERP != 1 {
 		t.Errorf("PreferredDERP = %v; want 1", r.PreferredDERP)
 	}
+	v4Addrs, _ := r.GetGlobalAddrs()
+	if len(v4Addrs) != 1 {
+		t.Error("expected one global IPv4 address")
+	}
+	if got, want := v4Addrs[0], r.GlobalV4; got != want {
+		t.Errorf("got %v; want %v", got, want)
+	}
 }
 
+func TestMultiGlobalAddressMapping(t *testing.T) {
+	c := &Client{
+		Logf: t.Logf,
+	}
+	rs := &reportState{
+		c: c,
+		start: time.Now(),
+		report: newReport(),
+		sentHairCheck: true, // prevent hair check start, not relevant here
+	}
+	derpNode := &tailcfg.DERPNode{}
+	port1 := netip.MustParseAddrPort("127.0.0.1:1234")
+	port2 := netip.MustParseAddrPort("127.0.0.1:2345")
+	port3 := netip.MustParseAddrPort("127.0.0.1:3456")
+	// First report for port1
+	rs.addNodeLatency(derpNode, port1, 10*time.Millisecond)
+	// Singular report for port2
+	rs.addNodeLatency(derpNode, port2, 11*time.Millisecond)
+	// Duplicate reports for port3
+	rs.addNodeLatency(derpNode, port3, 12*time.Millisecond)
+	rs.addNodeLatency(derpNode, port3, 13*time.Millisecond)
+	r := rs.report
+	v4Addrs, _ := r.GetGlobalAddrs()
+	wantV4Addrs := []netip.AddrPort{port1, port3}
+	if !slices.Equal(v4Addrs, wantV4Addrs) {
+		t.Errorf("got global addresses: %v, want %v", v4Addrs, wantV4Addrs)
+	}
+}
+
 func TestWorksWhenUDPBlocked(t *testing.T) {
func TestWorksWhenUDPBlocked(t *testing.T) {

@@ -900,23 +900,24 @@ func (c *Conn) determineEndpoints(ctx context.Context) ([]tailcfg.Endpoint, erro
 		c.setNetInfoHavePortMap()
 	}
-	if nr.GlobalV4 != "" {
-		addAddr(ipp(nr.GlobalV4), tailcfg.EndpointSTUN)
+	v4Addrs, v6Addrs := nr.GetGlobalAddrs()
+	for _, addr := range v4Addrs {
+		addAddr(addr, tailcfg.EndpointSTUN)
+	}
+	for _, addr := range v6Addrs {
+		addAddr(addr, tailcfg.EndpointSTUN)
+	}
+	if len(v4Addrs) >= 1 {
 		// If they're behind a hard NAT and are using a fixed
 		// port locally, assume they might've added a static
 		// port mapping on their router to the same explicit
 		// port that tailscaled is running with. Worst case
 		// it's an invalid candidate mapping.
 		if port := c.port.Load(); nr.MappingVariesByDestIP.EqualBool(true) && port != 0 {
-			if ip, _, err := net.SplitHostPort(nr.GlobalV4); err == nil {
-				addAddr(ipp(net.JoinHostPort(ip, strconv.Itoa(int(port)))), tailcfg.EndpointSTUN4LocalPort)
-			}
+			addAddr(netip.AddrPortFrom(v4Addrs[0].Addr(), uint16(port)), tailcfg.EndpointSTUN4LocalPort)
 		}
 	}
-	if nr.GlobalV6 != "" {
-		addAddr(ipp(nr.GlobalV6), tailcfg.EndpointSTUN)
-	}
 
 	// Update our set of endpoints by adding any endpoints that we
 	// previously found but haven't expired yet. This also updates the
