// Copyright (c) Tailscale Inc & AUTHORS // SPDX-License-Identifier: BSD-3-Clause package magicsock import ( "bufio" "context" "encoding/binary" "errors" "fmt" "math" "math/rand" "net" "net/netip" "reflect" "runtime" "slices" "sync" "sync/atomic" "time" "golang.org/x/crypto/poly1305" xmaps "golang.org/x/exp/maps" "golang.org/x/net/ipv4" "golang.org/x/net/ipv6" "tailscale.com/disco" "tailscale.com/ipn/ipnstate" "tailscale.com/net/stun" "tailscale.com/net/tstun" "tailscale.com/tailcfg" "tailscale.com/tstime/mono" "tailscale.com/types/key" "tailscale.com/types/logger" "tailscale.com/util/mak" "tailscale.com/util/ringbuffer" ) var mtuProbePingSizesV4 []int var mtuProbePingSizesV6 []int func init() { for _, m := range tstun.WireMTUsToProbe { mtuProbePingSizesV4 = append(mtuProbePingSizesV4, pktLenToPingSize(m, false)) mtuProbePingSizesV6 = append(mtuProbePingSizesV6, pktLenToPingSize(m, true)) } } // endpoint is a wireguard/conn.Endpoint. In wireguard-go and kernel WireGuard // there is only one endpoint for a peer, but in Tailscale we distribute a // number of possible endpoints for a peer which would include the all the // likely addresses at which a peer may be reachable. This endpoint type holds // the information required that when wireguard-go wants to send to a // particular peer (essentially represented by this endpoint type), the send // function can use the currently best known Tailscale endpoint to send packets // to the peer. type endpoint struct { // atomically accessed; declared first for alignment reasons lastRecvWG mono.Time // last time there were incoming packets from this peer destined for wireguard-go (e.g. not disco) lastRecvUDPAny mono.Time // last time there were incoming UDP packets from this peer of any kind numStopAndResetAtomic int64 debugUpdates *ringbuffer.RingBuffer[EndpointChange] // These fields are initialized once and never modified. c *Conn nodeID tailcfg.NodeID publicKey key.NodePublic // peer public key (for WireGuard + DERP) publicKeyHex string // cached output of publicKey.UntypedHexString fakeWGAddr netip.AddrPort // the UDP address we tell wireguard-go we're using nodeAddr netip.Addr // the node's first tailscale address; used for logging & wireguard rate-limiting (Issue 6686) disco atomic.Pointer[endpointDisco] // if the peer supports disco, the key and short string // mu protects all following fields. mu sync.Mutex // Lock ordering: Conn.mu, then endpoint.mu heartBeatTimer *time.Timer // nil when idle lastSendExt mono.Time // last time there were outgoing packets sent to this peer from an external trigger (e.g. wireguard-go or disco pingCLI) lastSendAny mono.Time // last time there were outgoing packets sent this peer from any trigger, internal or external to magicsock lastFullPing mono.Time // last time we pinged all disco or wireguard only endpoints derpAddr netip.AddrPort // fallback/bootstrap path, if non-zero (non-zero for well-behaved clients) bestAddr addrQuality // best non-DERP path; zero if none; mutate via setBestAddrLocked() bestAddrAt mono.Time // time best address re-confirmed trustBestAddrUntil mono.Time // time when bestAddr expires sentPing map[stun.TxID]sentPing endpointState map[netip.AddrPort]*endpointState isCallMeMaybeEP map[netip.AddrPort]bool // The following fields are related to the new "silent disco" // implementation that's a WIP as of 2022-10-20. // See #540 for background. heartbeatDisabled bool probeUDPLifetime *probeUDPLifetime // UDP path lifetime probing; nil if disabled expired bool // whether the node has expired isWireguardOnly bool // whether the endpoint is WireGuard only } func (de *endpoint) setBestAddrLocked(v addrQuality) { if v.AddrPort != de.bestAddr.AddrPort { de.probeUDPLifetime.resetCycleEndpointLocked() } de.bestAddr = v } const ( // udpLifetimeProbeCliffSlack is how much slack to use relative to a // ProbeUDPLifetimeConfig.Cliffs duration in order to account for RTT, // scheduling jitter, buffers, etc. If the cliff is 10s, we attempt to probe // after 10s - 2s (8s) amount of inactivity. udpLifetimeProbeCliffSlack = time.Second * 2 // udpLifetimeProbeSchedulingTolerance is how much of a difference can be // tolerated between a UDP lifetime probe scheduling target and when it // actually fired. This must be some fraction of udpLifetimeProbeCliffSlack. udpLifetimeProbeSchedulingTolerance = udpLifetimeProbeCliffSlack / 8 ) // probeUDPLifetime represents the configuration and state tied to probing UDP // path lifetime. A probe "cycle" involves pinging the UDP path at various // timeout cliffs, which are pre-defined durations of interest commonly used by // NATs/firewalls as default stateful session timeout values. Cliffs are probed // in ascending order. A "cycle" completes when all cliffs have received a pong, // or when a ping times out. Cycles may extend across endpoint session lifetimes // if they are disrupted by user traffic. type probeUDPLifetime struct { // All fields are guarded by endpoint.mu. probeUDPLifetime methods are for // convenience. // config holds the probing configuration. config ProbeUDPLifetimeConfig // timer is nil when idle. A non-nil timer indicates we intend to probe a // timeout cliff in the future. timer *time.Timer // bestAddr contains the endpoint.bestAddr.AddrPort at the time a cycle was // scheduled to start. A probing cycle is 1:1 with the current // endpoint.bestAddr.AddrPort in the interest of simplicity. When // endpoint.bestAddr.AddrPort changes, any active probing cycle will reset. bestAddr netip.AddrPort // cycleStartedAt contains the time at which the first cliff // (ProbeUDPLifetimeConfig.Cliffs[0]) was pinged for the current/last cycle. cycleStartedAt time.Time // cycleActive is true if a probing cycle is active, otherwise false. cycleActive bool // currentCliff represents the index into ProbeUDPLifetimeConfig.Cliffs for // the cliff that we are waiting to ping, or waiting on a pong/timeout. currentCliff int // lastTxID is the ID for the last ping that was sent. lastTxID stun.TxID } func (p *probeUDPLifetime) currentCliffDurationEndpointLocked() time.Duration { if p == nil { return 0 } return p.config.Cliffs[p.currentCliff] } // cycleCompleteMaxCliffEndpointLocked records the max cliff (as an index of // ProbeUDPLifetimeConfig.Cliffs) a probing cycle reached, i.e. received a pong // for. A value < 0 indicates no cliff was reached. It is a no-op if the active // configuration does not equal defaultProbeUDPLifetimeConfig. func (p *probeUDPLifetime) cycleCompleteMaxCliffEndpointLocked(cliff int) { if !p.config.Equals(defaultProbeUDPLifetimeConfig) { return } switch { case cliff < 0: metricUDPLifetimeCycleCompleteNoCliffReached.Add(1) case cliff == 0: metricUDPLifetimeCycleCompleteAt10sCliff.Add(1) case cliff == 1: metricUDPLifetimeCycleCompleteAt30sCliff.Add(1) case cliff == 2: metricUDPLifetimeCycleCompleteAt60sCliff.Add(1) } } // resetCycleEndpointLocked resets the state contained in p to reflect an // inactive cycle. func (p *probeUDPLifetime) resetCycleEndpointLocked() { if p == nil { return } if p.timer != nil { p.timer.Stop() p.timer = nil } p.cycleActive = false p.currentCliff = 0 p.bestAddr = netip.AddrPort{} } // ProbeUDPLifetimeConfig represents the configuration for probing UDP path // lifetime. type ProbeUDPLifetimeConfig struct { // The timeout cliffs to probe. Values are in ascending order. Ascending // order is chosen over descending because we have limited opportunities to // probe. With a descending order we are stuck waiting for a new UDP // path/session if the first value times out. When that new path is // established is anyone's guess. Cliffs []time.Duration // CycleCanStartEvery represents the min duration between cycles starting // up. CycleCanStartEvery time.Duration } var ( // defaultProbeUDPLifetimeConfig is the configuration that must be used // for UDP path lifetime probing until it can be wholly disseminated (not // just on/off) from upstream control components, and associated metrics // (metricUDPLifetime*) have lifetime management. // // TODO(#10928): support dynamic config via tailcfg.PeerCapMap. defaultProbeUDPLifetimeConfig = &ProbeUDPLifetimeConfig{ Cliffs: []time.Duration{ time.Second * 10, time.Second * 30, time.Second * 60, }, CycleCanStartEvery: time.Hour * 24, } ) // Equals returns true if b equals p, otherwise false. If both sides are nil, // Equals returns true. If only one side is nil, Equals returns false. func (p *ProbeUDPLifetimeConfig) Equals(b *ProbeUDPLifetimeConfig) bool { if p == b { return true } if (p == nil && b != nil) || (b == nil && p != nil) { return false } if !slices.Equal(p.Cliffs, b.Cliffs) { return false } if p.CycleCanStartEvery != b.CycleCanStartEvery { return false } return true } // Valid returns true if p is valid, otherwise false. p must be non-nil. func (p *ProbeUDPLifetimeConfig) Valid() bool { if len(p.Cliffs) < 1 { // We need at least one cliff, otherwise there is nothing to probe. return false } if p.CycleCanStartEvery < 1 { // Probing must be constrained by a positive CycleCanStartEvery. return false } for i, c := range p.Cliffs { if c <= max(udpLifetimeProbeCliffSlack*2, heartbeatInterval) { // A timeout cliff less than or equal to twice // udpLifetimeProbeCliffSlack is invalid due to being effectively // zero when the cliff slack is subtracted from the cliff value at // scheduling time. // // A timeout cliff less or equal to the heartbeatInterval is also // invalid, as we may attempt to schedule on the tail end of the // last heartbeat tied to an active session. // // These values are constants, but max()'d in case they change in // the future. return false } if i == 0 { continue } if c <= p.Cliffs[i-1] { // Cliffs must be in ascending order. return false } } return true } // setProbeUDPLifetimeOn enables or disables probing of UDP path lifetime based // on v. In the case of enablement defaultProbeUDPLifetimeConfig is used as the // desired configuration. func (de *endpoint) setProbeUDPLifetimeOn(v bool) { de.mu.Lock() if v { de.setProbeUDPLifetimeConfigLocked(defaultProbeUDPLifetimeConfig) } else { de.setProbeUDPLifetimeConfigLocked(nil) } de.mu.Unlock() } // setProbeUDPLifetimeConfigLocked sets the desired configuration for probing // UDP path lifetime. Ownership of desired is passed to endpoint, it must not be // mutated once this call is made. A nil value disables the feature. If desired // is non-nil but desired.Valid() returns false this is a no-op. func (de *endpoint) setProbeUDPLifetimeConfigLocked(desired *ProbeUDPLifetimeConfig) { if de.isWireguardOnly { return } if desired == nil { if de.probeUDPLifetime == nil { // noop, not currently configured or desired return } de.probeUDPLifetime.resetCycleEndpointLocked() de.probeUDPLifetime = nil return } if !desired.Valid() { return } if de.probeUDPLifetime != nil { if de.probeUDPLifetime.config.Equals(desired) { // noop, current config equals desired return } de.probeUDPLifetime.resetCycleEndpointLocked() } else { de.probeUDPLifetime = &probeUDPLifetime{} } p := de.probeUDPLifetime p.config = *desired p.resetCycleEndpointLocked() } // endpointDisco is the current disco key and short string for an endpoint. This // structure is immutable. type endpointDisco struct { key key.DiscoPublic // for discovery messages. short string // ShortString of discoKey. } type sentPing struct { to netip.AddrPort at mono.Time timer *time.Timer // timeout timer purpose discoPingPurpose size int // size of the disco message resCB *pingResultAndCallback // or nil for internal use } // endpointState is some state and history for a specific endpoint of // a endpoint. (The subject is the endpoint.endpointState // map key) type endpointState struct { // all fields guarded by endpoint.mu // lastPing is the last (outgoing) ping time. lastPing mono.Time // lastGotPing, if non-zero, means that this was an endpoint // that we learned about at runtime (from an incoming ping) // and that is not in the network map. If so, we keep the time // updated and use it to discard old candidates. lastGotPing time.Time // lastGotPingTxID contains the TxID for the last incoming ping. This is // used to de-dup incoming pings that we may see on both the raw disco // socket on Linux, and UDP socket. We cannot rely solely on the raw socket // disco handling due to https://github.com/tailscale/tailscale/issues/7078. lastGotPingTxID stun.TxID // callMeMaybeTime, if non-zero, is the time this endpoint // was advertised last via a call-me-maybe disco message. callMeMaybeTime time.Time recentPongs []pongReply // ring buffer up to pongHistoryCount entries recentPong uint16 // index into recentPongs of most recent; older before, wrapped index int16 // index in nodecfg.Node.Endpoints; meaningless if lastGotPing non-zero } // clear removes all derived / probed state from an endpointState. func (s *endpointState) clear() { *s = endpointState{ index: s.index, lastGotPing: s.lastGotPing, } } // pongHistoryCount is how many pongReply values we keep per endpointState const pongHistoryCount = 64 type pongReply struct { latency time.Duration pongAt mono.Time // when we received the pong from netip.AddrPort // the pong's src (usually same as endpoint map key) pongSrc netip.AddrPort // what they reported they heard } // EndpointChange is a structure containing information about changes made to a // particular endpoint. This is not a stable interface and could change at any // time. type EndpointChange struct { When time.Time // when the change occurred What string // what this change is From any `json:",omitempty"` // information about the previous state To any `json:",omitempty"` // information about the new state } // shouldDeleteLocked reports whether we should delete this endpoint. func (st *endpointState) shouldDeleteLocked() bool { switch { case !st.callMeMaybeTime.IsZero(): return false case st.lastGotPing.IsZero(): // This was an endpoint from the network map. Is it still in the network map? return st.index == indexSentinelDeleted default: // This was an endpoint discovered at runtime. return time.Since(st.lastGotPing) > sessionActiveTimeout } } // latencyLocked returns the most recent latency measurement, if any. // endpoint.mu must be held. func (st *endpointState) latencyLocked() (lat time.Duration, ok bool) { if len(st.recentPongs) == 0 { return 0, false } return st.recentPongs[st.recentPong].latency, true } // endpoint.mu must be held. func (st *endpointState) addPongReplyLocked(r pongReply) { if n := len(st.recentPongs); n < pongHistoryCount { st.recentPong = uint16(n) st.recentPongs = append(st.recentPongs, r) return } i := st.recentPong + 1 if i == pongHistoryCount { i = 0 } st.recentPongs[i] = r st.recentPong = i } func (de *endpoint) deleteEndpointLocked(why string, ep netip.AddrPort) { de.debugUpdates.Add(EndpointChange{ When: time.Now(), What: "deleteEndpointLocked-" + why, From: ep, }) delete(de.endpointState, ep) if de.bestAddr.AddrPort == ep { de.debugUpdates.Add(EndpointChange{ When: time.Now(), What: "deleteEndpointLocked-bestAddr-" + why, From: de.bestAddr, }) de.setBestAddrLocked(addrQuality{}) } } // initFakeUDPAddr populates fakeWGAddr with a globally unique fake UDPAddr. // The current implementation just uses the pointer value of de jammed into an IPv6 // address, but it could also be, say, a counter. func (de *endpoint) initFakeUDPAddr() { var addr [16]byte addr[0] = 0xfd addr[1] = 0x00 binary.BigEndian.PutUint64(addr[2:], uint64(reflect.ValueOf(de).Pointer())) de.fakeWGAddr = netip.AddrPortFrom(netip.AddrFrom16(addr).Unmap(), 12345) } // noteRecvActivity records receive activity on de, and invokes // Conn.noteRecvActivity no more than once every 10s. func (de *endpoint) noteRecvActivity(ipp netip.AddrPort, now mono.Time) { if de.isWireguardOnly { de.mu.Lock() de.bestAddr.AddrPort = ipp de.bestAddrAt = now de.trustBestAddrUntil = now.Add(5 * time.Second) de.mu.Unlock() } else { // TODO(jwhited): subject to change as part of silent disco effort. // Necessary when heartbeat is disabled for the endpoint, otherwise we // kick off discovery disco pings every trustUDPAddrDuration and mirror // to DERP. de.mu.Lock() if de.heartbeatDisabled && de.bestAddr.AddrPort == ipp { de.trustBestAddrUntil = now.Add(trustUDPAddrDuration) } de.mu.Unlock() } elapsed := now.Sub(de.lastRecvWG.LoadAtomic()) if elapsed > 10*time.Second { de.lastRecvWG.StoreAtomic(now) if de.c.noteRecvActivity == nil { return } de.c.noteRecvActivity(de.publicKey) } } func (de *endpoint) discoShort() string { var short string if d := de.disco.Load(); d != nil { short = d.short } return short } // String exists purely so wireguard-go internals can log.Printf("%v") // its internal conn.Endpoints and we don't end up with data races // from fmt (via log) reading mutex fields and such. func (de *endpoint) String() string { return fmt.Sprintf("magicsock.endpoint{%v, %v}", de.publicKey.ShortString(), de.discoShort()) } func (de *endpoint) ClearSrc() {} func (de *endpoint) SrcToString() string { panic("unused") } // unused by wireguard-go func (de *endpoint) SrcIP() netip.Addr { panic("unused") } // unused by wireguard-go func (de *endpoint) DstToString() string { return de.publicKeyHex } func (de *endpoint) DstIP() netip.Addr { return de.nodeAddr } // see tailscale/tailscale#6686 func (de *endpoint) DstToBytes() []byte { return packIPPort(de.fakeWGAddr) } // addrForSendLocked returns the address(es) that should be used for // sending the next packet. Zero, one, or both of UDP address and DERP // addr may be non-zero. If the endpoint is WireGuard only and does not have // latency information, a bool is returned to indicate that the // WireGuard latency discovery pings should be sent. // // de.mu must be held. // // TODO(val): Rewrite the addrFor*Locked() variations to share code. func (de *endpoint) addrForSendLocked(now mono.Time) (udpAddr, derpAddr netip.AddrPort, sendWGPing bool) { udpAddr = de.bestAddr.AddrPort if udpAddr.IsValid() && !now.After(de.trustBestAddrUntil) { return udpAddr, netip.AddrPort{}, false } if de.isWireguardOnly { // If the endpoint is wireguard-only, we don't have a DERP // address to send to, so we have to send to the UDP address. udpAddr, shouldPing := de.addrForWireGuardSendLocked(now) return udpAddr, netip.AddrPort{}, shouldPing } // We had a bestAddr but it expired so send both to it // and DERP. return udpAddr, de.derpAddr, false } // addrForWireGuardSendLocked returns the address that should be used for // sending the next packet. If a packet has never or not recently been sent to // the endpoint, then a randomly selected address for the endpoint is returned, // as well as a bool indiciating that WireGuard discovery pings should be started. // If the addresses have latency information available, then the address with the // best latency is used. // // de.mu must be held. func (de *endpoint) addrForWireGuardSendLocked(now mono.Time) (udpAddr netip.AddrPort, shouldPing bool) { if len(de.endpointState) == 0 { de.c.logf("magicsock: addrForSendWireguardLocked: [unexpected] no candidates available for endpoint") return udpAddr, false } // lowestLatency is a high duration initially, so we // can be sure we're going to have a duration lower than this // for the first latency retrieved. lowestLatency := time.Hour var oldestPing mono.Time for ipp, state := range de.endpointState { if oldestPing.IsZero() { oldestPing = state.lastPing } else if state.lastPing.Before(oldestPing) { oldestPing = state.lastPing } if latency, ok := state.latencyLocked(); ok { if latency < lowestLatency || latency == lowestLatency && ipp.Addr().Is6() { // If we have the same latency,IPv6 is prioritized. // TODO(catzkorn): Consider a small increase in latency to use // IPv6 in comparison to IPv4, when possible. lowestLatency = latency udpAddr = ipp } } } needPing := len(de.endpointState) > 1 && now.Sub(oldestPing) > wireguardPingInterval if !udpAddr.IsValid() { candidates := xmaps.Keys(de.endpointState) // Randomly select an address to use until we retrieve latency information // and give it a short trustBestAddrUntil time so we avoid flapping between // addresses while waiting on latency information to be populated. udpAddr = candidates[rand.Intn(len(candidates))] } de.bestAddr.AddrPort = udpAddr // Only extend trustBestAddrUntil by one second to avoid packet // reordering and/or CPU usage from random selection during the first // second. We should receive a response due to a WireGuard handshake in // less than one second in good cases, in which case this will be then // extended to 15 seconds. de.trustBestAddrUntil = now.Add(time.Second) return udpAddr, needPing } // addrForPingSizeLocked returns the address(es) that should be used for sending // the next ping. It will only return addrs with a large enough path MTU to // permit a ping payload of size bytes to be delivered (DERP is always one such // addr as it is a TCP connection). If it returns a zero-value udpAddr, then we // should continue probing the MTU of all paths to this endpoint. Zero, one, or // both of the returned UDP address and DERP address may be non-zero. // // de.mu must be held. func (de *endpoint) addrForPingSizeLocked(now mono.Time, size int) (udpAddr, derpAddr netip.AddrPort) { if size == 0 { udpAddr, derpAddr, _ = de.addrForSendLocked(now) return } udpAddr = de.bestAddr.AddrPort pathMTU := de.bestAddr.wireMTU requestedMTU := pingSizeToPktLen(size, udpAddr.Addr().Is6()) mtuOk := requestedMTU <= pathMTU if udpAddr.IsValid() && mtuOk { if !now.After(de.trustBestAddrUntil) { return udpAddr, netip.AddrPort{} } // We had a bestAddr with large enough MTU but it expired, so // send both to it and DERP. return udpAddr, de.derpAddr } // The UDP address isn't valid or it doesn't have a path MTU big enough // for the packet. Return a zero-value udpAddr to signal that we should // keep probing the path MTU to all addresses for this endpoint, and a // valid DERP addr to signal that we should also send via DERP. return netip.AddrPort{}, de.derpAddr } // maybeProbeUDPLifetimeLocked returns an afterInactivityFor duration and true // if de is a candidate for UDP path lifetime probing in the future, otherwise // false. func (de *endpoint) maybeProbeUDPLifetimeLocked() (afterInactivityFor time.Duration, maybe bool) { p := de.probeUDPLifetime if p == nil { return afterInactivityFor, false } if !de.bestAddr.IsValid() { return afterInactivityFor, false } epDisco := de.disco.Load() if epDisco == nil { // peer does not support disco return afterInactivityFor, false } // We compare disco keys, which may have a shorter lifetime than node keys // since disco keys reset on startup. This has the desired side effect of // shuffling probing probability where the local node ends up with a large // key value lexicographically relative to the other nodes it tends to // communicate with. If de's disco key changes, the cycle will reset. if de.c.discoPublic.Compare(epDisco.key) >= 0 { // lower disco pub key node probes higher return afterInactivityFor, false } if !p.cycleActive && time.Since(p.cycleStartedAt) < p.config.CycleCanStartEvery { // This is conservative as it doesn't account for afterInactivityFor use // by the caller, potentially delaying the start of the next cycle. We // assume the cycle could start immediately following // maybeProbeUDPLifetimeLocked(), regardless of the value of // afterInactivityFor relative to latest packets in/out time. return afterInactivityFor, false } afterInactivityFor = p.currentCliffDurationEndpointLocked() - udpLifetimeProbeCliffSlack if afterInactivityFor < 0 { // shouldn't happen return afterInactivityFor, false } return afterInactivityFor, true } // heartbeatForLifetimeVia represents the scheduling source of // endpoint.heartbeatForLifetime(). type heartbeatForLifetimeVia string const ( heartbeatForLifetimeViaSessionInactive heartbeatForLifetimeVia = "session-inactive" heartbeatForLifetimeViaPongRx heartbeatForLifetimeVia = "pong-rx" heartbeatForLifetimeViaSelf heartbeatForLifetimeVia = "self" ) // scheduleHeartbeatForLifetimeLocked schedules de.heartbeatForLifetime to fire // in the future (after). The caller must describe themselves in the via arg. func (de *endpoint) scheduleHeartbeatForLifetimeLocked(after time.Duration, via heartbeatForLifetimeVia) { p := de.probeUDPLifetime if p == nil { return } de.c.dlogf("[v1] magicsock: disco: scheduling UDP lifetime probe for cliff=%v via=%v to %v (%v)", p.currentCliffDurationEndpointLocked(), via, de.publicKey.ShortString(), de.discoShort()) p.bestAddr = de.bestAddr.AddrPort p.timer = time.AfterFunc(after, de.heartbeatForLifetime) if via == heartbeatForLifetimeViaSelf { metricUDPLifetimeCliffsRescheduled.Add(1) } else { metricUDPLifetimeCliffsScheduled.Add(1) } } // heartbeatForLifetime sends a disco ping recorded locally with a purpose of // pingHeartbeatForUDPLifetime to de if de.bestAddr has remained stable, and it // has been inactive for a duration that is within the error bounds for current // lifetime probing cliff. Alternatively it may reschedule itself into the // future, which is one of three scheduling sources. The other scheduling // sources are de.heartbeat() and de.probeUDPLifetimeCliffDoneLocked(). func (de *endpoint) heartbeatForLifetime() { de.mu.Lock() defer de.mu.Unlock() p := de.probeUDPLifetime if p == nil || p.timer == nil { // We raced with a code path trying to p.timer.Stop() us. Give up early // in the interest of simplicity. If p.timer.Stop() happened in // de.heartbeat() presumably because of recent packets in/out we *could* // still probe here, and it would be meaningful, but the time logic // below would reschedule as-is. return } p.timer = nil if !p.bestAddr.IsValid() || de.bestAddr.AddrPort != p.bestAddr { // best path changed p.resetCycleEndpointLocked() return } afterInactivityFor, ok := de.maybeProbeUDPLifetimeLocked() if !ok { p.resetCycleEndpointLocked() return } inactiveFor := mono.Now().Sub(max(de.lastRecvUDPAny.LoadAtomic(), de.lastSendAny)) delta := afterInactivityFor - inactiveFor if delta.Abs() > udpLifetimeProbeSchedulingTolerance { if delta < 0 { // We missed our opportunity. We can resume this cliff at the tail // end of another session. metricUDPLifetimeCliffsMissed.Add(1) return } else { // We need to wait longer before sending a ping. This can happen for // a number of reasons, which are described in more detail in // de.heartbeat(). de.scheduleHeartbeatForLifetimeLocked(delta, heartbeatForLifetimeViaSelf) return } } if p.currentCliff == 0 { p.cycleStartedAt = time.Now() p.cycleActive = true } de.c.dlogf("[v1] magicsock: disco: sending disco ping for UDP lifetime probe cliff=%v to %v (%v)", p.currentCliffDurationEndpointLocked(), de.publicKey.ShortString(), de.discoShort()) de.startDiscoPingLocked(de.bestAddr.AddrPort, mono.Now(), pingHeartbeatForUDPLifetime, 0, nil) } // heartbeat is called every heartbeatInterval to keep the best UDP path alive, // kick off discovery of other paths, or schedule the probing of UDP path // lifetime on the tail end of an active session. func (de *endpoint) heartbeat() { de.mu.Lock() defer de.mu.Unlock() if de.probeUDPLifetime != nil && de.probeUDPLifetime.timer != nil { de.probeUDPLifetime.timer.Stop() de.probeUDPLifetime.timer = nil } de.heartBeatTimer = nil if de.heartbeatDisabled { // If control override to disable heartBeatTimer set, return early. return } if de.lastSendExt.IsZero() { // Shouldn't happen. return } now := mono.Now() if now.Sub(de.lastSendExt) > sessionActiveTimeout { // Session's idle. Stop heartbeating. de.c.dlogf("[v1] magicsock: disco: ending heartbeats for idle session to %v (%v)", de.publicKey.ShortString(), de.discoShort()) if afterInactivityFor, ok := de.maybeProbeUDPLifetimeLocked(); ok { // This is the best place to best effort schedule a probe of UDP // path lifetime in the future as it loosely translates to "UDP path // is inactive". // // Note: wireguard-go schedules a WireGuard keepalive packet (by // default, not tied to persistent keepalive feature) 10 seconds in // the future after receiving an authenticated data packet. It's // typically only sent by one side based on how the WireGuard state // machine controls the timer. So, if we are on the receiving end of // that keepalive, de.lastSendExt won't move, assuming there is no // other user-generated traffic. This is one reason why we perform // a more granular check of the last packets in/out time, below, as // a WireGuard keepalive may have fallen somewhere within the // sessionActiveTimeout window. heartbeatForLifetime will also // perform a similar check, and reschedule as necessary. inactiveFor := now.Sub(max(de.lastSendAny, de.lastRecvUDPAny.LoadAtomic())) after := afterInactivityFor - inactiveFor if after < 0 { // shouldn't happen return } de.scheduleHeartbeatForLifetimeLocked(after, heartbeatForLifetimeViaSessionInactive) } return } udpAddr, _, _ := de.addrForSendLocked(now) if udpAddr.IsValid() { // We have a preferred path. Ping that every 2 seconds. de.startDiscoPingLocked(udpAddr, now, pingHeartbeat, 0, nil) } if de.wantFullPingLocked(now) { de.sendDiscoPingsLocked(now, true) } de.heartBeatTimer = time.AfterFunc(heartbeatInterval, de.heartbeat) } // setHeartbeatDisabled sets heartbeatDisabled to the provided value. func (de *endpoint) setHeartbeatDisabled(v bool) { de.mu.Lock() defer de.mu.Unlock() de.heartbeatDisabled = v } // wantFullPingLocked reports whether we should ping to all our peers looking for // a better path. // // de.mu must be held. func (de *endpoint) wantFullPingLocked(now mono.Time) bool { if runtime.GOOS == "js" { return false } if !de.bestAddr.IsValid() || de.lastFullPing.IsZero() { return true } if now.After(de.trustBestAddrUntil) { return true } if de.bestAddr.latency <= goodEnoughLatency { return false } if now.Sub(de.lastFullPing) >= upgradeInterval { return true } return false } func (de *endpoint) noteTxActivityExtTriggerLocked(now mono.Time) { de.lastSendExt = now if de.heartBeatTimer == nil && !de.heartbeatDisabled { de.heartBeatTimer = time.AfterFunc(heartbeatInterval, de.heartbeat) } } // MaxDiscoPingSize is the largest useful ping message size that we // can send - the maximum packet size minus the IPv4 and UDP headers. var MaxDiscoPingSize = tstun.MaxPacketSize - 20 - 8 type pingResultAndCallback struct { taken atomic.Bool // first CompareAndSwamp from false to true takes ownership of res res *ipnstate.PingResult cb func(*ipnstate.PingResult) } func (p *pingResultAndCallback) reply() bool { return p != nil && p.taken.CompareAndSwap(false, true) } // discoPing starts a disco-level ping for the "tailscale ping" command (or other // callers, such as c2n). res is value to call cb with, already partially // filled. cb must be called at most once. Once called, ownership of res passes to cb. func (de *endpoint) discoPing(res *ipnstate.PingResult, size int, cb func(*ipnstate.PingResult)) { de.mu.Lock() defer de.mu.Unlock() if de.expired { res.Err = errExpired.Error() cb(res) return } if size > MaxDiscoPingSize { res.Err = errPingTooBig.Error() cb(res) return } resCB := &pingResultAndCallback{res: res, cb: cb} now := mono.Now() udpAddr, derpAddr := de.addrForPingSizeLocked(now, size) if derpAddr.IsValid() { de.startDiscoPingLocked(derpAddr, now, pingCLI, size, resCB) } if udpAddr.IsValid() && now.Before(de.trustBestAddrUntil) { // Already have an active session, so just ping the address we're using. // Otherwise "tailscale ping" results to a node on the local network // can look like they're bouncing between, say 10.0.0.0/9 and the peer's // IPv6 address, both 1ms away, and it's random who replies first. de.startDiscoPingLocked(udpAddr, now, pingCLI, size, resCB) } else { for ep := range de.endpointState { de.startDiscoPingLocked(ep, now, pingCLI, size, resCB) } } } var ( errExpired = errors.New("peer's node key has expired") errNoUDPOrDERP = errors.New("no UDP or DERP addr") errPingTooBig = errors.New("ping size too big") ) func (de *endpoint) send(buffs [][]byte) error { de.mu.Lock() if de.expired { de.mu.Unlock() return errExpired } now := mono.Now() udpAddr, derpAddr, startWGPing := de.addrForSendLocked(now) if de.isWireguardOnly { if startWGPing { de.sendWireGuardOnlyPingsLocked(now) } } else if !udpAddr.IsValid() || now.After(de.trustBestAddrUntil) { de.sendDiscoPingsLocked(now, true) } de.noteTxActivityExtTriggerLocked(now) de.lastSendAny = now de.mu.Unlock() if !udpAddr.IsValid() && !derpAddr.IsValid() { return errNoUDPOrDERP } var err error if udpAddr.IsValid() { _, err = de.c.sendUDPBatch(udpAddr, buffs) // If the error is known to indicate that the endpoint is no longer // usable, clear the endpoint statistics so that the next send will // re-evaluate the best endpoint. if err != nil && isBadEndpointErr(err) { de.noteBadEndpoint(udpAddr) } // TODO(raggi): needs updating for accuracy, as in error conditions we may have partial sends. if stats := de.c.stats.Load(); err == nil && stats != nil { var txBytes int for _, b := range buffs { txBytes += len(b) } stats.UpdateTxPhysical(de.nodeAddr, udpAddr, txBytes) } } if derpAddr.IsValid() { allOk := true for _, buff := range buffs { ok, _ := de.c.sendAddr(derpAddr, de.publicKey, buff) if stats := de.c.stats.Load(); stats != nil { stats.UpdateTxPhysical(de.nodeAddr, derpAddr, len(buff)) } if !ok { allOk = false } } if allOk { return nil } } return err } // probeUDPLifetimeCliffDoneLocked is called when a disco // pingHeartbeatForUDPLifetime is being cleaned up. result contains the reason // for the cleanup, txid contains the ping's txid. // probeUDPLifetimeCliffDoneLocked may schedule another // pingHeartbeatForUDPLifetime in the future if there is another cliff remaining // for the current probing cycle. func (de *endpoint) probeUDPLifetimeCliffDoneLocked(result discoPingResult, txid stun.TxID) { p := de.probeUDPLifetime if p == nil || !p.cycleActive || de.probeUDPLifetime.timer != nil || txid != p.lastTxID { // Probing may have been disabled while heartbeats were in flight. This // can also be a duplicate or late arriving result. return } metricUDPLifetimeCliffsCompleted.Add(1) if result != discoPongReceived || p.currentCliff >= len(p.config.Cliffs)-1 { maxCliffIndex := p.currentCliff if result != discoPongReceived { maxCliffIndex = p.currentCliff - 1 } var maxCliffDuration time.Duration if maxCliffIndex >= 0 { maxCliffDuration = p.config.Cliffs[maxCliffIndex] } p.cycleCompleteMaxCliffEndpointLocked(maxCliffIndex) de.c.dlogf("[v1] magicsock: disco: UDP lifetime probe cycle completed max cliff=%v for %v (%v)", maxCliffDuration, de.publicKey.ShortString(), de.discoShort()) metricUDPLifetimeCyclesCompleted.Add(1) p.resetCycleEndpointLocked() } else { p.currentCliff++ if after, ok := de.maybeProbeUDPLifetimeLocked(); ok { de.scheduleHeartbeatForLifetimeLocked(after, heartbeatForLifetimeViaPongRx) } } } func (de *endpoint) discoPingTimeout(txid stun.TxID) { de.mu.Lock() defer de.mu.Unlock() sp, ok := de.sentPing[txid] if !ok { return } if debugDisco() || !de.bestAddr.IsValid() || mono.Now().After(de.trustBestAddrUntil) { de.c.dlogf("[v1] magicsock: disco: timeout waiting for pong %x from %v (%v, %v)", txid[:6], sp.to, de.publicKey.ShortString(), de.discoShort()) } de.removeSentDiscoPingLocked(txid, sp, discoPingTimedOut) } // forgetDiscoPing is called when a ping fails to send. func (de *endpoint) forgetDiscoPing(txid stun.TxID) { de.mu.Lock() defer de.mu.Unlock() if sp, ok := de.sentPing[txid]; ok { de.removeSentDiscoPingLocked(txid, sp, discoPingFailed) } } // discoPingResult represents the result of an attempted disco ping send // operation. type discoPingResult int const ( discoPingResultUnknown discoPingResult = iota discoPingFailed discoPingTimedOut discoPongReceived ) func (de *endpoint) removeSentDiscoPingLocked(txid stun.TxID, sp sentPing, result discoPingResult) { // Stop the timer for the case where sendPing failed to write to UDP. // In the case of a timer already having fired, this is a no-op: sp.timer.Stop() if sp.purpose == pingHeartbeatForUDPLifetime { de.probeUDPLifetimeCliffDoneLocked(result, txid) } delete(de.sentPing, txid) } // discoPingSize is the size of a complete disco ping packet, without any padding. const discoPingSize = len(disco.Magic) + key.DiscoPublicRawLen + disco.NonceLen + poly1305.TagSize + disco.MessageHeaderLen + disco.PingLen // sendDiscoPing sends a ping with the provided txid to ep using de's discoKey. size // is the desired disco message size, including all disco headers but excluding IP/UDP // headers. // // The caller (startDiscoPingLocked) should've already recorded the ping in // sentPing and set up the timer. // // The caller should use de.discoKey as the discoKey argument. // It is passed in so that sendDiscoPing doesn't need to lock de.mu. func (de *endpoint) sendDiscoPing(ep netip.AddrPort, discoKey key.DiscoPublic, txid stun.TxID, size int, logLevel discoLogLevel) { size = min(size, MaxDiscoPingSize) padding := max(size-discoPingSize, 0) sent, _ := de.c.sendDiscoMessage(ep, de.publicKey, discoKey, &disco.Ping{ TxID: [12]byte(txid), NodeKey: de.c.publicKeyAtomic.Load(), Padding: padding, }, logLevel) if !sent { de.forgetDiscoPing(txid) return } if size != 0 { metricSentDiscoPeerMTUProbes.Add(1) metricSentDiscoPeerMTUProbeBytes.Add(int64(pingSizeToPktLen(size, ep.Addr().Is6()))) } } // discoPingPurpose is the reason why a discovery ping message was sent. type discoPingPurpose int //go:generate go run tailscale.com/cmd/addlicense -file discopingpurpose_string.go go run golang.org/x/tools/cmd/stringer -type=discoPingPurpose -trimprefix=ping const ( // pingDiscovery means that purpose of a ping was to see if a // path was valid. pingDiscovery discoPingPurpose = iota // pingHeartbeat means that purpose of a ping was whether a // peer was still there. pingHeartbeat // pingCLI means that the user is running "tailscale ping" // from the CLI. These types of pings can go over DERP. pingCLI // pingHeartbeatForUDPLifetime means that the purpose of a ping was to // discover whether the UDP path was still active through any and all // stateful middleboxes involved. pingHeartbeatForUDPLifetime ) // startDiscoPingLocked sends a disco ping to ep in a separate goroutine. resCB, // if non-nil, means that a caller external to the magicsock package internals // is interested in the result (such as a CLI "tailscale ping" or a c2n ping // request, etc) func (de *endpoint) startDiscoPingLocked(ep netip.AddrPort, now mono.Time, purpose discoPingPurpose, size int, resCB *pingResultAndCallback) { if runtime.GOOS == "js" { return } epDisco := de.disco.Load() if epDisco == nil { return } if purpose != pingCLI { st, ok := de.endpointState[ep] if !ok { // Shouldn't happen. But don't ping an endpoint that's // not active for us. de.c.logf("magicsock: disco: [unexpected] attempt to ping no longer live endpoint %v", ep) return } st.lastPing = now } // If we are doing a discovery ping or a CLI ping with no specified size // to a non DERP address, then probe the MTU. Otherwise just send the // one specified ping. // Default to sending a single ping of the specified size sizes := []int{size} if de.c.PeerMTUEnabled() { isDerp := ep.Addr() == tailcfg.DerpMagicIPAddr if !isDerp && ((purpose == pingDiscovery) || (purpose == pingCLI && size == 0)) { de.c.dlogf("[v1] magicsock: starting MTU probe") sizes = mtuProbePingSizesV4 if ep.Addr().Is6() { sizes = mtuProbePingSizesV6 } } } logLevel := discoLog if purpose == pingHeartbeat { logLevel = discoVerboseLog } if purpose == pingCLI { de.noteTxActivityExtTriggerLocked(now) } de.lastSendAny = now for _, s := range sizes { txid := stun.NewTxID() de.sentPing[txid] = sentPing{ to: ep, at: now, timer: time.AfterFunc(pingTimeoutDuration, func() { de.discoPingTimeout(txid) }), purpose: purpose, resCB: resCB, size: s, } if purpose == pingHeartbeatForUDPLifetime && de.probeUDPLifetime != nil { de.probeUDPLifetime.lastTxID = txid } go de.sendDiscoPing(ep, epDisco.key, txid, s, logLevel) } } // sendDiscoPingsLocked starts pinging all of ep's endpoints. func (de *endpoint) sendDiscoPingsLocked(now mono.Time, sendCallMeMaybe bool) { de.lastFullPing = now var sentAny bool for ep, st := range de.endpointState { if st.shouldDeleteLocked() { de.deleteEndpointLocked("sendPingsLocked", ep) continue } if runtime.GOOS == "js" { continue } if !st.lastPing.IsZero() && now.Sub(st.lastPing) < discoPingInterval { continue } firstPing := !sentAny sentAny = true if firstPing && sendCallMeMaybe { de.c.dlogf("[v1] magicsock: disco: send, starting discovery for %v (%v)", de.publicKey.ShortString(), de.discoShort()) } de.startDiscoPingLocked(ep, now, pingDiscovery, 0, nil) } derpAddr := de.derpAddr if sentAny && sendCallMeMaybe && derpAddr.IsValid() { // Have our magicsock.Conn figure out its STUN endpoint (if // it doesn't know already) and then send a CallMeMaybe // message to our peer via DERP informing them that we've // sent so our firewall ports are probably open and now // would be a good time for them to connect. go de.c.enqueueCallMeMaybe(derpAddr, de) } } // sendWireGuardOnlyPingsLocked evaluates all available addresses for // a WireGuard only endpoint and initates an ICMP ping for useable // addresses. func (de *endpoint) sendWireGuardOnlyPingsLocked(now mono.Time) { if runtime.GOOS == "js" { return } // Normally we only send pings at a low rate as the decision to start // sending a ping sets bestAddrAtUntil with a reasonable time to keep trying // that address, however, if that code changed we may want to be sure that // we don't ever send excessive pings to avoid impact to the client/user. if !now.After(de.lastFullPing.Add(10 * time.Second)) { return } de.lastFullPing = now for ipp := range de.endpointState { if ipp.Addr().Is4() && de.c.noV4.Load() { continue } if ipp.Addr().Is6() && de.c.noV6.Load() { continue } go de.sendWireGuardOnlyPing(ipp, now) } } // sendWireGuardOnlyPing sends a ICMP ping to a WireGuard only address to // discover the latency. func (de *endpoint) sendWireGuardOnlyPing(ipp netip.AddrPort, now mono.Time) { ctx, cancel := context.WithTimeout(de.c.connCtx, 5*time.Second) defer cancel() de.setLastPing(ipp, now) addr := &net.IPAddr{ IP: net.IP(ipp.Addr().AsSlice()), Zone: ipp.Addr().Zone(), } p := de.c.getPinger() if p == nil { de.c.logf("[v2] magicsock: sendWireGuardOnlyPingLocked: pinger is nil") return } latency, err := p.Send(ctx, addr, nil) if err != nil { de.c.logf("[v2] magicsock: sendWireGuardOnlyPingLocked: %s", err) return } de.mu.Lock() defer de.mu.Unlock() state, ok := de.endpointState[ipp] if !ok { return } state.addPongReplyLocked(pongReply{ latency: latency, pongAt: now, from: ipp, pongSrc: netip.AddrPort{}, // We don't know this. }) } // setLastPing sets lastPing on the endpointState to now. func (de *endpoint) setLastPing(ipp netip.AddrPort, now mono.Time) { de.mu.Lock() defer de.mu.Unlock() state, ok := de.endpointState[ipp] if !ok { return } state.lastPing = now } // updateFromNode updates the endpoint based on a tailcfg.Node from a NetMap // update. func (de *endpoint) updateFromNode(n tailcfg.NodeView, heartbeatDisabled bool, probeUDPLifetimeEnabled bool) { if !n.Valid() { panic("nil node when updating endpoint") } de.mu.Lock() defer de.mu.Unlock() de.heartbeatDisabled = heartbeatDisabled if probeUDPLifetimeEnabled { de.setProbeUDPLifetimeConfigLocked(defaultProbeUDPLifetimeConfig) } else { de.setProbeUDPLifetimeConfigLocked(nil) } de.expired = n.Expired() epDisco := de.disco.Load() var discoKey key.DiscoPublic if epDisco != nil { discoKey = epDisco.key } if discoKey != n.DiscoKey() { de.c.logf("[v1] magicsock: disco: node %s changed from %s to %s", de.publicKey.ShortString(), discoKey, n.DiscoKey()) de.disco.Store(&endpointDisco{ key: n.DiscoKey(), short: n.DiscoKey().ShortString(), }) de.debugUpdates.Add(EndpointChange{ When: time.Now(), What: "updateFromNode-resetLocked", }) de.resetLocked() } if n.DERP() == "" { if de.derpAddr.IsValid() { de.debugUpdates.Add(EndpointChange{ When: time.Now(), What: "updateFromNode-remove-DERP", From: de.derpAddr, }) } de.derpAddr = netip.AddrPort{} } else { newDerp, _ := netip.ParseAddrPort(n.DERP()) if de.derpAddr != newDerp { de.debugUpdates.Add(EndpointChange{ When: time.Now(), What: "updateFromNode-DERP", From: de.derpAddr, To: newDerp, }) } de.derpAddr = newDerp } de.setEndpointsLocked(n.Endpoints()) } func (de *endpoint) setEndpointsLocked(eps interface { Len() int At(i int) netip.AddrPort }) { for _, st := range de.endpointState { st.index = indexSentinelDeleted // assume deleted until updated in next loop } var newIpps []netip.AddrPort for i := range eps.Len() { if i > math.MaxInt16 { // Seems unlikely. break } ipp := eps.At(i) if !ipp.IsValid() { de.c.logf("magicsock: bogus netmap endpoint from %v", eps) continue } if st, ok := de.endpointState[ipp]; ok { st.index = int16(i) } else { de.endpointState[ipp] = &endpointState{index: int16(i)} newIpps = append(newIpps, ipp) } } if len(newIpps) > 0 { de.debugUpdates.Add(EndpointChange{ When: time.Now(), What: "updateFromNode-new-Endpoints", To: newIpps, }) } // Now delete anything unless it's still in the network map or // was a recently discovered endpoint. for ep, st := range de.endpointState { if st.shouldDeleteLocked() { de.deleteEndpointLocked("updateFromNode", ep) } } } // addCandidateEndpoint adds ep as an endpoint to which we should send // future pings. If there is an existing endpointState for ep, and forRxPingTxID // matches the last received ping TxID, this function reports true, otherwise // false. // // This is called once we've already verified that we got a valid // discovery message from de via ep. func (de *endpoint) addCandidateEndpoint(ep netip.AddrPort, forRxPingTxID stun.TxID) (duplicatePing bool) { de.mu.Lock() defer de.mu.Unlock() if st, ok := de.endpointState[ep]; ok { duplicatePing = forRxPingTxID == st.lastGotPingTxID if !duplicatePing { st.lastGotPingTxID = forRxPingTxID } if st.lastGotPing.IsZero() { // Already-known endpoint from the network map. return duplicatePing } st.lastGotPing = time.Now() return duplicatePing } // Newly discovered endpoint. Exciting! de.c.dlogf("[v1] magicsock: disco: adding %v as candidate endpoint for %v (%s)", ep, de.discoShort(), de.publicKey.ShortString()) de.endpointState[ep] = &endpointState{ lastGotPing: time.Now(), lastGotPingTxID: forRxPingTxID, } // If for some reason this gets very large, do some cleanup. if size := len(de.endpointState); size > 100 { for ep, st := range de.endpointState { if st.shouldDeleteLocked() { de.deleteEndpointLocked("addCandidateEndpoint", ep) } } size2 := len(de.endpointState) de.c.dlogf("[v1] magicsock: disco: addCandidateEndpoint pruned %v candidate set from %v to %v entries", size, size2) } return false } // clearBestAddrLocked clears the bestAddr and related fields such that future // packets will re-evaluate the best address to send to next. // // de.mu must be held. func (de *endpoint) clearBestAddrLocked() { de.setBestAddrLocked(addrQuality{}) de.bestAddrAt = 0 de.trustBestAddrUntil = 0 } // noteBadEndpoint marks ipp as a bad endpoint that would need to be // re-evaluated before future use, this should be called for example if a send // to ipp fails due to a host unreachable error or similar. func (de *endpoint) noteBadEndpoint(ipp netip.AddrPort) { de.mu.Lock() defer de.mu.Unlock() de.clearBestAddrLocked() if st, ok := de.endpointState[ipp]; ok { st.clear() } } // noteConnectivityChange is called when connectivity changes enough // that we should question our earlier assumptions about which paths // work. func (de *endpoint) noteConnectivityChange() { de.mu.Lock() defer de.mu.Unlock() de.clearBestAddrLocked() for k := range de.endpointState { de.endpointState[k].clear() } } // pingSizeToPktLen calculates the minimum path MTU that would permit // a disco ping message of length size to reach its target at // addr. size is the length of the entire disco message including // disco headers. If size is zero, assume it is the safe wire MTU. func pingSizeToPktLen(size int, is6 bool) tstun.WireMTU { if size == 0 { return tstun.SafeWireMTU() } headerLen := ipv4.HeaderLen if is6 { headerLen = ipv6.HeaderLen } headerLen += 8 // UDP header length return tstun.WireMTU(size + headerLen) } // pktLenToPingSize calculates the ping payload size that would // create a disco ping message whose on-the-wire length is exactly mtu // bytes long. If mtu is zero or less than the minimum ping size, then // no MTU probe is desired and return zero for an unpadded ping. func pktLenToPingSize(mtu tstun.WireMTU, is6 bool) int { if mtu == 0 { return 0 } headerLen := ipv4.HeaderLen if is6 { headerLen = ipv6.HeaderLen } headerLen += 8 // UDP header length if mtu < tstun.WireMTU(headerLen) { return 0 } return int(mtu) - headerLen } // handlePongConnLocked handles a Pong message (a reply to an earlier ping). // It should be called with the Conn.mu held. // // It reports whether m.TxID corresponds to a ping that this endpoint sent. func (de *endpoint) handlePongConnLocked(m *disco.Pong, di *discoInfo, src netip.AddrPort) (knownTxID bool) { de.mu.Lock() defer de.mu.Unlock() isDerp := src.Addr() == tailcfg.DerpMagicIPAddr sp, ok := de.sentPing[m.TxID] if !ok { // This is not a pong for a ping we sent. return false } knownTxID = true // for naked returns below de.removeSentDiscoPingLocked(m.TxID, sp, discoPongReceived) pktLen := int(pingSizeToPktLen(sp.size, sp.to.Addr().Is6())) if sp.size != 0 { m := getPeerMTUsProbedMetric(tstun.WireMTU(pktLen)) m.Add(1) if metricMaxPeerMTUProbed.Value() < int64(pktLen) { metricMaxPeerMTUProbed.Set(int64(pktLen)) } } now := mono.Now() latency := now.Sub(sp.at) if !isDerp { st, ok := de.endpointState[sp.to] if !ok { // This is no longer an endpoint we care about. return } de.c.peerMap.setNodeKeyForIPPort(src, de.publicKey) st.addPongReplyLocked(pongReply{ latency: latency, pongAt: now, from: src, pongSrc: m.Src, }) } if sp.purpose != pingHeartbeat && sp.purpose != pingHeartbeatForUDPLifetime { de.c.dlogf("[v1] magicsock: disco: %v<-%v (%v, %v) got pong tx=%x latency=%v pktlen=%v pong.src=%v%v", de.c.discoShort, de.discoShort(), de.publicKey.ShortString(), src, m.TxID[:6], latency.Round(time.Millisecond), pktLen, m.Src, logger.ArgWriter(func(bw *bufio.Writer) { if sp.to != src { fmt.Fprintf(bw, " ping.to=%v", sp.to) } })) } // Currently only CLI ping uses this callback. if sp.resCB.reply() { if sp.purpose == pingCLI { de.c.populateCLIPingResponseLocked(sp.resCB.res, latency, sp.to) } go sp.resCB.cb(sp.resCB.res) } // Promote this pong response to our current best address if it's lower latency. // TODO(bradfitz): decide how latency vs. preference order affects decision if !isDerp { thisPong := addrQuality{sp.to, latency, tstun.WireMTU(pingSizeToPktLen(sp.size, sp.to.Addr().Is6()))} if betterAddr(thisPong, de.bestAddr) { de.c.logf("magicsock: disco: node %v %v now using %v mtu=%v tx=%x", de.publicKey.ShortString(), de.discoShort(), sp.to, thisPong.wireMTU, m.TxID[:6]) de.debugUpdates.Add(EndpointChange{ When: time.Now(), What: "handlePingLocked-bestAddr-update", From: de.bestAddr, To: thisPong, }) de.setBestAddrLocked(thisPong) } if de.bestAddr.AddrPort == thisPong.AddrPort { de.debugUpdates.Add(EndpointChange{ When: time.Now(), What: "handlePingLocked-bestAddr-latency", From: de.bestAddr, To: thisPong, }) de.bestAddr.latency = latency de.bestAddrAt = now de.trustBestAddrUntil = now.Add(trustUDPAddrDuration) } } return } // addrQuality is an IPPort with an associated latency and path mtu. type addrQuality struct { netip.AddrPort latency time.Duration wireMTU tstun.WireMTU } func (a addrQuality) String() string { return fmt.Sprintf("%v@%v+%v", a.AddrPort, a.latency, a.wireMTU) } // betterAddr reports whether a is a better addr to use than b. func betterAddr(a, b addrQuality) bool { if a.AddrPort == b.AddrPort { if a.wireMTU > b.wireMTU { // TODO(val): Think harder about the case of lower // latency and smaller or unknown MTU, and higher // latency but larger MTU. Probably in most cases the // largest MTU will also be the lowest latency but we // can't depend on that. return true } return false } if !b.IsValid() { return true } if !a.IsValid() { return false } // Each address starts with a set of points (from 0 to 100) that // represents how much faster they are than the highest-latency // endpoint. For example, if a has latency 200ms and b has latency // 190ms, then a starts with 0 points and b starts with 5 points since // it's 5% faster. var aPoints, bPoints int if a.latency > b.latency && a.latency > 0 { bPoints = int(100 - ((b.latency * 100) / a.latency)) } else if b.latency > 0 { aPoints = int(100 - ((a.latency * 100) / b.latency)) } // Prefer private IPs over public IPs as long as the latencies are // roughly equivalent, since it's less likely that a user will have to // pay for the bandwidth in a cloud environment. // // Additionally, prefer any loopback address strongly over non-loopback // addresses, and prefer link-local unicast addresses over other types // of private IP addresses since it's definitionally more likely that // they'll be on the same network segment than a general private IP. if a.Addr().IsLoopback() { aPoints += 50 } else if a.Addr().IsLinkLocalUnicast() { aPoints += 30 } else if a.Addr().IsPrivate() { aPoints += 20 } if b.Addr().IsLoopback() { bPoints += 50 } else if b.Addr().IsLinkLocalUnicast() { bPoints += 30 } else if b.Addr().IsPrivate() { bPoints += 20 } // Prefer IPv6 for being a bit more robust, as long as // the latencies are roughly equivalent. if a.Addr().Is6() { aPoints += 10 } if b.Addr().Is6() { bPoints += 10 } // Don't change anything if the latency improvement is less than 1%; we // want a bit of "stickiness" (a.k.a. hysteresis) to avoid flapping if // there's two roughly-equivalent endpoints. // // Points are essentially the percentage improvement of latency vs. the // slower endpoint; absent any boosts from private IPs, IPv6, etc., a // will be a better address than b by a fraction of 1% or less if // aPoints <= 1 and bPoints == 0. if aPoints <= 1 && bPoints == 0 { return false } return aPoints > bPoints } // handleCallMeMaybe handles a CallMeMaybe discovery message via // DERP. The contract for use of this message is that the peer has // already sent to us via UDP, so their stateful firewall should be // open. Now we can Ping back and make it through. func (de *endpoint) handleCallMeMaybe(m *disco.CallMeMaybe) { if runtime.GOOS == "js" { // Nothing to do on js/wasm if we can't send UDP packets anyway. return } de.mu.Lock() defer de.mu.Unlock() now := time.Now() for ep := range de.isCallMeMaybeEP { de.isCallMeMaybeEP[ep] = false // mark for deletion } var newEPs []netip.AddrPort for _, ep := range m.MyNumber { if ep.Addr().Is6() && ep.Addr().IsLinkLocalUnicast() { // We send these out, but ignore them for now. // TODO: teach the ping code to ping on all interfaces // for these. continue } mak.Set(&de.isCallMeMaybeEP, ep, true) if es, ok := de.endpointState[ep]; ok { es.callMeMaybeTime = now } else { de.endpointState[ep] = &endpointState{callMeMaybeTime: now} newEPs = append(newEPs, ep) } } if len(newEPs) > 0 { de.debugUpdates.Add(EndpointChange{ When: time.Now(), What: "handleCallMeMaybe-new-endpoints", To: newEPs, }) de.c.dlogf("[v1] magicsock: disco: call-me-maybe from %v %v added new endpoints: %v", de.publicKey.ShortString(), de.discoShort(), logger.ArgWriter(func(w *bufio.Writer) { for i, ep := range newEPs { if i > 0 { w.WriteString(", ") } w.WriteString(ep.String()) } })) } // Delete any prior CallMeMaybe endpoints that weren't included // in this message. for ep, want := range de.isCallMeMaybeEP { if !want { delete(de.isCallMeMaybeEP, ep) de.deleteEndpointLocked("handleCallMeMaybe", ep) } } // Zero out all the lastPing times to force sendPingsLocked to send new ones, // even if it's been less than 5 seconds ago. for _, st := range de.endpointState { st.lastPing = 0 } de.sendDiscoPingsLocked(mono.Now(), false) } func (de *endpoint) populatePeerStatus(ps *ipnstate.PeerStatus) { de.mu.Lock() defer de.mu.Unlock() ps.Relay = de.c.derpRegionCodeOfIDLocked(int(de.derpAddr.Port())) if de.lastSendExt.IsZero() { return } now := mono.Now() ps.LastWrite = de.lastSendExt.WallTime() ps.Active = now.Sub(de.lastSendExt) < sessionActiveTimeout if udpAddr, derpAddr, _ := de.addrForSendLocked(now); udpAddr.IsValid() && !derpAddr.IsValid() { ps.CurAddr = udpAddr.String() } } // stopAndReset stops timers associated with de and resets its state back to zero. // It's called when a discovery endpoint is no longer present in the // NetworkMap, or when magicsock is transitioning from running to // stopped state (via SetPrivateKey(zero)) func (de *endpoint) stopAndReset() { atomic.AddInt64(&de.numStopAndResetAtomic, 1) de.mu.Lock() defer de.mu.Unlock() if closing := de.c.closing.Load(); !closing { if de.isWireguardOnly { de.c.logf("[v1] magicsock: doing cleanup for wireguard key %s", de.publicKey.ShortString()) } else { de.c.logf("[v1] magicsock: doing cleanup for discovery key %s", de.discoShort()) } } de.debugUpdates.Add(EndpointChange{ When: time.Now(), What: "stopAndReset-resetLocked", }) de.resetLocked() if de.heartBeatTimer != nil { de.heartBeatTimer.Stop() de.heartBeatTimer = nil } } // resetLocked clears all the endpoint's p2p state, reverting it to a // DERP-only endpoint. It does not stop the endpoint's heartbeat // timer, if one is running. func (de *endpoint) resetLocked() { de.lastSendExt = 0 de.lastFullPing = 0 de.clearBestAddrLocked() for _, es := range de.endpointState { es.lastPing = 0 } if !de.isWireguardOnly { for txid, sp := range de.sentPing { de.removeSentDiscoPingLocked(txid, sp, discoPingResultUnknown) } } de.probeUDPLifetime.resetCycleEndpointLocked() } func (de *endpoint) numStopAndReset() int64 { return atomic.LoadInt64(&de.numStopAndResetAtomic) } func (de *endpoint) setDERPHome(regionID uint16) { de.mu.Lock() defer de.mu.Unlock() de.derpAddr = netip.AddrPortFrom(tailcfg.DerpMagicIPAddr, uint16(regionID)) }