wgengine/magicsock: clear out endpoint statistics when it becomes bad

There are cases where we do not detect the non-viability of a route, but we will instead observe a failure to send. In a Disco path this would normally be handled as a side effect of Disco, which is not available to non-Disco WireGuard nodes. In both cases, recognizing the failure as such will result in faster convergence. Updates #8999 Signed-off-by: James Tucker <james@tailscale.com>
1 year ago · 5edb39d032
parent 7c9c68feed
commit 5edb39d032
3 changed files with 71 additions and 5 deletions
--- a/wgengine/magicsock/endpoint.go
+++ b/wgengine/magicsock/endpoint.go
@ -469,6 +469,14 @@ func (de *endpoint) send(buffs [][]byte) error {
 	var err error
 	if udpAddr.IsValid() {
 		_, err = de.c.sendUDPBatch(udpAddr, buffs)
+
+		// If the error is known to indicate that the endpoint is no longer
+		// usable, clear the endpoint statistics so that the next send will
+		// re-evaluate the best endpoint.
+		if err != nil && isBadEndpointErr(err) {
+			de.noteBadEndpoint(udpAddr)
+		}
+
 		// TODO(raggi): needs updating for accuracy, as in error conditions we may have partial sends.
 		if stats := de.c.stats.Load(); err == nil && stats != nil {
 			var txBytes int
@ -866,6 +874,30 @@ func (de *endpoint) addCandidateEndpoint(ep netip.AddrPort, forRxPingTxID stun.T
 	return false
 }

+// clearBestAddrLocked clears the bestAddr and related fields such that future
+// packets will re-evaluate the best address to send to next.
+//
+// de.mu must be held.
+func (de *endpoint) clearBestAddrLocked() {
+	de.bestAddr = addrLatency{}
+	de.bestAddrAt = 0
+	de.trustBestAddrUntil = 0
+}
+
+// noteBadEndpoint marks ipp as a bad endpoint that would need to be
+// re-evaluated before future use, this should be called for example if a send
+// to ipp fails due to a host unreachable error or similar.
+func (de *endpoint) noteBadEndpoint(ipp netip.AddrPort) {
+	de.mu.Lock()
+	defer de.mu.Unlock()
+
+	de.clearBestAddrLocked()
+
+	if st, ok := de.endpointState[ipp]; ok {
+		st.clear()
+	}
+}
+
 // noteConnectivityChange is called when connectivity changes enough
 // that we should question our earlier assumptions about which paths
 // work.
@ -873,8 +905,7 @@ func (de *endpoint) noteConnectivityChange() {
 	de.mu.Lock()
 	defer de.mu.Unlock()

-	de.bestAddr = addrLatency{}
-	de.trustBestAddrUntil = 0
+	de.clearBestAddrLocked()

 	for k := range de.endpointState {
 		de.endpointState[k].clear()
@ -1156,9 +1187,7 @@ func (de *endpoint) stopAndReset() {
 func (de *endpoint) resetLocked() {
 	de.lastSend = 0
 	de.lastFullPing = 0
-	de.bestAddr = addrLatency{}
-	de.bestAddrAt = 0
-	de.trustBestAddrUntil = 0
+	de.clearBestAddrLocked()
 	for _, es := range de.endpointState {
 		es.lastPing = 0
 	}
--- a/wgengine/magicsock/endpoint_default.go
+++ b/wgengine/magicsock/endpoint_default.go
@ -0,0 +1,23 @@
+// Copyright (c) Tailscale Inc & AUTHORS
+// SPDX-License-Identifier: BSD-3-Clause
+
+//go:build !js && !wasm
+// +build !js,!wasm
+
+package magicsock
+
+import (
+	"errors"
+	"syscall"
+)
+
+// errHOSTUNREACH wraps unix.EHOSTUNREACH in an interface type to pass to
+// errors.Is while avoiding an allocation per call.
+var errHOSTUNREACH error = syscall.EHOSTUNREACH
+
+// isBadEndpointErr checks if err is one which is known to report that an
+// endpoint can no longer be sent to. It is not exhaustive, and for unknown
+// errors always reports false.
+func isBadEndpointErr(err error) bool {
+	return errors.Is(err, errHOSTUNREACH)
+}
--- a/wgengine/magicsock/endpoint_js.go
+++ b/wgengine/magicsock/endpoint_js.go
@ -0,0 +1,14 @@
+// Copyright (c) Tailscale Inc & AUTHORS
+// SPDX-License-Identifier: BSD-3-Clause
+
+//go:build js || wasm
+// +build js wasm
+
+package magicsock
+
+// isBadEndpointErr checks if err is one which is known to report that an
+// endpoint can no longer be sent to. It is not exhaustive, but covers known
+// cases.
+func isBadEndpointErr(err error) bool {
+	return false
+}