ipn/ipnlocal: fix setAuthURL / setWgengineStatus race condition

If we received a wg engine status while processing an auth URL, there was a
race condition where the authURL could be reset to "" immediately after we
set it.

To fix this, we now only reset the URL when moving from a non-Running state
into the Running state, rather than resetting it every time we "move" into
Running, even if Running is already the current state.
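
In sketch form (simplified from enterStateLockedOnEntry in the diff below), the
reset is now guarded by the old state:

	if newState == ipn.Running {
		// Only clear the auth URL on a genuine transition into Running.
		// A redundant "transition" triggered by a stale wgengine status must
		// not clobber a URL we have just received from control.
		if oldState != ipn.Running {
			b.resetAuthURLLocked()
		}
	}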

We also need to make sure that we do not return from stopEngineAndWait until
the engine is stopped: before, we would return as soon as we received any
engine status update, but that might have been an update already in-flight
before we asked the engine to stop. Now we wait until we see an update that
indicates the engine has stopped.
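
In sketch form (simplified from requestEngineStatusAndWaitForStopped in the diff
below, which assumes statusChanged is now a sync.Cond on b.mu), the wait loops
until the status shows no live peers and no live DERP connections:

	b.mu.Lock()
	defer b.mu.Unlock()
	b.goTracker.Go(b.e.RequestStatus)
	for {
		b.statusChanged.Wait() // releases b.mu while waiting
		if b.engineStatus.NumLive == 0 && b.engineStatus.LiveDERPs == 0 {
			break // no live peers or DERP connections: the engine is stopped
		}
	}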

Updates #17388

Signed-off-by: James Sanderson <jsanderson@tailscale.com>
zofrex/set-url-wg-status-race-2
James Sanderson 2 months ago
parent 05a4c8e839
commit ee87f9ac2d

@@ -313,9 +313,8 @@ type LocalBackend struct {
serveListeners map[netip.AddrPort]*localListener // listeners for local serve traffic
serveProxyHandlers sync.Map // string (HTTPHandler.Proxy) => *reverseProxy
- // statusLock must be held before calling statusChanged.Wait() or
+ // mu must be held before calling statusChanged.Wait() or
// statusChanged.Broadcast().
- statusLock sync.Mutex
statusChanged *sync.Cond
// dialPlan is any dial plan that we've received from the control
@@ -542,7 +541,7 @@ func NewLocalBackend(logf logger.Logf, logID logid.PublicID, sys *tsd.System, lo
b.setTCPPortsIntercepted(nil)
- b.statusChanged = sync.NewCond(&b.statusLock)
+ b.statusChanged = sync.NewCond(&b.mu)
b.e.SetStatusCallback(b.setWgengineStatus)
b.prevIfState = netMon.InterfaceState()
@@ -2237,14 +2236,15 @@ func (b *LocalBackend) setWgengineStatus(s *wgengine.Status, err error) {
b.send(ipn.Notify{Engine: &es})
}
+ // broadcastStatusChanged must not be called with b.mu held.
func (b *LocalBackend) broadcastStatusChanged() {
// The sync.Cond docs say: "It is allowed but not required for the caller to hold c.L during the call."
- // In this particular case, we must acquire b.statusLock. Otherwise we might broadcast before
+ // In this particular case, we must acquire b.mu. Otherwise we might broadcast before
// the waiter (in requestEngineStatusAndWait) starts to wait, in which case
// the waiter can get stuck indefinitely. See PR 2865.
- b.statusLock.Lock()
+ b.mu.Lock()
b.statusChanged.Broadcast()
- b.statusLock.Unlock()
+ b.mu.Unlock()
}
// SetNotifyCallback sets the function to call when the backend has something to
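
The locking requirement in broadcastStatusChanged above is the classic sync.Cond
lost-wakeup hazard. A standalone illustration (assumed example, not Tailscale code),
with one waiter and one broadcaster:

	package main

	import "sync"

	func main() {
		var mu sync.Mutex
		cond := sync.NewCond(&mu)
		stopped := false

		go func() {
			// Broadcast while holding mu. Without the lock, the broadcast could
			// land after the waiter has checked its condition but before it calls
			// Wait, and the waiter would then block forever.
			mu.Lock()
			stopped = true
			cond.Broadcast()
			mu.Unlock()
		}()

		mu.Lock()
		for !stopped {
			cond.Wait() // releases mu while waiting; reacquires before returning
		}
		mu.Unlock()
	}
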
@@ -3308,12 +3308,13 @@ func (b *LocalBackend) popBrowserAuthNow(url string, keyExpired bool, recipient
if !b.seamlessRenewalEnabled() || keyExpired {
b.blockEngineUpdates(true)
b.stopEngineAndWait()
- }
- b.tellRecipientToBrowseToURL(url, toNotificationTarget(recipient))
if b.State() == ipn.Running {
b.enterState(ipn.Starting)
}
}
+ b.tellRecipientToBrowseToURL(url, toNotificationTarget(recipient))
}
// validPopBrowserURL reports whether urlStr is a valid value for a
// control server to send in a *URL field.
@@ -5433,7 +5434,13 @@ func (b *LocalBackend) enterStateLockedOnEntry(newState ipn.State, unlock unlock
activeLogin := b.activeLogin
authURL := b.authURL
if newState == ipn.Running {
+ // TODO(zofrex): Is this needed? As of 2025-10-03 it doesn't seem to be
+ // necessary when logging in or authenticating. When do we need to reset it
+ // here, rather than the other places it is reset? We should test if it is
+ // necessary and add unit tests to cover those cases, or remove it.
+ if oldState != ipn.Running {
b.resetAuthURLLocked()
+ }
// Start a captive portal detection loop if none has been
// started. Create a new context if none is present, since it
@@ -5670,29 +5677,34 @@ func (u unlockOnce) UnlockEarly() {
}
// stopEngineAndWait deconfigures the local network data plane, and
- // waits for it to deliver a status update before returning.
- //
- // TODO(danderson): this may be racy. We could unblock upon receiving
- // a status update that predates the "I've shut down" update.
+ // waits for it to deliver a status update indicating it has stopped
+ // before returning.
func (b *LocalBackend) stopEngineAndWait() {
b.logf("stopEngineAndWait...")
b.e.Reconfig(&wgcfg.Config{}, &router.Config{}, &dns.Config{})
- b.requestEngineStatusAndWait()
+ b.requestEngineStatusAndWaitForStopped()
b.logf("stopEngineAndWait: done.")
}
- // Requests the wgengine status, and does not return until the status
- // was delivered (to the usual callback).
- func (b *LocalBackend) requestEngineStatusAndWait() {
- b.logf("requestEngineStatusAndWait")
+ // Requests the wgengine status, and does not return until a status was
+ // delivered (to the usual callback) that indicates the engine is stopped.
+ func (b *LocalBackend) requestEngineStatusAndWaitForStopped() {
+ b.logf("requestEngineStatusAndWaitForStopped")
- b.statusLock.Lock()
- defer b.statusLock.Unlock()
+ b.mu.Lock()
+ defer b.mu.Unlock()
b.goTracker.Go(b.e.RequestStatus)
b.logf("requestEngineStatusAndWait: waiting...")
b.logf("requestEngineStatusAndWaitForStopped: waiting...")
for {
b.statusChanged.Wait() // temporarily releases lock while waiting
b.logf("requestEngineStatusAndWait: got status update.")
if b.engineStatus.NumLive == 0 && b.engineStatus.LiveDERPs == 0 {
b.logf("requestEngineStatusAndWaitForStopped: engine is stopped.")
break
}
b.logf("requestEngineStatusAndWaitForStopped: engine is still running. Waiting...")
}
}
// setControlClientLocked sets the control client to cc,

@@ -1561,6 +1561,151 @@ func TestEngineReconfigOnStateChange(t *testing.T) {
}
}
// TestStateMachineURLRace tests that wgengine updates arriving in the middle of
// processing an auth URL don't result in the auth URL being cleared.
func TestStateMachineURLRace(t *testing.T) {
runTestStateMachineURLRace(t, false)
}
func TestStateMachineURLRaceSeamless(t *testing.T) {
runTestStateMachineURLRace(t, true)
}
func runTestStateMachineURLRace(t *testing.T, seamless bool) {
var cc *mockControl
b := newLocalBackendWithTestControl(t, true, func(tb testing.TB, opts controlclient.Options) controlclient.Client {
cc = newClient(t, opts)
return cc
})
var stateLock sync.Mutex
var state ipn.State
stateChanged := make(chan struct{})
b.SetNotifyCallback(func(n ipn.Notify) {
if n.State != nil || (n.Prefs != nil && n.Prefs.Valid()) || n.BrowseToURL != nil || n.LoginFinished != nil {
b.logf("Received notify: %+v", n)
if n.State != nil {
stateLock.Lock()
state = *n.State
stateLock.Unlock()
stateChanged <- struct{}{}
}
} else {
b.logf("(ignored) %v", n)
}
})
waitForState := func(want ipn.State) {
t.Helper()
t.Logf("waiting for state: %v", want)
for {
stateLock.Lock()
s := state
stateLock.Unlock()
if s == want {
return
}
select {
case <-stateChanged:
case <-time.After(6 * time.Second):
t.Fatalf("timed out waiting for state %v; currently %v", want, s)
}
}
}
t.Logf("Start")
b.Start(ipn.Options{
UpdatePrefs: &ipn.Prefs{
WantRunning: true,
ControlURL: "https://localhost:1/",
},
})
waitForState(ipn.NeedsLogin)
t.Logf("LoginFinished")
cc.persist.UserProfile.LoginName = "user1"
cc.persist.NodeID = "node1"
if seamless {
b.sys.ControlKnobs().SeamlessKeyRenewal.Store(true)
}
cc.send(nil, "", true, &netmap.NetworkMap{
SelfNode: (&tailcfg.Node{MachineAuthorized: true}).View(),
})
waitForState(ipn.Starting)
t.Logf("Running")
b.setWgengineStatus(&wgengine.Status{AsOf: time.Now(), DERPs: 1}, nil)
waitForState(ipn.Running)
t.Logf("Re-auth (StartLoginInteractive)")
b.StartLoginInteractive(t.Context())
stop := make(chan struct{})
stopSpamming := sync.OnceFunc(func() {
stop <- struct{}{}
})
// If seamless renewal is enabled, the engine won't be disabled and we won't
// ever call stopSpamming, so make sure it does get called.
defer stopSpamming()
// Intercept updates between the engine and localBackend, so that we can see
// when the "stopped" update comes in and ensure we stop sending our "we're
// up" updates after that point.
b.e.SetStatusCallback(func(s *wgengine.Status, err error) {
// This is not one of our fake status updates, this is generated from the
// engine in response to LocalBackend calling RequestStatus. Stop spamming
// our fake statuses.
//
// TODO(zofrex): This is fragile, it works right now but would break if the
// calling pattern of RequestStatus changes. We should ensure that we keep
// sending "we're up" statuses right until Reconfig is called with
// zero-valued configs, and after that point only send "stopped" statuses.
stopSpamming()
// Once stopSpamming returns we are guaranteed to not send any more updates,
// so we can now send the real update (indicating shutdown) and be certain
// it will be received after any fake updates we sent. This is possibly a
// stronger guarantee than we get from the real engine?
b.setWgengineStatus(s, err)
})
// time needs to be >= last time for the status to be accepted, send all our
// spam with the same stale time so that when a real update comes in it will
// definitely be accepted.
time := b.lastStatusTime
// Flood localBackend with a lot of wgengine status updates, so if there are
// any race conditions in the multiple locks/unlocks that happen as we process
// the received auth URL, we will hit them.
go func() {
t.Logf("sending lots of fake wgengine status updates")
for {
select {
case <-stop:
t.Logf("stopping fake wgengine status updates")
return
default:
b.setWgengineStatus(&wgengine.Status{AsOf: time, DERPs: 1}, nil)
}
}
}()
t.Logf("Re-auth (receive URL)")
url1 := "https://localhost:1/1"
cc.send(nil, url1, false, nil)
// We don't need to wait on anything else: once cc.send completes, authURL
// should be set, and any opportunity for a wgengine status update to trample
// it has ended as well.
if b.authURL == "" {
t.Fatalf("expected authURL to be set")
}
}
func buildNetmapWithPeers(self tailcfg.NodeView, peers ...tailcfg.NodeView) *netmap.NetworkMap {
const (
firstAutoUserID = tailcfg.UserID(10000)
