From ee87f9ac2d250c78cfff109f27d33eb666228f76 Mon Sep 17 00:00:00 2001 From: James Sanderson Date: Fri, 3 Oct 2025 15:59:58 +0100 Subject: [PATCH] ipn/ipnlocal: fix setAuthURL / setWgengineStatus race condition If we received a wg engine status while processing an auth URL, there was a race condition where the authURL could be reset to "" immediately after we set it. To fix this we need to check that we are moving from a non-Running state to a Running state rather than always resetting the URL when we "move" into a Running state even if that is the current state. We also need to make sure that we do not return from stopEngineAndWait until the engine is stopped: before, we would return as soon as we received any engine status update, but that might have been an update already in-flight before we asked the engine to stop. Now we wait until we see an update that is indicative of a stopped engine. Updates #17388 Signed-off-by: James Sanderson --- ipn/ipnlocal/local.go | 60 +++++++++------ ipn/ipnlocal/state_test.go | 145 +++++++++++++++++++++++++++++++++++++ 2 files changed, 181 insertions(+), 24 deletions(-) diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index 09f317f0f..c9a1b88f0 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -313,9 +313,8 @@ type LocalBackend struct { serveListeners map[netip.AddrPort]*localListener // listeners for local serve traffic serveProxyHandlers sync.Map // string (HTTPHandler.Proxy) => *reverseProxy - // statusLock must be held before calling statusChanged.Wait() or + // mu must be held before calling statusChanged.Wait() or // statusChanged.Broadcast(). - statusLock sync.Mutex statusChanged *sync.Cond // dialPlan is any dial plan that we've received from the control @@ -542,7 +541,7 @@ func NewLocalBackend(logf logger.Logf, logID logid.PublicID, sys *tsd.System, lo b.setTCPPortsIntercepted(nil) - b.statusChanged = sync.NewCond(&b.statusLock) + b.statusChanged = sync.NewCond(&b.mu) b.e.SetStatusCallback(b.setWgengineStatus) b.prevIfState = netMon.InterfaceState() @@ -2237,14 +2236,15 @@ func (b *LocalBackend) setWgengineStatus(s *wgengine.Status, err error) { b.send(ipn.Notify{Engine: &es}) } +// broadcastStatusChanged must not be called with b.mu held. func (b *LocalBackend) broadcastStatusChanged() { // The sync.Cond docs say: "It is allowed but not required for the caller to hold c.L during the call." - // In this particular case, we must acquire b.statusLock. Otherwise we might broadcast before + // In this particular case, we must acquire b.mu. Otherwise we might broadcast before // the waiter (in requestEngineStatusAndWait) starts to wait, in which case // the waiter can get stuck indefinitely. See PR 2865. - b.statusLock.Lock() + b.mu.Lock() b.statusChanged.Broadcast() - b.statusLock.Unlock() + b.mu.Unlock() } // SetNotifyCallback sets the function to call when the backend has something to @@ -3308,11 +3308,12 @@ func (b *LocalBackend) popBrowserAuthNow(url string, keyExpired bool, recipient if !b.seamlessRenewalEnabled() || keyExpired { b.blockEngineUpdates(true) b.stopEngineAndWait() + + if b.State() == ipn.Running { + b.enterState(ipn.Starting) + } } b.tellRecipientToBrowseToURL(url, toNotificationTarget(recipient)) - if b.State() == ipn.Running { - b.enterState(ipn.Starting) - } } // validPopBrowserURL reports whether urlStr is a valid value for a @@ -5433,7 +5434,13 @@ func (b *LocalBackend) enterStateLockedOnEntry(newState ipn.State, unlock unlock activeLogin := b.activeLogin authURL := b.authURL if newState == ipn.Running { - b.resetAuthURLLocked() + // TODO(zofrex): Is this needed? As of 2025-10-03 it doesn't seem to be + // necessary when logging in or authenticating. When do we need to reset it + // here, rather than the other places it is reset? We should test if it is + // necessary and add unit tests to cover those cases, or remove it. + if oldState != ipn.Running { + b.resetAuthURLLocked() + } // Start a captive portal detection loop if none has been // started. Create a new context if none is present, since it @@ -5670,29 +5677,34 @@ func (u unlockOnce) UnlockEarly() { } // stopEngineAndWait deconfigures the local network data plane, and -// waits for it to deliver a status update before returning. -// -// TODO(danderson): this may be racy. We could unblock upon receiving -// a status update that predates the "I've shut down" update. +// waits for it to deliver a status update indicating it has stopped +// before returning. func (b *LocalBackend) stopEngineAndWait() { b.logf("stopEngineAndWait...") b.e.Reconfig(&wgcfg.Config{}, &router.Config{}, &dns.Config{}) - b.requestEngineStatusAndWait() + b.requestEngineStatusAndWaitForStopped() b.logf("stopEngineAndWait: done.") } -// Requests the wgengine status, and does not return until the status -// was delivered (to the usual callback). -func (b *LocalBackend) requestEngineStatusAndWait() { - b.logf("requestEngineStatusAndWait") +// Requests the wgengine status, and does not return until a status was +// delivered (to the usual callback) that indicates the engine is stopped. +func (b *LocalBackend) requestEngineStatusAndWaitForStopped() { + b.logf("requestEngineStatusAndWaitForStopped") - b.statusLock.Lock() - defer b.statusLock.Unlock() + b.mu.Lock() + defer b.mu.Unlock() b.goTracker.Go(b.e.RequestStatus) - b.logf("requestEngineStatusAndWait: waiting...") - b.statusChanged.Wait() // temporarily releases lock while waiting - b.logf("requestEngineStatusAndWait: got status update.") + b.logf("requestEngineStatusAndWaitForStopped: waiting...") + for { + b.statusChanged.Wait() // temporarily releases lock while waiting + + if b.engineStatus.NumLive == 0 && b.engineStatus.LiveDERPs == 0 { + b.logf("requestEngineStatusAndWaitForStopped: engine is stopped.") + break + } + b.logf("requestEngineStatusAndWaitForStopped: engine is still running. Waiting...") + } } // setControlClientLocked sets the control client to cc, diff --git a/ipn/ipnlocal/state_test.go b/ipn/ipnlocal/state_test.go index 347aaf8b8..da1e38bf9 100644 --- a/ipn/ipnlocal/state_test.go +++ b/ipn/ipnlocal/state_test.go @@ -1561,6 +1561,151 @@ func TestEngineReconfigOnStateChange(t *testing.T) { } } +// TestStateMachineURLRace tests that wgengine updates arriving in the middle of +// processing an auth URL doesn't result in the auth URL being cleared. +func TestStateMachineURLRace(t *testing.T) { + runTestStateMachineURLRace(t, false) +} + +func TestStateMachineURLRaceSeamless(t *testing.T) { + runTestStateMachineURLRace(t, true) +} + +func runTestStateMachineURLRace(t *testing.T, seamless bool) { + var cc *mockControl + b := newLocalBackendWithTestControl(t, true, func(tb testing.TB, opts controlclient.Options) controlclient.Client { + cc = newClient(t, opts) + return cc + }) + + var stateLock sync.Mutex + var state ipn.State + stateChanged := make(chan struct{}) + + b.SetNotifyCallback(func(n ipn.Notify) { + if n.State != nil || (n.Prefs != nil && n.Prefs.Valid()) || n.BrowseToURL != nil || n.LoginFinished != nil { + b.logf("Received notify: %+v", n) + if n.State != nil { + stateLock.Lock() + state = *n.State + stateLock.Unlock() + stateChanged <- struct{}{} + } + } else { + b.logf("(ignored) %v", n) + } + }) + + waitForState := func(want ipn.State) { + t.Helper() + t.Logf("waiting for state: %v", want) + for { + stateLock.Lock() + s := state + stateLock.Unlock() + if s == want { + return + } + select { + case <-stateChanged: + case <-time.After(6 * time.Second): + t.Fatalf("timed out waiting for state %v; currently %v", want, s) + } + } + } + + t.Logf("Start") + b.Start(ipn.Options{ + UpdatePrefs: &ipn.Prefs{ + WantRunning: true, + ControlURL: "https://localhost:1/", + }, + }) + + waitForState(ipn.NeedsLogin) + + t.Logf("LoginFinished") + cc.persist.UserProfile.LoginName = "user1" + cc.persist.NodeID = "node1" + + if seamless { + b.sys.ControlKnobs().SeamlessKeyRenewal.Store(true) + } + + cc.send(nil, "", true, &netmap.NetworkMap{ + SelfNode: (&tailcfg.Node{MachineAuthorized: true}).View(), + }) + waitForState(ipn.Starting) + + t.Logf("Running") + b.setWgengineStatus(&wgengine.Status{AsOf: time.Now(), DERPs: 1}, nil) + waitForState(ipn.Running) + + t.Logf("Re-auth (StartLoginInteractive)") + b.StartLoginInteractive(t.Context()) + + stop := make(chan struct{}) + stopSpamming := sync.OnceFunc(func() { + stop <- struct{}{} + }) + // if seamless renewal is enabled, the engine won't be disabled, and we won't + // ever call stopSpamming, so make sure it does get called + defer stopSpamming() + + // Intercept updates between the engine and localBackend, so that we can see + // when the "stopped" update comes in and ensure we stop sending our "we're + // up" updates after that point. + b.e.SetStatusCallback(func(s *wgengine.Status, err error) { + // This is not one of our fake status updates, this is generated from the + // engine in response to LocalBackend calling RequestStatus. Stop spamming + // our fake statuses. + // + // TODO(zofrex): This is fragile, it works right now but would break if the + // calling pattern of RequestStatus changes. We should ensure that we keep + // sending "we're up" statuses right until Reconfig is called with + // zero-valued configs, and after that point only send "stopped" statuses. + stopSpamming() + + // Once stopSpamming returns we are guaranteed to not send any more updates, + // so we can now send the real update (indicating shutdown) and be certain + // it will be received after any fake updates we sent. This is possibly a + // stronger guarantee than we get from the real engine? + b.setWgengineStatus(s, err) + }) + + // time needs to be >= last time for the status to be accepted, send all our + // spam with the same stale time so that when a real update comes in it will + // definitely be accepted. + time := b.lastStatusTime + + // Flood localBackend with a lot of wgengine status updates, so if there are + // any race conditions in the multiple locks/unlocks that happen as we process + // the received auth URL, we will hit them. + go func() { + t.Logf("sending lots of fake wgengine status updates") + for { + select { + case <-stop: + t.Logf("stopping fake wgengine status updates") + return + default: + b.setWgengineStatus(&wgengine.Status{AsOf: time, DERPs: 1}, nil) + } + } + }() + + t.Logf("Re-auth (receive URL)") + url1 := "https://localhost:1/1" + cc.send(nil, url1, false, nil) + + // Don't need to wait on anything else - once .send completes, authURL should + // be set, and once .send has completed, any opportunities for a WG engine + // status update to trample it have ended as well. + if b.authURL == "" { + t.Fatalf("expected authURL to be set") + } +} + func buildNetmapWithPeers(self tailcfg.NodeView, peers ...tailcfg.NodeView) *netmap.NetworkMap { const ( firstAutoUserID = tailcfg.UserID(10000)