ipn/ipnlocal: fix setAuthURL / setWgengineStatus race condition

If we received a wg engine status while processing an auth URL, there was a
race condition where the authURL could be reset to "" immediately after we
set it.

To fix this, we now only reset the URL when moving from a non-Running state
into the Running state, rather than resetting it every time we "move" into
Running, even if Running is already the current state.
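
In sketch form (simplified from enterStateLockedOnEntry in the diff below), the
reset is now guarded by the old state:

	if newState == ipn.Running {
		// Only clear the auth URL on a genuine transition into Running.
		// A redundant "transition" triggered by a stale wgengine status must
		// not clobber a URL we have just received from control.
		if oldState != ipn.Running {
			b.resetAuthURLLocked()
		}
	}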

We also need to make sure that we do not return from stopEngineAndWait until
the engine is stopped: before, we would return as soon as we received any
engine status update, but that might have been an update already in-flight
before we asked the engine to stop. Now we wait until we see an update that
indicates the engine has stopped.
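
In sketch form (simplified from requestEngineStatusAndWaitForStopped in the diff
below, which assumes statusChanged is now a sync.Cond on b.mu), the wait loops
until the status shows no live peers and no live DERP connections:

	b.mu.Lock()
	defer b.mu.Unlock()
	b.goTracker.Go(b.e.RequestStatus)
	for {
		b.statusChanged.Wait() // releases b.mu while waiting
		if b.engineStatus.NumLive == 0 && b.engineStatus.LiveDERPs == 0 {
			break // no live peers or DERP connections: the engine is stopped
		}
	}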

Updates #17388

Signed-off-by: James Sanderson <jsanderson@tailscale.com>
zofrex/set-url-wg-status-race-2
James Sanderson 2 months ago
parent 05a4c8e839
commit ee87f9ac2d

@@ -313,9 +313,8 @@ type LocalBackend struct {
serveListeners map[netip.AddrPort]*localListener // listeners for local serve traffic
serveProxyHandlers sync.Map // string (HTTPHandler.Proxy) => *reverseProxy
- // statusLock must be held before calling statusChanged.Wait() or
+ // mu must be held before calling statusChanged.Wait() or
// statusChanged.Broadcast().
- statusLock sync.Mutex
statusChanged *sync.Cond
// dialPlan is any dial plan that we've received from the control
@@ -542,7 +541,7 @@ func NewLocalBackend(logf logger.Logf, logID logid.PublicID, sys *tsd.System, lo
b.setTCPPortsIntercepted(nil)
- b.statusChanged = sync.NewCond(&b.statusLock)
+ b.statusChanged = sync.NewCond(&b.mu)
b.e.SetStatusCallback(b.setWgengineStatus)
b.prevIfState = netMon.InterfaceState()
@@ -2237,14 +2236,15 @@ func (b *LocalBackend) setWgengineStatus(s *wgengine.Status, err error) {
b.send(ipn.Notify{Engine: &es})
}
+ // broadcastStatusChanged must not be called with b.mu held.
func (b *LocalBackend) broadcastStatusChanged() {
// The sync.Cond docs say: "It is allowed but not required for the caller to hold c.L during the call."
- // In this particular case, we must acquire b.statusLock. Otherwise we might broadcast before
+ // In this particular case, we must acquire b.mu. Otherwise we might broadcast before
// the waiter (in requestEngineStatusAndWait) starts to wait, in which case
// the waiter can get stuck indefinitely. See PR 2865.
- b.statusLock.Lock()
+ b.mu.Lock()
b.statusChanged.Broadcast()
- b.statusLock.Unlock()
+ b.mu.Unlock()
}
// SetNotifyCallback sets the function to call when the backend has something to
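
The locking requirement in broadcastStatusChanged above is the classic sync.Cond
lost-wakeup hazard. A standalone illustration (assumed example, not Tailscale code),
with one waiter and one broadcaster:

	package main

	import "sync"

	func main() {
		var mu sync.Mutex
		cond := sync.NewCond(&mu)
		stopped := false

		go func() {
			// Broadcast while holding mu. Without the lock, the broadcast could
			// land after the waiter has checked its condition but before it calls
			// Wait, and the waiter would then block forever.
			mu.Lock()
			stopped = true
			cond.Broadcast()
			mu.Unlock()
		}()

		mu.Lock()
		for !stopped {
			cond.Wait() // releases mu while waiting; reacquires before returning
		}
		mu.Unlock()
	}
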
@@ -3308,12 +3308,13 @@ func (b *LocalBackend) popBrowserAuthNow(url string, keyExpired bool, recipient
if !b.seamlessRenewalEnabled() || keyExpired {
b.blockEngineUpdates(true)
b.stopEngineAndWait()
- }
- b.tellRecipientToBrowseToURL(url, toNotificationTarget(recipient))
if b.State() == ipn.Running {
b.enterState(ipn.Starting)
}
}
+ b.tellRecipientToBrowseToURL(url, toNotificationTarget(recipient))
}
// validPopBrowserURL reports whether urlStr is a valid value for a
// control server to send in a *URL field.
@@ -5433,7 +5434,13 @@ func (b *LocalBackend) enterStateLockedOnEntry(newState ipn.State, unlock unlock
activeLogin := b.activeLogin
authURL := b.authURL
if newState == ipn.Running {
+ // TODO(zofrex): Is this needed? As of 2025-10-03 it doesn't seem to be
+ // necessary when logging in or authenticating. When do we need to reset it
+ // here, rather than the other places it is reset? We should test if it is
+ // necessary and add unit tests to cover those cases, or remove it.
+ if oldState != ipn.Running {
b.resetAuthURLLocked()
+ }
// Start a captive portal detection loop if none has been
// started. Create a new context if none is present, since it
@@ -5670,29 +5677,34 @@ func (u unlockOnce) UnlockEarly() {
}
// stopEngineAndWait deconfigures the local network data plane, and
- // waits for it to deliver a status update before returning.
- //
- // TODO(danderson): this may be racy. We could unblock upon receiving
- // a status update that predates the "I've shut down" update.
+ // waits for it to deliver a status update indicating it has stopped
+ // before returning.
func (b *LocalBackend) stopEngineAndWait() {
b.logf("stopEngineAndWait...")
b.e.Reconfig(&wgcfg.Config{}, &router.Config{}, &dns.Config{})
- b.requestEngineStatusAndWait()
+ b.requestEngineStatusAndWaitForStopped()
b.logf("stopEngineAndWait: done.")
}
- // Requests the wgengine status, and does not return until the status
- // was delivered (to the usual callback).
- func (b *LocalBackend) requestEngineStatusAndWait() {
- b.logf("requestEngineStatusAndWait")
+ // Requests the wgengine status, and does not return until a status was
+ // delivered (to the usual callback) that indicates the engine is stopped.
+ func (b *LocalBackend) requestEngineStatusAndWaitForStopped() {
+ b.logf("requestEngineStatusAndWaitForStopped")
- b.statusLock.Lock()
- defer b.statusLock.Unlock()
+ b.mu.Lock()
+ defer b.mu.Unlock()
b.goTracker.Go(b.e.RequestStatus)
b.logf("requestEngineStatusAndWait: waiting...")
b.logf("requestEngineStatusAndWaitForStopped: waiting...")
for {
b.statusChanged.Wait() // temporarily releases lock while waiting
b.logf("requestEngineStatusAndWait: got status update.")
if b.engineStatus.NumLive == 0 && b.engineStatus.LiveDERPs == 0 {
b.logf("requestEngineStatusAndWaitForStopped: engine is stopped.")
break
}
b.logf("requestEngineStatusAndWaitForStopped: engine is still running. Waiting...")
}
}
// setControlClientLocked sets the control client to cc,

@@ -1561,6 +1561,151 @@ func TestEngineReconfigOnStateChange(t *testing.T) {
}
}
// TestStateMachineURLRace tests that wgengine updates arriving in the middle of
// processing an auth URL don't result in the auth URL being cleared.
func TestStateMachineURLRace(t *testing.T) {
runTestStateMachineURLRace(t, false)
}
func TestStateMachineURLRaceSeamless(t *testing.T) {
runTestStateMachineURLRace(t, true)
}
func runTestStateMachineURLRace(t *testing.T, seamless bool) {
var cc *mockControl
b := newLocalBackendWithTestControl(t, true, func(tb testing.TB, opts controlclient.Options) controlclient.Client {
cc = newClient(t, opts)
return cc
})
var stateLock sync.Mutex
var state ipn.State
stateChanged := make(chan struct{})
b.SetNotifyCallback(func(n ipn.Notify) {
if n.State != nil || (n.Prefs != nil && n.Prefs.Valid()) || n.BrowseToURL != nil || n.LoginFinished != nil {
b.logf("Received notify: %+v", n)
if n.State != nil {
stateLock.Lock()
state = *n.State
stateLock.Unlock()
stateChanged <- struct{}{}
}
} else {
b.logf("(ignored) %v", n)
}
})
waitForState := func(want ipn.State) {
t.Helper()
t.Logf("waiting for state: %v", want)
for {
stateLock.Lock()
s := state
stateLock.Unlock()
if s == want {
return
}
select {
case <-stateChanged:
case <-time.After(6 * time.Second):
t.Fatalf("timed out waiting for state %v; currently %v", want, s)
}
}
}
t.Logf("Start")
b.Start(ipn.Options{
UpdatePrefs: &ipn.Prefs{
WantRunning: true,
ControlURL: "https://localhost:1/",
},
})
waitForState(ipn.NeedsLogin)
t.Logf("LoginFinished")
cc.persist.UserProfile.LoginName = "user1"
cc.persist.NodeID = "node1"
if seamless {
b.sys.ControlKnobs().SeamlessKeyRenewal.Store(true)
}
cc.send(nil, "", true, &netmap.NetworkMap{
SelfNode: (&tailcfg.Node{MachineAuthorized: true}).View(),
})
waitForState(ipn.Starting)
t.Logf("Running")
b.setWgengineStatus(&wgengine.Status{AsOf: time.Now(), DERPs: 1}, nil)
waitForState(ipn.Running)
t.Logf("Re-auth (StartLoginInteractive)")
b.StartLoginInteractive(t.Context())
stop := make(chan struct{})
stopSpamming := sync.OnceFunc(func() {
stop <- struct{}{}
})
// If seamless renewal is enabled, the engine won't be disabled and we won't
// ever call stopSpamming, so make sure it does get called.
defer stopSpamming()
// Intercept updates between the engine and localBackend, so that we can see
// when the "stopped" update comes in and ensure we stop sending our "we're
// up" updates after that point.
b.e.SetStatusCallback(func(s *wgengine.Status, err error) {
// This is not one of our fake status updates, this is generated from the
// engine in response to LocalBackend calling RequestStatus. Stop spamming
// our fake statuses.
//
// TODO(zofrex): This is fragile, it works right now but would break if the
// calling pattern of RequestStatus changes. We should ensure that we keep
// sending "we're up" statuses right until Reconfig is called with
// zero-valued configs, and after that point only send "stopped" statuses.
stopSpamming()
// Once stopSpamming returns we are guaranteed to not send any more updates,
// so we can now send the real update (indicating shutdown) and be certain
// it will be received after any fake updates we sent. This is possibly a
// stronger guarantee than we get from the real engine?
b.setWgengineStatus(s, err)
})
// time needs to be >= last time for the status to be accepted, send all our
// spam with the same stale time so that when a real update comes in it will
// definitely be accepted.
time := b.lastStatusTime
// Flood localBackend with a lot of wgengine status updates, so if there are
// any race conditions in the multiple locks/unlocks that happen as we process
// the received auth URL, we will hit them.
go func() {
t.Logf("sending lots of fake wgengine status updates")
for {
select {
case <-stop:
t.Logf("stopping fake wgengine status updates")
return
default:
b.setWgengineStatus(&wgengine.Status{AsOf: time, DERPs: 1}, nil)
}
}
}()
t.Logf("Re-auth (receive URL)")
url1 := "https://localhost:1/1"
cc.send(nil, url1, false, nil)
// We don't need to wait on anything else: once cc.send completes, authURL
// should be set, and any opportunity for a wgengine status update to trample
// it has ended as well.
if b.authURL == "" {
t.Fatalf("expected authURL to be set")
}
}
func buildNetmapWithPeers(self tailcfg.NodeView, peers ...tailcfg.NodeView) *netmap.NetworkMap {
const (
firstAutoUserID = tailcfg.UserID(10000)
