From d06a75dcd0bba48fba5a5b5979f9490f5c1e718b Mon Sep 17 00:00:00 2001 From: Maisem Ali Date: Sat, 2 Sep 2023 12:04:03 -0700 Subject: [PATCH] ipn/ipnlocal: fix deadlock in resetControlClientLocked resetControlClientLocked is called while b.mu was held and would call cc.Shutdown which would wait for the observer queue to drain. However, there may be active callbacks from cc already waiting for b.mu resulting in a deadlock. This makes it so that resetControlClientLocked does not call Shutdown, and instead just returns the value. It also makes it so that any status received from previous cc are ignored. Updates tailscale/corp#12827 Signed-off-by: Maisem Ali --- control/controlclient/auto.go | 3 +- control/controlclient/client.go | 3 ++ control/controlclient/direct.go | 6 ++- ipn/ipnlocal/local.go | 63 ++++++++++++++++++++----------- ipn/ipnlocal/network-lock_test.go | 2 +- ipn/ipnlocal/state_test.go | 2 +- 6 files changed, 52 insertions(+), 27 deletions(-) diff --git a/control/controlclient/auto.go b/control/controlclient/auto.go index fd2a13212..ed2b3a018 100644 --- a/control/controlclient/auto.go +++ b/control/controlclient/auto.go @@ -595,7 +595,7 @@ func (c *Auto) sendStatus(who string, err error, url string, nm *netmap.NetworkM // Launch a new goroutine to avoid blocking the caller while the observer // does its thing, which may result in a call back into the client. c.observerQueue.Add(func() { - c.observer.SetControlClientStatus(new) + c.observer.SetControlClientStatus(c, new) }) } @@ -667,6 +667,7 @@ func (c *Auto) Shutdown() { direct := c.direct if !closed { c.closed = true + c.observerQueue.shutdown() c.cancelAuthCtxLocked() c.cancelMapCtxLocked() for _, w := range c.unpauseWaiters { diff --git a/control/controlclient/client.go b/control/controlclient/client.go index f942676ae..b809f8192 100644 --- a/control/controlclient/client.go +++ b/control/controlclient/client.go @@ -25,6 +25,9 @@ const ( // Client represents a client connection to the control server. // Currently this is done through a pair of polling https requests in // the Auto client, but that might change eventually. +// +// The Client must be comparable as it is used by the Observer to detect stale +// clients. type Client interface { // Shutdown closes this session, which should not be used any further // afterwards. diff --git a/control/controlclient/direct.go b/control/controlclient/direct.go index d7c83e38f..b8951564d 100644 --- a/control/controlclient/direct.go +++ b/control/controlclient/direct.go @@ -104,7 +104,11 @@ type Direct struct { // Observer is implemented by users of the control client (such as LocalBackend) // to get notified of changes in the control client's status. type Observer interface { - SetControlClientStatus(Status) + // SetControlClientStatus is called when the client has a new status to + // report. The Client is provided to allow the Observer to track which + // Client is reporting the status, allowing it to ignore stale status + // reports from previous Clients. + SetControlClientStatus(Client, Status) } type Options struct { diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index c7f81f81b..ac2605b04 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -897,9 +897,16 @@ func (b *LocalBackend) SetDecompressor(fn func() (controlclient.Decompressor, er // SetControlClientStatus is the callback invoked by the control client whenever it posts a new status. // Among other things, this is where we update the netmap, packet filters, DNS and DERP maps. -func (b *LocalBackend) SetControlClientStatus(st controlclient.Status) { +func (b *LocalBackend) SetControlClientStatus(c controlclient.Client, st controlclient.Status) { + b.mu.Lock() + if b.cc != c { + b.logf("Ignoring SetControlClientStatus from old client") + b.mu.Unlock() + return + } // The following do not depend on any data for which we need to lock b. if st.Err != nil { + b.mu.Unlock() if errors.Is(st.Err, io.EOF) { b.logf("[v1] Received error: EOF") return @@ -916,8 +923,6 @@ func (b *LocalBackend) SetControlClientStatus(st controlclient.Status) { // Track the number of calls currCall := b.numClientStatusCalls.Add(1) - b.mu.Lock() - // Handle node expiry in the netmap if st.NetMap != nil { now := b.clock.Now() @@ -953,7 +958,7 @@ func (b *LocalBackend) SetControlClientStatus(st controlclient.Status) { // Call ourselves with the current status again; the logic in // setClientStatus will take care of updating the expired field // of peers in the netmap. - b.SetControlClientStatus(st) + b.SetControlClientStatus(c, st) }) } } @@ -1340,16 +1345,14 @@ func (b *LocalBackend) Start(opts ipn.Options) error { hostinfo.Userspace.Set(b.sys.IsNetstack()) hostinfo.UserspaceRouter.Set(b.sys.IsNetstackRouter()) - if b.cc != nil { - // TODO(apenwarr): avoid the need to reinit controlclient. - // This will trigger a full relogin/reconfigure cycle every - // time a Handle reconnects to the backend. Ideally, we - // would send the new Prefs and everything would get back - // into sync with the minimal changes. But that's not how it - // is right now, which is a sign that the code is still too - // complicated. - b.resetControlClientLockedAsync() - } + // TODO(apenwarr): avoid the need to reinit controlclient. + // This will trigger a full relogin/reconfigure cycle every + // time a Handle reconnects to the backend. Ideally, we + // would send the new Prefs and everything would get back + // into sync with the minimal changes. But that's not how it + // is right now, which is a sign that the code is still too + // complicated. + prevCC := b.resetControlClientLocked() httpTestClient := b.httpTestClient if b.hostinfo != nil { @@ -1429,6 +1432,10 @@ func (b *LocalBackend) Start(opts ipn.Options) error { debugFlags = append([]string{"netstack"}, debugFlags...) } + if prevCC != nil { + prevCC.Shutdown() + } + // TODO(apenwarr): The only way to change the ServerURL is to // re-run b.Start(), because this is the only place we create a // new controlclient. SetPrefs() allows you to overwrite ServerURL, @@ -3847,12 +3854,12 @@ func (b *LocalBackend) requestEngineStatusAndWait() { b.statusLock.Unlock() } -// resetControlClientLockedAsync sets b.cc to nil, and starts a -// goroutine to Shutdown the old client. It does not wait for the -// shutdown to complete. -func (b *LocalBackend) resetControlClientLockedAsync() { +// resetControlClientLocked sets b.cc to nil and returns the old value. If the +// returned value is non-nil, the caller must call Shutdown on it after +// releasing b.mu. +func (b *LocalBackend) resetControlClientLocked() controlclient.Client { if b.cc == nil { - return + return nil } // When we clear the control client, stop any outstanding netmap expiry @@ -3868,9 +3875,10 @@ func (b *LocalBackend) resetControlClientLockedAsync() { // will abort. b.numClientStatusCalls.Add(1) } - b.cc.Shutdown() + prev := b.cc b.cc = nil b.ccAuto = nil + return prev } // ResetForClientDisconnect resets the backend for GUI clients running @@ -3881,10 +3889,16 @@ func (b *LocalBackend) resetControlClientLockedAsync() { // when they restart the GUI. func (b *LocalBackend) ResetForClientDisconnect() { defer b.enterState(ipn.Stopped) + b.logf("LocalBackend.ResetForClientDisconnect") + b.mu.Lock() + prevCC := b.resetControlClientLocked() + if prevCC != nil { + // Needs to happen without b.mu held. + defer prevCC.Shutdown() + } defer b.mu.Unlock() - b.logf("LocalBackend.ResetForClientDisconnect") - b.resetControlClientLockedAsync() + b.setNetMapLocked(nil) b.pm.Reset() b.keyExpired = false @@ -4997,7 +5011,10 @@ func (b *LocalBackend) ListProfiles() []ipn.LoginProfile { // called to register it as new node. func (b *LocalBackend) ResetAuth() error { b.mu.Lock() - b.resetControlClientLockedAsync() + prevCC := b.resetControlClientLocked() + if prevCC != nil { + defer prevCC.Shutdown() // call must happen after release b.mu + } if err := b.clearMachineKeyLocked(); err != nil { b.mu.Unlock() return err diff --git a/ipn/ipnlocal/network-lock_test.go b/ipn/ipnlocal/network-lock_test.go index f5eb57101..e1f002471 100644 --- a/ipn/ipnlocal/network-lock_test.go +++ b/ipn/ipnlocal/network-lock_test.go @@ -31,7 +31,7 @@ import ( type observerFunc func(controlclient.Status) -func (f observerFunc) SetControlClientStatus(s controlclient.Status) { +func (f observerFunc) SetControlClientStatus(_ controlclient.Client, s controlclient.Status) { f(s) } diff --git a/ipn/ipnlocal/state_test.go b/ipn/ipnlocal/state_test.go index a354bb9a7..53c94ee6f 100644 --- a/ipn/ipnlocal/state_test.go +++ b/ipn/ipnlocal/state_test.go @@ -173,7 +173,7 @@ func (cc *mockControl) send(err error, url string, loginFinished bool, nm *netma } else if url == "" && err == nil && nm == nil { s.SetStateForTest(controlclient.StateNotAuthenticated) } - cc.opts.Observer.SetControlClientStatus(s) + cc.opts.Observer.SetControlClientStatus(cc, s) } }