ipn/ipnlocal: remove all the weird locking (LockedOnEntry, UnlockEarly, etc)

Fixes #11649
Updates #16369

Co-authored-by: James Sanderson <jsanderson@tailscale.com>
Change-Id: I63eaa18fe870ddf81d84b949efac4d1b44c3db86
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
Branch: pull/17889/head
Author: Brad Fitzpatrick (committed by Brad Fitzpatrick)
Parent: 08e74effc0
Commit: 146ea42822

@@ -615,6 +615,13 @@ func (c *Auto) sendStatus(who string, err error, url string, nm *netmap.NetworkM
// does its thing, which may result in a call back into the client.
metricQueued.Add(1)
c.observerQueue.Add(func() {
c.mu.Lock()
closed := c.closed
c.mu.Unlock()
if closed {
return
}
if canSkipStatus(newSt, c.lastStatus.Load()) {
metricSkippable.Add(1)
if !c.direct.controlKnobs.DisableSkipStatusQueue.Load() {

@@ -323,7 +323,8 @@ type ProfileStateChangeCallback func(_ ipn.LoginProfileView, _ ipn.PrefsView, sa
// [ProfileStateChangeCallback]s are called first.
//
// It returns a function to be called when the cc is being shut down,
// or nil if no cleanup is needed.
// or nil if no cleanup is needed. That cleanup function should not call
// back into LocalBackend, which may be locked during shutdown.
type NewControlClientCallback func(controlclient.Client, ipn.LoginProfileView) (cleanup func())
// Hooks is a collection of hooks that extensions can add to (non-concurrently)
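As a sketch of the contract documented above for NewControlClientCallback (the function below is a hypothetical extension hook; registration via Hooks is not shown), the cleanup it returns tears down only the extension's own state and never calls back into LocalBackend:

package example

import (
	"tailscale.com/control/controlclient"
	"tailscale.com/ipn"
)

// onNewControlClient matches the NewControlClientCallback signature above.
func onNewControlClient(cc controlclient.Client, profile ipn.LoginProfileView) (cleanup func()) {
	stop := make(chan struct{})
	go func() {
		// Hypothetical extension work tied to the lifetime of this
		// control client (for example, logging which profile it serves).
		_ = cc
		_ = profile
		<-stop
	}()
	return func() {
		// Stop our goroutine only; per the doc comment, do not call
		// back into LocalBackend, which may be locked during shutdown.
		close(stop)
	}
}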

File diff suppressed because it is too large.

@@ -1503,15 +1503,6 @@ func wantExitNodeIDNotify(want tailcfg.StableNodeID) wantedNotification {
}
}
func wantStateNotify(want ipn.State) wantedNotification {
return wantedNotification{
name: "State=" + want.String(),
cond: func(_ testing.TB, _ ipnauth.Actor, n *ipn.Notify) bool {
return n.State != nil && *n.State == want
},
}
}
func TestInternalAndExternalInterfaces(t *testing.T) {
type interfacePrefix struct {
i netmon.Interface
@@ -4318,9 +4309,9 @@ func (b *LocalBackend) SetPrefsForTest(newp *ipn.Prefs) {
if newp == nil {
panic("SetPrefsForTest got nil prefs")
}
unlock := b.lockAndGetUnlock()
defer unlock()
b.setPrefsLockedOnEntry(newp, unlock)
b.mu.Lock()
defer b.mu.Unlock()
b.setPrefsLocked(newp)
}
type peerOptFunc func(*tailcfg.Node)
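For context, a minimal sketch of the locking discipline this change moves to (backend below is illustrative, not the real LocalBackend): exported entry points take the mutex with a plain Lock/defer Unlock, and the ...Locked helpers simply require the mutex to be held for their whole duration, rather than receiving an unlock token they might release early as the old ...LockedOnEntry helpers did.

package example

import "sync"

type backend struct {
	mu    sync.Mutex
	prefs string
}

// setPrefsLocked requires b.mu to be held; it never unlocks it.
func (b *backend) setPrefsLocked(p string) {
	b.prefs = p
}

// SetPrefs is the exported entry point: it owns the lock for its whole body.
func (b *backend) SetPrefs(p string) {
	b.mu.Lock()
	defer b.mu.Unlock()
	b.setPrefsLocked(p)
}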
@@ -5808,12 +5799,12 @@ func TestNotificationTargetMatch(t *testing.T) {
type newTestControlFn func(tb testing.TB, opts controlclient.Options) controlclient.Client
func newLocalBackendWithTestControl(t *testing.T, enableLogging bool, newControl newTestControlFn) *LocalBackend {
func newLocalBackendWithTestControl(t testing.TB, enableLogging bool, newControl newTestControlFn) *LocalBackend {
bus := eventbustest.NewBus(t)
return newLocalBackendWithSysAndTestControl(t, enableLogging, tsd.NewSystemWithBus(bus), newControl)
}
func newLocalBackendWithSysAndTestControl(t *testing.T, enableLogging bool, sys *tsd.System, newControl newTestControlFn) *LocalBackend {
func newLocalBackendWithSysAndTestControl(t testing.TB, enableLogging bool, sys *tsd.System, newControl newTestControlFn) *LocalBackend {
logf := logger.Discard
if enableLogging {
logf = tstest.WhileTestRunningLogger(t)

@@ -1542,6 +1542,11 @@ func TestEngineReconfigOnStateChange(t *testing.T) {
tt.steps(t, lb, cc)
}
// TODO(bradfitz): this whole event bus settling thing
// should be unnecessary once the bogus uses of eventbus
// are removed. (https://github.com/tailscale/tailscale/issues/16369)
lb.settleEventBus()
if gotState := lb.State(); gotState != tt.wantState {
t.Errorf("State: got %v; want %v", gotState, tt.wantState)
}
@@ -1572,35 +1577,30 @@
}
}
// TestStateMachineURLRace tests that wgengine updates arriving in the middle of
// TestSendPreservesAuthURL tests that wgengine updates arriving in the middle of
// processing an auth URL don't result in the auth URL being cleared.
func TestStateMachineURLRace(t *testing.T) {
runTestStateMachineURLRace(t, false)
func TestSendPreservesAuthURL(t *testing.T) {
runTestSendPreservesAuthURL(t, false)
}
func TestStateMachineURLRaceSeamless(t *testing.T) {
runTestStateMachineURLRace(t, true)
func TestSendPreservesAuthURLSeamless(t *testing.T) {
runTestSendPreservesAuthURL(t, true)
}
func runTestStateMachineURLRace(t *testing.T, seamless bool) {
func runTestSendPreservesAuthURL(t *testing.T, seamless bool) {
var cc *mockControl
b := newLocalBackendWithTestControl(t, true, func(tb testing.TB, opts controlclient.Options) controlclient.Client {
cc = newClient(t, opts)
return cc
})
nw := newNotificationWatcher(t, b, &ipnauth.TestActor{})
t.Logf("Start")
nw.watch(0, []wantedNotification{
wantStateNotify(ipn.NeedsLogin)})
b.Start(ipn.Options{
UpdatePrefs: &ipn.Prefs{
WantRunning: true,
ControlURL: "https://localhost:1/",
},
})
nw.check()
t.Logf("LoginFinished")
cc.persist.UserProfile.LoginName = "user1"
@@ -1610,72 +1610,16 @@ func runTestStateMachineURLRace(t *testing.T, seamless bool) {
b.sys.ControlKnobs().SeamlessKeyRenewal.Store(true)
}
nw.watch(0, []wantedNotification{
wantStateNotify(ipn.Starting)})
cc.send(sendOpt{loginFinished: true, nm: &netmap.NetworkMap{
SelfNode: (&tailcfg.Node{MachineAuthorized: true}).View(),
}})
nw.check()
t.Logf("Running")
nw.watch(0, []wantedNotification{
wantStateNotify(ipn.Running)})
b.setWgengineStatus(&wgengine.Status{AsOf: time.Now(), DERPs: 1}, nil)
nw.check()
t.Logf("Re-auth (StartLoginInteractive)")
b.StartLoginInteractive(t.Context())
stop := make(chan struct{})
stopSpamming := sync.OnceFunc(func() {
stop <- struct{}{}
})
// if seamless renewal is enabled, the engine won't be disabled, and we won't
// ever call stopSpamming, so make sure it does get called
defer stopSpamming()
// Intercept updates between the engine and localBackend, so that we can see
// when the "stopped" update comes in and ensure we stop sending our "we're
// up" updates after that point.
b.e.SetStatusCallback(func(s *wgengine.Status, err error) {
// This is not one of our fake status updates, this is generated from the
// engine in response to LocalBackend calling RequestStatus. Stop spamming
// our fake statuses.
//
// TODO(zofrex): This is fragile, it works right now but would break if the
// calling pattern of RequestStatus changes. We should ensure that we keep
// sending "we're up" statuses right until Reconfig is called with
// zero-valued configs, and after that point only send "stopped" statuses.
stopSpamming()
// Once stopSpamming returns we are guaranteed to not send any more updates,
// so we can now send the real update (indicating shutdown) and be certain
// it will be received after any fake updates we sent. This is possibly a
// stronger guarantee than we get from the real engine?
b.setWgengineStatus(s, err)
})
// time needs to be >= last time for the status to be accepted, send all our
// spam with the same stale time so that when a real update comes in it will
// definitely be accepted.
time := b.lastStatusTime
// Flood localBackend with a lot of wgengine status updates, so if there are
// any race conditions in the multiple locks/unlocks that happen as we process
// the received auth URL, we will hit them.
go func() {
t.Logf("sending lots of fake wgengine status updates")
for {
select {
case <-stop:
t.Logf("stopping fake wgengine status updates")
return
default:
b.setWgengineStatus(&wgengine.Status{AsOf: time, DERPs: 1}, nil)
}
}
}()
t.Logf("Re-auth (receive URL)")
url1 := "https://localhost:1/1"
cc.send(sendOpt{url: url1})
@@ -1685,122 +1629,11 @@ func runTestStateMachineURLRace(t *testing.T, seamless bool) {
// status update to trample it have ended as well.
if b.authURL == "" {
t.Fatalf("expected authURL to be set")
} else {
t.Log("authURL was set")
}
}
func TestWGEngineDownThenUpRace(t *testing.T) {
var cc *mockControl
b := newLocalBackendWithTestControl(t, true, func(tb testing.TB, opts controlclient.Options) controlclient.Client {
cc = newClient(t, opts)
return cc
})
nw := newNotificationWatcher(t, b, &ipnauth.TestActor{})
t.Logf("Start")
nw.watch(0, []wantedNotification{
wantStateNotify(ipn.NeedsLogin)})
b.Start(ipn.Options{
UpdatePrefs: &ipn.Prefs{
WantRunning: true,
ControlURL: "https://localhost:1/",
},
})
nw.check()
t.Logf("LoginFinished")
cc.persist.UserProfile.LoginName = "user1"
cc.persist.NodeID = "node1"
nw.watch(0, []wantedNotification{
wantStateNotify(ipn.Starting)})
cc.send(sendOpt{loginFinished: true, nm: &netmap.NetworkMap{
SelfNode: (&tailcfg.Node{MachineAuthorized: true}).View(),
}})
nw.check()
nw.watch(0, []wantedNotification{
wantStateNotify(ipn.Running)})
b.setWgengineStatus(&wgengine.Status{AsOf: time.Now(), DERPs: 1}, nil)
nw.check()
t.Logf("Re-auth (StartLoginInteractive)")
b.StartLoginInteractive(t.Context())
var timeLock sync.RWMutex
timestamp := b.lastStatusTime
engineShutdown := make(chan struct{})
gotShutdown := sync.OnceFunc(func() {
t.Logf("engineShutdown")
engineShutdown <- struct{}{}
})
b.e.SetStatusCallback(func(s *wgengine.Status, err error) {
timeLock.Lock()
if s.AsOf.After(timestamp) {
timestamp = s.AsOf
}
timeLock.Unlock()
if err != nil || (s.DERPs == 0 && len(s.Peers) == 0) {
gotShutdown()
} else {
b.setWgengineStatus(s, err)
}
})
t.Logf("Re-auth (receive URL)")
url1 := "https://localhost:1/1"
done := make(chan struct{})
var wg sync.WaitGroup
wg.Go(func() {
t.Log("cc.send starting")
cc.send(sendOpt{url: url1}) // will block until engine stops
t.Log("cc.send returned")
})
<-engineShutdown // will get called once cc.send is blocked
gotShutdown = sync.OnceFunc(func() {
t.Logf("engineShutdown")
engineShutdown <- struct{}{}
})
wg.Go(func() {
t.Log("StartLoginInteractive starting")
b.StartLoginInteractive(t.Context()) // will also block until engine stops
t.Log("StartLoginInteractive returned")
})
<-engineShutdown // will get called once StartLoginInteractive is blocked
st := controlclient.Status{}
st.SetStateForTest(controlclient.StateAuthenticated)
b.SetControlClientStatus(cc, st)
timeLock.RLock()
b.setWgengineStatus(&wgengine.Status{AsOf: timestamp}, nil) // engine is down event finally arrives
b.setWgengineStatus(&wgengine.Status{AsOf: timestamp, DERPs: 1}, nil) // engine is back up
timeLock.RUnlock()
go func() {
wg.Wait()
done <- struct{}{}
}()
t.Log("waiting for .send and .StartLoginInteractive to return")
select {
case <-done:
case <-time.After(10 * time.Second):
t.Fatalf("timed out waiting")
}
t.Log("both returned")
}
func buildNetmapWithPeers(self tailcfg.NodeView, peers ...tailcfg.NodeView) *netmap.NetworkMap {
const (
firstAutoUserID = tailcfg.UserID(10000)
@@ -2033,6 +1866,14 @@ func (e *mockEngine) RequestStatus() {
}
}
func (e *mockEngine) ResetAndStop() (*wgengine.Status, error) {
err := e.Reconfig(&wgcfg.Config{}, &router.Config{}, &dns.Config{})
if err != nil {
return nil, err
}
return &wgengine.Status{AsOf: time.Now()}, nil
}
func (e *mockEngine) PeerByKey(key.NodePublic) (_ wgint.Peer, ok bool) {
return wgint.Peer{}, false
}

@@ -12,6 +12,8 @@ import (
type ExecQueue struct {
mu sync.Mutex
ctx context.Context // context.Background + closed on Shutdown
cancel context.CancelFunc // closes ctx
closed bool
inFlight bool // whether a goroutine is running q.run
doneWaiter chan struct{} // non-nil if waiter is waiting, then closed
@@ -24,6 +26,7 @@ func (q *ExecQueue) Add(f func()) {
if q.closed {
return
}
q.initCtxLocked()
if q.inFlight {
q.queue = append(q.queue, f)
} else {
@@ -79,18 +82,32 @@ func (q *ExecQueue) Shutdown() {
q.mu.Lock()
defer q.mu.Unlock()
q.closed = true
if q.cancel != nil {
q.cancel()
}
}
// Wait waits for the queue to be empty.
func (q *ExecQueue) initCtxLocked() {
if q.ctx == nil {
q.ctx, q.cancel = context.WithCancel(context.Background())
}
}
// Wait waits for the queue to be empty or shut down.
func (q *ExecQueue) Wait(ctx context.Context) error {
q.mu.Lock()
q.initCtxLocked()
waitCh := q.doneWaiter
if q.inFlight && waitCh == nil {
waitCh = make(chan struct{})
q.doneWaiter = waitCh
}
closed := q.closed
q.mu.Unlock()
if closed {
return errors.New("execqueue shut down")
}
if waitCh == nil {
return nil
}
@@ -98,6 +115,8 @@ func (q *ExecQueue) Wait(ctx context.Context) error {
select {
case <-waitCh:
return nil
case <-q.ctx.Done():
return errors.New("execqueue shut down")
case <-ctx.Done():
return ctx.Err()
}
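A usage sketch of the new Wait semantics (assuming the zero value of ExecQueue stays usable, which the lazy context initialization above suggests): Wait returns nil once queued work has drained, and after Shutdown it reports an error instead of blocking until the caller's context expires.

package main

import (
	"context"
	"fmt"

	"tailscale.com/util/execqueue"
)

func main() {
	var q execqueue.ExecQueue
	q.Add(func() { fmt.Println("queued work ran") })

	// Returns nil once the queued function has finished.
	if err := q.Wait(context.Background()); err != nil {
		fmt.Println("wait:", err)
	}

	q.Shutdown()

	// After Shutdown, Wait reports the shutdown rather than hanging.
	if err := q.Wait(context.Background()); err != nil {
		fmt.Println("wait after shutdown:", err) // "execqueue shut down"
	}
}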

@@ -47,6 +47,7 @@ import (
"tailscale.com/types/logger"
"tailscale.com/types/netmap"
"tailscale.com/types/views"
"tailscale.com/util/backoff"
"tailscale.com/util/checkchange"
"tailscale.com/util/clientmetric"
"tailscale.com/util/eventbus"
@@ -924,6 +925,32 @@ func hasOverlap(aips, rips views.Slice[netip.Prefix]) bool {
return false
}
// ResetAndStop resets the engine to a clean state (like calling Reconfig
// with all pointers to zero values) and waits for it to be fully stopped,
// with no live peers or DERPs.
//
// Unlike Reconfig, it does not return ErrNoChanges.
//
// If the engine stops, it returns that final status. NB: this status will not be
// sent to the registered status callback; it is on the caller to ensure this
// status is handled appropriately.
func (e *userspaceEngine) ResetAndStop() (*Status, error) {
if err := e.Reconfig(&wgcfg.Config{}, &router.Config{}, &dns.Config{}); err != nil && !errors.Is(err, ErrNoChanges) {
return nil, err
}
bo := backoff.NewBackoff("UserspaceEngineResetAndStop", e.logf, 1*time.Second)
for {
st, err := e.getStatus()
if err != nil {
return nil, err
}
if len(st.Peers) == 0 && st.DERPs == 0 {
return st, nil
}
bo.BackOff(context.Background(), fmt.Errorf("waiting for engine to stop: peers=%d derps=%d", len(st.Peers), st.DERPs))
}
}
func (e *userspaceEngine) Reconfig(cfg *wgcfg.Config, routerCfg *router.Config, dnsCfg *dns.Config) error {
if routerCfg == nil {
panic("routerCfg must not be nil")

@@ -124,6 +124,12 @@ func (e *watchdogEngine) watchdog(name string, fn func()) {
func (e *watchdogEngine) Reconfig(cfg *wgcfg.Config, routerCfg *router.Config, dnsCfg *dns.Config) error {
return e.watchdogErr("Reconfig", func() error { return e.wrap.Reconfig(cfg, routerCfg, dnsCfg) })
}
func (e *watchdogEngine) ResetAndStop() (st *Status, err error) {
e.watchdog("ResetAndStop", func() {
st, err = e.wrap.ResetAndStop()
})
return st, err
}
func (e *watchdogEngine) GetFilter() *filter.Filter {
return e.wrap.GetFilter()
}

@@ -69,6 +69,13 @@ type Engine interface {
// The returned error is ErrNoChanges if no changes were made.
Reconfig(*wgcfg.Config, *router.Config, *dns.Config) error
// ResetAndStop resets the engine to a clean state (like calling Reconfig
// with all pointers to zero values) and waits for it to be fully stopped,
// with no live peers or DERPs.
//
// Unlike Reconfig, it does not return ErrNoChanges.
ResetAndStop() (*Status, error)
// PeerForIP returns the node to which the provided IP routes,
// if any. If none is found, (nil, false) is returned.
PeerForIP(netip.Addr) (_ PeerForIP, ok bool)
