ipn/ipnlocal: remove all the weird locking (LockedOnEntry, UnlockEarly, etc)

Fixes #11649
Updates #16369

Co-authored-by: James Sanderson <jsanderson@tailscale.com>
Change-Id: I63eaa18fe870ddf81d84b949efac4d1b44c3db86
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
Branch: pull/17889/head
Author: Brad Fitzpatrick (committed by Brad Fitzpatrick)
Parent: 08e74effc0
Commit: 146ea42822

@@ -615,6 +615,13 @@ func (c *Auto) sendStatus(who string, err error, url string, nm *netmap.NetworkM
// does its thing, which may result in a call back into the client.
metricQueued.Add(1)
c.observerQueue.Add(func() {
c.mu.Lock()
closed := c.closed
c.mu.Unlock()
if closed {
return
}
if canSkipStatus(newSt, c.lastStatus.Load()) {
metricSkippable.Add(1)
if !c.direct.controlKnobs.DisableSkipStatusQueue.Load() {

@@ -323,7 +323,8 @@ type ProfileStateChangeCallback func(_ ipn.LoginProfileView, _ ipn.PrefsView, sa
// [ProfileStateChangeCallback]s are called first.
//
// It returns a function to be called when the cc is being shut down,
// or nil if no cleanup is needed.
// or nil if no cleanup is needed. That cleanup function should not call
// back into LocalBackend, which may be locked during shutdown.
type NewControlClientCallback func(controlclient.Client, ipn.LoginProfileView) (cleanup func())
// Hooks is a collection of hooks that extensions can add to (non-concurrently)
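As a sketch of the contract documented above for NewControlClientCallback (the function below is a hypothetical extension hook; registration via Hooks is not shown), the cleanup it returns tears down only the extension's own state and never calls back into LocalBackend:

package example

import (
	"tailscale.com/control/controlclient"
	"tailscale.com/ipn"
)

// onNewControlClient matches the NewControlClientCallback signature above.
func onNewControlClient(cc controlclient.Client, profile ipn.LoginProfileView) (cleanup func()) {
	stop := make(chan struct{})
	go func() {
		// Hypothetical extension work tied to the lifetime of this
		// control client (for example, logging which profile it serves).
		_ = cc
		_ = profile
		<-stop
	}()
	return func() {
		// Stop our goroutine only; per the doc comment, do not call
		// back into LocalBackend, which may be locked during shutdown.
		close(stop)
	}
}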

File diff suppressed because it is too large.

@@ -1503,15 +1503,6 @@ func wantExitNodeIDNotify(want tailcfg.StableNodeID) wantedNotification {
}
}
func wantStateNotify(want ipn.State) wantedNotification {
return wantedNotification{
name: "State=" + want.String(),
cond: func(_ testing.TB, _ ipnauth.Actor, n *ipn.Notify) bool {
return n.State != nil && *n.State == want
},
}
}
func TestInternalAndExternalInterfaces(t *testing.T) {
type interfacePrefix struct {
i netmon.Interface
@@ -4318,9 +4309,9 @@ func (b *LocalBackend) SetPrefsForTest(newp *ipn.Prefs) {
if newp == nil {
panic("SetPrefsForTest got nil prefs")
}
unlock := b.lockAndGetUnlock()
defer unlock()
b.setPrefsLockedOnEntry(newp, unlock)
b.mu.Lock()
defer b.mu.Unlock()
b.setPrefsLocked(newp)
}
type peerOptFunc func(*tailcfg.Node)
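For context, a minimal sketch of the locking discipline this change moves to (backend below is illustrative, not the real LocalBackend): exported entry points take the mutex with a plain Lock/defer Unlock, and the ...Locked helpers simply require the mutex to be held for their whole duration, rather than receiving an unlock token they might release early as the old ...LockedOnEntry helpers did.

package example

import "sync"

type backend struct {
	mu    sync.Mutex
	prefs string
}

// setPrefsLocked requires b.mu to be held; it never unlocks it.
func (b *backend) setPrefsLocked(p string) {
	b.prefs = p
}

// SetPrefs is the exported entry point: it owns the lock for its whole body.
func (b *backend) SetPrefs(p string) {
	b.mu.Lock()
	defer b.mu.Unlock()
	b.setPrefsLocked(p)
}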
@@ -5808,12 +5799,12 @@ func TestNotificationTargetMatch(t *testing.T) {
type newTestControlFn func(tb testing.TB, opts controlclient.Options) controlclient.Client
func newLocalBackendWithTestControl(t *testing.T, enableLogging bool, newControl newTestControlFn) *LocalBackend {
func newLocalBackendWithTestControl(t testing.TB, enableLogging bool, newControl newTestControlFn) *LocalBackend {
bus := eventbustest.NewBus(t)
return newLocalBackendWithSysAndTestControl(t, enableLogging, tsd.NewSystemWithBus(bus), newControl)
}
func newLocalBackendWithSysAndTestControl(t *testing.T, enableLogging bool, sys *tsd.System, newControl newTestControlFn) *LocalBackend {
func newLocalBackendWithSysAndTestControl(t testing.TB, enableLogging bool, sys *tsd.System, newControl newTestControlFn) *LocalBackend {
logf := logger.Discard
if enableLogging {
logf = tstest.WhileTestRunningLogger(t)

@@ -1542,6 +1542,11 @@ func TestEngineReconfigOnStateChange(t *testing.T) {
tt.steps(t, lb, cc)
}
// TODO(bradfitz): this whole event bus settling thing
// should be unnecessary once the bogus uses of eventbus
// are removed. (https://github.com/tailscale/tailscale/issues/16369)
lb.settleEventBus()
if gotState := lb.State(); gotState != tt.wantState {
t.Errorf("State: got %v; want %v", gotState, tt.wantState)
}
@@ -1572,35 +1577,30 @@
}
}
// TestStateMachineURLRace tests that wgengine updates arriving in the middle of
// TestSendPreservesAuthURL tests that wgengine updates arriving in the middle of
// processing an auth URL don't result in the auth URL being cleared.
func TestStateMachineURLRace(t *testing.T) {
runTestStateMachineURLRace(t, false)
func TestSendPreservesAuthURL(t *testing.T) {
runTestSendPreservesAuthURL(t, false)
}
func TestStateMachineURLRaceSeamless(t *testing.T) {
runTestStateMachineURLRace(t, true)
func TestSendPreservesAuthURLSeamless(t *testing.T) {
runTestSendPreservesAuthURL(t, true)
}
func runTestStateMachineURLRace(t *testing.T, seamless bool) {
func runTestSendPreservesAuthURL(t *testing.T, seamless bool) {
var cc *mockControl
b := newLocalBackendWithTestControl(t, true, func(tb testing.TB, opts controlclient.Options) controlclient.Client {
cc = newClient(t, opts)
return cc
})
nw := newNotificationWatcher(t, b, &ipnauth.TestActor{})
t.Logf("Start")
nw.watch(0, []wantedNotification{
wantStateNotify(ipn.NeedsLogin)})
b.Start(ipn.Options{
UpdatePrefs: &ipn.Prefs{
WantRunning: true,
ControlURL: "https://localhost:1/",
},
})
nw.check()
t.Logf("LoginFinished")
cc.persist.UserProfile.LoginName = "user1"
@@ -1610,72 +1610,16 @@ func runTestStateMachineURLRace(t *testing.T, seamless bool) {
b.sys.ControlKnobs().SeamlessKeyRenewal.Store(true)
}
nw.watch(0, []wantedNotification{
wantStateNotify(ipn.Starting)})
cc.send(sendOpt{loginFinished: true, nm: &netmap.NetworkMap{
SelfNode: (&tailcfg.Node{MachineAuthorized: true}).View(),
}})
nw.check()
t.Logf("Running")
nw.watch(0, []wantedNotification{
wantStateNotify(ipn.Running)})
b.setWgengineStatus(&wgengine.Status{AsOf: time.Now(), DERPs: 1}, nil)
nw.check()
t.Logf("Re-auth (StartLoginInteractive)")
b.StartLoginInteractive(t.Context())
stop := make(chan struct{})
stopSpamming := sync.OnceFunc(func() {
stop <- struct{}{}
})
// if seamless renewal is enabled, the engine won't be disabled, and we won't
// ever call stopSpamming, so make sure it does get called
defer stopSpamming()
// Intercept updates between the engine and localBackend, so that we can see
// when the "stopped" update comes in and ensure we stop sending our "we're
// up" updates after that point.
b.e.SetStatusCallback(func(s *wgengine.Status, err error) {
// This is not one of our fake status updates, this is generated from the
// engine in response to LocalBackend calling RequestStatus. Stop spamming
// our fake statuses.
//
// TODO(zofrex): This is fragile, it works right now but would break if the
// calling pattern of RequestStatus changes. We should ensure that we keep
// sending "we're up" statuses right until Reconfig is called with
// zero-valued configs, and after that point only send "stopped" statuses.
stopSpamming()
// Once stopSpamming returns we are guaranteed to not send any more updates,
// so we can now send the real update (indicating shutdown) and be certain
// it will be received after any fake updates we sent. This is possibly a
// stronger guarantee than we get from the real engine?
b.setWgengineStatus(s, err)
})
// time needs to be >= last time for the status to be accepted, send all our
// spam with the same stale time so that when a real update comes in it will
// definitely be accepted.
time := b.lastStatusTime
// Flood localBackend with a lot of wgengine status updates, so if there are
// any race conditions in the multiple locks/unlocks that happen as we process
// the received auth URL, we will hit them.
go func() {
t.Logf("sending lots of fake wgengine status updates")
for {
select {
case <-stop:
t.Logf("stopping fake wgengine status updates")
return
default:
b.setWgengineStatus(&wgengine.Status{AsOf: time, DERPs: 1}, nil)
}
}
}()
t.Logf("Re-auth (receive URL)")
url1 := "https://localhost:1/1"
cc.send(sendOpt{url: url1})
@@ -1685,122 +1629,11 @@ func runTestStateMachineURLRace(t *testing.T, seamless bool) {
// status update to trample it have ended as well.
if b.authURL == "" {
t.Fatalf("expected authURL to be set")
} else {
t.Log("authURL was set")
}
}
func TestWGEngineDownThenUpRace(t *testing.T) {
var cc *mockControl
b := newLocalBackendWithTestControl(t, true, func(tb testing.TB, opts controlclient.Options) controlclient.Client {
cc = newClient(t, opts)
return cc
})
nw := newNotificationWatcher(t, b, &ipnauth.TestActor{})
t.Logf("Start")
nw.watch(0, []wantedNotification{
wantStateNotify(ipn.NeedsLogin)})
b.Start(ipn.Options{
UpdatePrefs: &ipn.Prefs{
WantRunning: true,
ControlURL: "https://localhost:1/",
},
})
nw.check()
t.Logf("LoginFinished")
cc.persist.UserProfile.LoginName = "user1"
cc.persist.NodeID = "node1"
nw.watch(0, []wantedNotification{
wantStateNotify(ipn.Starting)})
cc.send(sendOpt{loginFinished: true, nm: &netmap.NetworkMap{
SelfNode: (&tailcfg.Node{MachineAuthorized: true}).View(),
}})
nw.check()
nw.watch(0, []wantedNotification{
wantStateNotify(ipn.Running)})
b.setWgengineStatus(&wgengine.Status{AsOf: time.Now(), DERPs: 1}, nil)
nw.check()
t.Logf("Re-auth (StartLoginInteractive)")
b.StartLoginInteractive(t.Context())
var timeLock sync.RWMutex
timestamp := b.lastStatusTime
engineShutdown := make(chan struct{})
gotShutdown := sync.OnceFunc(func() {
t.Logf("engineShutdown")
engineShutdown <- struct{}{}
})
b.e.SetStatusCallback(func(s *wgengine.Status, err error) {
timeLock.Lock()
if s.AsOf.After(timestamp) {
timestamp = s.AsOf
}
timeLock.Unlock()
if err != nil || (s.DERPs == 0 && len(s.Peers) == 0) {
gotShutdown()
} else {
b.setWgengineStatus(s, err)
}
})
t.Logf("Re-auth (receive URL)")
url1 := "https://localhost:1/1"
done := make(chan struct{})
var wg sync.WaitGroup
wg.Go(func() {
t.Log("cc.send starting")
cc.send(sendOpt{url: url1}) // will block until engine stops
t.Log("cc.send returned")
})
<-engineShutdown // will get called once cc.send is blocked
gotShutdown = sync.OnceFunc(func() {
t.Logf("engineShutdown")
engineShutdown <- struct{}{}
})
wg.Go(func() {
t.Log("StartLoginInteractive starting")
b.StartLoginInteractive(t.Context()) // will also block until engine stops
t.Log("StartLoginInteractive returned")
})
<-engineShutdown // will get called once StartLoginInteractive is blocked
st := controlclient.Status{}
st.SetStateForTest(controlclient.StateAuthenticated)
b.SetControlClientStatus(cc, st)
timeLock.RLock()
b.setWgengineStatus(&wgengine.Status{AsOf: timestamp}, nil) // engine is down event finally arrives
b.setWgengineStatus(&wgengine.Status{AsOf: timestamp, DERPs: 1}, nil) // engine is back up
timeLock.RUnlock()
go func() {
wg.Wait()
done <- struct{}{}
}()
t.Log("waiting for .send and .StartLoginInteractive to return")
select {
case <-done:
case <-time.After(10 * time.Second):
t.Fatalf("timed out waiting")
}
t.Log("both returned")
}
func buildNetmapWithPeers(self tailcfg.NodeView, peers ...tailcfg.NodeView) *netmap.NetworkMap {
const (
firstAutoUserID = tailcfg.UserID(10000)
@@ -2033,6 +1866,14 @@ func (e *mockEngine) RequestStatus() {
}
}
func (e *mockEngine) ResetAndStop() (*wgengine.Status, error) {
err := e.Reconfig(&wgcfg.Config{}, &router.Config{}, &dns.Config{})
if err != nil {
return nil, err
}
return &wgengine.Status{AsOf: time.Now()}, nil
}
func (e *mockEngine) PeerByKey(key.NodePublic) (_ wgint.Peer, ok bool) {
return wgint.Peer{}, false
}

@@ -12,6 +12,8 @@ import (
type ExecQueue struct {
mu sync.Mutex
ctx context.Context // context.Background + closed on Shutdown
cancel context.CancelFunc // closes ctx
closed bool
inFlight bool // whether a goroutine is running q.run
doneWaiter chan struct{} // non-nil if waiter is waiting, then closed
@@ -24,6 +26,7 @@ func (q *ExecQueue) Add(f func()) {
if q.closed {
return
}
q.initCtxLocked()
if q.inFlight {
q.queue = append(q.queue, f)
} else {
@@ -79,18 +82,32 @@ func (q *ExecQueue) Shutdown() {
q.mu.Lock()
defer q.mu.Unlock()
q.closed = true
if q.cancel != nil {
q.cancel()
}
}
// Wait waits for the queue to be empty.
func (q *ExecQueue) initCtxLocked() {
if q.ctx == nil {
q.ctx, q.cancel = context.WithCancel(context.Background())
}
}
// Wait waits for the queue to be empty or shut down.
func (q *ExecQueue) Wait(ctx context.Context) error {
q.mu.Lock()
q.initCtxLocked()
waitCh := q.doneWaiter
if q.inFlight && waitCh == nil {
waitCh = make(chan struct{})
q.doneWaiter = waitCh
}
closed := q.closed
q.mu.Unlock()
if closed {
return errors.New("execqueue shut down")
}
if waitCh == nil {
return nil
}
@@ -98,6 +115,8 @@ func (q *ExecQueue) Wait(ctx context.Context) error {
select {
case <-waitCh:
return nil
case <-q.ctx.Done():
return errors.New("execqueue shut down")
case <-ctx.Done():
return ctx.Err()
}
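A usage sketch of the new Wait semantics (assuming the zero value of ExecQueue stays usable, which the lazy context initialization above suggests): Wait returns nil once queued work has drained, and after Shutdown it reports an error instead of blocking until the caller's context expires.

package main

import (
	"context"
	"fmt"

	"tailscale.com/util/execqueue"
)

func main() {
	var q execqueue.ExecQueue
	q.Add(func() { fmt.Println("queued work ran") })

	// Returns nil once the queued function has finished.
	if err := q.Wait(context.Background()); err != nil {
		fmt.Println("wait:", err)
	}

	q.Shutdown()

	// After Shutdown, Wait reports the shutdown rather than hanging.
	if err := q.Wait(context.Background()); err != nil {
		fmt.Println("wait after shutdown:", err) // "execqueue shut down"
	}
}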

@@ -47,6 +47,7 @@ import (
"tailscale.com/types/logger"
"tailscale.com/types/netmap"
"tailscale.com/types/views"
"tailscale.com/util/backoff"
"tailscale.com/util/checkchange"
"tailscale.com/util/clientmetric"
"tailscale.com/util/eventbus"
@@ -924,6 +925,32 @@ func hasOverlap(aips, rips views.Slice[netip.Prefix]) bool {
return false
}
// ResetAndStop resets the engine to a clean state (like calling Reconfig
// with all pointers to zero values) and waits for it to be fully stopped,
// with no live peers or DERPs.
//
// Unlike Reconfig, it does not return ErrNoChanges.
//
// If the engine stops, it returns that final status. NB: this status will not be
// sent to the registered status callback; it is on the caller to ensure this
// status is handled appropriately.
func (e *userspaceEngine) ResetAndStop() (*Status, error) {
if err := e.Reconfig(&wgcfg.Config{}, &router.Config{}, &dns.Config{}); err != nil && !errors.Is(err, ErrNoChanges) {
return nil, err
}
bo := backoff.NewBackoff("UserspaceEngineResetAndStop", e.logf, 1*time.Second)
for {
st, err := e.getStatus()
if err != nil {
return nil, err
}
if len(st.Peers) == 0 && st.DERPs == 0 {
return st, nil
}
bo.BackOff(context.Background(), fmt.Errorf("waiting for engine to stop: peers=%d derps=%d", len(st.Peers), st.DERPs))
}
}
func (e *userspaceEngine) Reconfig(cfg *wgcfg.Config, routerCfg *router.Config, dnsCfg *dns.Config) error {
if routerCfg == nil {
panic("routerCfg must not be nil")

@@ -124,6 +124,12 @@ func (e *watchdogEngine) watchdog(name string, fn func()) {
func (e *watchdogEngine) Reconfig(cfg *wgcfg.Config, routerCfg *router.Config, dnsCfg *dns.Config) error {
return e.watchdogErr("Reconfig", func() error { return e.wrap.Reconfig(cfg, routerCfg, dnsCfg) })
}
func (e *watchdogEngine) ResetAndStop() (st *Status, err error) {
e.watchdog("ResetAndStop", func() {
st, err = e.wrap.ResetAndStop()
})
return st, err
}
func (e *watchdogEngine) GetFilter() *filter.Filter {
return e.wrap.GetFilter()
}

@@ -69,6 +69,13 @@ type Engine interface {
// The returned error is ErrNoChanges if no changes were made.
Reconfig(*wgcfg.Config, *router.Config, *dns.Config) error
// ResetAndStop resets the engine to a clean state (like calling Reconfig
// with all pointers to zero values) and waits for it to be fully stopped,
// with no live peers or DERPs.
//
// Unlike Reconfig, it does not return ErrNoChanges.
ResetAndStop() (*Status, error)
// PeerForIP returns the node to which the provided IP routes,
// if any. If none is found, (nil, false) is returned.
PeerForIP(netip.Addr) (_ PeerForIP, ok bool)
