ipn/ipnlocal: make tkaSyncIfNeeded exclusive with a mutex

Running corp/ipn#TestNetworkLockE2E has a 1/300 chance of failing, and
desk-checking suggests that what's happening is two netmaps racing each
other to be processed through tkaSyncIfNeededLocked. This happens in the
first place because we release b.mu during network RPCs.

To fix this, we make the TKA sync logic an exclusive section, so two
netmaps will need to wait for TKA sync to complete serially (which is what
we would want anyway, as the second run through probably won't need to
sync).

Signed-off-by: Tom DNetto <tom@tailscale.com>
commit a515fc517b (pull/5945/head)
parent 227777154a
Author: Tom DNetto <tom@tailscale.com>
Committer: Tom
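
As an illustration of the approach described in the commit message, here is a minimal, self-contained sketch of the exclusive-section pattern. The names (backend, syncLock, syncIfNeeded) are placeholders, not the real ipn/ipnlocal identifiers: the outer mutex is held for the whole sync, while the state mutex is still dropped around the simulated network RPC.

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

type backend struct {
	mu       sync.Mutex // guards state
	syncLock sync.Mutex // serializes syncIfNeeded end to end
	state    int
}

// syncIfNeeded holds syncLock for its entire duration, so two "netmaps"
// arriving in quick succession are processed serially even though mu is
// released around the simulated network RPC.
func (b *backend) syncIfNeeded(delta int) {
	b.syncLock.Lock() // taken before mu, never while holding mu
	defer b.syncLock.Unlock()

	b.mu.Lock()
	cur := b.state
	b.mu.Unlock() // drop mu across the "RPC"

	time.Sleep(10 * time.Millisecond) // stand-in for a control-plane round trip

	b.mu.Lock()
	b.state = cur + delta
	b.mu.Unlock()
}

func main() {
	var b backend
	var wg sync.WaitGroup
	for i := 0; i < 2; i++ { // two netmaps racing each other
		wg.Add(1)
		go func() {
			defer wg.Done()
			b.syncIfNeeded(1)
		}()
	}
	wg.Wait()
	fmt.Println(b.state) // always 2; without syncLock it could be 1
}
```

Without syncLock, both goroutines can read the same starting state across the dropped mu and one update is lost — the kind of race the commit message describes.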

@@ -198,6 +198,14 @@ type LocalBackend struct {
// dialPlan is any dial plan that we've received from the control
// server during a previous connection; it is cleared on logout.
dialPlan atomic.Pointer[tailcfg.ControlDialPlan]
// tkaSyncLock is used to make tkaSyncIfNeeded an exclusive
// section. This is needed to stop two map-responses in quick succession
// from racing each other through TKA sync logic / RPCs.
//
// tkaSyncLock MUST be taken before mu (or inversely, mu must not be held
// at the moment that tkaSyncLock is taken).
tkaSyncLock sync.Mutex
}
// clientGen is a func that creates a control plane client.
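
A small sketch of why the ordering rule in the new comment matters, using placeholder locks rather than the real LocalBackend fields: as long as every path takes the outer lock before mu, the two mutexes can never be acquired in opposite orders by two goroutines.

```go
package main

import "sync"

type backend struct {
	mu       sync.Mutex
	syncLock sync.Mutex
}

// correct acquires syncLock first and mu second, matching the rule in the
// comment above; paths that all use this order cannot deadlock on each other.
func (b *backend) correct() {
	b.syncLock.Lock()
	defer b.syncLock.Unlock()
	b.mu.Lock()
	defer b.mu.Unlock()
	// ... touch state guarded by mu ...
}

// wrong acquires the locks in the opposite order. If one goroutine runs
// correct() while another runs wrong(), each can end up holding the lock the
// other is waiting for — a classic lock-order deadlock.
func (b *backend) wrong() {
	b.mu.Lock()
	defer b.mu.Unlock()
	b.syncLock.Lock()
	defer b.syncLock.Unlock()
}

func main() {
	var b backend
	b.correct() // wrong() is shown only to illustrate the hazard
}
```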
@@ -775,9 +783,12 @@ func (b *LocalBackend) setClientStatus(st controlclient.Status) {
}
}
if st.NetMap != nil {
if err := b.tkaSyncIfNeededLocked(st.NetMap); err != nil {
b.mu.Unlock() // respect locking rules for tkaSyncIfNeeded
if err := b.tkaSyncIfNeeded(st.NetMap); err != nil {
b.logf("[v1] TKA sync error: %v", err)
}
b.mu.Lock()
if !envknob.TKASkipSignatureCheck() {
b.tkaFilterNetmapLocked(st.NetMap)
}
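
The hunk above steps out of b.mu before calling tkaSyncIfNeeded and back in afterwards. A minimal sketch of that caller-side shape, with hypothetical handle/process methods standing in for setClientStatus and tkaSyncIfNeeded:

```go
package main

import (
	"log"
	"sync"
)

type backend struct {
	mu sync.Mutex
	n  int
}

// handle does some work under b.mu, then steps out of b.mu before calling a
// method that takes its own locks (and may take b.mu itself), and re-locks
// afterwards — mirroring the Unlock/Lock pair around tkaSyncIfNeeded above.
func (b *backend) handle() {
	b.mu.Lock()
	b.n++ // work that needs b.mu

	b.mu.Unlock() // respect the callee's locking rules
	if err := b.process(); err != nil {
		log.Printf("process error: %v", err)
	}
	b.mu.Lock()

	b.n++ // remaining work, back under b.mu
	b.mu.Unlock()
}

// process manages its own locking, so it must not be entered with b.mu held.
func (b *backend) process() error {
	b.mu.Lock()
	defer b.mu.Unlock()
	return nil
}

func main() {
	var b backend
	b.handle()
}
```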

@@ -39,6 +39,8 @@ type tkaState struct {
// tkaFilterNetmapLocked checks the signatures on each node key, dropping
// nodes from the netmap whose signature does not verify.
//
// b.mu must be held.
func (b *LocalBackend) tkaFilterNetmapLocked(nm *netmap.NetworkMap) {
if !envknob.UseWIPCode() {
return // Feature-flag till network-lock is in Alpha.
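
For illustration only, a tiny sketch of the kind of in-place filtering the comment above describes (dropping peers that fail a check); peer and sigOK are made-up stand-ins, not the real netmap types, and the slice-reuse style is only similar in spirit to assigning a filtered slice back via nm.Peers = peers in the next hunk.

```go
package main

import "fmt"

type peer struct {
	name  string
	sigOK bool // stand-in for a real node-key signature check
}

// filterPeers keeps only peers whose check passes, reusing the input's
// backing array so no extra allocation is needed.
func filterPeers(peers []peer) []peer {
	kept := peers[:0]
	for _, p := range peers {
		if p.sigOK {
			kept = append(kept, p)
		}
	}
	return kept
}

func main() {
	peers := []peer{{"a", true}, {"b", false}, {"c", true}}
	fmt.Println(filterPeers(peers)) // [{a true} {c true}]
}
```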
@@ -70,7 +72,7 @@ func (b *LocalBackend) tkaFilterNetmapLocked(nm *netmap.NetworkMap) {
nm.Peers = peers
}
// tkaSyncIfNeededLocked examines TKA info reported from the control plane,
// tkaSyncIfNeeded examines TKA info reported from the control plane,
// performing the steps necessary to synchronize local tka state.
//
// There are 4 scenarios handled here:
@@ -85,13 +87,19 @@ func (b *LocalBackend) tkaFilterNetmapLocked(nm *netmap.NetworkMap) {
// - Everything up to date: All other cases.
// ∴ no action necessary.
//
// b.mu must be held. b.mu will be stepped out of (and back in) during network
// RPCs.
func (b *LocalBackend) tkaSyncIfNeededLocked(nm *netmap.NetworkMap) error {
// tkaSyncIfNeeded immediately takes b.tkaSyncLock, which is held throughout,
// and may take b.mu as required.
func (b *LocalBackend) tkaSyncIfNeeded(nm *netmap.NetworkMap) error {
if !envknob.UseWIPCode() {
// If the feature flag is not enabled, pretend we don't exist.
return nil
}
b.tkaSyncLock.Lock() // take tkaSyncLock to make this function an exclusive section.
defer b.tkaSyncLock.Unlock()
b.mu.Lock() // take mu to protect access to synchronized fields.
defer b.mu.Unlock()
ourNodeKey := b.prefs.Persist.PrivateNodeKey.Public()
isEnabled := b.tka != nil
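
A hedged skeleton of the four-scenario decision the doc comment above refers to. isEnabled mirrors the `b.tka != nil` check visible in the hunk; wantEnabled, headsMatch and the helper functions are assumptions for illustration, not the real implementation.

```go
package main

import "fmt"

// Hypothetical stand-ins for the real enablement, disablement and sync paths.
func bootstrap() error { return nil }
func disable() error   { return nil }
func doSync() error    { return nil }

// syncIfNeeded sketches the four scenarios: enablement, disablement, sync
// needed, and everything up to date (the only one quoted in the hunk above).
func syncIfNeeded(isEnabled, wantEnabled, headsMatch bool) error {
	switch {
	case !isEnabled && wantEnabled:
		return bootstrap() // control reports TKA enabled, no local state yet
	case isEnabled && !wantEnabled:
		return disable() // control reports TKA disabled, tear down local state
	case isEnabled && wantEnabled && !headsMatch:
		return doSync() // both enabled, but at different heads
	default:
		return nil // everything up to date: no action necessary
	}
}

func main() {
	fmt.Println(syncIfNeeded(true, true, true)) // <nil>: nothing to do
}
```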
@@ -158,6 +166,8 @@ func toSyncOffer(head string, ancestors []string) (tka.SyncOffer, error) {
// tkaSyncLocked synchronizes TKA state with control. b.mu must be held
// and tka must be initialized. b.mu will be stepped out of (and back into)
// during network RPCs.
//
// b.mu must be held.
func (b *LocalBackend) tkaSyncLocked(ourNodeKey key.NodePublic) error {
offer, err := b.tka.authority.SyncOffer(b.tka.storage)
if err != nil {
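
tkaSyncLocked is documented above as requiring b.mu on entry while stepping out of it during network RPCs. A minimal sketch of that callee-side contract, using placeholder names rather than the real tka code:

```go
package main

import (
	"sync"
	"time"
)

type backend struct {
	mu    sync.Mutex
	heads []string
}

// syncLocked must be entered with b.mu held. It snapshots state, steps out of
// b.mu for the duration of the (simulated) network RPC, then re-locks to merge
// the result — the same contract the doc comment above describes.
func (b *backend) syncLocked() error {
	local := append([]string(nil), b.heads...) // copy state while b.mu is held

	b.mu.Unlock()
	remote := fetchHeads() // the RPC runs without b.mu held
	b.mu.Lock()

	b.heads = append(local, remote...) // merge back under b.mu
	return nil
}

// fetchHeads stands in for a round trip to the control plane.
func fetchHeads() []string {
	time.Sleep(5 * time.Millisecond)
	return []string{"remote-head"}
}

func main() {
	var b backend
	b.mu.Lock()
	_ = b.syncLocked()
	b.mu.Unlock()
}
```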

@@ -127,12 +127,10 @@ func TestTKAEnablementFlow(t *testing.T) {
},
}
b.mu.Lock()
err = b.tkaSyncIfNeededLocked(&netmap.NetworkMap{
err = b.tkaSyncIfNeeded(&netmap.NetworkMap{
TKAEnabled: true,
TKAHead: a1.Head(),
})
b.mu.Unlock()
if err != nil {
t.Errorf("tkaSyncIfNeededLocked() failed: %v", err)
}
@@ -228,12 +226,10 @@ func TestTKADisablementFlow(t *testing.T) {
// Test that the wrong disablement secret does not shut down the authority.
returnWrongSecret = true
b.mu.Lock()
err = b.tkaSyncIfNeededLocked(&netmap.NetworkMap{
err = b.tkaSyncIfNeeded(&netmap.NetworkMap{
TKAEnabled: false,
TKAHead: authority.Head(),
})
b.mu.Unlock()
if err != nil {
t.Errorf("tkaSyncIfNeededLocked() failed: %v", err)
}
@@ -243,12 +239,10 @@ func TestTKADisablementFlow(t *testing.T) {
// Test the correct disablement secret shuts down the authority.
returnWrongSecret = false
b.mu.Lock()
err = b.tkaSyncIfNeededLocked(&netmap.NetworkMap{
err = b.tkaSyncIfNeeded(&netmap.NetworkMap{
TKAEnabled: false,
TKAHead: authority.Head(),
})
b.mu.Unlock()
if err != nil {
t.Errorf("tkaSyncIfNeededLocked() failed: %v", err)
}
@@ -468,12 +462,10 @@ func TestTKASync(t *testing.T) {
}
// Finally, lets trigger a sync.
b.mu.Lock()
err = b.tkaSyncIfNeededLocked(&netmap.NetworkMap{
err = b.tkaSyncIfNeeded(&netmap.NetworkMap{
TKAEnabled: true,
TKAHead: controlAuthority.Head(),
})
b.mu.Unlock()
if err != nil {
t.Errorf("tkaSyncIfNeededLocked() failed: %v", err)
}
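
The test changes above all follow the same shape: drop the b.mu.Lock()/Unlock() bracketing because the renamed helper now manages its own locking. A minimal test-file sketch of that calling convention (backend and syncIfNeeded are placeholders):

```go
package main

import (
	"sync"
	"testing"
)

type backend struct {
	mu       sync.Mutex
	syncLock sync.Mutex
}

// syncIfNeeded takes its own locks, so callers (including tests) must not
// hold b.mu when calling it.
func (b *backend) syncIfNeeded() error {
	b.syncLock.Lock()
	defer b.syncLock.Unlock()
	b.mu.Lock()
	defer b.mu.Unlock()
	return nil
}

// TestSyncIfNeeded calls the helper directly, with no mu.Lock/Unlock
// bracketing — the shape the updated tests above use.
func TestSyncIfNeeded(t *testing.T) {
	var b backend
	if err := b.syncIfNeeded(); err != nil {
		t.Errorf("syncIfNeeded() failed: %v", err)
	}
}
```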
