control/controlclient: improve handling of concurrent lite map requests

Prior to this change, if we were in the middle of a lite map update we'd
tear down the entire map session and restart it. With this change, we'll
cancel an in-flight lite map request up to 10 times and restart before
we tear down the streaming map request. We tear down everything after 10
retries to ensure that a steady stream of calls to sendNewMapRequest
doesn't fail to make progress by repeatedly canceling and restarting.

Signed-off-by: Andrew Dunham <andrew@du.nham.ca>
Co-authored-by: Maisem Ali <maisem@tailscale.com>
Change-Id: I9392bf8cf674e7a58ccd1e476039300a359ef3b1
pull/6660/merge
Andrew Dunham 1 year ago
parent b0cb39cda1
commit 48f6c1eba4

@ -66,6 +66,8 @@ type Auto struct {
synced bool // true if our netmap is up-to-date synced bool // true if our netmap is up-to-date
inPollNetMap bool // true if currently running a PollNetMap inPollNetMap bool // true if currently running a PollNetMap
inLiteMapUpdate bool // true if a lite (non-streaming) map request is outstanding inLiteMapUpdate bool // true if a lite (non-streaming) map request is outstanding
liteMapUpdateCancel func() // cancels a lite map update
liteMapUpdateCancels int // how many times we've canceled a lite map update
inSendStatus int // number of sendStatus calls currently in progress inSendStatus int // number of sendStatus calls currently in progress
state State state State
@ -168,28 +170,55 @@ func (c *Auto) Start() {
func (c *Auto) sendNewMapRequest() { func (c *Auto) sendNewMapRequest() {
c.mu.Lock() c.mu.Lock()
// If we're not already streaming a netmap, or if we're already stuck // If we're not already streaming a netmap, then tear down everything
// in a lite update, then tear down everything and start a new stream // and start a new stream (which starts by sending a new map request)
// (which starts by sending a new map request) if !c.inPollNetMap || !c.loggedIn {
if !c.inPollNetMap || c.inLiteMapUpdate || !c.loggedIn {
c.mu.Unlock() c.mu.Unlock()
c.cancelMapSafely() c.cancelMapSafely()
return return
} }
// If we are already in process of doing a LiteMapUpdate, cancel it and
// try a new one. If this is the 10th time we have done this
// cancelation, tear down everything and start again
if c.inLiteMapUpdate {
// Always cancel the in-flight lite map update, regardless of
// whether we cancel the streaming map request or not.
c.liteMapUpdateCancel()
c.inLiteMapUpdate = false
if c.liteMapUpdateCancels > 10 {
// Not making progress
c.mu.Unlock()
c.cancelMapSafely()
return
}
// Increment our cancel counter and continue below to start a
// new lite update.
c.liteMapUpdateCancels++
}
// Otherwise, send a lite update that doesn't keep a // Otherwise, send a lite update that doesn't keep a
// long-running stream response. // long-running stream response.
defer c.mu.Unlock() defer c.mu.Unlock()
c.inLiteMapUpdate = true c.inLiteMapUpdate = true
ctx, cancel := context.WithTimeout(c.mapCtx, 10*time.Second) ctx, cancel := context.WithTimeout(c.mapCtx, 10*time.Second)
c.liteMapUpdateCancel = cancel
go func() { go func() {
defer cancel() defer cancel()
t0 := time.Now() t0 := time.Now()
err := c.direct.SendLiteMapUpdate(ctx) err := c.direct.SendLiteMapUpdate(ctx)
d := time.Since(t0).Round(time.Millisecond) d := time.Since(t0).Round(time.Millisecond)
c.mu.Lock() c.mu.Lock()
c.inLiteMapUpdate = false c.inLiteMapUpdate = false
c.liteMapUpdateCancel = nil
if err == nil {
c.liteMapUpdateCancels = 0
}
c.mu.Unlock() c.mu.Unlock()
if err == nil { if err == nil {
c.logf("[v1] successful lite map update in %v", d) c.logf("[v1] successful lite map update in %v", d)
return return
@ -197,10 +226,13 @@ func (c *Auto) sendNewMapRequest() {
if ctx.Err() == nil { if ctx.Err() == nil {
c.logf("lite map update after %v: %v", d, err) c.logf("lite map update after %v: %v", d, err)
} }
if !errors.Is(ctx.Err(), context.Canceled) {
// Fall back to restarting the long-polling map // Fall back to restarting the long-polling map
// request (the old heavy way) if the lite update // request (the old heavy way) if the lite update
// failed for any reason. // failed for reasons other than the context being
// canceled.
c.cancelMapSafely() c.cancelMapSafely()
}
}() }()
} }
@ -237,6 +269,12 @@ func (c *Auto) cancelMapSafely() {
c.mu.Lock() c.mu.Lock()
defer c.mu.Unlock() defer c.mu.Unlock()
// Always reset our lite map cancels counter if we're canceling
// everything, since we're about to restart with a new map update; this
// allows future calls to sendNewMapRequest to retry sending lite
// updates.
c.liteMapUpdateCancels = 0
c.logf("[v1] cancelMapSafely: synced=%v", c.synced) c.logf("[v1] cancelMapSafely: synced=%v", c.synced)
if c.inPollNetMap { if c.inPollNetMap {

Loading…
Cancel
Save