From 2f04f49376c15af9146c91aa40ed563925dd8009 Mon Sep 17 00:00:00 2001 From: Brad Fitzpatrick Date: Wed, 23 Dec 2020 13:03:16 -0800 Subject: [PATCH] control/controlclient: use lite map request handler to avoid aborting streams Previously, any change to endpoints or hostinfo (or hostinfo's netinfo) would result in the long-running map request HTTP stream being torn down and restarted, losing all compression context along with it. This change makes us instead send a lite map request (OmitPeers: true, Stream: false) that doesn't subscribe to anything, and then the coordination server knows to not close other streams for that node when it recives a lite request. Fixes tailscale/corp#797 Signed-off-by: Brad Fitzpatrick --- control/controlclient/auto.go | 70 +++++++++++++++++++++++++++------ control/controlclient/direct.go | 26 ++++++++++++ 2 files changed, 83 insertions(+), 13 deletions(-) diff --git a/control/controlclient/auto.go b/control/controlclient/auto.go index 7acb5be63..8bbae5c20 100644 --- a/control/controlclient/auto.go +++ b/control/controlclient/auto.go @@ -117,15 +117,16 @@ type Client struct { mu sync.Mutex // mutex guards the following fields statusFunc func(Status) // called to update Client status - paused bool // whether we should stop making HTTP requests - unpauseWaiters []chan struct{} - loggedIn bool // true if currently logged in - loginGoal *LoginGoal // non-nil if some login activity is desired - synced bool // true if our netmap is up-to-date - hostinfo *tailcfg.Hostinfo - inPollNetMap bool // true if currently running a PollNetMap - inSendStatus int // number of sendStatus calls currently in progress - state State + paused bool // whether we should stop making HTTP requests + unpauseWaiters []chan struct{} + loggedIn bool // true if currently logged in + loginGoal *LoginGoal // non-nil if some login activity is desired + synced bool // true if our netmap is up-to-date + hostinfo *tailcfg.Hostinfo + inPollNetMap bool // true if currently running a PollNetMap + inLiteMapUpdate bool // true if a lite (non-streaming) map request is outstanding + inSendStatus int // number of sendStatus calls currently in progress + state State authCtx context.Context // context used for auth requests mapCtx context.Context // context used for netmap requests @@ -201,6 +202,50 @@ func (c *Client) Start() { go c.mapRoutine() } +// sendNewMapRequest either sends a new OmitPeers, non-streaming map request +// (to just send Hostinfo/Netinfo/Endpoints info, while keeping an existing +// streaming response open), or start a new streaming one if necessary. +// +// It should be called whenever there's something new to tell the server. +func (c *Client) sendNewMapRequest() { + c.mu.Lock() + + // If we're not already streaming a netmap, or if we're already stuck + // in a lite update, then tear down everything and start a new stream + // (which starts by sending a new map request) + if !c.inPollNetMap || c.inLiteMapUpdate { + c.mu.Unlock() + c.cancelMapSafely() + return + } + + // Otherwise, send a lite update that doesn't keep a + // long-running stream response. + defer c.mu.Unlock() + c.inLiteMapUpdate = true + ctx, cancel := context.WithTimeout(c.mapCtx, 10*time.Second) + go func() { + defer cancel() + t0 := time.Now() + err := c.direct.SendLiteMapUpdate(ctx) + d := time.Since(t0).Round(time.Millisecond) + c.mu.Lock() + c.inLiteMapUpdate = false + c.mu.Unlock() + if err == nil { + c.logf("[v1] successful lite map update in %v", d) + return + } + if ctx.Err() == nil { + c.logf("lite map update after %v: %v", d, err) + } + // Fall back to restarting the long-polling map + // request (the old heavy way) if the lite update + // failed for any reason. + c.cancelMapSafely() + }() +} + func (c *Client) cancelAuth() { c.mu.Lock() if c.authCancel != nil { @@ -545,7 +590,7 @@ func (c *Client) SetHostinfo(hi *tailcfg.Hostinfo) { c.logf("Hostinfo: %v", hi) // Send new Hostinfo to server - c.cancelMapSafely() + c.sendNewMapRequest() } func (c *Client) SetNetInfo(ni *tailcfg.NetInfo) { @@ -553,13 +598,12 @@ func (c *Client) SetNetInfo(ni *tailcfg.NetInfo) { panic("nil NetInfo") } if !c.direct.SetNetInfo(ni) { - c.logf("[unexpected] duplicate NetInfo: %v", ni) return } c.logf("NetInfo: %v", ni) // Send new Hostinfo (which includes NetInfo) to server - c.cancelMapSafely() + c.sendNewMapRequest() } func (c *Client) sendStatus(who string, err error, url string, nm *NetworkMap) { @@ -636,7 +680,7 @@ func (c *Client) Logout() { func (c *Client) UpdateEndpoints(localPort uint16, endpoints []string) { changed := c.direct.SetEndpoints(localPort, endpoints) if changed { - c.cancelMapSafely() + c.sendNewMapRequest() } } diff --git a/control/controlclient/direct.go b/control/controlclient/direct.go index c985037ae..612ccbfc6 100644 --- a/control/controlclient/direct.go +++ b/control/controlclient/direct.go @@ -521,6 +521,18 @@ func inTest() bool { return flag.Lookup("test.v") != nil } // maxPolls is how many network maps to download; common values are 1 // or -1 (to keep a long-poll query open to the server). func (c *Direct) PollNetMap(ctx context.Context, maxPolls int, cb func(*NetworkMap)) error { + return c.sendMapRequest(ctx, maxPolls, cb) +} + +// SendLiteMapUpdate makes a /map request to update the server of our latest state, +// but does not fetch anything. It returns an error if the server did not return a +// successful 200 OK response. +func (c *Direct) SendLiteMapUpdate(ctx context.Context) error { + return c.sendMapRequest(ctx, 1, nil) +} + +// cb nil means to omit peers. +func (c *Direct) sendMapRequest(ctx context.Context, maxPolls int, cb func(*NetworkMap)) error { c.mu.Lock() persist := c.persist serverURL := c.serverURL @@ -555,6 +567,7 @@ func (c *Direct) PollNetMap(ctx context.Context, maxPolls int, cb func(*NetworkM Stream: allowStream, Hostinfo: hostinfo, DebugFlags: c.debugFlags, + OmitPeers: cb == nil, } if hostinfo != nil && ipForwardingBroken(hostinfo.RoutableIPs) { old := request.DebugFlags @@ -607,6 +620,11 @@ func (c *Direct) PollNetMap(ctx context.Context, maxPolls int, cb func(*NetworkM } defer res.Body.Close() + if cb == nil { + io.Copy(ioutil.Discard, res.Body) + return nil + } + // If we go more than pollTimeout without hearing from the server, // end the long poll. We should be receiving a keep alive ping // every minute. @@ -724,6 +742,14 @@ func (c *Direct) PollNetMap(ctx context.Context, maxPolls int, cb func(*NetworkM lastParsedPacketFilter = c.parsePacketFilter(pf) } + // Get latest localPort. This might've changed if + // a lite map update occured meanwhile. This only affects + // the end-to-end test. + // TODO(bradfitz): remove the NetworkMap.LocalPort field entirely. + c.mu.Lock() + localPort = c.localPort + c.mu.Unlock() + nm := &NetworkMap{ NodeKey: tailcfg.NodeKey(persist.PrivateNodeKey.Public()), PrivateKey: persist.PrivateNodeKey,