From 85184a58ed8120529ede754a8df2446faa890107 Mon Sep 17 00:00:00 2001 From: Josh Bleecher Snyder Date: Wed, 10 Nov 2021 18:42:16 -0800 Subject: [PATCH] wgengine/wgcfg: recover from mismatched PublicKey/Endpoints In rare circumstances (tailscale/corp#3016), the PublicKey and Endpoints can diverge. This by itself doesn't cause any harm, but our early exit in response did, because it prevented us from recovering from it. Remove the early exit. Signed-off-by: Josh Bleecher Snyder --- wgengine/wgcfg/clone.go | 1 + wgengine/wgcfg/config.go | 5 +++++ wgengine/wgcfg/device.go | 2 +- wgengine/wgcfg/device_test.go | 4 ++-- wgengine/wgcfg/parser.go | 8 +++++--- wgengine/wgcfg/parser_test.go | 2 +- wgengine/wgcfg/writer.go | 14 ++++++++++++-- 7 files changed, 27 insertions(+), 9 deletions(-) diff --git a/wgengine/wgcfg/clone.go b/wgengine/wgcfg/clone.go index efd280a00..6dd3f4154 100644 --- a/wgengine/wgcfg/clone.go +++ b/wgengine/wgcfg/clone.go @@ -57,4 +57,5 @@ var _PeerCloneNeedsRegeneration = Peer(struct { DiscoKey key.DiscoPublic AllowedIPs []netaddr.IPPrefix PersistentKeepalive uint16 + WGEndpoint key.NodePublic }{}) diff --git a/wgengine/wgcfg/config.go b/wgengine/wgcfg/config.go index 6d64502ed..095e2ca1a 100644 --- a/wgengine/wgcfg/config.go +++ b/wgengine/wgcfg/config.go @@ -28,6 +28,11 @@ type Peer struct { DiscoKey key.DiscoPublic // present only so we can handle restarts within wgengine, not passed to WireGuard AllowedIPs []netaddr.IPPrefix PersistentKeepalive uint16 + // wireguard-go's endpoint for this peer. It should always equal Peer.PublicKey. + // We represent it explicitly so that we can detect if they diverge and recover. + // There is no need to set WGEndpoint explicitly when constructing a Peer by hand. + // It is only populated when reading Peers from wireguard-go. + WGEndpoint key.NodePublic } // PeerWithKey returns the Peer with key k and reports whether it was found. diff --git a/wgengine/wgcfg/device.go b/wgengine/wgcfg/device.go index 69cb80f2f..744d24cc6 100644 --- a/wgengine/wgcfg/device.go +++ b/wgengine/wgcfg/device.go @@ -62,7 +62,7 @@ func ReconfigDevice(d *device.Device, cfg *Config, logf logger.Logf) (err error) r.Close() }() - toErr := cfg.ToUAPI(w, prev) + toErr := cfg.ToUAPI(logf, w, prev) w.Close() setErr := <-errc return multierr.New(setErr, toErr) diff --git a/wgengine/wgcfg/device_test.go b/wgengine/wgcfg/device_test.go index e2252a390..b20fdce15 100644 --- a/wgengine/wgcfg/device_test.go +++ b/wgengine/wgcfg/device_test.go @@ -68,14 +68,14 @@ func TestDeviceConfig(t *testing.T) { } prev := new(Config) gotbuf := new(strings.Builder) - err = got.ToUAPI(gotbuf, prev) + err = got.ToUAPI(t.Logf, gotbuf, prev) gotStr := gotbuf.String() if err != nil { t.Errorf("got.ToUAPI(): error: %v", err) return } wantbuf := new(strings.Builder) - err = want.ToUAPI(wantbuf, prev) + err = want.ToUAPI(t.Logf, wantbuf, prev) wantStr := wantbuf.String() if err != nil { t.Errorf("want.ToUAPI(): error: %v", err) diff --git a/wgengine/wgcfg/parser.go b/wgengine/wgcfg/parser.go index 1a4230220..e112f4f70 100644 --- a/wgengine/wgcfg/parser.go +++ b/wgengine/wgcfg/parser.go @@ -151,9 +151,11 @@ func (cfg *Config) handlePeerLine(peer *Peer, k, value mem.RO, valueBytes []byte if err != nil { return fmt.Errorf("invalid endpoint %q for peer %q, expected a hex public key", value.StringCopy(), peer.PublicKey.ShortString()) } - if nk != peer.PublicKey { - return fmt.Errorf("unexpected endpoint %q for peer %q, expected the peer's public key", value.StringCopy(), peer.PublicKey.ShortString()) - } + // nk ought to equal peer.PublicKey. + // Under some rare circumstances, it might not. See corp issue #3016. + // Even if that happens, don't stop early, so that we can recover from it. + // Instead, note the value of nk so we can fix as needed. + peer.WGEndpoint = nk case k.EqualString("persistent_keepalive_interval"): n, err := mem.ParseUint(value, 10, 16) if err != nil { diff --git a/wgengine/wgcfg/parser_test.go b/wgengine/wgcfg/parser_test.go index 156dab14f..3835172ed 100644 --- a/wgengine/wgcfg/parser_test.go +++ b/wgengine/wgcfg/parser_test.go @@ -80,7 +80,7 @@ func BenchmarkFromUAPI(b *testing.B) { buf := new(bytes.Buffer) w := bufio.NewWriter(buf) - if err := cfg1.ToUAPI(w, &Config{}); err != nil { + if err := cfg1.ToUAPI(b.Logf, w, &Config{}); err != nil { b.Fatal(err) } w.Flush() diff --git a/wgengine/wgcfg/writer.go b/wgengine/wgcfg/writer.go index c7be7d024..ce26d794a 100644 --- a/wgengine/wgcfg/writer.go +++ b/wgengine/wgcfg/writer.go @@ -11,13 +11,14 @@ import ( "inet.af/netaddr" "tailscale.com/types/key" + "tailscale.com/types/logger" ) // ToUAPI writes cfg in UAPI format to w. // Prev is the previous device Config. // Prev is required so that we can remove now-defunct peers // without having to remove and re-add all peers. -func (cfg *Config) ToUAPI(w io.Writer, prev *Config) error { +func (cfg *Config) ToUAPI(logf logger.Logf, w io.Writer, prev *Config) error { var stickyErr error set := func(key, value string) { if stickyErr != nil { @@ -54,7 +55,16 @@ func (cfg *Config) ToUAPI(w io.Writer, prev *Config) error { // Avoid setting endpoints if the correct one is already known // to WireGuard, because doing so generates a bit more work in // calling magicsock's ParseEndpoint for effectively a no-op. - if !wasPresent { + if oldPeer.WGEndpoint != p.PublicKey { + if wasPresent { + // We had an endpoint, and it was wrong. + // By construction, this should not happen. + // If it does, keep going so that we can recover from it, + // but log so that we know about it, + // because it is an indicator of other failed invariants. + // See corp issue 3016. + logf("[unexpected] endpoint changed from %s to %s", oldPeer.WGEndpoint, p.PublicKey) + } set("endpoint", p.PublicKey.UntypedHexString()) }