diff --git a/wgengine/wgcfg/clone.go b/wgengine/wgcfg/clone.go index efd280a00..6dd3f4154 100644 --- a/wgengine/wgcfg/clone.go +++ b/wgengine/wgcfg/clone.go @@ -57,4 +57,5 @@ var _PeerCloneNeedsRegeneration = Peer(struct { DiscoKey key.DiscoPublic AllowedIPs []netaddr.IPPrefix PersistentKeepalive uint16 + WGEndpoint key.NodePublic }{}) diff --git a/wgengine/wgcfg/config.go b/wgengine/wgcfg/config.go index 6d64502ed..095e2ca1a 100644 --- a/wgengine/wgcfg/config.go +++ b/wgengine/wgcfg/config.go @@ -28,6 +28,11 @@ type Peer struct { DiscoKey key.DiscoPublic // present only so we can handle restarts within wgengine, not passed to WireGuard AllowedIPs []netaddr.IPPrefix PersistentKeepalive uint16 + // wireguard-go's endpoint for this peer. It should always equal Peer.PublicKey. + // We represent it explicitly so that we can detect if they diverge and recover. + // There is no need to set WGEndpoint explicitly when constructing a Peer by hand. + // It is only populated when reading Peers from wireguard-go. + WGEndpoint key.NodePublic } // PeerWithKey returns the Peer with key k and reports whether it was found. diff --git a/wgengine/wgcfg/device.go b/wgengine/wgcfg/device.go index 69cb80f2f..744d24cc6 100644 --- a/wgengine/wgcfg/device.go +++ b/wgengine/wgcfg/device.go @@ -62,7 +62,7 @@ func ReconfigDevice(d *device.Device, cfg *Config, logf logger.Logf) (err error) r.Close() }() - toErr := cfg.ToUAPI(w, prev) + toErr := cfg.ToUAPI(logf, w, prev) w.Close() setErr := <-errc return multierr.New(setErr, toErr) diff --git a/wgengine/wgcfg/device_test.go b/wgengine/wgcfg/device_test.go index e2252a390..b20fdce15 100644 --- a/wgengine/wgcfg/device_test.go +++ b/wgengine/wgcfg/device_test.go @@ -68,14 +68,14 @@ func TestDeviceConfig(t *testing.T) { } prev := new(Config) gotbuf := new(strings.Builder) - err = got.ToUAPI(gotbuf, prev) + err = got.ToUAPI(t.Logf, gotbuf, prev) gotStr := gotbuf.String() if err != nil { t.Errorf("got.ToUAPI(): error: %v", err) return } wantbuf := new(strings.Builder) - err = want.ToUAPI(wantbuf, prev) + err = want.ToUAPI(t.Logf, wantbuf, prev) wantStr := wantbuf.String() if err != nil { t.Errorf("want.ToUAPI(): error: %v", err) diff --git a/wgengine/wgcfg/parser.go b/wgengine/wgcfg/parser.go index 1a4230220..e112f4f70 100644 --- a/wgengine/wgcfg/parser.go +++ b/wgengine/wgcfg/parser.go @@ -151,9 +151,11 @@ func (cfg *Config) handlePeerLine(peer *Peer, k, value mem.RO, valueBytes []byte if err != nil { return fmt.Errorf("invalid endpoint %q for peer %q, expected a hex public key", value.StringCopy(), peer.PublicKey.ShortString()) } - if nk != peer.PublicKey { - return fmt.Errorf("unexpected endpoint %q for peer %q, expected the peer's public key", value.StringCopy(), peer.PublicKey.ShortString()) - } + // nk ought to equal peer.PublicKey. + // Under some rare circumstances, it might not. See corp issue #3016. + // Even if that happens, don't stop early, so that we can recover from it. + // Instead, note the value of nk so we can fix as needed. + peer.WGEndpoint = nk case k.EqualString("persistent_keepalive_interval"): n, err := mem.ParseUint(value, 10, 16) if err != nil { diff --git a/wgengine/wgcfg/parser_test.go b/wgengine/wgcfg/parser_test.go index 156dab14f..3835172ed 100644 --- a/wgengine/wgcfg/parser_test.go +++ b/wgengine/wgcfg/parser_test.go @@ -80,7 +80,7 @@ func BenchmarkFromUAPI(b *testing.B) { buf := new(bytes.Buffer) w := bufio.NewWriter(buf) - if err := cfg1.ToUAPI(w, &Config{}); err != nil { + if err := cfg1.ToUAPI(b.Logf, w, &Config{}); err != nil { b.Fatal(err) } w.Flush() diff --git a/wgengine/wgcfg/writer.go b/wgengine/wgcfg/writer.go index c7be7d024..ce26d794a 100644 --- a/wgengine/wgcfg/writer.go +++ b/wgengine/wgcfg/writer.go @@ -11,13 +11,14 @@ import ( "inet.af/netaddr" "tailscale.com/types/key" + "tailscale.com/types/logger" ) // ToUAPI writes cfg in UAPI format to w. // Prev is the previous device Config. // Prev is required so that we can remove now-defunct peers // without having to remove and re-add all peers. -func (cfg *Config) ToUAPI(w io.Writer, prev *Config) error { +func (cfg *Config) ToUAPI(logf logger.Logf, w io.Writer, prev *Config) error { var stickyErr error set := func(key, value string) { if stickyErr != nil { @@ -54,7 +55,16 @@ func (cfg *Config) ToUAPI(w io.Writer, prev *Config) error { // Avoid setting endpoints if the correct one is already known // to WireGuard, because doing so generates a bit more work in // calling magicsock's ParseEndpoint for effectively a no-op. - if !wasPresent { + if oldPeer.WGEndpoint != p.PublicKey { + if wasPresent { + // We had an endpoint, and it was wrong. + // By construction, this should not happen. + // If it does, keep going so that we can recover from it, + // but log so that we know about it, + // because it is an indicator of other failed invariants. + // See corp issue 3016. + logf("[unexpected] endpoint changed from %s to %s", oldPeer.WGEndpoint, p.PublicKey) + } set("endpoint", p.PublicKey.UntypedHexString()) }