wgengine/wgcfg: recover from mismatched PublicKey/Endpoints

In rare circumstances (tailscale/corp#3016), the PublicKey
and Endpoints can diverge.

This by itself doesn't cause any harm, but our early exit
in response did, because it prevented us from recovering from it.

Remove the early exit.

Signed-off-by: Josh Bleecher Snyder <josh@tailscale.com>
Xe/tailtlsproxy
Josh Bleecher Snyder 3 years ago committed by Josh Bleecher Snyder
parent 9fc4e876e3
commit 85184a58ed

@ -57,4 +57,5 @@ var _PeerCloneNeedsRegeneration = Peer(struct {
DiscoKey key.DiscoPublic DiscoKey key.DiscoPublic
AllowedIPs []netaddr.IPPrefix AllowedIPs []netaddr.IPPrefix
PersistentKeepalive uint16 PersistentKeepalive uint16
WGEndpoint key.NodePublic
}{}) }{})

@ -28,6 +28,11 @@ type Peer struct {
DiscoKey key.DiscoPublic // present only so we can handle restarts within wgengine, not passed to WireGuard DiscoKey key.DiscoPublic // present only so we can handle restarts within wgengine, not passed to WireGuard
AllowedIPs []netaddr.IPPrefix AllowedIPs []netaddr.IPPrefix
PersistentKeepalive uint16 PersistentKeepalive uint16
// wireguard-go's endpoint for this peer. It should always equal Peer.PublicKey.
// We represent it explicitly so that we can detect if they diverge and recover.
// There is no need to set WGEndpoint explicitly when constructing a Peer by hand.
// It is only populated when reading Peers from wireguard-go.
WGEndpoint key.NodePublic
} }
// PeerWithKey returns the Peer with key k and reports whether it was found. // PeerWithKey returns the Peer with key k and reports whether it was found.

@ -62,7 +62,7 @@ func ReconfigDevice(d *device.Device, cfg *Config, logf logger.Logf) (err error)
r.Close() r.Close()
}() }()
toErr := cfg.ToUAPI(w, prev) toErr := cfg.ToUAPI(logf, w, prev)
w.Close() w.Close()
setErr := <-errc setErr := <-errc
return multierr.New(setErr, toErr) return multierr.New(setErr, toErr)

@ -68,14 +68,14 @@ func TestDeviceConfig(t *testing.T) {
} }
prev := new(Config) prev := new(Config)
gotbuf := new(strings.Builder) gotbuf := new(strings.Builder)
err = got.ToUAPI(gotbuf, prev) err = got.ToUAPI(t.Logf, gotbuf, prev)
gotStr := gotbuf.String() gotStr := gotbuf.String()
if err != nil { if err != nil {
t.Errorf("got.ToUAPI(): error: %v", err) t.Errorf("got.ToUAPI(): error: %v", err)
return return
} }
wantbuf := new(strings.Builder) wantbuf := new(strings.Builder)
err = want.ToUAPI(wantbuf, prev) err = want.ToUAPI(t.Logf, wantbuf, prev)
wantStr := wantbuf.String() wantStr := wantbuf.String()
if err != nil { if err != nil {
t.Errorf("want.ToUAPI(): error: %v", err) t.Errorf("want.ToUAPI(): error: %v", err)

@ -151,9 +151,11 @@ func (cfg *Config) handlePeerLine(peer *Peer, k, value mem.RO, valueBytes []byte
if err != nil { if err != nil {
return fmt.Errorf("invalid endpoint %q for peer %q, expected a hex public key", value.StringCopy(), peer.PublicKey.ShortString()) return fmt.Errorf("invalid endpoint %q for peer %q, expected a hex public key", value.StringCopy(), peer.PublicKey.ShortString())
} }
if nk != peer.PublicKey { // nk ought to equal peer.PublicKey.
return fmt.Errorf("unexpected endpoint %q for peer %q, expected the peer's public key", value.StringCopy(), peer.PublicKey.ShortString()) // Under some rare circumstances, it might not. See corp issue #3016.
} // Even if that happens, don't stop early, so that we can recover from it.
// Instead, note the value of nk so we can fix as needed.
peer.WGEndpoint = nk
case k.EqualString("persistent_keepalive_interval"): case k.EqualString("persistent_keepalive_interval"):
n, err := mem.ParseUint(value, 10, 16) n, err := mem.ParseUint(value, 10, 16)
if err != nil { if err != nil {

@ -80,7 +80,7 @@ func BenchmarkFromUAPI(b *testing.B) {
buf := new(bytes.Buffer) buf := new(bytes.Buffer)
w := bufio.NewWriter(buf) w := bufio.NewWriter(buf)
if err := cfg1.ToUAPI(w, &Config{}); err != nil { if err := cfg1.ToUAPI(b.Logf, w, &Config{}); err != nil {
b.Fatal(err) b.Fatal(err)
} }
w.Flush() w.Flush()

@ -11,13 +11,14 @@ import (
"inet.af/netaddr" "inet.af/netaddr"
"tailscale.com/types/key" "tailscale.com/types/key"
"tailscale.com/types/logger"
) )
// ToUAPI writes cfg in UAPI format to w. // ToUAPI writes cfg in UAPI format to w.
// Prev is the previous device Config. // Prev is the previous device Config.
// Prev is required so that we can remove now-defunct peers // Prev is required so that we can remove now-defunct peers
// without having to remove and re-add all peers. // without having to remove and re-add all peers.
func (cfg *Config) ToUAPI(w io.Writer, prev *Config) error { func (cfg *Config) ToUAPI(logf logger.Logf, w io.Writer, prev *Config) error {
var stickyErr error var stickyErr error
set := func(key, value string) { set := func(key, value string) {
if stickyErr != nil { if stickyErr != nil {
@ -54,7 +55,16 @@ func (cfg *Config) ToUAPI(w io.Writer, prev *Config) error {
// Avoid setting endpoints if the correct one is already known // Avoid setting endpoints if the correct one is already known
// to WireGuard, because doing so generates a bit more work in // to WireGuard, because doing so generates a bit more work in
// calling magicsock's ParseEndpoint for effectively a no-op. // calling magicsock's ParseEndpoint for effectively a no-op.
if !wasPresent { if oldPeer.WGEndpoint != p.PublicKey {
if wasPresent {
// We had an endpoint, and it was wrong.
// By construction, this should not happen.
// If it does, keep going so that we can recover from it,
// but log so that we know about it,
// because it is an indicator of other failed invariants.
// See corp issue 3016.
logf("[unexpected] endpoint changed from %s to %s", oldPeer.WGEndpoint, p.PublicKey)
}
set("endpoint", p.PublicKey.UntypedHexString()) set("endpoint", p.PublicKey.UntypedHexString())
} }

Loading…
Cancel
Save