From c679aaba32c27681845466df9e6df69fe0704b95 Mon Sep 17 00:00:00 2001 From: Andrew Lytvynov Date: Thu, 20 Nov 2025 15:52:58 -0600 Subject: [PATCH] cmd/tailscaled,ipn: show a health warning when state store fails to open (#17883) With the introduction of node sealing, store.New fails in some cases due to the TPM device being reset or unavailable. Currently it results in tailscaled crashing at startup, which is not obvious to the user until they check the logs. Instead of crashing tailscaled at startup, start with an in-memory store with a health warning about state initialization and a link to (future) docs on what to do. When this health message is set, also block any login attempts to avoid masking the problem with an ephemeral node registration. Updates #15830 Updates #17654 Signed-off-by: Andrew Lytvynov --- cmd/tailscaled/depaware-min.txt | 2 +- cmd/tailscaled/depaware-minbox.txt | 2 +- cmd/tailscaled/tailscaled.go | 13 +++- cmd/tailscaled/tailscaled_test.go | 50 +++++++++++++ ipn/ipnlocal/local.go | 9 +++ ipn/localapi/localapi.go | 10 ++- ipn/localapi/localapi_test.go | 72 +++++++++++++++++++ ipn/store.go | 15 ++++ tstest/integration/integration_test.go | 37 ++++++++++ .../tailscaled_deps_test_darwin.go | 1 + .../tailscaled_deps_test_freebsd.go | 1 + .../integration/tailscaled_deps_test_linux.go | 1 + .../tailscaled_deps_test_openbsd.go | 1 + .../tailscaled_deps_test_windows.go | 1 + 14 files changed, 211 insertions(+), 4 deletions(-) diff --git a/cmd/tailscaled/depaware-min.txt b/cmd/tailscaled/depaware-min.txt index e750f86e6..3c111470f 100644 --- a/cmd/tailscaled/depaware-min.txt +++ b/cmd/tailscaled/depaware-min.txt @@ -69,7 +69,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de tailscale.com/ipn/ipnstate from tailscale.com/control/controlclient+ tailscale.com/ipn/localapi from tailscale.com/ipn/ipnserver tailscale.com/ipn/store from tailscale.com/cmd/tailscaled - tailscale.com/ipn/store/mem from tailscale.com/ipn/store + tailscale.com/ipn/store/mem from tailscale.com/ipn/store+ tailscale.com/kube/kubetypes from tailscale.com/envknob tailscale.com/log/filelogger from tailscale.com/logpolicy tailscale.com/log/sockstatlog from tailscale.com/ipn/ipnlocal diff --git a/cmd/tailscaled/depaware-minbox.txt b/cmd/tailscaled/depaware-minbox.txt index 17f1a22b2..40a1fb2a4 100644 --- a/cmd/tailscaled/depaware-minbox.txt +++ b/cmd/tailscaled/depaware-minbox.txt @@ -92,7 +92,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de tailscale.com/ipn/ipnstate from tailscale.com/control/controlclient+ tailscale.com/ipn/localapi from tailscale.com/ipn/ipnserver tailscale.com/ipn/store from tailscale.com/cmd/tailscaled - tailscale.com/ipn/store/mem from tailscale.com/ipn/store + tailscale.com/ipn/store/mem from tailscale.com/ipn/store+ tailscale.com/kube/kubetypes from tailscale.com/envknob tailscale.com/licenses from tailscale.com/cmd/tailscale/cli tailscale.com/log/filelogger from tailscale.com/logpolicy diff --git a/cmd/tailscaled/tailscaled.go b/cmd/tailscaled/tailscaled.go index f14cdcff0..d923ca1ed 100644 --- a/cmd/tailscaled/tailscaled.go +++ b/cmd/tailscaled/tailscaled.go @@ -33,12 +33,14 @@ import ( "tailscale.com/feature" "tailscale.com/feature/buildfeatures" _ "tailscale.com/feature/condregister" + "tailscale.com/health" "tailscale.com/hostinfo" "tailscale.com/ipn" "tailscale.com/ipn/conffile" "tailscale.com/ipn/ipnlocal" "tailscale.com/ipn/ipnserver" "tailscale.com/ipn/store" + "tailscale.com/ipn/store/mem" "tailscale.com/logpolicy" "tailscale.com/logtail" "tailscale.com/net/dns" @@ -644,7 +646,16 @@ func getLocalBackend(ctx context.Context, logf logger.Logf, logID logid.PublicID store, err := store.New(logf, statePathOrDefault()) if err != nil { - return nil, fmt.Errorf("store.New: %w", err) + // If we can't create the store (for example if it's TPM-sealed and the + // TPM is reset), create a dummy in-memory store to propagate the error + // to the user. + ht, ok := sys.HealthTracker.GetOK() + if !ok { + return nil, fmt.Errorf("store.New: %w", err) + } + logf("store.New failed: %v; starting with in-memory store with a health warning", err) + store = new(mem.Store) + ht.SetUnhealthy(ipn.StateStoreHealth, health.Args{health.ArgError: err.Error()}) } sys.Set(store) diff --git a/cmd/tailscaled/tailscaled_test.go b/cmd/tailscaled/tailscaled_test.go index c50c23759..1188ad35f 100644 --- a/cmd/tailscaled/tailscaled_test.go +++ b/cmd/tailscaled/tailscaled_test.go @@ -4,9 +4,17 @@ package main // import "tailscale.com/cmd/tailscaled" import ( + "os" + "strings" "testing" + "tailscale.com/envknob" + "tailscale.com/ipn" + "tailscale.com/net/netmon" + "tailscale.com/tsd" "tailscale.com/tstest/deptest" + "tailscale.com/types/logid" + "tailscale.com/util/must" ) func TestNothing(t *testing.T) { @@ -38,3 +46,45 @@ func TestDeps(t *testing.T) { }, }.Check(t) } + +func TestStateStoreError(t *testing.T) { + logID, err := logid.NewPrivateID() + if err != nil { + t.Fatal(err) + } + // Don't upload any logs from tests. + envknob.SetNoLogsNoSupport() + + args.statedir = t.TempDir() + args.tunname = "userspace-networking" + + t.Run("new state", func(t *testing.T) { + sys := tsd.NewSystem() + sys.NetMon.Set(must.Get(netmon.New(sys.Bus.Get(), t.Logf))) + lb, err := getLocalBackend(t.Context(), t.Logf, logID.Public(), sys) + if err != nil { + t.Fatal(err) + } + defer lb.Shutdown() + if lb.HealthTracker().IsUnhealthy(ipn.StateStoreHealth) { + t.Errorf("StateStoreHealth is unhealthy on fresh LocalBackend:\n%s", strings.Join(lb.HealthTracker().Strings(), "\n")) + } + }) + t.Run("corrupt state", func(t *testing.T) { + sys := tsd.NewSystem() + sys.NetMon.Set(must.Get(netmon.New(sys.Bus.Get(), t.Logf))) + // Populate the state file with something that will fail to parse to + // trigger an error from store.New. + if err := os.WriteFile(statePathOrDefault(), []byte("bad json"), 0644); err != nil { + t.Fatal(err) + } + lb, err := getLocalBackend(t.Context(), t.Logf, logID.Public(), sys) + if err != nil { + t.Fatal(err) + } + defer lb.Shutdown() + if !lb.HealthTracker().IsUnhealthy(ipn.StateStoreHealth) { + t.Errorf("StateStoreHealth is healthy when state file is corrupt") + } + }) +} diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index 0ff299399..72b230327 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -3747,6 +3747,9 @@ func (b *LocalBackend) StartLoginInteractive(ctx context.Context) error { // the control plane sends us one. Otherwise, the notification will be delivered to all // active [watchSession]s. func (b *LocalBackend) StartLoginInteractiveAs(ctx context.Context, user ipnauth.Actor) error { + if b.health.IsUnhealthy(ipn.StateStoreHealth) { + return errors.New("cannot log in when state store is unhealthy") + } b.mu.Lock() defer b.mu.Unlock() if b.cc == nil { @@ -5677,6 +5680,9 @@ func (b *LocalBackend) NodeKey() key.NodePublic { // // b.mu must be held func (b *LocalBackend) nextStateLocked() ipn.State { + if b.health.IsUnhealthy(ipn.StateStoreHealth) { + return ipn.NoState + } var ( cc = b.cc cn = b.currentNode() @@ -6936,6 +6942,9 @@ func (b *LocalBackend) CurrentProfile() ipn.LoginProfileView { // NewProfile creates and switches to the new profile. func (b *LocalBackend) NewProfile() error { + if b.health.IsUnhealthy(ipn.StateStoreHealth) { + return errors.New("cannot log in when state store is unhealthy") + } b.mu.Lock() defer b.mu.Unlock() diff --git a/ipn/localapi/localapi.go b/ipn/localapi/localapi.go index d3503d302..7f249fe53 100644 --- a/ipn/localapi/localapi.go +++ b/ipn/localapi/localapi.go @@ -930,7 +930,10 @@ func (h *Handler) serveLoginInteractive(w http.ResponseWriter, r *http.Request) http.Error(w, "want POST", http.StatusBadRequest) return } - h.b.StartLoginInteractiveAs(r.Context(), h.Actor) + if err := h.b.StartLoginInteractiveAs(r.Context(), h.Actor); err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } w.WriteHeader(http.StatusNoContent) return } @@ -949,6 +952,11 @@ func (h *Handler) serveStart(w http.ResponseWriter, r *http.Request) { http.Error(w, err.Error(), http.StatusBadRequest) return } + + if h.b.HealthTracker().IsUnhealthy(ipn.StateStoreHealth) { + http.Error(w, "cannot start backend when state store is unhealthy", http.StatusInternalServerError) + return + } err := h.b.Start(o) if err != nil { // TODO(bradfitz): map error to a good HTTP error diff --git a/ipn/localapi/localapi_test.go b/ipn/localapi/localapi_test.go index 6bb9b5182..5d228ffd6 100644 --- a/ipn/localapi/localapi_test.go +++ b/ipn/localapi/localapi_test.go @@ -25,9 +25,11 @@ import ( "testing" "tailscale.com/client/tailscale/apitype" + "tailscale.com/health" "tailscale.com/ipn" "tailscale.com/ipn/ipnauth" "tailscale.com/ipn/ipnlocal" + "tailscale.com/ipn/ipnstate" "tailscale.com/ipn/store/mem" "tailscale.com/tailcfg" "tailscale.com/tsd" @@ -428,3 +430,73 @@ func TestKeepItSorted(t *testing.T) { } } } + +func TestServeWithUnhealthyState(t *testing.T) { + tstest.Replace(t, &validLocalHostForTesting, true) + h := &Handler{ + PermitRead: true, + PermitWrite: true, + b: newTestLocalBackend(t), + logf: t.Logf, + } + h.b.HealthTracker().SetUnhealthy(ipn.StateStoreHealth, health.Args{health.ArgError: "testing"}) + if err := h.b.Start(ipn.Options{}); err != nil { + t.Fatal(err) + } + + check500Body := func(wantResp string) func(t *testing.T, code int, resp []byte) { + return func(t *testing.T, code int, resp []byte) { + if code != http.StatusInternalServerError { + t.Errorf("got code: %v, want %v\nresponse: %q", code, http.StatusInternalServerError, resp) + } + if got := strings.TrimSpace(string(resp)); got != wantResp { + t.Errorf("got response: %q, want %q", got, wantResp) + } + } + } + tests := []struct { + desc string + req *http.Request + check func(t *testing.T, code int, resp []byte) + }{ + { + desc: "status", + req: httptest.NewRequest("GET", "http://localhost:1234/localapi/v0/status", nil), + check: func(t *testing.T, code int, resp []byte) { + if code != http.StatusOK { + t.Errorf("got code: %v, want %v\nresponse: %q", code, http.StatusOK, resp) + } + var status ipnstate.Status + if err := json.Unmarshal(resp, &status); err != nil { + t.Fatal(err) + } + if status.BackendState != "NoState" { + t.Errorf("got backend state: %q, want %q", status.BackendState, "NoState") + } + }, + }, + { + desc: "login-interactive", + req: httptest.NewRequest("POST", "http://localhost:1234/localapi/v0/login-interactive", nil), + check: check500Body("cannot log in when state store is unhealthy"), + }, + { + desc: "start", + req: httptest.NewRequest("POST", "http://localhost:1234/localapi/v0/start", strings.NewReader("{}")), + check: check500Body("cannot start backend when state store is unhealthy"), + }, + { + desc: "new-profile", + req: httptest.NewRequest("PUT", "http://localhost:1234/localapi/v0/profiles/", nil), + check: check500Body("cannot log in when state store is unhealthy"), + }, + } + + for _, tt := range tests { + t.Run(tt.desc, func(t *testing.T) { + resp := httptest.NewRecorder() + h.ServeHTTP(resp, tt.req) + tt.check(t, resp.Code, resp.Body.Bytes()) + }) + } +} diff --git a/ipn/store.go b/ipn/store.go index 9da5288c0..2034ae09a 100644 --- a/ipn/store.go +++ b/ipn/store.go @@ -10,6 +10,8 @@ import ( "fmt" "net" "strconv" + + "tailscale.com/health" ) // ErrStateNotExist is returned by StateStore.ReadState when the @@ -60,6 +62,19 @@ const ( TaildropReceivedKey = StateKey("_taildrop-received") ) +// StateStoreHealth is a Warnable set when store.New fails at startup. If +// unhealthy, we block all login attempts and return a health message in status +// responses. +var StateStoreHealth = health.Register(&health.Warnable{ + Code: "state-store-health", + Severity: health.SeverityHigh, + Title: "Tailscale state store failed to initialize", + Text: func(args health.Args) string { + return fmt.Sprintf("State store failed to initialize, Tailscale will not work until this is resolved. See https://tailscale.com/s/state-store-init-error. Error: %s", args[health.ArgError]) + }, + ImpactsConnectivity: true, +}) + // CurrentProfileID returns the StateKey that stores the // current profile ID. The value is a JSON-encoded LoginProfile. // If the userID is empty, the key returned is CurrentProfileStateKey, diff --git a/tstest/integration/integration_test.go b/tstest/integration/integration_test.go index 9d75cfc29..543dc125c 100644 --- a/tstest/integration/integration_test.go +++ b/tstest/integration/integration_test.go @@ -22,6 +22,7 @@ import ( "path/filepath" "regexp" "runtime" + "slices" "strconv" "strings" "sync/atomic" @@ -36,6 +37,7 @@ import ( "tailscale.com/cmd/testwrapper/flakytest" "tailscale.com/feature" _ "tailscale.com/feature/clientupdate" + "tailscale.com/health" "tailscale.com/hostinfo" "tailscale.com/ipn" "tailscale.com/net/tsaddr" @@ -2246,3 +2248,38 @@ func TestNetworkLock(t *testing.T) { } }) } + +func TestNodeWithBadStateFile(t *testing.T) { + tstest.Shard(t) + tstest.Parallel(t) + env := NewTestEnv(t) + n1 := NewTestNode(t, env) + if err := os.WriteFile(n1.stateFile, []byte("bad json"), 0644); err != nil { + t.Fatal(err) + } + + d1 := n1.StartDaemon() + n1.AwaitResponding() + + // Make sure the health message shows up in status output. + n1.AwaitBackendState("NoState") + st := n1.MustStatus() + wantHealth := ipn.StateStoreHealth.Text(health.Args{health.ArgError: ""}) + if !slices.ContainsFunc(st.Health, func(m string) bool { return strings.HasPrefix(m, wantHealth) }) { + t.Errorf("Status does not contain expected health message %q\ngot health messages: %q", wantHealth, st.Health) + } + + // Make sure login attempts are rejected. + cmd := n1.Tailscale("up", "--login-server="+n1.env.ControlURL()) + t.Logf("Running %v ...", cmd) + out, err := cmd.CombinedOutput() + if err == nil { + t.Fatalf("up succeeded with output %q", out) + } + wantOut := "cannot start backend when state store is unhealthy" + if !strings.Contains(string(out), wantOut) { + t.Fatalf("got up output:\n%s\nwant:\n%s", string(out), wantOut) + } + + d1.MustCleanShutdown(t) +} diff --git a/tstest/integration/tailscaled_deps_test_darwin.go b/tstest/integration/tailscaled_deps_test_darwin.go index 217188f75..9f92839d8 100644 --- a/tstest/integration/tailscaled_deps_test_darwin.go +++ b/tstest/integration/tailscaled_deps_test_darwin.go @@ -27,6 +27,7 @@ import ( _ "tailscale.com/ipn/ipnlocal" _ "tailscale.com/ipn/ipnserver" _ "tailscale.com/ipn/store" + _ "tailscale.com/ipn/store/mem" _ "tailscale.com/logpolicy" _ "tailscale.com/logtail" _ "tailscale.com/net/dns" diff --git a/tstest/integration/tailscaled_deps_test_freebsd.go b/tstest/integration/tailscaled_deps_test_freebsd.go index 217188f75..9f92839d8 100644 --- a/tstest/integration/tailscaled_deps_test_freebsd.go +++ b/tstest/integration/tailscaled_deps_test_freebsd.go @@ -27,6 +27,7 @@ import ( _ "tailscale.com/ipn/ipnlocal" _ "tailscale.com/ipn/ipnserver" _ "tailscale.com/ipn/store" + _ "tailscale.com/ipn/store/mem" _ "tailscale.com/logpolicy" _ "tailscale.com/logtail" _ "tailscale.com/net/dns" diff --git a/tstest/integration/tailscaled_deps_test_linux.go b/tstest/integration/tailscaled_deps_test_linux.go index 217188f75..9f92839d8 100644 --- a/tstest/integration/tailscaled_deps_test_linux.go +++ b/tstest/integration/tailscaled_deps_test_linux.go @@ -27,6 +27,7 @@ import ( _ "tailscale.com/ipn/ipnlocal" _ "tailscale.com/ipn/ipnserver" _ "tailscale.com/ipn/store" + _ "tailscale.com/ipn/store/mem" _ "tailscale.com/logpolicy" _ "tailscale.com/logtail" _ "tailscale.com/net/dns" diff --git a/tstest/integration/tailscaled_deps_test_openbsd.go b/tstest/integration/tailscaled_deps_test_openbsd.go index 217188f75..9f92839d8 100644 --- a/tstest/integration/tailscaled_deps_test_openbsd.go +++ b/tstest/integration/tailscaled_deps_test_openbsd.go @@ -27,6 +27,7 @@ import ( _ "tailscale.com/ipn/ipnlocal" _ "tailscale.com/ipn/ipnserver" _ "tailscale.com/ipn/store" + _ "tailscale.com/ipn/store/mem" _ "tailscale.com/logpolicy" _ "tailscale.com/logtail" _ "tailscale.com/net/dns" diff --git a/tstest/integration/tailscaled_deps_test_windows.go b/tstest/integration/tailscaled_deps_test_windows.go index f3cd5e75b..82f8097c8 100644 --- a/tstest/integration/tailscaled_deps_test_windows.go +++ b/tstest/integration/tailscaled_deps_test_windows.go @@ -37,6 +37,7 @@ import ( _ "tailscale.com/ipn/ipnlocal" _ "tailscale.com/ipn/ipnserver" _ "tailscale.com/ipn/store" + _ "tailscale.com/ipn/store/mem" _ "tailscale.com/logpolicy" _ "tailscale.com/logtail" _ "tailscale.com/net/dns"