From 877fabc038c260cacb85d9f1211ebe1abf580881 Mon Sep 17 00:00:00 2001 From: Raj Singh Date: Wed, 29 Oct 2025 09:40:51 -0700 Subject: [PATCH] cmd/containerboot: exit with non-zero code on unexpected tailscaled death When tailscaled exits unexpectedly (it crashes or is killed directly), containerboot now exits with a non-zero code to signal failure to the orchestrator. The reaper now distinguishes between graceful shutdowns, which still exit 0, and unexpected exits, for which it reports the child's exit code (128 plus the signal number if the child was killed by a signal), forcing it to 1 if the child exited cleanly on its own. Any non-zero code reported by the reaper makes run() return an error. Updates #17650 Signed-off-by: Raj Singh --- cmd/containerboot/main.go | 39 ++++++++++++++-- cmd/containerboot/main_test.go | 67 ++++++++++++++++++++++++++++ cmd/containerboot/test_tailscaled.sh | 4 ++ 3 files changed, 107 insertions(+), 3 deletions(-) diff --git a/cmd/containerboot/main.go b/cmd/containerboot/main.go index f056d26f3..b34885f85 100644 --- a/cmd/containerboot/main.go +++ b/cmd/containerboot/main.go @@ -229,7 +229,10 @@ func run() error { } log.Printf("Sending SIGTERM to tailscaled") if err := daemonProcess.Signal(unix.SIGTERM); err != nil { - log.Fatalf("error shutting tailscaled down: %v", err) + // Process may have already exited, which is fine. + if !errors.Is(err, unix.ESRCH) { + log.Printf("error shutting tailscaled down: %v", err) + } } } defer killTailscaled() @@ -442,6 +445,8 @@ authLoop: // egress services in HA mode and errored. egressSvcsErrorChan := make(chan error) ingressSvcsErrorChan := make(chan error) + // reaperExitChan receives the exit code when tailscaled exits. + reaperExitChan := make(chan int, 1) defer t.Stop() // resetTimer resets timer for when to next attempt to resolve the DNS // name for the proxy configured with TS_EXPERIMENTAL_DEST_DNS_NAME. The @@ -728,8 +733,31 @@ runLoop: if err != nil { log.Fatalf("Waiting for tailscaled to exit: %v", err) } - log.Print("tailscaled exited") - os.Exit(0) + + // Signal exit code: 0 for graceful shutdown, non-zero for crash. 
+ if ctx.Err() != nil { + // Graceful shutdown, we told tailscaled to exit. + log.Print("tailscaled exited after graceful shutdown") + reaperExitChan <- 0 + return + } + + // Unexpected exit, propagate the error. + exitCode := 1 + switch { + case status.Exited(): + exitCode = status.ExitStatus() + case status.Signaled(): + log.Printf("tailscaled terminated by signal: %v", status.Signal()) + exitCode = 128 + int(status.Signal()) + } + log.Printf("tailscaled exited unexpectedly with code %d", exitCode) + if exitCode == 0 { + // Tailscaled exited cleanly on its own, still unexpected. + exitCode = 1 + } + reaperExitChan <- exitCode + return } } wg.Add(1) @@ -758,6 +786,11 @@ runLoop: return fmt.Errorf("egress proxy failed: %v", e) case e := <-ingressSvcsErrorChan: return fmt.Errorf("ingress proxy failed: %v", e) + case exitCode := <-reaperExitChan: + if exitCode == 0 { + return nil + } + return fmt.Errorf("tailscaled exited unexpectedly") } } wg.Wait() diff --git a/cmd/containerboot/main_test.go b/cmd/containerboot/main_test.go index 96feef682..9a7b39843 100644 --- a/cmd/containerboot/main_test.go +++ b/cmd/containerboot/main_test.go @@ -92,6 +92,9 @@ func TestContainerBoot(t *testing.T) { // The signal to send to containerboot at the start of the phase. Signal *syscall.Signal + // If set, send this signal to the fake tailscaled process. 
+ SignalTailscaled *syscall.Signal + EndpointStatuses map[string]int } runningNotify := &ipn.Notify{ @@ -993,6 +996,29 @@ func TestContainerBoot(t *testing.T) { }, } }, + "tailscaled_unexpected_exit": func(env *testEnv) testCase { + return testCase{ + Env: map[string]string{ + "TS_AUTHKEY": "tskey-key", + }, + Phases: []phase{ + { + WantCmds: []string{ + "/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=mem: --statedir=/tmp --tun=userspace-networking", + "/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=false --authkey=tskey-key", + }, + }, + { + Notify: runningNotify, + }, + { + SignalTailscaled: ptr.To(unix.SIGKILL), + WantLog: "tailscaled exited unexpectedly with code 137", + WantExitCode: ptr.To(1), + }, + }, + } + }, "kube_shutdown_during_state_write": func(env *testEnv) testCase { return testCase{ Env: map[string]string{ @@ -1103,6 +1129,9 @@ func TestContainerBoot(t *testing.T) { if p.Signal != nil { cmd.Process.Signal(*p.Signal) } + if p.SignalTailscaled != nil { + signalTailscaled(t, env.d, *p.SignalTailscaled) + } if p.WantLog != "" { err := tstest.WaitFor(2*time.Second, func() error { waitLogLine(t, time.Second, cbOut, p.WantLog) @@ -1586,6 +1615,44 @@ func egressSvcConfig(name, fqdn string) egressservices.Configs { } } +// signalTailscaled reads the PID of the fake tailscaled process and sends it a signal. +func signalTailscaled(t *testing.T, rootDir string, sig syscall.Signal) { + t.Helper() + pidFile := filepath.Join(rootDir, "tmp/tailscaled.pid") + + // Wait for PID file to exist (tailscaled may not have written it yet). 
+ var pidBytes []byte + var err error + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + pidBytes, err = os.ReadFile(pidFile) + if err == nil { + break + } + if !errors.Is(err, fs.ErrNotExist) { + t.Fatalf("reading tailscaled PID: %v", err) + } + time.Sleep(10 * time.Millisecond) + } + if err != nil { + t.Fatalf("tailscaled PID file never appeared: %v", err) + } + + pid, err := strconv.Atoi(strings.TrimSpace(string(pidBytes))) + if err != nil { + t.Fatalf("parsing tailscaled PID %q: %v", string(pidBytes), err) + } + + proc, err := os.FindProcess(pid) + if err != nil { + t.Fatalf("finding tailscaled process %d: %v", pid, err) + } + + if err := proc.Signal(sig); err != nil { + t.Fatalf("signaling tailscaled with %v: %v", sig, err) + } +} + // testEnv represents the environment needed for a single sub-test so that tests // can run in parallel. type testEnv struct { diff --git a/cmd/containerboot/test_tailscaled.sh b/cmd/containerboot/test_tailscaled.sh index 335e2cb0d..ec7b15df4 100644 --- a/cmd/containerboot/test_tailscaled.sh +++ b/cmd/containerboot/test_tailscaled.sh @@ -35,4 +35,8 @@ fi ln -s "$TS_TEST_SOCKET" "$socket" trap 'rm -f "$socket"' EXIT +if [[ -n "$TS_TEST_ONLY_ROOT" ]]; then + echo $$ > "$TS_TEST_ONLY_ROOT/tmp/tailscaled.pid" +fi + while sleep 10; do :; done