cmd/containerboot: exit with non-zero code on unexpected tailscaled death

When tailscaled exits unexpectedly (it crashes or is killed directly), containerboot now exits with a non-zero code to signal failure to the orchestrator. The reaper now distinguishes between graceful shutdowns, which still exit 0, and unexpected exits, which propagate the child's exit code (128 plus the signal number if it was terminated by a signal), or force 1 if the child exited cleanly on its own.

Updates #17650

Signed-off-by: Raj Singh <raj@tailscale.com>
pull/17651/head
Raj Singh 1 month ago
parent 4346615d77
commit 877fabc038

@ -229,7 +229,10 @@ func run() error {
}
log.Printf("Sending SIGTERM to tailscaled")
if err := daemonProcess.Signal(unix.SIGTERM); err != nil {
log.Fatalf("error shutting tailscaled down: %v", err)
// Process may have already exited, which is fine.
if !errors.Is(err, unix.ESRCH) {
log.Printf("error shutting tailscaled down: %v", err)
}
}
}
defer killTailscaled()
@ -442,6 +445,8 @@ authLoop:
// egress services in HA mode and errored.
egressSvcsErrorChan := make(chan error)
ingressSvcsErrorChan := make(chan error)
// reaperExitChan receives the exit code when tailscaled exits.
reaperExitChan := make(chan int, 1)
defer t.Stop()
// resetTimer resets timer for when to next attempt to resolve the DNS
// name for the proxy configured with TS_EXPERIMENTAL_DEST_DNS_NAME. The
@ -728,8 +733,31 @@ runLoop:
if err != nil {
log.Fatalf("Waiting for tailscaled to exit: %v", err)
}
log.Print("tailscaled exited")
os.Exit(0)
// Signal exit code: 0 for graceful shutdown, non-zero for crash.
if ctx.Err() != nil {
// Graceful shutdown, we told tailscaled to exit.
log.Print("tailscaled exited after graceful shutdown")
reaperExitChan <- 0
return
}
// Unexpected exit, propagate the error.
exitCode := 1
switch {
case status.Exited():
exitCode = status.ExitStatus()
case status.Signaled():
log.Printf("tailscaled terminated by signal: %v", status.Signal())
exitCode = 128 + int(status.Signal())
}
log.Printf("tailscaled exited unexpectedly with code %d", exitCode)
if exitCode == 0 {
// Tailscaled exited cleanly on its own, still unexpected.
exitCode = 1
}
reaperExitChan <- exitCode
return
}
}
wg.Add(1)
@ -758,6 +786,11 @@ runLoop:
return fmt.Errorf("egress proxy failed: %v", e)
case e := <-ingressSvcsErrorChan:
return fmt.Errorf("ingress proxy failed: %v", e)
case exitCode := <-reaperExitChan:
if exitCode == 0 {
return nil
}
return fmt.Errorf("tailscaled exited unexpectedly")
}
}
wg.Wait()

@ -92,6 +92,9 @@ func TestContainerBoot(t *testing.T) {
// The signal to send to containerboot at the start of the phase.
Signal *syscall.Signal
// If set, send this signal to the fake tailscaled process.
SignalTailscaled *syscall.Signal
EndpointStatuses map[string]int
}
runningNotify := &ipn.Notify{
@ -993,6 +996,29 @@ func TestContainerBoot(t *testing.T) {
},
}
},
"tailscaled_unexpected_exit": func(env *testEnv) testCase {
return testCase{
Env: map[string]string{
"TS_AUTHKEY": "tskey-key",
},
Phases: []phase{
{
WantCmds: []string{
"/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=mem: --statedir=/tmp --tun=userspace-networking",
"/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=false --authkey=tskey-key",
},
},
{
Notify: runningNotify,
},
{
SignalTailscaled: ptr.To(unix.SIGKILL),
WantLog: "tailscaled exited unexpectedly with code 137",
WantExitCode: ptr.To(1),
},
},
}
},
"kube_shutdown_during_state_write": func(env *testEnv) testCase {
return testCase{
Env: map[string]string{
@ -1103,6 +1129,9 @@ func TestContainerBoot(t *testing.T) {
if p.Signal != nil {
cmd.Process.Signal(*p.Signal)
}
if p.SignalTailscaled != nil {
signalTailscaled(t, env.d, *p.SignalTailscaled)
}
if p.WantLog != "" {
err := tstest.WaitFor(2*time.Second, func() error {
waitLogLine(t, time.Second, cbOut, p.WantLog)
@ -1586,6 +1615,44 @@ func egressSvcConfig(name, fqdn string) egressservices.Configs {
}
}
// signalTailscaled reads the PID of the fake tailscaled process from
// <rootDir>/tmp/tailscaled.pid and sends sig to that process.
//
// The PID file is polled for up to two seconds. A file that exists but is
// still empty is treated as "not written yet": a writer that uses a shell
// redirect (echo $$ > file) creates the file before it writes the PID, so a
// read can land in between and see no content. Any read error other than
// "file does not exist", a file that never gains content, an unparsable PID,
// or a failure to deliver the signal fails the test immediately.
func signalTailscaled(t *testing.T, rootDir string, sig syscall.Signal) {
	t.Helper()
	pidFile := filepath.Join(rootDir, "tmp/tailscaled.pid")

	// Wait for the PID file to exist and to contain something.
	var pidStr string
	deadline := time.Now().Add(2 * time.Second)
	for {
		pidBytes, err := os.ReadFile(pidFile)
		if err != nil && !errors.Is(err, fs.ErrNotExist) {
			t.Fatalf("reading tailscaled PID: %v", err)
		}
		if err == nil {
			pidStr = strings.TrimSpace(string(pidBytes))
			if pidStr != "" {
				break
			}
		}
		if !time.Now().Before(deadline) {
			if err != nil {
				t.Fatalf("tailscaled PID file never appeared: %v", err)
			}
			t.Fatalf("tailscaled PID file %s still empty after waiting", pidFile)
		}
		time.Sleep(10 * time.Millisecond)
	}

	pid, err := strconv.Atoi(pidStr)
	if err != nil {
		t.Fatalf("parsing tailscaled PID %q: %v", pidStr, err)
	}
	proc, err := os.FindProcess(pid)
	if err != nil {
		t.Fatalf("finding tailscaled process %d: %v", pid, err)
	}
	if err := proc.Signal(sig); err != nil {
		t.Fatalf("signaling tailscaled with %v: %v", sig, err)
	}
}
// testEnv represents the environment needed for a single sub-test so that tests
// can run in parallel.
type testEnv struct {

@ -35,4 +35,8 @@ fi
ln -s "$TS_TEST_SOCKET" "$socket"
trap 'rm -f "$socket"' EXIT
if [[ -n "$TS_TEST_ONLY_ROOT" ]]; then
echo $$ > "$TS_TEST_ONLY_ROOT/tmp/tailscaled.pid"
fi
while sleep 10; do :; done

Loading…
Cancel
Save