diff --git a/cmd/containerboot/healthz.go b/cmd/containerboot/healthz.go index 6d03bd6d3..d6a64a37c 100644 --- a/cmd/containerboot/healthz.go +++ b/cmd/containerboot/healthz.go @@ -47,10 +47,10 @@ func (h *healthz) update(healthy bool) { h.hasAddrs = healthy } -// healthHandlers registers a simple health handler at /healthz. +// registerHealthHandlers registers a simple health handler at /healthz. // A containerized tailscale instance is considered healthy if // it has at least one tailnet IP address. -func healthHandlers(mux *http.ServeMux, podIPv4 string) *healthz { +func registerHealthHandlers(mux *http.ServeMux, podIPv4 string) *healthz { h := &healthz{podIPv4: podIPv4} mux.Handle("GET /healthz", h) return h diff --git a/cmd/containerboot/main.go b/cmd/containerboot/main.go index 5f8052bb9..9425571e6 100644 --- a/cmd/containerboot/main.go +++ b/cmd/containerboot/main.go @@ -195,18 +195,21 @@ func run() error { return fmt.Errorf("failed to bring up tailscale: %w", err) } killTailscaled := func() { + // The default termination grace period for a Pod is 30s. We wait 25s at + // most so that we still reserve some of that budget for tailscaled + // to receive and react to a SIGTERM before the SIGKILL that k8s + // will send at the end of the grace period. + ctx, cancel := context.WithTimeout(context.Background(), 25*time.Second) + defer cancel() + + if err := ensureServicesNotAdvertised(ctx, client); err != nil { + log.Printf("Error ensuring services are not advertised: %v", err) + } + if hasKubeStateStore(cfg) { // Check we're not shutting tailscaled down while it's still writing // state. If we authenticate and fail to write all the state, we'll // never recover automatically. - // - // The default termination grace period for a Pod is 30s. We wait 25s at - // most so that we still reserve some of that budget for tailscaled - // to receive and react to a SIGTERM before the SIGKILL that k8s - // will send at the end of the grace period. 
- ctx, cancel := context.WithTimeout(context.Background(), 25*time.Second) - defer cancel() - log.Printf("Checking for consistent state") err := kc.waitForConsistentState(ctx) if err != nil { @@ -226,7 +229,7 @@ func run() error { mux := http.NewServeMux() log.Printf("Running healthcheck endpoint at %s/healthz", cfg.HealthCheckAddrPort) - healthCheck = healthHandlers(mux, cfg.PodIPv4) + healthCheck = registerHealthHandlers(mux, cfg.PodIPv4) close := runHTTPServer(mux, cfg.HealthCheckAddrPort) defer close() @@ -237,15 +240,16 @@ func run() error { if cfg.localMetricsEnabled() { log.Printf("Running metrics endpoint at %s/metrics", cfg.LocalAddrPort) - metricsHandlers(mux, client, cfg.DebugAddrPort) + registerMetricsHandlers(mux, client, cfg.DebugAddrPort) } if cfg.localHealthEnabled() { log.Printf("Running healthcheck endpoint at %s/healthz", cfg.LocalAddrPort) - healthCheck = healthHandlers(mux, cfg.PodIPv4) + healthCheck = registerHealthHandlers(mux, cfg.PodIPv4) } - if cfg.EgressProxiesCfgPath != "" { - log.Printf("Running preshutdown hook at %s%s", cfg.LocalAddrPort, kubetypes.EgessServicesPreshutdownEP) + + if cfg.egressSvcsTerminateEPEnabled() { + log.Printf("Running egress preshutdown hook at %s%s", cfg.LocalAddrPort, kubetypes.EgessServicesPreshutdownEP) ep.registerHandlers(mux) } diff --git a/cmd/containerboot/metrics.go b/cmd/containerboot/metrics.go index 0bcd231ab..bbd050de6 100644 --- a/cmd/containerboot/metrics.go +++ b/cmd/containerboot/metrics.go @@ -62,13 +62,13 @@ func (m *metrics) handleDebug(w http.ResponseWriter, r *http.Request) { proxy(w, r, debugURL, http.DefaultClient.Do) } -// metricsHandlers registers a simple HTTP metrics handler at /metrics, forwarding +// registerMetricsHandlers registers a simple HTTP metrics handler at /metrics, forwarding // requests to tailscaled's /localapi/v0/usermetrics API. 
// // In 1.78.x and 1.80.x, it also proxies debug paths to tailscaled's debug // endpoint if configured to ease migration for a breaking change serving user // metrics instead of debug metrics on the "metrics" port. -func metricsHandlers(mux *http.ServeMux, lc *local.Client, debugAddrPort string) { +func registerMetricsHandlers(mux *http.ServeMux, lc *local.Client, debugAddrPort string) { m := &metrics{ lc: lc, debugEndpoint: debugAddrPort, diff --git a/cmd/containerboot/serve.go b/cmd/containerboot/serve.go index 37fd49777..bdf9432b5 100644 --- a/cmd/containerboot/serve.go +++ b/cmd/containerboot/serve.go @@ -9,6 +9,7 @@ import ( "bytes" "context" "encoding/json" + "fmt" "log" "os" "path/filepath" @@ -169,3 +170,46 @@ func readServeConfig(path, certDomain string) (*ipn.ServeConfig, error) { } return &sc, nil } + +func ensureServicesNotAdvertised(ctx context.Context, lc *local.Client) error { + prefs, err := lc.GetPrefs(ctx) + if err != nil { + return fmt.Errorf("error getting prefs: %w", err) + } + if len(prefs.AdvertiseServices) == 0 { + return nil + } + + log.Printf("serve proxy: unadvertising services: %v", prefs.AdvertiseServices) + if _, err := lc.EditPrefs(ctx, &ipn.MaskedPrefs{ + AdvertiseServicesSet: true, + Prefs: ipn.Prefs{ + AdvertiseServices: nil, + }, + }); err != nil { + // EditPrefs only returns an error if it fails to _set_ its local prefs. + // If it fails to _persist_ the prefs in state, we don't get an error + // and we continue waiting below, as control will fail over as usual. + return fmt.Errorf("error setting prefs AdvertiseServices: %w", err) + } + + // Services use the same (failover XOR regional routing) mechanism that + // HA subnet routers use. Unfortunately we don't yet get a reliable signal + // from control that it's responded to our unadvertisement, so the best we + // can do is wait for 20 seconds, where 15s is the approximate maximum time + // it should take for control to choose a new primary, and 5s is for buffer. 
+ // + // Note: There is no guarantee that clients have been _informed_ of the new + // primary no matter how long we wait. We would need a mechanism to await + // netmap updates for peers to know for sure. + // + // See https://tailscale.com/kb/1115/high-availability for more details. + // TODO(tomhjp): Wait for a netmap update instead of sleeping when control + // supports that. + select { + case <-ctx.Done(): + return nil + case <-time.After(20 * time.Second): + return nil + } +} diff --git a/cmd/k8s-operator/proxygroup_specs.go b/cmd/k8s-operator/proxygroup_specs.go index 16deea278..0cf88b738 100644 --- a/cmd/k8s-operator/proxygroup_specs.go +++ b/cmd/k8s-operator/proxygroup_specs.go @@ -197,6 +197,16 @@ func pgStatefulSet(pg *tsapi.ProxyGroup, namespace, image, tsFirewallMode string // This mechanism currently (2025-01-26) rely on the local health check being accessible on the Pod's // IP, so they are not supported for ProxyGroups where users have configured TS_LOCAL_ADDR_PORT to a custom // value. + // + // NB: For _Ingress_ ProxyGroups, we run shutdown logic within containerboot + // in reaction to a SIGTERM signal instead of using a pre-stop hook. This is + // because Ingress pods need to unadvertise services, and it's preferable to + // avoid triggering those side-effects from a GET request that would be + // accessible to the whole cluster network (in the absence of NetworkPolicy + // rules). + // + // TODO(tomhjp): add a readiness probe or gate to Ingress Pods. There is a + // small window where the Pod is marked ready but routing can still fail. if pg.Spec.Type == tsapi.ProxyGroupTypeEgress && !hasLocalAddrPortSet(proxyClass) { c.Lifecycle = &corev1.Lifecycle{ PreStop: &corev1.LifecycleHandler{