cmd/{containerboot,k8s-proxy}: disconnect from control before shutdown

This commit modifies containerboot to call the LocalAPI disconnect-control
endpoint to disconnect from the control plane upon shutdown. This functionality
is enabled using a new environment variable:

`EXPERIMENTAL_DISCONNECT_ON_SHUTDOWN`

When using an instance as an HA app connector or subnet router, this
environment variable should be set to give connected clients enough
time to update their netmap and fail over to the next active instance.
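
For illustration only (not part of this change): the disconnect goes through the
existing `DisconnectControl` method on `tailscale.com/client/local`'s `Client`,
roughly as in this minimal sketch. Timeout value and error handling here are
arbitrary choices for the example.

```go
package main

import (
	"context"
	"log"
	"time"

	"tailscale.com/client/local"
)

func main() {
	// Zero-value Client talks to the local tailscaled instance.
	var lc local.Client

	// Bound the call so shutdown can't hang on a slow or unreachable control plane.
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	// Ask tailscaled to drop its control-plane connection so that control can
	// start failing peers over to another replica.
	if err := lc.DisconnectControl(ctx); err != nil {
		log.Printf("disconnect-control failed: %v", err)
	}
}
```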

This commit also includes some minor spelling/grammar fixes.

Updates #14020

Signed-off-by: David Bond <davidsbond93@gmail.com>

@ -87,6 +87,9 @@
// cluster using the same hostname (in this case, the MagicDNS name of the ingress proxy)
// as a non-cluster workload on tailnet.
// This is only meant to be configured by the Kubernetes operator.
// - EXPERIMENTAL_DISCONNECT_ON_SHUTDOWN: if set to true, the containerboot instance
// will disconnect from the control plane on shutdown. This is used by HA subnet
// routers and exit nodes to more quickly trigger failover to other replicas.
//
// When running on Kubernetes, containerboot defaults to storing state in the
// "tailscale" kube secret. To store state on local disk instead, set
@ -118,6 +121,7 @@ import (
"time"
"golang.org/x/sys/unix"
"tailscale.com/client/local"
"tailscale.com/client/tailscale"
"tailscale.com/ipn"
kubeutils "tailscale.com/k8s-operator"
@ -205,34 +209,7 @@ func run() error {
if err != nil {
return fmt.Errorf("failed to bring up tailscale: %w", err)
}
killTailscaled := func() {
// The default termination grace period for a Pod is 30s. We wait 25s at
// most so that we still reserve some of that budget for tailscaled
// to receive and react to a SIGTERM before the SIGKILL that k8s
// will send at the end of the grace period.
ctx, cancel := context.WithTimeout(context.Background(), 25*time.Second)
defer cancel()
if err := services.EnsureServicesNotAdvertised(ctx, client, log.Printf); err != nil {
log.Printf("Error ensuring services are not advertised: %v", err)
}
if hasKubeStateStore(cfg) {
// Check we're not shutting tailscaled down while it's still writing
// state. If we authenticate and fail to write all the state, we'll
// never recover automatically.
log.Printf("Checking for consistent state")
err := kc.waitForConsistentState(ctx)
if err != nil {
log.Printf("Error waiting for consistent state on shutdown: %v", err)
}
}
log.Printf("Sending SIGTERM to tailscaled")
if err := daemonProcess.Signal(unix.SIGTERM); err != nil {
log.Fatalf("error shutting tailscaled down: %v", err)
}
}
defer killTailscaled()
defer killTailscaled(client, cfg, daemonProcess, kc)
var healthCheck *healthz.Healthz
ep := &egressProxy{}
@ -491,7 +468,7 @@ runLoop:
// have started the reaper defined below, we need to
// kill tailscaled and let reaper clean up child
// processes.
killTailscaled()
killTailscaled(client, cfg, daemonProcess, kc)
break runLoop
case err := <-errChan:
return fmt.Errorf("failed to read from tailscaled: %w", err)
@ -499,7 +476,7 @@ runLoop:
return fmt.Errorf("failed to watch tailscaled config: %w", err)
case n := <-notifyChan:
if n.State != nil && *n.State != ipn.Running {
// Something's gone wrong and we've left the authenticated state.
// Something's gone wrong, and we've left the authenticated state.
// Our container image never recovered gracefully from this, and the
// control flow required to make it work now is hard. So, just crash
// the container and rely on the container runtime to restart us,
@ -632,11 +609,11 @@ runLoop:
// route setup has succeeded. IPs and FQDN are
// read from the Secret by the Tailscale
// Kubernetes operator and, for some proxy
// types, such as Tailscale Ingress, advertized
// types, such as Tailscale Ingress, advertised
// on the Ingress status. Writing them to the
// Secret only after the proxy routing has been
// set up ensures that the operator does not
// advertize endpoints of broken proxies.
// advertise endpoints of broken proxies.
// TODO (irbekrm): instead of using the IP and FQDN, have some other mechanism for the proxy signal that it is 'Ready'.
deviceEndpoints := []any{n.NetMap.SelfNode.Name(), n.NetMap.SelfNode.Addresses()}
if hasKubeStateStore(cfg) && deephash.Update(&currentDeviceEndpoints, &deviceEndpoints) {
@ -892,3 +869,43 @@ func runHTTPServer(mux *http.ServeMux, addr string) (close func() error) {
return errors.Join(err, ln.Close())
}
}
func killTailscaled(client *local.Client, cfg *settings, daemonProcess *os.Process, kc *kubeClient) {
// The default termination grace period for a Pod is 30s. We wait 25s at
// most so that we still reserve some of that budget for tailscaled
// to receive and react to a SIGTERM before the SIGKILL that k8s
// will send at the end of the grace period.
ctx, cancel := context.WithTimeout(context.Background(), 25*time.Second)
defer cancel()
if err := services.EnsureServicesNotAdvertised(ctx, client, log.Printf); err != nil {
log.Printf("Error ensuring services are not advertised: %v", err)
}
if hasKubeStateStore(cfg) {
// Check we're not shutting tailscaled down while it's still writing
// state. If we authenticate and fail to write all the state, we'll
// never recover automatically.
log.Printf("Checking for consistent state")
if err := kc.waitForConsistentState(ctx); err != nil {
log.Printf("Error waiting for consistent state on shutdown: %v", err)
}
}
if cfg.DisconnectOnShutdown {
// Forcibly disconnect the local Tailscale instance from the control plane. This is useful when running as an HA
// app connector or subnet router to speed up switching over to another replica.
if err := client.DisconnectControl(ctx); err != nil {
log.Printf("Error disconnecting from control: %v", err)
}
}
log.Printf("Sending SIGTERM to tailscaled")
if err := daemonProcess.Signal(unix.SIGTERM); err != nil {
log.Fatalf("error shutting tailscaled down: %v", err)
}
// Run out the clock for the grace period, so that any clients still connected have enough time to get a netmap
// update and switch over.
<-ctx.Done()
}
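
As context for reviewers, here is a rough, hypothetical sketch of what the
`DisconnectControl` call amounts to on the wire: a POST to tailscaled's LocalAPI
`disconnect-control` endpoint over its unix socket (the same path the fake
LocalAPI in the tests below handles). The socket path and host are illustrative
assumptions, not necessarily the exact values the real client uses.

```go
package main

import (
	"context"
	"fmt"
	"log"
	"net"
	"net/http"
	"time"
)

// disconnectControlRaw is an illustration-only equivalent of
// local.Client.DisconnectControl: POST /localapi/v0/disconnect-control
// to tailscaled over its unix socket.
func disconnectControlRaw(ctx context.Context) error {
	c := &http.Client{
		Transport: &http.Transport{
			DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) {
				// Assumed default tailscaled socket location on Linux.
				return (&net.Dialer{}).DialContext(ctx, "unix", "/var/run/tailscale/tailscaled.sock")
			},
		},
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodPost,
		"http://local-tailscaled.sock/localapi/v0/disconnect-control", nil)
	if err != nil {
		return err
	}
	resp, err := c.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("disconnect-control: unexpected status %s", resp.Status)
	}
	return nil
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	if err := disconnectControlRaw(ctx); err != nil {
		log.Printf("disconnect-control failed: %v", err)
	}
}
```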

@ -1104,8 +1104,8 @@ func TestContainerBoot(t *testing.T) {
cmd.Process.Signal(*p.Signal)
}
if p.WantLog != "" {
err := tstest.WaitFor(2*time.Second, func() error {
waitLogLine(t, time.Second, cbOut, p.WantLog)
err := tstest.WaitFor(time.Minute, func() error {
waitLogLine(t, time.Minute, cbOut, p.WantLog)
return nil
})
if err != nil {
@ -1213,6 +1213,8 @@ func (b *lockingBuffer) String() string {
// waitLogLine fails the entire test if path doesn't contain want
// before the timeout.
func waitLogLine(t *testing.T, timeout time.Duration, b *lockingBuffer, want string) {
t.Helper()
deadline := time.Now().Add(timeout)
for time.Now().Before(deadline) {
for _, line := range strings.Split(b.String(), "\n") {
@ -1338,6 +1340,17 @@ func (l *localAPI) ServeHTTP(w http.ResponseWriter, r *http.Request) {
}
w.Write([]byte("fake metrics"))
return
case "/localapi/v0/disconnect-control":
if r.Method != "POST" {
panic(fmt.Sprintf("unsupported method %q", r.Method))
}
case "/localapi/v0/prefs":
if r.Method != "GET" {
panic(fmt.Sprintf("unsupported method %q", r.Method))
}
w.Write([]byte("{}"))
return
default:
panic(fmt.Sprintf("unsupported path %q", r.URL.Path))
}

@ -81,6 +81,12 @@ type settings struct {
// certs) and 'rw' for Pods that should manage the TLS certs shared
// amongst the replicas.
CertShareMode string
// DisconnectOnShutdown is set for subnet routers & app connectors that
// are running in an HA configuration. When set, it forces the application
// to wait for the entirety of the termination grace period before exiting
// to give time for clients to receive an updated netmap that points them
// to an active subnet router/app connector.
DisconnectOnShutdown bool
}
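
A hedged sketch (not included in this change) of how the new field could be
exercised in a unit test alongside the existing containerboot tests; it assumes
`configFromEnv` can run with only this variable set, which may need additional
env scaffolding in practice.

```go
func TestDisconnectOnShutdownEnv(t *testing.T) {
	// Hypothetical test, for illustration: the new flag is parsed like any
	// other containerboot boolean env var.
	t.Setenv("EXPERIMENTAL_DISCONNECT_ON_SHUTDOWN", "true")

	cfg, err := configFromEnv()
	if err != nil {
		t.Fatalf("configFromEnv: %v", err)
	}
	if !cfg.DisconnectOnShutdown {
		t.Fatal("expected DisconnectOnShutdown to be true")
	}
}
```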
func configFromEnv() (*settings, error) {
@ -117,6 +123,7 @@ func configFromEnv() (*settings, error) {
EgressProxiesCfgPath: defaultEnv("TS_EGRESS_PROXIES_CONFIG_PATH", ""),
IngressProxiesCfgPath: defaultEnv("TS_INGRESS_PROXIES_CONFIG_PATH", ""),
PodUID: defaultEnv("POD_UID", ""),
DisconnectOnShutdown: defaultBool("EXPERIMENTAL_DISCONNECT_ON_SHUTDOWN", false),
}
podIPs, ok := os.LookupEnv("POD_IPS")
if ok {

@ -318,9 +318,9 @@ func run(logger *zap.SugaredLogger) error {
// Context cancelled, exit.
logger.Info("Context cancelled, exiting")
shutdownCtx, shutdownCancel := context.WithTimeout(serveCtx, 20*time.Second)
defer shutdownCancel()
unadvertiseErr := services.EnsureServicesNotAdvertised(shutdownCtx, lc, logger.Infof)
shutdownCancel()
serveCancel()
<-shutdownCtx.Done()
return errors.Join(unadvertiseErr, group.Wait())
case cfg = <-cfgChan:
// Handle config reload.

@ -8,7 +8,6 @@ package services
import (
"context"
"fmt"
"time"
"tailscale.com/client/local"
"tailscale.com/ipn"
@ -24,40 +23,21 @@ func EnsureServicesNotAdvertised(ctx context.Context, lc *local.Client, logf log
if err != nil {
return fmt.Errorf("error getting prefs: %w", err)
}
if len(prefs.AdvertiseServices) == 0 {
return nil
}
logf("unadvertising services: %v", prefs.AdvertiseServices)
if _, err := lc.EditPrefs(ctx, &ipn.MaskedPrefs{
_, err = lc.EditPrefs(ctx, &ipn.MaskedPrefs{
AdvertiseServicesSet: true,
Prefs: ipn.Prefs{
AdvertiseServices: nil,
},
}); err != nil {
// EditPrefs only returns an error if it fails _set_ its local prefs.
// If it fails to _persist_ the prefs in state, we don't get an error
// and we continue waiting below, as control will failover as usual.
}})
if err != nil {
// EditPrefs only returns an error if it fails to _set_ its local prefs.
return fmt.Errorf("error setting prefs AdvertiseServices: %w", err)
}
// Services use the same (failover XOR regional routing) mechanism that
// HA subnet routers use. Unfortunately we don't yet get a reliable signal
// from control that it's responded to our unadvertisement, so the best we
// can do is wait for 20 seconds, where 15s is the approximate maximum time
// it should take for control to choose a new primary, and 5s is for buffer.
//
// Note: There is no guarantee that clients have been _informed_ of the new
// primary no matter how long we wait. We would need a mechanism to await
// netmap updates for peers to know for sure.
//
// See https://tailscale.com/kb/1115/high-availability for more details.
// TODO(tomhjp): Wait for a netmap update instead of sleeping when control
// supports that.
select {
case <-ctx.Done():
return nil
case <-time.After(20 * time.Second):
return nil
}
return nil
}
