You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
tailscale/cmd/containerboot/main.go

770 lines
31 KiB
Go

// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
//go:build linux
// The containerboot binary is a wrapper for starting tailscaled in a container.
// It handles reading the desired mode of operation out of environment
// variables, bringing up and authenticating Tailscale, and any other
// kubernetes-specific side jobs.
//
// As with most container things, configuration is passed through environment
// variables. All configuration is optional.
//
// - TS_AUTHKEY: the authkey to use for login.
// - TS_HOSTNAME: the hostname to request for the node.
// - TS_ROUTES: subnet routes to advertise. Explicitly setting it to an empty
// value will cause containerboot to stop acting as a subnet router for any
// previously advertised routes. To accept routes, use TS_EXTRA_ARGS to pass
// in --accept-routes.
// - TS_DEST_IP: proxy all incoming Tailscale traffic to the given
// destination defined by an IP address.
// - TS_EXPERIMENTAL_DEST_DNS_NAME: proxy all incoming Tailscale traffic to the given
// destination defined by a DNS name. The DNS name will be periodically resolved and firewall rules updated accordingly.
// This is currently intended to be used by the Kubernetes operator (ExternalName Services).
// This is an experimental env var and will likely change in the future.
// - TS_TAILNET_TARGET_IP: proxy all incoming non-Tailscale traffic to the given
// destination defined by an IP.
// - TS_TAILNET_TARGET_FQDN: proxy all incoming non-Tailscale traffic to the given
// destination defined by a MagicDNS name.
// - TS_TAILSCALED_EXTRA_ARGS: extra arguments to 'tailscaled'.
// - TS_EXTRA_ARGS: extra arguments to 'tailscale up'.
// - TS_USERSPACE: run with userspace networking (the default)
// instead of kernel networking.
// - TS_STATE_DIR: the directory in which to store tailscaled
// state. The data should persist across container
// restarts.
// - TS_ACCEPT_DNS: whether to use the tailnet's DNS configuration.
// - TS_KUBE_SECRET: the name of the Kubernetes secret in which to
// store tailscaled state.
// - TS_SOCKS5_SERVER: the address on which to listen for SOCKS5
// proxying into the tailnet.
// - TS_OUTBOUND_HTTP_PROXY_LISTEN: the address on which to listen
// for HTTP proxying into the tailnet.
// - TS_SOCKET: the path where the tailscaled LocalAPI socket should
// be created.
// - TS_AUTH_ONCE: if true, only attempt to log in if not already
// logged in. If false (the default, for backwards
// compatibility), forcibly log in every time the
// container starts.
// - TS_SERVE_CONFIG: if specified, is the file path where the ipn.ServeConfig is located.
// It will be applied once tailscaled is up and running. If the file contains
// ${TS_CERT_DOMAIN}, it will be replaced with the value of the available FQDN.
// It cannot be used in conjunction with TS_DEST_IP. The file is watched for changes,
// and will be re-applied when it changes.
// - TS_HEALTHCHECK_ADDR_PORT: if specified, an HTTP health endpoint will be
// served at /healthz at the provided address, which should be in form [<address>]:<port>.
// If not set, no health check will be run. If set to :<port>, addr will default to 0.0.0.0
// The health endpoint will return 200 OK if this node has at least one tailnet IP address,
// otherwise returns 503.
// NB: the health criteria might change in the future.
// - TS_EXPERIMENTAL_VERSIONED_CONFIG_DIR: if specified, a path to a
// directory that containers tailscaled config in file. The config file needs to be
// named cap-<current-tailscaled-cap>.hujson. If this is set, TS_HOSTNAME,
// TS_EXTRA_ARGS, TS_AUTHKEY,
// TS_ROUTES, TS_ACCEPT_DNS env vars must not be set. If this is set,
// containerboot only runs `tailscaled --config <path-to-this-configfile>`
// and not `tailscale up` or `tailscale set`.
// The config file contents are currently read once on container start.
// NB: This env var is currently experimental and the logic will likely change!
// TS_EXPERIMENTAL_ENABLE_FORWARDING_OPTIMIZATIONS: set to true to
// autoconfigure the default network interface for optimal performance for
// Tailscale subnet router/exit node.
// https://tailscale.com/kb/1320/performance-best-practices#linux-optimizations-for-subnet-routers-and-exit-nodes
// NB: This env var is currently experimental and the logic will likely change!
// - EXPERIMENTAL_ALLOW_PROXYING_CLUSTER_TRAFFIC_VIA_INGRESS: if set to true
// and if this containerboot instance is an L7 ingress proxy (created by
// the Kubernetes operator), set up rules to allow proxying cluster traffic,
// received on the Pod IP of this node, to the ingress target in the cluster.
// This, in conjunction with MagicDNS name resolution in cluster, can be
// useful for cases where a cluster workload needs to access a target in
// cluster using the same hostname (in this case, the MagicDNS name of the ingress proxy)
// as a non-cluster workload on tailnet.
// This is only meant to be configured by the Kubernetes operator.
//
// When running on Kubernetes, containerboot defaults to storing state in the
// "tailscale" kube secret. To store state on local disk instead, set
// TS_KUBE_SECRET="" and TS_STATE_DIR=/path/to/storage/dir. The state dir should
// be persistent storage.
//
// Additionally, if TS_AUTHKEY is not set and the TS_KUBE_SECRET contains an
// "authkey" field, that key is used as the tailscale authkey.
package main
import (
"context"
"errors"
"fmt"
"io/fs"
"log"
"math"
"net"
"net/netip"
"os"
"os/signal"
"path"
"path/filepath"
"slices"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"golang.org/x/sys/unix"
"tailscale.com/client/tailscale"
"tailscale.com/ipn"
kubeutils "tailscale.com/k8s-operator"
"tailscale.com/tailcfg"
"tailscale.com/types/logger"
"tailscale.com/types/ptr"
"tailscale.com/util/deephash"
"tailscale.com/util/linuxfw"
)
func newNetfilterRunner(logf logger.Logf) (linuxfw.NetfilterRunner, error) {
if defaultBool("TS_TEST_FAKE_NETFILTER", false) {
return linuxfw.NewFakeIPTablesRunner(), nil
}
return linuxfw.New(logf, "")
}
func main() {
log.SetPrefix("boot: ")
tailscale.I_Acknowledge_This_API_Is_Unstable = true
cfg := &settings{
AuthKey: defaultEnvs([]string{"TS_AUTHKEY", "TS_AUTH_KEY"}, ""),
Hostname: defaultEnv("TS_HOSTNAME", ""),
Routes: defaultEnvStringPointer("TS_ROUTES"),
ServeConfigPath: defaultEnv("TS_SERVE_CONFIG", ""),
ProxyTargetIP: defaultEnv("TS_DEST_IP", ""),
ProxyTargetDNSName: defaultEnv("TS_EXPERIMENTAL_DEST_DNS_NAME", ""),
TailnetTargetIP: defaultEnv("TS_TAILNET_TARGET_IP", ""),
TailnetTargetFQDN: defaultEnv("TS_TAILNET_TARGET_FQDN", ""),
DaemonExtraArgs: defaultEnv("TS_TAILSCALED_EXTRA_ARGS", ""),
ExtraArgs: defaultEnv("TS_EXTRA_ARGS", ""),
InKubernetes: os.Getenv("KUBERNETES_SERVICE_HOST") != "",
UserspaceMode: defaultBool("TS_USERSPACE", true),
StateDir: defaultEnv("TS_STATE_DIR", ""),
AcceptDNS: defaultEnvBoolPointer("TS_ACCEPT_DNS"),
KubeSecret: defaultEnv("TS_KUBE_SECRET", "tailscale"),
SOCKSProxyAddr: defaultEnv("TS_SOCKS5_SERVER", ""),
HTTPProxyAddr: defaultEnv("TS_OUTBOUND_HTTP_PROXY_LISTEN", ""),
Socket: defaultEnv("TS_SOCKET", "/tmp/tailscaled.sock"),
AuthOnce: defaultBool("TS_AUTH_ONCE", false),
Root: defaultEnv("TS_TEST_ONLY_ROOT", "/"),
TailscaledConfigFilePath: tailscaledConfigFilePath(),
AllowProxyingClusterTrafficViaIngress: defaultBool("EXPERIMENTAL_ALLOW_PROXYING_CLUSTER_TRAFFIC_VIA_INGRESS", false),
PodIP: defaultEnv("POD_IP", ""),
EnableForwardingOptimizations: defaultBool("TS_EXPERIMENTAL_ENABLE_FORWARDING_OPTIMIZATIONS", false),
HealthCheckAddrPort: defaultEnv("TS_HEALTHCHECK_ADDR_PORT", ""),
EgressSvcsCfgPath: defaultEnv("TS_EGRESS_SERVICES_CONFIG_PATH", ""),
}
if err := cfg.validate(); err != nil {
log.Fatalf("invalid configuration: %v", err)
}
if !cfg.UserspaceMode {
if err := ensureTunFile(cfg.Root); err != nil {
log.Fatalf("Unable to create tuntap device file: %v", err)
}
if cfg.ProxyTargetIP != "" || cfg.ProxyTargetDNSName != "" || cfg.Routes != nil || cfg.TailnetTargetIP != "" || cfg.TailnetTargetFQDN != "" {
if err := ensureIPForwarding(cfg.Root, cfg.ProxyTargetIP, cfg.TailnetTargetIP, cfg.TailnetTargetFQDN, cfg.Routes); err != nil {
log.Printf("Failed to enable IP forwarding: %v", err)
log.Printf("To run tailscale as a proxy or router container, IP forwarding must be enabled.")
if cfg.InKubernetes {
log.Fatalf("You can either set the sysctls as a privileged initContainer, or run the tailscale container with privileged=true.")
} else {
log.Fatalf("You can fix this by running the container with privileged=true, or the equivalent in your container runtime that permits access to sysctls.")
}
}
}
}
// Context is used for all setup stuff until we're in steady
// state, so that if something is hanging we eventually time out
// and crashloop the container.
bootCtx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
defer cancel()
if cfg.InKubernetes {
initKubeClient(cfg.Root)
if err := cfg.setupKube(bootCtx); err != nil {
log.Fatalf("error setting up for running on Kubernetes: %v", err)
}
}
client, daemonProcess, err := startTailscaled(bootCtx, cfg)
if err != nil {
log.Fatalf("failed to bring up tailscale: %v", err)
}
killTailscaled := func() {
if err := daemonProcess.Signal(unix.SIGTERM); err != nil {
log.Fatalf("error shutting tailscaled down: %v", err)
}
}
defer killTailscaled()
if cfg.EnableForwardingOptimizations {
if err := client.SetUDPGROForwarding(bootCtx); err != nil {
log.Printf("[unexpected] error enabling UDP GRO forwarding: %v", err)
}
}
w, err := client.WatchIPNBus(bootCtx, ipn.NotifyInitialNetMap|ipn.NotifyInitialPrefs|ipn.NotifyInitialState)
if err != nil {
log.Fatalf("failed to watch tailscaled for updates: %v", err)
}
// Now that we've started tailscaled, we can symlink the socket to the
// default location if needed.
const defaultTailscaledSocketPath = "/var/run/tailscale/tailscaled.sock"
if cfg.Socket != "" && cfg.Socket != defaultTailscaledSocketPath {
// If we were given a socket path, symlink it to the default location so
// that the CLI can find it without any extra flags.
// See #6849.
dir := filepath.Dir(defaultTailscaledSocketPath)
err := os.MkdirAll(dir, 0700)
if err == nil {
err = syscall.Symlink(cfg.Socket, defaultTailscaledSocketPath)
}
if err != nil {
log.Printf("[warning] failed to symlink socket: %v\n\tTo interact with the Tailscale CLI please use `tailscale --socket=%q`", err, cfg.Socket)
}
}
// Because we're still shelling out to `tailscale up` to get access to its
// flag parser, we have to stop watching the IPN bus so that we can block on
// the subcommand without stalling anything. Then once it's done, we resume
// watching the bus.
//
// Depending on the requested mode of operation, this auth step happens at
// different points in containerboot's lifecycle, hence the helper function.
didLogin := false
authTailscale := func() error {
if didLogin {
return nil
}
didLogin = true
w.Close()
if err := tailscaleUp(bootCtx, cfg); err != nil {
return fmt.Errorf("failed to auth tailscale: %v", err)
}
w, err = client.WatchIPNBus(bootCtx, ipn.NotifyInitialNetMap|ipn.NotifyInitialState)
if err != nil {
return fmt.Errorf("rewatching tailscaled for updates after auth: %v", err)
}
return nil
}
if isTwoStepConfigAlwaysAuth(cfg) {
if err := authTailscale(); err != nil {
log.Fatalf("failed to auth tailscale: %v", err)
}
}
authLoop:
for {
n, err := w.Next()
if err != nil {
log.Fatalf("failed to read from tailscaled: %v", err)
}
if n.State != nil {
switch *n.State {
case ipn.NeedsLogin:
if isOneStepConfig(cfg) {
// This could happen if this is the first time tailscaled was run for this
// device and the auth key was not passed via the configfile.
log.Fatalf("invalid state: tailscaled daemon started with a config file, but tailscale is not logged in: ensure you pass a valid auth key in the config file.")
}
if err := authTailscale(); err != nil {
log.Fatalf("failed to auth tailscale: %v", err)
}
case ipn.NeedsMachineAuth:
log.Printf("machine authorization required, please visit the admin panel")
case ipn.Running:
// Technically, all we want is to keep monitoring the bus for
// netmap updates. However, in order to make the container crash
// if tailscale doesn't initially come up, the watch has a
// startup deadline on it. So, we have to break out of this
// watch loop, cancel the watch, and watch again with no
// deadline to continue monitoring for changes.
break authLoop
default:
log.Printf("tailscaled in state %q, waiting", *n.State)
}
}
}
w.Close()
ctx, cancel := contextWithExitSignalWatch()
defer cancel()
if isTwoStepConfigAuthOnce(cfg) {
// Now that we are authenticated, we can set/reset any of the
// settings that we need to.
if err := tailscaleSet(ctx, cfg); err != nil {
log.Fatalf("failed to auth tailscale: %v", err)
}
}
if cfg.ServeConfigPath != "" {
// Remove any serve config that may have been set by a previous run of
// containerboot, but only if we're providing a new one.
if err := client.SetServeConfig(ctx, new(ipn.ServeConfig)); err != nil {
log.Fatalf("failed to unset serve config: %v", err)
}
}
if hasKubeStateStore(cfg) && isTwoStepConfigAuthOnce(cfg) {
// We were told to only auth once, so any secret-bound
// authkey is no longer needed. We don't strictly need to
// wipe it, but it's good hygiene.
log.Printf("Deleting authkey from kube secret")
if err := deleteAuthKey(ctx, cfg.KubeSecret); err != nil {
log.Fatalf("deleting authkey from kube secret: %v", err)
}
}
w, err = client.WatchIPNBus(ctx, ipn.NotifyInitialNetMap|ipn.NotifyInitialState)
if err != nil {
log.Fatalf("rewatching tailscaled for updates after auth: %v", err)
}
var (
startupTasksDone = false
currentIPs deephash.Sum // tailscale IPs assigned to device
currentDeviceID deephash.Sum // device ID
currentDeviceEndpoints deephash.Sum // device FQDN and IPs
currentEgressIPs deephash.Sum
addrs []netip.Prefix
backendAddrs []net.IP
certDomain = new(atomic.Pointer[string])
certDomainChanged = make(chan bool, 1)
h = &healthz{} // http server for the healthz endpoint
healthzRunner = sync.OnceFunc(func() { runHealthz(cfg.HealthCheckAddrPort, h) })
)
if cfg.ServeConfigPath != "" {
go watchServeConfigChanges(ctx, cfg.ServeConfigPath, certDomainChanged, certDomain, client)
}
var nfr linuxfw.NetfilterRunner
if isL3Proxy(cfg) {
nfr, err = newNetfilterRunner(log.Printf)
if err != nil {
log.Fatalf("error creating new netfilter runner: %v", err)
}
}
// Setup for proxies that are configured to proxy to a target specified
// by a DNS name (TS_EXPERIMENTAL_DEST_DNS_NAME).
const defaultCheckPeriod = time.Minute * 10 // how often to check what IPs the DNS name resolves to
var (
tc = make(chan string, 1)
failedResolveAttempts int
t *time.Timer = time.AfterFunc(defaultCheckPeriod, func() {
if cfg.ProxyTargetDNSName != "" {
tc <- "recheck"
}
})
)
// egressSvcsErrorChan will get an error sent to it if this containerboot instance is configured to expose 1+
// egress services in HA mode and errored.
var egressSvcsErrorChan = make(chan error)
defer t.Stop()
// resetTimer resets timer for when to next attempt to resolve the DNS
// name for the proxy configured with TS_EXPERIMENTAL_DEST_DNS_NAME. The
// timer gets reset to 10 minutes from now unless the last resolution
// attempt failed. If one or more consecutive previous resolution
// attempts failed, the next resolution attempt will happen after the smallest
// of (10 minutes, 2 ^ number-of-consecutive-failed-resolution-attempts
// seconds) i.e 2s, 4s, 8s ... 10 minutes.
resetTimer := func(lastResolveFailed bool) {
if !lastResolveFailed {
log.Printf("reconfigureTimer: next DNS resolution attempt in %s", defaultCheckPeriod)
t.Reset(defaultCheckPeriod)
failedResolveAttempts = 0
return
}
minDelay := 2 // 2 seconds
nextTick := time.Second * time.Duration(math.Pow(float64(minDelay), float64(failedResolveAttempts)))
if nextTick > defaultCheckPeriod {
nextTick = defaultCheckPeriod // cap at 10 minutes
}
log.Printf("reconfigureTimer: last DNS resolution attempt failed, next DNS resolution attempt in %v", nextTick)
t.Reset(nextTick)
failedResolveAttempts++
}
var egressSvcsNotify chan ipn.Notify
notifyChan := make(chan ipn.Notify)
errChan := make(chan error)
go func() {
for {
n, err := w.Next()
if err != nil {
errChan <- err
break
} else {
notifyChan <- n
}
}
}()
var wg sync.WaitGroup
runLoop:
for {
select {
case <-ctx.Done():
// Although killTailscaled() is deferred earlier, if we
// have started the reaper defined below, we need to
// kill tailscaled and let reaper clean up child
// processes.
killTailscaled()
break runLoop
case err := <-errChan:
log.Fatalf("failed to read from tailscaled: %v", err)
case n := <-notifyChan:
if n.State != nil && *n.State != ipn.Running {
// Something's gone wrong and we've left the authenticated state.
// Our container image never recovered gracefully from this, and the
// control flow required to make it work now is hard. So, just crash
// the container and rely on the container runtime to restart us,
// whereupon we'll go through initial auth again.
log.Fatalf("tailscaled left running state (now in state %q), exiting", *n.State)
}
if n.NetMap != nil {
addrs = n.NetMap.SelfNode.Addresses().AsSlice()
newCurrentIPs := deephash.Hash(&addrs)
ipsHaveChanged := newCurrentIPs != currentIPs
// Store device ID in a Kubernetes Secret before
// setting up any routing rules. This ensures
// that, for containerboot instances that are
// Kubernetes operator proxies, the operator is
// able to retrieve the device ID from the
// Kubernetes Secret to clean up tailnet nodes
// for proxies whose route setup continuously
// fails.
deviceID := n.NetMap.SelfNode.StableID()
if hasKubeStateStore(cfg) && deephash.Update(&currentDeviceID, &deviceID) {
if err := storeDeviceID(ctx, cfg.KubeSecret, n.NetMap.SelfNode.StableID()); err != nil {
log.Fatalf("storing device ID in Kubernetes Secret: %v", err)
}
}
if cfg.TailnetTargetFQDN != "" {
var (
egressAddrs []netip.Prefix
newCurentEgressIPs deephash.Sum
egressIPsHaveChanged bool
node tailcfg.NodeView
nodeFound bool
)
for _, n := range n.NetMap.Peers {
if strings.EqualFold(n.Name(), cfg.TailnetTargetFQDN) {
node = n
nodeFound = true
break
}
}
if !nodeFound {
log.Printf("Tailscale node %q not found; it either does not exist, or not reachable because of ACLs", cfg.TailnetTargetFQDN)
break
}
egressAddrs = node.Addresses().AsSlice()
newCurentEgressIPs = deephash.Hash(&egressAddrs)
egressIPsHaveChanged = newCurentEgressIPs != currentEgressIPs
if egressIPsHaveChanged && len(egressAddrs) != 0 {
var rulesInstalled bool
for _, egressAddr := range egressAddrs {
ea := egressAddr.Addr()
if ea.Is4() || (ea.Is6() && nfr.HasIPV6NAT()) {
rulesInstalled = true
log.Printf("Installing forwarding rules for destination %v", ea.String())
if err := installEgressForwardingRule(ctx, ea.String(), addrs, nfr); err != nil {
log.Fatalf("installing egress proxy rules for destination %s: %v", ea.String(), err)
}
}
}
if !rulesInstalled {
log.Fatalf("no forwarding rules for egress addresses %v, host supports IPv6: %v", egressAddrs, nfr.HasIPV6NAT())
}
}
currentEgressIPs = newCurentEgressIPs
}
if cfg.ProxyTargetIP != "" && len(addrs) != 0 && ipsHaveChanged {
log.Printf("Installing proxy rules")
if err := installIngressForwardingRule(ctx, cfg.ProxyTargetIP, addrs, nfr); err != nil {
log.Fatalf("installing ingress proxy rules: %v", err)
}
}
if cfg.ProxyTargetDNSName != "" && len(addrs) != 0 && ipsHaveChanged {
newBackendAddrs, err := resolveDNS(ctx, cfg.ProxyTargetDNSName)
if err != nil {
log.Printf("[unexpected] error resolving DNS name %s: %v", cfg.ProxyTargetDNSName, err)
resetTimer(true)
continue
}
backendsHaveChanged := !(slices.EqualFunc(backendAddrs, newBackendAddrs, func(ip1 net.IP, ip2 net.IP) bool {
return slices.ContainsFunc(newBackendAddrs, func(ip net.IP) bool { return ip.Equal(ip1) })
}))
if backendsHaveChanged {
log.Printf("installing ingress proxy rules for backends %v", newBackendAddrs)
if err := installIngressForwardingRuleForDNSTarget(ctx, newBackendAddrs, addrs, nfr); err != nil {
log.Fatalf("error installing ingress proxy rules: %v", err)
}
}
resetTimer(false)
backendAddrs = newBackendAddrs
}
if cfg.ServeConfigPath != "" && len(n.NetMap.DNS.CertDomains) != 0 {
cd := n.NetMap.DNS.CertDomains[0]
prev := certDomain.Swap(ptr.To(cd))
if prev == nil || *prev != cd {
select {
case certDomainChanged <- true:
default:
}
}
}
if cfg.TailnetTargetIP != "" && ipsHaveChanged && len(addrs) != 0 {
log.Printf("Installing forwarding rules for destination %v", cfg.TailnetTargetIP)
if err := installEgressForwardingRule(ctx, cfg.TailnetTargetIP, addrs, nfr); err != nil {
log.Fatalf("installing egress proxy rules: %v", err)
}
}
// If this is a L7 cluster ingress proxy (set up
// by Kubernetes operator) and proxying of
// cluster traffic to the ingress target is
// enabled, set up proxy rule each time the
// tailnet IPs of this node change (including
// the first time they become available).
if cfg.AllowProxyingClusterTrafficViaIngress && cfg.ServeConfigPath != "" && ipsHaveChanged && len(addrs) != 0 {
log.Printf("installing rules to forward traffic for %s to node's tailnet IP", cfg.PodIP)
if err := installTSForwardingRuleForDestination(ctx, cfg.PodIP, addrs, nfr); err != nil {
log.Fatalf("installing rules to forward traffic to node's tailnet IP: %v", err)
}
}
currentIPs = newCurrentIPs
// Only store device FQDN and IP addresses to
// Kubernetes Secret when any required proxy
// route setup has succeeded. IPs and FQDN are
// read from the Secret by the Tailscale
// Kubernetes operator and, for some proxy
// types, such as Tailscale Ingress, advertized
// on the Ingress status. Writing them to the
// Secret only after the proxy routing has been
// set up ensures that the operator does not
// advertize endpoints of broken proxies.
// TODO (irbekrm): instead of using the IP and FQDN, have some other mechanism for the proxy signal that it is 'Ready'.
deviceEndpoints := []any{n.NetMap.SelfNode.Name(), n.NetMap.SelfNode.Addresses()}
if hasKubeStateStore(cfg) && deephash.Update(&currentDeviceEndpoints, &deviceEndpoints) {
if err := storeDeviceEndpoints(ctx, cfg.KubeSecret, n.NetMap.SelfNode.Name(), n.NetMap.SelfNode.Addresses().AsSlice()); err != nil {
log.Fatalf("storing device IPs and FQDN in Kubernetes Secret: %v", err)
}
}
if cfg.HealthCheckAddrPort != "" {
h.Lock()
h.hasAddrs = len(addrs) != 0
h.Unlock()
healthzRunner()
}
if egressSvcsNotify != nil {
egressSvcsNotify <- n
}
}
if !startupTasksDone {
// For containerboot instances that act as TCP proxies (proxying traffic to an endpoint
// passed via one of the env vars that containerboot reads) and store state in a
// Kubernetes Secret, we consider startup tasks done at the point when device info has
// been successfully stored to state Secret. For all other containerboot instances, if
// we just get to this point the startup tasks can be considered done.
if !isL3Proxy(cfg) || !hasKubeStateStore(cfg) || (currentDeviceEndpoints != deephash.Sum{} && currentDeviceID != deephash.Sum{}) {
// This log message is used in tests to detect when all
// post-auth configuration is done.
log.Println("Startup complete, waiting for shutdown signal")
startupTasksDone = true
// Configure egress proxy. Egress proxy will set up firewall rules to proxy
// traffic to tailnet targets configured in the provided configuration file. It
// will then continuously monitor the config file and netmap updates and
// reconfigure the firewall rules as needed. If any of its operations fail, it
// will crash this node.
if cfg.EgressSvcsCfgPath != "" {
log.Printf("configuring egress proxy using configuration file at %s", cfg.EgressSvcsCfgPath)
egressSvcsNotify = make(chan ipn.Notify)
ep := egressProxy{
cfgPath: cfg.EgressSvcsCfgPath,
nfr: nfr,
kc: kc,
stateSecret: cfg.KubeSecret,
netmapChan: egressSvcsNotify,
podIP: cfg.PodIP,
tailnetAddrs: addrs,
}
go func() {
if err := ep.run(ctx, n); err != nil {
egressSvcsErrorChan <- err
}
}()
}
// Wait on tailscaled process. It won't be cleaned up by default when the
// container exits as it is not PID1. TODO (irbekrm): perhaps we can replace the
// reaper by a running cmd.Wait in a goroutine immediately after starting
// tailscaled?
reaper := func() {
defer wg.Done()
for {
var status unix.WaitStatus
_, err := unix.Wait4(daemonProcess.Pid, &status, 0, nil)
if errors.Is(err, unix.EINTR) {
continue
}
if err != nil {
log.Fatalf("Waiting for tailscaled to exit: %v", err)
}
log.Print("tailscaled exited")
os.Exit(0)
}
}
wg.Add(1)
go reaper()
}
}
case <-tc:
newBackendAddrs, err := resolveDNS(ctx, cfg.ProxyTargetDNSName)
if err != nil {
log.Printf("[unexpected] error resolving DNS name %s: %v", cfg.ProxyTargetDNSName, err)
resetTimer(true)
continue
}
backendsHaveChanged := !(slices.EqualFunc(backendAddrs, newBackendAddrs, func(ip1 net.IP, ip2 net.IP) bool {
return slices.ContainsFunc(newBackendAddrs, func(ip net.IP) bool { return ip.Equal(ip1) })
}))
if backendsHaveChanged && len(addrs) != 0 {
log.Printf("Backend address change detected, installing proxy rules for backends %v", newBackendAddrs)
if err := installIngressForwardingRuleForDNSTarget(ctx, newBackendAddrs, addrs, nfr); err != nil {
log.Fatalf("installing ingress proxy rules for DNS target %s: %v", cfg.ProxyTargetDNSName, err)
}
}
backendAddrs = newBackendAddrs
resetTimer(false)
case e := <-egressSvcsErrorChan:
log.Fatalf("egress proxy failed: %v", e)
}
}
wg.Wait()
}
// ensureTunFile checks that /dev/net/tun exists, creating it if
// missing.
func ensureTunFile(root string) error {
// Verify that /dev/net/tun exists, in some container envs it
// needs to be mknod-ed.
if _, err := os.Stat(filepath.Join(root, "dev/net")); errors.Is(err, fs.ErrNotExist) {
if err := os.MkdirAll(filepath.Join(root, "dev/net"), 0755); err != nil {
return err
}
}
if _, err := os.Stat(filepath.Join(root, "dev/net/tun")); errors.Is(err, fs.ErrNotExist) {
dev := unix.Mkdev(10, 200) // tuntap major and minor
if err := unix.Mknod(filepath.Join(root, "dev/net/tun"), 0600|unix.S_IFCHR, int(dev)); err != nil {
return err
}
}
return nil
}
func resolveDNS(ctx context.Context, name string) ([]net.IP, error) {
// TODO (irbekrm): look at using recursive.Resolver instead to resolve
// the DNS names as well as retrieve TTLs. It looks though that this
// seems to return very short TTLs (shorter than on the actual records).
ip4s, err := net.DefaultResolver.LookupIP(ctx, "ip4", name)
if err != nil {
if e, ok := err.(*net.DNSError); !(ok && e.IsNotFound) {
return nil, fmt.Errorf("error looking up IPv4 addresses: %v", err)
}
}
ip6s, err := net.DefaultResolver.LookupIP(ctx, "ip6", name)
if err != nil {
if e, ok := err.(*net.DNSError); !(ok && e.IsNotFound) {
return nil, fmt.Errorf("error looking up IPv6 addresses: %v", err)
}
}
if len(ip4s) == 0 && len(ip6s) == 0 {
return nil, fmt.Errorf("no IPv4 or IPv6 addresses found for host: %s", name)
}
return append(ip4s, ip6s...), nil
}
// contextWithExitSignalWatch watches for SIGTERM/SIGINT signals. It returns a
// context that gets cancelled when a signal is received and a cancel function
// that can be called to free the resources when the watch should be stopped.
func contextWithExitSignalWatch() (context.Context, func()) {
closeChan := make(chan string)
ctx, cancel := context.WithCancel(context.Background())
signalChan := make(chan os.Signal, 1)
signal.Notify(signalChan, syscall.SIGINT, syscall.SIGTERM)
go func() {
select {
case <-signalChan:
cancel()
case <-closeChan:
return
}
}()
f := func() {
closeChan <- "goodbye"
}
return ctx, f
}
// tailscaledConfigFilePath returns the path to the tailscaled config file that
// should be used for the current capability version. It is determined by the
// TS_EXPERIMENTAL_VERSIONED_CONFIG_DIR environment variable and looks for a
// file named cap-<capability_version>.hujson in the directory. It searches for
// the highest capability version that is less than or equal to the current
// capability version.
func tailscaledConfigFilePath() string {
dir := os.Getenv("TS_EXPERIMENTAL_VERSIONED_CONFIG_DIR")
if dir == "" {
return ""
}
fe, err := os.ReadDir(dir)
if err != nil {
log.Fatalf("error reading tailscaled config directory %q: %v", dir, err)
}
maxCompatVer := tailcfg.CapabilityVersion(-1)
for _, e := range fe {
// We don't check if type if file as in most cases this will
// come from a mounted kube Secret, where the directory contents
// will be various symlinks.
if e.Type().IsDir() {
continue
}
cv, err := kubeutils.CapVerFromFileName(e.Name())
if err != nil {
log.Printf("skipping file %q in tailscaled config directory %q: %v", e.Name(), dir, err)
continue
}
if cv > maxCompatVer && cv <= tailcfg.CurrentCapabilityVersion {
maxCompatVer = cv
}
}
if maxCompatVer == -1 {
log.Fatalf("no tailscaled config file found in %q for current capability version %q", dir, tailcfg.CurrentCapabilityVersion)
}
log.Printf("Using tailscaled config file %q for capability version %q", maxCompatVer, tailcfg.CurrentCapabilityVersion)
return path.Join(dir, kubeutils.TailscaledConfigFileNameForCap(maxCompatVer))
}