From c62b0732d23ec5f19c2006c14497dcaa86b92dd8 Mon Sep 17 00:00:00 2001 From: Irbe Krumina Date: Fri, 27 Sep 2024 17:47:27 +0100 Subject: [PATCH] cmd/k8s-operator: remove auth key once proxy has logged in (#13612) The operator creates a non-reusable auth key for each of the cluster proxies that it creates and puts in the tailscaled configfile mounted to the proxies. The proxies are always tagged, and their state is persisted in a Kubernetes Secret, so their node keys are expected to never be regenerated, so that they don't need to re-auth. Some tailnet configurations however have seen issues where the auth keys being left in the tailscaled configfile cause the proxies to end up in unauthorized state after a restart at a later point in time. Currently, we have not found a way to reproduce this issue, however this commit removes the auth key from the config once the proxy can be assumed to have logged in. If an existing, logged-in proxy is upgraded to this version, its redundant auth key will be removed from the conffile. If an existing, logged-in proxy is downgraded from this version to a previous version, it will work as before without re-issuing key as the previous code did not enforce that a key must be present. Updates tailscale/tailscale#13451 Signed-off-by: Irbe Krumina --- cmd/k8s-operator/operator_test.go | 66 ++++++++++++++++++++++++++++++ cmd/k8s-operator/sts.go | 66 ++++++++++++++++++------------ cmd/k8s-operator/testutils_test.go | 8 ++++ 3 files changed, 114 insertions(+), 26 deletions(-) diff --git a/cmd/k8s-operator/operator_test.go b/cmd/k8s-operator/operator_test.go index 8b08e9ffa..7ea8c09e1 100644 --- a/cmd/k8s-operator/operator_test.go +++ b/cmd/k8s-operator/operator_test.go @@ -1487,6 +1487,72 @@ func Test_clusterDomainFromResolverConf(t *testing.T) { }) } } +func Test_authKeyRemoval(t *testing.T) { + fc := fake.NewFakeClient() + ft := &fakeTSClient{} + zl, err := zap.NewDevelopment() + if err != nil { + t.Fatal(err) + } + + // 1. A new Service that should be exposed via Tailscale gets created, a Secret with a config that contains auth + // key is generated. + clock := tstest.NewClock(tstest.ClockOpts{}) + sr := &ServiceReconciler{ + Client: fc, + ssr: &tailscaleSTSReconciler{ + Client: fc, + tsClient: ft, + defaultTags: []string{"tag:k8s"}, + operatorNamespace: "operator-ns", + proxyImage: "tailscale/tailscale", + }, + logger: zl.Sugar(), + clock: clock, + } + + mustCreate(t, fc, &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Namespace: "default", + UID: types.UID("1234-UID"), + }, + Spec: corev1.ServiceSpec{ + ClusterIP: "10.20.30.40", + Type: corev1.ServiceTypeLoadBalancer, + LoadBalancerClass: ptr.To("tailscale"), + }, + }) + + expectReconciled(t, sr, "default", "test") + + fullName, shortName := findGenName(t, fc, "default", "test", "svc") + opts := configOpts{ + stsName: shortName, + secretName: fullName, + namespace: "default", + parentType: "svc", + hostname: "default-test", + clusterTargetIP: "10.20.30.40", + app: kubetypes.AppIngressProxy, + } + + expectEqual(t, fc, expectedSecret(t, fc, opts), nil) + expectEqual(t, fc, expectedHeadlessService(shortName, "svc"), nil) + expectEqual(t, fc, expectedSTS(t, fc, opts), removeHashAnnotation) + + // 2. Apply update to the Secret that imitates the proxy setting device_id. + s := expectedSecret(t, fc, opts) + mustUpdate(t, fc, s.Namespace, s.Name, func(s *corev1.Secret) { + mak.Set(&s.Data, "device_id", []byte("dkkdi4CNTRL")) + }) + + // 3. Config should no longer contain auth key + expectReconciled(t, sr, "default", "test") + opts.shouldRemoveAuthKey = true + opts.secretExtraData = map[string][]byte{"device_id": []byte("dkkdi4CNTRL")} + expectEqual(t, fc, expectedSecret(t, fc, opts), nil) +} func Test_externalNameService(t *testing.T) { fc := fake.NewFakeClient() diff --git a/cmd/k8s-operator/sts.go b/cmd/k8s-operator/sts.go index 1d87d6c5c..49f879230 100644 --- a/cmd/k8s-operator/sts.go +++ b/cmd/k8s-operator/sts.go @@ -821,33 +821,12 @@ func tailscaledConfig(stsC *tailscaleSTSConfig, newAuthkey string, oldSecret *co if newAuthkey != "" { conf.AuthKey = &newAuthkey - } else if oldSecret != nil { - var err error - latest := tailcfg.CapabilityVersion(-1) - latestStr := "" - for k, data := range oldSecret.Data { - // write to StringData, read from Data as StringData is write-only - if len(data) == 0 { - continue - } - v, err := tsoperator.CapVerFromFileName(k) - if err != nil { - continue - } - if v > latest { - latestStr = k - latest = v - } - } - // Allow for configs that don't contain an auth key. Perhaps - // users have some mechanisms to delete them. Auth key is - // normally not needed after the initial login. - if latestStr != "" { - conf.AuthKey, err = readAuthKey(oldSecret, latestStr) - if err != nil { - return nil, err - } + } else if shouldRetainAuthKey(oldSecret) { + key, err := authKeyFromSecret(oldSecret) + if err != nil { + return nil, fmt.Errorf("error retrieving auth key from Secret: %w", err) } + conf.AuthKey = key } capVerConfigs := make(map[tailcfg.CapabilityVersion]ipn.ConfigVAlpha) capVerConfigs[95] = *conf @@ -857,6 +836,41 @@ func tailscaledConfig(stsC *tailscaleSTSConfig, newAuthkey string, oldSecret *co return capVerConfigs, nil } +func authKeyFromSecret(s *corev1.Secret) (key *string, err error) { + latest := tailcfg.CapabilityVersion(-1) + latestStr := "" + for k, data := range s.Data { + // write to StringData, read from Data as StringData is write-only + if len(data) == 0 { + continue + } + v, err := tsoperator.CapVerFromFileName(k) + if err != nil { + continue + } + if v > latest { + latestStr = k + latest = v + } + } + // Allow for configs that don't contain an auth key. Perhaps + // users have some mechanisms to delete them. Auth key is + // normally not needed after the initial login. + if latestStr != "" { + return readAuthKey(s, latestStr) + } + return key, nil +} + +// shouldRetainAuthKey returns true if the state stored in a proxy's state Secret suggests that auth key should be +// retained (because the proxy has not yet successfully authenticated). +func shouldRetainAuthKey(s *corev1.Secret) bool { + if s == nil { + return false // nothing to retain here + } + return len(s.Data["device_id"]) == 0 // proxy has not authed yet +} + func shouldAcceptRoutes(pc *tsapi.ProxyClass) bool { return pc != nil && pc.Spec.TailscaleConfig != nil && pc.Spec.TailscaleConfig.AcceptRoutes } diff --git a/cmd/k8s-operator/testutils_test.go b/cmd/k8s-operator/testutils_test.go index 9e37d32a9..cb28b6bca 100644 --- a/cmd/k8s-operator/testutils_test.go +++ b/cmd/k8s-operator/testutils_test.go @@ -53,6 +53,8 @@ type configOpts struct { shouldEnableForwardingClusterTrafficViaIngress bool proxyClass string // configuration from the named ProxyClass should be applied to proxy resources app string + shouldRemoveAuthKey bool + secretExtraData map[string][]byte } func expectedSTS(t *testing.T, cl client.Client, opts configOpts) *appsv1.StatefulSet { @@ -365,6 +367,9 @@ func expectedSecret(t *testing.T, cl client.Client, opts configOpts) *corev1.Sec conf.AcceptRoutes = "true" } } + if opts.shouldRemoveAuthKey { + conf.AuthKey = nil + } var routes []netip.Prefix if opts.subnetRoutes != "" || opts.isExitNode { r := opts.subnetRoutes @@ -405,6 +410,9 @@ func expectedSecret(t *testing.T, cl client.Client, opts configOpts) *corev1.Sec labels["tailscale.com/parent-resource-ns"] = "" // Connector is cluster scoped } s.Labels = labels + for key, val := range opts.secretExtraData { + mak.Set(&s.Data, key, val) + } return s }