diff --git a/cmd/k8s-operator/deploy/crds/tailscale.com_recorders.yaml b/cmd/k8s-operator/deploy/crds/tailscale.com_recorders.yaml
index 0f3dcfcca..48db3ef4b 100644
--- a/cmd/k8s-operator/deploy/crds/tailscale.com_recorders.yaml
+++ b/cmd/k8s-operator/deploy/crds/tailscale.com_recorders.yaml
@@ -68,6 +68,11 @@ spec:
                 Corresponds to --ui tsrecorder flag https://tailscale.com/kb/1246/tailscale-ssh-session-recording#deploy-a-recorder-node.
                 Required if S3 storage is not set up, to ensure that recordings are accessible.
              type: boolean
+           replicas:
+             description: Replicas specifies how many instances of tsrecorder to run. Defaults to 1.
+             type: integer
+             format: int32
+             minimum: 0
            statefulSet:
              description: |-
                Configuration parameters for the Recorder's StatefulSet. The operator
@@ -1683,6 +1688,9 @@ spec:
                items:
                  type: string
                  pattern: ^tag:[a-zA-Z][a-zA-Z0-9-]*$
+         x-kubernetes-validations:
+           - rule: '!(has(self.replicas) && self.replicas > 1 && (!has(self.storage) || !has(self.storage.s3)))'
+             message: S3 storage must be used when deploying multiple Recorder replicas
          status:
            description: |-
              RecorderStatus describes the status of the recorder. This is set
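The new `replicas` field is paired with a CEL rule on the spec: more than one replica is only accepted when S3 storage is configured (the rule checks `has(self.replicas)` first, since CEL errors on access to unset optional fields). A minimal sketch of a manifest that passes the rule — the bucket, endpoint, and credentials Secret name are placeholders, not values from this diff:

```yaml
apiVersion: tailscale.com/v1alpha1
kind: Recorder
metadata:
  name: recorder
spec:
  replicas: 2
  storage:
    s3:
      endpoint: s3.us-east-1.amazonaws.com # placeholder
      bucket: example-recordings           # placeholder
      credentials:
        secret:
          name: recorder-s3-creds          # placeholder
```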
diff --git a/cmd/k8s-operator/deploy/manifests/operator.yaml b/cmd/k8s-operator/deploy/manifests/operator.yaml
index c5da367e0..2757f09e5 100644
--- a/cmd/k8s-operator/deploy/manifests/operator.yaml
+++ b/cmd/k8s-operator/deploy/manifests/operator.yaml
@@ -3348,6 +3348,11 @@ spec:
                 Corresponds to --ui tsrecorder flag https://tailscale.com/kb/1246/tailscale-ssh-session-recording#deploy-a-recorder-node.
                 Required if S3 storage is not set up, to ensure that recordings are accessible.
              type: boolean
+           replicas:
+             description: Replicas specifies how many instances of tsrecorder to run. Defaults to 1.
+             format: int32
+             minimum: 0
+             type: integer
            statefulSet:
              description: |-
                Configuration parameters for the Recorder's StatefulSet. The operator
@@ -4964,6 +4969,9 @@
                    type: string
                  type: array
                type: object
+         x-kubernetes-validations:
+           - message: S3 storage must be used when deploying multiple Recorder replicas
+             rule: '!(has(self.replicas) && self.replicas > 1 && (!has(self.storage) || !has(self.storage.s3)))'
          status:
            description: |-
              RecorderStatus describes the status of the recorder. This is set
diff --git a/cmd/k8s-operator/operator.go b/cmd/k8s-operator/operator.go
index 6b545a827..816fea566 100644
--- a/cmd/k8s-operator/operator.go
+++ b/cmd/k8s-operator/operator.go
@@ -44,10 +44,10 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/manager/signals"
 	"sigs.k8s.io/controller-runtime/pkg/predicate"
 	"sigs.k8s.io/controller-runtime/pkg/reconcile"
-	"tailscale.com/envknob"
 
 	"tailscale.com/client/local"
 	"tailscale.com/client/tailscale"
+	"tailscale.com/envknob"
 	"tailscale.com/hostinfo"
 	"tailscale.com/ipn"
 	"tailscale.com/ipn/store/kubestore"
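One detail worth noting before the controller changes: the per-replica loops below use Go 1.22's range-over-integer form, so the loop variable takes values `0` through `replicas-1` and inherits the int32 type of `replicas`. A standalone illustration:

```go
package main

import "fmt"

func main() {
	var replicas int32 = 3
	// Go 1.22+ range-over-integer: replica takes the values 0, 1, 2 and has
	// the same type as replicas (int32 here). A zero or negative count
	// simply yields no iterations.
	for replica := range replicas {
		fmt.Println(replica)
	}
}
```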
logger.Infof("ensuring Recorder is set up") tsr.Finalizers = append(tsr.Finalizers, FinalizerName) - if err := r.Update(ctx, tsr); err != nil { + if err = r.Update(ctx, tsr); err != nil { return setStatusReady(tsr, metav1.ConditionFalse, reasonRecorderCreationFailed, reasonRecorderCreationFailed) } } - if err := r.validate(ctx, tsr); err != nil { + if err = r.validate(ctx, tsr); err != nil { message := fmt.Sprintf("Recorder is invalid: %s", err) r.recorder.Eventf(tsr, corev1.EventTypeWarning, reasonRecorderInvalid, message) return setStatusReady(tsr, metav1.ConditionFalse, reasonRecorderInvalid, message) @@ -160,19 +163,29 @@ func (r *RecorderReconciler) maybeProvision(ctx context.Context, tsr *tsapi.Reco gaugeRecorderResources.Set(int64(r.recorders.Len())) r.mu.Unlock() - if err := r.ensureAuthSecretCreated(ctx, tsr); err != nil { + if err := r.ensureAuthSecretsCreated(ctx, tsr); err != nil { return fmt.Errorf("error creating secrets: %w", err) } - // State Secret is precreated so we can use the Recorder CR as its owner ref. - sec := tsrStateSecret(tsr, r.tsNamespace) - if _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, sec, func(s *corev1.Secret) { - s.ObjectMeta.Labels = sec.ObjectMeta.Labels - s.ObjectMeta.Annotations = sec.ObjectMeta.Annotations - }); err != nil { - return fmt.Errorf("error creating state Secret: %w", err) + + // State Secrets are pre-created so we can use the Recorder CR as its owner ref. + var replicas int32 = 1 + if tsr.Spec.Replicas != nil { + replicas = *tsr.Spec.Replicas + } + + for replica := range replicas { + sec := tsrStateSecret(tsr, r.tsNamespace, replica) + _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, sec, func(s *corev1.Secret) { + s.ObjectMeta.Labels = sec.ObjectMeta.Labels + s.ObjectMeta.Annotations = sec.ObjectMeta.Annotations + }) + if err != nil { + return fmt.Errorf("error creating state Secret %q: %w", sec.Name, err) + } } + sa := tsrServiceAccount(tsr, r.tsNamespace) - if _, err := createOrMaybeUpdate(ctx, r.Client, r.tsNamespace, sa, func(s *corev1.ServiceAccount) error { + _, err := createOrMaybeUpdate(ctx, r.Client, r.tsNamespace, sa, func(s *corev1.ServiceAccount) error { // Perform this check within the update function to make sure we don't // have a race condition between the previous check and the update. 
if err := saOwnedByRecorder(s, tsr); err != nil { @@ -183,54 +196,68 @@ func (r *RecorderReconciler) maybeProvision(ctx context.Context, tsr *tsapi.Reco s.ObjectMeta.Annotations = sa.ObjectMeta.Annotations return nil - }); err != nil { + }) + if err != nil { return fmt.Errorf("error creating ServiceAccount: %w", err) } + role := tsrRole(tsr, r.tsNamespace) - if _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, role, func(r *rbacv1.Role) { + _, err = createOrUpdate(ctx, r.Client, r.tsNamespace, role, func(r *rbacv1.Role) { r.ObjectMeta.Labels = role.ObjectMeta.Labels r.ObjectMeta.Annotations = role.ObjectMeta.Annotations r.Rules = role.Rules - }); err != nil { + }) + if err != nil { return fmt.Errorf("error creating Role: %w", err) } + roleBinding := tsrRoleBinding(tsr, r.tsNamespace) - if _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, roleBinding, func(r *rbacv1.RoleBinding) { + _, err = createOrUpdate(ctx, r.Client, r.tsNamespace, roleBinding, func(r *rbacv1.RoleBinding) { r.ObjectMeta.Labels = roleBinding.ObjectMeta.Labels r.ObjectMeta.Annotations = roleBinding.ObjectMeta.Annotations r.RoleRef = roleBinding.RoleRef r.Subjects = roleBinding.Subjects - }); err != nil { + }) + if err != nil { return fmt.Errorf("error creating RoleBinding: %w", err) } + ss := tsrStatefulSet(tsr, r.tsNamespace, r.loginServer) - if _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, ss, func(s *appsv1.StatefulSet) { + _, err = createOrUpdate(ctx, r.Client, r.tsNamespace, ss, func(s *appsv1.StatefulSet) { s.ObjectMeta.Labels = ss.ObjectMeta.Labels s.ObjectMeta.Annotations = ss.ObjectMeta.Annotations s.Spec = ss.Spec - }); err != nil { + }) + if err != nil { return fmt.Errorf("error creating StatefulSet: %w", err) } // ServiceAccount name may have changed, in which case we need to clean up // the previous ServiceAccount. RoleBinding will already be updated to point // to the new ServiceAccount. - if err := r.maybeCleanupServiceAccounts(ctx, tsr, sa.Name); err != nil { + if err = r.maybeCleanupServiceAccounts(ctx, tsr, sa.Name); err != nil { return fmt.Errorf("error cleaning up ServiceAccounts: %w", err) } + // If we have scaled the recorder down, we will have dangling state secrets + // that we need to clean up. + if err = r.maybeCleanupSecrets(ctx, tsr); err != nil { + return fmt.Errorf("error cleaning up Secrets: %w", err) + } + var devices []tsapi.RecorderTailnetDevice + for replica := range replicas { + dev, ok, err := r.getDeviceInfo(ctx, tsr.Name, replica) + switch { + case err != nil: + return fmt.Errorf("failed to get device info: %w", err) + case !ok: + logger.Debugf("no Tailscale hostname known yet, waiting for Recorder pod to finish auth") + continue + } - device, ok, err := r.getDeviceInfo(ctx, tsr.Name) - if err != nil { - return fmt.Errorf("failed to get device info: %w", err) + devices = append(devices, dev) } - if !ok { - logger.Debugf("no Tailscale hostname known yet, waiting for Recorder pod to finish auth") - return nil - } - - devices = append(devices, device) tsr.Status.Devices = devices @@ -257,22 +284,89 @@ func saOwnedByRecorder(sa *corev1.ServiceAccount, tsr *tsapi.Recorder) error { func (r *RecorderReconciler) maybeCleanupServiceAccounts(ctx context.Context, tsr *tsapi.Recorder, currentName string) error { logger := r.logger(tsr.Name) - // List all ServiceAccounts owned by this Recorder. 
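The per-replica loop above relies on the StatefulSet naming contract: pod `<name>-<ordinal>` keeps its state in the Secret of the same name, with a parallel `<name>-auth-<ordinal>` Secret holding its auth key. A standalone sketch of the mapping (names hypothetical):

```go
package main

import "fmt"

func main() {
	const name = "rec"
	var replicas int32 = 3
	// For each StatefulSet ordinal, print the pod name and the two Secrets
	// the operator pre-creates for it.
	for replica := range replicas {
		fmt.Printf("pod %s-%d -> state Secret %s-%d, auth Secret %s-auth-%d\n",
			name, replica, name, replica, name, replica)
	}
}
```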
@@ -257,22 +284,89 @@ func saOwnedByRecorder(sa *corev1.ServiceAccount, tsr *tsapi.Recorder) error {
 
 func (r *RecorderReconciler) maybeCleanupServiceAccounts(ctx context.Context, tsr *tsapi.Recorder, currentName string) error {
 	logger := r.logger(tsr.Name)
 
-	// List all ServiceAccounts owned by this Recorder.
+	options := []client.ListOption{
+		client.InNamespace(r.tsNamespace),
+		client.MatchingLabels(tsrLabels("recorder", tsr.Name, nil)),
+	}
+
 	sas := &corev1.ServiceAccountList{}
-	if err := r.List(ctx, sas, client.InNamespace(r.tsNamespace), client.MatchingLabels(labels("recorder", tsr.Name, nil))); err != nil {
+	if err := r.List(ctx, sas, options...); err != nil {
 		return fmt.Errorf("error listing ServiceAccounts for cleanup: %w", err)
 	}
-	for _, sa := range sas.Items {
-		if sa.Name == currentName {
-			continue
-		}
-		if err := r.Delete(ctx, &sa); err != nil {
-			if apierrors.IsNotFound(err) {
-				logger.Debugf("ServiceAccount %s not found, likely already deleted", sa.Name)
-			} else {
-				return fmt.Errorf("error deleting ServiceAccount %s: %w", sa.Name, err)
-			}
-		}
-	}
+
+	for _, serviceAccount := range sas.Items {
+		if serviceAccount.Name == currentName {
+			continue
+		}
+
+		err := r.Delete(ctx, &serviceAccount)
+		switch {
+		case apierrors.IsNotFound(err):
+			logger.Debugf("ServiceAccount %s not found, likely already deleted", serviceAccount.Name)
+			continue
+		case err != nil:
+			return fmt.Errorf("error deleting ServiceAccount %s: %w", serviceAccount.Name, err)
+		}
+	}
+
+	return nil
+}
+
+func (r *RecorderReconciler) maybeCleanupSecrets(ctx context.Context, tsr *tsapi.Recorder) error {
+	options := []client.ListOption{
+		client.InNamespace(r.tsNamespace),
+		client.MatchingLabels(tsrLabels("recorder", tsr.Name, nil)),
+	}
+
+	secrets := &corev1.SecretList{}
+	if err := r.List(ctx, secrets, options...); err != nil {
+		return fmt.Errorf("error listing Secrets for cleanup: %w", err)
+	}
+
+	// Compute the largest ordinal suffix we expect, then go through the
+	// Secrets owned by this Recorder and remove any whose ordinal is at or
+	// above the expected replica count.
+	var replicas int32 = 1
+	if tsr.Spec.Replicas != nil {
+		replicas = *tsr.Spec.Replicas
+	}
+
+	for _, secret := range secrets.Items {
+		parts := strings.Split(secret.Name, "-")
+		if len(parts) == 0 {
+			continue
+		}
+
+		ordinal, err := strconv.ParseUint(parts[len(parts)-1], 10, 32)
+		if err != nil {
+			return fmt.Errorf("error parsing secret name %q: %w", secret.Name, err)
+		}
+
+		if int32(ordinal) < replicas {
+			continue
+		}
+
+		devicePrefs, ok, err := getDevicePrefs(&secret)
+		if err != nil {
+			return err
+		}
+
+		if ok {
+			var errResp *tailscale.ErrResponse
+
+			r.log.Debugf("deleting device %s", devicePrefs.Config.NodeID)
+			err = r.tsClient.DeleteDevice(ctx, string(devicePrefs.Config.NodeID))
+			switch {
+			case errors.As(err, &errResp) && errResp.Status == http.StatusNotFound:
+				// The device has possibly already been deleted in the admin
+				// console, so ignore it and move on to removing the Secret.
+			case err != nil:
+				return err
+			}
+		}
+
+		if err = r.Delete(ctx, &secret); err != nil {
+			return err
+		}
 	}
 
 	return nil
 }
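The scale-down sweep keys off the trailing ordinal in each Secret name. A self-contained sketch of the keep/delete decision, assuming a Recorder named `rec` (hypothetical) scaled from three replicas down to two:

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

func main() {
	const replicas int32 = 2 // desired count after scale-down
	names := []string{"rec-0", "rec-1", "rec-2", "rec-auth-0", "rec-auth-1", "rec-auth-2"}
	for _, name := range names {
		// Parse the ordinal suffix after the final "-".
		parts := strings.Split(name, "-")
		ordinal, err := strconv.ParseUint(parts[len(parts)-1], 10, 32)
		if err != nil {
			continue // not an ordinal-suffixed name
		}
		if int32(ordinal) < replicas {
			fmt.Println("keep  ", name)
		} else {
			fmt.Println("delete", name) // rec-2 and rec-auth-2
		}
	}
}
```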
- logger.Debugf("creating authkey for new Recorder") tags := tsr.Spec.Tags if len(tags) == 0 { tags = tsapi.Tags{"tag:k8s"} } - authKey, err := newAuthKey(ctx, r.tsClient, tags.Stringify()) - if err != nil { - return err - } - logger.Debug("creating a new Secret for the Recorder") - if err := r.Create(ctx, tsrAuthSecret(tsr, r.tsNamespace, authKey)); err != nil { - return err + logger := r.logger(tsr.Name) + + for replica := range replicas { + key := types.NamespacedName{ + Namespace: r.tsNamespace, + Name: fmt.Sprintf("%s-auth-%d", tsr.Name, replica), + } + + err := r.Get(ctx, key, &corev1.Secret{}) + switch { + case err == nil: + logger.Debugf("auth Secret %q already exists", key.Name) + continue + case !apierrors.IsNotFound(err): + return fmt.Errorf("failed to get Secret %q: %w", key.Name, err) + } + + authKey, err := newAuthKey(ctx, r.tsClient, tags.Stringify()) + if err != nil { + return err + } + + if err = r.Create(ctx, tsrAuthSecret(tsr, r.tsNamespace, authKey, replica)); err != nil { + return err + } } return nil @@ -361,6 +471,10 @@ func (r *RecorderReconciler) validate(ctx context.Context, tsr *tsapi.Recorder) return errors.New("must either enable UI or use S3 storage to ensure recordings are accessible") } + if tsr.Spec.Replicas != nil && *tsr.Spec.Replicas > 1 && tsr.Spec.Storage.S3 == nil { + return errors.New("must use S3 storage when using multiple replicas to ensure recordings are accessible") + } + // Check any custom ServiceAccount config doesn't conflict with pre-existing // ServiceAccounts. This check is performed once during validation to ensure // errors are raised early, but also again during any Updates to prevent a race. @@ -394,11 +508,11 @@ func (r *RecorderReconciler) validate(ctx context.Context, tsr *tsapi.Recorder) return nil } -func (r *RecorderReconciler) getStateSecret(ctx context.Context, tsrName string) (*corev1.Secret, error) { +func (r *RecorderReconciler) getStateSecret(ctx context.Context, tsrName string, replica int32) (*corev1.Secret, error) { secret := &corev1.Secret{ ObjectMeta: metav1.ObjectMeta{ Namespace: r.tsNamespace, - Name: fmt.Sprintf("%s-0", tsrName), + Name: fmt.Sprintf("%s-%d", tsrName, replica), }, } if err := r.Get(ctx, client.ObjectKeyFromObject(secret), secret); err != nil { @@ -412,8 +526,8 @@ func (r *RecorderReconciler) getStateSecret(ctx context.Context, tsrName string) return secret, nil } -func (r *RecorderReconciler) getDevicePrefs(ctx context.Context, tsrName string) (prefs prefs, ok bool, err error) { - secret, err := r.getStateSecret(ctx, tsrName) +func (r *RecorderReconciler) getDevicePrefs(ctx context.Context, tsrName string, replica int32) (prefs prefs, ok bool, err error) { + secret, err := r.getStateSecret(ctx, tsrName, replica) if err != nil || secret == nil { return prefs, false, err } @@ -441,8 +555,8 @@ func getDevicePrefs(secret *corev1.Secret) (prefs prefs, ok bool, err error) { return prefs, ok, nil } -func (r *RecorderReconciler) getDeviceInfo(ctx context.Context, tsrName string) (d tsapi.RecorderTailnetDevice, ok bool, err error) { - secret, err := r.getStateSecret(ctx, tsrName) +func (r *RecorderReconciler) getDeviceInfo(ctx context.Context, tsrName string, replica int32) (d tsapi.RecorderTailnetDevice, ok bool, err error) { + secret, err := r.getStateSecret(ctx, tsrName, replica) if err != nil || secret == nil { return tsapi.RecorderTailnetDevice{}, false, err } diff --git a/cmd/k8s-operator/tsrecorder_specs.go b/cmd/k8s-operator/tsrecorder_specs.go index 83d7439db..b4a10f296 100644 --- 
diff --git a/cmd/k8s-operator/tsrecorder_specs.go b/cmd/k8s-operator/tsrecorder_specs.go
index 83d7439db..b4a10f296 100644
--- a/cmd/k8s-operator/tsrecorder_specs.go
+++ b/cmd/k8s-operator/tsrecorder_specs.go
@@ -12,30 +12,36 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	rbacv1 "k8s.io/api/rbac/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
 	tsapi "tailscale.com/k8s-operator/apis/v1alpha1"
 	"tailscale.com/types/ptr"
 	"tailscale.com/version"
 )
 
 func tsrStatefulSet(tsr *tsapi.Recorder, namespace string, loginServer string) *appsv1.StatefulSet {
-	return &appsv1.StatefulSet{
+	var replicas int32 = 1
+	if tsr.Spec.Replicas != nil {
+		replicas = *tsr.Spec.Replicas
+	}
+
+	ss := &appsv1.StatefulSet{
 		ObjectMeta: metav1.ObjectMeta{
 			Name:            tsr.Name,
 			Namespace:       namespace,
-			Labels:          labels("recorder", tsr.Name, tsr.Spec.StatefulSet.Labels),
+			Labels:          tsrLabels("recorder", tsr.Name, tsr.Spec.StatefulSet.Labels),
 			OwnerReferences: tsrOwnerReference(tsr),
 			Annotations:     tsr.Spec.StatefulSet.Annotations,
 		},
 		Spec: appsv1.StatefulSetSpec{
-			Replicas: ptr.To[int32](1),
+			Replicas: ptr.To(replicas),
 			Selector: &metav1.LabelSelector{
-				MatchLabels: labels("recorder", tsr.Name, tsr.Spec.StatefulSet.Pod.Labels),
+				MatchLabels: tsrLabels("recorder", tsr.Name, tsr.Spec.StatefulSet.Pod.Labels),
 			},
 			Template: corev1.PodTemplateSpec{
 				ObjectMeta: metav1.ObjectMeta{
 					Name:        tsr.Name,
 					Namespace:   namespace,
-					Labels:      labels("recorder", tsr.Name, tsr.Spec.StatefulSet.Pod.Labels),
+					Labels:      tsrLabels("recorder", tsr.Name, tsr.Spec.StatefulSet.Pod.Labels),
 					Annotations: tsr.Spec.StatefulSet.Pod.Annotations,
 				},
 				Spec: corev1.PodSpec{
@@ -59,7 +65,7 @@ func tsrStatefulSet(tsr *tsapi.Recorder, namespace string, loginServer string) *
 						ImagePullPolicy: tsr.Spec.StatefulSet.Pod.Container.ImagePullPolicy,
 						Resources:       tsr.Spec.StatefulSet.Pod.Container.Resources,
 						SecurityContext: tsr.Spec.StatefulSet.Pod.Container.SecurityContext,
-						Env:             env(tsr, loginServer),
+						Env:             tsrEnv(tsr, loginServer),
 						EnvFrom: func() []corev1.EnvFromSource {
 							if tsr.Spec.Storage.S3 == nil || tsr.Spec.Storage.S3.Credentials.Secret.Name == "" {
 								return nil
@@ -95,6 +101,28 @@ func tsrStatefulSet(tsr *tsapi.Recorder, namespace string, loginServer string) *
 			},
 		},
 	}
+
+	for replica := range replicas {
+		volumeName := fmt.Sprintf("authkey-%d", replica)
+
+		ss.Spec.Template.Spec.Containers[0].VolumeMounts = append(ss.Spec.Template.Spec.Containers[0].VolumeMounts, corev1.VolumeMount{
+			Name:      volumeName,
+			ReadOnly:  true,
+			MountPath: fmt.Sprintf("/etc/tailscaled/%s-%d", ss.Name, replica),
+		})
+
+		ss.Spec.Template.Spec.Volumes = append(ss.Spec.Template.Spec.Volumes, corev1.Volume{
+			Name: volumeName,
+			VolumeSource: corev1.VolumeSource{
+				Secret: &corev1.SecretVolumeSource{
+					SecretName: fmt.Sprintf("%s-auth-%d", tsr.Name, replica),
+					Items:      []corev1.KeyToPath{{Key: "authkey", Path: "authkey"}},
+				},
+			},
+		})
+	}
+
+	return ss
 }
 
@@ -102,7 +130,7 @@ func tsrServiceAccount(tsr *tsapi.Recorder, namespace string) *corev1.ServiceAcc
 		ObjectMeta: metav1.ObjectMeta{
 			Name:            tsrServiceAccountName(tsr),
 			Namespace:       namespace,
-			Labels:          labels("recorder", tsr.Name, nil),
+			Labels:          tsrLabels("recorder", tsr.Name, nil),
 			OwnerReferences: tsrOwnerReference(tsr),
 			Annotations:     tsr.Spec.StatefulSet.Pod.ServiceAccount.Annotations,
 		},
@@ -120,11 +148,24 @@ func tsrServiceAccountName(tsr *tsapi.Recorder) string {
 }
 
 func tsrRole(tsr *tsapi.Recorder, namespace string) *rbacv1.Role {
+	var replicas int32 = 1
+	if tsr.Spec.Replicas != nil {
+		replicas = *tsr.Spec.Replicas
+	}
+
+	resourceNames := make([]string, 0)
+	for replica := range replicas {
+		resourceNames = append(resourceNames,
+			fmt.Sprintf("%s-%d", tsr.Name, replica),      // State secret.
+			fmt.Sprintf("%s-auth-%d", tsr.Name, replica), // Auth key secret.
+		)
+	}
+
 	return &rbacv1.Role{
 		ObjectMeta: metav1.ObjectMeta{
 			Name:            tsr.Name,
 			Namespace:       namespace,
-			Labels:          labels("recorder", tsr.Name, nil),
+			Labels:          tsrLabels("recorder", tsr.Name, nil),
 			OwnerReferences: tsrOwnerReference(tsr),
 		},
 		Rules: []rbacv1.PolicyRule{
@@ -136,10 +177,7 @@ func tsrRole(tsr *tsapi.Recorder, namespace string) *rbacv1.Role {
 					"patch",
 					"update",
 				},
-				ResourceNames: []string{
-					tsr.Name,                      // Contains the auth key.
-					fmt.Sprintf("%s-0", tsr.Name), // Contains the node state.
-				},
+				ResourceNames: resourceNames,
 			},
 			{
 				APIGroups: []string{""},
@@ -159,7 +197,7 @@ func tsrRoleBinding(tsr *tsapi.Recorder, namespace string) *rbacv1.RoleBinding {
 		ObjectMeta: metav1.ObjectMeta{
 			Name:            tsr.Name,
 			Namespace:       namespace,
-			Labels:          labels("recorder", tsr.Name, nil),
+			Labels:          tsrLabels("recorder", tsr.Name, nil),
 			OwnerReferences: tsrOwnerReference(tsr),
 		},
 		Subjects: []rbacv1.Subject{
@@ -176,12 +214,12 @@ func tsrRoleBinding(tsr *tsapi.Recorder, namespace string) *rbacv1.RoleBinding {
 	}
 }
 
-func tsrAuthSecret(tsr *tsapi.Recorder, namespace string, authKey string) *corev1.Secret {
+func tsrAuthSecret(tsr *tsapi.Recorder, namespace string, authKey string, replica int32) *corev1.Secret {
 	return &corev1.Secret{
 		ObjectMeta: metav1.ObjectMeta{
 			Namespace:       namespace,
-			Name:            tsr.Name,
-			Labels:          labels("recorder", tsr.Name, nil),
+			Name:            fmt.Sprintf("%s-auth-%d", tsr.Name, replica),
+			Labels:          tsrLabels("recorder", tsr.Name, nil),
 			OwnerReferences: tsrOwnerReference(tsr),
 		},
 		StringData: map[string]string{
@@ -190,30 +228,19 @@ func tsrAuthSecret(tsr *tsapi.Recorder, namespace string, authKey string) *corev
 	}
 }
 
-func tsrStateSecret(tsr *tsapi.Recorder, namespace string) *corev1.Secret {
+func tsrStateSecret(tsr *tsapi.Recorder, namespace string, replica int32) *corev1.Secret {
 	return &corev1.Secret{
 		ObjectMeta: metav1.ObjectMeta{
-			Name:            fmt.Sprintf("%s-0", tsr.Name),
+			Name:            fmt.Sprintf("%s-%d", tsr.Name, replica),
 			Namespace:       namespace,
-			Labels:          labels("recorder", tsr.Name, nil),
+			Labels:          tsrLabels("recorder", tsr.Name, nil),
 			OwnerReferences: tsrOwnerReference(tsr),
 		},
 	}
 }
 
-func env(tsr *tsapi.Recorder, loginServer string) []corev1.EnvVar {
+func tsrEnv(tsr *tsapi.Recorder, loginServer string) []corev1.EnvVar {
 	envs := []corev1.EnvVar{
-		{
-			Name: "TS_AUTHKEY",
-			ValueFrom: &corev1.EnvVarSource{
-				SecretKeyRef: &corev1.SecretKeySelector{
-					LocalObjectReference: corev1.LocalObjectReference{
-						Name: tsr.Name,
-					},
-					Key: "authkey",
-				},
-			},
-		},
 		{
 			Name: "POD_NAME",
 			ValueFrom: &corev1.EnvVarSource{
@@ -231,6 +258,10 @@ func env(tsr *tsapi.Recorder, loginServer string) []corev1.EnvVar {
 			},
 		},
+		{
+			Name:  "TS_AUTHKEY_FILE",
+			Value: "/etc/tailscaled/$(POD_NAME)/authkey",
+		},
 		{
 			Name:  "TS_STATE",
 			Value: "kube:$(POD_NAME)",
@@ -280,7 +311,7 @@ func env(tsr *tsapi.Recorder, loginServer string) []corev1.EnvVar {
 	return envs
 }
 
-func labels(app, instance string, customLabels map[string]string) map[string]string {
+func tsrLabels(app, instance string, customLabels map[string]string) map[string]string {
 	labels := make(map[string]string, len(customLabels)+3)
 	for k, v := range customLabels {
 		labels[k] = v
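In the StatefulSet changes above, every replica's auth Secret is mounted into every pod, and each pod picks out its own key through Kubernetes `$(POD_NAME)` env-var expansion (POD_NAME is defined earlier in the env list, so the reference resolves). Roughly what the rendered pod template looks like for a Recorder named `test` — an abridged sketch with two replicas shown, not actual generator output:

```yaml
env:
  - name: POD_NAME
    valueFrom:
      fieldRef:
        fieldPath: metadata.name
  - name: TS_AUTHKEY_FILE
    value: /etc/tailscaled/$(POD_NAME)/authkey  # pod test-1 reads /etc/tailscaled/test-1/authkey
volumeMounts:
  - name: authkey-0
    mountPath: /etc/tailscaled/test-0
    readOnly: true
  - name: authkey-1
    mountPath: /etc/tailscaled/test-1
    readOnly: true
volumes:
  - name: authkey-0
    secret:
      secretName: test-auth-0
      items: [{key: authkey, path: authkey}]
  - name: authkey-1
    secret:
      secretName: test-auth-1
      items: [{key: authkey, path: authkey}]
```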
diff --git a/cmd/k8s-operator/tsrecorder_specs_test.go b/cmd/k8s-operator/tsrecorder_specs_test.go
index 49332d09b..0d78129fc 100644
--- a/cmd/k8s-operator/tsrecorder_specs_test.go
+++ b/cmd/k8s-operator/tsrecorder_specs_test.go
@@ -12,6 +12,7 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
 	tsapi "tailscale.com/k8s-operator/apis/v1alpha1"
 	"tailscale.com/types/ptr"
 )
 
@@ -23,6 +24,7 @@ func TestRecorderSpecs(t *testing.T) {
 			Name: "test",
 		},
 		Spec: tsapi.RecorderSpec{
+			Replicas: ptr.To[int32](3),
 			StatefulSet: tsapi.RecorderStatefulSet{
 				Labels: map[string]string{
 					"ss-label-key": "ss-label-value",
@@ -101,10 +103,10 @@ func TestRecorderSpecs(t *testing.T) {
 		}
 
 		// Pod-level.
-		if diff := cmp.Diff(ss.Labels, labels("recorder", "test", tsr.Spec.StatefulSet.Labels)); diff != "" {
+		if diff := cmp.Diff(ss.Labels, tsrLabels("recorder", "test", tsr.Spec.StatefulSet.Labels)); diff != "" {
 			t.Errorf("(-got +want):\n%s", diff)
 		}
-		if diff := cmp.Diff(ss.Spec.Template.Labels, labels("recorder", "test", tsr.Spec.StatefulSet.Pod.Labels)); diff != "" {
+		if diff := cmp.Diff(ss.Spec.Template.Labels, tsrLabels("recorder", "test", tsr.Spec.StatefulSet.Pod.Labels)); diff != "" {
 			t.Errorf("(-got +want):\n%s", diff)
 		}
 		if diff := cmp.Diff(ss.Spec.Template.Spec.Affinity, tsr.Spec.StatefulSet.Pod.Affinity); diff != "" {
@@ -124,7 +126,7 @@ func TestRecorderSpecs(t *testing.T) {
 		}
 
 		// Container-level.
-		if diff := cmp.Diff(ss.Spec.Template.Spec.Containers[0].Env, env(tsr, tsLoginServer)); diff != "" {
+		if diff := cmp.Diff(ss.Spec.Template.Spec.Containers[0].Env, tsrEnv(tsr, tsLoginServer)); diff != "" {
 			t.Errorf("(-got +want):\n%s", diff)
 		}
 		if diff := cmp.Diff(ss.Spec.Template.Spec.Containers[0].Image, tsr.Spec.StatefulSet.Pod.Container.Image); diff != "" {
@@ -139,5 +141,17 @@ func TestRecorderSpecs(t *testing.T) {
 		if diff := cmp.Diff(ss.Spec.Template.Spec.Containers[0].Resources, tsr.Spec.StatefulSet.Pod.Container.Resources); diff != "" {
 			t.Errorf("(-got +want):\n%s", diff)
 		}
+
+		if *ss.Spec.Replicas != *tsr.Spec.Replicas {
+			t.Errorf("expected %d replicas, got %d", *tsr.Spec.Replicas, *ss.Spec.Replicas)
+		}
+
+		if len(ss.Spec.Template.Spec.Volumes) != int(*tsr.Spec.Replicas)+1 {
+			t.Errorf("expected %d volumes, got %d", *tsr.Spec.Replicas+1, len(ss.Spec.Template.Spec.Volumes))
+		}
+
+		if len(ss.Spec.Template.Spec.Containers[0].VolumeMounts) != int(*tsr.Spec.Replicas)+1 {
+			t.Errorf("expected %d volume mounts, got %d", *tsr.Spec.Replicas+1, len(ss.Spec.Template.Spec.Containers[0].VolumeMounts))
+		}
 	})
 }
diff --git a/cmd/k8s-operator/tsrecorder_test.go b/cmd/k8s-operator/tsrecorder_test.go
index 184af2344..f7ff797b1 100644
--- a/cmd/k8s-operator/tsrecorder_test.go
+++ b/cmd/k8s-operator/tsrecorder_test.go
@@ -8,6 +8,7 @@ package main
 import (
 	"context"
 	"encoding/json"
+	"fmt"
 	"strings"
 	"testing"
 
@@ -20,9 +21,11 @@ import (
 	"k8s.io/client-go/tools/record"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+
 	tsoperator "tailscale.com/k8s-operator"
 	tsapi "tailscale.com/k8s-operator/apis/v1alpha1"
 	"tailscale.com/tstest"
+	"tailscale.com/types/ptr"
 )
 
 const (
@@ -36,6 +39,9 @@ func TestRecorder(t *testing.T) {
 			Name:       "test",
 			Finalizers: []string{"tailscale.com/finalizer"},
 		},
+		Spec: tsapi.RecorderSpec{
+			Replicas: ptr.To[int32](3),
+		},
 	}
 
 	fc := fake.NewClientBuilder().
@@ -80,6 +86,15 @@ func TestRecorder(t *testing.T) {
 	})
 	expectReconciled(t, reconciler, "", tsr.Name)
 
+	expectedEvent = "Warning RecorderInvalid Recorder is invalid: must use S3 storage when using multiple replicas to ensure recordings are accessible"
+	expectEvents(t, fr, []string{expectedEvent})
+
+	tsr.Spec.Storage.S3 = &tsapi.S3{}
+	mustUpdate(t, fc, "", "test", func(t *tsapi.Recorder) {
+		t.Spec = tsr.Spec
+	})
+	expectReconciled(t, reconciler, "", tsr.Name)
+
 	// Only check part of this error message, because it's defined in an
 	// external package and may change.
 	if err := fc.Get(context.Background(), client.ObjectKey{
@@ -180,33 +195,47 @@ func TestRecorder(t *testing.T) {
 	})
 
 	t.Run("populate_node_info_in_state_secret_and_see_it_appear_in_status", func(t *testing.T) {
-		bytes, err := json.Marshal(map[string]any{
-			"Config": map[string]any{
-				"NodeID": "nodeid-123",
-				"UserProfile": map[string]any{
-					"LoginName": "test-0.example.ts.net",
-				},
-			},
-		})
-		if err != nil {
-			t.Fatal(err)
-		}
-
 		const key = "profile-abc"
-		mustUpdate(t, fc, tsNamespace, "test-0", func(s *corev1.Secret) {
-			s.Data = map[string][]byte{
-				currentProfileKey: []byte(key),
-				key:               bytes,
-			}
-		})
+
+		for replica := range *tsr.Spec.Replicas {
+			bytes, err := json.Marshal(map[string]any{
+				"Config": map[string]any{
+					"NodeID": fmt.Sprintf("node-%d", replica),
+					"UserProfile": map[string]any{
+						"LoginName": fmt.Sprintf("test-%d.example.ts.net", replica),
+					},
+				},
+			})
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			name := fmt.Sprintf("%s-%d", "test", replica)
+			mustUpdate(t, fc, tsNamespace, name, func(s *corev1.Secret) {
+				s.Data = map[string][]byte{
+					currentProfileKey: []byte(key),
+					key:               bytes,
+				}
+			})
+		}
 
 		expectReconciled(t, reconciler, "", tsr.Name)
 
 		tsr.Status.Devices = []tsapi.RecorderTailnetDevice{
 			{
-				Hostname:   "hostname-nodeid-123",
+				Hostname:   "hostname-node-0",
 				TailnetIPs: []string{"1.2.3.4", "::1"},
 				URL:        "https://test-0.example.ts.net",
 			},
+			{
+				Hostname:   "hostname-node-1",
+				TailnetIPs: []string{"1.2.3.4", "::1"},
+				URL:        "https://test-1.example.ts.net",
+			},
+			{
+				Hostname:   "hostname-node-2",
+				TailnetIPs: []string{"1.2.3.4", "::1"},
+				URL:        "https://test-2.example.ts.net",
+			},
 		}
 
 		expectEqual(t, fc, tsr)
 	})
@@ -222,7 +251,7 @@ func TestRecorder(t *testing.T) {
 	if expected := 0; reconciler.recorders.Len() != expected {
 		t.Fatalf("expected %d recorders, got %d", expected, reconciler.recorders.Len())
 	}
-	if diff := cmp.Diff(tsClient.deleted, []string{"nodeid-123"}); diff != "" {
+	if diff := cmp.Diff(tsClient.deleted, []string{"node-0", "node-1", "node-2"}); diff != "" {
 		t.Fatalf("unexpected deleted devices (-got +want):\n%s", diff)
 	}
 	// The fake client does not clean up objects whose owner has been
@@ -233,26 +262,38 @@ func TestRecorder(t *testing.T) {
 
 func expectRecorderResources(t *testing.T, fc client.WithWatch, tsr *tsapi.Recorder, shouldExist bool) {
 	t.Helper()
 
-	auth := tsrAuthSecret(tsr, tsNamespace, "secret-authkey")
-	state := tsrStateSecret(tsr, tsNamespace)
+	var replicas int32 = 1
+	if tsr.Spec.Replicas != nil {
+		replicas = *tsr.Spec.Replicas
+	}
+
 	role := tsrRole(tsr, tsNamespace)
 	roleBinding := tsrRoleBinding(tsr, tsNamespace)
 	serviceAccount := tsrServiceAccount(tsr, tsNamespace)
 	statefulSet := tsrStatefulSet(tsr, tsNamespace, tsLoginServer)
 
 	if shouldExist {
-		expectEqual(t, fc, auth)
-		expectEqual(t, fc, state)
 		expectEqual(t, fc, role)
 		expectEqual(t, fc, roleBinding)
 		expectEqual(t, fc, serviceAccount)
 		expectEqual(t, fc, statefulSet, removeResourceReqs)
 	} else {
-		expectMissing[corev1.Secret](t, fc, auth.Namespace, auth.Name)
-		expectMissing[corev1.Secret](t, fc, state.Namespace, state.Name)
 		expectMissing[rbacv1.Role](t, fc, role.Namespace, role.Name)
 		expectMissing[rbacv1.RoleBinding](t, fc, roleBinding.Namespace, roleBinding.Name)
 		expectMissing[corev1.ServiceAccount](t, fc, serviceAccount.Namespace, serviceAccount.Name)
 		expectMissing[appsv1.StatefulSet](t, fc, statefulSet.Namespace, statefulSet.Name)
 	}
+
+	for replica := range replicas {
+		auth := tsrAuthSecret(tsr, tsNamespace, "secret-authkey", replica)
+		state := tsrStateSecret(tsr, tsNamespace, replica)
+
+		if shouldExist {
+			expectEqual(t, fc, auth)
+			expectEqual(t, fc, state)
+		} else {
+			expectMissing[corev1.Secret](t, fc, auth.Namespace, auth.Name)
+			expectMissing[corev1.Secret](t, fc, state.Namespace, state.Name)
+		}
+	}
 }
diff --git a/k8s-operator/api.md b/k8s-operator/api.md
index 979d199cb..3a4e692d9 100644
--- a/k8s-operator/api.md
+++ b/k8s-operator/api.md
@@ -887,7 +887,7 @@ _Appears in:_
 
 
-
+RecorderSpec describes a tsrecorder instance to be deployed in the cluster.
 
 
@@ -900,6 +900,7 @@ _Appears in:_
 | `tags` _[Tags](#tags)_ | Tags that the Tailscale device will be tagged with. Defaults to [tag:k8s].<br />If you specify custom tags here, make sure you also make the operator<br />an owner of these tags.<br />See https://tailscale.com/kb/1236/kubernetes-operator/#setting-up-the-kubernetes-operator.<br />Tags cannot be changed once a Recorder node has been created.<br />Tag values must be in form ^tag:[a-zA-Z][a-zA-Z0-9-]*$. |  | Pattern: `^tag:[a-zA-Z][a-zA-Z0-9-]*$` <br />Type: string <br /> |
 | `enableUI` _boolean_ | Set to true to enable the Recorder UI. The UI lists and plays recorded sessions.<br />The UI will be served at :443. Defaults to false.<br />Corresponds to --ui tsrecorder flag https://tailscale.com/kb/1246/tailscale-ssh-session-recording#deploy-a-recorder-node.<br />Required if S3 storage is not set up, to ensure that recordings are accessible. |  |  |
 | `storage` _[Storage](#storage)_ | Configure where to store session recordings. By default, recordings will<br />be stored in a local ephemeral volume, and will not be persisted past the<br />lifetime of a specific pod. |  |  |
+| `replicas` _integer_ | Replicas specifies how many instances of tsrecorder to run. Defaults to 1. |  | Minimum: 0 <br /> |
 
 
 #### RecorderStatefulSet
diff --git a/k8s-operator/apis/v1alpha1/types_recorder.go b/k8s-operator/apis/v1alpha1/types_recorder.go
index 16a610b26..67cffbf09 100644
--- a/k8s-operator/apis/v1alpha1/types_recorder.go
+++ b/k8s-operator/apis/v1alpha1/types_recorder.go
@@ -44,6 +44,8 @@ type RecorderList struct {
 	Items []Recorder `json:"items"`
 }
 
+// RecorderSpec describes a tsrecorder instance to be deployed in the cluster.
+// +kubebuilder:validation:XValidation:rule="!(has(self.replicas) && self.replicas > 1 && (!has(self.storage) || !has(self.storage.s3)))",message="S3 storage must be used when deploying multiple Recorder replicas"
 type RecorderSpec struct {
 	// Configuration parameters for the Recorder's StatefulSet. The operator
 	// deploys a StatefulSet for each Recorder resource.
@@ -74,6 +76,11 @@ type RecorderSpec struct {
 	// lifetime of a specific pod.
 	// +optional
 	Storage Storage `json:"storage,omitempty"`
+
+	// Replicas specifies how many instances of tsrecorder to run. Defaults to 1.
+	// +optional
+	// +kubebuilder:validation:Minimum=0
+	Replicas *int32 `json:"replicas,omitzero"`
 }
 
 type RecorderStatefulSet struct {
diff --git a/k8s-operator/apis/v1alpha1/zz_generated.deepcopy.go b/k8s-operator/apis/v1alpha1/zz_generated.deepcopy.go
index 7492f1e54..ff0f3f6ac 100644
--- a/k8s-operator/apis/v1alpha1/zz_generated.deepcopy.go
+++ b/k8s-operator/apis/v1alpha1/zz_generated.deepcopy.go
@@ -1068,6 +1068,11 @@ func (in *RecorderSpec) DeepCopyInto(out *RecorderSpec) {
 		copy(*out, *in)
 	}
 	in.Storage.DeepCopyInto(&out.Storage)
+	if in.Replicas != nil {
+		in, out := &in.Replicas, &out.Replicas
+		*out = new(int32)
+		**out = **in
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RecorderSpec.
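Tying the API change together: clients set the new field through a pointer, and anything above one replica needs S3 storage per the validation marker above. A sketch using the typed API (S3 sub-fields elided; all identifiers are from this diff):

```go
package main

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	tsapi "tailscale.com/k8s-operator/apis/v1alpha1"
	"tailscale.com/types/ptr"
)

// newRecorder constructs a multi-replica Recorder. Leaving Replicas nil
// means the operator defaults to a single replica.
func newRecorder() *tsapi.Recorder {
	return &tsapi.Recorder{
		ObjectMeta: metav1.ObjectMeta{Name: "recorder"},
		Spec: tsapi.RecorderSpec{
			Replicas: ptr.To[int32](2),
			Storage:  tsapi.Storage{S3: &tsapi.S3{ /* bucket, endpoint, credentials ... */ }},
		},
	}
}
```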