From 42a52620168f9171c378b06eecb3c8d262f85e2e Mon Sep 17 00:00:00 2001 From: David Bond Date: Thu, 20 Nov 2025 11:46:34 +0000 Subject: [PATCH 01/33] cmd/k8s-operator: add multi replica support for recorders (#17864) This commit adds the `spec.replicas` field to the `Recorder` custom resource that allows for a highly available deployment of `tsrecorder` within a kubernetes cluster. Many changes were required here as the code hard-coded the assumption of a single replica. This has required a few loops, similar to what we do for the `Connector` resource to create auth and state secrets. It was also required to add a check to remove dangling state and auth secrets should the recorder be scaled down. Updates: https://github.com/tailscale/tailscale/issues/17965 Signed-off-by: David Bond --- .../deploy/crds/tailscale.com_recorders.yaml | 8 + .../deploy/manifests/operator.yaml | 8 + cmd/k8s-operator/operator.go | 2 +- cmd/k8s-operator/tsrecorder.go | 296 ++++++++++++------ cmd/k8s-operator/tsrecorder_specs.go | 95 ++++-- cmd/k8s-operator/tsrecorder_specs_test.go | 20 +- cmd/k8s-operator/tsrecorder_test.go | 89 ++++-- k8s-operator/api.md | 3 +- k8s-operator/apis/v1alpha1/types_recorder.go | 7 + .../apis/v1alpha1/zz_generated.deepcopy.go | 5 + 10 files changed, 381 insertions(+), 152 deletions(-) diff --git a/cmd/k8s-operator/deploy/crds/tailscale.com_recorders.yaml b/cmd/k8s-operator/deploy/crds/tailscale.com_recorders.yaml index 0f3dcfcca..48db3ef4b 100644 --- a/cmd/k8s-operator/deploy/crds/tailscale.com_recorders.yaml +++ b/cmd/k8s-operator/deploy/crds/tailscale.com_recorders.yaml @@ -68,6 +68,11 @@ spec: Corresponds to --ui tsrecorder flag https://tailscale.com/kb/1246/tailscale-ssh-session-recording#deploy-a-recorder-node. Required if S3 storage is not set up, to ensure that recordings are accessible. type: boolean + replicas: + description: Replicas specifies how many instances of tsrecorder to run. Defaults to 1. + type: integer + format: int32 + minimum: 0 statefulSet: description: |- Configuration parameters for the Recorder's StatefulSet. The operator @@ -1683,6 +1688,9 @@ spec: items: type: string pattern: ^tag:[a-zA-Z][a-zA-Z0-9-]*$ + x-kubernetes-validations: + - rule: '!(self.replicas > 1 && (!has(self.storage) || !has(self.storage.s3)))' + message: S3 storage must be used when deploying multiple Recorder replicas status: description: |- RecorderStatus describes the status of the recorder. This is set diff --git a/cmd/k8s-operator/deploy/manifests/operator.yaml b/cmd/k8s-operator/deploy/manifests/operator.yaml index c5da367e0..2757f09e5 100644 --- a/cmd/k8s-operator/deploy/manifests/operator.yaml +++ b/cmd/k8s-operator/deploy/manifests/operator.yaml @@ -3348,6 +3348,11 @@ spec: Corresponds to --ui tsrecorder flag https://tailscale.com/kb/1246/tailscale-ssh-session-recording#deploy-a-recorder-node. Required if S3 storage is not set up, to ensure that recordings are accessible. type: boolean + replicas: + description: Replicas specifies how many instances of tsrecorder to run. Defaults to 1. + format: int32 + minimum: 0 + type: integer statefulSet: description: |- Configuration parameters for the Recorder's StatefulSet. The operator @@ -4964,6 +4969,9 @@ spec: type: string type: array type: object + x-kubernetes-validations: + - message: S3 storage must be used when deploying multiple Recorder replicas + rule: '!(self.replicas > 1 && (!has(self.storage) || !has(self.storage.s3)))' status: description: |- RecorderStatus describes the status of the recorder. 
This is set diff --git a/cmd/k8s-operator/operator.go b/cmd/k8s-operator/operator.go index 6b545a827..816fea566 100644 --- a/cmd/k8s-operator/operator.go +++ b/cmd/k8s-operator/operator.go @@ -44,10 +44,10 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager/signals" "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" - "tailscale.com/envknob" "tailscale.com/client/local" "tailscale.com/client/tailscale" + "tailscale.com/envknob" "tailscale.com/hostinfo" "tailscale.com/ipn" "tailscale.com/ipn/store/kubestore" diff --git a/cmd/k8s-operator/tsrecorder.go b/cmd/k8s-operator/tsrecorder.go index c922f78fe..bfb01fa86 100644 --- a/cmd/k8s-operator/tsrecorder.go +++ b/cmd/k8s-operator/tsrecorder.go @@ -12,6 +12,7 @@ import ( "fmt" "net/http" "slices" + "strconv" "strings" "sync" @@ -29,6 +30,7 @@ import ( "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "tailscale.com/client/tailscale" tsoperator "tailscale.com/k8s-operator" tsapi "tailscale.com/k8s-operator/apis/v1alpha1" @@ -69,13 +71,13 @@ func (r *RecorderReconciler) logger(name string) *zap.SugaredLogger { return r.log.With("Recorder", name) } -func (r *RecorderReconciler) Reconcile(ctx context.Context, req reconcile.Request) (_ reconcile.Result, err error) { +func (r *RecorderReconciler) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { logger := r.logger(req.Name) logger.Debugf("starting reconcile") defer logger.Debugf("reconcile finished") tsr := new(tsapi.Recorder) - err = r.Get(ctx, req.NamespacedName, tsr) + err := r.Get(ctx, req.NamespacedName, tsr) if apierrors.IsNotFound(err) { logger.Debugf("Recorder not found, assuming it was deleted") return reconcile.Result{}, nil @@ -98,7 +100,7 @@ func (r *RecorderReconciler) Reconcile(ctx context.Context, req reconcile.Reques } tsr.Finalizers = slices.Delete(tsr.Finalizers, ix, ix+1) - if err := r.Update(ctx, tsr); err != nil { + if err = r.Update(ctx, tsr); err != nil { return reconcile.Result{}, err } return reconcile.Result{}, nil @@ -110,10 +112,11 @@ func (r *RecorderReconciler) Reconcile(ctx context.Context, req reconcile.Reques if !apiequality.Semantic.DeepEqual(oldTSRStatus, &tsr.Status) { // An error encountered here should get returned by the Reconcile function. if updateErr := r.Client.Status().Update(ctx, tsr); updateErr != nil { - err = errors.Join(err, updateErr) + return reconcile.Result{}, errors.Join(err, updateErr) } } - return reconcile.Result{}, err + + return reconcile.Result{}, nil } if !slices.Contains(tsr.Finalizers, FinalizerName) { @@ -123,12 +126,12 @@ func (r *RecorderReconciler) Reconcile(ctx context.Context, req reconcile.Reques // operation is underway. 
logger.Infof("ensuring Recorder is set up") tsr.Finalizers = append(tsr.Finalizers, FinalizerName) - if err := r.Update(ctx, tsr); err != nil { + if err = r.Update(ctx, tsr); err != nil { return setStatusReady(tsr, metav1.ConditionFalse, reasonRecorderCreationFailed, reasonRecorderCreationFailed) } } - if err := r.validate(ctx, tsr); err != nil { + if err = r.validate(ctx, tsr); err != nil { message := fmt.Sprintf("Recorder is invalid: %s", err) r.recorder.Eventf(tsr, corev1.EventTypeWarning, reasonRecorderInvalid, message) return setStatusReady(tsr, metav1.ConditionFalse, reasonRecorderInvalid, message) @@ -160,19 +163,29 @@ func (r *RecorderReconciler) maybeProvision(ctx context.Context, tsr *tsapi.Reco gaugeRecorderResources.Set(int64(r.recorders.Len())) r.mu.Unlock() - if err := r.ensureAuthSecretCreated(ctx, tsr); err != nil { + if err := r.ensureAuthSecretsCreated(ctx, tsr); err != nil { return fmt.Errorf("error creating secrets: %w", err) } - // State Secret is precreated so we can use the Recorder CR as its owner ref. - sec := tsrStateSecret(tsr, r.tsNamespace) - if _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, sec, func(s *corev1.Secret) { - s.ObjectMeta.Labels = sec.ObjectMeta.Labels - s.ObjectMeta.Annotations = sec.ObjectMeta.Annotations - }); err != nil { - return fmt.Errorf("error creating state Secret: %w", err) + + // State Secrets are pre-created so we can use the Recorder CR as its owner ref. + var replicas int32 = 1 + if tsr.Spec.Replicas != nil { + replicas = *tsr.Spec.Replicas + } + + for replica := range replicas { + sec := tsrStateSecret(tsr, r.tsNamespace, replica) + _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, sec, func(s *corev1.Secret) { + s.ObjectMeta.Labels = sec.ObjectMeta.Labels + s.ObjectMeta.Annotations = sec.ObjectMeta.Annotations + }) + if err != nil { + return fmt.Errorf("error creating state Secret %q: %w", sec.Name, err) + } } + sa := tsrServiceAccount(tsr, r.tsNamespace) - if _, err := createOrMaybeUpdate(ctx, r.Client, r.tsNamespace, sa, func(s *corev1.ServiceAccount) error { + _, err := createOrMaybeUpdate(ctx, r.Client, r.tsNamespace, sa, func(s *corev1.ServiceAccount) error { // Perform this check within the update function to make sure we don't // have a race condition between the previous check and the update. 
if err := saOwnedByRecorder(s, tsr); err != nil { @@ -183,54 +196,68 @@ func (r *RecorderReconciler) maybeProvision(ctx context.Context, tsr *tsapi.Reco s.ObjectMeta.Annotations = sa.ObjectMeta.Annotations return nil - }); err != nil { + }) + if err != nil { return fmt.Errorf("error creating ServiceAccount: %w", err) } + role := tsrRole(tsr, r.tsNamespace) - if _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, role, func(r *rbacv1.Role) { + _, err = createOrUpdate(ctx, r.Client, r.tsNamespace, role, func(r *rbacv1.Role) { r.ObjectMeta.Labels = role.ObjectMeta.Labels r.ObjectMeta.Annotations = role.ObjectMeta.Annotations r.Rules = role.Rules - }); err != nil { + }) + if err != nil { return fmt.Errorf("error creating Role: %w", err) } + roleBinding := tsrRoleBinding(tsr, r.tsNamespace) - if _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, roleBinding, func(r *rbacv1.RoleBinding) { + _, err = createOrUpdate(ctx, r.Client, r.tsNamespace, roleBinding, func(r *rbacv1.RoleBinding) { r.ObjectMeta.Labels = roleBinding.ObjectMeta.Labels r.ObjectMeta.Annotations = roleBinding.ObjectMeta.Annotations r.RoleRef = roleBinding.RoleRef r.Subjects = roleBinding.Subjects - }); err != nil { + }) + if err != nil { return fmt.Errorf("error creating RoleBinding: %w", err) } + ss := tsrStatefulSet(tsr, r.tsNamespace, r.loginServer) - if _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, ss, func(s *appsv1.StatefulSet) { + _, err = createOrUpdate(ctx, r.Client, r.tsNamespace, ss, func(s *appsv1.StatefulSet) { s.ObjectMeta.Labels = ss.ObjectMeta.Labels s.ObjectMeta.Annotations = ss.ObjectMeta.Annotations s.Spec = ss.Spec - }); err != nil { + }) + if err != nil { return fmt.Errorf("error creating StatefulSet: %w", err) } // ServiceAccount name may have changed, in which case we need to clean up // the previous ServiceAccount. RoleBinding will already be updated to point // to the new ServiceAccount. - if err := r.maybeCleanupServiceAccounts(ctx, tsr, sa.Name); err != nil { + if err = r.maybeCleanupServiceAccounts(ctx, tsr, sa.Name); err != nil { return fmt.Errorf("error cleaning up ServiceAccounts: %w", err) } + // If we have scaled the recorder down, we will have dangling state secrets + // that we need to clean up. + if err = r.maybeCleanupSecrets(ctx, tsr); err != nil { + return fmt.Errorf("error cleaning up Secrets: %w", err) + } + var devices []tsapi.RecorderTailnetDevice + for replica := range replicas { + dev, ok, err := r.getDeviceInfo(ctx, tsr.Name, replica) + switch { + case err != nil: + return fmt.Errorf("failed to get device info: %w", err) + case !ok: + logger.Debugf("no Tailscale hostname known yet, waiting for Recorder pod to finish auth") + continue + } - device, ok, err := r.getDeviceInfo(ctx, tsr.Name) - if err != nil { - return fmt.Errorf("failed to get device info: %w", err) + devices = append(devices, dev) } - if !ok { - logger.Debugf("no Tailscale hostname known yet, waiting for Recorder pod to finish auth") - return nil - } - - devices = append(devices, device) tsr.Status.Devices = devices @@ -257,22 +284,89 @@ func saOwnedByRecorder(sa *corev1.ServiceAccount, tsr *tsapi.Recorder) error { func (r *RecorderReconciler) maybeCleanupServiceAccounts(ctx context.Context, tsr *tsapi.Recorder, currentName string) error { logger := r.logger(tsr.Name) - // List all ServiceAccounts owned by this Recorder. 
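+	// List the ServiceAccounts labeled as belonging to this Recorder; any whose
+	// name no longer matches the current ServiceAccount name are stale and are
+	// deleted below.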
+ options := []client.ListOption{ + client.InNamespace(r.tsNamespace), + client.MatchingLabels(tsrLabels("recorder", tsr.Name, nil)), + } + sas := &corev1.ServiceAccountList{} - if err := r.List(ctx, sas, client.InNamespace(r.tsNamespace), client.MatchingLabels(labels("recorder", tsr.Name, nil))); err != nil { + if err := r.List(ctx, sas, options...); err != nil { return fmt.Errorf("error listing ServiceAccounts for cleanup: %w", err) } - for _, sa := range sas.Items { - if sa.Name == currentName { + + for _, serviceAccount := range sas.Items { + if serviceAccount.Name == currentName { + continue + } + + err := r.Delete(ctx, &serviceAccount) + switch { + case apierrors.IsNotFound(err): + logger.Debugf("ServiceAccount %s not found, likely already deleted", serviceAccount.Name) + continue + case err != nil: + return fmt.Errorf("error deleting ServiceAccount %s: %w", serviceAccount.Name, err) + } + } + + return nil +} + +func (r *RecorderReconciler) maybeCleanupSecrets(ctx context.Context, tsr *tsapi.Recorder) error { + options := []client.ListOption{ + client.InNamespace(r.tsNamespace), + client.MatchingLabels(tsrLabels("recorder", tsr.Name, nil)), + } + + secrets := &corev1.SecretList{} + if err := r.List(ctx, secrets, options...); err != nil { + return fmt.Errorf("error listing Secrets for cleanup: %w", err) + } + + // Get the largest ordinal suffix that we expect. Then we'll go through the list of secrets owned by this + // recorder and remove them. + var replicas int32 = 1 + if tsr.Spec.Replicas != nil { + replicas = *tsr.Spec.Replicas + } + + for _, secret := range secrets.Items { + parts := strings.Split(secret.Name, "-") + if len(parts) == 0 { + continue + } + + ordinal, err := strconv.ParseUint(parts[len(parts)-1], 10, 32) + if err != nil { + return fmt.Errorf("error parsing secret name %q: %w", secret.Name, err) + } + + if int32(ordinal) < replicas { continue } - if err := r.Delete(ctx, &sa); err != nil { - if apierrors.IsNotFound(err) { - logger.Debugf("ServiceAccount %s not found, likely already deleted", sa.Name) - } else { - return fmt.Errorf("error deleting ServiceAccount %s: %w", sa.Name, err) + + devicePrefs, ok, err := getDevicePrefs(&secret) + if err != nil { + return err + } + + if ok { + var errResp *tailscale.ErrResponse + + r.log.Debugf("deleting device %s", devicePrefs.Config.NodeID) + err = r.tsClient.DeleteDevice(ctx, string(devicePrefs.Config.NodeID)) + switch { + case errors.As(err, &errResp) && errResp.Status == http.StatusNotFound: + // This device has possibly already been deleted in the admin console. So we can ignore this + // and move on to removing the secret. 
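+			// Any other error aborts the cleanup: the state Secret is kept, and
+			// the next reconcile retries both the device and Secret deletion.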
+ case err != nil: + return err } } + + if err = r.Delete(ctx, &secret); err != nil { + return err + } } return nil @@ -284,30 +378,38 @@ func (r *RecorderReconciler) maybeCleanupServiceAccounts(ctx context.Context, ts func (r *RecorderReconciler) maybeCleanup(ctx context.Context, tsr *tsapi.Recorder) (bool, error) { logger := r.logger(tsr.Name) - prefs, ok, err := r.getDevicePrefs(ctx, tsr.Name) - if err != nil { - return false, err + var replicas int32 = 1 + if tsr.Spec.Replicas != nil { + replicas = *tsr.Spec.Replicas } - if !ok { - logger.Debugf("state Secret %s-0 not found or does not contain node ID, continuing cleanup", tsr.Name) - r.mu.Lock() - r.recorders.Remove(tsr.UID) - gaugeRecorderResources.Set(int64(r.recorders.Len())) - r.mu.Unlock() - return true, nil - } - - id := string(prefs.Config.NodeID) - logger.Debugf("deleting device %s from control", string(id)) - if err := r.tsClient.DeleteDevice(ctx, string(id)); err != nil { - errResp := &tailscale.ErrResponse{} - if ok := errors.As(err, errResp); ok && errResp.Status == http.StatusNotFound { - logger.Debugf("device %s not found, likely because it has already been deleted from control", string(id)) - } else { + + for replica := range replicas { + devicePrefs, ok, err := r.getDevicePrefs(ctx, tsr.Name, replica) + if err != nil { + return false, err + } + if !ok { + logger.Debugf("state Secret %s-%d not found or does not contain node ID, continuing cleanup", tsr.Name, replica) + r.mu.Lock() + r.recorders.Remove(tsr.UID) + gaugeRecorderResources.Set(int64(r.recorders.Len())) + r.mu.Unlock() + return true, nil + } + + nodeID := string(devicePrefs.Config.NodeID) + logger.Debugf("deleting device %s from control", nodeID) + if err = r.tsClient.DeleteDevice(ctx, nodeID); err != nil { + errResp := &tailscale.ErrResponse{} + if errors.As(err, errResp) && errResp.Status == http.StatusNotFound { + logger.Debugf("device %s not found, likely because it has already been deleted from control", nodeID) + continue + } + return false, fmt.Errorf("error deleting device: %w", err) } - } else { - logger.Debugf("device %s deleted from control", string(id)) + + logger.Debugf("device %s deleted from control", nodeID) } // Unlike most log entries in the reconcile loop, this will get printed @@ -319,38 +421,46 @@ func (r *RecorderReconciler) maybeCleanup(ctx context.Context, tsr *tsapi.Record r.recorders.Remove(tsr.UID) gaugeRecorderResources.Set(int64(r.recorders.Len())) r.mu.Unlock() + return true, nil } -func (r *RecorderReconciler) ensureAuthSecretCreated(ctx context.Context, tsr *tsapi.Recorder) error { - logger := r.logger(tsr.Name) - key := types.NamespacedName{ - Namespace: r.tsNamespace, - Name: tsr.Name, - } - if err := r.Get(ctx, key, &corev1.Secret{}); err == nil { - // No updates, already created the auth key. - logger.Debugf("auth Secret %s already exists", key.Name) - return nil - } else if !apierrors.IsNotFound(err) { - return err +func (r *RecorderReconciler) ensureAuthSecretsCreated(ctx context.Context, tsr *tsapi.Recorder) error { + var replicas int32 = 1 + if tsr.Spec.Replicas != nil { + replicas = *tsr.Spec.Replicas } - // Create the auth key Secret which is going to be used by the StatefulSet - // to authenticate with Tailscale. 
- logger.Debugf("creating authkey for new Recorder") tags := tsr.Spec.Tags if len(tags) == 0 { tags = tsapi.Tags{"tag:k8s"} } - authKey, err := newAuthKey(ctx, r.tsClient, tags.Stringify()) - if err != nil { - return err - } - logger.Debug("creating a new Secret for the Recorder") - if err := r.Create(ctx, tsrAuthSecret(tsr, r.tsNamespace, authKey)); err != nil { - return err + logger := r.logger(tsr.Name) + + for replica := range replicas { + key := types.NamespacedName{ + Namespace: r.tsNamespace, + Name: fmt.Sprintf("%s-auth-%d", tsr.Name, replica), + } + + err := r.Get(ctx, key, &corev1.Secret{}) + switch { + case err == nil: + logger.Debugf("auth Secret %q already exists", key.Name) + continue + case !apierrors.IsNotFound(err): + return fmt.Errorf("failed to get Secret %q: %w", key.Name, err) + } + + authKey, err := newAuthKey(ctx, r.tsClient, tags.Stringify()) + if err != nil { + return err + } + + if err = r.Create(ctx, tsrAuthSecret(tsr, r.tsNamespace, authKey, replica)); err != nil { + return err + } } return nil @@ -361,6 +471,10 @@ func (r *RecorderReconciler) validate(ctx context.Context, tsr *tsapi.Recorder) return errors.New("must either enable UI or use S3 storage to ensure recordings are accessible") } + if tsr.Spec.Replicas != nil && *tsr.Spec.Replicas > 1 && tsr.Spec.Storage.S3 == nil { + return errors.New("must use S3 storage when using multiple replicas to ensure recordings are accessible") + } + // Check any custom ServiceAccount config doesn't conflict with pre-existing // ServiceAccounts. This check is performed once during validation to ensure // errors are raised early, but also again during any Updates to prevent a race. @@ -394,11 +508,11 @@ func (r *RecorderReconciler) validate(ctx context.Context, tsr *tsapi.Recorder) return nil } -func (r *RecorderReconciler) getStateSecret(ctx context.Context, tsrName string) (*corev1.Secret, error) { +func (r *RecorderReconciler) getStateSecret(ctx context.Context, tsrName string, replica int32) (*corev1.Secret, error) { secret := &corev1.Secret{ ObjectMeta: metav1.ObjectMeta{ Namespace: r.tsNamespace, - Name: fmt.Sprintf("%s-0", tsrName), + Name: fmt.Sprintf("%s-%d", tsrName, replica), }, } if err := r.Get(ctx, client.ObjectKeyFromObject(secret), secret); err != nil { @@ -412,8 +526,8 @@ func (r *RecorderReconciler) getStateSecret(ctx context.Context, tsrName string) return secret, nil } -func (r *RecorderReconciler) getDevicePrefs(ctx context.Context, tsrName string) (prefs prefs, ok bool, err error) { - secret, err := r.getStateSecret(ctx, tsrName) +func (r *RecorderReconciler) getDevicePrefs(ctx context.Context, tsrName string, replica int32) (prefs prefs, ok bool, err error) { + secret, err := r.getStateSecret(ctx, tsrName, replica) if err != nil || secret == nil { return prefs, false, err } @@ -441,8 +555,8 @@ func getDevicePrefs(secret *corev1.Secret) (prefs prefs, ok bool, err error) { return prefs, ok, nil } -func (r *RecorderReconciler) getDeviceInfo(ctx context.Context, tsrName string) (d tsapi.RecorderTailnetDevice, ok bool, err error) { - secret, err := r.getStateSecret(ctx, tsrName) +func (r *RecorderReconciler) getDeviceInfo(ctx context.Context, tsrName string, replica int32) (d tsapi.RecorderTailnetDevice, ok bool, err error) { + secret, err := r.getStateSecret(ctx, tsrName, replica) if err != nil || secret == nil { return tsapi.RecorderTailnetDevice{}, false, err } diff --git a/cmd/k8s-operator/tsrecorder_specs.go b/cmd/k8s-operator/tsrecorder_specs.go index 83d7439db..b4a10f296 100644 --- 
a/cmd/k8s-operator/tsrecorder_specs.go +++ b/cmd/k8s-operator/tsrecorder_specs.go @@ -12,30 +12,36 @@ import ( corev1 "k8s.io/api/core/v1" rbacv1 "k8s.io/api/rbac/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + tsapi "tailscale.com/k8s-operator/apis/v1alpha1" "tailscale.com/types/ptr" "tailscale.com/version" ) func tsrStatefulSet(tsr *tsapi.Recorder, namespace string, loginServer string) *appsv1.StatefulSet { - return &appsv1.StatefulSet{ + var replicas int32 = 1 + if tsr.Spec.Replicas != nil { + replicas = *tsr.Spec.Replicas + } + + ss := &appsv1.StatefulSet{ ObjectMeta: metav1.ObjectMeta{ Name: tsr.Name, Namespace: namespace, - Labels: labels("recorder", tsr.Name, tsr.Spec.StatefulSet.Labels), + Labels: tsrLabels("recorder", tsr.Name, tsr.Spec.StatefulSet.Labels), OwnerReferences: tsrOwnerReference(tsr), Annotations: tsr.Spec.StatefulSet.Annotations, }, Spec: appsv1.StatefulSetSpec{ - Replicas: ptr.To[int32](1), + Replicas: ptr.To(replicas), Selector: &metav1.LabelSelector{ - MatchLabels: labels("recorder", tsr.Name, tsr.Spec.StatefulSet.Pod.Labels), + MatchLabels: tsrLabels("recorder", tsr.Name, tsr.Spec.StatefulSet.Pod.Labels), }, Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ Name: tsr.Name, Namespace: namespace, - Labels: labels("recorder", tsr.Name, tsr.Spec.StatefulSet.Pod.Labels), + Labels: tsrLabels("recorder", tsr.Name, tsr.Spec.StatefulSet.Pod.Labels), Annotations: tsr.Spec.StatefulSet.Pod.Annotations, }, Spec: corev1.PodSpec{ @@ -59,7 +65,7 @@ func tsrStatefulSet(tsr *tsapi.Recorder, namespace string, loginServer string) * ImagePullPolicy: tsr.Spec.StatefulSet.Pod.Container.ImagePullPolicy, Resources: tsr.Spec.StatefulSet.Pod.Container.Resources, SecurityContext: tsr.Spec.StatefulSet.Pod.Container.SecurityContext, - Env: env(tsr, loginServer), + Env: tsrEnv(tsr, loginServer), EnvFrom: func() []corev1.EnvFromSource { if tsr.Spec.Storage.S3 == nil || tsr.Spec.Storage.S3.Credentials.Secret.Name == "" { return nil @@ -95,6 +101,28 @@ func tsrStatefulSet(tsr *tsapi.Recorder, namespace string, loginServer string) * }, }, } + + for replica := range replicas { + volumeName := fmt.Sprintf("authkey-%d", replica) + + ss.Spec.Template.Spec.Containers[0].VolumeMounts = append(ss.Spec.Template.Spec.Containers[0].VolumeMounts, corev1.VolumeMount{ + Name: volumeName, + ReadOnly: true, + MountPath: fmt.Sprintf("/etc/tailscaled/%s-%d", ss.Name, replica), + }) + + ss.Spec.Template.Spec.Volumes = append(ss.Spec.Template.Spec.Volumes, corev1.Volume{ + Name: volumeName, + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{ + SecretName: fmt.Sprintf("%s-auth-%d", tsr.Name, replica), + Items: []corev1.KeyToPath{{Key: "authkey", Path: "authkey"}}, + }, + }, + }) + } + + return ss } func tsrServiceAccount(tsr *tsapi.Recorder, namespace string) *corev1.ServiceAccount { @@ -102,7 +130,7 @@ func tsrServiceAccount(tsr *tsapi.Recorder, namespace string) *corev1.ServiceAcc ObjectMeta: metav1.ObjectMeta{ Name: tsrServiceAccountName(tsr), Namespace: namespace, - Labels: labels("recorder", tsr.Name, nil), + Labels: tsrLabels("recorder", tsr.Name, nil), OwnerReferences: tsrOwnerReference(tsr), Annotations: tsr.Spec.StatefulSet.Pod.ServiceAccount.Annotations, }, @@ -120,11 +148,24 @@ func tsrServiceAccountName(tsr *tsapi.Recorder) string { } func tsrRole(tsr *tsapi.Recorder, namespace string) *rbacv1.Role { + var replicas int32 = 1 + if tsr.Spec.Replicas != nil { + replicas = *tsr.Spec.Replicas + } + + resourceNames := make([]string, 0) + for replica := range replicas { 
+ resourceNames = append(resourceNames, + fmt.Sprintf("%s-%d", tsr.Name, replica), // State secret. + fmt.Sprintf("%s-auth-%d", tsr.Name, replica), // Auth key secret. + ) + } + return &rbacv1.Role{ ObjectMeta: metav1.ObjectMeta{ Name: tsr.Name, Namespace: namespace, - Labels: labels("recorder", tsr.Name, nil), + Labels: tsrLabels("recorder", tsr.Name, nil), OwnerReferences: tsrOwnerReference(tsr), }, Rules: []rbacv1.PolicyRule{ @@ -136,10 +177,7 @@ func tsrRole(tsr *tsapi.Recorder, namespace string) *rbacv1.Role { "patch", "update", }, - ResourceNames: []string{ - tsr.Name, // Contains the auth key. - fmt.Sprintf("%s-0", tsr.Name), // Contains the node state. - }, + ResourceNames: resourceNames, }, { APIGroups: []string{""}, @@ -159,7 +197,7 @@ func tsrRoleBinding(tsr *tsapi.Recorder, namespace string) *rbacv1.RoleBinding { ObjectMeta: metav1.ObjectMeta{ Name: tsr.Name, Namespace: namespace, - Labels: labels("recorder", tsr.Name, nil), + Labels: tsrLabels("recorder", tsr.Name, nil), OwnerReferences: tsrOwnerReference(tsr), }, Subjects: []rbacv1.Subject{ @@ -176,12 +214,12 @@ func tsrRoleBinding(tsr *tsapi.Recorder, namespace string) *rbacv1.RoleBinding { } } -func tsrAuthSecret(tsr *tsapi.Recorder, namespace string, authKey string) *corev1.Secret { +func tsrAuthSecret(tsr *tsapi.Recorder, namespace string, authKey string, replica int32) *corev1.Secret { return &corev1.Secret{ ObjectMeta: metav1.ObjectMeta{ Namespace: namespace, - Name: tsr.Name, - Labels: labels("recorder", tsr.Name, nil), + Name: fmt.Sprintf("%s-auth-%d", tsr.Name, replica), + Labels: tsrLabels("recorder", tsr.Name, nil), OwnerReferences: tsrOwnerReference(tsr), }, StringData: map[string]string{ @@ -190,30 +228,19 @@ func tsrAuthSecret(tsr *tsapi.Recorder, namespace string, authKey string) *corev } } -func tsrStateSecret(tsr *tsapi.Recorder, namespace string) *corev1.Secret { +func tsrStateSecret(tsr *tsapi.Recorder, namespace string, replica int32) *corev1.Secret { return &corev1.Secret{ ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("%s-0", tsr.Name), + Name: fmt.Sprintf("%s-%d", tsr.Name, replica), Namespace: namespace, - Labels: labels("recorder", tsr.Name, nil), + Labels: tsrLabels("recorder", tsr.Name, nil), OwnerReferences: tsrOwnerReference(tsr), }, } } -func env(tsr *tsapi.Recorder, loginServer string) []corev1.EnvVar { +func tsrEnv(tsr *tsapi.Recorder, loginServer string) []corev1.EnvVar { envs := []corev1.EnvVar{ - { - Name: "TS_AUTHKEY", - ValueFrom: &corev1.EnvVarSource{ - SecretKeyRef: &corev1.SecretKeySelector{ - LocalObjectReference: corev1.LocalObjectReference{ - Name: tsr.Name, - }, - Key: "authkey", - }, - }, - }, { Name: "POD_NAME", ValueFrom: &corev1.EnvVarSource{ @@ -231,6 +258,10 @@ func env(tsr *tsapi.Recorder, loginServer string) []corev1.EnvVar { }, }, }, + { + Name: "TS_AUTHKEY_FILE", + Value: "/etc/tailscaled/$(POD_NAME)/authkey", + }, { Name: "TS_STATE", Value: "kube:$(POD_NAME)", @@ -280,7 +311,7 @@ func env(tsr *tsapi.Recorder, loginServer string) []corev1.EnvVar { return envs } -func labels(app, instance string, customLabels map[string]string) map[string]string { +func tsrLabels(app, instance string, customLabels map[string]string) map[string]string { labels := make(map[string]string, len(customLabels)+3) for k, v := range customLabels { labels[k] = v diff --git a/cmd/k8s-operator/tsrecorder_specs_test.go b/cmd/k8s-operator/tsrecorder_specs_test.go index 49332d09b..0d78129fc 100644 --- a/cmd/k8s-operator/tsrecorder_specs_test.go +++ b/cmd/k8s-operator/tsrecorder_specs_test.go @@ -12,6 
+12,7 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + tsapi "tailscale.com/k8s-operator/apis/v1alpha1" "tailscale.com/types/ptr" ) @@ -23,6 +24,7 @@ func TestRecorderSpecs(t *testing.T) { Name: "test", }, Spec: tsapi.RecorderSpec{ + Replicas: ptr.To[int32](3), StatefulSet: tsapi.RecorderStatefulSet{ Labels: map[string]string{ "ss-label-key": "ss-label-value", @@ -101,10 +103,10 @@ func TestRecorderSpecs(t *testing.T) { } // Pod-level. - if diff := cmp.Diff(ss.Labels, labels("recorder", "test", tsr.Spec.StatefulSet.Labels)); diff != "" { + if diff := cmp.Diff(ss.Labels, tsrLabels("recorder", "test", tsr.Spec.StatefulSet.Labels)); diff != "" { t.Errorf("(-got +want):\n%s", diff) } - if diff := cmp.Diff(ss.Spec.Template.Labels, labels("recorder", "test", tsr.Spec.StatefulSet.Pod.Labels)); diff != "" { + if diff := cmp.Diff(ss.Spec.Template.Labels, tsrLabels("recorder", "test", tsr.Spec.StatefulSet.Pod.Labels)); diff != "" { t.Errorf("(-got +want):\n%s", diff) } if diff := cmp.Diff(ss.Spec.Template.Spec.Affinity, tsr.Spec.StatefulSet.Pod.Affinity); diff != "" { @@ -124,7 +126,7 @@ func TestRecorderSpecs(t *testing.T) { } // Container-level. - if diff := cmp.Diff(ss.Spec.Template.Spec.Containers[0].Env, env(tsr, tsLoginServer)); diff != "" { + if diff := cmp.Diff(ss.Spec.Template.Spec.Containers[0].Env, tsrEnv(tsr, tsLoginServer)); diff != "" { t.Errorf("(-got +want):\n%s", diff) } if diff := cmp.Diff(ss.Spec.Template.Spec.Containers[0].Image, tsr.Spec.StatefulSet.Pod.Container.Image); diff != "" { @@ -139,5 +141,17 @@ func TestRecorderSpecs(t *testing.T) { if diff := cmp.Diff(ss.Spec.Template.Spec.Containers[0].Resources, tsr.Spec.StatefulSet.Pod.Container.Resources); diff != "" { t.Errorf("(-got +want):\n%s", diff) } + + if *ss.Spec.Replicas != *tsr.Spec.Replicas { + t.Errorf("expected %d replicas, got %d", *tsr.Spec.Replicas, *ss.Spec.Replicas) + } + + if len(ss.Spec.Template.Spec.Volumes) != int(*tsr.Spec.Replicas)+1 { + t.Errorf("expected %d volumes, got %d", *tsr.Spec.Replicas+1, len(ss.Spec.Template.Spec.Volumes)) + } + + if len(ss.Spec.Template.Spec.Containers[0].VolumeMounts) != int(*tsr.Spec.Replicas)+1 { + t.Errorf("expected %d volume mounts, got %d", *tsr.Spec.Replicas+1, len(ss.Spec.Template.Spec.Containers[0].VolumeMounts)) + } }) } diff --git a/cmd/k8s-operator/tsrecorder_test.go b/cmd/k8s-operator/tsrecorder_test.go index 184af2344..f7ff797b1 100644 --- a/cmd/k8s-operator/tsrecorder_test.go +++ b/cmd/k8s-operator/tsrecorder_test.go @@ -8,6 +8,7 @@ package main import ( "context" "encoding/json" + "fmt" "strings" "testing" @@ -20,9 +21,11 @@ import ( "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" + tsoperator "tailscale.com/k8s-operator" tsapi "tailscale.com/k8s-operator/apis/v1alpha1" "tailscale.com/tstest" + "tailscale.com/types/ptr" ) const ( @@ -36,6 +39,9 @@ func TestRecorder(t *testing.T) { Name: "test", Finalizers: []string{"tailscale.com/finalizer"}, }, + Spec: tsapi.RecorderSpec{ + Replicas: ptr.To[int32](3), + }, } fc := fake.NewClientBuilder(). 
@@ -80,6 +86,15 @@ func TestRecorder(t *testing.T) { }) expectReconciled(t, reconciler, "", tsr.Name) + expectedEvent = "Warning RecorderInvalid Recorder is invalid: must use S3 storage when using multiple replicas to ensure recordings are accessible" + expectEvents(t, fr, []string{expectedEvent}) + + tsr.Spec.Storage.S3 = &tsapi.S3{} + mustUpdate(t, fc, "", "test", func(t *tsapi.Recorder) { + t.Spec = tsr.Spec + }) + expectReconciled(t, reconciler, "", tsr.Name) + // Only check part of this error message, because it's defined in an // external package and may change. if err := fc.Get(context.Background(), client.ObjectKey{ @@ -180,33 +195,47 @@ func TestRecorder(t *testing.T) { }) t.Run("populate_node_info_in_state_secret_and_see_it_appear_in_status", func(t *testing.T) { - bytes, err := json.Marshal(map[string]any{ - "Config": map[string]any{ - "NodeID": "nodeid-123", - "UserProfile": map[string]any{ - "LoginName": "test-0.example.ts.net", - }, - }, - }) - if err != nil { - t.Fatal(err) - } const key = "profile-abc" - mustUpdate(t, fc, tsNamespace, "test-0", func(s *corev1.Secret) { - s.Data = map[string][]byte{ - currentProfileKey: []byte(key), - key: bytes, + for replica := range *tsr.Spec.Replicas { + bytes, err := json.Marshal(map[string]any{ + "Config": map[string]any{ + "NodeID": fmt.Sprintf("node-%d", replica), + "UserProfile": map[string]any{ + "LoginName": fmt.Sprintf("test-%d.example.ts.net", replica), + }, + }, + }) + if err != nil { + t.Fatal(err) } - }) + + name := fmt.Sprintf("%s-%d", "test", replica) + mustUpdate(t, fc, tsNamespace, name, func(s *corev1.Secret) { + s.Data = map[string][]byte{ + currentProfileKey: []byte(key), + key: bytes, + } + }) + } expectReconciled(t, reconciler, "", tsr.Name) tsr.Status.Devices = []tsapi.RecorderTailnetDevice{ { - Hostname: "hostname-nodeid-123", + Hostname: "hostname-node-0", TailnetIPs: []string{"1.2.3.4", "::1"}, URL: "https://test-0.example.ts.net", }, + { + Hostname: "hostname-node-1", + TailnetIPs: []string{"1.2.3.4", "::1"}, + URL: "https://test-1.example.ts.net", + }, + { + Hostname: "hostname-node-2", + TailnetIPs: []string{"1.2.3.4", "::1"}, + URL: "https://test-2.example.ts.net", + }, } expectEqual(t, fc, tsr) }) @@ -222,7 +251,7 @@ func TestRecorder(t *testing.T) { if expected := 0; reconciler.recorders.Len() != expected { t.Fatalf("expected %d recorders, got %d", expected, reconciler.recorders.Len()) } - if diff := cmp.Diff(tsClient.deleted, []string{"nodeid-123"}); diff != "" { + if diff := cmp.Diff(tsClient.deleted, []string{"node-0", "node-1", "node-2"}); diff != "" { t.Fatalf("unexpected deleted devices (-got +want):\n%s", diff) } // The fake client does not clean up objects whose owner has been @@ -233,26 +262,38 @@ func TestRecorder(t *testing.T) { func expectRecorderResources(t *testing.T, fc client.WithWatch, tsr *tsapi.Recorder, shouldExist bool) { t.Helper() - auth := tsrAuthSecret(tsr, tsNamespace, "secret-authkey") - state := tsrStateSecret(tsr, tsNamespace) + var replicas int32 = 1 + if tsr.Spec.Replicas != nil { + replicas = *tsr.Spec.Replicas + } + role := tsrRole(tsr, tsNamespace) roleBinding := tsrRoleBinding(tsr, tsNamespace) serviceAccount := tsrServiceAccount(tsr, tsNamespace) statefulSet := tsrStatefulSet(tsr, tsNamespace, tsLoginServer) if shouldExist { - expectEqual(t, fc, auth) - expectEqual(t, fc, state) expectEqual(t, fc, role) expectEqual(t, fc, roleBinding) expectEqual(t, fc, serviceAccount) expectEqual(t, fc, statefulSet, removeResourceReqs) } else { - expectMissing[corev1.Secret](t, fc, 
auth.Namespace, auth.Name) - expectMissing[corev1.Secret](t, fc, state.Namespace, state.Name) expectMissing[rbacv1.Role](t, fc, role.Namespace, role.Name) expectMissing[rbacv1.RoleBinding](t, fc, roleBinding.Namespace, roleBinding.Name) expectMissing[corev1.ServiceAccount](t, fc, serviceAccount.Namespace, serviceAccount.Name) expectMissing[appsv1.StatefulSet](t, fc, statefulSet.Namespace, statefulSet.Name) } + + for replica := range replicas { + auth := tsrAuthSecret(tsr, tsNamespace, "secret-authkey", replica) + state := tsrStateSecret(tsr, tsNamespace, replica) + + if shouldExist { + expectEqual(t, fc, auth) + expectEqual(t, fc, state) + } else { + expectMissing[corev1.Secret](t, fc, auth.Namespace, auth.Name) + expectMissing[corev1.Secret](t, fc, state.Namespace, state.Name) + } + } } diff --git a/k8s-operator/api.md b/k8s-operator/api.md index 979d199cb..3a4e692d9 100644 --- a/k8s-operator/api.md +++ b/k8s-operator/api.md @@ -887,7 +887,7 @@ _Appears in:_ - +RecorderSpec describes a tsrecorder instance to be deployed in the cluster @@ -900,6 +900,7 @@ _Appears in:_ | `tags` _[Tags](#tags)_ | Tags that the Tailscale device will be tagged with. Defaults to [tag:k8s].
If you specify custom tags here, make sure you also make the operator<br />an owner of these tags.<br />See https://tailscale.com/kb/1236/kubernetes-operator/#setting-up-the-kubernetes-operator.<br />Tags cannot be changed once a Recorder node has been created.<br />Tag values must be in form ^tag:[a-zA-Z][a-zA-Z0-9-]*$. | | Pattern: `^tag:[a-zA-Z][a-zA-Z0-9-]*$` <br />Type: string <br /> |
| `enableUI` _boolean_ | Set to true to enable the Recorder UI. The UI lists and plays recorded sessions.<br />The UI will be served at :443. Defaults to false.<br />Corresponds to --ui tsrecorder flag https://tailscale.com/kb/1246/tailscale-ssh-session-recording#deploy-a-recorder-node.<br />Required if S3 storage is not set up, to ensure that recordings are accessible. | | |
| `storage` _[Storage](#storage)_ | Configure where to store session recordings. By default, recordings will<br />be stored in a local ephemeral volume, and will not be persisted past the<br />lifetime of a specific pod. | | |
+| `replicas` _integer_ | Replicas specifies how many instances of tsrecorder to run. Defaults to 1. | | Minimum: 0 <br />
| #### RecorderStatefulSet diff --git a/k8s-operator/apis/v1alpha1/types_recorder.go b/k8s-operator/apis/v1alpha1/types_recorder.go index 16a610b26..67cffbf09 100644 --- a/k8s-operator/apis/v1alpha1/types_recorder.go +++ b/k8s-operator/apis/v1alpha1/types_recorder.go @@ -44,6 +44,8 @@ type RecorderList struct { Items []Recorder `json:"items"` } +// RecorderSpec describes a tsrecorder instance to be deployed in the cluster +// +kubebuilder:validation:XValidation:rule="!(self.replicas > 1 && (!has(self.storage) || !has(self.storage.s3)))",message="S3 storage must be used when deploying multiple Recorder replicas" type RecorderSpec struct { // Configuration parameters for the Recorder's StatefulSet. The operator // deploys a StatefulSet for each Recorder resource. @@ -74,6 +76,11 @@ type RecorderSpec struct { // lifetime of a specific pod. // +optional Storage Storage `json:"storage,omitempty"` + + // Replicas specifies how many instances of tsrecorder to run. Defaults to 1. + // +optional + // +kubebuilder:validation:Minimum=0 + Replicas *int32 `json:"replicas,omitzero"` } type RecorderStatefulSet struct { diff --git a/k8s-operator/apis/v1alpha1/zz_generated.deepcopy.go b/k8s-operator/apis/v1alpha1/zz_generated.deepcopy.go index 7492f1e54..ff0f3f6ac 100644 --- a/k8s-operator/apis/v1alpha1/zz_generated.deepcopy.go +++ b/k8s-operator/apis/v1alpha1/zz_generated.deepcopy.go @@ -1068,6 +1068,11 @@ func (in *RecorderSpec) DeepCopyInto(out *RecorderSpec) { copy(*out, *in) } in.Storage.DeepCopyInto(&out.Storage) + if in.Replicas != nil { + in, out := &in.Replicas, &out.Replicas + *out = new(int32) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RecorderSpec. From ac74d28190e73af85fe181b81173ef686331f51c Mon Sep 17 00:00:00 2001 From: Harry Harpham Date: Thu, 20 Nov 2025 12:40:05 -0700 Subject: [PATCH 02/33] ipn/ipnlocal: add validations when setting serve config (#17950) These validations were previously performed in the CLI frontend. There are two motivations for moving these to the local backend: 1. The backend controls synchronization around the relevant state, so only the backend can guarantee many of these validations. 2. Doing these validations in the back-end avoids the need to repeat them across every frontend (e.g. the CLI and tsnet). Updates tailscale/corp#27200 Signed-off-by: Harry Harpham --- cmd/tailscale/cli/serve_v2.go | 68 ------ cmd/tailscale/cli/serve_v2_test.go | 204 ------------------ ipn/ipnlocal/serve.go | 155 +++++++++++++- ipn/ipnlocal/serve_test.go | 326 ++++++++++++++++++++++++++++- ipn/serve.go | 44 ++-- 5 files changed, 483 insertions(+), 314 deletions(-) diff --git a/cmd/tailscale/cli/serve_v2.go b/cmd/tailscale/cli/serve_v2.go index b60e645f3..89d247be9 100644 --- a/cmd/tailscale/cli/serve_v2.go +++ b/cmd/tailscale/cli/serve_v2.go @@ -478,11 +478,6 @@ func (e *serveEnv) runServeCombined(subcmd serveMode) execFunc { } wantFg := !e.bg.Value && !turnOff if wantFg { - // validate the config before creating a WatchIPNBus session - if err := e.validateConfig(parentSC, srvPort, srvType, svcName); err != nil { - return err - } - // if foreground mode, create a WatchIPNBus session // and use the nested config for all following operations // TODO(marwan-at-work): nested-config validations should happen here or previous to this point. @@ -508,9 +503,6 @@ func (e *serveEnv) runServeCombined(subcmd serveMode) execFunc { // only unset serve when trying to unset with type and port flags. 
err = e.unsetServe(sc, dnsName, srvType, srvPort, mount, magicDNSSuffix) } else { - if err := e.validateConfig(parentSC, srvPort, srvType, svcName); err != nil { - return err - } if forService { e.addServiceToPrefs(ctx, svcName) } @@ -907,66 +899,6 @@ func (e *serveEnv) runServeSetConfig(ctx context.Context, args []string) (err er return e.lc.SetServeConfig(ctx, sc) } -const backgroundExistsMsg = "background configuration already exists, use `tailscale %s --%s=%d off` to remove the existing configuration" - -// validateConfig checks if the serve config is valid to serve the type wanted on the port. -// dnsName is a FQDN or a serviceName (with `svc:` prefix). -func (e *serveEnv) validateConfig(sc *ipn.ServeConfig, port uint16, wantServe serveType, svcName tailcfg.ServiceName) error { - var tcpHandlerForPort *ipn.TCPPortHandler - if svcName != noService { - svc := sc.Services[svcName] - if svc == nil { - return nil - } - if wantServe == serveTypeTUN && (svc.TCP != nil || svc.Web != nil) { - return errors.New("service already has a TCP or Web handler, cannot serve in TUN mode") - } - if svc.Tun && wantServe != serveTypeTUN { - return errors.New("service is already being served in TUN mode") - } - if svc.TCP[port] == nil { - return nil - } - tcpHandlerForPort = svc.TCP[port] - } else { - sc, isFg := sc.FindConfig(port) - if sc == nil { - return nil - } - if isFg { - return errors.New("foreground already exists under this port") - } - if !e.bg.Value { - return fmt.Errorf(backgroundExistsMsg, infoMap[e.subcmd].Name, wantServe.String(), port) - } - tcpHandlerForPort = sc.TCP[port] - } - existingServe := serveFromPortHandler(tcpHandlerForPort) - if wantServe != existingServe { - target := svcName - if target == noService { - target = "machine" - } - return fmt.Errorf("want to serve %q but port is already serving %q for %q", wantServe, existingServe, target) - } - return nil -} - -func serveFromPortHandler(tcp *ipn.TCPPortHandler) serveType { - switch { - case tcp.HTTP: - return serveTypeHTTP - case tcp.HTTPS: - return serveTypeHTTPS - case tcp.TerminateTLS != "": - return serveTypeTLSTerminatedTCP - case tcp.TCPForward != "": - return serveTypeTCP - default: - return -1 - } -} - func (e *serveEnv) setServe(sc *ipn.ServeConfig, dnsName string, srvType serveType, srvPort uint16, mount string, target string, allowFunnel bool, mds string, caps []tailcfg.PeerCapability, proxyProtocol int) error { // update serve config based on the type switch srvType { diff --git a/cmd/tailscale/cli/serve_v2_test.go b/cmd/tailscale/cli/serve_v2_test.go index 491baf9dd..513c0d1ec 100644 --- a/cmd/tailscale/cli/serve_v2_test.go +++ b/cmd/tailscale/cli/serve_v2_test.go @@ -819,26 +819,6 @@ func TestServeDevConfigMutations(t *testing.T) { }, }, }, - { - name: "forground_with_bg_conflict", - steps: []step{ - { - command: cmd("serve --bg --http=3000 localhost:3000"), - want: &ipn.ServeConfig{ - TCP: map[uint16]*ipn.TCPPortHandler{3000: {HTTP: true}}, - Web: map[ipn.HostPort]*ipn.WebServerConfig{ - "foo.test.ts.net:3000": {Handlers: map[string]*ipn.HTTPHandler{ - "/": {Proxy: "http://localhost:3000"}, - }}, - }, - }, - }, - { - command: cmd("serve --http=3000 localhost:3000"), - wantErr: exactErrMsg(fmt.Errorf(backgroundExistsMsg, "serve", "http", 3000)), - }, - }, - }, { name: "advertise_service", initialState: fakeLocalServeClient{ @@ -1067,190 +1047,6 @@ func TestServeDevConfigMutations(t *testing.T) { } } -func TestValidateConfig(t *testing.T) { - tests := [...]struct { - name string - desc string - cfg *ipn.ServeConfig 
- svc tailcfg.ServiceName - servePort uint16 - serveType serveType - bg bgBoolFlag - wantErr bool - }{ - { - name: "nil_config", - desc: "when config is nil, all requests valid", - cfg: nil, - servePort: 3000, - serveType: serveTypeHTTPS, - }, - { - name: "new_bg_tcp", - desc: "no error when config exists but we're adding a new bg tcp port", - cfg: &ipn.ServeConfig{ - TCP: map[uint16]*ipn.TCPPortHandler{ - 443: {HTTPS: true}, - }, - }, - bg: bgBoolFlag{true, false}, - servePort: 10000, - serveType: serveTypeHTTPS, - }, - { - name: "override_bg_tcp", - desc: "no error when overwriting previous port under the same serve type", - cfg: &ipn.ServeConfig{ - TCP: map[uint16]*ipn.TCPPortHandler{ - 443: {TCPForward: "http://localhost:4545"}, - }, - }, - bg: bgBoolFlag{true, false}, - servePort: 443, - serveType: serveTypeTCP, - }, - { - name: "override_bg_tcp", - desc: "error when overwriting previous port under a different serve type", - cfg: &ipn.ServeConfig{ - TCP: map[uint16]*ipn.TCPPortHandler{ - 443: {HTTPS: true}, - }, - }, - bg: bgBoolFlag{true, false}, - servePort: 443, - serveType: serveTypeHTTP, - wantErr: true, - }, - { - name: "new_fg_port", - desc: "no error when serving a new foreground port", - cfg: &ipn.ServeConfig{ - TCP: map[uint16]*ipn.TCPPortHandler{ - 443: {HTTPS: true}, - }, - Foreground: map[string]*ipn.ServeConfig{ - "abc123": { - TCP: map[uint16]*ipn.TCPPortHandler{ - 3000: {HTTPS: true}, - }, - }, - }, - }, - servePort: 4040, - serveType: serveTypeTCP, - }, - { - name: "same_fg_port", - desc: "error when overwriting a previous fg port", - cfg: &ipn.ServeConfig{ - Foreground: map[string]*ipn.ServeConfig{ - "abc123": { - TCP: map[uint16]*ipn.TCPPortHandler{ - 3000: {HTTPS: true}, - }, - }, - }, - }, - servePort: 3000, - serveType: serveTypeTCP, - wantErr: true, - }, - { - name: "new_service_tcp", - desc: "no error when adding a new service port", - cfg: &ipn.ServeConfig{ - Services: map[tailcfg.ServiceName]*ipn.ServiceConfig{ - "svc:foo": { - TCP: map[uint16]*ipn.TCPPortHandler{80: {HTTP: true}}, - }, - }, - }, - svc: "svc:foo", - servePort: 8080, - serveType: serveTypeTCP, - }, - { - name: "override_service_tcp", - desc: "no error when overwriting a previous service port", - cfg: &ipn.ServeConfig{ - Services: map[tailcfg.ServiceName]*ipn.ServiceConfig{ - "svc:foo": { - TCP: map[uint16]*ipn.TCPPortHandler{ - 443: {TCPForward: "http://localhost:4545"}, - }, - }, - }, - }, - svc: "svc:foo", - servePort: 443, - serveType: serveTypeTCP, - }, - { - name: "override_service_tcp", - desc: "error when overwriting a previous service port with a different serve type", - cfg: &ipn.ServeConfig{ - Services: map[tailcfg.ServiceName]*ipn.ServiceConfig{ - "svc:foo": { - TCP: map[uint16]*ipn.TCPPortHandler{ - 443: {HTTPS: true}, - }, - }, - }, - }, - svc: "svc:foo", - servePort: 443, - serveType: serveTypeHTTP, - wantErr: true, - }, - { - name: "override_service_tcp", - desc: "error when setting previous tcp service to tun mode", - cfg: &ipn.ServeConfig{ - Services: map[tailcfg.ServiceName]*ipn.ServiceConfig{ - "svc:foo": { - TCP: map[uint16]*ipn.TCPPortHandler{ - 443: {TCPForward: "http://localhost:4545"}, - }, - }, - }, - }, - svc: "svc:foo", - serveType: serveTypeTUN, - wantErr: true, - }, - { - name: "override_service_tun", - desc: "error when setting previous tun service to tcp forwarder", - cfg: &ipn.ServeConfig{ - Services: map[tailcfg.ServiceName]*ipn.ServiceConfig{ - "svc:foo": { - Tun: true, - }, - }, - }, - svc: "svc:foo", - serveType: serveTypeTCP, - servePort: 443, - wantErr: 
true, - }, - } - - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - se := serveEnv{bg: tc.bg} - err := se.validateConfig(tc.cfg, tc.servePort, tc.serveType, tc.svc) - if err == nil && tc.wantErr { - t.Fatal("expected an error but got nil") - } - if err != nil && !tc.wantErr { - t.Fatalf("expected no error but got: %v", err) - } - }) - } - -} - func TestSrcTypeFromFlags(t *testing.T) { tests := []struct { name string diff --git a/ipn/ipnlocal/serve.go b/ipn/ipnlocal/serve.go index b5118873b..ef4e91545 100644 --- a/ipn/ipnlocal/serve.go +++ b/ipn/ipnlocal/serve.go @@ -292,6 +292,10 @@ func (b *LocalBackend) updateServeTCPPortNetMapAddrListenersLocked(ports []uint1 // SetServeConfig establishes or replaces the current serve config. // ETag is an optional parameter to enforce Optimistic Concurrency Control. // If it is an empty string, then the config will be overwritten. +// +// New foreground config cannot override existing listeners--neither existing +// foreground listeners nor existing background listeners. Background config can +// change as long as the serve type (e.g. HTTP, TCP, etc.) remains the same. func (b *LocalBackend) SetServeConfig(config *ipn.ServeConfig, etag string) error { b.mu.Lock() defer b.mu.Unlock() @@ -307,12 +311,6 @@ func (b *LocalBackend) setServeConfigLocked(config *ipn.ServeConfig, etag string return errors.New("can't reconfigure tailscaled when using a config file; config file is locked") } - if config != nil { - if err := config.CheckValidServicesConfig(); err != nil { - return err - } - } - nm := b.NetMap() if nm == nil { return errors.New("netMap is nil") @@ -340,6 +338,10 @@ func (b *LocalBackend) setServeConfigLocked(config *ipn.ServeConfig, etag string } } + if err := validateServeConfigUpdate(prevConfig, config.View()); err != nil { + return err + } + var bs []byte if config != nil { j, err := json.Marshal(config) @@ -1566,3 +1568,144 @@ func vipServiceHash(logf logger.Logf, services []*tailcfg.VIPService) string { h.Sum(buf[:0]) return hex.EncodeToString(buf[:]) } + +// validateServeConfigUpdate validates changes proposed by incoming serve +// configuration. +func validateServeConfigUpdate(existing, incoming ipn.ServeConfigView) error { + // Error messages returned by this function may be presented to end-users by + // frontends like the CLI. Thus these error messages should provide enough + // information for end-users to diagnose and resolve conflicts. + + if !incoming.Valid() { + return nil + } + + // For Services, TUN mode is mutually exclusive with L4 or L7 handlers. + for svcName, svcCfg := range incoming.Services().All() { + hasTCP := svcCfg.TCP().Len() > 0 + hasWeb := svcCfg.Web().Len() > 0 + if svcCfg.Tun() && (hasTCP || hasWeb) { + return fmt.Errorf("cannot configure TUN mode in combination with TCP or web handlers for %s", svcName) + } + } + + if !existing.Valid() { + return nil + } + + // New foreground listeners must be on open ports. + for sessionID, incomingFg := range incoming.Foreground().All() { + if !existing.Foreground().Has(sessionID) { + // This is a new session. + for port := range incomingFg.TCPs() { + if _, exists := existing.FindTCP(port); exists { + return fmt.Errorf("listener already exists for port %d", port) + } + } + } + } + + // New background listeners cannot overwrite existing foreground listeners. 
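+	// Illustrative example: if a foreground serve session already holds TCP
+	// port 443, an incoming background config for port 443 is rejected here
+	// rather than silently shadowing the foreground listener.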
+ for port := range incoming.TCP().All() { + if _, exists := existing.FindForegroundTCP(port); exists { + return fmt.Errorf("foreground listener already exists for port %d", port) + } + } + + // Incoming configuration cannot change the serve type in use by a port. + for port, incomingHandler := range incoming.TCP().All() { + existingHandler, exists := existing.FindTCP(port) + if !exists { + continue + } + + existingServeType := serveTypeFromPortHandler(existingHandler) + incomingServeType := serveTypeFromPortHandler(incomingHandler) + if incomingServeType != existingServeType { + return fmt.Errorf("want to serve %q, but port %d is already serving %q", incomingServeType, port, existingServeType) + } + } + + // Validations for Tailscale Services. + for svcName, incomingSvcCfg := range incoming.Services().All() { + existingSvcCfg, exists := existing.Services().GetOk(svcName) + if !exists { + continue + } + + // Incoming configuration cannot change the serve type in use by a port. + for port, incomingHandler := range incomingSvcCfg.TCP().All() { + existingHandler, exists := existingSvcCfg.TCP().GetOk(port) + if !exists { + continue + } + + existingServeType := serveTypeFromPortHandler(existingHandler) + incomingServeType := serveTypeFromPortHandler(incomingHandler) + if incomingServeType != existingServeType { + return fmt.Errorf("want to serve %q, but port %d is already serving %q for %s", incomingServeType, port, existingServeType, svcName) + } + } + + existingHasTCP := existingSvcCfg.TCP().Len() > 0 + existingHasWeb := existingSvcCfg.Web().Len() > 0 + + // A Service cannot turn on TUN mode if TCP or web handlers exist. + if incomingSvcCfg.Tun() && (existingHasTCP || existingHasWeb) { + return fmt.Errorf("cannot turn on TUN mode with existing TCP or web handlers for %s", svcName) + } + + incomingHasTCP := incomingSvcCfg.TCP().Len() > 0 + incomingHasWeb := incomingSvcCfg.Web().Len() > 0 + + // A Service cannot add TCP or web handlers if TUN mode is enabled. + if (incomingHasTCP || incomingHasWeb) && existingSvcCfg.Tun() { + return fmt.Errorf("cannot add TCP or web handlers as TUN mode is enabled for %s", svcName) + } + } + + return nil +} + +// serveType is a high-level descriptor of the kind of serve performed by a TCP +// port handler. +type serveType int + +const ( + serveTypeHTTPS serveType = iota + serveTypeHTTP + serveTypeTCP + serveTypeTLSTerminatedTCP +) + +func (s serveType) String() string { + switch s { + case serveTypeHTTP: + return "http" + case serveTypeHTTPS: + return "https" + case serveTypeTCP: + return "tcp" + case serveTypeTLSTerminatedTCP: + return "tls-terminated-tcp" + default: + return "unknownServeType" + } +} + +// serveTypeFromPortHandler is used to get a high-level descriptor of the kind +// of serve being performed by a port handler. 
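+// For example, a handler with HTTP set maps to serveTypeHTTP, and one with a
+// non-empty TCPForward maps to serveTypeTCP; a handler that matches none of
+// the known fields maps to -1.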
+func serveTypeFromPortHandler(ph ipn.TCPPortHandlerView) serveType { + switch { + case ph.HTTP(): + return serveTypeHTTP + case ph.HTTPS(): + return serveTypeHTTPS + case ph.TerminateTLS() != "": + return serveTypeTLSTerminatedTCP + case ph.TCPForward() != "": + return serveTypeTCP + default: + return -1 + } +} diff --git a/ipn/ipnlocal/serve_test.go b/ipn/ipnlocal/serve_test.go index c3e5b2ff9..6ee2181a0 100644 --- a/ipn/ipnlocal/serve_test.go +++ b/ipn/ipnlocal/serve_test.go @@ -388,7 +388,7 @@ func TestServeConfigServices(t *testing.T) { tests := []struct { name string conf *ipn.ServeConfig - expectedErr error + errExpected bool packetDstAddrPort []netip.AddrPort intercepted bool }{ @@ -412,7 +412,7 @@ func TestServeConfigServices(t *testing.T) { }, }, }, - expectedErr: ipn.ErrServiceConfigHasBothTCPAndTun, + errExpected: true, }, { // one correctly configured service with packet should be intercepted @@ -519,13 +519,13 @@ func TestServeConfigServices(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { err := b.SetServeConfig(tt.conf, "") - if err != nil && tt.expectedErr != nil { - if !errors.Is(err, tt.expectedErr) { - t.Fatalf("expected error %v,\n got %v", tt.expectedErr, err) - } - return + if err == nil && tt.errExpected { + t.Fatal("expected error") } if err != nil { + if tt.errExpected { + return + } t.Fatal(err) } for _, addrPort := range tt.packetDstAddrPort { @@ -1454,3 +1454,315 @@ func TestServeHTTPRedirect(t *testing.T) { }) } } + +func TestValidateServeConfigUpdate(t *testing.T) { + tests := []struct { + name, description string + existing, incoming *ipn.ServeConfig + wantError bool + }{ + { + name: "empty existing config", + description: "should be able to update with empty existing config", + existing: &ipn.ServeConfig{}, + incoming: &ipn.ServeConfig{ + TCP: map[uint16]*ipn.TCPPortHandler{ + 8080: {}, + }, + }, + wantError: false, + }, + { + name: "no existing config", + description: "should be able to update with no existing config", + existing: nil, + incoming: &ipn.ServeConfig{ + TCP: map[uint16]*ipn.TCPPortHandler{ + 8080: {}, + }, + }, + wantError: false, + }, + { + name: "empty incoming config", + description: "wiping config should work", + existing: &ipn.ServeConfig{ + TCP: map[uint16]*ipn.TCPPortHandler{ + 80: {}, + }, + }, + incoming: &ipn.ServeConfig{}, + wantError: false, + }, + { + name: "no incoming config", + description: "missing incoming config should not result in an error", + existing: &ipn.ServeConfig{ + TCP: map[uint16]*ipn.TCPPortHandler{ + 80: {}, + }, + }, + incoming: nil, + wantError: false, + }, + { + name: "non-overlapping update", + description: "non-overlapping update should work", + existing: &ipn.ServeConfig{ + TCP: map[uint16]*ipn.TCPPortHandler{ + 80: {}, + }, + }, + incoming: &ipn.ServeConfig{ + TCP: map[uint16]*ipn.TCPPortHandler{ + 8080: {}, + }, + }, + wantError: false, + }, + { + name: "overwriting background port", + description: "should be able to overwrite a background port", + existing: &ipn.ServeConfig{ + TCP: map[uint16]*ipn.TCPPortHandler{ + 80: { + TCPForward: "localhost:8080", + }, + }, + }, + incoming: &ipn.ServeConfig{ + TCP: map[uint16]*ipn.TCPPortHandler{ + 80: { + TCPForward: "localhost:9999", + }, + }, + }, + wantError: false, + }, + { + name: "broken existing config", + description: "broken existing config should not prevent new config updates", + existing: &ipn.ServeConfig{ + TCP: map[uint16]*ipn.TCPPortHandler{ + // Broken because HTTPS and TCPForward are mutually exclusive. 
+					9000: {
+						HTTPS:      true,
+						TCPForward: "127.0.0.1:9000",
+					},
+					// Broken because foreground and background handlers cannot coexist.
+					443: {},
+				},
+				Foreground: map[string]*ipn.ServeConfig{
+					"12345": {
+						TCP: map[uint16]*ipn.TCPPortHandler{
+							// Broken because foreground and background handlers cannot coexist.
+							443: {},
+						},
+					},
+				},
+				// Broken because Services cannot specify TUN mode and a TCP handler.
+				Services: map[tailcfg.ServiceName]*ipn.ServiceConfig{
+					"svc:foo": {
+						TCP: map[uint16]*ipn.TCPPortHandler{
+							6060: {},
+						},
+						Tun: true,
+					},
+				},
+			},
+			incoming: &ipn.ServeConfig{
+				TCP: map[uint16]*ipn.TCPPortHandler{
+					80: {},
+				},
+			},
+			wantError: false,
+		},
+		{
+			name:        "services same port as background",
+			description: "services should be able to use the same port as background listeners",
+			existing: &ipn.ServeConfig{
+				TCP: map[uint16]*ipn.TCPPortHandler{
+					80: {},
+				},
+			},
+			incoming: &ipn.ServeConfig{
+				Services: map[tailcfg.ServiceName]*ipn.ServiceConfig{
+					"svc:foo": {
+						TCP: map[uint16]*ipn.TCPPortHandler{
+							80: {},
+						},
+					},
+				},
+			},
+			wantError: false,
+		},
+		{
+			name:        "services tun mode",
+			description: "TUN mode should be mutually exclusive with TCP or web handlers for new Services",
+			existing:    &ipn.ServeConfig{},
+			incoming: &ipn.ServeConfig{
+				Services: map[tailcfg.ServiceName]*ipn.ServiceConfig{
+					"svc:foo": {
+						TCP: map[uint16]*ipn.TCPPortHandler{
+							6060: {},
+						},
+						Tun: true,
+					},
+				},
+			},
+			wantError: true,
+		},
+		{
+			name:        "new foreground listener",
+			description: "new foreground listeners must be on open ports",
+			existing: &ipn.ServeConfig{
+				TCP: map[uint16]*ipn.TCPPortHandler{
+					80: {},
+				},
+			},
+			incoming: &ipn.ServeConfig{
+				Foreground: map[string]*ipn.ServeConfig{
+					"12345": {
+						TCP: map[uint16]*ipn.TCPPortHandler{
+							80: {},
+						},
+					},
+				},
+			},
+			wantError: true,
+		},
+		{
+			name:        "new background listener",
+			description: "new background listeners cannot overwrite foreground listeners",
+			existing: &ipn.ServeConfig{
+				Foreground: map[string]*ipn.ServeConfig{
+					"12345": {
+						TCP: map[uint16]*ipn.TCPPortHandler{
+							80: {},
+						},
+					},
+				},
+			},
+			incoming: &ipn.ServeConfig{
+				TCP: map[uint16]*ipn.TCPPortHandler{
+					80: {},
+				},
+			},
+			wantError: true,
+		},
+		{
+			name:        "serve type overwrite",
+			description: "incoming configuration cannot change the serve type in use by a port",
+			existing: &ipn.ServeConfig{
+				TCP: map[uint16]*ipn.TCPPortHandler{
+					80: {
+						HTTP: true,
+					},
+				},
+			},
+			incoming: &ipn.ServeConfig{
+				TCP: map[uint16]*ipn.TCPPortHandler{
+					80: {
+						TCPForward: "localhost:8080",
+					},
+				},
+			},
+			wantError: true,
+		},
+		{
+			name:        "serve type overwrite services",
+			description: "incoming Services configuration cannot change the serve type in use by a port",
+			existing: &ipn.ServeConfig{
+				Services: map[tailcfg.ServiceName]*ipn.ServiceConfig{
+					"svc:foo": {
+						TCP: map[uint16]*ipn.TCPPortHandler{
+							80: {
+								HTTP: true,
+							},
+						},
+					},
+				},
+			},
+			incoming: &ipn.ServeConfig{
+				Services: map[tailcfg.ServiceName]*ipn.ServiceConfig{
+					"svc:foo": {
+						TCP: map[uint16]*ipn.TCPPortHandler{
+							80: {
+								TCPForward: "localhost:8080",
+							},
+						},
+					},
+				},
+			},
+			wantError: true,
+		},
+		{
+			name:        "tun mode with handlers",
+			description: "Services cannot enable TUN mode if L4 or L7 handlers already exist",
+			existing: &ipn.ServeConfig{
+				Services: map[tailcfg.ServiceName]*ipn.ServiceConfig{
+					"svc:foo": {
+						TCP: map[uint16]*ipn.TCPPortHandler{
+							443: {
+								HTTPS: true,
+							},
+						},
+						Web: map[ipn.HostPort]*ipn.WebServerConfig{
+							"127.0.0.1:443": {
+								Handlers:
map[string]*ipn.HTTPHandler{}, + }, + }, + }, + }, + }, + incoming: &ipn.ServeConfig{ + Services: map[tailcfg.ServiceName]*ipn.ServiceConfig{ + "svc:foo": { + Tun: true, + }, + }, + }, + wantError: true, + }, + { + name: "handlers with tun mode", + description: "Services cannot add L4 or L7 handlers if TUN mode is already enabled", + existing: &ipn.ServeConfig{ + Services: map[tailcfg.ServiceName]*ipn.ServiceConfig{ + "svc:foo": { + Tun: true, + }, + }, + }, + incoming: &ipn.ServeConfig{ + Services: map[tailcfg.ServiceName]*ipn.ServiceConfig{ + "svc:foo": { + TCP: map[uint16]*ipn.TCPPortHandler{ + 443: { + HTTPS: true, + }, + }, + Web: map[ipn.HostPort]*ipn.WebServerConfig{ + "127.0.0.1:443": { + Handlers: map[string]*ipn.HTTPHandler{}, + }, + }, + }, + }, + }, + wantError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := validateServeConfigUpdate(tt.existing.View(), tt.incoming.View()) + if err != nil && !tt.wantError { + t.Error("unexpected error:", err) + } + if err == nil && tt.wantError { + t.Error("expected error, got nil;", tt.description) + } + }) + } +} diff --git a/ipn/serve.go b/ipn/serve.go index 74195191c..7ee78ef0d 100644 --- a/ipn/serve.go +++ b/ipn/serve.go @@ -802,6 +802,7 @@ func (v ServeConfigView) FindServiceTCP(svcName tailcfg.ServiceName, port uint16 return svcCfg.TCP().GetOk(port) } +// FindServiceWeb returns the web handler for the service's host-port. func (v ServeConfigView) FindServiceWeb(svcName tailcfg.ServiceName, hp HostPort) (res WebServerConfigView, ok bool) { if svcCfg, ok := v.Services().GetOk(svcName); ok { if res, ok := svcCfg.Web().GetOk(hp); ok { @@ -815,10 +816,9 @@ func (v ServeConfigView) FindServiceWeb(svcName tailcfg.ServiceName, hp HostPort // prefers a foreground match first followed by a background search if none // existed. func (v ServeConfigView) FindTCP(port uint16) (res TCPPortHandlerView, ok bool) { - for _, conf := range v.Foreground().All() { - if res, ok := conf.TCP().GetOk(port); ok { - return res, ok - } + res, ok = v.FindForegroundTCP(port) + if ok { + return res, ok } return v.TCP().GetOk(port) } @@ -835,6 +835,17 @@ func (v ServeConfigView) FindWeb(hp HostPort) (res WebServerConfigView, ok bool) return v.Web().GetOk(hp) } +// FindForegroundTCP returns the first foreground TCP handler matching the input +// port. +func (v ServeConfigView) FindForegroundTCP(port uint16) (res TCPPortHandlerView, ok bool) { + for _, conf := range v.Foreground().All() { + if res, ok := conf.TCP().GetOk(port); ok { + return res, ok + } + } + return res, false +} + // HasAllowFunnel returns whether this config has at least one AllowFunnel // set in the background or foreground configs. func (v ServeConfigView) HasAllowFunnel() bool { @@ -863,17 +874,6 @@ func (v ServeConfigView) HasFunnelForTarget(target HostPort) bool { return false } -// CheckValidServicesConfig reports whether the ServeConfig has -// invalid service configurations. -func (sc *ServeConfig) CheckValidServicesConfig() error { - for svcName, service := range sc.Services { - if err := service.checkValidConfig(); err != nil { - return fmt.Errorf("invalid service configuration for %q: %w", svcName, err) - } - } - return nil -} - // ServicePortRange returns the list of tailcfg.ProtoPortRange that represents // the proto/ports pairs that are being served by the service. 
// @@ -911,17 +911,3 @@ func (v ServiceConfigView) ServicePortRange() []tailcfg.ProtoPortRange { } return ranges } - -// ErrServiceConfigHasBothTCPAndTun signals that a service -// in Tun mode cannot also has TCP or Web handlers set. -var ErrServiceConfigHasBothTCPAndTun = errors.New("the VIP Service configuration can not set TUN at the same time as TCP or Web") - -// checkValidConfig checks if the service configuration is valid. -// Currently, the only invalid configuration is when the service is in Tun mode -// and has TCP or Web handlers. -func (v *ServiceConfig) checkValidConfig() error { - if v.Tun && (len(v.TCP) > 0 || len(v.Web) > 0) { - return ErrServiceConfigHasBothTCPAndTun - } - return nil -} From de8ed203e08b9e32e40648331c47980faab92c46 Mon Sep 17 00:00:00 2001 From: Andrew Lytvynov Date: Thu, 20 Nov 2025 14:10:38 -0600 Subject: [PATCH 03/33] go.mod: bump golang.org/x/crypto (#18011) Pick up fixes for https://pkg.go.dev/vuln/GO-2025-4134 Updates #cleanup Signed-off-by: Andrew Lytvynov --- flake.nix | 2 +- go.mod | 2 +- go.mod.sri | 2 +- go.sum | 4 ++-- shell.nix | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/flake.nix b/flake.nix index fc3a466fc..c075bce0e 100644 --- a/flake.nix +++ b/flake.nix @@ -151,5 +151,5 @@ }); }; } -# nix-direnv cache busting line: sha256-sGPgML2YM/XNWfsAdDZvzWHagcydwCmR6nKOHJj5COs= +# nix-direnv cache busting line: sha256-3jAfCtp714acePnwgdNto8Sj3vFwtpO9os6IwXQ07A4= diff --git a/go.mod b/go.mod index 3b4f34b2d..e6baad0dc 100644 --- a/go.mod +++ b/go.mod @@ -102,7 +102,7 @@ require ( go.uber.org/zap v1.27.0 go4.org/mem v0.0.0-20240501181205-ae6ca9944745 go4.org/netipx v0.0.0-20231129151722-fdeea329fbba - golang.org/x/crypto v0.44.0 + golang.org/x/crypto v0.45.0 golang.org/x/exp v0.0.0-20250210185358-939b2ce775ac golang.org/x/mod v0.30.0 golang.org/x/net v0.47.0 diff --git a/go.mod.sri b/go.mod.sri index 76c72f0c9..737ea7d2b 100644 --- a/go.mod.sri +++ b/go.mod.sri @@ -1 +1 @@ -sha256-sGPgML2YM/XNWfsAdDZvzWHagcydwCmR6nKOHJj5COs= +sha256-3jAfCtp714acePnwgdNto8Sj3vFwtpO9os6IwXQ07A4= diff --git a/go.sum b/go.sum index f0758f2d4..1106932f2 100644 --- a/go.sum +++ b/go.sum @@ -1128,8 +1128,8 @@ golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5y golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.1.0/go.mod h1:RecgLatLF4+eUMCP1PoPZQb+cVrJcOPbHkTkbkB9sbw= golang.org/x/crypto v0.3.0/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4= -golang.org/x/crypto v0.44.0 h1:A97SsFvM3AIwEEmTBiaxPPTYpDC47w720rdiiUvgoAU= -golang.org/x/crypto v0.44.0/go.mod h1:013i+Nw79BMiQiMsOPcVCB5ZIJbYkerPrGnOa00tvmc= +golang.org/x/crypto v0.45.0 h1:jMBrvKuj23MTlT0bQEOBcAE0mjg8mK9RXFhRH6nyF3Q= +golang.org/x/crypto v0.45.0/go.mod h1:XTGrrkGJve7CYK7J8PEww4aY7gM3qMCElcJQ8n8JdX4= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= diff --git a/shell.nix b/shell.nix index ffb28a183..8554b9258 100644 --- a/shell.nix +++ b/shell.nix @@ -16,4 +16,4 @@ ) { src = ./.; }).shellNix -# nix-direnv cache busting line: sha256-sGPgML2YM/XNWfsAdDZvzWHagcydwCmR6nKOHJj5COs= +# nix-direnv cache busting line: sha256-3jAfCtp714acePnwgdNto8Sj3vFwtpO9os6IwXQ07A4= From c679aaba32c27681845466df9e6df69fe0704b95 Mon Sep 
17 00:00:00 2001 From: Andrew Lytvynov Date: Thu, 20 Nov 2025 15:52:58 -0600 Subject: [PATCH 04/33] cmd/tailscaled,ipn: show a health warning when state store fails to open (#17883) With the introduction of node sealing, store.New fails in some cases due to the TPM device being reset or unavailable. Currently it results in tailscaled crashing at startup, which is not obvious to the user until they check the logs. Instead of crashing tailscaled at startup, start with an in-memory store with a health warning about state initialization and a link to (future) docs on what to do. When this health message is set, also block any login attempts to avoid masking the problem with an ephemeral node registration. Updates #15830 Updates #17654 Signed-off-by: Andrew Lytvynov --- cmd/tailscaled/depaware-min.txt | 2 +- cmd/tailscaled/depaware-minbox.txt | 2 +- cmd/tailscaled/tailscaled.go | 13 +++- cmd/tailscaled/tailscaled_test.go | 50 +++++++++++++ ipn/ipnlocal/local.go | 9 +++ ipn/localapi/localapi.go | 10 ++- ipn/localapi/localapi_test.go | 72 +++++++++++++++++++ ipn/store.go | 15 ++++ tstest/integration/integration_test.go | 37 ++++++++++ .../tailscaled_deps_test_darwin.go | 1 + .../tailscaled_deps_test_freebsd.go | 1 + .../integration/tailscaled_deps_test_linux.go | 1 + .../tailscaled_deps_test_openbsd.go | 1 + .../tailscaled_deps_test_windows.go | 1 + 14 files changed, 211 insertions(+), 4 deletions(-) diff --git a/cmd/tailscaled/depaware-min.txt b/cmd/tailscaled/depaware-min.txt index e750f86e6..3c111470f 100644 --- a/cmd/tailscaled/depaware-min.txt +++ b/cmd/tailscaled/depaware-min.txt @@ -69,7 +69,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de tailscale.com/ipn/ipnstate from tailscale.com/control/controlclient+ tailscale.com/ipn/localapi from tailscale.com/ipn/ipnserver tailscale.com/ipn/store from tailscale.com/cmd/tailscaled - tailscale.com/ipn/store/mem from tailscale.com/ipn/store + tailscale.com/ipn/store/mem from tailscale.com/ipn/store+ tailscale.com/kube/kubetypes from tailscale.com/envknob tailscale.com/log/filelogger from tailscale.com/logpolicy tailscale.com/log/sockstatlog from tailscale.com/ipn/ipnlocal diff --git a/cmd/tailscaled/depaware-minbox.txt b/cmd/tailscaled/depaware-minbox.txt index 17f1a22b2..40a1fb2a4 100644 --- a/cmd/tailscaled/depaware-minbox.txt +++ b/cmd/tailscaled/depaware-minbox.txt @@ -92,7 +92,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de tailscale.com/ipn/ipnstate from tailscale.com/control/controlclient+ tailscale.com/ipn/localapi from tailscale.com/ipn/ipnserver tailscale.com/ipn/store from tailscale.com/cmd/tailscaled - tailscale.com/ipn/store/mem from tailscale.com/ipn/store + tailscale.com/ipn/store/mem from tailscale.com/ipn/store+ tailscale.com/kube/kubetypes from tailscale.com/envknob tailscale.com/licenses from tailscale.com/cmd/tailscale/cli tailscale.com/log/filelogger from tailscale.com/logpolicy diff --git a/cmd/tailscaled/tailscaled.go b/cmd/tailscaled/tailscaled.go index f14cdcff0..d923ca1ed 100644 --- a/cmd/tailscaled/tailscaled.go +++ b/cmd/tailscaled/tailscaled.go @@ -33,12 +33,14 @@ import ( "tailscale.com/feature" "tailscale.com/feature/buildfeatures" _ "tailscale.com/feature/condregister" + "tailscale.com/health" "tailscale.com/hostinfo" "tailscale.com/ipn" "tailscale.com/ipn/conffile" "tailscale.com/ipn/ipnlocal" "tailscale.com/ipn/ipnserver" "tailscale.com/ipn/store" + "tailscale.com/ipn/store/mem" "tailscale.com/logpolicy" "tailscale.com/logtail" 
"tailscale.com/net/dns" @@ -644,7 +646,16 @@ func getLocalBackend(ctx context.Context, logf logger.Logf, logID logid.PublicID store, err := store.New(logf, statePathOrDefault()) if err != nil { - return nil, fmt.Errorf("store.New: %w", err) + // If we can't create the store (for example if it's TPM-sealed and the + // TPM is reset), create a dummy in-memory store to propagate the error + // to the user. + ht, ok := sys.HealthTracker.GetOK() + if !ok { + return nil, fmt.Errorf("store.New: %w", err) + } + logf("store.New failed: %v; starting with in-memory store with a health warning", err) + store = new(mem.Store) + ht.SetUnhealthy(ipn.StateStoreHealth, health.Args{health.ArgError: err.Error()}) } sys.Set(store) diff --git a/cmd/tailscaled/tailscaled_test.go b/cmd/tailscaled/tailscaled_test.go index c50c23759..1188ad35f 100644 --- a/cmd/tailscaled/tailscaled_test.go +++ b/cmd/tailscaled/tailscaled_test.go @@ -4,9 +4,17 @@ package main // import "tailscale.com/cmd/tailscaled" import ( + "os" + "strings" "testing" + "tailscale.com/envknob" + "tailscale.com/ipn" + "tailscale.com/net/netmon" + "tailscale.com/tsd" "tailscale.com/tstest/deptest" + "tailscale.com/types/logid" + "tailscale.com/util/must" ) func TestNothing(t *testing.T) { @@ -38,3 +46,45 @@ func TestDeps(t *testing.T) { }, }.Check(t) } + +func TestStateStoreError(t *testing.T) { + logID, err := logid.NewPrivateID() + if err != nil { + t.Fatal(err) + } + // Don't upload any logs from tests. + envknob.SetNoLogsNoSupport() + + args.statedir = t.TempDir() + args.tunname = "userspace-networking" + + t.Run("new state", func(t *testing.T) { + sys := tsd.NewSystem() + sys.NetMon.Set(must.Get(netmon.New(sys.Bus.Get(), t.Logf))) + lb, err := getLocalBackend(t.Context(), t.Logf, logID.Public(), sys) + if err != nil { + t.Fatal(err) + } + defer lb.Shutdown() + if lb.HealthTracker().IsUnhealthy(ipn.StateStoreHealth) { + t.Errorf("StateStoreHealth is unhealthy on fresh LocalBackend:\n%s", strings.Join(lb.HealthTracker().Strings(), "\n")) + } + }) + t.Run("corrupt state", func(t *testing.T) { + sys := tsd.NewSystem() + sys.NetMon.Set(must.Get(netmon.New(sys.Bus.Get(), t.Logf))) + // Populate the state file with something that will fail to parse to + // trigger an error from store.New. + if err := os.WriteFile(statePathOrDefault(), []byte("bad json"), 0644); err != nil { + t.Fatal(err) + } + lb, err := getLocalBackend(t.Context(), t.Logf, logID.Public(), sys) + if err != nil { + t.Fatal(err) + } + defer lb.Shutdown() + if !lb.HealthTracker().IsUnhealthy(ipn.StateStoreHealth) { + t.Errorf("StateStoreHealth is healthy when state file is corrupt") + } + }) +} diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index 0ff299399..72b230327 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -3747,6 +3747,9 @@ func (b *LocalBackend) StartLoginInteractive(ctx context.Context) error { // the control plane sends us one. Otherwise, the notification will be delivered to all // active [watchSession]s. 
func (b *LocalBackend) StartLoginInteractiveAs(ctx context.Context, user ipnauth.Actor) error { + if b.health.IsUnhealthy(ipn.StateStoreHealth) { + return errors.New("cannot log in when state store is unhealthy") + } b.mu.Lock() defer b.mu.Unlock() if b.cc == nil { @@ -5677,6 +5680,9 @@ func (b *LocalBackend) NodeKey() key.NodePublic { // // b.mu must be held func (b *LocalBackend) nextStateLocked() ipn.State { + if b.health.IsUnhealthy(ipn.StateStoreHealth) { + return ipn.NoState + } var ( cc = b.cc cn = b.currentNode() @@ -6936,6 +6942,9 @@ func (b *LocalBackend) CurrentProfile() ipn.LoginProfileView { // NewProfile creates and switches to the new profile. func (b *LocalBackend) NewProfile() error { + if b.health.IsUnhealthy(ipn.StateStoreHealth) { + return errors.New("cannot log in when state store is unhealthy") + } b.mu.Lock() defer b.mu.Unlock() diff --git a/ipn/localapi/localapi.go b/ipn/localapi/localapi.go index d3503d302..7f249fe53 100644 --- a/ipn/localapi/localapi.go +++ b/ipn/localapi/localapi.go @@ -930,7 +930,10 @@ func (h *Handler) serveLoginInteractive(w http.ResponseWriter, r *http.Request) http.Error(w, "want POST", http.StatusBadRequest) return } - h.b.StartLoginInteractiveAs(r.Context(), h.Actor) + if err := h.b.StartLoginInteractiveAs(r.Context(), h.Actor); err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } w.WriteHeader(http.StatusNoContent) return } @@ -949,6 +952,11 @@ func (h *Handler) serveStart(w http.ResponseWriter, r *http.Request) { http.Error(w, err.Error(), http.StatusBadRequest) return } + + if h.b.HealthTracker().IsUnhealthy(ipn.StateStoreHealth) { + http.Error(w, "cannot start backend when state store is unhealthy", http.StatusInternalServerError) + return + } err := h.b.Start(o) if err != nil { // TODO(bradfitz): map error to a good HTTP error diff --git a/ipn/localapi/localapi_test.go b/ipn/localapi/localapi_test.go index 6bb9b5182..5d228ffd6 100644 --- a/ipn/localapi/localapi_test.go +++ b/ipn/localapi/localapi_test.go @@ -25,9 +25,11 @@ import ( "testing" "tailscale.com/client/tailscale/apitype" + "tailscale.com/health" "tailscale.com/ipn" "tailscale.com/ipn/ipnauth" "tailscale.com/ipn/ipnlocal" + "tailscale.com/ipn/ipnstate" "tailscale.com/ipn/store/mem" "tailscale.com/tailcfg" "tailscale.com/tsd" @@ -428,3 +430,73 @@ func TestKeepItSorted(t *testing.T) { } } } + +func TestServeWithUnhealthyState(t *testing.T) { + tstest.Replace(t, &validLocalHostForTesting, true) + h := &Handler{ + PermitRead: true, + PermitWrite: true, + b: newTestLocalBackend(t), + logf: t.Logf, + } + h.b.HealthTracker().SetUnhealthy(ipn.StateStoreHealth, health.Args{health.ArgError: "testing"}) + if err := h.b.Start(ipn.Options{}); err != nil { + t.Fatal(err) + } + + check500Body := func(wantResp string) func(t *testing.T, code int, resp []byte) { + return func(t *testing.T, code int, resp []byte) { + if code != http.StatusInternalServerError { + t.Errorf("got code: %v, want %v\nresponse: %q", code, http.StatusInternalServerError, resp) + } + if got := strings.TrimSpace(string(resp)); got != wantResp { + t.Errorf("got response: %q, want %q", got, wantResp) + } + } + } + tests := []struct { + desc string + req *http.Request + check func(t *testing.T, code int, resp []byte) + }{ + { + desc: "status", + req: httptest.NewRequest("GET", "http://localhost:1234/localapi/v0/status", nil), + check: func(t *testing.T, code int, resp []byte) { + if code != http.StatusOK { + t.Errorf("got code: %v, want %v\nresponse: %q", code, http.StatusOK, resp) + } + 
var status ipnstate.Status + if err := json.Unmarshal(resp, &status); err != nil { + t.Fatal(err) + } + if status.BackendState != "NoState" { + t.Errorf("got backend state: %q, want %q", status.BackendState, "NoState") + } + }, + }, + { + desc: "login-interactive", + req: httptest.NewRequest("POST", "http://localhost:1234/localapi/v0/login-interactive", nil), + check: check500Body("cannot log in when state store is unhealthy"), + }, + { + desc: "start", + req: httptest.NewRequest("POST", "http://localhost:1234/localapi/v0/start", strings.NewReader("{}")), + check: check500Body("cannot start backend when state store is unhealthy"), + }, + { + desc: "new-profile", + req: httptest.NewRequest("PUT", "http://localhost:1234/localapi/v0/profiles/", nil), + check: check500Body("cannot log in when state store is unhealthy"), + }, + } + + for _, tt := range tests { + t.Run(tt.desc, func(t *testing.T) { + resp := httptest.NewRecorder() + h.ServeHTTP(resp, tt.req) + tt.check(t, resp.Code, resp.Body.Bytes()) + }) + } +} diff --git a/ipn/store.go b/ipn/store.go index 9da5288c0..2034ae09a 100644 --- a/ipn/store.go +++ b/ipn/store.go @@ -10,6 +10,8 @@ import ( "fmt" "net" "strconv" + + "tailscale.com/health" ) // ErrStateNotExist is returned by StateStore.ReadState when the @@ -60,6 +62,19 @@ const ( TaildropReceivedKey = StateKey("_taildrop-received") ) +// StateStoreHealth is a Warnable set when store.New fails at startup. If +// unhealthy, we block all login attempts and return a health message in status +// responses. +var StateStoreHealth = health.Register(&health.Warnable{ + Code: "state-store-health", + Severity: health.SeverityHigh, + Title: "Tailscale state store failed to initialize", + Text: func(args health.Args) string { + return fmt.Sprintf("State store failed to initialize, Tailscale will not work until this is resolved. See https://tailscale.com/s/state-store-init-error. Error: %s", args[health.ArgError]) + }, + ImpactsConnectivity: true, +}) + // CurrentProfileID returns the StateKey that stores the // current profile ID. The value is a JSON-encoded LoginProfile. // If the userID is empty, the key returned is CurrentProfileStateKey, diff --git a/tstest/integration/integration_test.go b/tstest/integration/integration_test.go index 9d75cfc29..543dc125c 100644 --- a/tstest/integration/integration_test.go +++ b/tstest/integration/integration_test.go @@ -22,6 +22,7 @@ import ( "path/filepath" "regexp" "runtime" + "slices" "strconv" "strings" "sync/atomic" @@ -36,6 +37,7 @@ import ( "tailscale.com/cmd/testwrapper/flakytest" "tailscale.com/feature" _ "tailscale.com/feature/clientupdate" + "tailscale.com/health" "tailscale.com/hostinfo" "tailscale.com/ipn" "tailscale.com/net/tsaddr" @@ -2246,3 +2248,38 @@ func TestNetworkLock(t *testing.T) { } }) } + +func TestNodeWithBadStateFile(t *testing.T) { + tstest.Shard(t) + tstest.Parallel(t) + env := NewTestEnv(t) + n1 := NewTestNode(t, env) + if err := os.WriteFile(n1.stateFile, []byte("bad json"), 0644); err != nil { + t.Fatal(err) + } + + d1 := n1.StartDaemon() + n1.AwaitResponding() + + // Make sure the health message shows up in status output. + n1.AwaitBackendState("NoState") + st := n1.MustStatus() + wantHealth := ipn.StateStoreHealth.Text(health.Args{health.ArgError: ""}) + if !slices.ContainsFunc(st.Health, func(m string) bool { return strings.HasPrefix(m, wantHealth) }) { + t.Errorf("Status does not contain expected health message %q\ngot health messages: %q", wantHealth, st.Health) + } + + // Make sure login attempts are rejected. 
+ cmd := n1.Tailscale("up", "--login-server="+n1.env.ControlURL()) + t.Logf("Running %v ...", cmd) + out, err := cmd.CombinedOutput() + if err == nil { + t.Fatalf("up succeeded with output %q", out) + } + wantOut := "cannot start backend when state store is unhealthy" + if !strings.Contains(string(out), wantOut) { + t.Fatalf("got up output:\n%s\nwant:\n%s", string(out), wantOut) + } + + d1.MustCleanShutdown(t) +} diff --git a/tstest/integration/tailscaled_deps_test_darwin.go b/tstest/integration/tailscaled_deps_test_darwin.go index 217188f75..9f92839d8 100644 --- a/tstest/integration/tailscaled_deps_test_darwin.go +++ b/tstest/integration/tailscaled_deps_test_darwin.go @@ -27,6 +27,7 @@ import ( _ "tailscale.com/ipn/ipnlocal" _ "tailscale.com/ipn/ipnserver" _ "tailscale.com/ipn/store" + _ "tailscale.com/ipn/store/mem" _ "tailscale.com/logpolicy" _ "tailscale.com/logtail" _ "tailscale.com/net/dns" diff --git a/tstest/integration/tailscaled_deps_test_freebsd.go b/tstest/integration/tailscaled_deps_test_freebsd.go index 217188f75..9f92839d8 100644 --- a/tstest/integration/tailscaled_deps_test_freebsd.go +++ b/tstest/integration/tailscaled_deps_test_freebsd.go @@ -27,6 +27,7 @@ import ( _ "tailscale.com/ipn/ipnlocal" _ "tailscale.com/ipn/ipnserver" _ "tailscale.com/ipn/store" + _ "tailscale.com/ipn/store/mem" _ "tailscale.com/logpolicy" _ "tailscale.com/logtail" _ "tailscale.com/net/dns" diff --git a/tstest/integration/tailscaled_deps_test_linux.go b/tstest/integration/tailscaled_deps_test_linux.go index 217188f75..9f92839d8 100644 --- a/tstest/integration/tailscaled_deps_test_linux.go +++ b/tstest/integration/tailscaled_deps_test_linux.go @@ -27,6 +27,7 @@ import ( _ "tailscale.com/ipn/ipnlocal" _ "tailscale.com/ipn/ipnserver" _ "tailscale.com/ipn/store" + _ "tailscale.com/ipn/store/mem" _ "tailscale.com/logpolicy" _ "tailscale.com/logtail" _ "tailscale.com/net/dns" diff --git a/tstest/integration/tailscaled_deps_test_openbsd.go b/tstest/integration/tailscaled_deps_test_openbsd.go index 217188f75..9f92839d8 100644 --- a/tstest/integration/tailscaled_deps_test_openbsd.go +++ b/tstest/integration/tailscaled_deps_test_openbsd.go @@ -27,6 +27,7 @@ import ( _ "tailscale.com/ipn/ipnlocal" _ "tailscale.com/ipn/ipnserver" _ "tailscale.com/ipn/store" + _ "tailscale.com/ipn/store/mem" _ "tailscale.com/logpolicy" _ "tailscale.com/logtail" _ "tailscale.com/net/dns" diff --git a/tstest/integration/tailscaled_deps_test_windows.go b/tstest/integration/tailscaled_deps_test_windows.go index f3cd5e75b..82f8097c8 100644 --- a/tstest/integration/tailscaled_deps_test_windows.go +++ b/tstest/integration/tailscaled_deps_test_windows.go @@ -37,6 +37,7 @@ import ( _ "tailscale.com/ipn/ipnlocal" _ "tailscale.com/ipn/ipnserver" _ "tailscale.com/ipn/store" + _ "tailscale.com/ipn/store/mem" _ "tailscale.com/logpolicy" _ "tailscale.com/logtail" _ "tailscale.com/net/dns" From ce95bc77fb0c323e2e4335665bc75d93bf1e7cfc Mon Sep 17 00:00:00 2001 From: Alex Chan Date: Fri, 21 Nov 2025 16:40:37 +0000 Subject: [PATCH 05/33] tka: don't panic if no clock set in tka.Mem This is causing confusing panics in tailscale/corp#34485. We'll keep using the tka.ChonkMem constructor as much as we can, but don't panic if you create a tka.Mem directly -- we know what the sensible thing is. 
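To make the shape of the fix concrete, a minimal standalone sketch of the nil-check fallback pattern (hypothetical Clock interface and Mem struct for illustration; the real tka code wires a test clock through its constructor):

    package main

    import (
        "fmt"
        "time"
    )

    // Clock stands in for the injectable test clock.
    type Clock interface {
        Now() time.Time
    }

    // Mem mirrors a store whose clock is normally set by its constructor.
    type Mem struct {
        clock Clock // nil when Mem is constructed directly
    }

    // now falls back to the real time instead of panicking on a nil clock.
    func (c *Mem) now() time.Time {
        if c.clock == nil {
            return time.Now()
        }
        return c.clock.Now()
    }

    func main() {
        var m Mem            // zero value, no clock injected
        fmt.Println(m.now()) // safe: falls back to time.Now
    }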
Updates #cleanup Signed-off-by: Alex Chan Change-Id: I49309f5f403fc26ce4f9a6cf0edc8eddf6a6f3a4 --- tka/tailchonk.go | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tka/tailchonk.go b/tka/tailchonk.go index a55033bcd..13bdf6aac 100644 --- a/tka/tailchonk.go +++ b/tka/tailchonk.go @@ -193,7 +193,7 @@ updateLoop: for _, aum := range updates { aumHash := aum.Hash() c.aums[aumHash] = aum - c.commitTimes[aumHash] = c.clock.Now() + c.commitTimes[aumHash] = c.now() parent, ok := aum.Parent() if ok { @@ -209,6 +209,16 @@ updateLoop: return nil } +// now returns the current time, optionally using the overridden +// clock if set. +func (c *Mem) now() time.Time { + if c.clock == nil { + return time.Now() + } else { + return c.clock.Now() + } +} + // RemoveAll permanently and completely clears the TKA state. func (c *Mem) RemoveAll() error { c.mu.Lock() From 016ccae2da9fae1f6d8ffb29c694f86cb78cca4a Mon Sep 17 00:00:00 2001 From: Nick Khyl Date: Wed, 19 Nov 2025 20:13:18 -0600 Subject: [PATCH 06/33] util/eventbus: add tests for a subscriber trying to acquire the same mutex as a publisher As of 2025-11-20, publishing more events than the eventbus's internal queues can hold may deadlock if a subscriber tries to acquire a mutex that can also be held by a publisher. This commit adds a test that demonstrates this deadlock, and skips it until the bug is fixed. Updates #17973 Signed-off-by: Nick Khyl --- util/eventbus/bus_test.go | 70 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/util/eventbus/bus_test.go b/util/eventbus/bus_test.go index 61728fbfd..e025e5bed 100644 --- a/util/eventbus/bus_test.go +++ b/util/eventbus/bus_test.go @@ -9,6 +9,7 @@ import ( "fmt" "log" "regexp" + "sync" "testing" "testing/synctest" "time" @@ -593,6 +594,75 @@ func TestRegression(t *testing.T) { }) } +const ( + maxQueuedItems = 16 // same as in queue.go + totalMaxQueuedItems = maxQueuedItems * 2 // both publisher and subscriber sides +) + +func TestPublishWithMutex(t *testing.T) { + t.Run("FewEvents", func(t *testing.T) { + // As of 2025-11-20, publishing up to [totalMaxQueuedItems] is fine. + testPublishWithMutex(t, totalMaxQueuedItems) + }) + t.Run("ManyEvents", func(t *testing.T) { + // As of 2025-11-20, publishing more than [totalMaxQueuedItems] may deadlock. + t.Skip("TODO: fix deadlock in https://github.com/tailscale/tailscale/issues/17973") + + const N = 3 // N larger than one increases the chance of deadlock. + testPublishWithMutex(t, totalMaxQueuedItems+N) + }) +} + +// testPublishWithMutex publishes the specified number of events, +// acquiring and releasing a mutex around each publish and each +// subscriber event receive. +// +// The test fails if it loses any events or times out due to a deadlock. +// Unfortunately, a goroutine waiting on a mutex held by a durably blocked +// goroutine is not itself considered durably blocked, so [synctest] cannot +// detect this deadlock on its own. +func testPublishWithMutex(t *testing.T, n int) { + synctest.Test(t, func(t *testing.T) { + b := eventbus.New() + defer b.Close() + + c := b.Client("TestClient") + + evts := make([]any, n) + for i := range evts { + evts[i] = EventA{Counter: i} + } + exp := expectEvents(t, evts...) + + var mu sync.Mutex + eventbus.SubscribeFunc[EventA](c, func(e EventA) { + // As of 2025-11-20, this can deadlock if n is large enough + // and event queues fill up. + mu.Lock() + mu.Unlock() + + // Mark event as received, so we can check for lost events. 
+ // Not required for the deadlock to occur. + exp.Got(e) + }) + + p := eventbus.Publish[EventA](c) + go func() { + for i := range n { + mu.Lock() + p.Publish(EventA{Counter: i}) + mu.Unlock() + } + }() + + synctest.Wait() + + if !exp.Empty() { + t.Errorf("unexpected extra events: %+v", exp.want) + } + }) +} + type queueChecker struct { t *testing.T want []any From 3780f25d51522f7148ae11d5b28b066d292e06e4 Mon Sep 17 00:00:00 2001 From: Nick Khyl Date: Thu, 20 Nov 2025 11:04:54 -0600 Subject: [PATCH 07/33] util/eventbus: add tests for a subscriber publishing events As of 2025-11-20, publishing more events than the eventbus's internal queues can hold may deadlock if a subscriber tries to publish events itself. This commit adds a test that demonstrates this deadlock, and skips it until the bug is fixed. Updates #18012 Signed-off-by: Nick Khyl --- util/eventbus/bus_test.go | 60 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/util/eventbus/bus_test.go b/util/eventbus/bus_test.go index e025e5bed..23fe633f3 100644 --- a/util/eventbus/bus_test.go +++ b/util/eventbus/bus_test.go @@ -636,6 +636,7 @@ func testPublishWithMutex(t *testing.T, n int) { var mu sync.Mutex eventbus.SubscribeFunc[EventA](c, func(e EventA) { + // Acquire the same mutex as the publisher. // As of 2025-11-20, this can deadlock if n is large enough // and event queues fill up. mu.Lock() @@ -648,6 +649,7 @@ func testPublishWithMutex(t *testing.T, n int) { p := eventbus.Publish[EventA](c) go func() { + // Publish events, acquiring the mutex around each publish. for i := range n { mu.Lock() p.Publish(EventA{Counter: i}) @@ -663,6 +665,64 @@ func testPublishWithMutex(t *testing.T, n int) { }) } +func TestPublishFromSubscriber(t *testing.T) { + t.Run("FewEvents", func(t *testing.T) { + // Publishing up to [totalMaxQueuedItems]-1 is fine. + testPublishFromSubscriber(t, totalMaxQueuedItems-1) + }) + t.Run("ManyEvents", func(t *testing.T) { + // As of 2025-11-20, publishing more than [totalMaxQueuedItems] may deadlock. + t.Skip("TODO: fix deadlock in https://github.com/tailscale/tailscale/issues/18012") + + // Using 2x to increase chance of deadlock. + testPublishFromSubscriber(t, totalMaxQueuedItems*2) + }) +} + +// testPublishFromSubscriber publishes the specified number of EventA events. +// Each EventA causes the subscriber to publish an EventB. +// The test fails if it loses any events or if a deadlock occurs. +func testPublishFromSubscriber(t *testing.T, n int) { + synctest.Test(t, func(t *testing.T) { + b := eventbus.New() + defer b.Close() + + c := b.Client("TestClient") + + // Ultimately we expect to receive n EventB events + // published as a result of receiving n EventA events. + evts := make([]any, n) + for i := range evts { + evts[i] = EventB{Counter: i} + } + exp := expectEvents(t, evts...) + + pubA := eventbus.Publish[EventA](c) + pubB := eventbus.Publish[EventB](c) + + eventbus.SubscribeFunc[EventA](c, func(e EventA) { + // Upon receiving EventA, publish EventB. + // As of 2025-11-20, this can deadlock if n is large enough + // and event queues fill up. + pubB.Publish(EventB{Counter: e.Counter}) + }) + eventbus.SubscribeFunc[EventB](c, func(e EventB) { + // Mark EventB as received. 
+ exp.Got(e) + }) + + for i := range n { + pubA.Publish(EventA{Counter: i}) + } + + synctest.Wait() + + if !exp.Empty() { + t.Errorf("unexpected extra events: %+v", exp.want) + } + }) +} + type queueChecker struct { t *testing.T want []any From e7f5ca1d5ed23d2e3ae2fc9711b25dbd936bdb68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Claus=20Lensb=C3=B8l?= Date: Fri, 21 Nov 2025 14:49:37 -0500 Subject: [PATCH 08/33] wgengine/userspace: run link change subscribers in eventqueue (#18024) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Updates #17996 Signed-off-by: Claus Lensbøl --- wgengine/userspace.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/wgengine/userspace.go b/wgengine/userspace.go index 8ad771fc5..e4c99ded2 100644 --- a/wgengine/userspace.go +++ b/wgengine/userspace.go @@ -51,6 +51,7 @@ import ( "tailscale.com/util/checkchange" "tailscale.com/util/clientmetric" "tailscale.com/util/eventbus" + "tailscale.com/util/execqueue" "tailscale.com/util/mak" "tailscale.com/util/set" "tailscale.com/util/testenv" @@ -98,6 +99,8 @@ type userspaceEngine struct { eventBus *eventbus.Bus eventClient *eventbus.Client + linkChangeQueue execqueue.ExecQueue + logf logger.Logf wgLogger *wglog.Logger // a wireguard-go logging wrapper reqCh chan struct{} @@ -544,7 +547,7 @@ func NewUserspaceEngine(logf logger.Logf, conf Config) (_ Engine, reterr error) if f, ok := feature.HookProxyInvalidateCache.GetOk(); ok { f() } - e.linkChange(&cd) + e.linkChangeQueue.Add(func() { e.linkChange(&cd) }) }) e.eventClient = ec e.logf("Engine created.") @@ -1288,6 +1291,9 @@ func (e *userspaceEngine) RequestStatus() { func (e *userspaceEngine) Close() { e.eventClient.Close() + // TODO(cmol): Should we wait for it too? + // Same question raised in appconnector.go. + e.linkChangeQueue.Shutdown() e.mu.Lock() if e.closing { e.mu.Unlock() From 9245c7131b4228810852a18613bcc7badd057f3a Mon Sep 17 00:00:00 2001 From: Jordan Whited Date: Fri, 21 Nov 2025 11:10:24 -0800 Subject: [PATCH 09/33] feature/relayserver: don't publish from within a subscribe fn goroutine Updates #17830 Signed-off-by: Jordan Whited --- feature/relayserver/relayserver.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/feature/relayserver/relayserver.go b/feature/relayserver/relayserver.go index 7d12d62e5..b7457210f 100644 --- a/feature/relayserver/relayserver.go +++ b/feature/relayserver/relayserver.go @@ -147,7 +147,12 @@ func (e *extension) onAllocReq(req magicsock.UDPRelayAllocReq) { e.logf("error allocating endpoint: %v", err) return } - e.respPub.Publish(magicsock.UDPRelayAllocResp{ + // Take a defensive stance around publishing from within an + // [*eventbus.SubscribeFunc] by publishing from a separate goroutine. At the + // time of writing (2025-11-21), publishing from within the + // [*eventbus.SubscribeFunc] goroutine is potentially unsafe if publisher + // and subscriber share a lock. + go e.respPub.Publish(magicsock.UDPRelayAllocResp{ ReqRxFromNodeKey: req.RxFromNodeKey, ReqRxFromDiscoKey: req.RxFromDiscoKey, Message: &disco.AllocateUDPRelayEndpointResponse{ From 1ccece0f783ae5059c1d74894566461072db6471 Mon Sep 17 00:00:00 2001 From: Nick Khyl Date: Fri, 21 Nov 2025 07:53:23 -0600 Subject: [PATCH 10/33] util/eventbus: use unbounded event queues for DeliveredEvents in subscribers Bounded DeliveredEvent queues reduce memory usage, but they can deadlock under load. 
Two common scenarios trigger deadlocks when the number of events
published in a short period exceeds twice the queue capacity (there's a
PublishedEvent queue of the same size):
- a subscriber tries to acquire the same mutex that a publisher holds, or
- a subscriber for A events publishes B events

Avoiding these scenarios is not practical and would limit eventbus
usefulness and reduce its adoption, pushing us back to callbacks and
other legacy mechanisms. These deadlocks have already occurred on
customer devices, dev machines, and in tests. They also make it harder
to identify and fix slow subscribers and similar issues we have been
seeing recently.

Choosing an arbitrarily large fixed queue capacity would only mask the
problem. A client running in a sufficiently large and complex customer
environment can exceed any meaningful constant limit, since event volume
depends on the number of peers and other factors. Behavior also changes
based on scheduling of publishers and subscribers by the Go runtime, OS,
and hardware, as the issue is essentially a race between publishers and
subscribers. Additionally, on lower-end devices, an unreasonably high
constant capacity is practically the same as using unbounded queues.

Therefore, this PR changes the event queue implementation to be
unbounded by default. The PublishedEvent queue keeps its existing
capacity of 16 items, while subscribers' DeliveredEvent queues become
unbounded.

This change fixes known deadlocks and makes the system stable under
load, at the cost of higher potential memory usage, including cases
where a queue grows during an event burst and does not shrink when load
decreases. Further improvements can be implemented in the future as
needed.

Fixes #17973
Fixes #18012

Signed-off-by: Nick Khyl
---
 util/eventbus/bus.go      |  9 ++++++++-
 util/eventbus/bus_test.go | 34 ++--------------------------------
 util/eventbus/queue.go    | 12 ++++++------
 3 files changed, 16 insertions(+), 39 deletions(-)

diff --git a/util/eventbus/bus.go b/util/eventbus/bus.go
index aa6880d01..880e075cc 100644
--- a/util/eventbus/bus.go
+++ b/util/eventbus/bus.go
@@ -120,7 +120,14 @@ func (b *Bus) Close() {
 }
 
 func (b *Bus) pump(ctx context.Context) {
-	var vals queue[PublishedEvent]
+	// Limit how many published events we can buffer in the PublishedEvent queue.
+	//
+	// Subscribers have unbounded DeliveredEvent queues (see tailscale/tailscale#18020),
+	// so this queue doesn't need to be unbounded. Keeping it bounded may also help
+	// catch cases where subscribers stop pumping events completely, such as due to a bug
+	// in [subscribeState.pump], [Subscriber.dispatch], or [SubscriberFunc.dispatch].
+	const maxPublishedEvents = 16
+	vals := queue[PublishedEvent]{capacity: maxPublishedEvents}
 	acceptCh := func() chan PublishedEvent {
 		if vals.Full() {
 			return nil
diff --git a/util/eventbus/bus_test.go b/util/eventbus/bus_test.go
index 23fe633f3..88e11e719 100644
--- a/util/eventbus/bus_test.go
+++ b/util/eventbus/bus_test.go
@@ -594,23 +594,8 @@ func TestRegression(t *testing.T) {
 	})
 }
 
-const (
-	maxQueuedItems      = 16                 // same as in queue.go
-	totalMaxQueuedItems = maxQueuedItems * 2 // both publisher and subscriber sides
-)
-
 func TestPublishWithMutex(t *testing.T) {
-	t.Run("FewEvents", func(t *testing.T) {
-		// As of 2025-11-20, publishing up to [totalMaxQueuedItems] is fine.
-		testPublishWithMutex(t, totalMaxQueuedItems)
-	})
-	t.Run("ManyEvents", func(t *testing.T) {
-		// As of 2025-11-20, publishing more than [totalMaxQueuedItems] may deadlock.
- t.Skip("TODO: fix deadlock in https://github.com/tailscale/tailscale/issues/17973") - - const N = 3 // N larger than one increases the chance of deadlock. - testPublishWithMutex(t, totalMaxQueuedItems+N) - }) + testPublishWithMutex(t, 1024) // arbitrary large number of events } // testPublishWithMutex publishes the specified number of events, @@ -637,13 +622,10 @@ func testPublishWithMutex(t *testing.T, n int) { var mu sync.Mutex eventbus.SubscribeFunc[EventA](c, func(e EventA) { // Acquire the same mutex as the publisher. - // As of 2025-11-20, this can deadlock if n is large enough - // and event queues fill up. mu.Lock() mu.Unlock() // Mark event as received, so we can check for lost events. - // Not required for the deadlock to occur. exp.Got(e) }) @@ -666,17 +648,7 @@ func testPublishWithMutex(t *testing.T, n int) { } func TestPublishFromSubscriber(t *testing.T) { - t.Run("FewEvents", func(t *testing.T) { - // Publishing up to [totalMaxQueuedItems]-1 is fine. - testPublishFromSubscriber(t, totalMaxQueuedItems-1) - }) - t.Run("ManyEvents", func(t *testing.T) { - // As of 2025-11-20, publishing more than [totalMaxQueuedItems] may deadlock. - t.Skip("TODO: fix deadlock in https://github.com/tailscale/tailscale/issues/18012") - - // Using 2x to increase chance of deadlock. - testPublishFromSubscriber(t, totalMaxQueuedItems*2) - }) + testPublishFromSubscriber(t, 1024) // arbitrary large number of events } // testPublishFromSubscriber publishes the specified number of EventA events. @@ -702,8 +674,6 @@ func testPublishFromSubscriber(t *testing.T, n int) { eventbus.SubscribeFunc[EventA](c, func(e EventA) { // Upon receiving EventA, publish EventB. - // As of 2025-11-20, this can deadlock if n is large enough - // and event queues fill up. pubB.Publish(EventB{Counter: e.Counter}) }) eventbus.SubscribeFunc[EventB](c, func(e EventB) { diff --git a/util/eventbus/queue.go b/util/eventbus/queue.go index a62bf3c62..2589b75ce 100644 --- a/util/eventbus/queue.go +++ b/util/eventbus/queue.go @@ -7,18 +7,18 @@ import ( "slices" ) -const maxQueuedItems = 16 - -// queue is an ordered queue of length up to maxQueuedItems. +// queue is an ordered queue of length up to capacity, +// if capacity is non-zero. Otherwise it is unbounded. type queue[T any] struct { - vals []T - start int + vals []T + start int + capacity int // zero means unbounded } // canAppend reports whether a value can be appended to q.vals without // shifting values around. func (q *queue[T]) canAppend() bool { - return cap(q.vals) < maxQueuedItems || len(q.vals) < cap(q.vals) + return q.capacity == 0 || cap(q.vals) < q.capacity || len(q.vals) < cap(q.vals) } func (q *queue[T]) Full() bool { From 16587746ed5446247d44dd0c50cec36cf61a0c80 Mon Sep 17 00:00:00 2001 From: Andrew Dunham Date: Fri, 21 Nov 2025 17:55:14 -0500 Subject: [PATCH 11/33] portlist,tstest: skip tests on kernels with /proc/net/tcp regression Linux kernel versions 6.6.102-104 and 6.12.42-45 have a regression in /proc/net/tcp that causes seek operations to fail with "illegal seek". This breaks portlist tests on these kernels. Add kernel version detection for Linux systems and a SkipOnKernelVersions helper to tstest. Use it to skip affected portlist tests on the broken kernel versions. Thanks to philiptaron for the list of kernels with the issue and fix. 
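For context, the regression is easy to observe directly. The probe below is illustrative only and not part of this patch: on an affected kernel, seeking back to the start of /proc/net/tcp after a read fails with "illegal seek" (ESPIPE):

    package main

    import (
        "fmt"
        "io"
        "os"
    )

    func main() {
        f, err := os.Open("/proc/net/tcp")
        if err != nil {
            fmt.Println("open:", err) // not Linux, or procfs unavailable
            return
        }
        defer f.Close()

        buf := make([]byte, 128)
        if _, err := f.Read(buf); err != nil && err != io.EOF {
            fmt.Println("read:", err)
            return
        }
        // On kernels 6.6.102-104 and 6.12.42-45 this Seek returns ESPIPE.
        if _, err := f.Seek(0, io.SeekStart); err != nil {
            fmt.Println("seek failed (kernel regression):", err)
            return
        }
        fmt.Println("seek ok")
    }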
Updates #16966 Signed-off-by: Andrew Dunham --- portlist/portlist_test.go | 15 ++++++++++++ tstest/kernel_linux.go | 50 +++++++++++++++++++++++++++++++++++++++ tstest/kernel_other.go | 11 +++++++++ tstest/tstest.go | 18 ++++++++++++++ tstest/tstest_test.go | 19 ++++++++++++++- 5 files changed, 112 insertions(+), 1 deletion(-) create mode 100644 tstest/kernel_linux.go create mode 100644 tstest/kernel_other.go diff --git a/portlist/portlist_test.go b/portlist/portlist_test.go index 34277fdba..791a8b118 100644 --- a/portlist/portlist_test.go +++ b/portlist/portlist_test.go @@ -5,12 +5,24 @@ package portlist import ( "net" + "runtime" "testing" "tailscale.com/tstest" ) +func maybeSkip(t *testing.T) { + if runtime.GOOS == "linux" { + tstest.SkipOnKernelVersions(t, + "https://github.com/tailscale/tailscale/issues/16966", + "6.6.102", "6.6.103", "6.6.104", + "6.12.42", "6.12.43", "6.12.44", "6.12.45", + ) + } +} + func TestGetList(t *testing.T) { + maybeSkip(t) tstest.ResourceCheck(t) var p Poller @@ -25,6 +37,7 @@ func TestGetList(t *testing.T) { } func TestIgnoreLocallyBoundPorts(t *testing.T) { + maybeSkip(t) tstest.ResourceCheck(t) ln, err := net.Listen("tcp", "127.0.0.1:0") @@ -47,6 +60,8 @@ func TestIgnoreLocallyBoundPorts(t *testing.T) { } func TestPoller(t *testing.T) { + maybeSkip(t) + var p Poller p.IncludeLocalhost = true get := func(t *testing.T) []Port { diff --git a/tstest/kernel_linux.go b/tstest/kernel_linux.go new file mode 100644 index 000000000..664fe9bdd --- /dev/null +++ b/tstest/kernel_linux.go @@ -0,0 +1,50 @@ +// Copyright (c) Tailscale Inc & AUTHORS +// SPDX-License-Identifier: BSD-3-Clause + +//go:build linux + +package tstest + +import ( + "strconv" + "strings" + + "golang.org/x/sys/unix" +) + +// KernelVersion returns the major, minor, and patch version of the Linux kernel. +// It returns (0, 0, 0) if the version cannot be determined. +func KernelVersion() (major, minor, patch int) { + var uname unix.Utsname + if err := unix.Uname(&uname); err != nil { + return 0, 0, 0 + } + release := unix.ByteSliceToString(uname.Release[:]) + + // Parse version string (e.g., "5.15.0-...") + parts := strings.Split(release, ".") + if len(parts) < 3 { + return 0, 0, 0 + } + + major, err := strconv.Atoi(parts[0]) + if err != nil { + return 0, 0, 0 + } + + minor, err = strconv.Atoi(parts[1]) + if err != nil { + return 0, 0, 0 + } + + // Patch version may have additional info after a hyphen (e.g., "0-76-generic") + // Extract just the numeric part before any hyphen + patchStr, _, _ := strings.Cut(parts[2], "-") + + patch, err = strconv.Atoi(patchStr) + if err != nil { + return 0, 0, 0 + } + + return major, minor, patch +} diff --git a/tstest/kernel_other.go b/tstest/kernel_other.go new file mode 100644 index 000000000..bf69be6df --- /dev/null +++ b/tstest/kernel_other.go @@ -0,0 +1,11 @@ +// Copyright (c) Tailscale Inc & AUTHORS +// SPDX-License-Identifier: BSD-3-Clause + +//go:build !linux + +package tstest + +// KernelVersion returns (0, 0, 0) on unsupported platforms. +func KernelVersion() (major, minor, patch int) { + return 0, 0, 0 +} diff --git a/tstest/tstest.go b/tstest/tstest.go index 169450686..d0828f508 100644 --- a/tstest/tstest.go +++ b/tstest/tstest.go @@ -6,6 +6,7 @@ package tstest import ( "context" + "fmt" "os" "strconv" "strings" @@ -93,3 +94,20 @@ func Parallel(t *testing.T) { t.Parallel() } } + +// SkipOnKernelVersions skips the test if the current +// kernel version is in the specified list. 
+func SkipOnKernelVersions(t testing.TB, issue string, versions ...string) { + major, minor, patch := KernelVersion() + if major == 0 && minor == 0 && patch == 0 { + t.Logf("could not determine kernel version") + return + } + + current := fmt.Sprintf("%d.%d.%d", major, minor, patch) + for _, v := range versions { + if v == current { + t.Skipf("skipping on kernel version %q - see issue %s", current, issue) + } + } +} diff --git a/tstest/tstest_test.go b/tstest/tstest_test.go index e988d5d56..ce59bde53 100644 --- a/tstest/tstest_test.go +++ b/tstest/tstest_test.go @@ -3,7 +3,10 @@ package tstest -import "testing" +import ( + "runtime" + "testing" +) func TestReplace(t *testing.T) { before := "before" @@ -22,3 +25,17 @@ func TestReplace(t *testing.T) { t.Errorf("before = %q; want %q", before, "before") } } + +func TestKernelVersion(t *testing.T) { + switch runtime.GOOS { + case "linux": + default: + t.Skipf("skipping test on %s", runtime.GOOS) + } + + major, minor, patch := KernelVersion() + if major == 0 && minor == 0 && patch == 0 { + t.Fatal("KernelVersion returned (0, 0, 0); expected valid version") + } + t.Logf("Kernel version: %d.%d.%d", major, minor, patch) +} From a20cdb5c938204d45502d4c52fafc8ad0b0afed9 Mon Sep 17 00:00:00 2001 From: Andrew Dunham Date: Fri, 21 Nov 2025 16:50:28 -0500 Subject: [PATCH 12/33] tstest/integration/testcontrol: de-flake TestUserMetricsRouteGauges SetSubnetRoutes was not sending update notifications to nodes when their approved routes changed, causing nodes to not fetch updated netmaps with PrimaryRoutes populated. This resulted in TestUserMetricsRouteGauges flaking because it waited for PrimaryRoutes to be set, which only happened if the node happened to poll for other reasons. Now send updateSelfChanged notification to affected nodes so they fetch an updated netmap immediately. Fixes #17962 Signed-off-by: Andrew Dunham --- tstest/integration/testcontrol/testcontrol.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tstest/integration/testcontrol/testcontrol.go b/tstest/integration/testcontrol/testcontrol.go index f9a33705b..268f2f19b 100644 --- a/tstest/integration/testcontrol/testcontrol.go +++ b/tstest/integration/testcontrol/testcontrol.go @@ -464,6 +464,9 @@ func (s *Server) SetSubnetRoutes(nodeKey key.NodePublic, routes []netip.Prefix) defer s.mu.Unlock() s.logf("Setting subnet routes for %s: %v", nodeKey.ShortString(), routes) mak.Set(&s.nodeSubnetRoutes, nodeKey, routes) + if node, ok := s.nodes[nodeKey]; ok { + sendUpdate(s.updates[node.ID], updateSelfChanged) + } } // MasqueradePair is a pair of nodes and the IP address that the From 698eecda040e6ee21b2f4502d3b98a6db1b60f6d Mon Sep 17 00:00:00 2001 From: Andrew Dunham Date: Fri, 21 Nov 2025 17:25:56 -0500 Subject: [PATCH 13/33] ipn/ipnlocal: fix panic in driveTransport on network error When the underlying transport returns a network error, the RoundTrip method returns (nil, error). The defer was attempting to access resp without checking if it was nil first, causing a panic. Fix this by checking for nil in the defer. Also changes driveTransport.tr from *http.Transport to http.RoundTripper and adds a test. 
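The bug class is easy to reproduce in isolation. A minimal sketch with hypothetical names (failingRT, roundTrip), not the ipnlocal code itself: a deferred function reads the named *http.Response return value, which is nil whenever the wrapped RoundTripper returns an error, so it must be nil-checked before use:

    package main

    import (
        "errors"
        "fmt"
        "net/http"
    )

    type failingRT struct{}

    func (failingRT) RoundTrip(*http.Request) (*http.Response, error) {
        return nil, errors.New("network connection failed")
    }

    func roundTrip(rt http.RoundTripper, req *http.Request) (resp *http.Response, err error) {
        defer func() {
            if resp == nil {
                return // without this guard, resp.StatusCode below would panic
            }
            fmt.Println("status:", resp.StatusCode)
        }()
        return rt.RoundTrip(req)
    }

    func main() {
        req, err := http.NewRequest("GET", "http://100.64.0.1:1234/", nil)
        if err != nil {
            panic(err)
        }
        if _, err := roundTrip(failingRT{}, req); err != nil {
            fmt.Println("error:", err) // no panic: the defer checks resp for nil
        }
    }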
Fixes #17306 Signed-off-by: Andrew Dunham Change-Id: Icf38a020b45aaa9cfbc1415d55fd8b70b978f54c --- ipn/ipnlocal/drive.go | 75 ++++++++++++++++++++------------------ ipn/ipnlocal/drive_test.go | 50 +++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 36 deletions(-) create mode 100644 ipn/ipnlocal/drive_test.go diff --git a/ipn/ipnlocal/drive.go b/ipn/ipnlocal/drive.go index 7d6dc2427..456cd4544 100644 --- a/ipn/ipnlocal/drive.go +++ b/ipn/ipnlocal/drive.go @@ -433,7 +433,7 @@ func (rbw *responseBodyWrapper) Close() error { // b.Dialer().PeerAPITransport() with metrics tracking. type driveTransport struct { b *LocalBackend - tr *http.Transport + tr http.RoundTripper } func (b *LocalBackend) newDriveTransport() *driveTransport { @@ -443,7 +443,7 @@ func (b *LocalBackend) newDriveTransport() *driveTransport { } } -func (dt *driveTransport) RoundTrip(req *http.Request) (resp *http.Response, err error) { +func (dt *driveTransport) RoundTrip(req *http.Request) (*http.Response, error) { // Some WebDAV clients include origin and refer headers, which peerapi does // not like. Remove them. req.Header.Del("origin") @@ -455,42 +455,45 @@ func (dt *driveTransport) RoundTrip(req *http.Request) (resp *http.Response, err req.Body = bw } - defer func() { - contentType := "unknown" - if ct := req.Header.Get("Content-Type"); ct != "" { - contentType = ct - } + resp, err := dt.tr.RoundTrip(req) + if err != nil { + return nil, err + } - dt.b.mu.Lock() - selfNodeKey := dt.b.currentNode().Self().Key().ShortString() - dt.b.mu.Unlock() - n, _, ok := dt.b.WhoIs("tcp", netip.MustParseAddrPort(req.URL.Host)) - shareNodeKey := "unknown" - if ok { - shareNodeKey = string(n.Key().ShortString()) - } + contentType := "unknown" + if ct := req.Header.Get("Content-Type"); ct != "" { + contentType = ct + } - rbw := responseBodyWrapper{ - log: dt.b.logf, - logVerbose: req.Method != httpm.GET && req.Method != httpm.PUT, // other requests like PROPFIND are quite chatty, so we log those at verbose level - method: req.Method, - bytesTx: int64(bw.bytesRead), - selfNodeKey: selfNodeKey, - shareNodeKey: shareNodeKey, - contentType: contentType, - contentLength: resp.ContentLength, - fileExtension: parseDriveFileExtensionForLog(req.URL.Path), - statusCode: resp.StatusCode, - ReadCloser: resp.Body, - } + dt.b.mu.Lock() + selfNodeKey := dt.b.currentNode().Self().Key().ShortString() + dt.b.mu.Unlock() + n, _, ok := dt.b.WhoIs("tcp", netip.MustParseAddrPort(req.URL.Host)) + shareNodeKey := "unknown" + if ok { + shareNodeKey = string(n.Key().ShortString()) + } - if resp.StatusCode >= 400 { - // in case of error response, just log immediately - rbw.logAccess("") - } else { - resp.Body = &rbw - } - }() + rbw := responseBodyWrapper{ + log: dt.b.logf, + logVerbose: req.Method != httpm.GET && req.Method != httpm.PUT, // other requests like PROPFIND are quite chatty, so we log those at verbose level + method: req.Method, + bytesTx: int64(bw.bytesRead), + selfNodeKey: selfNodeKey, + shareNodeKey: shareNodeKey, + contentType: contentType, + contentLength: resp.ContentLength, + fileExtension: parseDriveFileExtensionForLog(req.URL.Path), + statusCode: resp.StatusCode, + ReadCloser: resp.Body, + } + + if resp.StatusCode >= 400 { + // in case of error response, just log immediately + rbw.logAccess("") + } else { + resp.Body = &rbw + } - return dt.tr.RoundTrip(req) + return resp, nil } diff --git a/ipn/ipnlocal/drive_test.go b/ipn/ipnlocal/drive_test.go new file mode 100644 index 000000000..323c38214 --- /dev/null +++ 
b/ipn/ipnlocal/drive_test.go @@ -0,0 +1,50 @@ +// Copyright (c) Tailscale Inc & AUTHORS +// SPDX-License-Identifier: BSD-3-Clause + +//go:build !ts_omit_drive + +package ipnlocal + +import ( + "errors" + "net/http" + "net/http/httptest" + "testing" +) + +// TestDriveTransportRoundTrip_NetworkError tests that driveTransport.RoundTrip +// doesn't panic when the underlying transport returns a nil response with an +// error. +// +// See: https://github.com/tailscale/tailscale/issues/17306 +func TestDriveTransportRoundTrip_NetworkError(t *testing.T) { + b := newTestLocalBackend(t) + + testErr := errors.New("network connection failed") + mockTransport := &mockRoundTripper{ + err: testErr, + } + dt := &driveTransport{ + b: b, + tr: mockTransport, + } + + req := httptest.NewRequest("GET", "http://100.64.0.1:1234/some/path", nil) + resp, err := dt.RoundTrip(req) + if err == nil { + t.Fatal("got nil error, expected non-nil") + } else if !errors.Is(err, testErr) { + t.Errorf("got error %v, expected %v", err, testErr) + } + if resp != nil { + t.Errorf("wanted nil response, got %v", resp) + } +} + +type mockRoundTripper struct { + err error +} + +func (m *mockRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) { + return nil, m.err +} From 6637003cc8c5a73a56ed10f57f207a2a2c9f2c7c Mon Sep 17 00:00:00 2001 From: Tom Proctor Date: Tue, 18 Nov 2025 17:11:27 +0000 Subject: [PATCH 14/33] cmd/cigocacher,go.mod: add cigocacher cmd Adds cmd/cigocacher as the client to cigocached for Go caching over HTTP. The HTTP cache is best-effort only, and builds will fall back to disk-only cache if it's not available, much like regular builds. Not yet used in CI; that will follow in another PR once we have runners available in this repo with the right network setup for reaching cigocached. Updates tailscale/corp#10808 Change-Id: I13ae1a12450eb2a05bd9843f358474243989e967 Signed-off-by: Tom Proctor --- cmd/cigocacher/cigocacher.go | 308 +++++++++++++++++++++++++++++ cmd/cigocacher/http.go | 115 +++++++++++ cmd/derper/depaware.txt | 8 +- cmd/k8s-operator/depaware.txt | 12 +- cmd/stund/depaware.txt | 8 +- cmd/tailscaled/depaware-min.txt | 1 + cmd/tailscaled/depaware-minbox.txt | 1 + cmd/tailscaled/depaware.txt | 1 + cmd/tsidp/depaware.txt | 1 + flake.nix | 3 +- go.mod | 15 +- go.mod.sri | 2 +- go.sum | 30 +-- shell.nix | 2 +- tsnet/depaware.txt | 1 + 15 files changed, 470 insertions(+), 38 deletions(-) create mode 100644 cmd/cigocacher/cigocacher.go create mode 100644 cmd/cigocacher/http.go diff --git a/cmd/cigocacher/cigocacher.go b/cmd/cigocacher/cigocacher.go new file mode 100644 index 000000000..b38df4c2b --- /dev/null +++ b/cmd/cigocacher/cigocacher.go @@ -0,0 +1,308 @@ +// Copyright (c) Tailscale Inc & AUTHORS +// SPDX-License-Identifier: BSD-3-Clause + +// cigocacher is an opinionated-to-Tailscale client for gocached. It connects +// at a URL like "https://ci-gocached-azure-1.corp.ts.net:31364", but that is +// stored in a GitHub actions variable so that its hostname can be updated for +// all branches at the same time in sync with the actual infrastructure. +// +// It authenticates using GitHub OIDC tokens, and all HTTP errors are ignored +// so that its failure mode is just that builds get slower and fall back to +// disk-only cache. 
+package main + +import ( + "bytes" + "context" + jsonv1 "encoding/json" + "errors" + "flag" + "fmt" + "io" + "log" + "net" + "net/http" + "os" + "path/filepath" + "strings" + "sync/atomic" + "time" + + "github.com/bradfitz/go-tool-cache/cacheproc" + "github.com/bradfitz/go-tool-cache/cachers" +) + +func main() { + var ( + auth = flag.Bool("auth", false, "auth with cigocached and exit, printing the access token as output") + token = flag.String("token", "", "the cigocached access token to use, as created using --auth") + cigocachedURL = flag.String("cigocached-url", "", "optional cigocached URL (scheme, host, and port). empty means to not use one.") + verbose = flag.Bool("verbose", false, "enable verbose logging") + ) + flag.Parse() + + if *auth { + if *cigocachedURL == "" { + log.Print("--cigocached-url is empty, skipping auth") + return + } + tk, err := fetchAccessToken(httpClient(), os.Getenv("ACTIONS_ID_TOKEN_REQUEST_URL"), os.Getenv("ACTIONS_ID_TOKEN_REQUEST_TOKEN"), *cigocachedURL) + if err != nil { + log.Printf("error fetching access token, skipping auth: %v", err) + return + } + fmt.Println(tk) + return + } + + d, err := os.UserCacheDir() + if err != nil { + log.Fatal(err) + } + d = filepath.Join(d, "go-cacher") + log.Printf("Defaulting to cache dir %v ...", d) + if err := os.MkdirAll(d, 0750); err != nil { + log.Fatal(err) + } + + c := &cigocacher{ + disk: &cachers.DiskCache{Dir: d}, + verbose: *verbose, + } + if *cigocachedURL != "" { + log.Printf("Using cigocached at %s", *cigocachedURL) + c.gocached = &gocachedClient{ + baseURL: *cigocachedURL, + cl: httpClient(), + accessToken: *token, + verbose: *verbose, + } + } + var p *cacheproc.Process + p = &cacheproc.Process{ + Close: func() error { + log.Printf("gocacheprog: closing; %d gets (%d hits, %d misses, %d errors); %d puts (%d errors)", + p.Gets.Load(), p.GetHits.Load(), p.GetMisses.Load(), p.GetErrors.Load(), p.Puts.Load(), p.PutErrors.Load()) + return c.close() + }, + Get: c.get, + Put: c.put, + } + + if err := p.Run(); err != nil { + log.Fatal(err) + } +} + +func httpClient() *http.Client { + return &http.Client{ + Transport: &http.Transport{ + DialContext: func(ctx context.Context, network, addr string) (net.Conn, error) { + host, port, err := net.SplitHostPort(addr) + if err == nil { + // This does not run in a tailnet. We serve corp.ts.net + // TLS certs, and override DNS resolution to lookup the + // private IP for the VM by its hostname. 
+ if vm, ok := strings.CutSuffix(host, ".corp.ts.net"); ok { + addr = net.JoinHostPort(vm, port) + } + } + var d net.Dialer + return d.DialContext(ctx, network, addr) + }, + }, + } +} + +type cigocacher struct { + disk *cachers.DiskCache + gocached *gocachedClient + verbose bool + + getNanos atomic.Int64 // total nanoseconds spent in gets + putNanos atomic.Int64 // total nanoseconds spent in puts + getHTTP atomic.Int64 // HTTP get requests made + getHTTPBytes atomic.Int64 // HTTP get bytes transferred + getHTTPHits atomic.Int64 // HTTP get hits + getHTTPMisses atomic.Int64 // HTTP get misses + getHTTPErrors atomic.Int64 // HTTP get errors ignored on best-effort basis + getHTTPNanos atomic.Int64 // total nanoseconds spent in HTTP gets + putHTTP atomic.Int64 // HTTP put requests made + putHTTPBytes atomic.Int64 // HTTP put bytes transferred + putHTTPErrors atomic.Int64 // HTTP put errors ignored on best-effort basis + putHTTPNanos atomic.Int64 // total nanoseconds spent in HTTP puts +} + +func (c *cigocacher) get(ctx context.Context, actionID string) (outputID, diskPath string, err error) { + t0 := time.Now() + defer func() { + c.getNanos.Add(time.Since(t0).Nanoseconds()) + }() + if c.gocached == nil { + return c.disk.Get(ctx, actionID) + } + + outputID, diskPath, err = c.disk.Get(ctx, actionID) + if err == nil && outputID != "" { + return outputID, diskPath, nil + } + + c.getHTTP.Add(1) + t0HTTP := time.Now() + defer func() { + c.getHTTPNanos.Add(time.Since(t0HTTP).Nanoseconds()) + }() + outputID, res, err := c.gocached.get(ctx, actionID) + if err != nil { + c.getHTTPErrors.Add(1) + return "", "", nil + } + if outputID == "" || res == nil { + c.getHTTPMisses.Add(1) + return "", "", nil + } + + defer res.Body.Close() + + // TODO(tomhjp): make sure we timeout if cigocached disappears, but for some + // reason, this seemed to tank network performance. + // ctx, cancel := context.WithTimeout(ctx, httpTimeout(res.ContentLength)) + // defer cancel() + diskPath, err = c.disk.Put(ctx, actionID, outputID, res.ContentLength, res.Body) + if err != nil { + return "", "", fmt.Errorf("error filling disk cache from HTTP: %w", err) + } + + c.getHTTPHits.Add(1) + c.getHTTPBytes.Add(res.ContentLength) + return outputID, diskPath, nil +} + +func (c *cigocacher) put(ctx context.Context, actionID, outputID string, size int64, r io.Reader) (diskPath string, err error) { + t0 := time.Now() + defer func() { + c.putNanos.Add(time.Since(t0).Nanoseconds()) + }() + if c.gocached == nil { + return c.disk.Put(ctx, actionID, outputID, size, r) + } + + c.putHTTP.Add(1) + var diskReader, httpReader io.Reader + tee := &bestEffortTeeReader{r: r} + if size == 0 { + // Special case the empty file so NewRequest sets "Content-Length: 0", + // as opposed to thinking we didn't set it and not being able to sniff its size + // from the type. + diskReader, httpReader = bytes.NewReader(nil), bytes.NewReader(nil) + } else { + pr, pw := io.Pipe() + defer pw.Close() + // The diskReader is in the driving seat. We will try to forward data + // to httpReader as well, but only best-effort. + diskReader = tee + tee.w = pw + httpReader = pr + } + httpErrCh := make(chan error) + go func() { + // TODO(tomhjp): make sure we timeout if cigocached disappears, but for some + // reason, this seemed to tank network performance. 
+ // ctx, cancel := context.WithTimeout(ctx, httpTimeout(size)) + // defer cancel() + t0HTTP := time.Now() + defer func() { + c.putHTTPNanos.Add(time.Since(t0HTTP).Nanoseconds()) + }() + httpErrCh <- c.gocached.put(ctx, actionID, outputID, size, httpReader) + }() + + diskPath, err = c.disk.Put(ctx, actionID, outputID, size, diskReader) + if err != nil { + return "", fmt.Errorf("error writing to disk cache: %w", errors.Join(err, tee.err)) + } + + select { + case err := <-httpErrCh: + if err != nil { + c.putHTTPErrors.Add(1) + } else { + c.putHTTPBytes.Add(size) + } + case <-ctx.Done(): + } + + return diskPath, nil +} + +func (c *cigocacher) close() error { + log.Printf("cigocacher HTTP stats: %d gets (%.1fMiB, %.2fs, %d hits, %d misses, %d errors ignored); %d puts (%.1fMiB, %.2fs, %d errors ignored)", + c.getHTTP.Load(), float64(c.getHTTPBytes.Load())/float64(1<<20), float64(c.getHTTPNanos.Load())/float64(time.Second), c.getHTTPHits.Load(), c.getHTTPMisses.Load(), c.getHTTPErrors.Load(), + c.putHTTP.Load(), float64(c.putHTTPBytes.Load())/float64(1<<20), float64(c.putHTTPNanos.Load())/float64(time.Second), c.putHTTPErrors.Load()) + if !c.verbose || c.gocached == nil { + return nil + } + + stats, err := c.gocached.fetchStats() + if err != nil { + log.Printf("error fetching gocached stats: %v", err) + } else { + log.Printf("gocached session stats: %s", stats) + } + + return nil +} + +func fetchAccessToken(cl *http.Client, idTokenURL, idTokenRequestToken, gocachedURL string) (string, error) { + req, err := http.NewRequest("GET", idTokenURL+"&audience=gocached", nil) + if err != nil { + return "", err + } + req.Header.Set("Authorization", "Bearer "+idTokenRequestToken) + resp, err := cl.Do(req) + if err != nil { + return "", err + } + defer resp.Body.Close() + type idTokenResp struct { + Value string `json:"value"` + } + var idToken idTokenResp + if err := jsonv1.NewDecoder(resp.Body).Decode(&idToken); err != nil { + return "", err + } + + req, _ = http.NewRequest("POST", gocachedURL+"/auth/exchange-token", strings.NewReader(`{"jwt":"`+idToken.Value+`"}`)) + req.Header.Set("Content-Type", "application/json") + resp, err = cl.Do(req) + if err != nil { + return "", err + } + defer resp.Body.Close() + type accessTokenResp struct { + AccessToken string `json:"access_token"` + } + var accessToken accessTokenResp + if err := jsonv1.NewDecoder(resp.Body).Decode(&accessToken); err != nil { + return "", err + } + + return accessToken.AccessToken, nil +} + +type bestEffortTeeReader struct { + r io.Reader + w io.WriteCloser + err error +} + +func (t *bestEffortTeeReader) Read(p []byte) (int, error) { + n, err := t.r.Read(p) + if n > 0 && t.w != nil { + if _, err := t.w.Write(p[:n]); err != nil { + t.err = errors.Join(err, t.w.Close()) + t.w = nil + } + } + return n, err +} diff --git a/cmd/cigocacher/http.go b/cmd/cigocacher/http.go new file mode 100644 index 000000000..57d3bfb45 --- /dev/null +++ b/cmd/cigocacher/http.go @@ -0,0 +1,115 @@ +// Copyright (c) Tailscale Inc & AUTHORS +// SPDX-License-Identifier: BSD-3-Clause + +package main + +import ( + "context" + "fmt" + "io" + "log" + "net/http" +) + +type gocachedClient struct { + baseURL string // base URL of the cacher server, like "http://localhost:31364". + cl *http.Client // http.Client to use. + accessToken string // Bearer token to use in the Authorization header. + verbose bool +} + +// drainAndClose reads and throws away a small bounded amount of data. 
This is a +// best-effort attempt to allow connection reuse; Go's HTTP/1 Transport won't +// reuse a TCP connection unless you fully consume HTTP responses. +func drainAndClose(body io.ReadCloser) { + io.CopyN(io.Discard, body, 4<<10) + body.Close() +} + +func tryReadErrorMessage(res *http.Response) []byte { + msg, _ := io.ReadAll(io.LimitReader(res.Body, 4<<10)) + return msg +} + +func (c *gocachedClient) get(ctx context.Context, actionID string) (outputID string, resp *http.Response, err error) { + // TODO(tomhjp): make sure we timeout if cigocached disappears, but for some + // reason, this seemed to tank network performance. + // // Set a generous upper limit on the time we'll wait for a response. We'll + // // shorten this deadline later once we know the content length. + // ctx, cancel := context.WithTimeout(ctx, time.Minute) + // defer cancel() + req, _ := http.NewRequestWithContext(ctx, "GET", c.baseURL+"/action/"+actionID, nil) + req.Header.Set("Want-Object", "1") // opt in to single roundtrip protocol + if c.accessToken != "" { + req.Header.Set("Authorization", "Bearer "+c.accessToken) + } + + res, err := c.cl.Do(req) + if err != nil { + return "", nil, err + } + defer func() { + if resp == nil { + drainAndClose(res.Body) + } + }() + if res.StatusCode == http.StatusNotFound { + return "", nil, nil + } + if res.StatusCode != http.StatusOK { + msg := tryReadErrorMessage(res) + if c.verbose { + log.Printf("error GET /action/%s: %v, %s", actionID, res.Status, msg) + } + return "", nil, fmt.Errorf("unexpected GET /action/%s status %v", actionID, res.Status) + } + + outputID = res.Header.Get("Go-Output-Id") + if outputID == "" { + return "", nil, fmt.Errorf("missing Go-Output-Id header in response") + } + if res.ContentLength == -1 { + return "", nil, fmt.Errorf("no Content-Length from server") + } + return outputID, res, nil +} + +func (c *gocachedClient) put(ctx context.Context, actionID, outputID string, size int64, body io.Reader) error { + req, _ := http.NewRequestWithContext(ctx, "PUT", c.baseURL+"/"+actionID+"/"+outputID, body) + req.ContentLength = size + if c.accessToken != "" { + req.Header.Set("Authorization", "Bearer "+c.accessToken) + } + res, err := c.cl.Do(req) + if err != nil { + if c.verbose { + log.Printf("error PUT /%s/%s: %v", actionID, outputID, err) + } + return err + } + defer res.Body.Close() + if res.StatusCode != http.StatusNoContent { + msg := tryReadErrorMessage(res) + if c.verbose { + log.Printf("error PUT /%s/%s: %v, %s", actionID, outputID, res.Status, msg) + } + return fmt.Errorf("unexpected PUT /%s/%s status %v", actionID, outputID, res.Status) + } + + return nil +} + +func (c *gocachedClient) fetchStats() (string, error) { + req, _ := http.NewRequest("GET", c.baseURL+"/session/stats", nil) + req.Header.Set("Authorization", "Bearer "+c.accessToken) + resp, err := c.cl.Do(req) + if err != nil { + return "", err + } + defer resp.Body.Close() + b, err := io.ReadAll(resp.Body) + if err != nil { + return "", err + } + return string(b), nil +} diff --git a/cmd/derper/depaware.txt b/cmd/derper/depaware.txt index 0a75ac43e..6608faaf7 100644 --- a/cmd/derper/depaware.txt +++ b/cmd/derper/depaware.txt @@ -30,9 +30,9 @@ tailscale.com/cmd/derper dependencies: (generated by github.com/tailscale/depawa github.com/prometheus/client_model/go from github.com/prometheus/client_golang/prometheus+ github.com/prometheus/common/expfmt from github.com/prometheus/client_golang/prometheus+ github.com/prometheus/common/model from github.com/prometheus/client_golang/prometheus+ - 
LD github.com/prometheus/procfs from github.com/prometheus/client_golang/prometheus - LD github.com/prometheus/procfs/internal/fs from github.com/prometheus/procfs - LD github.com/prometheus/procfs/internal/util from github.com/prometheus/procfs + L github.com/prometheus/procfs from github.com/prometheus/client_golang/prometheus + L github.com/prometheus/procfs/internal/fs from github.com/prometheus/procfs + L github.com/prometheus/procfs/internal/util from github.com/prometheus/procfs W 💣 github.com/tailscale/go-winio from tailscale.com/safesocket W 💣 github.com/tailscale/go-winio/internal/fs from github.com/tailscale/go-winio W 💣 github.com/tailscale/go-winio/internal/socket from github.com/tailscale/go-winio @@ -72,7 +72,7 @@ tailscale.com/cmd/derper dependencies: (generated by github.com/tailscale/depawa google.golang.org/protobuf/reflect/protoregistry from google.golang.org/protobuf/encoding/prototext+ google.golang.org/protobuf/runtime/protoiface from google.golang.org/protobuf/internal/impl+ google.golang.org/protobuf/runtime/protoimpl from github.com/prometheus/client_model/go+ - google.golang.org/protobuf/types/known/timestamppb from github.com/prometheus/client_golang/prometheus+ + 💣 google.golang.org/protobuf/types/known/timestamppb from github.com/prometheus/client_golang/prometheus+ tailscale.com from tailscale.com/version 💣 tailscale.com/atomicfile from tailscale.com/cmd/derper+ tailscale.com/client/local from tailscale.com/derp/derpserver diff --git a/cmd/k8s-operator/depaware.txt b/cmd/k8s-operator/depaware.txt index 16ad089f3..c76a4236e 100644 --- a/cmd/k8s-operator/depaware.txt +++ b/cmd/k8s-operator/depaware.txt @@ -71,8 +71,9 @@ tailscale.com/cmd/k8s-operator dependencies: (generated by github.com/tailscale/ github.com/klauspost/compress/fse from github.com/klauspost/compress/huff0 github.com/klauspost/compress/huff0 from github.com/klauspost/compress/zstd github.com/klauspost/compress/internal/cpuinfo from github.com/klauspost/compress/huff0+ + 💣 github.com/klauspost/compress/internal/le from github.com/klauspost/compress/huff0+ github.com/klauspost/compress/internal/snapref from github.com/klauspost/compress/zstd - github.com/klauspost/compress/zstd from tailscale.com/util/zstdframe+ + github.com/klauspost/compress/zstd from tailscale.com/util/zstdframe github.com/klauspost/compress/zstd/internal/xxhash from github.com/klauspost/compress/zstd github.com/mailru/easyjson/buffer from github.com/mailru/easyjson/jwriter 💣 github.com/mailru/easyjson/jlexer from github.com/go-openapi/swag @@ -94,6 +95,7 @@ tailscale.com/cmd/k8s-operator dependencies: (generated by github.com/tailscale/ github.com/prometheus/client_golang/prometheus/collectors from sigs.k8s.io/controller-runtime/pkg/internal/controller/metrics+ github.com/prometheus/client_golang/prometheus/internal from github.com/prometheus/client_golang/prometheus+ github.com/prometheus/client_golang/prometheus/promhttp from sigs.k8s.io/controller-runtime/pkg/metrics/server+ + github.com/prometheus/client_golang/prometheus/promhttp/internal from github.com/prometheus/client_golang/prometheus/promhttp github.com/prometheus/client_model/go from github.com/prometheus/client_golang/prometheus+ github.com/prometheus/common/expfmt from github.com/prometheus/client_golang/prometheus+ github.com/prometheus/common/model from github.com/prometheus/client_golang/prometheus+ @@ -180,10 +182,10 @@ tailscale.com/cmd/k8s-operator dependencies: (generated by github.com/tailscale/ google.golang.org/protobuf/reflect/protoregistry from 
github.com/golang/protobuf/proto+ google.golang.org/protobuf/runtime/protoiface from github.com/golang/protobuf/proto+ google.golang.org/protobuf/runtime/protoimpl from github.com/golang/protobuf/proto+ - google.golang.org/protobuf/types/descriptorpb from github.com/google/gnostic-models/openapiv3+ - google.golang.org/protobuf/types/gofeaturespb from google.golang.org/protobuf/reflect/protodesc - google.golang.org/protobuf/types/known/anypb from github.com/google/gnostic-models/compiler+ - google.golang.org/protobuf/types/known/timestamppb from github.com/prometheus/client_golang/prometheus+ + 💣 google.golang.org/protobuf/types/descriptorpb from github.com/google/gnostic-models/openapiv3+ + 💣 google.golang.org/protobuf/types/gofeaturespb from google.golang.org/protobuf/reflect/protodesc + 💣 google.golang.org/protobuf/types/known/anypb from github.com/google/gnostic-models/compiler+ + 💣 google.golang.org/protobuf/types/known/timestamppb from github.com/prometheus/client_golang/prometheus+ gopkg.in/evanphx/json-patch.v4 from k8s.io/client-go/testing gopkg.in/inf.v0 from k8s.io/apimachinery/pkg/api/resource gopkg.in/yaml.v3 from github.com/go-openapi/swag+ diff --git a/cmd/stund/depaware.txt b/cmd/stund/depaware.txt index 7b3d05f94..7b945dd77 100644 --- a/cmd/stund/depaware.txt +++ b/cmd/stund/depaware.txt @@ -14,9 +14,9 @@ tailscale.com/cmd/stund dependencies: (generated by github.com/tailscale/depawar github.com/prometheus/client_model/go from github.com/prometheus/client_golang/prometheus+ github.com/prometheus/common/expfmt from github.com/prometheus/client_golang/prometheus+ github.com/prometheus/common/model from github.com/prometheus/client_golang/prometheus+ - LD github.com/prometheus/procfs from github.com/prometheus/client_golang/prometheus - LD github.com/prometheus/procfs/internal/fs from github.com/prometheus/procfs - LD github.com/prometheus/procfs/internal/util from github.com/prometheus/procfs + L github.com/prometheus/procfs from github.com/prometheus/client_golang/prometheus + L github.com/prometheus/procfs/internal/fs from github.com/prometheus/procfs + L github.com/prometheus/procfs/internal/util from github.com/prometheus/procfs 💣 go4.org/mem from tailscale.com/metrics+ go4.org/netipx from tailscale.com/net/tsaddr google.golang.org/protobuf/encoding/protodelim from github.com/prometheus/common/expfmt @@ -47,7 +47,7 @@ tailscale.com/cmd/stund dependencies: (generated by github.com/tailscale/depawar google.golang.org/protobuf/reflect/protoregistry from google.golang.org/protobuf/encoding/prototext+ google.golang.org/protobuf/runtime/protoiface from google.golang.org/protobuf/internal/impl+ google.golang.org/protobuf/runtime/protoimpl from github.com/prometheus/client_model/go+ - google.golang.org/protobuf/types/known/timestamppb from github.com/prometheus/client_golang/prometheus+ + 💣 google.golang.org/protobuf/types/known/timestamppb from github.com/prometheus/client_golang/prometheus+ tailscale.com from tailscale.com/version tailscale.com/envknob from tailscale.com/tsweb+ tailscale.com/feature from tailscale.com/tsweb diff --git a/cmd/tailscaled/depaware-min.txt b/cmd/tailscaled/depaware-min.txt index 3c111470f..69e6559a0 100644 --- a/cmd/tailscaled/depaware-min.txt +++ b/cmd/tailscaled/depaware-min.txt @@ -16,6 +16,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de github.com/klauspost/compress/fse from github.com/klauspost/compress/huff0 github.com/klauspost/compress/huff0 from github.com/klauspost/compress/zstd 
github.com/klauspost/compress/internal/cpuinfo from github.com/klauspost/compress/huff0+ + 💣 github.com/klauspost/compress/internal/le from github.com/klauspost/compress/huff0+ github.com/klauspost/compress/internal/snapref from github.com/klauspost/compress/zstd github.com/klauspost/compress/zstd from tailscale.com/util/zstdframe github.com/klauspost/compress/zstd/internal/xxhash from github.com/klauspost/compress/zstd diff --git a/cmd/tailscaled/depaware-minbox.txt b/cmd/tailscaled/depaware-minbox.txt index 40a1fb2a4..55a21c426 100644 --- a/cmd/tailscaled/depaware-minbox.txt +++ b/cmd/tailscaled/depaware-minbox.txt @@ -20,6 +20,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de github.com/klauspost/compress/fse from github.com/klauspost/compress/huff0 github.com/klauspost/compress/huff0 from github.com/klauspost/compress/zstd github.com/klauspost/compress/internal/cpuinfo from github.com/klauspost/compress/huff0+ + 💣 github.com/klauspost/compress/internal/le from github.com/klauspost/compress/huff0+ github.com/klauspost/compress/internal/snapref from github.com/klauspost/compress/zstd github.com/klauspost/compress/zstd from tailscale.com/util/zstdframe github.com/klauspost/compress/zstd/internal/xxhash from github.com/klauspost/compress/zstd diff --git a/cmd/tailscaled/depaware.txt b/cmd/tailscaled/depaware.txt index d15402092..79f92deb9 100644 --- a/cmd/tailscaled/depaware.txt +++ b/cmd/tailscaled/depaware.txt @@ -139,6 +139,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de github.com/klauspost/compress/fse from github.com/klauspost/compress/huff0 github.com/klauspost/compress/huff0 from github.com/klauspost/compress/zstd github.com/klauspost/compress/internal/cpuinfo from github.com/klauspost/compress/huff0+ + 💣 github.com/klauspost/compress/internal/le from github.com/klauspost/compress/huff0+ github.com/klauspost/compress/internal/snapref from github.com/klauspost/compress/zstd github.com/klauspost/compress/zstd from tailscale.com/util/zstdframe github.com/klauspost/compress/zstd/internal/xxhash from github.com/klauspost/compress/zstd diff --git a/cmd/tsidp/depaware.txt b/cmd/tsidp/depaware.txt index 14db7414a..5c6aae512 100644 --- a/cmd/tsidp/depaware.txt +++ b/cmd/tsidp/depaware.txt @@ -36,6 +36,7 @@ tailscale.com/cmd/tsidp dependencies: (generated by github.com/tailscale/depawar github.com/klauspost/compress/fse from github.com/klauspost/compress/huff0 github.com/klauspost/compress/huff0 from github.com/klauspost/compress/zstd github.com/klauspost/compress/internal/cpuinfo from github.com/klauspost/compress/huff0+ + 💣 github.com/klauspost/compress/internal/le from github.com/klauspost/compress/huff0+ github.com/klauspost/compress/internal/snapref from github.com/klauspost/compress/zstd github.com/klauspost/compress/zstd from tailscale.com/util/zstdframe github.com/klauspost/compress/zstd/internal/xxhash from github.com/klauspost/compress/zstd diff --git a/flake.nix b/flake.nix index c075bce0e..505061a76 100644 --- a/flake.nix +++ b/flake.nix @@ -151,5 +151,4 @@ }); }; } -# nix-direnv cache busting line: sha256-3jAfCtp714acePnwgdNto8Sj3vFwtpO9os6IwXQ07A4= - +# nix-direnv cache busting line: sha256-jJSSXMyUqcJoZuqfSlBsKDQezyqS+jDkRglMMjG1K8g= diff --git a/go.mod b/go.mod index e6baad0dc..a49a9724f 100644 --- a/go.mod +++ b/go.mod @@ -16,6 +16,7 @@ require ( github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.58 github.com/aws/aws-sdk-go-v2/service/s3 v1.75.3 github.com/aws/aws-sdk-go-v2/service/ssm v1.44.7 + 
github.com/bradfitz/go-tool-cache v0.0.0-20251113223507-0124e698e0bd github.com/bramvdbogaerde/go-scp v1.4.0 github.com/cilium/ebpf v0.15.0 github.com/coder/websocket v1.8.12 @@ -60,7 +61,7 @@ require ( github.com/jellydator/ttlcache/v3 v3.1.0 github.com/jsimonetti/rtnetlink v1.4.0 github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 - github.com/klauspost/compress v1.17.11 + github.com/klauspost/compress v1.18.0 github.com/kortschak/wol v0.0.0-20200729010619-da482cc4850a github.com/mattn/go-colorable v0.1.13 github.com/mattn/go-isatty v0.0.20 @@ -74,8 +75,8 @@ require ( github.com/pkg/errors v0.9.1 github.com/pkg/sftp v1.13.6 github.com/prometheus-community/pro-bing v0.4.0 - github.com/prometheus/client_golang v1.20.5 - github.com/prometheus/common v0.55.0 + github.com/prometheus/client_golang v1.23.0 + github.com/prometheus/common v0.65.0 github.com/prometheus/prometheus v0.49.2-0.20240125131847-c3b8ef1694ff github.com/safchain/ethtool v0.3.0 github.com/skip2/go-qrcode v0.0.0-20200617195104-da1b6568686e @@ -103,7 +104,7 @@ require ( go4.org/mem v0.0.0-20240501181205-ae6ca9944745 go4.org/netipx v0.0.0-20231129151722-fdeea329fbba golang.org/x/crypto v0.45.0 - golang.org/x/exp v0.0.0-20250210185358-939b2ce775ac + golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b golang.org/x/mod v0.30.0 golang.org/x/net v0.47.0 golang.org/x/oauth2 v0.30.0 @@ -355,8 +356,8 @@ require ( github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/polyfloyd/go-errorlint v1.4.8 // indirect - github.com/prometheus/client_model v0.6.1 - github.com/prometheus/procfs v0.15.1 // indirect + github.com/prometheus/client_model v0.6.2 + github.com/prometheus/procfs v0.16.1 // indirect github.com/quasilyte/go-ruleguard v0.4.2 // indirect github.com/quasilyte/gogrep v0.5.0 // indirect github.com/quasilyte/regex/syntax v0.0.0-20210819130434-b3f0c404a727 // indirect @@ -414,7 +415,7 @@ require ( golang.org/x/image v0.27.0 // indirect golang.org/x/text v0.31.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect - google.golang.org/protobuf v1.36.3 // indirect + google.golang.org/protobuf v1.36.6 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/ini.v1 v1.67.0 // indirect gopkg.in/warnings.v0 v0.1.2 // indirect diff --git a/go.mod.sri b/go.mod.sri index 737ea7d2b..66422652e 100644 --- a/go.mod.sri +++ b/go.mod.sri @@ -1 +1 @@ -sha256-3jAfCtp714acePnwgdNto8Sj3vFwtpO9os6IwXQ07A4= +sha256-jJSSXMyUqcJoZuqfSlBsKDQezyqS+jDkRglMMjG1K8g= diff --git a/go.sum b/go.sum index 1106932f2..f70fe9159 100644 --- a/go.sum +++ b/go.sum @@ -186,6 +186,8 @@ github.com/boltdb/bolt v1.3.1 h1:JQmyP4ZBrce+ZQu0dY660FMfatumYDLun9hBCUVIkF4= github.com/boltdb/bolt v1.3.1/go.mod h1:clJnj/oiGkjum5o1McbSZDSLxVThjynRyGBgiAx27Ps= github.com/bombsimon/wsl/v4 v4.2.1 h1:Cxg6u+XDWff75SIFFmNsqnIOgob+Q9hG6y/ioKbRFiM= github.com/bombsimon/wsl/v4 v4.2.1/go.mod h1:Xu/kDxGZTofQcDGCtQe9KCzhHphIe0fDuyWTxER9Feo= +github.com/bradfitz/go-tool-cache v0.0.0-20251113223507-0124e698e0bd h1:1Df3FBmfyUCIQ4eKzAPXIWTfewY89L0fWPWO56zWCyI= +github.com/bradfitz/go-tool-cache v0.0.0-20251113223507-0124e698e0bd/go.mod h1:2+xptBAd0m2kZ1wLO4AYZhldLEFPy+KeGwmnlXLvy+w= github.com/bramvdbogaerde/go-scp v1.4.0 h1:jKMwpwCbcX1KyvDbm/PDJuXcMuNVlLGi0Q0reuzjyKY= github.com/bramvdbogaerde/go-scp v1.4.0/go.mod h1:on2aH5AxaFb2G0N5Vsdy6B0Ml7k9HuHSwfo1y0QzAbQ= github.com/breml/bidichk v0.2.7 h1:dAkKQPLl/Qrk7hnP6P+E0xOodrq8Us7+U0o4UBOAlQY= @@ -662,8 +664,8 @@ 
github.com/kisielk/errcheck v1.7.0/go.mod h1:1kLL+jV4e+CFfueBmI1dSK2ADDyQnlrnrY/ github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/kkHAIKE/contextcheck v1.1.4 h1:B6zAaLhOEEcjvUgIYEqystmnFk1Oemn8bvJhbt0GMb8= github.com/kkHAIKE/contextcheck v1.1.4/go.mod h1:1+i/gWqokIa+dm31mqGLZhZJ7Uh44DJGZVmr6QRBNJg= -github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc= -github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= +github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= +github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= github.com/klauspost/pgzip v1.2.6 h1:8RXeL5crjEUFnR2/Sn6GJNWtSQ3Dk8pq4CL3jvdDyjU= github.com/klauspost/pgzip v1.2.6/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= @@ -840,29 +842,29 @@ github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP github.com/prometheus/client_golang v1.11.0/go.mod h1:Z6t4BnS23TR94PD6BsDNk8yVqroYurpAkEiz0P2BEV0= github.com/prometheus/client_golang v1.11.1/go.mod h1:Z6t4BnS23TR94PD6BsDNk8yVqroYurpAkEiz0P2BEV0= github.com/prometheus/client_golang v1.12.1/go.mod h1:3Z9XVyYiZYEO+YQWt3RD2R3jrbd179Rt297l4aS6nDY= -github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y= -github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= +github.com/prometheus/client_golang v1.23.0 h1:ust4zpdl9r4trLY/gSjlm07PuiBq2ynaXXlptpfy8Uc= +github.com/prometheus/client_golang v1.23.0/go.mod h1:i/o0R9ByOnHX0McrTMTyhYvKE4haaf2mW08I+jGAjEE= github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= -github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= -github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= github.com/prometheus/common v0.9.1/go.mod h1:yhUN8i9wzaXS3w1O07YhxHEBxD+W35wd8bs7vj7HSQ4= github.com/prometheus/common v0.10.0/go.mod h1:Tlit/dnDKsSWFlCLTWaA1cyBgKHSMdTB80sz/V91rCo= github.com/prometheus/common v0.26.0/go.mod h1:M7rCNAaPfAosfx8veZJCuw84e35h3Cfd9VFqTh1DIvc= github.com/prometheus/common v0.32.1/go.mod h1:vu+V0TpY+O6vW9J44gczi3Ap/oXXR10b+M/gUGO4Hls= -github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc= -github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8= +github.com/prometheus/common v0.65.0 h1:QDwzd+G1twt//Kwj/Ww6E9FQq1iVMmODnILtW1t2VzE= +github.com/prometheus/common v0.65.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8= github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod 
h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+GxbHq6oeK9A= github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU= github.com/prometheus/procfs v0.6.0/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= github.com/prometheus/procfs v0.7.3/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= -github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= -github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= +github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= +github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= github.com/prometheus/prometheus v0.49.2-0.20240125131847-c3b8ef1694ff h1:X1Tly81aZ22DA1fxBdfvR3iw8+yFoUBUHMEd+AX/ZXI= github.com/prometheus/prometheus v0.49.2-0.20240125131847-c3b8ef1694ff/go.mod h1:FvE8dtQ1Ww63IlyKBn1V4s+zMwF9kHkVNkQBR1pM4CU= github.com/puzpuzpuz/xsync v1.5.2 h1:yRAP4wqSOZG+/4pxJ08fPTwrfL0IzE/LKQ/cw509qGY= @@ -1140,8 +1142,8 @@ golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u0 golang.org/x/exp v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= -golang.org/x/exp v0.0.0-20250210185358-939b2ce775ac h1:l5+whBCLH3iH2ZNHYLbAe58bo7yrN4mVcnkHDYz5vvs= -golang.org/x/exp v0.0.0-20250210185358-939b2ce775ac/go.mod h1:hH+7mtFmImwwcMvScyxUhjuVHR3HGaDPMn9rMSUUbxo= +golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b h1:M2rDM6z3Fhozi9O7NWsxAkg/yqS/lQJ6PmkyIV3YP+o= +golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b/go.mod h1:3//PLf8L/X+8b4vuAfHzxeRUl04Adcb341+IGKfnqS8= golang.org/x/exp/typeparams v0.0.0-20220428152302-39d4317da171/go.mod h1:AbB0pIl9nAr9wVwH+Z2ZpaocVmF5I4GyWCDIsVjR0bk= golang.org/x/exp/typeparams v0.0.0-20230203172020-98cc5a0785f9/go.mod h1:AbB0pIl9nAr9wVwH+Z2ZpaocVmF5I4GyWCDIsVjR0bk= golang.org/x/exp/typeparams v0.0.0-20240314144324-c7f7c6466f7f h1:phY1HzDcf18Aq9A8KkmRtY9WvOFIxN8wgfvy6Zm1DV8= @@ -1498,8 +1500,8 @@ google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGj google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.36.3 h1:82DV7MYdb8anAVi3qge1wSnMDrnKK7ebr+I0hHRN1BU= -google.golang.org/protobuf v1.36.3/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= +google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= +google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/shell.nix b/shell.nix index 8554b9258..d412693d9 100644 --- a/shell.nix +++ b/shell.nix @@ -16,4 +16,4 @@ ) { src = ./.; 
}).shellNix -# nix-direnv cache busting line: sha256-3jAfCtp714acePnwgdNto8Sj3vFwtpO9os6IwXQ07A4= +# nix-direnv cache busting line: sha256-jJSSXMyUqcJoZuqfSlBsKDQezyqS+jDkRglMMjG1K8g= diff --git a/tsnet/depaware.txt b/tsnet/depaware.txt index 7d5ec0a60..825a39e34 100644 --- a/tsnet/depaware.txt +++ b/tsnet/depaware.txt @@ -36,6 +36,7 @@ tailscale.com/tsnet dependencies: (generated by github.com/tailscale/depaware) github.com/klauspost/compress/fse from github.com/klauspost/compress/huff0 github.com/klauspost/compress/huff0 from github.com/klauspost/compress/zstd github.com/klauspost/compress/internal/cpuinfo from github.com/klauspost/compress/huff0+ + 💣 github.com/klauspost/compress/internal/le from github.com/klauspost/compress/huff0+ github.com/klauspost/compress/internal/snapref from github.com/klauspost/compress/zstd github.com/klauspost/compress/zstd from tailscale.com/util/zstdframe github.com/klauspost/compress/zstd/internal/xxhash from github.com/klauspost/compress/zstd From 755309c04eae75e4dda61b79042a4ca1112b5a45 Mon Sep 17 00:00:00 2001 From: Jordan Whited Date: Thu, 20 Nov 2025 19:33:18 -0800 Subject: [PATCH 15/33] net/udprelay: use blake2s-256 MAC for handshake challenge This commit replaces crypto/rand challenge generation with a blake2s-256 MAC. This enables the peer relay server to respond to multiple forward disco.BindUDPRelayEndpoint messages per handshake generation without sacrificing the proof of IP ownership properties of the handshake. Responding to multiple forward disco.BindUDPRelayEndpoint messages per handshake generation improves client address/path selection where lowest client->server path/addr one-way delay does not necessarily equate to lowest client<->server round-trip delay. It also improves situations where outbound traffic is filtered independently of input, and the first reply disco.BindUDPRelayEndpointChallenge message is dropped on the reply path, but a later reply using a different source would make it through. Reduction in serverEndpoint state saves 112 bytes per instance, trading for slightly more expensive crypto ops: from 277ns/op to 321ns/op on an M1 MacBook Pro.
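A simplified sketch of the idea (not this patch's exact input layout, which MACs the VNI, generation, remote disco key, and source ip:port): the server stores only the keyed secrets, recomputes the blake2s-256 MAC when a challenge answer arrives, and accepts a match under either the current or the previous secret so that a rotation mid-handshake still verifies:

package main

import (
	"bytes"
	"crypto/rand"
	"fmt"

	"golang.org/x/crypto/blake2s"
)

// mac computes a keyed blake2s-256 MAC over msg.
func mac(secret [blake2s.Size]byte, msg []byte) [blake2s.Size]byte {
	h, err := blake2s.New256(secret[:])
	if err != nil {
		panic(err) // only possible for a bad key size
	}
	h.Write(msg)
	var out [blake2s.Size]byte
	h.Sum(out[:0])
	return out
}

func main() {
	var cur, prev [blake2s.Size]byte
	rand.Read(cur[:])
	rand.Read(prev[:])

	// Stand-in for (VNI, generation, remote key, src addr:port).
	msg := []byte("vni|generation|remote-key|src")

	// Issuing the challenge stores nothing per endpoint.
	challenge := mac(cur, msg)

	// Verifying the answer: recompute under each live secret.
	bound := false
	for _, s := range [][blake2s.Size]byte{cur, prev} {
		if m := mac(s, msg); bytes.Equal(m[:], challenge[:]) {
			bound = true
			break
		}
	}
	fmt.Println("bound:", bound) // true
}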
Updates tailscale/corp#34414 Signed-off-by: Jordan Whited --- net/udprelay/server.go | 168 ++++++++++++++++++++++++------------ net/udprelay/server_test.go | 116 +++++++++++++++++++++++++ 2 files changed, 227 insertions(+), 57 deletions(-) diff --git a/net/udprelay/server.go b/net/udprelay/server.go index 7138cec7a..b260955e0 100644 --- a/net/udprelay/server.go +++ b/net/udprelay/server.go @@ -10,6 +10,7 @@ import ( "bytes" "context" "crypto/rand" + "encoding/binary" "errors" "fmt" "net" @@ -20,6 +21,7 @@ import ( "time" "go4.org/mem" + "golang.org/x/crypto/blake2s" "golang.org/x/net/ipv6" "tailscale.com/disco" "tailscale.com/net/batching" @@ -73,7 +75,9 @@ type Server struct { closeCh chan struct{} netChecker *netcheck.Client - mu sync.Mutex // guards the following fields + mu sync.Mutex // guards the following fields + macSecrets [][blake2s.Size]byte // [0] is most recent, max 2 elements + macSecretRotatedAt time.Time derpMap *tailcfg.DERPMap onlyStaticAddrPorts bool // no dynamic addr port discovery when set staticAddrPorts views.Slice[netip.AddrPort] // static ip:port pairs set with [Server.SetStaticAddrPorts] @@ -85,6 +89,8 @@ type Server struct { byDisco map[key.SortedPairOfDiscoPublic]*serverEndpoint } +const macSecretRotationInterval = time.Minute * 2 + const ( minVNI = uint32(1) maxVNI = uint32(1<<24 - 1) @@ -98,22 +104,42 @@ type serverEndpoint struct { // indexing of this array aligns with the following fields, e.g. // discoSharedSecrets[0] is the shared secret to use when sealing // Disco protocol messages for transmission towards discoPubKeys[0]. - discoPubKeys key.SortedPairOfDiscoPublic - discoSharedSecrets [2]key.DiscoShared - handshakeGeneration [2]uint32 // or zero if a handshake has never started for that relay leg - handshakeAddrPorts [2]netip.AddrPort // or zero value if a handshake has never started for that relay leg - boundAddrPorts [2]netip.AddrPort // or zero value if a handshake has never completed for that relay leg - lastSeen [2]time.Time // TODO(jwhited): consider using mono.Time - challenge [2][disco.BindUDPRelayChallengeLen]byte - packetsRx [2]uint64 // num packets received from/sent by each client after they are bound - bytesRx [2]uint64 // num bytes received from/sent by each client after they are bound + discoPubKeys key.SortedPairOfDiscoPublic + discoSharedSecrets [2]key.DiscoShared + inProgressGeneration [2]uint32 // or zero if a handshake has never started, or has just completed + boundAddrPorts [2]netip.AddrPort // or zero value if a handshake has never completed for that relay leg + lastSeen [2]time.Time // TODO(jwhited): consider using mono.Time + packetsRx [2]uint64 // num packets received from/sent by each client after they are bound + bytesRx [2]uint64 // num bytes received from/sent by each client after they are bound lamportID uint64 vni uint32 allocatedAt time.Time } -func (e *serverEndpoint) handleDiscoControlMsg(from netip.AddrPort, senderIndex int, discoMsg disco.Message, serverDisco key.DiscoPublic) (write []byte, to netip.AddrPort) { +func blakeMACFromBindMsg(blakeKey [blake2s.Size]byte, src netip.AddrPort, msg disco.BindUDPRelayEndpointCommon) ([blake2s.Size]byte, error) { + input := make([]byte, 8, 4+4+32+18) // vni + generation + invited party disco key + addr:port + binary.BigEndian.PutUint32(input[0:4], msg.VNI) + binary.BigEndian.PutUint32(input[4:8], msg.Generation) + input = msg.RemoteKey.AppendTo(input) + input, err := src.AppendBinary(input) + if err != nil { + return [blake2s.Size]byte{}, err + } + h, err := 
blake2s.New256(blakeKey[:]) + if err != nil { + return [blake2s.Size]byte{}, err + } + _, err = h.Write(input) + if err != nil { + return [blake2s.Size]byte{}, err + } + var out [blake2s.Size]byte + h.Sum(out[:0]) + return out, nil +} + +func (e *serverEndpoint) handleDiscoControlMsg(from netip.AddrPort, senderIndex int, discoMsg disco.Message, serverDisco key.DiscoPublic, macSecrets [][blake2s.Size]byte) (write []byte, to netip.AddrPort) { if senderIndex != 0 && senderIndex != 1 { return nil, netip.AddrPort{} } @@ -144,18 +170,11 @@ func (e *serverEndpoint) handleDiscoControlMsg(from netip.AddrPort, senderIndex // Generation must be nonzero, silently drop return nil, netip.AddrPort{} } - if e.handshakeGeneration[senderIndex] == discoMsg.Generation { - // we've seen this generation before, silently drop - return nil, netip.AddrPort{} - } - e.handshakeGeneration[senderIndex] = discoMsg.Generation - e.handshakeAddrPorts[senderIndex] = from + e.inProgressGeneration[senderIndex] = discoMsg.Generation m := new(disco.BindUDPRelayEndpointChallenge) m.VNI = e.vni m.Generation = discoMsg.Generation m.RemoteKey = e.discoPubKeys.Get()[otherSender] - rand.Read(e.challenge[senderIndex][:]) - copy(m.Challenge[:], e.challenge[senderIndex][:]) reply := make([]byte, packet.GeneveFixedHeaderLength, 512) gh := packet.GeneveHeader{Control: true, Protocol: packet.GeneveProtocolDisco} gh.VNI.Set(e.vni) @@ -165,6 +184,11 @@ func (e *serverEndpoint) handleDiscoControlMsg(from netip.AddrPort, senderIndex } reply = append(reply, disco.Magic...) reply = serverDisco.AppendTo(reply) + mac, err := blakeMACFromBindMsg(macSecrets[0], from, m.BindUDPRelayEndpointCommon) + if err != nil { + return nil, netip.AddrPort{} + } + m.Challenge = mac box := e.discoSharedSecrets[senderIndex].Seal(m.AppendMarshal(nil)) reply = append(reply, box...) return reply, from @@ -174,17 +198,29 @@ func (e *serverEndpoint) handleDiscoControlMsg(from netip.AddrPort, senderIndex // silently drop return nil, netip.AddrPort{} } - generation := e.handshakeGeneration[senderIndex] - if generation == 0 || // we have no active handshake - generation != discoMsg.Generation || // mismatching generation for the active handshake - e.handshakeAddrPorts[senderIndex] != from || // mismatching source for the active handshake - !bytes.Equal(e.challenge[senderIndex][:], discoMsg.Challenge[:]) { // mismatching answer for the active handshake + generation := e.inProgressGeneration[senderIndex] + if generation == 0 || // we have no in-progress handshake + generation != discoMsg.Generation { // mismatching generation for the in-progress handshake // silently drop return nil, netip.AddrPort{} } - // Handshake complete. Update the binding for this sender. - e.boundAddrPorts[senderIndex] = from - e.lastSeen[senderIndex] = time.Now() // record last seen as bound time + for _, macSecret := range macSecrets { + mac, err := blakeMACFromBindMsg(macSecret, from, discoMsg.BindUDPRelayEndpointCommon) + if err != nil { + // silently drop + return nil, netip.AddrPort{} + } + // Speed is favored over constant-time comparison here. The sender is + // already authenticated via disco. + if bytes.Equal(mac[:], discoMsg.Challenge[:]) { + // Handshake complete. Update the binding for this sender. 
+ e.boundAddrPorts[senderIndex] = from + e.lastSeen[senderIndex] = time.Now() // record last seen as bound time + e.inProgressGeneration[senderIndex] = 0 // reset to zero, which indicates there is no in-progress handshake + return nil, netip.AddrPort{} + } + } + // MAC does not match, silently drop return nil, netip.AddrPort{} default: // unexpected message types, silently drop @@ -192,7 +228,7 @@ func (e *serverEndpoint) handleDiscoControlMsg(from netip.AddrPort, senderIndex } } -func (e *serverEndpoint) handleSealedDiscoControlMsg(from netip.AddrPort, b []byte, serverDisco key.DiscoPublic) (write []byte, to netip.AddrPort) { +func (e *serverEndpoint) handleSealedDiscoControlMsg(from netip.AddrPort, b []byte, serverDisco key.DiscoPublic, macSecrets [][blake2s.Size]byte) (write []byte, to netip.AddrPort) { senderRaw, isDiscoMsg := disco.Source(b) if !isDiscoMsg { // Not a Disco message @@ -223,39 +259,29 @@ func (e *serverEndpoint) handleSealedDiscoControlMsg(from netip.AddrPort, b []by return nil, netip.AddrPort{} } - return e.handleDiscoControlMsg(from, senderIndex, discoMsg, serverDisco) + return e.handleDiscoControlMsg(from, senderIndex, discoMsg, serverDisco, macSecrets) } -func (e *serverEndpoint) handlePacket(from netip.AddrPort, gh packet.GeneveHeader, b []byte, serverDisco key.DiscoPublic) (write []byte, to netip.AddrPort) { - if !gh.Control { - if !e.isBound() { - // not a control packet, but serverEndpoint isn't bound - return nil, netip.AddrPort{} - } - switch { - case from == e.boundAddrPorts[0]: - e.lastSeen[0] = time.Now() - e.packetsRx[0]++ - e.bytesRx[0] += uint64(len(b)) - return b, e.boundAddrPorts[1] - case from == e.boundAddrPorts[1]: - e.lastSeen[1] = time.Now() - e.packetsRx[1]++ - e.bytesRx[1] += uint64(len(b)) - return b, e.boundAddrPorts[0] - default: - // unrecognized source - return nil, netip.AddrPort{} - } +func (e *serverEndpoint) handleDataPacket(from netip.AddrPort, b []byte, now time.Time) (write []byte, to netip.AddrPort) { + if !e.isBound() { + // not a control packet, but serverEndpoint isn't bound + return nil, netip.AddrPort{} } - - if gh.Protocol != packet.GeneveProtocolDisco { - // control packet, but not Disco + switch { + case from == e.boundAddrPorts[0]: + e.lastSeen[0] = now + e.packetsRx[0]++ + e.bytesRx[0] += uint64(len(b)) + return b, e.boundAddrPorts[1] + case from == e.boundAddrPorts[1]: + e.lastSeen[1] = now + e.packetsRx[1]++ + e.bytesRx[1] += uint64(len(b)) + return b, e.boundAddrPorts[0] + default: + // unrecognized source return nil, netip.AddrPort{} } - - msg := b[packet.GeneveFixedHeaderLength:] - return e.handleSealedDiscoControlMsg(from, msg, serverDisco) } func (e *serverEndpoint) isExpired(now time.Time, bindLifetime, steadyStateLifetime time.Duration) bool { @@ -621,7 +647,35 @@ func (s *Server) handlePacket(from netip.AddrPort, b []byte) (write []byte, to n return nil, netip.AddrPort{} } - return e.handlePacket(from, gh, b, s.discoPublic) + now := time.Now() + if gh.Control { + if gh.Protocol != packet.GeneveProtocolDisco { + // control packet, but not Disco + return nil, netip.AddrPort{} + } + msg := b[packet.GeneveFixedHeaderLength:] + s.maybeRotateMACSecretLocked(now) + return e.handleSealedDiscoControlMsg(from, msg, s.discoPublic, s.macSecrets) + } + return e.handleDataPacket(from, b, now) +} + +func (s *Server) maybeRotateMACSecretLocked(now time.Time) { + if !s.macSecretRotatedAt.IsZero() && now.Sub(s.macSecretRotatedAt) < macSecretRotationInterval { + return + } + switch len(s.macSecrets) { + case 0: + s.macSecrets = 
make([][blake2s.Size]byte, 1, 2) + case 1: + s.macSecrets = append(s.macSecrets, [blake2s.Size]byte{}) + fallthrough + case 2: + s.macSecrets[1] = s.macSecrets[0] + } + rand.Read(s.macSecrets[0][:]) + s.macSecretRotatedAt = now + return } func (s *Server) packetReadLoop(readFromSocket, otherSocket batching.Conn, readFromSocketIsIPv4 bool) { diff --git a/net/udprelay/server_test.go b/net/udprelay/server_test.go index 6c3d61658..582d4cf67 100644 --- a/net/udprelay/server_test.go +++ b/net/udprelay/server_test.go @@ -5,6 +5,7 @@ package udprelay import ( "bytes" + "crypto/rand" "net" "net/netip" "testing" @@ -14,6 +15,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" "go4.org/mem" + "golang.org/x/crypto/blake2s" "tailscale.com/disco" "tailscale.com/net/packet" "tailscale.com/types/key" @@ -352,3 +354,117 @@ func TestServer_getNextVNILocked(t *testing.T) { _, err = s.getNextVNILocked() c.Assert(err, qt.IsNil) } + +func Test_blakeMACFromBindMsg(t *testing.T) { + var macSecret [blake2s.Size]byte + rand.Read(macSecret[:]) + src := netip.MustParseAddrPort("[2001:db8::1]:7") + + msgA := disco.BindUDPRelayEndpointCommon{ + VNI: 1, + Generation: 1, + RemoteKey: key.NewDisco().Public(), + Challenge: [32]byte{}, + } + macA, err := blakeMACFromBindMsg(macSecret, src, msgA) + if err != nil { + t.Fatal(err) + } + + msgB := msgA + msgB.VNI++ + macB, err := blakeMACFromBindMsg(macSecret, src, msgB) + if err != nil { + t.Fatal(err) + } + if macA == macB { + t.Fatalf("varying VNI input produced identical mac: %v", macA) + } + + msgC := msgA + msgC.Generation++ + macC, err := blakeMACFromBindMsg(macSecret, src, msgC) + if err != nil { + t.Fatal(err) + } + if macA == macC { + t.Fatalf("varying Generation input produced identical mac: %v", macA) + } + + msgD := msgA + msgD.RemoteKey = key.NewDisco().Public() + macD, err := blakeMACFromBindMsg(macSecret, src, msgD) + if err != nil { + t.Fatal(err) + } + if macA == macD { + t.Fatalf("varying RemoteKey input produced identical mac: %v", macA) + } + + msgE := msgA + msgE.Challenge = [32]byte{0x01} // challenge is not part of the MAC and should be ignored + macE, err := blakeMACFromBindMsg(macSecret, src, msgE) + if err != nil { + t.Fatal(err) + } + if macA != macE { + t.Fatalf("varying Challenge input produced varying mac: %v", macA) + } + + macSecretB := macSecret + macSecretB[0] ^= 0xFF + macF, err := blakeMACFromBindMsg(macSecretB, src, msgA) + if err != nil { + t.Fatal(err) + } + if macA == macF { + t.Fatalf("varying macSecret input produced identical mac: %v", macA) + } + + srcB := netip.AddrPortFrom(src.Addr(), src.Port()+1) + macG, err := blakeMACFromBindMsg(macSecret, srcB, msgA) + if err != nil { + t.Fatal(err) + } + if macA == macG { + t.Fatalf("varying src input produced identical mac: %v", macA) + } +} + +func Benchmark_blakeMACFromBindMsg(b *testing.B) { + var macSecret [blake2s.Size]byte + rand.Read(macSecret[:]) + src := netip.MustParseAddrPort("[2001:db8::1]:7") + msg := disco.BindUDPRelayEndpointCommon{ + VNI: 1, + Generation: 1, + RemoteKey: key.NewDisco().Public(), + Challenge: [32]byte{}, + } + b.ReportAllocs() + for b.Loop() { + _, err := blakeMACFromBindMsg(macSecret, src, msg) + if err != nil { + b.Fatal(err) + } + } +} + +func TestServer_maybeRotateMACSecretLocked(t *testing.T) { + s := &Server{} + start := time.Now() + s.maybeRotateMACSecretLocked(start) + qt.Assert(t, len(s.macSecrets), qt.Equals, 1) + macSecret := s.macSecrets[0] + s.maybeRotateMACSecretLocked(start.Add(macSecretRotationInterval - 
time.Nanosecond)) + qt.Assert(t, len(s.macSecrets), qt.Equals, 1) + qt.Assert(t, s.macSecrets[0], qt.Equals, macSecret) + s.maybeRotateMACSecretLocked(start.Add(macSecretRotationInterval)) + qt.Assert(t, len(s.macSecrets), qt.Equals, 2) + qt.Assert(t, s.macSecrets[1], qt.Equals, macSecret) + qt.Assert(t, s.macSecrets[0], qt.Not(qt.Equals), s.macSecrets[1]) + s.maybeRotateMACSecretLocked(s.macSecretRotatedAt.Add(macSecretRotationInterval)) + qt.Assert(t, macSecret, qt.Not(qt.Equals), s.macSecrets[0]) + qt.Assert(t, macSecret, qt.Not(qt.Equals), s.macSecrets[1]) + qt.Assert(t, s.macSecrets[0], qt.Not(qt.Equals), s.macSecrets[1]) +} From 7426eca163354e3f9b400c9dff9ad4c9be5c2d03 Mon Sep 17 00:00:00 2001 From: Jordan Whited Date: Fri, 21 Nov 2025 15:29:25 -0800 Subject: [PATCH 16/33] cmd/tailscale,feature/relayserver,ipn: add relay-server-static-endpoints set flag Updates tailscale/corp#31489 Updates #17791 Signed-off-by: Jordan Whited --- cmd/tailscale/cli/set.go | 65 +++++++++++++------- cmd/tailscale/cli/up.go | 1 + feature/relayserver/relayserver.go | 25 ++++++-- feature/relayserver/relayserver_test.go | 82 ++++++++++++++++++++++--- ipn/ipn_clone.go | 70 +++++++++++---------- ipn/ipn_view.go | 76 +++++++++++++---------- ipn/prefs.go | 76 +++++++++++++---------- ipn/prefs_test.go | 21 +++++++ 8 files changed, 278 insertions(+), 138 deletions(-) diff --git a/cmd/tailscale/cli/set.go b/cmd/tailscale/cli/set.go index cb3a07a6f..c2316580c 100644 --- a/cmd/tailscale/cli/set.go +++ b/cmd/tailscale/cli/set.go @@ -11,6 +11,7 @@ import ( "net/netip" "os/exec" "runtime" + "slices" "strconv" "strings" @@ -25,6 +26,7 @@ import ( "tailscale.com/types/opt" "tailscale.com/types/ptr" "tailscale.com/types/views" + "tailscale.com/util/set" "tailscale.com/version" ) @@ -43,29 +45,30 @@ Only settings explicitly mentioned will be set. 
There are no default values.`, } type setArgsT struct { - acceptRoutes bool - acceptDNS bool - exitNodeIP string - exitNodeAllowLANAccess bool - shieldsUp bool - runSSH bool - runWebClient bool - hostname string - advertiseRoutes string - advertiseDefaultRoute bool - advertiseConnector bool - opUser string - acceptedRisks string - profileName string - forceDaemon bool - updateCheck bool - updateApply bool - reportPosture bool - snat bool - statefulFiltering bool - sync bool - netfilterMode string - relayServerPort string + acceptRoutes bool + acceptDNS bool + exitNodeIP string + exitNodeAllowLANAccess bool + shieldsUp bool + runSSH bool + runWebClient bool + hostname string + advertiseRoutes string + advertiseDefaultRoute bool + advertiseConnector bool + opUser string + acceptedRisks string + profileName string + forceDaemon bool + updateCheck bool + updateApply bool + reportPosture bool + snat bool + statefulFiltering bool + sync bool + netfilterMode string + relayServerPort string + relayServerStaticEndpoints string } func newSetFlagSet(goos string, setArgs *setArgsT) *flag.FlagSet { @@ -88,6 +91,7 @@ func newSetFlagSet(goos string, setArgs *setArgsT) *flag.FlagSet { setf.BoolVar(&setArgs.runWebClient, "webclient", false, "expose the web interface for managing this node over Tailscale at port 5252") setf.BoolVar(&setArgs.sync, "sync", false, hidden+"actively sync configuration from the control plane (set to false only for network failure testing)") setf.StringVar(&setArgs.relayServerPort, "relay-server-port", "", "UDP port number (0 will pick a random unused port) for the relay server to bind to, on all interfaces, or empty string to disable relay server functionality") + setf.StringVar(&setArgs.relayServerStaticEndpoints, "relay-server-static-endpoints", "", "static IP:port endpoints to advertise as candidates for relay connections (comma-separated, e.g. 
\"[2001:db8::1]:40000,192.0.2.1:40000\") or empty string to not advertise any static endpoints") ffcomplete.Flag(setf, "exit-node", func(args []string) ([]string, ffcomplete.ShellCompDirective, error) { st, err := localClient.Status(context.Background()) @@ -248,6 +252,21 @@ func runSet(ctx context.Context, args []string) (retErr error) { maskedPrefs.Prefs.RelayServerPort = ptr.To(int(uport)) } + if setArgs.relayServerStaticEndpoints != "" { + endpointsSet := make(set.Set[netip.AddrPort]) + endpointsSplit := strings.Split(setArgs.relayServerStaticEndpoints, ",") + for _, s := range endpointsSplit { + ap, err := netip.ParseAddrPort(s) + if err != nil { + return fmt.Errorf("failed to set relay server static endpoints: %q is not a valid IP:port", s) + } + endpointsSet.Add(ap) + } + endpoints := endpointsSet.Slice() + slices.SortFunc(endpoints, netip.AddrPort.Compare) + maskedPrefs.Prefs.RelayServerStaticEndpoints = endpoints + } + checkPrefs := curPrefs.Clone() checkPrefs.ApplyEdits(maskedPrefs) if err := localClient.CheckPrefs(ctx, checkPrefs); err != nil { diff --git a/cmd/tailscale/cli/up.go b/cmd/tailscale/cli/up.go index 7f5b2e6b4..72515400d 100644 --- a/cmd/tailscale/cli/up.go +++ b/cmd/tailscale/cli/up.go @@ -887,6 +887,7 @@ func init() { addPrefFlagMapping("report-posture", "PostureChecking") addPrefFlagMapping("relay-server-port", "RelayServerPort") addPrefFlagMapping("sync", "Sync") + addPrefFlagMapping("relay-server-static-endpoints", "RelayServerStaticEndpoints") } func addPrefFlagMapping(flagName string, prefNames ...string) { diff --git a/feature/relayserver/relayserver.go b/feature/relayserver/relayserver.go index b7457210f..e85576e50 100644 --- a/feature/relayserver/relayserver.go +++ b/feature/relayserver/relayserver.go @@ -9,6 +9,7 @@ import ( "encoding/json" "fmt" "net/http" + "net/netip" "tailscale.com/disco" "tailscale.com/feature" @@ -23,6 +24,7 @@ import ( "tailscale.com/types/key" "tailscale.com/types/logger" "tailscale.com/types/ptr" + "tailscale.com/types/views" "tailscale.com/util/eventbus" "tailscale.com/wgengine/magicsock" ) @@ -85,6 +87,7 @@ type relayServer interface { AllocateEndpoint(discoA, discoB key.DiscoPublic) (endpoint.ServerEndpoint, error) GetSessions() []status.ServerSession SetDERPMapView(tailcfg.DERPMapView) + SetStaticAddrPorts(addrPorts views.Slice[netip.AddrPort]) } // extension is an [ipnext.Extension] managing the relay server on platforms @@ -95,12 +98,13 @@ type extension struct { ec *eventbus.Client respPub *eventbus.Publisher[magicsock.UDPRelayAllocResp] - mu syncs.Mutex // guards the following fields - shutdown bool // true if Shutdown() has been called - rs relayServer // nil when disabled - port *int // ipn.Prefs.RelayServerPort, nil if disabled - derpMapView tailcfg.DERPMapView // latest seen over the eventbus - hasNodeAttrDisableRelayServer bool // [tailcfg.NodeAttrDisableRelayServer] + mu syncs.Mutex // guards the following fields + shutdown bool // true if Shutdown() has been called + rs relayServer // nil when disabled + port *int // ipn.Prefs.RelayServerPort, nil if disabled + staticEndpoints views.Slice[netip.AddrPort] // ipn.Prefs.RelayServerStaticEndpoints + derpMapView tailcfg.DERPMapView // latest seen over the eventbus + hasNodeAttrDisableRelayServer bool // [tailcfg.NodeAttrDisableRelayServer] } // Name implements [ipnext.Extension]. @@ -186,6 +190,7 @@ func (e *extension) relayServerShouldBeRunningLocked() bool { // handleRelayServerLifetimeLocked handles the lifetime of [e.rs]. 
func (e *extension) handleRelayServerLifetimeLocked() { + defer e.handleRelayServerStaticAddrPortsLocked() if !e.relayServerShouldBeRunningLocked() { e.stopRelayServerLocked() return @@ -195,6 +200,13 @@ func (e *extension) handleRelayServerLifetimeLocked() { e.tryStartRelayServerLocked() } +func (e *extension) handleRelayServerStaticAddrPortsLocked() { + if e.rs != nil { + // TODO(jwhited): env var support + e.rs.SetStaticAddrPorts(e.staticEndpoints) + } +} + func (e *extension) selfNodeViewChanged(nodeView tailcfg.NodeView) { e.mu.Lock() defer e.mu.Unlock() @@ -205,6 +217,7 @@ func (e *extension) selfNodeViewChanged(nodeView tailcfg.NodeView) { func (e *extension) profileStateChanged(_ ipn.LoginProfileView, prefs ipn.PrefsView, sameNode bool) { e.mu.Lock() defer e.mu.Unlock() + e.staticEndpoints = prefs.RelayServerStaticEndpoints() newPort, ok := prefs.RelayServerPort().GetOk() enableOrDisableServer := ok != (e.port != nil) portChanged := ok && e.port != nil && newPort != *e.port diff --git a/feature/relayserver/relayserver_test.go b/feature/relayserver/relayserver_test.go index 3d71c55d7..d77d2df26 100644 --- a/feature/relayserver/relayserver_test.go +++ b/feature/relayserver/relayserver_test.go @@ -5,7 +5,9 @@ package relayserver import ( "errors" + "net/netip" "reflect" + "slices" "testing" "tailscale.com/ipn" @@ -17,15 +19,21 @@ import ( "tailscale.com/types/key" "tailscale.com/types/logger" "tailscale.com/types/ptr" + "tailscale.com/types/views" ) func Test_extension_profileStateChanged(t *testing.T) { prefsWithPortOne := ipn.Prefs{RelayServerPort: ptr.To(1)} prefsWithNilPort := ipn.Prefs{RelayServerPort: nil} + prefsWithPortOneRelayEndpoints := ipn.Prefs{ + RelayServerPort: ptr.To(1), + RelayServerStaticEndpoints: []netip.AddrPort{netip.MustParseAddrPort("127.0.0.1:7777")}, + } type fields struct { - port *int - rs relayServer + port *int + staticEndpoints views.Slice[netip.AddrPort] + rs relayServer } type args struct { prefs ipn.PrefsView @@ -38,6 +46,7 @@ func Test_extension_profileStateChanged(t *testing.T) { wantPort *int wantRelayServerFieldNonNil bool wantRelayServerFieldMutated bool + wantEndpoints []netip.AddrPort }{ { name: "no changes non-nil port previously running", @@ -53,6 +62,52 @@ func Test_extension_profileStateChanged(t *testing.T) { wantRelayServerFieldNonNil: true, wantRelayServerFieldMutated: false, }, + { + name: "set addr ports unchanged port previously running", + fields: fields{ + port: ptr.To(1), + rs: mockRelayServerNotZeroVal(), + }, + args: args{ + prefs: prefsWithPortOneRelayEndpoints.View(), + sameNode: true, + }, + wantPort: ptr.To(1), + wantRelayServerFieldNonNil: true, + wantRelayServerFieldMutated: false, + wantEndpoints: prefsWithPortOneRelayEndpoints.RelayServerStaticEndpoints, + }, + { + name: "set addr ports not previously running", + fields: fields{ + port: nil, + rs: nil, + }, + args: args{ + prefs: prefsWithPortOneRelayEndpoints.View(), + sameNode: true, + }, + wantPort: ptr.To(1), + wantRelayServerFieldNonNil: true, + wantRelayServerFieldMutated: true, + wantEndpoints: prefsWithPortOneRelayEndpoints.RelayServerStaticEndpoints, + }, + { + name: "clear addr ports unchanged port previously running", + fields: fields{ + port: ptr.To(1), + staticEndpoints: views.SliceOf(prefsWithPortOneRelayEndpoints.RelayServerStaticEndpoints), + rs: mockRelayServerNotZeroVal(), + }, + args: args{ + prefs: prefsWithPortOne.View(), + sameNode: true, + }, + wantPort: ptr.To(1), + wantRelayServerFieldNonNil: true, + wantRelayServerFieldMutated: false, + 
wantEndpoints: nil, + }, { name: "prefs port nil", fields: fields{ @@ -160,6 +215,7 @@ func Test_extension_profileStateChanged(t *testing.T) { return &mockRelayServer{}, nil } e.port = tt.fields.port + e.staticEndpoints = tt.fields.staticEndpoints e.rs = tt.fields.rs defer e.Shutdown() e.profileStateChanged(ipn.LoginProfileView{}, tt.args.prefs, tt.args.sameNode) @@ -174,24 +230,34 @@ func Test_extension_profileStateChanged(t *testing.T) { if tt.wantRelayServerFieldMutated != !reflect.DeepEqual(tt.fields.rs, e.rs) { t.Errorf("wantRelayServerFieldMutated: %v != !reflect.DeepEqual(tt.fields.rs, e.rs): %v", tt.wantRelayServerFieldMutated, !reflect.DeepEqual(tt.fields.rs, e.rs)) } + if !slices.Equal(tt.wantEndpoints, e.staticEndpoints.AsSlice()) { + t.Errorf("wantEndpoints: %v != %v", tt.wantEndpoints, e.staticEndpoints.AsSlice()) + } + if e.rs != nil && !slices.Equal(tt.wantEndpoints, e.rs.(*mockRelayServer).addrPorts.AsSlice()) { + t.Errorf("wantEndpoints: %v != %v", tt.wantEndpoints, e.rs.(*mockRelayServer).addrPorts.AsSlice()) + } }) } } func mockRelayServerNotZeroVal() *mockRelayServer { - return &mockRelayServer{true} + return &mockRelayServer{set: true} } type mockRelayServer struct { - set bool + set bool + addrPorts views.Slice[netip.AddrPort] } -func (mockRelayServer) Close() error { return nil } -func (mockRelayServer) AllocateEndpoint(_, _ key.DiscoPublic) (endpoint.ServerEndpoint, error) { +func (m *mockRelayServer) Close() error { return nil } +func (m *mockRelayServer) AllocateEndpoint(_, _ key.DiscoPublic) (endpoint.ServerEndpoint, error) { return endpoint.ServerEndpoint{}, errors.New("not implemented") } -func (mockRelayServer) GetSessions() []status.ServerSession { return nil } -func (mockRelayServer) SetDERPMapView(tailcfg.DERPMapView) { return } +func (m *mockRelayServer) GetSessions() []status.ServerSession { return nil } +func (m *mockRelayServer) SetDERPMapView(tailcfg.DERPMapView) { return } +func (m *mockRelayServer) SetStaticAddrPorts(aps views.Slice[netip.AddrPort]) { + m.addrPorts = aps +} type mockSafeBackend struct { sys *tsd.System diff --git a/ipn/ipn_clone.go b/ipn/ipn_clone.go index 1be716197..fae85adee 100644 --- a/ipn/ipn_clone.go +++ b/ipn/ipn_clone.go @@ -64,46 +64,48 @@ func (src *Prefs) Clone() *Prefs { if dst.RelayServerPort != nil { dst.RelayServerPort = ptr.To(*src.RelayServerPort) } + dst.RelayServerStaticEndpoints = append(src.RelayServerStaticEndpoints[:0:0], src.RelayServerStaticEndpoints...) dst.Persist = src.Persist.Clone() return dst } // A compilation failure here means this code must be regenerated, with the command at the top of this file. 
var _PrefsCloneNeedsRegeneration = Prefs(struct { - ControlURL string - RouteAll bool - ExitNodeID tailcfg.StableNodeID - ExitNodeIP netip.Addr - AutoExitNode ExitNodeExpression - InternalExitNodePrior tailcfg.StableNodeID - ExitNodeAllowLANAccess bool - CorpDNS bool - RunSSH bool - RunWebClient bool - WantRunning bool - LoggedOut bool - ShieldsUp bool - AdvertiseTags []string - Hostname string - NotepadURLs bool - ForceDaemon bool - Egg bool - AdvertiseRoutes []netip.Prefix - AdvertiseServices []string - Sync opt.Bool - NoSNAT bool - NoStatefulFiltering opt.Bool - NetfilterMode preftype.NetfilterMode - OperatorUser string - ProfileName string - AutoUpdate AutoUpdatePrefs - AppConnector AppConnectorPrefs - PostureChecking bool - NetfilterKind string - DriveShares []*drive.Share - RelayServerPort *int - AllowSingleHosts marshalAsTrueInJSON - Persist *persist.Persist + ControlURL string + RouteAll bool + ExitNodeID tailcfg.StableNodeID + ExitNodeIP netip.Addr + AutoExitNode ExitNodeExpression + InternalExitNodePrior tailcfg.StableNodeID + ExitNodeAllowLANAccess bool + CorpDNS bool + RunSSH bool + RunWebClient bool + WantRunning bool + LoggedOut bool + ShieldsUp bool + AdvertiseTags []string + Hostname string + NotepadURLs bool + ForceDaemon bool + Egg bool + AdvertiseRoutes []netip.Prefix + AdvertiseServices []string + Sync opt.Bool + NoSNAT bool + NoStatefulFiltering opt.Bool + NetfilterMode preftype.NetfilterMode + OperatorUser string + ProfileName string + AutoUpdate AutoUpdatePrefs + AppConnector AppConnectorPrefs + PostureChecking bool + NetfilterKind string + DriveShares []*drive.Share + RelayServerPort *int + RelayServerStaticEndpoints []netip.AddrPort + AllowSingleHosts marshalAsTrueInJSON + Persist *persist.Persist }{}) // Clone makes a deep copy of ServeConfig. diff --git a/ipn/ipn_view.go b/ipn/ipn_view.go index d3836416b..aac8cb4d7 100644 --- a/ipn/ipn_view.go +++ b/ipn/ipn_view.go @@ -448,6 +448,13 @@ func (v PrefsView) RelayServerPort() views.ValuePointer[int] { return views.ValuePointerOf(v.ж.RelayServerPort) } +// RelayServerStaticEndpoints are static IP:port endpoints to advertise as +// candidates for relay connections. Only relevant when RelayServerPort is +// non-nil. +func (v PrefsView) RelayServerStaticEndpoints() views.Slice[netip.AddrPort] { + return views.SliceOf(v.ж.RelayServerStaticEndpoints) +} + // AllowSingleHosts was a legacy field that was always true // for the past 4.5 years. It controlled whether Tailscale // peers got /32 or /128 routes for each other. @@ -468,40 +475,41 @@ func (v PrefsView) Persist() persist.PersistView { return v.ж.Persist.View() } // A compilation failure here means this code must be regenerated, with the command at the top of this file. 
var _PrefsViewNeedsRegeneration = Prefs(struct { - ControlURL string - RouteAll bool - ExitNodeID tailcfg.StableNodeID - ExitNodeIP netip.Addr - AutoExitNode ExitNodeExpression - InternalExitNodePrior tailcfg.StableNodeID - ExitNodeAllowLANAccess bool - CorpDNS bool - RunSSH bool - RunWebClient bool - WantRunning bool - LoggedOut bool - ShieldsUp bool - AdvertiseTags []string - Hostname string - NotepadURLs bool - ForceDaemon bool - Egg bool - AdvertiseRoutes []netip.Prefix - AdvertiseServices []string - Sync opt.Bool - NoSNAT bool - NoStatefulFiltering opt.Bool - NetfilterMode preftype.NetfilterMode - OperatorUser string - ProfileName string - AutoUpdate AutoUpdatePrefs - AppConnector AppConnectorPrefs - PostureChecking bool - NetfilterKind string - DriveShares []*drive.Share - RelayServerPort *int - AllowSingleHosts marshalAsTrueInJSON - Persist *persist.Persist + ControlURL string + RouteAll bool + ExitNodeID tailcfg.StableNodeID + ExitNodeIP netip.Addr + AutoExitNode ExitNodeExpression + InternalExitNodePrior tailcfg.StableNodeID + ExitNodeAllowLANAccess bool + CorpDNS bool + RunSSH bool + RunWebClient bool + WantRunning bool + LoggedOut bool + ShieldsUp bool + AdvertiseTags []string + Hostname string + NotepadURLs bool + ForceDaemon bool + Egg bool + AdvertiseRoutes []netip.Prefix + AdvertiseServices []string + Sync opt.Bool + NoSNAT bool + NoStatefulFiltering opt.Bool + NetfilterMode preftype.NetfilterMode + OperatorUser string + ProfileName string + AutoUpdate AutoUpdatePrefs + AppConnector AppConnectorPrefs + PostureChecking bool + NetfilterKind string + DriveShares []*drive.Share + RelayServerPort *int + RelayServerStaticEndpoints []netip.AddrPort + AllowSingleHosts marshalAsTrueInJSON + Persist *persist.Persist }{}) // View returns a read-only view of ServeConfig. diff --git a/ipn/prefs.go b/ipn/prefs.go index 7f8216c60..6f3cb65f8 100644 --- a/ipn/prefs.go +++ b/ipn/prefs.go @@ -288,6 +288,11 @@ type Prefs struct { // non-nil/enabled. RelayServerPort *int `json:",omitempty"` + // RelayServerStaticEndpoints are static IP:port endpoints to advertise as + // candidates for relay connections. Only relevant when RelayServerPort is + // non-nil. + RelayServerStaticEndpoints []netip.AddrPort `json:",omitempty"` + // AllowSingleHosts was a legacy field that was always true // for the past 4.5 years. It controlled whether Tailscale // peers got /32 or /128 routes for each other. 
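The new pref is kept sorted and de-duplicated so that order-insensitive comparisons like the `slices.Equal` check in `Prefs.Equals` behave predictably. A minimal standalone sketch of that canonicalization, mirroring the flag parsing in `runSet` earlier in this patch (the `parseStaticEndpoints` helper is illustrative, not part of the change):

```go
package main

import (
	"fmt"
	"net/netip"
	"slices"
	"strings"
)

// parseStaticEndpoints splits the comma-separated flag value, validates
// each entry as an IP:port, drops duplicates, and sorts the result so
// that comparisons of Prefs are independent of input order.
func parseStaticEndpoints(flagVal string) ([]netip.AddrPort, error) {
	seen := make(map[netip.AddrPort]bool)
	var out []netip.AddrPort
	for _, s := range strings.Split(flagVal, ",") {
		ap, err := netip.ParseAddrPort(s)
		if err != nil {
			return nil, fmt.Errorf("%q is not a valid IP:port", s)
		}
		if !seen[ap] {
			seen[ap] = true
			out = append(out, ap)
		}
	}
	slices.SortFunc(out, netip.AddrPort.Compare)
	return out, nil
}

func main() {
	eps, err := parseStaticEndpoints("[2001:db8::1]:40000,192.0.2.1:40000")
	fmt.Println(eps, err) // [192.0.2.1:40000 [2001:db8::1]:40000] <nil>
}
```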
@@ -350,38 +355,39 @@ type AppConnectorPrefs struct { type MaskedPrefs struct { Prefs - ControlURLSet bool `json:",omitempty"` - RouteAllSet bool `json:",omitempty"` - ExitNodeIDSet bool `json:",omitempty"` - ExitNodeIPSet bool `json:",omitempty"` - AutoExitNodeSet bool `json:",omitempty"` - InternalExitNodePriorSet bool `json:",omitempty"` // Internal; can't be set by LocalAPI clients - ExitNodeAllowLANAccessSet bool `json:",omitempty"` - CorpDNSSet bool `json:",omitempty"` - RunSSHSet bool `json:",omitempty"` - RunWebClientSet bool `json:",omitempty"` - WantRunningSet bool `json:",omitempty"` - LoggedOutSet bool `json:",omitempty"` - ShieldsUpSet bool `json:",omitempty"` - AdvertiseTagsSet bool `json:",omitempty"` - HostnameSet bool `json:",omitempty"` - NotepadURLsSet bool `json:",omitempty"` - ForceDaemonSet bool `json:",omitempty"` - EggSet bool `json:",omitempty"` - AdvertiseRoutesSet bool `json:",omitempty"` - AdvertiseServicesSet bool `json:",omitempty"` - SyncSet bool `json:",omitzero"` - NoSNATSet bool `json:",omitempty"` - NoStatefulFilteringSet bool `json:",omitempty"` - NetfilterModeSet bool `json:",omitempty"` - OperatorUserSet bool `json:",omitempty"` - ProfileNameSet bool `json:",omitempty"` - AutoUpdateSet AutoUpdatePrefsMask `json:",omitzero"` - AppConnectorSet bool `json:",omitempty"` - PostureCheckingSet bool `json:",omitempty"` - NetfilterKindSet bool `json:",omitempty"` - DriveSharesSet bool `json:",omitempty"` - RelayServerPortSet bool `json:",omitempty"` + ControlURLSet bool `json:",omitempty"` + RouteAllSet bool `json:",omitempty"` + ExitNodeIDSet bool `json:",omitempty"` + ExitNodeIPSet bool `json:",omitempty"` + AutoExitNodeSet bool `json:",omitempty"` + InternalExitNodePriorSet bool `json:",omitempty"` // Internal; can't be set by LocalAPI clients + ExitNodeAllowLANAccessSet bool `json:",omitempty"` + CorpDNSSet bool `json:",omitempty"` + RunSSHSet bool `json:",omitempty"` + RunWebClientSet bool `json:",omitempty"` + WantRunningSet bool `json:",omitempty"` + LoggedOutSet bool `json:",omitempty"` + ShieldsUpSet bool `json:",omitempty"` + AdvertiseTagsSet bool `json:",omitempty"` + HostnameSet bool `json:",omitempty"` + NotepadURLsSet bool `json:",omitempty"` + ForceDaemonSet bool `json:",omitempty"` + EggSet bool `json:",omitempty"` + AdvertiseRoutesSet bool `json:",omitempty"` + AdvertiseServicesSet bool `json:",omitempty"` + SyncSet bool `json:",omitzero"` + NoSNATSet bool `json:",omitempty"` + NoStatefulFilteringSet bool `json:",omitempty"` + NetfilterModeSet bool `json:",omitempty"` + OperatorUserSet bool `json:",omitempty"` + ProfileNameSet bool `json:",omitempty"` + AutoUpdateSet AutoUpdatePrefsMask `json:",omitzero"` + AppConnectorSet bool `json:",omitempty"` + PostureCheckingSet bool `json:",omitempty"` + NetfilterKindSet bool `json:",omitempty"` + DriveSharesSet bool `json:",omitempty"` + RelayServerPortSet bool `json:",omitempty"` + RelayServerStaticEndpointsSet bool `json:",omitzero"` } // SetsInternal reports whether mp has any of the Internal*Set field bools set @@ -621,6 +627,9 @@ func (p *Prefs) pretty(goos string) string { if buildfeatures.HasRelayServer && p.RelayServerPort != nil { fmt.Fprintf(&sb, "relayServerPort=%d ", *p.RelayServerPort) } + if buildfeatures.HasRelayServer && len(p.RelayServerStaticEndpoints) > 0 { + fmt.Fprintf(&sb, "relayServerStaticEndpoints=%v ", p.RelayServerStaticEndpoints) + } if p.Persist != nil { sb.WriteString(p.Persist.Pretty()) } else { @@ -685,7 +694,8 @@ func (p *Prefs) Equals(p2 *Prefs) bool { p.PostureChecking 
== p2.PostureChecking && slices.EqualFunc(p.DriveShares, p2.DriveShares, drive.SharesEqual) && p.NetfilterKind == p2.NetfilterKind && - compareIntPtrs(p.RelayServerPort, p2.RelayServerPort) + compareIntPtrs(p.RelayServerPort, p2.RelayServerPort) && + slices.Equal(p.RelayServerStaticEndpoints, p2.RelayServerStaticEndpoints) } func (au AutoUpdatePrefs) Pretty() string { diff --git a/ipn/prefs_test.go b/ipn/prefs_test.go index 7c9c3ef43..cf0750706 100644 --- a/ipn/prefs_test.go +++ b/ipn/prefs_test.go @@ -69,6 +69,7 @@ func TestPrefsEqual(t *testing.T) { "NetfilterKind", "DriveShares", "RelayServerPort", + "RelayServerStaticEndpoints", "AllowSingleHosts", "Persist", } @@ -90,6 +91,16 @@ func TestPrefsEqual(t *testing.T) { } return ns } + aps := func(strs ...string) (ret []netip.AddrPort) { + for _, s := range strs { + n, err := netip.ParseAddrPort(s) + if err != nil { + panic(err) + } + ret = append(ret, n) + } + return ret + } tests := []struct { a, b *Prefs want bool @@ -369,6 +380,16 @@ func TestPrefsEqual(t *testing.T) { &Prefs{RelayServerPort: relayServerPort(1)}, false, }, + { + &Prefs{RelayServerStaticEndpoints: aps("[2001:db8::1]:40000", "192.0.2.1:40000")}, + &Prefs{RelayServerStaticEndpoints: aps("[2001:db8::1]:40000", "192.0.2.1:40000")}, + true, + }, + { + &Prefs{RelayServerStaticEndpoints: aps("[2001:db8::1]:40000", "192.0.2.2:40000")}, + &Prefs{RelayServerStaticEndpoints: aps("[2001:db8::1]:40000", "192.0.2.1:40000")}, + false, + }, } for i, tt := range tests { got := tt.a.Equals(tt.b) From 9c3a2aa79789542262ebae3c3d224da16dc61dbb Mon Sep 17 00:00:00 2001 From: Simon Law Date: Mon, 24 Nov 2025 17:42:58 -0800 Subject: [PATCH 17/33] ipn/ipnlocal: replace log.Printf with logf (#18045) Updates #cleanup Signed-off-by: Simon Law --- ipn/ipnlocal/local.go | 19 +++++++++++-------- ipn/ipnlocal/local_test.go | 5 ++++- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index 72b230327..3665999e8 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -14,7 +14,6 @@ import ( "errors" "fmt" "io" - "log" "math" "math/rand/v2" "net" @@ -544,7 +543,7 @@ func NewLocalBackend(logf logger.Logf, logID logid.PublicID, sys *tsd.System, lo netMon := sys.NetMon.Get() b.sockstatLogger, err = sockstatlog.NewLogger(logpolicy.LogsDir(logf), logf, logID, netMon, sys.HealthTracker.Get(), sys.Bus.Get()) if err != nil { - log.Printf("error setting up sockstat logger: %v", err) + logf("error setting up sockstat logger: %v", err) } // Enable sockstats logs only on non-mobile unstable builds if version.IsUnstableBuild() && !version.IsMobile() && b.sockstatLogger != nil { @@ -7259,7 +7258,12 @@ func (b *LocalBackend) refreshAllowedSuggestions() { } b.allowedSuggestedExitNodesMu.Lock() defer b.allowedSuggestedExitNodesMu.Unlock() - b.allowedSuggestedExitNodes = fillAllowedSuggestions(b.polc) + + var err error + b.allowedSuggestedExitNodes, err = fillAllowedSuggestions(b.polc) + if err != nil { + b.logf("error refreshing allowed suggestions: %v", err) + } } // selectRegionFunc returns a DERP region from the slice of candidate regions. @@ -7271,20 +7275,19 @@ type selectRegionFunc func(views.Slice[int]) int // choice. 
type selectNodeFunc func(nodes views.Slice[tailcfg.NodeView], last tailcfg.StableNodeID) tailcfg.NodeView -func fillAllowedSuggestions(polc policyclient.Client) set.Set[tailcfg.StableNodeID] { +func fillAllowedSuggestions(polc policyclient.Client) (set.Set[tailcfg.StableNodeID], error) { nodes, err := polc.GetStringArray(pkey.AllowedSuggestedExitNodes, nil) if err != nil { - log.Printf("fillAllowedSuggestions: unable to look up %q policy: %v", pkey.AllowedSuggestedExitNodes, err) - return nil + return nil, fmt.Errorf("fillAllowedSuggestions: unable to look up %q policy: %w", pkey.AllowedSuggestedExitNodes, err) } if nodes == nil { - return nil + return nil, nil } s := make(set.Set[tailcfg.StableNodeID], len(nodes)) for _, n := range nodes { s.Add(tailcfg.StableNodeID(n)) } - return s + return s, nil } // suggestExitNode returns a suggestion for reasonably good exit node based on diff --git a/ipn/ipnlocal/local_test.go b/ipn/ipnlocal/local_test.go index f17fabb60..3da014fd6 100644 --- a/ipn/ipnlocal/local_test.go +++ b/ipn/ipnlocal/local_test.go @@ -5590,7 +5590,10 @@ func TestFillAllowedSuggestions(t *testing.T) { var pol policytest.Config pol.Set(pkey.AllowedSuggestedExitNodes, tt.allowPolicy) - got := fillAllowedSuggestions(pol) + got, err := fillAllowedSuggestions(pol) + if err != nil { + t.Fatal(err) + } if got == nil { if tt.want == nil { return From d4821cdc2f49094a933e4379fec1fd140bcc958c Mon Sep 17 00:00:00 2001 From: David Bond Date: Tue, 25 Nov 2025 12:41:39 +0000 Subject: [PATCH 18/33] cmd/k8s-operator: allow HA ingresses to be deleted when VIP service does not exist (#18050) This commit fixes a bug in our HA ingress reconciler where ingress resources would be stuck in a deleting state should their associated VIP service be deleted within control. The reconciliation loop would check for the existence of the VIP service and if not found perform no additional cleanup steps. The code has been modified to continue onwards even if the VIP service is not found. Fixes: https://github.com/tailscale/tailscale/issues/18049 Signed-off-by: David Bond --- cmd/k8s-operator/api-server-proxy-pg_test.go | 8 +-- cmd/k8s-operator/ingress-for-pg.go | 15 ++-- cmd/k8s-operator/ingress-for-pg_test.go | 74 ++++++++++++++++---- 3 files changed, 71 insertions(+), 26 deletions(-) diff --git a/cmd/k8s-operator/api-server-proxy-pg_test.go b/cmd/k8s-operator/api-server-proxy-pg_test.go index dfef63f22..dee505723 100644 --- a/cmd/k8s-operator/api-server-proxy-pg_test.go +++ b/cmd/k8s-operator/api-server-proxy-pg_test.go @@ -182,9 +182,7 @@ func TestAPIServerProxyReconciler(t *testing.T) { expectEqual(t, fc, certSecretRoleBinding(pg, ns, defaultDomain)) // Simulate certs being issued; should observe AdvertiseServices config change. - if err := populateTLSSecret(t.Context(), fc, pgName, defaultDomain); err != nil { - t.Fatalf("populating TLS Secret: %v", err) - } + populateTLSSecret(t, fc, pgName, defaultDomain) expectReconciled(t, r, "", pgName) expectedCfg.AdvertiseServices = []string{"svc:" + pgName} @@ -247,9 +245,7 @@ func TestAPIServerProxyReconciler(t *testing.T) { expectMissing[rbacv1.RoleBinding](t, fc, ns, defaultDomain) // Check we get the new hostname in the status once ready. 
- if err := populateTLSSecret(t.Context(), fc, pgName, updatedDomain); err != nil { - t.Fatalf("populating TLS Secret: %v", err) - } + populateTLSSecret(t, fc, pgName, updatedDomain) mustUpdate(t, fc, "operator-ns", "test-pg-0", func(s *corev1.Secret) { s.Data["profile-foo"] = []byte(`{"AdvertiseServices":["svc:test-pg"],"Config":{"NodeID":"node-foo"}}`) }) diff --git a/cmd/k8s-operator/ingress-for-pg.go b/cmd/k8s-operator/ingress-for-pg.go index 4d8311805..460a1914e 100644 --- a/cmd/k8s-operator/ingress-for-pg.go +++ b/cmd/k8s-operator/ingress-for-pg.go @@ -29,6 +29,7 @@ import ( "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "tailscale.com/internal/client/tailscale" "tailscale.com/ipn" "tailscale.com/ipn/ipnstate" @@ -504,10 +505,7 @@ func (r *HAIngressReconciler) maybeCleanup(ctx context.Context, hostname string, logger.Infof("Ensuring that Tailscale Service %q configuration is cleaned up", hostname) serviceName := tailcfg.ServiceName("svc:" + hostname) svc, err := r.tsClient.GetVIPService(ctx, serviceName) - if err != nil { - if isErrorTailscaleServiceNotFound(err) { - return false, nil - } + if err != nil && !isErrorTailscaleServiceNotFound(err) { return false, fmt.Errorf("error getting Tailscale Service: %w", err) } @@ -713,10 +711,15 @@ func (r *HAIngressReconciler) cleanupTailscaleService(ctx context.Context, svc * } if len(o.OwnerRefs) == 1 { logger.Infof("Deleting Tailscale Service %q", svc.Name) - return false, r.tsClient.DeleteVIPService(ctx, svc.Name) + if err = r.tsClient.DeleteVIPService(ctx, svc.Name); err != nil && !isErrorTailscaleServiceNotFound(err) { + return false, err + } + + return false, nil } + o.OwnerRefs = slices.Delete(o.OwnerRefs, ix, ix+1) - logger.Infof("Deleting Tailscale Service %q", svc.Name) + logger.Infof("Creating/Updating Tailscale Service %q", svc.Name) json, err := json.Marshal(o) if err != nil { return false, fmt.Errorf("error marshalling updated Tailscale Service owner reference: %w", err) diff --git a/cmd/k8s-operator/ingress-for-pg_test.go b/cmd/k8s-operator/ingress-for-pg_test.go index 77e5ecb37..5cc806ad1 100644 --- a/cmd/k8s-operator/ingress-for-pg_test.go +++ b/cmd/k8s-operator/ingress-for-pg_test.go @@ -25,6 +25,7 @@ import ( "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" + "tailscale.com/internal/client/tailscale" "tailscale.com/ipn" "tailscale.com/ipn/ipnstate" @@ -67,7 +68,7 @@ func TestIngressPGReconciler(t *testing.T) { // Verify initial reconciliation expectReconciled(t, ingPGR, "default", "test-ingress") - populateTLSSecret(context.Background(), fc, "test-pg", "my-svc.ts.net") + populateTLSSecret(t, fc, "test-pg", "my-svc.ts.net") expectReconciled(t, ingPGR, "default", "test-ingress") verifyServeConfig(t, fc, "svc:my-svc", false) verifyTailscaleService(t, ft, "svc:my-svc", []string{"tcp:443"}) @@ -89,7 +90,7 @@ func TestIngressPGReconciler(t *testing.T) { expectReconciled(t, ingPGR, "default", "test-ingress") // Verify Tailscale Service uses custom tags - tsSvc, err := ft.GetVIPService(context.Background(), "svc:my-svc") + tsSvc, err := ft.GetVIPService(t.Context(), "svc:my-svc") if err != nil { t.Fatalf("getting Tailscale Service: %v", err) } @@ -134,7 +135,7 @@ func TestIngressPGReconciler(t *testing.T) { // Verify second Ingress reconciliation expectReconciled(t, ingPGR, "default", "my-other-ingress") - populateTLSSecret(context.Background(), fc, "test-pg", 
"my-other-svc.ts.net") + populateTLSSecret(t, fc, "test-pg", "my-other-svc.ts.net") expectReconciled(t, ingPGR, "default", "my-other-ingress") verifyServeConfig(t, fc, "svc:my-other-svc", false) verifyTailscaleService(t, ft, "svc:my-other-svc", []string{"tcp:443"}) @@ -151,14 +152,14 @@ func TestIngressPGReconciler(t *testing.T) { verifyTailscaledConfig(t, fc, "test-pg", []string{"svc:my-svc", "svc:my-other-svc"}) // Delete second Ingress - if err := fc.Delete(context.Background(), ing2); err != nil { + if err := fc.Delete(t.Context(), ing2); err != nil { t.Fatalf("deleting second Ingress: %v", err) } expectReconciled(t, ingPGR, "default", "my-other-ingress") // Verify second Ingress cleanup cm := &corev1.ConfigMap{} - if err := fc.Get(context.Background(), types.NamespacedName{ + if err := fc.Get(t.Context(), types.NamespacedName{ Name: "test-pg-ingress-config", Namespace: "operator-ns", }, cm); err != nil { @@ -199,7 +200,7 @@ func TestIngressPGReconciler(t *testing.T) { expectEqual(t, fc, certSecretRoleBinding(pg, "operator-ns", "my-svc.ts.net")) // Delete the first Ingress and verify cleanup - if err := fc.Delete(context.Background(), ing); err != nil { + if err := fc.Delete(t.Context(), ing); err != nil { t.Fatalf("deleting Ingress: %v", err) } @@ -207,7 +208,7 @@ func TestIngressPGReconciler(t *testing.T) { // Verify the ConfigMap was cleaned up cm = &corev1.ConfigMap{} - if err := fc.Get(context.Background(), types.NamespacedName{ + if err := fc.Get(t.Context(), types.NamespacedName{ Name: "test-pg-second-ingress-config", Namespace: "operator-ns", }, cm); err != nil { @@ -228,6 +229,47 @@ func TestIngressPGReconciler(t *testing.T) { expectMissing[corev1.Secret](t, fc, "operator-ns", "my-svc.ts.net") expectMissing[rbacv1.Role](t, fc, "operator-ns", "my-svc.ts.net") expectMissing[rbacv1.RoleBinding](t, fc, "operator-ns", "my-svc.ts.net") + + // Create a third ingress + ing3 := &networkingv1.Ingress{ + TypeMeta: metav1.TypeMeta{Kind: "Ingress", APIVersion: "networking.k8s.io/v1"}, + ObjectMeta: metav1.ObjectMeta{ + Name: "my-other-ingress", + Namespace: "default", + UID: types.UID("5678-UID"), + Annotations: map[string]string{ + "tailscale.com/proxy-group": "test-pg", + }, + }, + Spec: networkingv1.IngressSpec{ + IngressClassName: ptr.To("tailscale"), + DefaultBackend: &networkingv1.IngressBackend{ + Service: &networkingv1.IngressServiceBackend{ + Name: "test", + Port: networkingv1.ServiceBackendPort{ + Number: 8080, + }, + }, + }, + TLS: []networkingv1.IngressTLS{ + {Hosts: []string{"my-other-svc.tailnetxyz.ts.net"}}, + }, + }, + } + + mustCreate(t, fc, ing3) + expectReconciled(t, ingPGR, ing3.Namespace, ing3.Name) + + // Delete the service from "control" + ft.vipServices = make(map[tailcfg.ServiceName]*tailscale.VIPService) + + // Delete the ingress and confirm we don't get stuck due to the VIP service not existing. 
+ if err = fc.Delete(t.Context(), ing3); err != nil { + t.Fatalf("deleting Ingress: %v", err) + } + + expectReconciled(t, ingPGR, ing3.Namespace, ing3.Name) + expectMissing[networkingv1.Ingress](t, fc, ing3.Namespace, ing3.Name) } func TestIngressPGReconciler_UpdateIngressHostname(t *testing.T) { @@ -262,7 +304,7 @@ func TestIngressPGReconciler_UpdateIngressHostname(t *testing.T) { // Verify initial reconciliation expectReconciled(t, ingPGR, "default", "test-ingress") - populateTLSSecret(context.Background(), fc, "test-pg", "my-svc.ts.net") + populateTLSSecret(t, fc, "test-pg", "my-svc.ts.net") expectReconciled(t, ingPGR, "default", "test-ingress") verifyServeConfig(t, fc, "svc:my-svc", false) verifyTailscaleService(t, ft, "svc:my-svc", []string{"tcp:443"}) @@ -273,13 +315,13 @@ func TestIngressPGReconciler_UpdateIngressHostname(t *testing.T) { ing.Spec.TLS[0].Hosts[0] = "updated-svc" }) expectReconciled(t, ingPGR, "default", "test-ingress") - populateTLSSecret(context.Background(), fc, "test-pg", "updated-svc.ts.net") + populateTLSSecret(t, fc, "test-pg", "updated-svc.ts.net") expectReconciled(t, ingPGR, "default", "test-ingress") verifyServeConfig(t, fc, "svc:updated-svc", false) verifyTailscaleService(t, ft, "svc:updated-svc", []string{"tcp:443"}) verifyTailscaledConfig(t, fc, "test-pg", []string{"svc:updated-svc"}) - _, err := ft.GetVIPService(context.Background(), tailcfg.ServiceName("svc:my-svc")) + _, err := ft.GetVIPService(context.Background(), "svc:my-svc") if err == nil { t.Fatalf("svc:my-svc not cleaned up") } @@ -500,7 +542,7 @@ func TestIngressPGReconciler_HTTPEndpoint(t *testing.T) { // Verify initial reconciliation with HTTP enabled expectReconciled(t, ingPGR, "default", "test-ingress") - populateTLSSecret(context.Background(), fc, "test-pg", "my-svc.ts.net") + populateTLSSecret(t, fc, "test-pg", "my-svc.ts.net") expectReconciled(t, ingPGR, "default", "test-ingress") verifyTailscaleService(t, ft, "svc:my-svc", []string{"tcp:80", "tcp:443"}) verifyServeConfig(t, fc, "svc:my-svc", true) @@ -717,7 +759,9 @@ func TestOwnerAnnotations(t *testing.T) { } } -func populateTLSSecret(ctx context.Context, c client.Client, pgName, domain string) error { +func populateTLSSecret(t *testing.T, c client.Client, pgName, domain string) { + t.Helper() + secret := &corev1.Secret{ ObjectMeta: metav1.ObjectMeta{ Name: domain, @@ -736,10 +780,12 @@ func populateTLSSecret(ctx context.Context, c client.Client, pgName, domain stri }, } - _, err := createOrUpdate(ctx, c, "operator-ns", secret, func(s *corev1.Secret) { + _, err := createOrUpdate(t.Context(), c, "operator-ns", secret, func(s *corev1.Secret) { s.Data = secret.Data }) - return err + if err != nil { + t.Fatalf("failed to populate TLS secret: %v", err) + } } func verifyTailscaleService(t *testing.T, ft *fakeTSClient, serviceName string, wantPorts []string) { From 7073f246d3e94a849d20420eaff69d7be7e494b7 Mon Sep 17 00:00:00 2001 From: Nick Khyl Date: Tue, 25 Nov 2025 08:58:36 -0600 Subject: [PATCH 19/33] ipn/ipnlocal: do not call controlclient.Client.Shutdown with b.mu held This fixes a regression in #17804 that caused a deadlock. 
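The shape of the fix, as a self-contained sketch (the `backend` and `client` types here are stand-ins, not the real LocalBackend): take ownership of the client while the mutex is held, but call Shutdown only after releasing it, since Shutdown can synchronously call back into the backend and try to re-acquire the same lock.

```go
package main

import "sync"

type client struct{ onShutdown func() }

func (c *client) Shutdown() { c.onShutdown() }

type backend struct {
	mu sync.Mutex
	cc *client
}

// reset swaps the client out under the lock but shuts it down only
// after b.mu has been released.
func (b *backend) reset() {
	b.mu.Lock()
	cc := b.cc
	b.cc = nil
	b.mu.Unlock()

	if cc != nil {
		cc.Shutdown() // no locks held; re-entrant calls cannot deadlock
	}
}

func main() {
	b := new(backend)
	b.cc = &client{onShutdown: func() {
		// Simulates Shutdown calling back into the backend; this would
		// deadlock if reset still held b.mu at this point.
		b.mu.Lock()
		b.mu.Unlock()
	}}
	b.reset()
}
```

On the paths below where the old client is discarded while b.mu may still be held, the fix instead hands Shutdown to a goroutine via b.goTracker.Go.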
Updates #18052 Signed-off-by: Nick Khyl --- ipn/ipnlocal/local.go | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index 3665999e8..8cdfa0608 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -944,12 +944,12 @@ func (b *LocalBackend) pauseOrResumeControlClientLocked() { // down, clients switch over to other replicas whilst the existing connections are kept alive for some period of time. func (b *LocalBackend) DisconnectControl() { b.mu.Lock() - defer b.mu.Unlock() cc := b.resetControlClientLocked() - if cc == nil { - return + b.mu.Unlock() + + if cc != nil { + cc.Shutdown() } - cc.Shutdown() } // linkChange is our network monitor callback, called whenever the network changes. @@ -2408,7 +2408,8 @@ func (b *LocalBackend) startLocked(opts ipn.Options) error { var clientToShutdown controlclient.Client defer func() { if clientToShutdown != nil { - clientToShutdown.Shutdown() + // Shutdown outside of b.mu to avoid deadlocks. + b.goTracker.Go(clientToShutdown.Shutdown) } }() @@ -6891,7 +6892,8 @@ func (b *LocalBackend) resetForProfileChangeLocked() error { // Reset the NetworkMap in the engine b.e.SetNetworkMap(new(netmap.NetworkMap)) if prevCC := b.resetControlClientLocked(); prevCC != nil { - defer prevCC.Shutdown() + // Shutdown outside of b.mu to avoid deadlocks. + b.goTracker.Go(prevCC.Shutdown) } // TKA errors should not prevent resetting the backend state. // However, we should still return the error to the caller. @@ -6972,7 +6974,8 @@ func (b *LocalBackend) ResetAuth() error { defer b.mu.Unlock() if prevCC := b.resetControlClientLocked(); prevCC != nil { - defer prevCC.Shutdown() + // Shutdown outside of b.mu to avoid deadlocks. + b.goTracker.Go(prevCC.Shutdown) } if err := b.clearMachineKeyLocked(); err != nil { return err From 848978e664aebb28e86c17bdad4b048f981079df Mon Sep 17 00:00:00 2001 From: Simon Law Date: Tue, 25 Nov 2025 09:21:55 -0800 Subject: [PATCH 20/33] ipn/ipnlocal: test traffic-steering when feature is not enabled (#17997) In PR tailscale/corp#34401, the `traffic-steering` feature flag does not automatically enable traffic steering for all nodes. Instead, an admin must add the `traffic-steering` node attribute to each client node that they want opted-in. For backwards compatibility with older clients, tailscale/corp#34401 strips out the `traffic-steering` node attribute if the feature flag is not enabled, even if it is set in the policy file. This lets us safely disable the feature flag. This PR adds a missing test case for suggested exit nodes that have no priority. Updates tailscale/corp#34399 Signed-off-by: Simon Law --- ipn/ipnlocal/local.go | 3 +++ ipn/ipnlocal/local_test.go | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index 8cdfa0608..defa558ed 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -7301,6 +7301,9 @@ func suggestExitNode(report *netcheck.Report, nb *nodeBackend, prevSuggestion ta // The traffic-steering feature flag is enabled on this tailnet. 
return suggestExitNodeUsingTrafficSteering(nb, allowList) default: + // The control plane will always strip the `traffic-steering` + // node attribute if it isn’t enabled for this tailnet, even if + // it is set in the policy file: tailscale/corp#34401 return suggestExitNodeUsingDERP(report, nb, prevSuggestion, selectRegion, selectNode, allowList) } } diff --git a/ipn/ipnlocal/local_test.go b/ipn/ipnlocal/local_test.go index 3da014fd6..68bb2618c 100644 --- a/ipn/ipnlocal/local_test.go +++ b/ipn/ipnlocal/local_test.go @@ -5173,6 +5173,26 @@ func TestSuggestExitNodeTrafficSteering(t *testing.T) { wantID: "stable3", wantName: "peer3", }, + { + name: "exit-nodes-without-priority-for-suggestions", + netMap: &netmap.NetworkMap{ + SelfNode: selfNode.View(), + Peers: []tailcfg.NodeView{ + makePeer(1, + withExitRoutes(), + withSuggest()), + makePeer(2, + withExitRoutes(), + withSuggest()), + makePeer(3, + withExitRoutes(), + withLocationPriority(1)), + }, + }, + wantID: "stable1", + wantName: "peer1", + wantPri: 0, + }, { name: "exit-nodes-with-and-without-priority", netMap: &netmap.NetworkMap{ From ac0b15356d25c011e0b9f060c06d0f9b87973721 Mon Sep 17 00:00:00 2001 From: Brad Fitzpatrick Date: Mon, 29 Sep 2025 12:17:52 -0700 Subject: [PATCH 21/33] tailcfg, control/controlclient: start moving MapResponse.DefaultAutoUpdate to a nodeattr And fix up the TestAutoUpdateDefaults integration tests as they weren't testing reality: the DefaultAutoUpdate is supposed to only be relevant on the first MapResponse in the stream, but the tests weren't testing that. They were instead injecting a 2nd+ MapResponse. This changes the test control server to add a hook to modify the first map response, and then makes the test control when the node goes up and down to make new map responses. Also, the test now runs on macOS where the auto-update feature being disabled would've previously t.Skipped the whole test. 
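A condensed fragment of how a test drives the new hook (simplified from the integration test in this patch):

```go
env := NewTestEnv(t)
env.Control.ModifyFirstMapResponse = func(mr *tailcfg.MapResponse, _ *tailcfg.MapRequest) {
	mr.DeprecatedDefaultAutoUpdate = opt.NewBool(true)
}

n := NewTestNode(t, env)
d := n.StartDaemon()
defer d.MustCleanShutdown(t)

// The hook runs once per MapResponse stream, so the default is seen on
// the first response after each MustUp; flipping the value between a
// MustDown/MustUp pair is how the tests exercise "first response only".
n.MustUp()
n.AwaitRunning()
```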
Updates #11502 Change-Id: If2319bd1f71e108b57d79fe500b2acedbc76e1a6 Signed-off-by: Brad Fitzpatrick --- cmd/vet/jsontags_allowlist | 2 +- control/controlclient/direct.go | 14 ++- feature/feature.go | 15 +++ feature/hooks.go | 9 ++ tailcfg/tailcfg.go | 23 +++- tstest/integration/integration.go | 4 + tstest/integration/integration_test.go | 112 ++++++++++++------ tstest/integration/testcontrol/testcontrol.go | 9 ++ types/netmap/nodemut.go | 2 +- 9 files changed, 147 insertions(+), 43 deletions(-) diff --git a/cmd/vet/jsontags_allowlist b/cmd/vet/jsontags_allowlist index 060a81b05..9526f44ef 100644 --- a/cmd/vet/jsontags_allowlist +++ b/cmd/vet/jsontags_allowlist @@ -107,7 +107,7 @@ OmitEmptyShouldBeOmitZero tailscale.com/tailcfg.MapResponse.ClientVersion OmitEmptyShouldBeOmitZero tailscale.com/tailcfg.MapResponse.CollectServices OmitEmptyShouldBeOmitZero tailscale.com/tailcfg.MapResponse.ControlDialPlan OmitEmptyShouldBeOmitZero tailscale.com/tailcfg.MapResponse.Debug -OmitEmptyShouldBeOmitZero tailscale.com/tailcfg.MapResponse.DefaultAutoUpdate +OmitEmptyShouldBeOmitZero tailscale.com/tailcfg.MapResponse.DeprecatedDefaultAutoUpdate OmitEmptyShouldBeOmitZero tailscale.com/tailcfg.MapResponse.DERPMap OmitEmptyShouldBeOmitZero tailscale.com/tailcfg.MapResponse.DNSConfig OmitEmptyShouldBeOmitZero tailscale.com/tailcfg.MapResponse.Node diff --git a/control/controlclient/direct.go b/control/controlclient/direct.go index 006a801ef..d5cd6a13e 100644 --- a/control/controlclient/direct.go +++ b/control/controlclient/direct.go @@ -1184,7 +1184,19 @@ func (c *Direct) sendMapRequest(ctx context.Context, isStreaming bool, nu Netmap metricMapResponseKeepAlives.Add(1) continue } - if au, ok := resp.DefaultAutoUpdate.Get(); ok { + + // DefaultAutoUpdate in its CapMap and deprecated top-level field forms. + if self := resp.Node; self != nil { + for _, v := range self.CapMap[tailcfg.NodeAttrDefaultAutoUpdate] { + switch v { + case "true", "false": + c.autoUpdatePub.Publish(AutoUpdate{c.controlClientID, v == "true"}) + default: + c.logf("netmap: [unexpected] unknown %s in CapMap: %q", tailcfg.NodeAttrDefaultAutoUpdate, v) + } + } + } + if au, ok := resp.DeprecatedDefaultAutoUpdate.Get(); ok { c.autoUpdatePub.Publish(AutoUpdate{c.controlClientID, au}) } diff --git a/feature/feature.go b/feature/feature.go index 110b104da..48a4aff43 100644 --- a/feature/feature.go +++ b/feature/feature.go @@ -7,6 +7,8 @@ package feature import ( "errors" "reflect" + + "tailscale.com/util/testenv" ) var ErrUnavailable = errors.New("feature not included in this build") @@ -55,6 +57,19 @@ func (h *Hook[Func]) Set(f Func) { h.ok = true } +// SetForTest sets the hook function for tests, blowing +// away any previous value. It will panic if called from +// non-test code. +// +// It returns a restore function that resets the hook +// to its previous value. +func (h *Hook[Func]) SetForTest(f Func) (restore func()) { + testenv.AssertInTest() + old := *h + h.f, h.ok = f, true + return func() { *h = old } +} + // Get returns the hook function, or panics if it hasn't been set. // Use IsSet to check if it's been set, or use GetOrNil if you're // okay with a nil return value. diff --git a/feature/hooks.go b/feature/hooks.go index a3c6c0395..7e31061a7 100644 --- a/feature/hooks.go +++ b/feature/hooks.go @@ -6,6 +6,8 @@ package feature import ( "net/http" "net/url" + "os" + "sync" "tailscale.com/types/logger" "tailscale.com/types/persist" @@ -15,9 +17,16 @@ import ( // to conditionally initialize. 
var HookCanAutoUpdate Hook[func() bool] +var testAllowAutoUpdate = sync.OnceValue(func() bool { + return os.Getenv("TS_TEST_ALLOW_AUTO_UPDATE") == "1" +}) + // CanAutoUpdate reports whether the current binary is built with auto-update // support and, if so, whether the current platform supports it. func CanAutoUpdate() bool { + if testAllowAutoUpdate() { + return true + } if f, ok := HookCanAutoUpdate.GetOk(); ok { return f() } diff --git a/tailcfg/tailcfg.go b/tailcfg/tailcfg.go index 41e0a0b28..8468aa09e 100644 --- a/tailcfg/tailcfg.go +++ b/tailcfg/tailcfg.go @@ -177,7 +177,8 @@ type CapabilityVersion int // - 128: 2025-10-02: can handle C2N /debug/health. // - 129: 2025-10-04: Fixed sleep/wake deadlock in magicsock when using peer relay (PR #17449) // - 130: 2025-10-06: client can send key.HardwareAttestationPublic and key.HardwareAttestationKeySignature in MapRequest -const CurrentCapabilityVersion CapabilityVersion = 130 +// - 131: 2025-11-25: client respects [NodeAttrDefaultAutoUpdate] +const CurrentCapabilityVersion CapabilityVersion = 131 // ID is an integer ID for a user, node, or login allocated by the // control plane. @@ -2149,12 +2150,14 @@ type MapResponse struct { // or nothing to report. ClientVersion *ClientVersion `json:",omitempty"` - // DefaultAutoUpdate is the default node auto-update setting for this + // DeprecatedDefaultAutoUpdate is the default node auto-update setting for this // tailnet. The node is free to opt-in or out locally regardless of this - // value. This value is only used on first MapResponse from control, the - // auto-update setting doesn't change if the tailnet admin flips the - // default after the node registered. - DefaultAutoUpdate opt.Bool `json:",omitempty"` + // value. Once this value has been set and stored in the client, future + // changes from the control plane are ignored. + // + // Deprecated: use NodeAttrDefaultAutoUpdate instead. See + // https://github.com/tailscale/tailscale/issues/11502. + DeprecatedDefaultAutoUpdate opt.Bool `json:"DefaultAutoUpdate,omitempty"` } // DisplayMessage represents a health state of the node from the control plane's @@ -2721,6 +2724,14 @@ const ( // default behavior is to trust the control plane when it claims that a // node is no longer online, but that is not a reliable signal. NodeAttrClientSideReachability = "client-side-reachability" + + // NodeAttrDefaultAutoUpdate advertises the default node auto-update setting + // for this tailnet. The node is free to opt-in or out locally regardless of + // this value. Once this has been set and stored in the client, future + // changes from the control plane are ignored. + // + // The value of the key in [NodeCapMap] is a JSON boolean. + NodeAttrDefaultAutoUpdate NodeCapability = "default-auto-update" ) // SetDNSRequest is a request to add a DNS record. 
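For reference, the two encodings a client can now see for the tailnet default, side by side (values illustrative; a real MapResponse carries much more):

```go
res := &tailcfg.MapResponse{
	// New form: the default carried as a JSON boolean under the
	// node attribute in the self node's CapMap (capver 131+).
	Node: &tailcfg.Node{
		CapMap: tailcfg.NodeCapMap{
			tailcfg.NodeAttrDefaultAutoUpdate: {tailcfg.RawMessage("true")},
		},
	},
	// Old form: the deprecated top-level field, still emitted for
	// older clients during the migration.
	DeprecatedDefaultAutoUpdate: opt.NewBool(true),
}
```

As the direct.go change above shows, only the literal strings `"true"` and `"false"` are accepted in the capability form; anything else is logged and ignored.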
diff --git a/tstest/integration/integration.go b/tstest/integration/integration.go index 6700205cf..ea5747b7d 100644 --- a/tstest/integration/integration.go +++ b/tstest/integration/integration.go @@ -576,6 +576,7 @@ type TestNode struct { stateFile string upFlagGOOS string // if non-empty, sets TS_DEBUG_UP_FLAG_GOOS for cmd/tailscale CLI encryptState bool + allowUpdates bool mu sync.Mutex onLogLine []func([]byte) @@ -840,6 +841,9 @@ func (n *TestNode) StartDaemonAsIPNGOOS(ipnGOOS string) *Daemon { "TS_DISABLE_PORTMAPPER=1", // shouldn't be needed; test is all localhost "TS_DEBUG_LOG_RATE=all", ) + if n.allowUpdates { + cmd.Env = append(cmd.Env, "TS_TEST_ALLOW_AUTO_UPDATE=1") + } if n.env.loopbackPort != nil { cmd.Env = append(cmd.Env, "TS_DEBUG_NETSTACK_LOOPBACK_PORT="+strconv.Itoa(*n.env.loopbackPort)) } diff --git a/tstest/integration/integration_test.go b/tstest/integration/integration_test.go index 543dc125c..3739a3011 100644 --- a/tstest/integration/integration_test.go +++ b/tstest/integration/integration_test.go @@ -25,6 +25,7 @@ import ( "slices" "strconv" "strings" + "sync" "sync/atomic" "testing" "time" @@ -1412,14 +1413,27 @@ func TestLogoutRemovesAllPeers(t *testing.T) { wantNode0PeerCount(expectedPeers) // all existing peers and the new node } -func TestAutoUpdateDefaults(t *testing.T) { - if !feature.CanAutoUpdate() { - t.Skip("auto-updates not supported on this platform") - } +func TestAutoUpdateDefaults(t *testing.T) { testAutoUpdateDefaults(t, false) } +func TestAutoUpdateDefaults_cap(t *testing.T) { testAutoUpdateDefaults(t, true) } + +// useCap is whether to use NodeAttrDefaultAutoUpdate (as opposed to the old +// DeprecatedDefaultAutoUpdate top-level MapResponse field). +func testAutoUpdateDefaults(t *testing.T, useCap bool) { + t.Cleanup(feature.HookCanAutoUpdate.SetForTest(func() bool { return true })) + tstest.Shard(t) - tstest.Parallel(t) env := NewTestEnv(t) + var ( + modifyMu sync.Mutex + modifyFirstMapResponse = func(*tailcfg.MapResponse, *tailcfg.MapRequest) {} + ) + env.Control.ModifyFirstMapResponse = func(mr *tailcfg.MapResponse, req *tailcfg.MapRequest) { + modifyMu.Lock() + defer modifyMu.Unlock() + modifyFirstMapResponse(mr, req) + } + checkDefault := func(n *TestNode, want bool) error { enabled, ok := n.diskPrefs().AutoUpdate.Apply.Get() if !ok { @@ -1431,17 +1445,23 @@ func TestAutoUpdateDefaults(t *testing.T) { return nil } - sendAndCheckDefault := func(t *testing.T, n *TestNode, send, want bool) { - t.Helper() - if !env.Control.AddRawMapResponse(n.MustStatus().Self.PublicKey, &tailcfg.MapResponse{ - DefaultAutoUpdate: opt.NewBool(send), - }) { - t.Fatal("failed to send MapResponse to node") - } - if err := tstest.WaitFor(2*time.Second, func() error { - return checkDefault(n, want) - }); err != nil { - t.Fatal(err) + setDefaultAutoUpdate := func(send bool) { + modifyMu.Lock() + defer modifyMu.Unlock() + modifyFirstMapResponse = func(mr *tailcfg.MapResponse, req *tailcfg.MapRequest) { + if mr.Node == nil { + mr.Node = &tailcfg.Node{} + } + if useCap { + if mr.Node.CapMap == nil { + mr.Node.CapMap = make(tailcfg.NodeCapMap) + } + mr.Node.CapMap[tailcfg.NodeAttrDefaultAutoUpdate] = []tailcfg.RawMessage{ + tailcfg.RawMessage(fmt.Sprintf("%t", send)), + } + } else { + mr.DeprecatedDefaultAutoUpdate = opt.NewBool(send) + } } } @@ -1452,29 +1472,54 @@ func TestAutoUpdateDefaults(t *testing.T) { { desc: "tailnet-default-false", run: func(t *testing.T, n *TestNode) { - // First received default "false". 
- sendAndCheckDefault(t, n, false, false) - // Should not be changed even if sent "true" later. - sendAndCheckDefault(t, n, true, false) + + // First the server sends "false", and client should remember that. + setDefaultAutoUpdate(false) + n.MustUp() + n.AwaitRunning() + checkDefault(n, false) + + // Now we disconnect and change the server to send "true", which + // the client should ignore, having previously remembered + // "false". + n.MustDown() + setDefaultAutoUpdate(true) // control sends default "true" + n.MustUp() + n.AwaitRunning() + checkDefault(n, false) // still false + // But can be changed explicitly by the user. if out, err := n.TailscaleForOutput("set", "--auto-update").CombinedOutput(); err != nil { t.Fatalf("failed to enable auto-update on node: %v\noutput: %s", err, out) } - sendAndCheckDefault(t, n, false, true) + checkDefault(n, true) }, }, { desc: "tailnet-default-true", run: func(t *testing.T, n *TestNode) { - // First received default "true". - sendAndCheckDefault(t, n, true, true) - // Should not be changed even if sent "false" later. - sendAndCheckDefault(t, n, false, true) + // Same as above but starting with default "true". + + // First the server sends "true", and client should remember that. + setDefaultAutoUpdate(true) + n.MustUp() + n.AwaitRunning() + checkDefault(n, true) + + // Now we disconnect and change the server to send "false", which + // the client should ignore, having previously remembered + // "true". + n.MustDown() + setDefaultAutoUpdate(false) // control sends default "false" + n.MustUp() + n.AwaitRunning() + checkDefault(n, true) // still true + // But can be changed explicitly by the user. if out, err := n.TailscaleForOutput("set", "--auto-update=false").CombinedOutput(); err != nil { - t.Fatalf("failed to disable auto-update on node: %v\noutput: %s", err, out) + t.Fatalf("failed to enable auto-update on node: %v\noutput: %s", err, out) } - sendAndCheckDefault(t, n, true, false) + checkDefault(n, false) }, }, { @@ -1484,22 +1529,21 @@ func TestAutoUpdateDefaults(t *testing.T) { if out, err := n.TailscaleForOutput("set", "--auto-update=false").CombinedOutput(); err != nil { t.Fatalf("failed to disable auto-update on node: %v\noutput: %s", err, out) } - // Defaults sent from control should be ignored. - sendAndCheckDefault(t, n, true, false) - sendAndCheckDefault(t, n, false, false) + + setDefaultAutoUpdate(true) + n.MustUp() + n.AwaitRunning() + checkDefault(n, false) }, }, } for _, tt := range tests { t.Run(tt.desc, func(t *testing.T) { n := NewTestNode(t, env) + n.allowUpdates = true d := n.StartDaemon() defer d.MustCleanShutdown(t) - n.AwaitResponding() - n.MustUp() - n.AwaitRunning() - tt.run(t, n) }) } diff --git a/tstest/integration/testcontrol/testcontrol.go b/tstest/integration/testcontrol/testcontrol.go index 268f2f19b..d0959ff25 100644 --- a/tstest/integration/testcontrol/testcontrol.go +++ b/tstest/integration/testcontrol/testcontrol.go @@ -79,6 +79,10 @@ type Server struct { ExplicitBaseURL string // e.g. "http://127.0.0.1:1234" with no trailing URL HTTPTestServer *httptest.Server // if non-nil, used to get BaseURL + // ModifyFirstMapResponse, if non-nil, is called exactly once per + // MapResponse stream to modify the first MapResponse sent in response to it. + ModifyFirstMapResponse func(*tailcfg.MapResponse, *tailcfg.MapRequest) + initMuxOnce sync.Once mux *http.ServeMux @@ -993,6 +997,7 @@ func (s *Server) serveMap(w http.ResponseWriter, r *http.Request, mkey key.Machi // register an updatesCh to get updates. 
streaming := req.Stream && !req.ReadOnly compress := req.Compress != "" + first := true w.WriteHeader(200) for { @@ -1025,6 +1030,10 @@ func (s *Server) serveMap(w http.ResponseWriter, r *http.Request, mkey key.Machi if allExpired { res.Node.KeyExpiry = time.Now().Add(-1 * time.Minute) } + if f := s.ModifyFirstMapResponse; first && f != nil { + first = false + f(res, req) + } // TODO: add minner if/when needed resBytes, err := json.Marshal(res) if err != nil { diff --git a/types/netmap/nodemut.go b/types/netmap/nodemut.go index f4de1bf0b..4f93be21c 100644 --- a/types/netmap/nodemut.go +++ b/types/netmap/nodemut.go @@ -177,5 +177,5 @@ func mapResponseContainsNonPatchFields(res *tailcfg.MapResponse) bool { // function is called, so it should never be set anyway. But for // completedness, and for tests, check it too: res.PeersChanged != nil || - res.DefaultAutoUpdate != "" + res.DeprecatedDefaultAutoUpdate != "" } From f4a4bab105a89da491bb9f5ae1effb9b4f44b7f2 Mon Sep 17 00:00:00 2001 From: Fran Bull Date: Fri, 21 Nov 2025 08:12:20 -0800 Subject: [PATCH 22/33] tsconsensus: skip integration tests in CI There is an issue to add non-integration tests: #18022 Fixes #15627 #16340 Signed-off-by: Fran Bull --- tsconsensus/tsconsensus_test.go | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/tsconsensus/tsconsensus_test.go b/tsconsensus/tsconsensus_test.go index 7f89eb48a..796c8f51b 100644 --- a/tsconsensus/tsconsensus_test.go +++ b/tsconsensus/tsconsensus_test.go @@ -17,7 +17,6 @@ import ( "net/netip" "os" "path/filepath" - "runtime" "strings" "sync" "testing" @@ -27,7 +26,6 @@ import ( "github.com/hashicorp/go-hclog" "github.com/hashicorp/raft" "tailscale.com/client/tailscale" - "tailscale.com/cmd/testwrapper/flakytest" "tailscale.com/ipn/store/mem" "tailscale.com/net/netns" "tailscale.com/tailcfg" @@ -115,8 +113,8 @@ func (f *fsm) Restore(rc io.ReadCloser) error { } func testConfig(t *testing.T) { - if runtime.GOOS == "windows" && cibuild.On() { - t.Skip("cmd/natc isn't supported on Windows, so skipping tsconsensus tests on CI for now; see https://github.com/tailscale/tailscale/issues/16340") + if cibuild.On() { + t.Skip("these integration tests don't always work well in CI and that's bad for CI; see https://github.com/tailscale/tailscale/issues/16340 and https://github.com/tailscale/tailscale/issues/18022") } // -race AND Parallel makes things start to take too long. 
if !racebuild.On { @@ -251,7 +249,6 @@ func warnLogConfig() Config { } func TestStart(t *testing.T) { - flakytest.Mark(t, "https://github.com/tailscale/tailscale/issues/15627") testConfig(t) control, controlURL := startControl(t) ctx := context.Background() @@ -372,7 +369,6 @@ func createConsensusCluster(t testing.TB, ctx context.Context, clusterTag string } func TestApply(t *testing.T) { - flakytest.Mark(t, "https://github.com/tailscale/tailscale/issues/15627") testConfig(t) ctx := context.Background() clusterTag := "tag:whatever" @@ -437,7 +433,6 @@ func assertCommandsWorkOnAnyNode(t testing.TB, participants []*participant) { } func TestConfig(t *testing.T) { - flakytest.Mark(t, "https://github.com/tailscale/tailscale/issues/15627") testConfig(t) ctx := context.Background() clusterTag := "tag:whatever" @@ -477,7 +472,6 @@ func TestConfig(t *testing.T) { } func TestFollowerFailover(t *testing.T) { - flakytest.Mark(t, "https://github.com/tailscale/tailscale/issues/15627") testConfig(t) ctx := context.Background() clusterTag := "tag:whatever" @@ -549,7 +543,6 @@ func TestFollowerFailover(t *testing.T) { } func TestRejoin(t *testing.T) { - flakytest.Mark(t, "https://github.com/tailscale/tailscale/issues/15627") testConfig(t) ctx := context.Background() clusterTag := "tag:whatever" @@ -585,7 +578,6 @@ func TestRejoin(t *testing.T) { } func TestOnlyTaggedPeersCanDialRaftPort(t *testing.T) { - flakytest.Mark(t, "https://github.com/tailscale/tailscale/issues/15627") testConfig(t) ctx := context.Background() clusterTag := "tag:whatever" @@ -643,7 +635,6 @@ func TestOnlyTaggedPeersCanDialRaftPort(t *testing.T) { } func TestOnlyTaggedPeersCanBeDialed(t *testing.T) { - flakytest.Mark(t, "https://github.com/tailscale/tailscale/issues/15627") testConfig(t) ctx := context.Background() clusterTag := "tag:whatever" From b38dd1ae06c456fcd65e31e642990a5f1520c63b Mon Sep 17 00:00:00 2001 From: Alex Chan Date: Tue, 25 Nov 2025 10:22:08 +0000 Subject: [PATCH 23/33] ipn/ipnlocal: don't panic if there are no suitable exit nodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In suggestExitNodeLocked, if no exit node candidates have a home DERP or valid location info, `bestCandidates` is an empty slice. This slice is passed to `selectNode` (`randomNode` in prod): ```go func randomNode(nodes views.Slice[tailcfg.NodeView], …) tailcfg.NodeView { … return nodes.At(rand.IntN(nodes.Len())) } ``` An empty slice becomes a call to `rand.IntN(0)`, which panics. This patch changes the behaviour, so if we've filtered out all the candidates before calling `selectNode`, reset the list and then pick from any of the available candidates. This patch also updates our tests to give us more coverage of `randomNode`, so we can spot other potential issues. Updates #17661 Change-Id: I63eb5e4494d45a1df5b1f4b1b5c6d5576322aa72 Signed-off-by: Alex Chan --- ipn/ipnlocal/local.go | 10 +++++++ ipn/ipnlocal/local_test.go | 56 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index defa558ed..3e7054896 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -7432,6 +7432,16 @@ func suggestExitNodeUsingDERP(report *netcheck.Report, nb *nodeBackend, prevSugg } } bestCandidates := pickWeighted(pickFrom) + + // We may have an empty list of candidates here, if none of the candidates + // have home DERP info. 
+ // + // We know that candidates is non-empty or we'd already have returned, so if + // we've filtered everything out of bestCandidates, just use candidates. + if len(bestCandidates) == 0 { + bestCandidates = candidates + } + chosen := selectNode(views.SliceOf(bestCandidates), prevSuggestion) if !chosen.Valid() { return res, errors.New("chosen candidate invalid: this is a bug") diff --git a/ipn/ipnlocal/local_test.go b/ipn/ipnlocal/local_test.go index 68bb2618c..02997a0e1 100644 --- a/ipn/ipnlocal/local_test.go +++ b/ipn/ipnlocal/local_test.go @@ -4436,6 +4436,14 @@ func deterministicRegionForTest(t testing.TB, want views.Slice[int], use int) se } } +// deterministicNodeForTest returns a deterministic selectNodeFunc, which +// allows us to make stable assertions about which exit node will be chosen +// from a list of possible candidates. +// +// When given a list of candidates, it checks that `use` is in the list and +// returns that. +// +// It verifies that `wantLast` was passed to `selectNode(…, want)`. func deterministicNodeForTest(t testing.TB, want views.Slice[tailcfg.StableNodeID], wantLast tailcfg.StableNodeID, use tailcfg.StableNodeID) selectNodeFunc { t.Helper() @@ -4444,6 +4452,16 @@ func deterministicNodeForTest(t testing.TB, want views.Slice[tailcfg.StableNodeI } return func(got views.Slice[tailcfg.NodeView], last tailcfg.StableNodeID) tailcfg.NodeView { + // In the tests, we choose nodes deterministically so we can get + // stable results, but in the real code, we choose nodes randomly. + // + // Call the randomNode function anyway, and ensure it returns + // a sensible result. + view := randomNode(got, last) + if !views.SliceContains(got, view) { + t.Fatalf("randomNode returns an unexpected node") + } + var ret tailcfg.NodeView gotIDs := make([]tailcfg.StableNodeID, got.Len()) @@ -4529,6 +4547,7 @@ func TestSuggestExitNode(t *testing.T) { Longitude: -97.3325, Priority: 100, } + var emptyLocation *tailcfg.Location peer1 := makePeer(1, withExitRoutes(), @@ -4568,6 +4587,18 @@ func TestSuggestExitNode(t *testing.T) { withExitRoutes(), withSuggest(), withLocation(fortWorthLowPriority.View())) + emptyLocationPeer9 := makePeer(9, + withoutDERP(), + withExitRoutes(), + withSuggest(), + withLocation(emptyLocation.View()), + ) + emptyLocationPeer10 := makePeer(10, + withoutDERP(), + withExitRoutes(), + withSuggest(), + withLocation(emptyLocation.View()), + ) selfNode := tailcfg.Node{ Addresses: []netip.Prefix{ @@ -4898,6 +4929,31 @@ func TestSuggestExitNode(t *testing.T) { wantName: "San Jose", wantLocation: sanJose.View(), }, + { + // Regression test for https://github.com/tailscale/tailscale/issues/17661 + name: "exit nodes with no home DERP, randomly selected", + lastReport: &netcheck.Report{ + RegionLatency: map[int]time.Duration{ + 1: 10, + 2: 20, + 3: 10, + }, + PreferredDERP: 1, + }, + netMap: &netmap.NetworkMap{ + SelfNode: selfNode.View(), + DERPMap: defaultDERPMap, + Peers: []tailcfg.NodeView{ + emptyLocationPeer9, + emptyLocationPeer10, + }, + }, + wantRegions: []int{1, 2}, + wantName: "peer9", + wantNodes: []tailcfg.StableNodeID{"stable9", "stable10"}, + wantID: "stable9", + useRegion: 1, + }, } for _, tt := range tests { From c54d243690817d664b03ba0139d7930388e62b33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Claus=20Lensb=C3=B8l?= Date: Tue, 25 Nov 2025 21:35:38 +0100 Subject: [PATCH 24/33] net/tstun: add TSMPDiscoAdvertisement to TSMPPing (#17995) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a new types of TSMP messages 
for advertising disco keys to/from a peer, and implements the advertising
triggered by a TSMP ping.

Needed as part of the effort to cache the netmap and still let clients
connect without control being reachable.

Updates #12639

Signed-off-by: Claus Lensbøl
Co-authored-by: James Tucker
---
 net/packet/tsmp.go                   | 55 +++++++++++++++++++++++
 net/packet/tsmp_test.go              | 65 ++++++++++++++++++++++++++++
 net/tstun/wrap.go                    | 29 ++++++++++---
 net/tstun/wrap_test.go               | 62 ++++++++++++++++++++------
 wgengine/magicsock/magicsock_test.go |  5 +--
 wgengine/userspace.go                | 31 ++++++++++++-
 wgengine/userspace_test.go           | 58 +++++++++++++++++++++++++
 7 files changed, 280 insertions(+), 25 deletions(-)

diff --git a/net/packet/tsmp.go b/net/packet/tsmp.go
index 0ea321e84..8fad1d503 100644
--- a/net/packet/tsmp.go
+++ b/net/packet/tsmp.go
@@ -15,7 +15,9 @@ import (
 	"fmt"
 	"net/netip"
 
+	"go4.org/mem"
 	"tailscale.com/types/ipproto"
+	"tailscale.com/types/key"
 )
 
 const minTSMPSize = 7 // the rejected body is 7 bytes
@@ -72,6 +74,9 @@ const (
 
 	// TSMPTypePong is the type byte for a TailscalePongResponse.
 	TSMPTypePong TSMPType = 'o'
+
+	// TSMPTypeDiscoAdvertisement is the type byte for sending disco keys.
+	TSMPTypeDiscoAdvertisement TSMPType = 'a'
 )
 
 type TailscaleRejectReason byte
@@ -259,3 +264,53 @@ func (h TSMPPongReply) Marshal(buf []byte) error {
 	binary.BigEndian.PutUint16(buf[9:11], h.PeerAPIPort)
 	return nil
 }
+
+// TSMPDiscoKeyAdvertisement is a TSMP message that's used for distributing Disco Keys.
+//
+// On the wire, after the IP header, it's currently 33 bytes:
+// - 'a' (TSMPTypeDiscoAdvertisement)
+// - 32 disco key bytes
+type TSMPDiscoKeyAdvertisement struct {
+	Src, Dst netip.Addr
+	Key      key.DiscoPublic
+}
+
+func (ka *TSMPDiscoKeyAdvertisement) Marshal() ([]byte, error) {
+	var iph Header
+	if ka.Src.Is4() {
+		iph = IP4Header{
+			IPProto: ipproto.TSMP,
+			Src:     ka.Src,
+			Dst:     ka.Dst,
+		}
+	} else {
+		iph = IP6Header{
+			IPProto: ipproto.TSMP,
+			Src:     ka.Src,
+			Dst:     ka.Dst,
+		}
+	}
+	payload := make([]byte, 0, 33)
+	payload = append(payload, byte(TSMPTypeDiscoAdvertisement))
+	payload = ka.Key.AppendTo(payload)
+	if len(payload) != 33 {
+		// Mostly to safeguard against ourselves changing this in the future.
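+		// The expected 33 bytes are the 1-byte TSMP type ('a') followed
+		// by the 32-byte disco public key, as documented above.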
+		return []byte{}, fmt.Errorf("expected payload length 33, got %d", len(payload))
+	}
+
+	return Generate(iph, payload), nil
+}
+
+func (pp *Parsed) AsTSMPDiscoAdvertisement() (tka TSMPDiscoKeyAdvertisement, ok bool) {
+	if pp.IPProto != ipproto.TSMP {
+		return
+	}
+	p := pp.Payload()
+	if len(p) < 33 || p[0] != byte(TSMPTypeDiscoAdvertisement) {
+		return
+	}
+	tka.Src = pp.Src.Addr()
+	tka.Key = key.DiscoPublicFromRaw32(mem.B(p[1:33]))
+
+	return tka, true
+}
diff --git a/net/packet/tsmp_test.go b/net/packet/tsmp_test.go
index e261e6a41..d8f1d38d5 100644
--- a/net/packet/tsmp_test.go
+++ b/net/packet/tsmp_test.go
@@ -4,8 +4,14 @@
 package packet
 
 import (
+	"bytes"
+	"encoding/hex"
 	"net/netip"
+	"slices"
 	"testing"
+
+	"go4.org/mem"
+	"tailscale.com/types/key"
 )
 
 func TestTailscaleRejectedHeader(t *testing.T) {
@@ -71,3 +77,62 @@
 		}
 	}
 }
+
+func TestTSMPDiscoKeyAdvertisementMarshal(t *testing.T) {
+	var (
+		// IPv4: Ver(4)Len(5), TOS, Len(53), ID, Flags, TTL(64), Proto(99), Cksum
+		headerV4, _ = hex.DecodeString("45000035000000004063705d")
+		// IPv6: Ver(6)TCFlow, Len(33), NextHdr(99), HopLim(64)
+		headerV6, _ = hex.DecodeString("6000000000216340")
+
+		packetType = []byte{'a'}
+		testKey    = bytes.Repeat([]byte{'a'}, 32)
+
+		// IPs
+		srcV4 = netip.MustParseAddr("1.2.3.4")
+		dstV4 = netip.MustParseAddr("4.3.2.1")
+		srcV6 = netip.MustParseAddr("2001:db8::1")
+		dstV6 = netip.MustParseAddr("2001:db8::2")
+	)
+
+	join := func(parts ...[]byte) []byte {
+		return bytes.Join(parts, nil)
+	}
+
+	tests := []struct {
+		name string
+		tka  TSMPDiscoKeyAdvertisement
+		want []byte
+	}{
+		{
+			name: "v4Header",
+			tka: TSMPDiscoKeyAdvertisement{
+				Src: srcV4,
+				Dst: dstV4,
+				Key: key.DiscoPublicFromRaw32(mem.B(testKey)),
+			},
+			want: join(headerV4, srcV4.AsSlice(), dstV4.AsSlice(), packetType, testKey),
+		},
+		{
+			name: "v6Header",
+			tka: TSMPDiscoKeyAdvertisement{
+				Src: srcV6,
+				Dst: dstV6,
+				Key: key.DiscoPublicFromRaw32(mem.B(testKey)),
+			},
+			want: join(headerV6, srcV6.AsSlice(), dstV6.AsSlice(), packetType, testKey),
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, err := tt.tka.Marshal()
+			if err != nil {
+				t.Errorf("error marshalling TSMPDiscoAdvertisement: %s", err)
+			}
+			if !slices.Equal(got, tt.want) {
+				t.Errorf("error marshalling TSMPDiscoAdvertisement, expected: \n%x, \ngot:\n%x", tt.want, got)
+			}
+		})
+	}
+}
diff --git a/net/tstun/wrap.go b/net/tstun/wrap.go
index db4f689bf..6e07c7a3d 100644
--- a/net/tstun/wrap.go
+++ b/net/tstun/wrap.go
@@ -34,6 +34,7 @@ import (
 	"tailscale.com/types/logger"
 	"tailscale.com/types/netlogfunc"
 	"tailscale.com/util/clientmetric"
+	"tailscale.com/util/eventbus"
 	"tailscale.com/util/usermetric"
 	"tailscale.com/wgengine/filter"
 	"tailscale.com/wgengine/netstack/gro"
@@ -209,6 +210,9 @@ type Wrapper struct {
 	captureHook syncs.AtomicValue[packet.CaptureCallback]
 
 	metrics *metrics
+
+	eventClient              *eventbus.Client
+	discoKeyAdvertisementPub *eventbus.Publisher[DiscoKeyAdvertisement]
 }
 
 type metrics struct {
@@ -254,15 +258,15 @@ func (w *Wrapper) Start() {
 	close(w.startCh)
 }
 
-func WrapTAP(logf logger.Logf, tdev tun.Device, m *usermetric.Registry) *Wrapper {
-	return wrap(logf, tdev, true, m)
+func WrapTAP(logf logger.Logf, tdev tun.Device, m *usermetric.Registry, bus *eventbus.Bus) *Wrapper {
+	return wrap(logf, tdev, true, m, bus)
 }
 
-func Wrap(logf logger.Logf, tdev tun.Device, m *usermetric.Registry) *Wrapper {
-	return wrap(logf, tdev, false, m)
+func Wrap(logf logger.Logf, tdev tun.Device, m 
*usermetric.Registry, bus *eventbus.Bus) *Wrapper { + return wrap(logf, tdev, false, m, bus) } -func wrap(logf logger.Logf, tdev tun.Device, isTAP bool, m *usermetric.Registry) *Wrapper { +func wrap(logf logger.Logf, tdev tun.Device, isTAP bool, m *usermetric.Registry, bus *eventbus.Bus) *Wrapper { logf = logger.WithPrefix(logf, "tstun: ") w := &Wrapper{ logf: logf, @@ -283,6 +287,9 @@ func wrap(logf logger.Logf, tdev tun.Device, isTAP bool, m *usermetric.Registry) metrics: registerMetrics(m), } + w.eventClient = bus.Client("net.tstun") + w.discoKeyAdvertisementPub = eventbus.Publish[DiscoKeyAdvertisement](w.eventClient) + w.vectorBuffer = make([][]byte, tdev.BatchSize()) for i := range w.vectorBuffer { w.vectorBuffer[i] = make([]byte, maxBufferSize) @@ -357,6 +364,7 @@ func (t *Wrapper) Close() error { close(t.vectorOutbound) t.outboundMu.Unlock() err = t.tdev.Close() + t.eventClient.Close() }) return err } @@ -1118,6 +1126,11 @@ func (t *Wrapper) injectedRead(res tunInjectedRead, outBuffs [][]byte, sizes []i return n, err } +type DiscoKeyAdvertisement struct { + Src netip.Addr + Key key.DiscoPublic +} + func (t *Wrapper) filterPacketInboundFromWireGuard(p *packet.Parsed, captHook packet.CaptureCallback, pc *peerConfigTable, gro *gro.GRO) (filter.Response, *gro.GRO) { if captHook != nil { captHook(packet.FromPeer, t.now(), p.Buffer(), p.CaptureMeta) @@ -1128,6 +1141,12 @@ func (t *Wrapper) filterPacketInboundFromWireGuard(p *packet.Parsed, captHook pa t.noteActivity() t.injectOutboundPong(p, pingReq) return filter.DropSilently, gro + } else if discoKeyAdvert, ok := p.AsTSMPDiscoAdvertisement(); ok { + t.discoKeyAdvertisementPub.Publish(DiscoKeyAdvertisement{ + Src: discoKeyAdvert.Src, + Key: discoKeyAdvert.Key, + }) + return filter.DropSilently, gro } else if data, ok := p.AsTSMPPong(); ok { if f := t.OnTSMPPongReceived; f != nil { f(data) diff --git a/net/tstun/wrap_test.go b/net/tstun/wrap_test.go index 75cf5afb2..c7d0708df 100644 --- a/net/tstun/wrap_test.go +++ b/net/tstun/wrap_test.go @@ -36,6 +36,8 @@ import ( "tailscale.com/types/netlogtype" "tailscale.com/types/ptr" "tailscale.com/types/views" + "tailscale.com/util/eventbus" + "tailscale.com/util/eventbus/eventbustest" "tailscale.com/util/must" "tailscale.com/util/usermetric" "tailscale.com/wgengine/filter" @@ -170,10 +172,10 @@ func setfilter(logf logger.Logf, tun *Wrapper) { tun.SetFilter(filter.New(matches, nil, ipSet, ipSet, nil, logf)) } -func newChannelTUN(logf logger.Logf, secure bool) (*tuntest.ChannelTUN, *Wrapper) { +func newChannelTUN(logf logger.Logf, bus *eventbus.Bus, secure bool) (*tuntest.ChannelTUN, *Wrapper) { chtun := tuntest.NewChannelTUN() reg := new(usermetric.Registry) - tun := Wrap(logf, chtun.TUN(), reg) + tun := Wrap(logf, chtun.TUN(), reg, bus) if secure { setfilter(logf, tun) } else { @@ -183,10 +185,10 @@ func newChannelTUN(logf logger.Logf, secure bool) (*tuntest.ChannelTUN, *Wrapper return chtun, tun } -func newFakeTUN(logf logger.Logf, secure bool) (*fakeTUN, *Wrapper) { +func newFakeTUN(logf logger.Logf, bus *eventbus.Bus, secure bool) (*fakeTUN, *Wrapper) { ftun := NewFake() reg := new(usermetric.Registry) - tun := Wrap(logf, ftun, reg) + tun := Wrap(logf, ftun, reg, bus) if secure { setfilter(logf, tun) } else { @@ -196,7 +198,8 @@ func newFakeTUN(logf logger.Logf, secure bool) (*fakeTUN, *Wrapper) { } func TestReadAndInject(t *testing.T) { - chtun, tun := newChannelTUN(t.Logf, false) + bus := eventbustest.NewBus(t) + chtun, tun := newChannelTUN(t.Logf, bus, false) defer tun.Close() const size = 
2 // all payloads have this size
@@ -221,7 +224,7 @@ func TestReadAndInject(t *testing.T) {
 	}
 
 	var buf [MaxPacketSize]byte
-	var seen = make(map[string]bool)
+	seen := make(map[string]bool)
 	sizes := make([]int, 1)
 	// We expect the same packets back, in no particular order.
 	for i := range len(written) + len(injected) {
@@ -257,7 +260,8 @@ func TestReadAndInject(t *testing.T) {
 }
 
 func TestWriteAndInject(t *testing.T) {
-	chtun, tun := newChannelTUN(t.Logf, false)
+	bus := eventbustest.NewBus(t)
+	chtun, tun := newChannelTUN(t.Logf, bus, false)
 	defer tun.Close()
 
 	written := []string{"w0", "w1"}
@@ -316,8 +320,8 @@ func mustHexDecode(s string) []byte {
 }
 
 func TestFilter(t *testing.T) {
-
-	chtun, tun := newChannelTUN(t.Logf, true)
+	bus := eventbustest.NewBus(t)
+	chtun, tun := newChannelTUN(t.Logf, bus, true)
 	defer tun.Close()
 
 	// Reset the metrics before test. These are global
@@ -462,7 +466,8 @@ func assertMetricPackets(t *testing.T, metricName string, want, got int64) {
 }
 
 func TestAllocs(t *testing.T) {
-	ftun, tun := newFakeTUN(t.Logf, false)
+	bus := eventbustest.NewBus(t)
+	ftun, tun := newFakeTUN(t.Logf, bus, false)
 	defer tun.Close()
 
 	buf := [][]byte{{0x00}}
@@ -473,14 +478,14 @@ func TestAllocs(t *testing.T) {
 			return
 		}
 	})
-
 	if err != nil {
 		t.Error(err)
 	}
 }
 
 func TestClose(t *testing.T) {
-	ftun, tun := newFakeTUN(t.Logf, false)
+	bus := eventbustest.NewBus(t)
+	ftun, tun := newFakeTUN(t.Logf, bus, false)
 
 	data := [][]byte{udp4("1.2.3.4", "5.6.7.8", 98, 98)}
 	_, err := ftun.Write(data, 0)
@@ -497,7 +502,8 @@ func TestClose(t *testing.T) {
 
 func BenchmarkWrite(b *testing.B) {
 	b.ReportAllocs()
-	ftun, tun := newFakeTUN(b.Logf, true)
+	bus := eventbustest.NewBus(b)
+	ftun, tun := newFakeTUN(b.Logf, bus, true)
 	defer tun.Close()
 
 	packet := [][]byte{udp4("5.6.7.8", "1.2.3.4", 89, 89)}
@@ -887,7 +893,8 @@ func TestCaptureHook(t *testing.T) {
 
 	now := time.Unix(1682085856, 0)
 
-	_, w := newFakeTUN(t.Logf, true)
+	bus := eventbustest.NewBus(t)
+	_, w := newFakeTUN(t.Logf, bus, true)
 	w.timeNow = func() time.Time {
 		return now
 	}
@@ -957,3 +964,30 @@ func TestCaptureHook(t *testing.T) {
 			captured, want)
 	}
 }
+
+func TestTSMPDisco(t *testing.T) {
+	t.Run("IPv6DiscoAdvert", func(t *testing.T) {
+		src := netip.MustParseAddr("2001:db8::1")
+		dst := netip.MustParseAddr("2001:db8::2")
+		discoKey := key.NewDisco()
+		buf, _ := (&packet.TSMPDiscoKeyAdvertisement{
+			Src: src,
+			Dst: dst,
+			Key: discoKey.Public(),
+		}).Marshal()
+
+		var p packet.Parsed
+		p.Decode(buf)
+
+		tda, ok := p.AsTSMPDiscoAdvertisement()
+		if !ok {
+			t.Error("Unable to parse message as TSMPDiscoAdvertisement")
+		}
+		if tda.Src != src {
+			t.Errorf("Src address did not match, expected %v, got %v", src, tda.Src)
+		}
+		if !reflect.DeepEqual(tda.Key, discoKey.Public()) {
+			t.Errorf("Key did not match, expected %q, got %q", discoKey.Public(), tda.Key)
+		}
+	})
+}
diff --git a/wgengine/magicsock/magicsock_test.go b/wgengine/magicsock/magicsock_test.go
index 7ae422906..4e1024886 100644
--- a/wgengine/magicsock/magicsock_test.go
+++ b/wgengine/magicsock/magicsock_test.go
@@ -211,7 +211,7 @@ func newMagicStackWithKey(t testing.TB, logf logger.Logf, ln nettype.PacketListe
 	}
 
 	tun := tuntest.NewChannelTUN()
-	tsTun := tstun.Wrap(logf, tun.TUN(), &reg)
+	tsTun := tstun.Wrap(logf, tun.TUN(), &reg, bus)
 	tsTun.SetFilter(filter.NewAllowAllForTest(logf))
 	tsTun.Start()
 
@@ -1771,7 +1771,6 @@ func TestEndpointSetsEqual(t *testing.T) {
 			t.Errorf("%q vs %q = %v; want %v", tt.a, tt.b, got, tt.want)
 		}
 	}
-
 }
 
 func TestBetterAddr(t *testing.T) {
@@ -1915,7 +1914,6 @@ func 
TestBetterAddr(t *testing.T) { t.Errorf("[%d] betterAddr(%+v, %+v) and betterAddr(%+v, %+v) both unexpectedly true", i, tt.a, tt.b, tt.b, tt.a) } } - } func epFromTyped(eps []tailcfg.Endpoint) (ret []netip.AddrPort) { @@ -3138,7 +3136,6 @@ func TestMaybeRebindOnError(t *testing.T) { t.Errorf("expected at least 5 seconds between %s and %s", lastRebindTime, newTime) } } - }) }) } diff --git a/wgengine/userspace.go b/wgengine/userspace.go index e4c99ded2..a369fa343 100644 --- a/wgengine/userspace.go +++ b/wgengine/userspace.go @@ -323,9 +323,9 @@ func NewUserspaceEngine(logf logger.Logf, conf Config) (_ Engine, reterr error) var tsTUNDev *tstun.Wrapper if conf.IsTAP { - tsTUNDev = tstun.WrapTAP(logf, conf.Tun, conf.Metrics) + tsTUNDev = tstun.WrapTAP(logf, conf.Tun, conf.Metrics, conf.EventBus) } else { - tsTUNDev = tstun.Wrap(logf, conf.Tun, conf.Metrics) + tsTUNDev = tstun.Wrap(logf, conf.Tun, conf.Metrics, conf.EventBus) } closePool.add(tsTUNDev) @@ -1436,6 +1436,7 @@ func (e *userspaceEngine) Ping(ip netip.Addr, pingType tailcfg.PingType, size in e.magicConn.Ping(peer, res, size, cb) case "TSMP": e.sendTSMPPing(ip, peer, res, cb) + e.sendTSMPDiscoAdvertisement(ip) case "ICMP": e.sendICMPEchoRequest(ip, peer, res, cb) } @@ -1556,6 +1557,29 @@ func (e *userspaceEngine) sendTSMPPing(ip netip.Addr, peer tailcfg.NodeView, res e.tundev.InjectOutbound(tsmpPing) } +func (e *userspaceEngine) sendTSMPDiscoAdvertisement(ip netip.Addr) { + srcIP, err := e.mySelfIPMatchingFamily(ip) + if err != nil { + e.logf("getting matching node: %s", err) + return + } + tdka := packet.TSMPDiscoKeyAdvertisement{ + Src: srcIP, + Dst: ip, + Key: e.magicConn.DiscoPublicKey(), + } + payload, err := tdka.Marshal() + if err != nil { + e.logf("error generating TSMP Advertisement: %s", err) + metricTSMPDiscoKeyAdvertisementError.Add(1) + } else if err := e.tundev.InjectOutbound(payload); err != nil { + e.logf("error sending TSMP Advertisement: %s", err) + metricTSMPDiscoKeyAdvertisementError.Add(1) + } else { + metricTSMPDiscoKeyAdvertisementSent.Add(1) + } +} + func (e *userspaceEngine) setTSMPPongCallback(data [8]byte, cb func(packet.TSMPPongReply)) { e.mu.Lock() defer e.mu.Unlock() @@ -1722,6 +1746,9 @@ var ( metricNumMajorChanges = clientmetric.NewCounter("wgengine_major_changes") metricNumMinorChanges = clientmetric.NewCounter("wgengine_minor_changes") + + metricTSMPDiscoKeyAdvertisementSent = clientmetric.NewCounter("magicsock_tsmp_disco_key_advertisement_sent") + metricTSMPDiscoKeyAdvertisementError = clientmetric.NewCounter("magicsock_tsmp_disco_key_advertisement_error") ) func (e *userspaceEngine) InstallCaptureHook(cb packet.CaptureCallback) { diff --git a/wgengine/userspace_test.go b/wgengine/userspace_test.go index 89d75b98a..0a1d2924d 100644 --- a/wgengine/userspace_test.go +++ b/wgengine/userspace_test.go @@ -325,6 +325,64 @@ func TestUserspaceEnginePeerMTUReconfig(t *testing.T) { } } +func TestTSMPKeyAdvertisement(t *testing.T) { + var knobs controlknobs.Knobs + + bus := eventbustest.NewBus(t) + ht := health.NewTracker(bus) + reg := new(usermetric.Registry) + e, err := NewFakeUserspaceEngine(t.Logf, 0, &knobs, ht, reg, bus) + if err != nil { + t.Fatal(err) + } + t.Cleanup(e.Close) + ue := e.(*userspaceEngine) + routerCfg := &router.Config{} + nodeKey := nkFromHex("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb") + nm := &netmap.NetworkMap{ + Peers: nodeViews([]*tailcfg.Node{ + { + ID: 1, + Key: nodeKey, + }, + }), + SelfNode: (&tailcfg.Node{ + StableID: "TESTCTRL00000001", + Name: 
"test-node.test.ts.net", + Addresses: []netip.Prefix{netip.MustParsePrefix("100.64.0.1/32"), netip.MustParsePrefix("fd7a:115c:a1e0:ab12:4843:cd96:0:1/128")}, + }).View(), + } + cfg := &wgcfg.Config{ + Peers: []wgcfg.Peer{ + { + PublicKey: nodeKey, + AllowedIPs: []netip.Prefix{ + netip.PrefixFrom(netaddr.IPv4(100, 100, 99, 1), 32), + }, + }, + }, + } + + ue.SetNetworkMap(nm) + err = ue.Reconfig(cfg, routerCfg, &dns.Config{}) + if err != nil { + t.Fatal(err) + } + + addr := netip.MustParseAddr("100.100.99.1") + previousValue := metricTSMPDiscoKeyAdvertisementSent.Value() + ue.sendTSMPDiscoAdvertisement(addr) + if val := metricTSMPDiscoKeyAdvertisementSent.Value(); val <= previousValue { + errs := metricTSMPDiscoKeyAdvertisementError.Value() + t.Errorf("Expected 1 disco key advert, got %d, errors %d", val, errs) + } + // Remove config to have the engine shut down more consistently + err = ue.Reconfig(&wgcfg.Config{}, &router.Config{}, &dns.Config{}) + if err != nil { + t.Fatal(err) + } +} + func nkFromHex(hex string) key.NodePublic { if len(hex) != 64 { panic(fmt.Sprintf("%q is len %d; want 64", hex, len(hex))) From 53476ce8721f049250f835335dbcaef558852c9e Mon Sep 17 00:00:00 2001 From: Sachin Iyer Date: Mon, 10 Nov 2025 16:52:26 -0800 Subject: [PATCH 25/33] ipn/serve: validate service paths in HasPathHandler Fixes #17839 Signed-off-by: Sachin Iyer --- ipn/serve.go | 14 ++++++++++++++ ipn/serve_test.go | 30 ++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/ipn/serve.go b/ipn/serve.go index 7ee78ef0d..1f1557889 100644 --- a/ipn/serve.go +++ b/ipn/serve.go @@ -238,6 +238,20 @@ func (sc *ServeConfig) HasPathHandler() bool { } } + if sc.Services != nil { + for _, serviceConfig := range sc.Services { + if serviceConfig.Web != nil { + for _, webServerConfig := range serviceConfig.Web { + for _, httpHandler := range webServerConfig.Handlers { + if httpHandler.Path != "" { + return true + } + } + } + } + } + } + if sc.Foreground != nil { for _, fgConfig := range sc.Foreground { if fgConfig.HasPathHandler() { diff --git a/ipn/serve_test.go b/ipn/serve_test.go index 063ff3a87..5e0f4a43a 100644 --- a/ipn/serve_test.go +++ b/ipn/serve_test.go @@ -117,6 +117,36 @@ func TestHasPathHandler(t *testing.T) { }, want: false, }, + { + name: "with-service-path-handler", + cfg: ServeConfig{ + Services: map[tailcfg.ServiceName]*ServiceConfig{ + "svc:foo": { + Web: map[HostPort]*WebServerConfig{ + "foo.test.ts.net:443": {Handlers: map[string]*HTTPHandler{ + "/": {Path: "/tmp"}, + }}, + }, + }, + }, + }, + want: true, + }, + { + name: "with-service-proxy-handler", + cfg: ServeConfig{ + Services: map[tailcfg.ServiceName]*ServiceConfig{ + "svc:foo": { + Web: map[HostPort]*WebServerConfig{ + "foo.test.ts.net:443": {Handlers: map[string]*HTTPHandler{ + "/": {Proxy: "http://127.0.0.1:3000"}, + }}, + }, + }, + }, + }, + want: false, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { From 824027305a2b986b523b5b29dab7b96dba4475aa Mon Sep 17 00:00:00 2001 From: Jordan Whited Date: Tue, 25 Nov 2025 15:05:04 -0800 Subject: [PATCH 26/33] cmd/tailscale/cli,ipn,all: make peer relay server port a *uint16 In preparation for exposing its configuration via ipn.ConfigVAlpha, change {Masked}Prefs.RelayServerPort from *int to *uint16. This takes a defensive stance against invalid inputs at JSON decode time. 'tailscale set --relay-server-port' is currently the only input to this pref, and has always sanitized input to fit within a uint16. 
Updates tailscale/corp#34591 Signed-off-by: Jordan Whited --- cmd/tailscale/cli/set.go | 2 +- feature/relayserver/relayserver.go | 6 +-- feature/relayserver/relayserver_test.go | 58 ++++++++++++------------- ipn/ipn_clone.go | 2 +- ipn/ipn_view.go | 8 ++-- ipn/prefs.go | 10 ++--- ipn/prefs_test.go | 2 +- net/udprelay/server.go | 6 +-- net/udprelay/status/status.go | 5 ++- 9 files changed, 48 insertions(+), 51 deletions(-) diff --git a/cmd/tailscale/cli/set.go b/cmd/tailscale/cli/set.go index c2316580c..31662392f 100644 --- a/cmd/tailscale/cli/set.go +++ b/cmd/tailscale/cli/set.go @@ -249,7 +249,7 @@ func runSet(ctx context.Context, args []string) (retErr error) { if err != nil { return fmt.Errorf("failed to set relay server port: %v", err) } - maskedPrefs.Prefs.RelayServerPort = ptr.To(int(uport)) + maskedPrefs.Prefs.RelayServerPort = ptr.To(uint16(uport)) } if setArgs.relayServerStaticEndpoints != "" { diff --git a/feature/relayserver/relayserver.go b/feature/relayserver/relayserver.go index e85576e50..4f23ae18e 100644 --- a/feature/relayserver/relayserver.go +++ b/feature/relayserver/relayserver.go @@ -69,7 +69,7 @@ func servePeerRelayDebugSessions(h *localapi.Handler, w http.ResponseWriter, r * // imported. func newExtension(logf logger.Logf, sb ipnext.SafeBackend) (ipnext.Extension, error) { e := &extension{ - newServerFn: func(logf logger.Logf, port int, onlyStaticAddrPorts bool) (relayServer, error) { + newServerFn: func(logf logger.Logf, port uint16, onlyStaticAddrPorts bool) (relayServer, error) { return udprelay.NewServer(logf, port, onlyStaticAddrPorts) }, logf: logger.WithPrefix(logf, featureName+": "), @@ -93,7 +93,7 @@ type relayServer interface { // extension is an [ipnext.Extension] managing the relay server on platforms // that import this package. 
type extension struct { - newServerFn func(logf logger.Logf, port int, onlyStaticAddrPorts bool) (relayServer, error) // swappable for tests + newServerFn func(logf logger.Logf, port uint16, onlyStaticAddrPorts bool) (relayServer, error) // swappable for tests logf logger.Logf ec *eventbus.Client respPub *eventbus.Publisher[magicsock.UDPRelayAllocResp] @@ -101,7 +101,7 @@ type extension struct { mu syncs.Mutex // guards the following fields shutdown bool // true if Shutdown() has been called rs relayServer // nil when disabled - port *int // ipn.Prefs.RelayServerPort, nil if disabled + port *uint16 // ipn.Prefs.RelayServerPort, nil if disabled staticEndpoints views.Slice[netip.AddrPort] // ipn.Prefs.RelayServerStaticEndpoints derpMapView tailcfg.DERPMapView // latest seen over the eventbus hasNodeAttrDisableRelayServer bool // [tailcfg.NodeAttrDisableRelayServer] diff --git a/feature/relayserver/relayserver_test.go b/feature/relayserver/relayserver_test.go index d77d2df26..807306c70 100644 --- a/feature/relayserver/relayserver_test.go +++ b/feature/relayserver/relayserver_test.go @@ -23,15 +23,15 @@ import ( ) func Test_extension_profileStateChanged(t *testing.T) { - prefsWithPortOne := ipn.Prefs{RelayServerPort: ptr.To(1)} + prefsWithPortOne := ipn.Prefs{RelayServerPort: ptr.To(uint16(1))} prefsWithNilPort := ipn.Prefs{RelayServerPort: nil} prefsWithPortOneRelayEndpoints := ipn.Prefs{ - RelayServerPort: ptr.To(1), + RelayServerPort: ptr.To(uint16(1)), RelayServerStaticEndpoints: []netip.AddrPort{netip.MustParseAddrPort("127.0.0.1:7777")}, } type fields struct { - port *int + port *uint16 staticEndpoints views.Slice[netip.AddrPort] rs relayServer } @@ -43,7 +43,7 @@ func Test_extension_profileStateChanged(t *testing.T) { name string fields fields args args - wantPort *int + wantPort *uint16 wantRelayServerFieldNonNil bool wantRelayServerFieldMutated bool wantEndpoints []netip.AddrPort @@ -51,28 +51,28 @@ func Test_extension_profileStateChanged(t *testing.T) { { name: "no changes non-nil port previously running", fields: fields{ - port: ptr.To(1), + port: ptr.To(uint16(1)), rs: mockRelayServerNotZeroVal(), }, args: args{ prefs: prefsWithPortOne.View(), sameNode: true, }, - wantPort: ptr.To(1), + wantPort: ptr.To(uint16(1)), wantRelayServerFieldNonNil: true, wantRelayServerFieldMutated: false, }, { name: "set addr ports unchanged port previously running", fields: fields{ - port: ptr.To(1), + port: ptr.To(uint16(1)), rs: mockRelayServerNotZeroVal(), }, args: args{ prefs: prefsWithPortOneRelayEndpoints.View(), sameNode: true, }, - wantPort: ptr.To(1), + wantPort: ptr.To(uint16(1)), wantRelayServerFieldNonNil: true, wantRelayServerFieldMutated: false, wantEndpoints: prefsWithPortOneRelayEndpoints.RelayServerStaticEndpoints, @@ -87,7 +87,7 @@ func Test_extension_profileStateChanged(t *testing.T) { prefs: prefsWithPortOneRelayEndpoints.View(), sameNode: true, }, - wantPort: ptr.To(1), + wantPort: ptr.To(uint16(1)), wantRelayServerFieldNonNil: true, wantRelayServerFieldMutated: true, wantEndpoints: prefsWithPortOneRelayEndpoints.RelayServerStaticEndpoints, @@ -95,7 +95,7 @@ func Test_extension_profileStateChanged(t *testing.T) { { name: "clear addr ports unchanged port previously running", fields: fields{ - port: ptr.To(1), + port: ptr.To(uint16(1)), staticEndpoints: views.SliceOf(prefsWithPortOneRelayEndpoints.RelayServerStaticEndpoints), rs: mockRelayServerNotZeroVal(), }, @@ -103,7 +103,7 @@ func Test_extension_profileStateChanged(t *testing.T) { prefs: prefsWithPortOne.View(), sameNode: true, 
}, - wantPort: ptr.To(1), + wantPort: ptr.To(uint16(1)), wantRelayServerFieldNonNil: true, wantRelayServerFieldMutated: false, wantEndpoints: nil, @@ -111,7 +111,7 @@ func Test_extension_profileStateChanged(t *testing.T) { { name: "prefs port nil", fields: fields{ - port: ptr.To(1), + port: ptr.To(uint16(1)), }, args: args{ prefs: prefsWithNilPort.View(), @@ -124,7 +124,7 @@ func Test_extension_profileStateChanged(t *testing.T) { { name: "prefs port nil previously running", fields: fields{ - port: ptr.To(1), + port: ptr.To(uint16(1)), rs: mockRelayServerNotZeroVal(), }, args: args{ @@ -138,54 +138,54 @@ func Test_extension_profileStateChanged(t *testing.T) { { name: "prefs port changed", fields: fields{ - port: ptr.To(2), + port: ptr.To(uint16(2)), }, args: args{ prefs: prefsWithPortOne.View(), sameNode: true, }, - wantPort: ptr.To(1), + wantPort: ptr.To(uint16(1)), wantRelayServerFieldNonNil: true, wantRelayServerFieldMutated: true, }, { name: "prefs port changed previously running", fields: fields{ - port: ptr.To(2), + port: ptr.To(uint16(2)), rs: mockRelayServerNotZeroVal(), }, args: args{ prefs: prefsWithPortOne.View(), sameNode: true, }, - wantPort: ptr.To(1), + wantPort: ptr.To(uint16(1)), wantRelayServerFieldNonNil: true, wantRelayServerFieldMutated: true, }, { name: "sameNode false", fields: fields{ - port: ptr.To(1), + port: ptr.To(uint16(1)), }, args: args{ prefs: prefsWithPortOne.View(), sameNode: false, }, - wantPort: ptr.To(1), + wantPort: ptr.To(uint16(1)), wantRelayServerFieldNonNil: true, wantRelayServerFieldMutated: true, }, { name: "sameNode false previously running", fields: fields{ - port: ptr.To(1), + port: ptr.To(uint16(1)), rs: mockRelayServerNotZeroVal(), }, args: args{ prefs: prefsWithPortOne.View(), sameNode: false, }, - wantPort: ptr.To(1), + wantPort: ptr.To(uint16(1)), wantRelayServerFieldNonNil: true, wantRelayServerFieldMutated: true, }, @@ -198,7 +198,7 @@ func Test_extension_profileStateChanged(t *testing.T) { prefs: prefsWithPortOne.View(), sameNode: false, }, - wantPort: ptr.To(1), + wantPort: ptr.To(uint16(1)), wantRelayServerFieldNonNil: true, wantRelayServerFieldMutated: true, }, @@ -211,7 +211,7 @@ func Test_extension_profileStateChanged(t *testing.T) { t.Fatal(err) } e := ipne.(*extension) - e.newServerFn = func(logf logger.Logf, port int, onlyStaticAddrPorts bool) (relayServer, error) { + e.newServerFn = func(logf logger.Logf, port uint16, onlyStaticAddrPorts bool) (relayServer, error) { return &mockRelayServer{}, nil } e.port = tt.fields.port @@ -271,7 +271,7 @@ func Test_extension_handleRelayServerLifetimeLocked(t *testing.T) { tests := []struct { name string shutdown bool - port *int + port *uint16 rs relayServer hasNodeAttrDisableRelayServer bool wantRelayServerFieldNonNil bool @@ -280,7 +280,7 @@ func Test_extension_handleRelayServerLifetimeLocked(t *testing.T) { { name: "want running", shutdown: false, - port: ptr.To(1), + port: ptr.To(uint16(1)), hasNodeAttrDisableRelayServer: false, wantRelayServerFieldNonNil: true, wantRelayServerFieldMutated: true, @@ -288,7 +288,7 @@ func Test_extension_handleRelayServerLifetimeLocked(t *testing.T) { { name: "want running previously running", shutdown: false, - port: ptr.To(1), + port: ptr.To(uint16(1)), rs: mockRelayServerNotZeroVal(), hasNodeAttrDisableRelayServer: false, wantRelayServerFieldNonNil: true, @@ -297,7 +297,7 @@ func Test_extension_handleRelayServerLifetimeLocked(t *testing.T) { { name: "shutdown true", shutdown: true, - port: ptr.To(1), + port: ptr.To(uint16(1)), 
hasNodeAttrDisableRelayServer: false, wantRelayServerFieldNonNil: false, wantRelayServerFieldMutated: false, @@ -305,7 +305,7 @@ func Test_extension_handleRelayServerLifetimeLocked(t *testing.T) { { name: "shutdown true previously running", shutdown: true, - port: ptr.To(1), + port: ptr.To(uint16(1)), rs: mockRelayServerNotZeroVal(), hasNodeAttrDisableRelayServer: false, wantRelayServerFieldNonNil: false, @@ -354,7 +354,7 @@ func Test_extension_handleRelayServerLifetimeLocked(t *testing.T) { t.Fatal(err) } e := ipne.(*extension) - e.newServerFn = func(logf logger.Logf, port int, onlyStaticAddrPorts bool) (relayServer, error) { + e.newServerFn = func(logf logger.Logf, port uint16, onlyStaticAddrPorts bool) (relayServer, error) { return &mockRelayServer{}, nil } e.shutdown = tt.shutdown diff --git a/ipn/ipn_clone.go b/ipn/ipn_clone.go index fae85adee..4bf78b40b 100644 --- a/ipn/ipn_clone.go +++ b/ipn/ipn_clone.go @@ -102,7 +102,7 @@ var _PrefsCloneNeedsRegeneration = Prefs(struct { PostureChecking bool NetfilterKind string DriveShares []*drive.Share - RelayServerPort *int + RelayServerPort *uint16 RelayServerStaticEndpoints []netip.AddrPort AllowSingleHosts marshalAsTrueInJSON Persist *persist.Persist diff --git a/ipn/ipn_view.go b/ipn/ipn_view.go index aac8cb4d7..4157ec76e 100644 --- a/ipn/ipn_view.go +++ b/ipn/ipn_view.go @@ -441,10 +441,8 @@ func (v PrefsView) DriveShares() views.SliceView[*drive.Share, drive.ShareView] // RelayServerPort is the UDP port number for the relay server to bind to, // on all interfaces. A non-nil zero value signifies a random unused port // should be used. A nil value signifies relay server functionality -// should be disabled. This field is currently experimental, and therefore -// no guarantees are made about its current naming and functionality when -// non-nil/enabled. -func (v PrefsView) RelayServerPort() views.ValuePointer[int] { +// should be disabled. +func (v PrefsView) RelayServerPort() views.ValuePointer[uint16] { return views.ValuePointerOf(v.ж.RelayServerPort) } @@ -506,7 +504,7 @@ var _PrefsViewNeedsRegeneration = Prefs(struct { PostureChecking bool NetfilterKind string DriveShares []*drive.Share - RelayServerPort *int + RelayServerPort *uint16 RelayServerStaticEndpoints []netip.AddrPort AllowSingleHosts marshalAsTrueInJSON Persist *persist.Persist diff --git a/ipn/prefs.go b/ipn/prefs.go index 6f3cb65f8..9f98465d2 100644 --- a/ipn/prefs.go +++ b/ipn/prefs.go @@ -283,10 +283,8 @@ type Prefs struct { // RelayServerPort is the UDP port number for the relay server to bind to, // on all interfaces. A non-nil zero value signifies a random unused port // should be used. A nil value signifies relay server functionality - // should be disabled. This field is currently experimental, and therefore - // no guarantees are made about its current naming and functionality when - // non-nil/enabled. - RelayServerPort *int `json:",omitempty"` + // should be disabled. + RelayServerPort *uint16 `json:",omitempty"` // RelayServerStaticEndpoints are static IP:port endpoints to advertise as // candidates for relay connections. 
Only relevant when RelayServerPort is @@ -694,7 +692,7 @@ func (p *Prefs) Equals(p2 *Prefs) bool { p.PostureChecking == p2.PostureChecking && slices.EqualFunc(p.DriveShares, p2.DriveShares, drive.SharesEqual) && p.NetfilterKind == p2.NetfilterKind && - compareIntPtrs(p.RelayServerPort, p2.RelayServerPort) && + compareUint16Ptrs(p.RelayServerPort, p2.RelayServerPort) && slices.Equal(p.RelayServerStaticEndpoints, p2.RelayServerStaticEndpoints) } @@ -715,7 +713,7 @@ func (ap AppConnectorPrefs) Pretty() string { return "" } -func compareIntPtrs(a, b *int) bool { +func compareUint16Ptrs(a, b *uint16) bool { if (a == nil) != (b == nil) { return false } diff --git a/ipn/prefs_test.go b/ipn/prefs_test.go index cf0750706..aa152843a 100644 --- a/ipn/prefs_test.go +++ b/ipn/prefs_test.go @@ -78,7 +78,7 @@ func TestPrefsEqual(t *testing.T) { have, prefsHandles) } - relayServerPort := func(port int) *int { + relayServerPort := func(port uint16) *uint16 { return &port } nets := func(strs ...string) (ns []netip.Prefix) { diff --git a/net/udprelay/server.go b/net/udprelay/server.go index b260955e0..e7ca24960 100644 --- a/net/udprelay/server.go +++ b/net/udprelay/server.go @@ -309,7 +309,7 @@ func (e *serverEndpoint) isBound() bool { // onlyStaticAddrPorts is true, then dynamic addr:port discovery will be // disabled, and only addr:port's set via [Server.SetStaticAddrPorts] will be // used. -func NewServer(logf logger.Logf, port int, onlyStaticAddrPorts bool) (s *Server, err error) { +func NewServer(logf logger.Logf, port uint16, onlyStaticAddrPorts bool) (s *Server, err error) { s = &Server{ logf: logf, disco: key.NewDisco(), @@ -526,9 +526,9 @@ func trySetUDPSocketOptions(pconn nettype.PacketConn, logf logger.Logf) { // [magicsock.RebindingConn], which would also remove the need for // [singlePacketConn], as [magicsock.RebindingConn] also handles fallback to // single packet syscall operations. -func (s *Server) listenOn(port int) error { +func (s *Server) listenOn(port uint16) error { for _, network := range []string{"udp4", "udp6"} { - uc, err := net.ListenUDP(network, &net.UDPAddr{Port: port}) + uc, err := net.ListenUDP(network, &net.UDPAddr{Port: int(port)}) if err != nil { if network == "udp4" { return err diff --git a/net/udprelay/status/status.go b/net/udprelay/status/status.go index 3866efada..9ed9a0d2a 100644 --- a/net/udprelay/status/status.go +++ b/net/udprelay/status/status.go @@ -14,8 +14,9 @@ import ( type ServerStatus struct { // UDPPort is the UDP port number that the peer relay server forwards over, // as configured by the user with 'tailscale set --relay-server-port='. - // If the port has not been configured, UDPPort will be nil. - UDPPort *int + // If the port has not been configured, UDPPort will be nil. A non-nil zero + // value signifies the user has opted for a random unused port. + UDPPort *uint16 // Sessions is a slice of detailed status information about each peer // relay session that this node's peer relay server is involved with. It // may be empty. From b7658a4ad2d13da515daee2bd8dd7d50a9067708 Mon Sep 17 00:00:00 2001 From: Alex Chan Date: Wed, 19 Nov 2025 09:41:43 +0000 Subject: [PATCH 27/33] tstest/integration: add integration test for Tailnet Lock This patch adds an integration test for Tailnet Lock, checking that a node can't talk to peers in the tailnet until it becomes signed. This patch also introduces a new package `tstest/tkatest`, which has some helpers for constructing a mock control server that responds to TKA requests. 
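
For example (a sketch based on the handlers this patch adds), a test's
fake control server for the TKA sync endpoints collapses to roughly:

```go
case "/machine/tka/sync/offer":
	if err := tkatest.HandleTKASyncOffer(w, r, authority, storage); err != nil {
		t.Errorf("HandleTKASyncOffer: %v", err)
	}
```
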
This allows us to reduce boilerplate in the IPN tests. Updates tailscale/corp#33599 Signed-off-by: Alex Chan --- ipn/ipnlocal/network-lock.go | 37 +-- ipn/ipnlocal/network-lock_test.go | 300 +++--------------- tka/sync.go | 35 ++ tstest/integration/integration.go | 42 ++- tstest/integration/integration_test.go | 75 ++++- tstest/integration/testcontrol/testcontrol.go | 150 ++++++++- tstest/tkatest/tkatest.go | 220 +++++++++++++ 7 files changed, 573 insertions(+), 286 deletions(-) create mode 100644 tstest/tkatest/tkatest.go diff --git a/ipn/ipnlocal/network-lock.go b/ipn/ipnlocal/network-lock.go index 78d4d236d..f25c6fa9b 100644 --- a/ipn/ipnlocal/network-lock.go +++ b/ipn/ipnlocal/network-lock.go @@ -368,20 +368,6 @@ func (b *LocalBackend) tkaSyncIfNeeded(nm *netmap.NetworkMap, prefs ipn.PrefsVie return nil } -func toSyncOffer(head string, ancestors []string) (tka.SyncOffer, error) { - var out tka.SyncOffer - if err := out.Head.UnmarshalText([]byte(head)); err != nil { - return tka.SyncOffer{}, fmt.Errorf("head.UnmarshalText: %v", err) - } - out.Ancestors = make([]tka.AUMHash, len(ancestors)) - for i, a := range ancestors { - if err := out.Ancestors[i].UnmarshalText([]byte(a)); err != nil { - return tka.SyncOffer{}, fmt.Errorf("ancestor[%d].UnmarshalText: %v", i, err) - } - } - return out, nil -} - // tkaSyncLocked synchronizes TKA state with control. b.mu must be held // and tka must be initialized. b.mu will be stepped out of (and back into) // during network RPCs. @@ -399,7 +385,7 @@ func (b *LocalBackend) tkaSyncLocked(ourNodeKey key.NodePublic) error { if err != nil { return fmt.Errorf("offer RPC: %w", err) } - controlOffer, err := toSyncOffer(offerResp.Head, offerResp.Ancestors) + controlOffer, err := tka.ToSyncOffer(offerResp.Head, offerResp.Ancestors) if err != nil { return fmt.Errorf("control offer: %v", err) } @@ -694,7 +680,7 @@ func (b *LocalBackend) NetworkLockInit(keys []tka.Key, disablementValues [][]byt // Our genesis AUM was accepted but before Control turns on enforcement of // node-key signatures, we need to sign keys for all the existing nodes. - // If we don't get these signatures ahead of time, everyone will loose + // If we don't get these signatures ahead of time, everyone will lose // connectivity because control won't have any signatures to send which // satisfy network-lock checks. sigs := make(map[tailcfg.NodeID]tkatype.MarshaledSignature, len(initResp.NeedSignatures)) @@ -1294,27 +1280,10 @@ func (b *LocalBackend) tkaFetchBootstrap(ourNodeKey key.NodePublic, head tka.AUM return a, nil } -func fromSyncOffer(offer tka.SyncOffer) (head string, ancestors []string, err error) { - headBytes, err := offer.Head.MarshalText() - if err != nil { - return "", nil, fmt.Errorf("head.MarshalText: %v", err) - } - - ancestors = make([]string, len(offer.Ancestors)) - for i, ancestor := range offer.Ancestors { - hash, err := ancestor.MarshalText() - if err != nil { - return "", nil, fmt.Errorf("ancestor[%d].MarshalText: %v", i, err) - } - ancestors[i] = string(hash) - } - return string(headBytes), ancestors, nil -} - // tkaDoSyncOffer sends a /machine/tka/sync/offer RPC to the control plane // over noise. This is the first of two RPCs implementing tka synchronization. 
func (b *LocalBackend) tkaDoSyncOffer(ourNodeKey key.NodePublic, offer tka.SyncOffer) (*tailcfg.TKASyncOfferResponse, error) { - head, ancestors, err := fromSyncOffer(offer) + head, ancestors, err := tka.FromSyncOffer(offer) if err != nil { return nil, fmt.Errorf("encoding offer: %v", err) } diff --git a/ipn/ipnlocal/network-lock_test.go b/ipn/ipnlocal/network-lock_test.go index 5d22425a1..e5df38bdb 100644 --- a/ipn/ipnlocal/network-lock_test.go +++ b/ipn/ipnlocal/network-lock_test.go @@ -33,6 +33,7 @@ import ( "tailscale.com/tka" "tailscale.com/tsd" "tailscale.com/tstest" + "tailscale.com/tstest/tkatest" "tailscale.com/types/key" "tailscale.com/types/netmap" "tailscale.com/types/persist" @@ -101,7 +102,8 @@ func TestTKAEnablementFlow(t *testing.T) { // our mock server can communicate. nlPriv := key.NewNLPrivate() key := tka.Key{Kind: tka.Key25519, Public: nlPriv.Public().Verifier(), Votes: 2} - a1, genesisAUM, err := tka.Create(tka.ChonkMem(), tka.State{ + chonk := tka.ChonkMem() + a1, genesisAUM, err := tka.Create(chonk, tka.State{ Keys: []tka.Key{key}, DisablementSecrets: [][]byte{bytes.Repeat([]byte{0xa5}, 32)}, }, nlPriv) @@ -113,51 +115,31 @@ func TestTKAEnablementFlow(t *testing.T) { defer r.Body.Close() switch r.URL.Path { case "/machine/tka/bootstrap": - body := new(tailcfg.TKABootstrapRequest) - if err := json.NewDecoder(r.Body).Decode(body); err != nil { - t.Fatal(err) - } - if body.Version != tailcfg.CurrentCapabilityVersion { - t.Errorf("bootstrap CapVer = %v, want %v", body.Version, tailcfg.CurrentCapabilityVersion) - } - if body.NodeKey != nodePriv.Public() { - t.Errorf("bootstrap nodeKey=%v, want %v", body.NodeKey, nodePriv.Public()) + resp := tailcfg.TKABootstrapResponse{ + GenesisAUM: genesisAUM.Serialize(), } - if body.Head != "" { - t.Errorf("bootstrap head=%s, want empty hash", body.Head) + req, err := tkatest.HandleTKABootstrap(w, r, resp) + if err != nil { + t.Errorf("HandleTKABootstrap: %v", err) } - - w.WriteHeader(200) - out := tailcfg.TKABootstrapResponse{ - GenesisAUM: genesisAUM.Serialize(), + if req.NodeKey != nodePriv.Public() { + t.Errorf("bootstrap nodeKey=%v, want %v", req.NodeKey, nodePriv.Public()) } - if err := json.NewEncoder(w).Encode(out); err != nil { - t.Fatal(err) + if req.Head != "" { + t.Errorf("bootstrap head=%s, want empty hash", req.Head) } // Sync offer/send endpoints are hit even though the node is up-to-date, // so we implement enough of a fake that the client doesn't explode. 
case "/machine/tka/sync/offer":
-			head, err := a1.Head().MarshalText()
+			err := tkatest.HandleTKASyncOffer(w, r, a1, chonk)
 			if err != nil {
-				t.Fatal(err)
-			}
-			w.WriteHeader(200)
-			if err := json.NewEncoder(w).Encode(tailcfg.TKASyncOfferResponse{
-				Head: string(head),
-			}); err != nil {
-				t.Fatal(err)
+				t.Errorf("HandleTKASyncOffer: %v", err)
 			}
 
 		case "/machine/tka/sync/send":
-			head, err := a1.Head().MarshalText()
+			err := tkatest.HandleTKASyncSend(w, r, a1, chonk)
 			if err != nil {
-				t.Fatal(err)
-			}
-			w.WriteHeader(200)
-			if err := json.NewEncoder(w).Encode(tailcfg.TKASyncSendResponse{
-				Head: string(head),
-			}); err != nil {
-				t.Fatal(err)
+				t.Errorf("HandleTKASyncSend: %v", err)
 			}
 
 		default:
@@ -225,37 +207,28 @@ func TestTKADisablementFlow(t *testing.T) {
 		defer r.Body.Close()
 		switch r.URL.Path {
 		case "/machine/tka/bootstrap":
-			body := new(tailcfg.TKABootstrapRequest)
-			if err := json.NewDecoder(r.Body).Decode(body); err != nil {
-				t.Fatal(err)
-			}
-			if body.Version != tailcfg.CurrentCapabilityVersion {
-				t.Errorf("bootstrap CapVer = %v, want %v", body.Version, tailcfg.CurrentCapabilityVersion)
-			}
-			if body.NodeKey != nodePriv.Public() {
-				t.Errorf("nodeKey=%v, want %v", body.NodeKey, nodePriv.Public())
-			}
-			var head tka.AUMHash
-			if err := head.UnmarshalText([]byte(body.Head)); err != nil {
-				t.Fatalf("failed unmarshal of body.Head: %v", err)
-			}
-			if head != authority.Head() {
-				t.Errorf("reported head = %x, want %x", head, authority.Head())
-			}
-
 			var disablement []byte
 			if returnWrongSecret {
 				disablement = bytes.Repeat([]byte{0x42}, 32) // wrong secret
 			} else {
 				disablement = disablementSecret
 			}
-
-			w.WriteHeader(200)
-			out := tailcfg.TKABootstrapResponse{
+			resp := tailcfg.TKABootstrapResponse{
 				DisablementSecret: disablement,
 			}
-			if err := json.NewEncoder(w).Encode(out); err != nil {
-				t.Fatal(err)
+			req, err := tkatest.HandleTKABootstrap(w, r, resp)
+			if err != nil {
+				t.Errorf("HandleTKABootstrap: %v", err)
+			}
+			if req.NodeKey != nodePriv.Public() {
+				t.Errorf("nodeKey=%v, want %v", req.NodeKey, nodePriv.Public())
+			}
+			var head tka.AUMHash
+			if err := head.UnmarshalText([]byte(req.Head)); err != nil {
+				t.Fatalf("failed unmarshal of body.Head: %v", err)
+			}
+			if head != authority.Head() {
+				t.Errorf("reported head = %x, want %x", head, authority.Head())
 			}
 
 		default:
@@ -430,76 +403,15 @@ func TestTKASync(t *testing.T) {
 		defer r.Body.Close()
 		switch r.URL.Path {
 		case "/machine/tka/sync/offer":
-			body := new(tailcfg.TKASyncOfferRequest)
-			if err := json.NewDecoder(r.Body).Decode(body); err != nil {
-				t.Fatal(err)
-			}
-			t.Logf("got sync offer:\n%+v", body)
-			nodeOffer, err := toSyncOffer(body.Head, body.Ancestors)
+			err := tkatest.HandleTKASyncOffer(w, r, controlAuthority, controlStorage)
 			if err != nil {
-				t.Fatal(err)
-			}
-			controlOffer, err := controlAuthority.SyncOffer(controlStorage)
-			if err != nil {
-				t.Fatal(err)
-			}
-			sendAUMs, err := controlAuthority.MissingAUMs(controlStorage, nodeOffer)
-			if err != nil {
-				t.Fatal(err)
-			}
-
-			head, ancestors, err := fromSyncOffer(controlOffer)
-			if err != nil {
-				t.Fatal(err)
-			}
-			resp := tailcfg.TKASyncOfferResponse{
-				Head:        head,
-				Ancestors:   ancestors,
-				MissingAUMs: make([]tkatype.MarshaledAUM, len(sendAUMs)),
-			}
-			for i, a := range sendAUMs {
-				resp.MissingAUMs[i] = a.Serialize()
-			}
-
-			t.Logf("responding to sync offer with:\n%+v", resp)
-			w.WriteHeader(200)
-			if err := json.NewEncoder(w).Encode(resp); err != nil {
-				t.Fatal(err)
+				t.Errorf("HandleTKASyncOffer: %v", err)
 			}
 
 		case "/machine/tka/sync/send":
-			body := 
new(tailcfg.TKASyncSendRequest) - if err := json.NewDecoder(r.Body).Decode(body); err != nil { - t.Fatal(err) - } - t.Logf("got sync send:\n%+v", body) - - var remoteHead tka.AUMHash - if err := remoteHead.UnmarshalText([]byte(body.Head)); err != nil { - t.Fatalf("head unmarshal: %v", err) - } - toApply := make([]tka.AUM, len(body.MissingAUMs)) - for i, a := range body.MissingAUMs { - if err := toApply[i].Unserialize(a); err != nil { - t.Fatalf("decoding missingAUM[%d]: %v", i, err) - } - } - - if len(toApply) > 0 { - if err := controlAuthority.Inform(controlStorage, toApply); err != nil { - t.Fatalf("control.Inform(%+v) failed: %v", toApply, err) - } - } - head, err := controlAuthority.Head().MarshalText() + err := tkatest.HandleTKASyncSend(w, r, controlAuthority, controlStorage) if err != nil { - t.Fatal(err) - } - - w.WriteHeader(200) - if err := json.NewEncoder(w).Encode(tailcfg.TKASyncSendResponse{ - Head: string(head), - }); err != nil { - t.Fatal(err) + t.Errorf("HandleTKASyncSend: %v", err) } default: @@ -608,76 +520,15 @@ func TestTKASyncTriggersCompact(t *testing.T) { defer r.Body.Close() switch r.URL.Path { case "/machine/tka/sync/offer": - body := new(tailcfg.TKASyncOfferRequest) - if err := json.NewDecoder(r.Body).Decode(body); err != nil { - t.Fatal(err) - } - t.Logf("got sync offer:\n%+v", body) - nodeOffer, err := toSyncOffer(body.Head, body.Ancestors) - if err != nil { - t.Fatal(err) - } - controlOffer, err := controlAuthority.SyncOffer(controlStorage) - if err != nil { - t.Fatal(err) - } - sendAUMs, err := controlAuthority.MissingAUMs(controlStorage, nodeOffer) - if err != nil { - t.Fatal(err) - } - - head, ancestors, err := fromSyncOffer(controlOffer) + err := tkatest.HandleTKASyncOffer(w, r, controlAuthority, controlStorage) if err != nil { - t.Fatal(err) - } - resp := tailcfg.TKASyncOfferResponse{ - Head: head, - Ancestors: ancestors, - MissingAUMs: make([]tkatype.MarshaledAUM, len(sendAUMs)), - } - for i, a := range sendAUMs { - resp.MissingAUMs[i] = a.Serialize() - } - - t.Logf("responding to sync offer with:\n%+v", resp) - w.WriteHeader(200) - if err := json.NewEncoder(w).Encode(resp); err != nil { - t.Fatal(err) + t.Errorf("HandleTKASyncOffer: %v", err) } case "/machine/tka/sync/send": - body := new(tailcfg.TKASyncSendRequest) - if err := json.NewDecoder(r.Body).Decode(body); err != nil { - t.Fatal(err) - } - t.Logf("got sync send:\n%+v", body) - - var remoteHead tka.AUMHash - if err := remoteHead.UnmarshalText([]byte(body.Head)); err != nil { - t.Fatalf("head unmarshal: %v", err) - } - toApply := make([]tka.AUM, len(body.MissingAUMs)) - for i, a := range body.MissingAUMs { - if err := toApply[i].Unserialize(a); err != nil { - t.Fatalf("decoding missingAUM[%d]: %v", i, err) - } - } - - if len(toApply) > 0 { - if err := controlAuthority.Inform(controlStorage, toApply); err != nil { - t.Fatalf("control.Inform(%+v) failed: %v", toApply, err) - } - } - head, err := controlAuthority.Head().MarshalText() + err := tkatest.HandleTKASyncSend(w, r, controlAuthority, controlStorage) if err != nil { - t.Fatal(err) - } - - w.WriteHeader(200) - if err := json.NewEncoder(w).Encode(tailcfg.TKASyncSendResponse{ - Head: string(head), - }); err != nil { - t.Fatal(err) + t.Errorf("HandleTKASyncSend: %v", err) } default: @@ -1019,29 +870,9 @@ func TestTKASign(t *testing.T) { defer r.Body.Close() switch r.URL.Path { case "/machine/tka/sign": - body := new(tailcfg.TKASubmitSignatureRequest) - if err := json.NewDecoder(r.Body).Decode(body); err != nil { - t.Fatal(err) - } - if 
body.Version != tailcfg.CurrentCapabilityVersion { - t.Errorf("sign CapVer = %v, want %v", body.Version, tailcfg.CurrentCapabilityVersion) - } - if body.NodeKey != nodePriv.Public() { - t.Errorf("nodeKey = %v, want %v", body.NodeKey, nodePriv.Public()) - } - - var sig tka.NodeKeySignature - if err := sig.Unserialize(body.Signature); err != nil { - t.Fatalf("malformed signature: %v", err) - } - - if err := authority.NodeKeyAuthorized(toSign.Public(), body.Signature); err != nil { - t.Errorf("signature does not verify: %v", err) - } - - w.WriteHeader(200) - if err := json.NewEncoder(w).Encode(tailcfg.TKASubmitSignatureResponse{}); err != nil { - t.Fatal(err) + _, _, err := tkatest.HandleTKASign(w, r, authority) + if err != nil { + t.Errorf("HandleTKASign: %v", err) } default: @@ -1098,23 +929,15 @@ func TestTKAForceDisable(t *testing.T) { defer r.Body.Close() switch r.URL.Path { case "/machine/tka/bootstrap": - body := new(tailcfg.TKABootstrapRequest) - if err := json.NewDecoder(r.Body).Decode(body); err != nil { - t.Fatal(err) - } - if body.Version != tailcfg.CurrentCapabilityVersion { - t.Errorf("bootstrap CapVer = %v, want %v", body.Version, tailcfg.CurrentCapabilityVersion) - } - if body.NodeKey != nodePriv.Public() { - t.Errorf("nodeKey=%v, want %v", body.NodeKey, nodePriv.Public()) - } - - w.WriteHeader(200) - out := tailcfg.TKABootstrapResponse{ + resp := tailcfg.TKABootstrapResponse{ GenesisAUM: genesis.Serialize(), } - if err := json.NewEncoder(w).Encode(out); err != nil { - t.Fatal(err) + req, err := tkatest.HandleTKABootstrap(w, r, resp) + if err != nil { + t.Errorf("HandleTKABootstrap: %v", err) + } + if req.NodeKey != nodePriv.Public() { + t.Errorf("nodeKey=%v, want %v", req.NodeKey, nodePriv.Public()) } default: @@ -1323,37 +1146,16 @@ func TestTKARecoverCompromisedKeyFlow(t *testing.T) { defer r.Body.Close() switch r.URL.Path { case "/machine/tka/sync/send": - body := new(tailcfg.TKASyncSendRequest) - if err := json.NewDecoder(r.Body).Decode(body); err != nil { - t.Fatal(err) - } - t.Logf("got sync send:\n%+v", body) - - var remoteHead tka.AUMHash - if err := remoteHead.UnmarshalText([]byte(body.Head)); err != nil { - t.Fatalf("head unmarshal: %v", err) - } - toApply := make([]tka.AUM, len(body.MissingAUMs)) - for i, a := range body.MissingAUMs { - if err := toApply[i].Unserialize(a); err != nil { - t.Fatalf("decoding missingAUM[%d]: %v", i, err) - } + err := tkatest.HandleTKASyncSend(w, r, authority, chonk) + if err != nil { + t.Errorf("HandleTKASyncSend: %v", err) } - // Apply the recovery AUM to an authority to make sure it works. - if err := authority.Inform(chonk, toApply); err != nil { - t.Errorf("recovery AUM could not be applied: %v", err) - } // Make sure the key we removed isn't trusted. if authority.KeyTrusted(compromisedPriv.KeyID()) { t.Error("compromised key was not removed from tka") } - w.WriteHeader(200) - if err := json.NewEncoder(w).Encode(tailcfg.TKASubmitSignatureResponse{}); err != nil { - t.Fatal(err) - } - default: t.Errorf("unhandled endpoint path: %v", r.URL.Path) w.WriteHeader(404) diff --git a/tka/sync.go b/tka/sync.go index e3a858c15..2dbfb7ac4 100644 --- a/tka/sync.go +++ b/tka/sync.go @@ -32,6 +32,41 @@ type SyncOffer struct { Ancestors []AUMHash } +// ToSyncOffer creates a SyncOffer from the fields received in +// a [tailcfg.TKASyncOfferRequest]. 
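+//
+// It is the inverse of [FromSyncOffer], which marshals a SyncOffer back
+// into the head/ancestors string form sent on the wire.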
+func ToSyncOffer(head string, ancestors []string) (SyncOffer, error) { + var out SyncOffer + if err := out.Head.UnmarshalText([]byte(head)); err != nil { + return SyncOffer{}, fmt.Errorf("head.UnmarshalText: %v", err) + } + out.Ancestors = make([]AUMHash, len(ancestors)) + for i, a := range ancestors { + if err := out.Ancestors[i].UnmarshalText([]byte(a)); err != nil { + return SyncOffer{}, fmt.Errorf("ancestor[%d].UnmarshalText: %v", i, err) + } + } + return out, nil +} + +// FromSyncOffer marshals the fields of a SyncOffer so they can be +// sent in a [tailcfg.TKASyncOfferRequest]. +func FromSyncOffer(offer SyncOffer) (head string, ancestors []string, err error) { + headBytes, err := offer.Head.MarshalText() + if err != nil { + return "", nil, fmt.Errorf("head.MarshalText: %v", err) + } + + ancestors = make([]string, len(offer.Ancestors)) + for i, ancestor := range offer.Ancestors { + hash, err := ancestor.MarshalText() + if err != nil { + return "", nil, fmt.Errorf("ancestor[%d].MarshalText: %v", i, err) + } + ancestors[i] = string(hash) + } + return string(headBytes), ancestors, nil +} + const ( // The starting number of AUMs to skip when listing // ancestors in a SyncOffer. diff --git a/tstest/integration/integration.go b/tstest/integration/integration.go index ea5747b7d..a62173ae3 100644 --- a/tstest/integration/integration.go +++ b/tstest/integration/integration.go @@ -918,7 +918,7 @@ func (n *TestNode) Ping(otherNode *TestNode) error { t := n.env.t ip := otherNode.AwaitIP4().String() t.Logf("Running ping %v (from %v)...", ip, n.AwaitIP4()) - return n.Tailscale("ping", ip).Run() + return n.Tailscale("ping", "--timeout=1s", ip).Run() } // AwaitListening waits for the tailscaled to be serving local clients @@ -1077,6 +1077,46 @@ func (n *TestNode) MustStatus() *ipnstate.Status { return st } +// PublicKey returns the hex-encoded public key of this node, +// e.g. `nodekey:123456abc` +func (n *TestNode) PublicKey() string { + tb := n.env.t + tb.Helper() + cmd := n.Tailscale("status", "--json") + out, err := cmd.CombinedOutput() + if err != nil { + tb.Fatalf("running `tailscale status`: %v, %s", err, out) + } + + type Self struct{ PublicKey string } + type StatusOutput struct{ Self Self } + + var st StatusOutput + if err := json.Unmarshal(out, &st); err != nil { + tb.Fatalf("decoding `tailscale status` JSON: %v\njson:\n%s", err, out) + } + return st.Self.PublicKey +} + +// NLPublicKey returns the hex-encoded network lock public key of +// this node, e.g. `tlpub:123456abc` +func (n *TestNode) NLPublicKey() string { + tb := n.env.t + tb.Helper() + cmd := n.Tailscale("lock", "status", "--json") + out, err := cmd.CombinedOutput() + if err != nil { + tb.Fatalf("running `tailscale lock status`: %v, %s", err, out) + } + st := struct { + PublicKey string `json:"PublicKey"` + }{} + if err := json.Unmarshal(out, &st); err != nil { + tb.Fatalf("decoding `tailscale lock status` JSON: %v\njson:\n%s", err, out) + } + return st.PublicKey +} + // trafficTrap is an HTTP proxy handler to note whether any // HTTP traffic tries to leave localhost from tailscaled. We don't // expect any, so any request triggers a failure. 
diff --git a/tstest/integration/integration_test.go b/tstest/integration/integration_test.go index 3739a3011..fc891ad72 100644 --- a/tstest/integration/integration_test.go +++ b/tstest/integration/integration_test.go @@ -2253,7 +2253,7 @@ func TestC2NDebugNetmap(t *testing.T) { } } -func TestNetworkLock(t *testing.T) { +func TestTailnetLock(t *testing.T) { // If you run `tailscale lock log` on a node where Tailnet Lock isn't // enabled, you get an error explaining that. @@ -2291,6 +2291,79 @@ func TestNetworkLock(t *testing.T) { t.Fatalf("stderr: want %q, got %q", wantErr, errBuf.String()) } }) + + // If you create a tailnet with two signed nodes and one unsigned, + // the signed nodes can talk to each other but the unsigned node cannot + // talk to anybody. + t.Run("node-connectivity", func(t *testing.T) { + tstest.Shard(t) + t.Parallel() + + env := NewTestEnv(t) + env.Control.DefaultNodeCapabilities = &tailcfg.NodeCapMap{ + tailcfg.CapabilityTailnetLock: []tailcfg.RawMessage{}, + } + + // Start two nodes which will be our signing nodes. + signing1 := NewTestNode(t, env) + signing2 := NewTestNode(t, env) + + nodes := []*TestNode{signing1, signing2} + for _, n := range nodes { + d := n.StartDaemon() + defer d.MustCleanShutdown(t) + + n.MustUp() + n.AwaitRunning() + } + + // Initiate Tailnet Lock with the two signing nodes. + initCmd := signing1.Tailscale("lock", "init", + "--gen-disablements", "10", + "--confirm", + signing1.NLPublicKey(), signing2.NLPublicKey(), + ) + out, err := initCmd.CombinedOutput() + if err != nil { + t.Fatalf("init command failed: %q\noutput=%v", err, string(out)) + } + + // Check that the two signing nodes can ping each other + if err := signing1.Ping(signing2); err != nil { + t.Fatalf("ping signing1 -> signing2: %v", err) + } + if err := signing2.Ping(signing1); err != nil { + t.Fatalf("ping signing2 -> signing1: %v", err) + } + + // Create and start a third node + node3 := NewTestNode(t, env) + d3 := node3.StartDaemon() + defer d3.MustCleanShutdown(t) + node3.MustUp() + node3.AwaitRunning() + + if err := signing1.Ping(node3); err == nil { + t.Fatal("ping signing1 -> node3: expected err, but succeeded") + } + if err := node3.Ping(signing1); err == nil { + t.Fatal("ping node3 -> signing1: expected err, but succeeded") + } + + // Sign node3, and check the nodes can now talk to each other + signCmd := signing1.Tailscale("lock", "sign", node3.PublicKey()) + out, err = signCmd.CombinedOutput() + if err != nil { + t.Fatalf("sign command failed: %q\noutput = %v", err, string(out)) + } + + if err := signing1.Ping(node3); err != nil { + t.Fatalf("ping signing1 -> node3: expected success, got err: %v", err) + } + if err := node3.Ping(signing1); err != nil { + t.Fatalf("ping node3 -> signing1: expected success, got err: %v", err) + } + }) } func TestNodeWithBadStateFile(t *testing.T) { diff --git a/tstest/integration/testcontrol/testcontrol.go b/tstest/integration/testcontrol/testcontrol.go index d0959ff25..19964c91f 100644 --- a/tstest/integration/testcontrol/testcontrol.go +++ b/tstest/integration/testcontrol/testcontrol.go @@ -33,6 +33,8 @@ import ( "tailscale.com/net/tsaddr" "tailscale.com/syncs" "tailscale.com/tailcfg" + "tailscale.com/tka" + "tailscale.com/tstest/tkatest" "tailscale.com/types/key" "tailscale.com/types/logger" "tailscale.com/types/opt" @@ -123,6 +125,10 @@ type Server struct { nodeKeyAuthed set.Set[key.NodePublic] msgToSend map[key.NodePublic]any // value is *tailcfg.PingRequest or entire *tailcfg.MapResponse allExpired bool // All nodes will be told 
their node key is expired. + + // tkaStorage records the Tailnet Lock state, if any. + // If nil, Tailnet Lock is not enabled in the Tailnet. + tkaStorage tka.CompactableChonk } // BaseURL returns the server's base URL, without trailing slash. @@ -329,6 +335,7 @@ func (s *Server) initMux() { w.WriteHeader(http.StatusNoContent) }) s.mux.HandleFunc("/key", s.serveKey) + s.mux.HandleFunc("/machine/tka/", s.serveTKA) s.mux.HandleFunc("/machine/", s.serveMachine) s.mux.HandleFunc("/ts2021", s.serveNoiseUpgrade) s.mux.HandleFunc("/c2n/", s.serveC2N) @@ -439,7 +446,7 @@ func (s *Server) serveKey(w http.ResponseWriter, r *http.Request) { func (s *Server) serveMachine(w http.ResponseWriter, r *http.Request) { if r.Method != "POST" { - http.Error(w, "POST required", 400) + http.Error(w, "POST required for serveMachine", 400) return } ctx := r.Context() @@ -861,6 +868,132 @@ func (s *Server) serveRegister(w http.ResponseWriter, r *http.Request, mkey key. w.Write(res) } +func (s *Server) serveTKA(w http.ResponseWriter, r *http.Request) { + if r.Method != "GET" { + http.Error(w, "GET required for serveTKA", 400) + return + } + + switch r.URL.Path { + case "/machine/tka/init/begin": + s.serveTKAInitBegin(w, r) + case "/machine/tka/init/finish": + s.serveTKAInitFinish(w, r) + case "/machine/tka/bootstrap": + s.serveTKABootstrap(w, r) + case "/machine/tka/sync/offer": + s.serveTKASyncOffer(w, r) + case "/machine/tka/sign": + s.serveTKASign(w, r) + default: + s.serveUnhandled(w, r) + } +} + +func (s *Server) serveTKAInitBegin(w http.ResponseWriter, r *http.Request) { + s.mu.Lock() + defer s.mu.Unlock() + + nodes := maps.Values(s.nodes) + genesisAUM, err := tkatest.HandleTKAInitBegin(w, r, nodes) + if err != nil { + go panic(fmt.Sprintf("HandleTKAInitBegin: %v", err)) + } + s.tkaStorage = tka.ChonkMem() + s.tkaStorage.CommitVerifiedAUMs([]tka.AUM{*genesisAUM}) +} + +func (s *Server) serveTKAInitFinish(w http.ResponseWriter, r *http.Request) { + signatures, err := tkatest.HandleTKAInitFinish(w, r) + if err != nil { + go panic(fmt.Sprintf("HandleTKAInitFinish: %v", err)) + } + + s.mu.Lock() + defer s.mu.Unlock() + + // Apply the signatures to each of the nodes. Because s.nodes is keyed + // by public key instead of node ID, we have to do this inefficiently. + // + // We only have small tailnets in the integration tests, so this isn't + // much of an issue. + for nodeID, sig := range signatures { + for _, n := range s.nodes { + if n.ID == nodeID { + n.KeySignature = sig + } + } + } +} + +func (s *Server) serveTKABootstrap(w http.ResponseWriter, r *http.Request) { + s.mu.Lock() + defer s.mu.Unlock() + if s.tkaStorage == nil { + http.Error(w, "no TKA state when calling serveTKABootstrap", 400) + return + } + + // Find the genesis AUM, which we need to include in the response. 
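+	// The genesis AUM is the only stored AUM without a parent, so scan
+	// every AUM until we find one whose Parent() reports no parent.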
+ var genesis *tka.AUM + allAUMs, err := s.tkaStorage.AllAUMs() + if err != nil { + http.Error(w, "unable to retrieve all AUMs from TKA state", 500) + return + } + for _, h := range allAUMs { + aum := must.Get(s.tkaStorage.AUM(h)) + if _, hasParent := aum.Parent(); !hasParent { + genesis = &aum + break + } + } + if genesis == nil { + http.Error(w, "unable to find genesis AUM in TKA state", 500) + return + } + + resp := tailcfg.TKABootstrapResponse{ + GenesisAUM: genesis.Serialize(), + } + _, err = tkatest.HandleTKABootstrap(w, r, resp) + if err != nil { + go panic(fmt.Sprintf("HandleTKABootstrap: %v", err)) + } +} + +func (s *Server) serveTKASyncOffer(w http.ResponseWriter, r *http.Request) { + s.mu.Lock() + defer s.mu.Unlock() + + authority, err := tka.Open(s.tkaStorage) + if err != nil { + go panic(fmt.Sprintf("serveTKASyncOffer: tka.Open: %v", err)) + } + + err = tkatest.HandleTKASyncOffer(w, r, authority, s.tkaStorage) + if err != nil { + go panic(fmt.Sprintf("HandleTKASyncOffer: %v", err)) + } +} + +func (s *Server) serveTKASign(w http.ResponseWriter, r *http.Request) { + s.mu.Lock() + defer s.mu.Unlock() + + authority, err := tka.Open(s.tkaStorage) + if err != nil { + go panic(fmt.Sprintf("serveTKASign: tka.Open: %v", err)) + } + + sig, keyBeingSigned, err := tkatest.HandleTKASign(w, r, authority) + if err != nil { + go panic(fmt.Sprintf("HandleTKASign: %v", err)) + } + s.nodes[*keyBeingSigned].KeySignature = *sig + s.updateLocked("TKASign", s.nodeIDsLocked(0)) +} + // updateType indicates why a long-polling map request is being woken // up for an update. type updateType int @@ -1197,6 +1330,21 @@ func (s *Server) MapResponse(req *tailcfg.MapRequest) (res *tailcfg.MapResponse, v6Prefix, } + // If the server is tracking TKA state, and there's a single TKA head, + // add it to the MapResponse. + if s.tkaStorage != nil { + heads, err := s.tkaStorage.Heads() + if err != nil { + log.Printf("unable to get TKA heads: %v", err) + } else if len(heads) != 1 { + log.Printf("unable to get single TKA head, got %v", heads) + } else { + res.TKAInfo = &tailcfg.TKAInfo{ + Head: heads[0].Hash().String(), + } + } + } + s.mu.Lock() defer s.mu.Unlock() res.Node.PrimaryRoutes = s.nodeSubnetRoutes[nk] diff --git a/tstest/tkatest/tkatest.go b/tstest/tkatest/tkatest.go new file mode 100644 index 000000000..fb157a1a1 --- /dev/null +++ b/tstest/tkatest/tkatest.go @@ -0,0 +1,220 @@ +// Copyright (c) Tailscale Inc & AUTHORS +// SPDX-License-Identifier: BSD-3-Clause + +// tkatest has functions for creating a mock control server that responds +// to TKA endpoints. +package tkatest + +import ( + "encoding/json" + "errors" + "fmt" + "iter" + "log" + "net/http" + + "tailscale.com/tailcfg" + "tailscale.com/tka" + "tailscale.com/types/key" + "tailscale.com/types/tkatype" +) + +func serverError(w http.ResponseWriter, format string, a ...any) error { + err := fmt.Sprintf(format, a...) + http.Error(w, err, 500) + log.Printf("returning HTTP 500 error: %v", err) + return errors.New(err) +} + +func userError(w http.ResponseWriter, format string, a ...any) error { + err := fmt.Sprintf(format, a...) + http.Error(w, err, 400) + return errors.New(err) +} + +// HandleTKAInitBegin handles a request to /machine/tka/init/begin. +// +// If the request contains a valid genesis AUM, it sends a response to the +// client, and returns the AUM to the caller. 
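+//
+// The response lists every node known to the server in NeedSignatures;
+// the caller is expected to collect signatures for those nodes and submit
+// them via the init/finish endpoint (see HandleTKAInitFinish).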
+func HandleTKAInitBegin(w http.ResponseWriter, r *http.Request, nodes iter.Seq[*tailcfg.Node]) (*tka.AUM, error) {
+	var req *tailcfg.TKAInitBeginRequest
+	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+		return nil, userError(w, "Decode: %v", err)
+	}
+	var aum tka.AUM
+	if err := aum.Unserialize(req.GenesisAUM); err != nil {
+		return nil, userError(w, "invalid genesis AUM: %v", err)
+	}
+	beginResp := tailcfg.TKAInitBeginResponse{}
+	for n := range nodes {
+		beginResp.NeedSignatures = append(
+			beginResp.NeedSignatures,
+			tailcfg.TKASignInfo{
+				NodeID:     n.ID,
+				NodePublic: n.Key,
+			},
+		)
+	}
+
+	w.WriteHeader(200)
+	if err := json.NewEncoder(w).Encode(beginResp); err != nil {
+		return nil, serverError(w, "Encode: %v", err)
+	}
+	return &aum, nil
+}
+
+// HandleTKAInitFinish handles a request to /machine/tka/init/finish.
+//
+// It sends a response to the client, and gives the caller a list of node
+// signatures to apply.
+//
+// This method assumes that the node signatures are valid, and does not
+// verify them with the supplied public key.
+func HandleTKAInitFinish(w http.ResponseWriter, r *http.Request) (map[tailcfg.NodeID]tkatype.MarshaledSignature, error) {
+	var req *tailcfg.TKAInitFinishRequest
+	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+		return nil, userError(w, "Decode: %v", err)
+	}
+
+	w.WriteHeader(200)
+	w.Write([]byte("{}"))
+
+	return req.Signatures, nil
+}
+
+// HandleTKABootstrap handles a request to /machine/tka/bootstrap.
+//
+// If the request is valid, it sends a response to the client, and returns
+// the parsed request to the caller.
+func HandleTKABootstrap(w http.ResponseWriter, r *http.Request, resp tailcfg.TKABootstrapResponse) (*tailcfg.TKABootstrapRequest, error) {
+	req := new(tailcfg.TKABootstrapRequest)
+	if err := json.NewDecoder(r.Body).Decode(req); err != nil {
+		return nil, userError(w, "Decode: %v", err)
+	}
+	if req.Version != tailcfg.CurrentCapabilityVersion {
+		return nil, userError(w, "bootstrap CapVer = %v, want %v", req.Version, tailcfg.CurrentCapabilityVersion)
+	}
+
+	w.WriteHeader(200)
+	if err := json.NewEncoder(w).Encode(resp); err != nil {
+		return nil, serverError(w, "Encode: %v", err)
+	}
+	return req, nil
+}
+
+// HandleTKASyncOffer handles a request to /machine/tka/sync/offer.
+//
+// It compares the node's sync offer against the authority's state, and
+// responds with the authority's own offer plus any AUMs the node is missing.
+func HandleTKASyncOffer(w http.ResponseWriter, r *http.Request, authority *tka.Authority, chonk tka.Chonk) error {
+	body := new(tailcfg.TKASyncOfferRequest)
+	if err := json.NewDecoder(r.Body).Decode(body); err != nil {
+		return userError(w, "Decode: %v", err)
+	}
+
+	log.Printf("got sync offer:\n%+v", body)
+
+	nodeOffer, err := tka.ToSyncOffer(body.Head, body.Ancestors)
+	if err != nil {
+		return userError(w, "ToSyncOffer: %v", err)
+	}
+
+	controlOffer, err := authority.SyncOffer(chonk)
+	if err != nil {
+		return serverError(w, "authority.SyncOffer: %v", err)
+	}
+	sendAUMs, err := authority.MissingAUMs(chonk, nodeOffer)
+	if err != nil {
+		return serverError(w, "authority.MissingAUMs: %v", err)
+	}
+
+	head, ancestors, err := tka.FromSyncOffer(controlOffer)
+	if err != nil {
+		return serverError(w, "FromSyncOffer: %v", err)
+	}
+	resp := tailcfg.TKASyncOfferResponse{
+		Head:        head,
+		Ancestors:   ancestors,
+		MissingAUMs: make([]tkatype.MarshaledAUM, len(sendAUMs)),
+	}
+	for i, a := range sendAUMs {
+		resp.MissingAUMs[i] = a.Serialize()
+	}
+
+	log.Printf("responding to sync offer with:\n%+v", resp)
+	w.WriteHeader(200)
+	if err := json.NewEncoder(w).Encode(resp); err != nil {
+		return serverError(w, "Encode: %v", err)
+	}
+	return nil
+}
+
+// HandleTKASign handles a request to /machine/tka/sign.
+//
+// If the signature request is valid, it sends a response to the client, and
+// gives the caller the signature and public key of the node being signed.
+func HandleTKASign(w http.ResponseWriter, r *http.Request, authority *tka.Authority) (*tkatype.MarshaledSignature, *key.NodePublic, error) {
+	req := new(tailcfg.TKASubmitSignatureRequest)
+	if err := json.NewDecoder(r.Body).Decode(req); err != nil {
+		return nil, nil, userError(w, "Decode: %v", err)
+	}
+	if req.Version != tailcfg.CurrentCapabilityVersion {
+		return nil, nil, userError(w, "sign CapVer = %v, want %v", req.Version, tailcfg.CurrentCapabilityVersion)
+	}
+
+	var sig tka.NodeKeySignature
+	if err := sig.Unserialize(req.Signature); err != nil {
+		return nil, nil, userError(w, "malformed signature: %v", err)
+	}
+	var keyBeingSigned key.NodePublic
+	if err := keyBeingSigned.UnmarshalBinary(sig.Pubkey); err != nil {
+		return nil, nil, userError(w, "malformed signature pubkey: %v", err)
+	}
+	if err := authority.NodeKeyAuthorized(keyBeingSigned, req.Signature); err != nil {
+		return nil, nil, userError(w, "signature does not verify: %v", err)
+	}
+
+	w.WriteHeader(200)
+	if err := json.NewEncoder(w).Encode(tailcfg.TKASubmitSignatureResponse{}); err != nil {
+		return nil, nil, serverError(w, "Encode: %v", err)
+	}
+	return &req.Signature, &keyBeingSigned, nil
+}
+
+// HandleTKASyncSend handles a request to /machine/tka/sync/send.
+//
+// If the request is valid, it adds the new AUMs to the authority, and sends
+// a response to the client with the new head.
+func HandleTKASyncSend(w http.ResponseWriter, r *http.Request, authority *tka.Authority, chonk tka.Chonk) error {
+	body := new(tailcfg.TKASyncSendRequest)
+	if err := json.NewDecoder(r.Body).Decode(body); err != nil {
+		return userError(w, "Decode: %v", err)
+	}
+	log.Printf("got sync send:\n%+v", body)
+
+	var remoteHead tka.AUMHash
+	if err := remoteHead.UnmarshalText([]byte(body.Head)); err != nil {
+		return userError(w, "head unmarshal: %v", err)
+	}
+	toApply := make([]tka.AUM, len(body.MissingAUMs))
+	for i, a := range body.MissingAUMs {
+		if err := toApply[i].Unserialize(a); err != nil {
+			return userError(w, "decoding missingAUM[%d]: %v", i, err)
+		}
+	}
+
+	if len(toApply) > 0 {
+		if err := authority.Inform(chonk, toApply); err != nil {
+			return serverError(w, "authority.Inform(%+v) failed: %v", toApply, err)
+		}
+	}
+	head, err := authority.Head().MarshalText()
+	if err != nil {
+		return serverError(w, "head marshal: %v", err)
+	}
+
+	resp := tailcfg.TKASyncSendResponse{
+		Head: string(head),
+	}
+	w.WriteHeader(200)
+	if err := json.NewEncoder(w).Encode(resp); err != nil {
+		return serverError(w, "Encode: %v", err)
+	}
+	return nil
+}

From 8af7778ce04457a5f84a45e7cc8f58f02b7bfb4c Mon Sep 17 00:00:00 2001
From: Brad Fitzpatrick
Date: Wed, 26 Nov 2025 09:26:08 -0800
Subject: [PATCH 28/33] util/execqueue: don't hold mutex in RunSync

We don't hold q.mu while running normal ExecQueue.Add funcs, so we
shouldn't in RunSync either. Otherwise code it calls can't shut down
the queue, as seen in #18502.
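For example, this hypothetical caller (mirroring the regression test
added below) used to deadlock, because Shutdown acquires q.mu, which the
old RunSync still held while f ran:

	var q execqueue.ExecQueue
	_ = q.RunSync(context.Background(), func() {
		q.Shutdown() // acquires q.mu; deadlocked under the old RunSync
	})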
Updates #18502

Co-authored-by: Nick Khyl
Change-Id: Ic5e53440411eca5e9fabac7f4a68a9f6ef026de1
Signed-off-by: Brad Fitzpatrick
---
 util/execqueue/execqueue.go      | 37 +++++++++++++++++---------------
 util/execqueue/execqueue_test.go |  9 ++++++++
 2 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/util/execqueue/execqueue.go b/util/execqueue/execqueue.go
index 2ea0c1f2f..87616a6b5 100644
--- a/util/execqueue/execqueue.go
+++ b/util/execqueue/execqueue.go
@@ -39,21 +39,21 @@ func (q *ExecQueue) Add(f func()) {
 // RunSync waits for the queue to be drained and then synchronously runs f.
 // It returns an error if the queue is closed before f is run or ctx expires.
 func (q *ExecQueue) RunSync(ctx context.Context, f func()) error {
-	for {
-		if err := q.Wait(ctx); err != nil {
-			return err
-		}
-		q.mu.Lock()
-		if q.inFlight {
-			q.mu.Unlock()
-			continue
-		}
-		defer q.mu.Unlock()
-		if q.closed {
-			return errors.New("closed")
-		}
-		f()
+	q.mu.Lock()
+	q.initCtxLocked()
+	shutdownCtx := q.ctx
+	q.mu.Unlock()
+
+	ch := make(chan struct{})
+	q.Add(f)
+	q.Add(func() { close(ch) })
+	select {
+	case <-ch:
 		return nil
+	case <-ctx.Done():
+		return ctx.Err()
+	case <-shutdownCtx.Done():
+		return errExecQueueShutdown
 	}
 }
@@ -94,6 +94,8 @@ func (q *ExecQueue) initCtxLocked() {
 	}
 }
 
+var errExecQueueShutdown = errors.New("execqueue shut down")
+
 // Wait waits for the queue to be empty or shut down.
 func (q *ExecQueue) Wait(ctx context.Context) error {
 	q.mu.Lock()
@@ -104,10 +106,11 @@ func (q *ExecQueue) Wait(ctx context.Context) error {
 		q.doneWaiter = waitCh
 	}
 	closed := q.closed
+	shutdownCtx := q.ctx
 	q.mu.Unlock()
 
 	if closed {
-		return errors.New("execqueue shut down")
+		return errExecQueueShutdown
 	}
 	if waitCh == nil {
 		return nil
@@ -116,8 +119,8 @@ func (q *ExecQueue) Wait(ctx context.Context) error {
 	select {
 	case <-waitCh:
 		return nil
-	case <-q.ctx.Done():
-		return errors.New("execqueue shut down")
+	case <-shutdownCtx.Done():
+		return errExecQueueShutdown
 	case <-ctx.Done():
 		return ctx.Err()
 	}
diff --git a/util/execqueue/execqueue_test.go b/util/execqueue/execqueue_test.go
index d10b741f7..1bce69556 100644
--- a/util/execqueue/execqueue_test.go
+++ b/util/execqueue/execqueue_test.go
@@ -20,3 +20,12 @@ func TestExecQueue(t *testing.T) {
 		t.Errorf("n=%d; want 1", got)
 	}
 }
+
+// Test that RunSync doesn't hold q.mu and block Shutdown,
+// as we saw in tailscale/tailscale#18502.
+func TestExecQueueRunSyncLocking(t *testing.T) {
+	q := &ExecQueue{}
+	q.RunSync(t.Context(), func() {
+		q.Shutdown()
+	})
+}

From 9eff8a45034bc36a17004dce1fe6e7732af631a4 Mon Sep 17 00:00:00 2001
From: Andrew Lytvynov
Date: Wed, 26 Nov 2025 12:35:24 -0600
Subject: [PATCH 29/33] feature/tpm: return opening errors from both /dev/tpmrm0 and /dev/tpm0 (#18071)

This might help users diagnose why TPM access is failing for tpmrm0.

Fixes #18026

Signed-off-by: Andrew Lytvynov
---
 feature/tpm/tpm_linux.go | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/feature/tpm/tpm_linux.go b/feature/tpm/tpm_linux.go
index 6c8131e8d..3f05c9a8c 100644
--- a/feature/tpm/tpm_linux.go
+++ b/feature/tpm/tpm_linux.go
@@ -4,6 +4,8 @@
 package tpm
 
 import (
+	"errors"
+
 	"github.com/google/go-tpm/tpm2/transport"
 	"github.com/google/go-tpm/tpm2/transport/linuxtpm"
 )
@@ -13,5 +15,11 @@ func open() (transport.TPMCloser, error) {
 	if err == nil {
 		return tpm, nil
 	}
-	return linuxtpm.Open("/dev/tpm0")
+	errs := []error{err}
+	tpm, err = linuxtpm.Open("/dev/tpm0")
+	if err == nil {
+		return tpm, nil
+	}
+	errs = append(errs, err)
+	return nil, errors.Join(errs...)
} From 5ee0c6bf1df5a96f5f58198dcf9d3b241a8ccef1 Mon Sep 17 00:00:00 2001 From: James Tucker Date: Tue, 25 Nov 2025 15:56:44 -0800 Subject: [PATCH 30/33] derp/derpserver: add a unique sender cardinality estimate Adds an observation point that may identify potentially abusive traffic patterns at outlier values. Updates tailscale/corp#24681 Signed-off-by: James Tucker --- cmd/derper/depaware.txt | 2 + derp/derpserver/derpserver.go | 33 ++++- derp/derpserver/derpserver_test.go | 195 +++++++++++++++++++++++++++++ flake.nix | 2 +- go.mod | 2 + go.mod.sri | 2 +- go.sum | 4 + shell.nix | 2 +- 8 files changed, 238 insertions(+), 4 deletions(-) diff --git a/cmd/derper/depaware.txt b/cmd/derper/depaware.txt index 6608faaf7..9c720fa60 100644 --- a/cmd/derper/depaware.txt +++ b/cmd/derper/depaware.txt @@ -2,6 +2,7 @@ tailscale.com/cmd/derper dependencies: (generated by github.com/tailscale/depawa filippo.io/edwards25519 from github.com/hdevalence/ed25519consensus filippo.io/edwards25519/field from filippo.io/edwards25519 + github.com/axiomhq/hyperloglog from tailscale.com/derp/derpserver github.com/beorn7/perks/quantile from github.com/prometheus/client_golang/prometheus 💣 github.com/cespare/xxhash/v2 from github.com/prometheus/client_golang/prometheus github.com/coder/websocket from tailscale.com/cmd/derper+ @@ -9,6 +10,7 @@ tailscale.com/cmd/derper dependencies: (generated by github.com/tailscale/depawa github.com/coder/websocket/internal/util from github.com/coder/websocket github.com/coder/websocket/internal/xsync from github.com/coder/websocket W 💣 github.com/dblohm7/wingoes from tailscale.com/util/winutil + github.com/dgryski/go-metro from github.com/axiomhq/hyperloglog github.com/fxamacker/cbor/v2 from tailscale.com/tka github.com/go-json-experiment/json from tailscale.com/types/opt+ github.com/go-json-experiment/json/internal from github.com/go-json-experiment/json+ diff --git a/derp/derpserver/derpserver.go b/derp/derpserver/derpserver.go index 0bbc66780..1879e0c53 100644 --- a/derp/derpserver/derpserver.go +++ b/derp/derpserver/derpserver.go @@ -36,6 +36,7 @@ import ( "sync/atomic" "time" + "github.com/axiomhq/hyperloglog" "go4.org/mem" "golang.org/x/sync/errgroup" "tailscale.com/client/local" @@ -1643,6 +1644,12 @@ type sclient struct { sawSrc map[key.NodePublic]set.Handle bw *lazyBufioWriter + // senderCardinality estimates the number of unique peers that have + // sent packets to this client. Owned by sendLoop, protected by + // senderCardinalityMu for reads from other goroutines. 
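+	//
+	// The estimate is probabilistic: a hyperloglog sketch uses constant
+	// memory and is typically accurate to within a few percent, which is
+	// sufficient for spotting outlier traffic patterns.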
+ senderCardinalityMu sync.Mutex + senderCardinality *hyperloglog.Sketch + // Guarded by s.mu // // peerStateChange is used by mesh peers (a set of regional @@ -1778,6 +1785,8 @@ func (c *sclient) onSendLoopDone() { func (c *sclient) sendLoop(ctx context.Context) error { defer c.onSendLoopDone() + c.senderCardinality = hyperloglog.New() + jitter := rand.N(5 * time.Second) keepAliveTick, keepAliveTickChannel := c.s.clock.NewTicker(derp.KeepAlive + jitter) defer keepAliveTick.Stop() @@ -2000,6 +2009,11 @@ func (c *sclient) sendPacket(srcKey key.NodePublic, contents []byte) (err error) if withKey { pktLen += key.NodePublicRawLen c.noteSendFromSrc(srcKey) + if c.senderCardinality != nil { + c.senderCardinalityMu.Lock() + c.senderCardinality.Insert(srcKey.AppendTo(nil)) + c.senderCardinalityMu.Unlock() + } } if err = derp.WriteFrameHeader(c.bw.bw(), derp.FrameRecvPacket, uint32(pktLen)); err != nil { return err @@ -2013,6 +2027,17 @@ func (c *sclient) sendPacket(srcKey key.NodePublic, contents []byte) (err error) return err } +// EstimatedUniqueSenders returns an estimate of the number of unique peers +// that have sent packets to this client. +func (c *sclient) EstimatedUniqueSenders() uint64 { + c.senderCardinalityMu.Lock() + defer c.senderCardinalityMu.Unlock() + if c.senderCardinality == nil { + return 0 + } + return c.senderCardinality.Estimate() +} + // noteSendFromSrc notes that we are about to write a packet // from src to sclient. // @@ -2295,7 +2320,8 @@ type BytesSentRecv struct { Sent uint64 Recv uint64 // Key is the public key of the client which sent/received these bytes. - Key key.NodePublic + Key key.NodePublic + UniqueSenders uint64 `json:",omitzero"` } // parseSSOutput parses the output from the specific call to ss in ServeDebugTraffic. 
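With the new field, each record streamed by ServeDebugTraffic below is a
JSON object along these lines (values illustrative; UniqueSenders is
omitted when zero, per its omitzero tag):

	{"Sent":12345,"Recv":67890,"Key":"nodekey:ab12...","UniqueSenders":5}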
@@ -2349,6 +2375,11 @@ func (s *Server) ServeDebugTraffic(w http.ResponseWriter, r *http.Request) { if prev.Sent < next.Sent || prev.Recv < next.Recv { if pkey, ok := s.keyOfAddr[k]; ok { next.Key = pkey + if cs, ok := s.clients[pkey]; ok { + if c := cs.activeClient.Load(); c != nil { + next.UniqueSenders = c.EstimatedUniqueSenders() + } + } if err := enc.Encode(next); err != nil { s.mu.Unlock() return diff --git a/derp/derpserver/derpserver_test.go b/derp/derpserver/derpserver_test.go index 2db5f25bc..1dd86f314 100644 --- a/derp/derpserver/derpserver_test.go +++ b/derp/derpserver/derpserver_test.go @@ -9,6 +9,7 @@ import ( "context" "crypto/x509" "encoding/asn1" + "encoding/binary" "expvar" "fmt" "log" @@ -20,6 +21,7 @@ import ( "testing" "time" + "github.com/axiomhq/hyperloglog" qt "github.com/frankban/quicktest" "go4.org/mem" "golang.org/x/time/rate" @@ -755,6 +757,35 @@ func TestParseSSOutput(t *testing.T) { } } +func TestServeDebugTrafficUniqueSenders(t *testing.T) { + s := New(key.NewNode(), t.Logf) + defer s.Close() + + clientKey := key.NewNode().Public() + c := &sclient{ + key: clientKey, + s: s, + logf: logger.Discard, + senderCardinality: hyperloglog.New(), + } + + for i := 0; i < 5; i++ { + c.senderCardinality.Insert(key.NewNode().Public().AppendTo(nil)) + } + + s.mu.Lock() + cs := &clientSet{} + cs.activeClient.Store(c) + s.clients[clientKey] = cs + s.mu.Unlock() + + estimate := c.EstimatedUniqueSenders() + t.Logf("Estimated unique senders: %d", estimate) + if estimate < 4 || estimate > 6 { + t.Errorf("EstimatedUniqueSenders() = %d, want ~5 (4-6 range)", estimate) + } +} + func TestGetPerClientSendQueueDepth(t *testing.T) { c := qt.New(t) envKey := "TS_DEBUG_DERP_PER_CLIENT_SEND_QUEUE_DEPTH" @@ -780,3 +811,167 @@ func TestGetPerClientSendQueueDepth(t *testing.T) { }) } } + +func TestSenderCardinality(t *testing.T) { + s := New(key.NewNode(), t.Logf) + defer s.Close() + + c := &sclient{ + key: key.NewNode().Public(), + s: s, + logf: logger.WithPrefix(t.Logf, "test client: "), + } + + if got := c.EstimatedUniqueSenders(); got != 0 { + t.Errorf("EstimatedUniqueSenders() before init = %d, want 0", got) + } + + c.senderCardinality = hyperloglog.New() + + if got := c.EstimatedUniqueSenders(); got != 0 { + t.Errorf("EstimatedUniqueSenders() with no senders = %d, want 0", got) + } + + senders := make([]key.NodePublic, 10) + for i := range senders { + senders[i] = key.NewNode().Public() + c.senderCardinality.Insert(senders[i].AppendTo(nil)) + } + + estimate := c.EstimatedUniqueSenders() + t.Logf("Estimated unique senders after 10 inserts: %d", estimate) + + if estimate < 8 || estimate > 12 { + t.Errorf("EstimatedUniqueSenders() = %d, want ~10 (8-12 range)", estimate) + } + + for i := 0; i < 5; i++ { + c.senderCardinality.Insert(senders[i].AppendTo(nil)) + } + + estimate2 := c.EstimatedUniqueSenders() + t.Logf("Estimated unique senders after duplicates: %d", estimate2) + + if estimate2 < 8 || estimate2 > 12 { + t.Errorf("EstimatedUniqueSenders() after duplicates = %d, want ~10 (8-12 range)", estimate2) + } +} + +func TestSenderCardinality100(t *testing.T) { + s := New(key.NewNode(), t.Logf) + defer s.Close() + + c := &sclient{ + key: key.NewNode().Public(), + s: s, + logf: logger.WithPrefix(t.Logf, "test client: "), + senderCardinality: hyperloglog.New(), + } + + numSenders := 100 + for i := 0; i < numSenders; i++ { + c.senderCardinality.Insert(key.NewNode().Public().AppendTo(nil)) + } + + estimate := c.EstimatedUniqueSenders() + t.Logf("Estimated unique senders for 100 actual senders: 
%d", estimate) + + if estimate < 85 || estimate > 115 { + t.Errorf("EstimatedUniqueSenders() = %d, want ~100 (85-115 range)", estimate) + } +} + +func TestSenderCardinalityTracking(t *testing.T) { + s := New(key.NewNode(), t.Logf) + defer s.Close() + + c := &sclient{ + key: key.NewNode().Public(), + s: s, + logf: logger.WithPrefix(t.Logf, "test client: "), + senderCardinality: hyperloglog.New(), + } + + zeroKey := key.NodePublic{} + if zeroKey != (key.NodePublic{}) { + c.senderCardinality.Insert(zeroKey.AppendTo(nil)) + } + + if estimate := c.EstimatedUniqueSenders(); estimate != 0 { + t.Errorf("EstimatedUniqueSenders() after zero key = %d, want 0", estimate) + } + + sender1 := key.NewNode().Public() + sender2 := key.NewNode().Public() + + if sender1 != (key.NodePublic{}) { + c.senderCardinality.Insert(sender1.AppendTo(nil)) + } + if sender2 != (key.NodePublic{}) { + c.senderCardinality.Insert(sender2.AppendTo(nil)) + } + + estimate := c.EstimatedUniqueSenders() + t.Logf("Estimated unique senders after 2 senders: %d", estimate) + + if estimate < 1 || estimate > 3 { + t.Errorf("EstimatedUniqueSenders() = %d, want ~2 (1-3 range)", estimate) + } +} + +func BenchmarkHyperLogLogInsert(b *testing.B) { + hll := hyperloglog.New() + sender := key.NewNode().Public() + senderBytes := sender.AppendTo(nil) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + hll.Insert(senderBytes) + } +} + +func BenchmarkHyperLogLogInsertUnique(b *testing.B) { + hll := hyperloglog.New() + + b.ResetTimer() + + buf := make([]byte, 32) + for i := 0; i < b.N; i++ { + binary.LittleEndian.PutUint64(buf, uint64(i)) + hll.Insert(buf) + } +} + +func BenchmarkHyperLogLogEstimate(b *testing.B) { + hll := hyperloglog.New() + + for i := 0; i < 100; i++ { + hll.Insert(key.NewNode().Public().AppendTo(nil)) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = hll.Estimate() + } +} + +func BenchmarkSenderCardinalityOverhead(b *testing.B) { + hll := hyperloglog.New() + sender := key.NewNode().Public() + + b.Run("WithTracking", func(b *testing.B) { + b.ReportAllocs() + for i := 0; i < b.N; i++ { + if hll != nil { + hll.Insert(sender.AppendTo(nil)) + } + } + }) + + b.Run("WithoutTracking", func(b *testing.B) { + b.ReportAllocs() + for i := 0; i < b.N; i++ { + _ = sender.AppendTo(nil) + } + }) +} diff --git a/flake.nix b/flake.nix index 505061a76..855ce555b 100644 --- a/flake.nix +++ b/flake.nix @@ -151,4 +151,4 @@ }); }; } -# nix-direnv cache busting line: sha256-jJSSXMyUqcJoZuqfSlBsKDQezyqS+jDkRglMMjG1K8g= +# nix-direnv cache busting line: sha256-IkodqRYdueML7U2Hh8vRw6Et7+WII+VXuPJ3jZ2xYx8= diff --git a/go.mod b/go.mod index a49a9724f..bd6fe441d 100644 --- a/go.mod +++ b/go.mod @@ -16,6 +16,7 @@ require ( github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.58 github.com/aws/aws-sdk-go-v2/service/s3 v1.75.3 github.com/aws/aws-sdk-go-v2/service/ssm v1.44.7 + github.com/axiomhq/hyperloglog v0.0.0-20240319100328-84253e514e02 github.com/bradfitz/go-tool-cache v0.0.0-20251113223507-0124e698e0bd github.com/bramvdbogaerde/go-scp v1.4.0 github.com/cilium/ebpf v0.15.0 @@ -149,6 +150,7 @@ require ( github.com/containerd/typeurl/v2 v2.2.3 // indirect github.com/cyphar/filepath-securejoin v0.3.6 // indirect github.com/deckarep/golang-set/v2 v2.8.0 // indirect + github.com/dgryski/go-metro v0.0.0-20180109044635-280f6062b5bc // indirect github.com/docker/go-connections v0.5.0 // indirect github.com/docker/go-units v0.5.0 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect diff --git a/go.mod.sri b/go.mod.sri index 66422652e..329fe9405 
100644 --- a/go.mod.sri +++ b/go.mod.sri @@ -1 +1 @@ -sha256-jJSSXMyUqcJoZuqfSlBsKDQezyqS+jDkRglMMjG1K8g= +sha256-IkodqRYdueML7U2Hh8vRw6Et7+WII+VXuPJ3jZ2xYx8= diff --git a/go.sum b/go.sum index f70fe9159..111c99ac9 100644 --- a/go.sum +++ b/go.sum @@ -170,6 +170,8 @@ github.com/aws/aws-sdk-go-v2/service/sts v1.33.13 h1:3LXNnmtH3TURctC23hnC0p/39Q5 github.com/aws/aws-sdk-go-v2/service/sts v1.33.13/go.mod h1:7Yn+p66q/jt38qMoVfNvjbm3D89mGBnkwDcijgtih8w= github.com/aws/smithy-go v1.22.2 h1:6D9hW43xKFrRx/tXXfAlIZc4JI+yQe6snnWcQyxSyLQ= github.com/aws/smithy-go v1.22.2/go.mod h1:irrKGvNn1InZwb2d7fkIRNucdfwR8R+Ts3wxYa/cJHg= +github.com/axiomhq/hyperloglog v0.0.0-20240319100328-84253e514e02 h1:bXAPYSbdYbS5VTy92NIUbeDI1qyggi+JYh5op9IFlcQ= +github.com/axiomhq/hyperloglog v0.0.0-20240319100328-84253e514e02/go.mod h1:k08r+Yj1PRAmuayFiRK6MYuR5Ve4IuZtTfxErMIh0+c= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= @@ -271,6 +273,8 @@ github.com/deckarep/golang-set/v2 v2.8.0 h1:swm0rlPCmdWn9mESxKOjWk8hXSqoxOp+Zlfu github.com/deckarep/golang-set/v2 v2.8.0/go.mod h1:VAky9rY/yGXJOLEDv3OMci+7wtDpOF4IN+y82NBOac4= github.com/denis-tingaikin/go-header v0.5.0 h1:SRdnP5ZKvcO9KKRP1KJrhFR3RrlGuD+42t4429eC9k8= github.com/denis-tingaikin/go-header v0.5.0/go.mod h1:mMenU5bWrok6Wl2UsZjy+1okegmwQ3UgWl4V1D8gjlY= +github.com/dgryski/go-metro v0.0.0-20180109044635-280f6062b5bc h1:8WFBn63wegobsYAX0YjD+8suexZDga5CctH4CCTx2+8= +github.com/dgryski/go-metro v0.0.0-20180109044635-280f6062b5bc/go.mod h1:c9O8+fpSOX1DM8cPNSkX/qsBWdkD4yd2dpciOWQjpBw= github.com/digitalocean/go-smbios v0.0.0-20180907143718-390a4f403a8e h1:vUmf0yezR0y7jJ5pceLHthLaYf4bA5T14B6q39S4q2Q= github.com/digitalocean/go-smbios v0.0.0-20180907143718-390a4f403a8e/go.mod h1:YTIHhz/QFSYnu/EhlF2SpU2Uk+32abacUYA5ZPljz1A= github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk= diff --git a/shell.nix b/shell.nix index d412693d9..28bdbdafb 100644 --- a/shell.nix +++ b/shell.nix @@ -16,4 +16,4 @@ ) { src = ./.; }).shellNix -# nix-direnv cache busting line: sha256-jJSSXMyUqcJoZuqfSlBsKDQezyqS+jDkRglMMjG1K8g= +# nix-direnv cache busting line: sha256-IkodqRYdueML7U2Hh8vRw6Et7+WII+VXuPJ3jZ2xYx8= From 3f9f0ed93c010eb0aae1ddf968ed2f81c4d42a5d Mon Sep 17 00:00:00 2001 From: Jonathan Nobels Date: Wed, 26 Nov 2025 15:49:52 -0500 Subject: [PATCH 31/33] VERSION.txt: this is v1.93.0 (#18074) Signed-off-by: Jonathan Nobels --- VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION.txt b/VERSION.txt index 6979a6c06..95784efdd 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -1.91.0 +1.93.0 From 74ed589042c4fc255d148fc5356dc7e3aa1693be Mon Sep 17 00:00:00 2001 From: Brad Fitzpatrick Date: Wed, 19 Nov 2025 10:54:42 -0800 Subject: [PATCH 32/33] syncs: add means of declare locking assumptions for debug mode validation Updates #17852 Change-Id: I42a64a990dcc8f708fa23a516a40731a19967aba Signed-off-by: Brad Fitzpatrick --- ipn/ipnlocal/local.go | 39 +++++++++++++++++++++++++++++++++++++++ syncs/mutex.go | 5 +++++ syncs/mutex_debug.go | 4 ++++ 3 files changed, 48 insertions(+) diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index 3e7054896..fbf34aa42 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -876,6 +876,7 @@ func (b *LocalBackend) initPrefsFromConfig(conf 
*conffile.Config) error { } func (b *LocalBackend) setStaticEndpointsFromConfigLocked(conf *conffile.Config) { + syncs.RequiresMutex(&b.mu) if conf.Parsed.StaticEndpoints == nil && (b.conf == nil || b.conf.Parsed.StaticEndpoints == nil) { return } @@ -894,6 +895,7 @@ func (b *LocalBackend) setStaticEndpointsFromConfigLocked(conf *conffile.Config) } func (b *LocalBackend) setStateLocked(state ipn.State) { + syncs.RequiresMutex(&b.mu) if b.state == state { return } @@ -906,6 +908,7 @@ func (b *LocalBackend) setStateLocked(state ipn.State) { // setConfigLocked uses the provided config to update the backend's prefs // and other state. func (b *LocalBackend) setConfigLocked(conf *conffile.Config) error { + syncs.RequiresMutex(&b.mu) p := b.pm.CurrentPrefs().AsStruct() mp, err := conf.Parsed.ToPrefs() if err != nil { @@ -927,6 +930,7 @@ var assumeNetworkUpdateForTest = envknob.RegisterBool("TS_ASSUME_NETWORK_UP_FOR_ // // b.mu must be held. func (b *LocalBackend) pauseOrResumeControlClientLocked() { + syncs.RequiresMutex(&b.mu) if b.cc == nil { return } @@ -1204,6 +1208,7 @@ func (b *LocalBackend) Prefs() ipn.PrefsView { } func (b *LocalBackend) sanitizedPrefsLocked() ipn.PrefsView { + syncs.RequiresMutex(&b.mu) return stripKeysFromPrefs(b.pm.CurrentPrefs()) } @@ -1335,6 +1340,7 @@ func (b *LocalBackend) UpdateStatus(sb *ipnstate.StatusBuilder) { } func (b *LocalBackend) populatePeerStatusLocked(sb *ipnstate.StatusBuilder) { + syncs.RequiresMutex(&b.mu) cn := b.currentNode() nm := cn.NetMap() if nm == nil { @@ -1873,6 +1879,8 @@ func (b *LocalBackend) applySysPolicyLocked(prefs *ipn.Prefs) (anyChange bool) { if !buildfeatures.HasSystemPolicy { return false } + syncs.RequiresMutex(&b.mu) + if controlURL, err := b.polc.GetString(pkey.ControlURL, prefs.ControlURL); err == nil && prefs.ControlURL != controlURL { prefs.ControlURL = controlURL anyChange = true @@ -1941,6 +1949,8 @@ func (b *LocalBackend) applyExitNodeSysPolicyLocked(prefs *ipn.Prefs) (anyChange if !buildfeatures.HasUseExitNode { return false } + syncs.RequiresMutex(&b.mu) + if exitNodeIDStr, _ := b.polc.GetString(pkey.ExitNodeID, ""); exitNodeIDStr != "" { exitNodeID := tailcfg.StableNodeID(exitNodeIDStr) @@ -2182,6 +2192,8 @@ func (b *LocalBackend) resolveAutoExitNodeLocked(prefs *ipn.Prefs) (prefsChanged if !buildfeatures.HasUseExitNode { return false } + syncs.RequiresMutex(&b.mu) + // As of 2025-07-08, the only supported auto exit node expression is [ipn.AnyExitNode]. // // However, to maintain forward compatibility with future auto exit node expressions, @@ -2295,6 +2307,8 @@ func (b *LocalBackend) setWgengineStatus(s *wgengine.Status, err error) { // // b.mu must be held. func (b *LocalBackend) setWgengineStatusLocked(s *wgengine.Status) { + syncs.RequiresMutex(&b.mu) + es := b.parseWgStatusLocked(s) cc := b.cc @@ -4312,6 +4326,7 @@ func (b *LocalBackend) EditPrefsAs(mp *ipn.MaskedPrefs, actor ipnauth.Actor) (ip // // b.mu must be held. func (b *LocalBackend) checkEditPrefsAccessLocked(actor ipnauth.Actor, prefs ipn.PrefsView, mp *ipn.MaskedPrefs) error { + syncs.RequiresMutex(&b.mu) var errs []error if mp.RunSSHSet && mp.RunSSH && !envknob.CanSSHD() { @@ -4362,6 +4377,7 @@ func (b *LocalBackend) checkEditPrefsAccessLocked(actor ipnauth.Actor, prefs ipn // // b.mu must be held. 
func (b *LocalBackend) changeDisablesExitNodeLocked(prefs ipn.PrefsView, change *ipn.MaskedPrefs) bool { + syncs.RequiresMutex(&b.mu) if !buildfeatures.HasUseExitNode { return false } @@ -4403,6 +4419,7 @@ func (b *LocalBackend) changeDisablesExitNodeLocked(prefs ipn.PrefsView, change // // b.mu must be held. func (b *LocalBackend) adjustEditPrefsLocked(prefs ipn.PrefsView, mp *ipn.MaskedPrefs) { + syncs.RequiresMutex(&b.mu) // Zeroing the ExitNodeID via localAPI must also zero the prior exit node. if mp.ExitNodeIDSet && mp.ExitNodeID == "" && !mp.InternalExitNodePriorSet { mp.InternalExitNodePrior = "" @@ -4480,6 +4497,7 @@ func (b *LocalBackend) onEditPrefsLocked(_ ipnauth.Actor, mp *ipn.MaskedPrefs, o // startReconnectTimerLocked sets a timer to automatically set WantRunning to true // after the specified duration. func (b *LocalBackend) startReconnectTimerLocked(d time.Duration) { + syncs.RequiresMutex(&b.mu) if b.reconnectTimer != nil { // Stop may return false if the timer has already fired, // and the function has been called in its own goroutine, @@ -4522,11 +4540,13 @@ func (b *LocalBackend) startReconnectTimerLocked(d time.Duration) { } func (b *LocalBackend) resetAlwaysOnOverrideLocked() { + syncs.RequiresMutex(&b.mu) b.overrideAlwaysOn = false b.stopReconnectTimerLocked() } func (b *LocalBackend) stopReconnectTimerLocked() { + syncs.RequiresMutex(&b.mu) if b.reconnectTimer != nil { // Stop may return false if the timer has already fired, // and the function has been called in its own goroutine, @@ -4542,6 +4562,7 @@ func (b *LocalBackend) stopReconnectTimerLocked() { // b.mu must be held. func (b *LocalBackend) editPrefsLocked(actor ipnauth.Actor, mp *ipn.MaskedPrefs) (ipn.PrefsView, error) { + syncs.RequiresMutex(&b.mu) p0 := b.pm.CurrentPrefs() // Check if the changes in mp are allowed. @@ -5660,6 +5681,7 @@ func (b *LocalBackend) enterStateLocked(newState ipn.State) { } func (b *LocalBackend) hasNodeKeyLocked() bool { + syncs.RequiresMutex(&b.mu) // we can't use b.Prefs(), because it strips the keys, oops! p := b.pm.CurrentPrefs() return p.Valid() && p.Persist().Valid() && !p.Persist().PrivateNodeKey().IsZero() @@ -5680,9 +5702,11 @@ func (b *LocalBackend) NodeKey() key.NodePublic { // // b.mu must be held func (b *LocalBackend) nextStateLocked() ipn.State { + syncs.RequiresMutex(&b.mu) if b.health.IsUnhealthy(ipn.StateStoreHealth) { return ipn.NoState } + var ( cc = b.cc cn = b.currentNode() @@ -5758,6 +5782,8 @@ func (b *LocalBackend) nextStateLocked() ipn.State { // // requires b.mu to be held. func (b *LocalBackend) stateMachineLocked() { + syncs.RequiresMutex(&b.mu) + b.enterStateLocked(b.nextStateLocked()) } @@ -5767,6 +5793,7 @@ func (b *LocalBackend) stateMachineLocked() { // // b.mu must be held. func (b *LocalBackend) stopEngineAndWaitLocked() { + syncs.RequiresMutex(&b.mu) b.logf("stopEngineAndWait...") st, _ := b.e.ResetAndStop() // TODO: what should we do if this returns an error? b.setWgengineStatusLocked(st) @@ -5787,6 +5814,7 @@ func (b *LocalBackend) setControlClientLocked(cc controlclient.Client) { // returned value is non-nil, the caller must call Shutdown on it after // releasing b.mu. func (b *LocalBackend) resetControlClientLocked() controlclient.Client { + syncs.RequiresMutex(&b.mu) if b.cc == nil { return nil } @@ -5813,6 +5841,8 @@ func (b *LocalBackend) resetControlClientLocked() controlclient.Client { // resetAuthURLLocked resets authURL, canceling any pending interactive login. 
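+//
+// b.mu must be held.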
func (b *LocalBackend) resetAuthURLLocked() { + syncs.RequiresMutex(&b.mu) + b.authURL = "" b.authURLTime = time.Time{} b.authActor = nil @@ -5842,6 +5872,8 @@ func (b *LocalBackend) ShouldExposeRemoteWebClient() bool { // // b.mu must be held. func (b *LocalBackend) setWebClientAtomicBoolLocked(nm *netmap.NetworkMap) { + syncs.RequiresMutex(&b.mu) + shouldRun := !nm.HasCap(tailcfg.NodeAttrDisableWebClient) wasRunning := b.webClientAtomicBool.Swap(shouldRun) if wasRunning && !shouldRun { @@ -5854,6 +5886,8 @@ func (b *LocalBackend) setWebClientAtomicBoolLocked(nm *netmap.NetworkMap) { // // b.mu must be held. func (b *LocalBackend) setExposeRemoteWebClientAtomicBoolLocked(prefs ipn.PrefsView) { + syncs.RequiresMutex(&b.mu) + if !buildfeatures.HasWebClient { return } @@ -5982,6 +6016,8 @@ func (b *LocalBackend) RefreshExitNode() { // refreshExitNodeLocked is like RefreshExitNode but requires b.mu be held. func (b *LocalBackend) refreshExitNodeLocked() { + syncs.RequiresMutex(&b.mu) + if b.resolveExitNodeLocked() { b.authReconfigLocked() } @@ -5997,6 +6033,8 @@ func (b *LocalBackend) refreshExitNodeLocked() { // // b.mu must be held. func (b *LocalBackend) resolveExitNodeLocked() (changed bool) { + syncs.RequiresMutex(&b.mu) + if !buildfeatures.HasUseExitNode { return false } @@ -6058,6 +6096,7 @@ func (b *LocalBackend) reconcilePrefsLocked(prefs *ipn.Prefs) (changed bool) { // // b.mu must be held. func (b *LocalBackend) resolveExitNodeInPrefsLocked(prefs *ipn.Prefs) (changed bool) { + syncs.RequiresMutex(&b.mu) if !buildfeatures.HasUseExitNode { return false } diff --git a/syncs/mutex.go b/syncs/mutex.go index e61d1d1ab..8034e1712 100644 --- a/syncs/mutex.go +++ b/syncs/mutex.go @@ -16,3 +16,8 @@ type Mutex = sync.Mutex // // It's only not a sync.RWMutex when built with the ts_mutex_debug build tag. type RWMutex = sync.RWMutex + +// RequiresMutex declares the caller assumes it has the given +// mutex held. In non-debug builds, it's a no-op and compiles to +// nothing. +func RequiresMutex(mu *sync.Mutex) {} diff --git a/syncs/mutex_debug.go b/syncs/mutex_debug.go index 14b52ffe3..55a9b1231 100644 --- a/syncs/mutex_debug.go +++ b/syncs/mutex_debug.go @@ -15,4 +15,8 @@ type RWMutex struct { sync.RWMutex } +func RequiresMutex(mu *sync.Mutex) { + // TODO: check +} + // TODO(bradfitz): actually track stuff when in debug mode. From 9cc07bf9c0ba448792818b84b53cdf55137977bb Mon Sep 17 00:00:00 2001 From: Mario Minardi Date: Wed, 26 Nov 2025 16:55:38 -0700 Subject: [PATCH 33/33] .github/workflows: skip draft PRs for request review workflows Skip the "request review" workflows for PRs that are in draft to reduce noise / skip adding reviewers to PRs that are intentionally marked as not ready to review. Updates #cleanup Signed-off-by: Mario Minardi --- .github/workflows/request-dataplane-review.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/request-dataplane-review.yml b/.github/workflows/request-dataplane-review.yml index 7ae5668c3..58f6d3d0b 100644 --- a/.github/workflows/request-dataplane-review.yml +++ b/.github/workflows/request-dataplane-review.yml @@ -2,6 +2,7 @@ name: request-dataplane-review on: pull_request: + types: [ opened, synchronize, reopened, ready_for_review ] paths: - ".github/workflows/request-dataplane-review.yml" - "**/*derp*" @@ -10,6 +11,7 @@ on: jobs: request-dataplane-review: + if: github.event.pull_request.draft == false name: Request Dataplane Review runs-on: ubuntu-latest steps: