mirror of https://github.com/tailscale/tailscale/
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
321 lines
11 KiB
Go
321 lines
11 KiB
Go
// Copyright (c) Tailscale Inc & AUTHORS
|
|
// SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
// Package metrics provides logging and reporting for policy settings and scopes.
|
|
package metrics
|
|
|
|
import (
|
|
"strings"
|
|
"sync"
|
|
|
|
xmaps "golang.org/x/exp/maps"
|
|
|
|
"tailscale.com/syncs"
|
|
"tailscale.com/types/lazy"
|
|
"tailscale.com/util/clientmetric"
|
|
"tailscale.com/util/mak"
|
|
"tailscale.com/util/slicesx"
|
|
"tailscale.com/util/syspolicy/internal"
|
|
"tailscale.com/util/syspolicy/internal/loggerx"
|
|
"tailscale.com/util/syspolicy/setting"
|
|
"tailscale.com/util/testenv"
|
|
)
|
|
|
|
// lazyReportMetrics holds the answer returned by [ShouldReport]; ShouldReport
// reads it via Get, supplying the function that computes the platform check.
var lazyReportMetrics lazy.SyncValue[bool] // used as a test hook
|
|
|
|
// ShouldReport reports whether metrics should be reported on the current environment.
|
|
func ShouldReport() bool {
|
|
return lazyReportMetrics.Get(func() bool {
|
|
// macOS, iOS and tvOS create their own metrics,
|
|
// and we don't have syspolicy on any other platforms.
|
|
return setting.PlatformList{"android", "windows"}.HasCurrent()
|
|
})
|
|
}
|
|
|
|
// Reset metrics for the specified policy origin.
|
|
func Reset(origin *setting.Origin) {
|
|
scopeMetrics(origin).Reset()
|
|
}
|
|
|
|
// ReportConfigured updates metrics and logs that the specified setting is
|
|
// configured with the given value in the origin.
|
|
func ReportConfigured(origin *setting.Origin, setting *setting.Definition, value any) {
|
|
settingMetricsFor(setting).ReportValue(origin, value)
|
|
}
|
|
|
|
// ReportError updates metrics and logs that the specified setting has an error
|
|
// in the origin.
|
|
func ReportError(origin *setting.Origin, setting *setting.Definition, err error) {
|
|
settingMetricsFor(setting).ReportError(origin, err)
|
|
}
|
|
|
|
// ReportNotConfigured updates metrics and logs that the specified setting is
|
|
// not configured in the origin.
|
|
func ReportNotConfigured(origin *setting.Origin, setting *setting.Definition) {
|
|
settingMetricsFor(setting).Reset(origin)
|
|
}
|
|
|
|
// metric is the minimal metric abstraction used by this package.
// It is implemented by [clientmetric.Metric] and [funcMetric].
type metric interface {
	// Set replaces the metric's value with v.
	Set(v int64)
	// Add adds v to the metric's value.
	Add(v int64)
}
|
|
|
|
// policyScopeMetrics are metrics that apply to an entire policy scope rather
|
|
// than a specific policy setting.
|
|
type policyScopeMetrics struct {
|
|
hasAny metric
|
|
numErrored metric
|
|
}
|
|
|
|
func newScopeMetrics(scope setting.Scope) *policyScopeMetrics {
|
|
prefix := metricScopeName(scope)
|
|
// {os}_syspolicy_{scope_unless_device}_any
|
|
// Example: windows_syspolicy_any or windows_syspolicy_user_any.
|
|
hasAny := newMetric([]string{prefix, "any"}, clientmetric.TypeGauge)
|
|
// {os}_syspolicy_{scope_unless_device}_errors
|
|
// Example: windows_syspolicy_errors or windows_syspolicy_user_errors.
|
|
//
|
|
// TODO(nickkhyl): maybe make the `{os}_syspolicy_errors` metric a gauge rather than a counter?
|
|
// It was a counter prior to https://github.com/tailscale/tailscale/issues/12687, so I kept it as such.
|
|
// But I think a gauge makes more sense: syspolicy errors indicate a mismatch between the expected
|
|
// policy value type or format and the actual value read from the underlying store (like the Windows Registry).
|
|
// We'll encounter the same error every time we re-read the policy setting from the backing store
|
|
// until the policy value is corrected by the user, or until we fix the bug in the code or ADMX.
|
|
// There's probably no reason to count and accumulate them over time.
|
|
//
|
|
// Brief discussion: https://github.com/tailscale/tailscale/pull/13113#discussion_r1723475136
|
|
numErrored := newMetric([]string{prefix, "errors"}, clientmetric.TypeCounter)
|
|
return &policyScopeMetrics{hasAny, numErrored}
|
|
}
|
|
|
|
// ReportHasSettings is called when there's any configured policy setting in the scope.
|
|
func (m *policyScopeMetrics) ReportHasSettings() {
|
|
if m != nil {
|
|
m.hasAny.Set(1)
|
|
}
|
|
}
|
|
|
|
// ReportError is called when there's any errored policy setting in the scope.
|
|
func (m *policyScopeMetrics) ReportError() {
|
|
if m != nil {
|
|
m.numErrored.Add(1)
|
|
}
|
|
}
|
|
|
|
// Reset is called to reset the policy scope metrics, such as when the policy scope
|
|
// is about to be reloaded.
|
|
func (m *policyScopeMetrics) Reset() {
|
|
if m != nil {
|
|
m.hasAny.Set(0)
|
|
// numErrored is a counter and cannot be (re-)set.
|
|
}
|
|
}
|
|
|
|
// settingMetrics are metrics for a single policy setting in one or more scopes.
|
|
type settingMetrics struct {
|
|
definition *setting.Definition
|
|
isSet []metric // by scope
|
|
hasErrors []metric // by scope
|
|
}
|
|
|
|
// ReportValue is called when the policy setting is found to be configured in the specified source.
|
|
func (m *settingMetrics) ReportValue(origin *setting.Origin, v any) {
|
|
if m == nil {
|
|
return
|
|
}
|
|
if scope := origin.Scope().Kind(); scope >= 0 && int(scope) < len(m.isSet) {
|
|
m.isSet[scope].Set(1)
|
|
m.hasErrors[scope].Set(0)
|
|
}
|
|
scopeMetrics(origin).ReportHasSettings()
|
|
loggerx.Verbosef("%v(%q) = %v", origin, m.definition.Key(), v)
|
|
}
|
|
|
|
// ReportError is called when there's an error with the policy setting in the specified source.
|
|
func (m *settingMetrics) ReportError(origin *setting.Origin, err error) {
|
|
if m == nil {
|
|
return
|
|
}
|
|
if scope := origin.Scope().Kind(); int(scope) < len(m.hasErrors) {
|
|
m.isSet[scope].Set(0)
|
|
m.hasErrors[scope].Set(1)
|
|
}
|
|
scopeMetrics(origin).ReportError()
|
|
loggerx.Errorf("%v(%q): %v", origin, m.definition.Key(), err)
|
|
}
|
|
|
|
// Reset is called to reset the policy setting's metrics, such as when
|
|
// the policy setting does not exist or the source containing the policy
|
|
// is about to be reloaded.
|
|
func (m *settingMetrics) Reset(origin *setting.Origin) {
|
|
if m == nil {
|
|
return
|
|
}
|
|
if scope := origin.Scope().Kind(); scope >= 0 && int(scope) < len(m.isSet) {
|
|
m.isSet[scope].Set(0)
|
|
m.hasErrors[scope].Set(0)
|
|
}
|
|
}
|
|
|
|
// metricFn is a function that adds or sets a metric value.
|
|
type metricFn func(name string, typ clientmetric.Type, v int64)
|
|
|
|
// funcMetric implements [metric] by calling the specified add and set functions.
|
|
// Used for testing, and with nil functions on platforms that do not support
|
|
// syspolicy, and on platforms that report policy metrics from the GUI.
|
|
type funcMetric struct {
|
|
name string
|
|
typ clientmetric.Type
|
|
add, set metricFn
|
|
}
|
|
|
|
func (m funcMetric) Add(v int64) {
|
|
if m.add != nil {
|
|
m.add(m.name, m.typ, v)
|
|
}
|
|
}
|
|
|
|
func (m funcMetric) Set(v int64) {
|
|
if m.set != nil {
|
|
m.set(m.name, m.typ, v)
|
|
}
|
|
}
|
|
|
|
var (
|
|
lazyDeviceMetrics lazy.SyncValue[*policyScopeMetrics]
|
|
lazyProfileMetrics lazy.SyncValue[*policyScopeMetrics]
|
|
lazyUserMetrics lazy.SyncValue[*policyScopeMetrics]
|
|
)
|
|
|
|
func scopeMetrics(origin *setting.Origin) *policyScopeMetrics {
|
|
switch origin.Scope().Kind() {
|
|
case setting.DeviceSetting:
|
|
return lazyDeviceMetrics.Get(func() *policyScopeMetrics {
|
|
return newScopeMetrics(setting.DeviceSetting)
|
|
})
|
|
case setting.ProfileSetting:
|
|
return lazyProfileMetrics.Get(func() *policyScopeMetrics {
|
|
return newScopeMetrics(setting.ProfileSetting)
|
|
})
|
|
case setting.UserSetting:
|
|
return lazyUserMetrics.Get(func() *policyScopeMetrics {
|
|
return newScopeMetrics(setting.UserSetting)
|
|
})
|
|
default:
|
|
panic("unreachable")
|
|
}
|
|
}
|
|
|
|
var (
|
|
settingMetricsMu sync.RWMutex
|
|
settingMetricsMap map[setting.Key]*settingMetrics
|
|
)
|
|
|
|
func settingMetricsFor(setting *setting.Definition) *settingMetrics {
|
|
settingMetricsMu.RLock()
|
|
metrics, ok := settingMetricsMap[setting.Key()]
|
|
settingMetricsMu.RUnlock()
|
|
if ok {
|
|
return metrics
|
|
}
|
|
return settingMetricsForSlow(setting)
|
|
}
|
|
|
|
func settingMetricsForSlow(d *setting.Definition) *settingMetrics {
|
|
settingMetricsMu.Lock()
|
|
defer settingMetricsMu.Unlock()
|
|
if metrics, ok := settingMetricsMap[d.Key()]; ok {
|
|
return metrics
|
|
}
|
|
|
|
// The loop below initializes metrics for each scope where a policy setting defined in 'd'
|
|
// can be configured. The [setting.Definition.Scope] returns the narrowest scope at which the policy
|
|
// setting may be configured, and more specific scopes always have higher numeric values.
|
|
// In other words, [setting.UserSetting] > [setting.ProfileScope] > [setting.DeviceScope].
|
|
// It's impossible for a policy setting to be configured in a scope with a higher numeric value than
|
|
// the [setting.Definition.Scope] returns. Therefore, a policy setting can be configured in at
|
|
// most d.Scope()+1 different scopes, and having d.Scope()+1 metrics for the corresponding scopes
|
|
// is always sufficient for [settingMetrics]; it won't access elements past the end of the slice
|
|
// or need to reallocate with a longer slice if one of those arrives.
|
|
isSet := make([]metric, d.Scope()+1)
|
|
hasErrors := make([]metric, d.Scope()+1)
|
|
for i := range isSet {
|
|
scope := setting.Scope(i)
|
|
// {os}_syspolicy_{key}_{scope_unless_device}
|
|
// Example: windows_syspolicy_AdminConsole or windows_syspolicy_AdminConsole_user.
|
|
isSet[i] = newSettingMetric(d.Key(), scope, "", clientmetric.TypeGauge)
|
|
// {os}_syspolicy_{key}_{scope_unless_device}_error
|
|
// Example: windows_syspolicy_AdminConsole_error or windows_syspolicy_TestSetting01_user_error.
|
|
hasErrors[i] = newSettingMetric(d.Key(), scope, "error", clientmetric.TypeGauge)
|
|
}
|
|
metrics := &settingMetrics{d, isSet, hasErrors}
|
|
mak.Set(&settingMetricsMap, d.Key(), metrics)
|
|
return metrics
|
|
}
|
|
|
|
// hooks for testing
|
|
var addMetricTestHook, setMetricTestHook syncs.AtomicValue[metricFn]
|
|
|
|
// SetHooksForTest sets the specified addMetric and setMetric functions
|
|
// as the metric functions for the duration of tb and all its subtests.
|
|
func SetHooksForTest(tb internal.TB, addMetric, setMetric metricFn) {
|
|
oldAddMetric := addMetricTestHook.Swap(addMetric)
|
|
oldSetMetric := setMetricTestHook.Swap(setMetric)
|
|
tb.Cleanup(func() {
|
|
addMetricTestHook.Store(oldAddMetric)
|
|
setMetricTestHook.Store(oldSetMetric)
|
|
})
|
|
|
|
settingMetricsMu.Lock()
|
|
oldSettingMetricsMap := xmaps.Clone(settingMetricsMap)
|
|
clear(settingMetricsMap)
|
|
settingMetricsMu.Unlock()
|
|
tb.Cleanup(func() {
|
|
settingMetricsMu.Lock()
|
|
settingMetricsMap = oldSettingMetricsMap
|
|
settingMetricsMu.Unlock()
|
|
})
|
|
|
|
// (re-)set the scope metrics to use the test hooks for the duration of tb.
|
|
lazyDeviceMetrics.SetForTest(tb, newScopeMetrics(setting.DeviceSetting), nil)
|
|
lazyProfileMetrics.SetForTest(tb, newScopeMetrics(setting.ProfileSetting), nil)
|
|
lazyUserMetrics.SetForTest(tb, newScopeMetrics(setting.UserSetting), nil)
|
|
}
|
|
|
|
func newSettingMetric(key setting.Key, scope setting.Scope, suffix string, typ clientmetric.Type) metric {
|
|
name := strings.ReplaceAll(string(key), string(setting.KeyPathSeparator), "_")
|
|
return newMetric([]string{name, metricScopeName(scope), suffix}, typ)
|
|
}
|
|
|
|
func newMetric(nameParts []string, typ clientmetric.Type) metric {
|
|
name := strings.Join(slicesx.Filter([]string{internal.OS(), "syspolicy"}, nameParts, isNonEmpty), "_")
|
|
switch {
|
|
case !ShouldReport():
|
|
return &funcMetric{name: name, typ: typ}
|
|
case testenv.InTest():
|
|
return &funcMetric{name, typ, addMetricTestHook.Load(), setMetricTestHook.Load()}
|
|
case typ == clientmetric.TypeCounter:
|
|
return clientmetric.NewCounter(name)
|
|
case typ == clientmetric.TypeGauge:
|
|
return clientmetric.NewGauge(name)
|
|
default:
|
|
panic("unreachable")
|
|
}
|
|
}
|
|
|
|
// isNonEmpty reports whether s contains at least one byte.
func isNonEmpty(s string) bool {
	return len(s) > 0
}
|
|
|
|
func metricScopeName(scope setting.Scope) string {
|
|
switch scope {
|
|
case setting.DeviceSetting:
|
|
return ""
|
|
case setting.ProfileSetting:
|
|
return "profile"
|
|
case setting.UserSetting:
|
|
return "user"
|
|
default:
|
|
panic("unreachable")
|
|
}
|
|
}
|