logtail: be less aggressive about re-uploads (#8117)

The retry logic was pathological in the following ways:

* If we restarted the logging service, any pending uploads
would be placed in a retry loop driven by backoff.Backoff,
which was too aggressive: it would retry failures within
milliseconds, taking at least 10 retries to reach a delay of
1 second (see the sketch after this list).

* If a logstream was rate limited, the aggressive retry logic
would severely exacerbate the problem, since each retry also
logged an error message. It was only by chance that the rate of
error-log spam did not itself exceed the rate limit.
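
To illustrate the first point, here is a minimal, self-contained Go
sketch of a doubling backoff that starts near a millisecond. It only
illustrates the growth rate; it is not the actual backoff.Backoff code:

    package main

    import (
        "fmt"
        "time"
    )

    func main() {
        delay := time.Millisecond
        for try := 1; delay < time.Second; try++ {
            fmt.Printf("retry %2d after %v\n", try, delay)
            delay *= 2 // 1ms, 2ms, 4ms, ... only reaches 1s after about 10 retries
        }
    }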

We modify the retry logic in the following ways:

* We now respect the "Retry-After" header sent by the logging service.

* Lacking a "Retry-After" header, we retry after a randomly chosen
period of 30 to 60 seconds (sketched after this list). This avoids
the thundering-herd effect of all nodes trying to reconnect to the
logging service at the same time after a restart.

* We no longer treat a status 400 response as meaning the logs were
uploaded. That is simply not how the logging service behaves.
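
The first two rules can be condensed into a short sketch. The
nextRetryDelay helper below is hypothetical and written only for
illustration; it assumes the "Retry-After" value carries whole
seconds, and the real change is in the diff that follows:

    package main

    import (
        "fmt"
        mrand "math/rand"
        "net/http"
        "strconv"
        "time"
    )

    // nextRetryDelay honors the server's Retry-After header when present;
    // otherwise it picks a random 30-60s delay so that nodes restarting
    // together do not all reconnect at the same instant.
    func nextRetryDelay(resp *http.Response) time.Duration {
        if n, _ := strconv.Atoi(resp.Header.Get("Retry-After")); n > 0 {
            return time.Duration(n) * time.Second
        }
        return time.Duration(30+mrand.Intn(30)) * time.Second
    }

    func main() {
        resp := &http.Response{Header: http.Header{"Retry-After": []string{"10"}}}
        fmt.Println(nextRetryDelay(resp)) // 10s
        resp.Header.Del("Retry-After")
        fmt.Println(nextRetryDelay(resp)) // somewhere between 30s and 59s
    }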

Updates #tailscale/corp#11213

Signed-off-by: Joe Tsai <joetsai@digital-static.net>

@@ -283,7 +283,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de
       tailscale.com/tka from tailscale.com/ipn/ipnlocal+
 W     tailscale.com/tsconst from tailscale.com/net/interfaces
       tailscale.com/tsd from tailscale.com/cmd/tailscaled+
-      tailscale.com/tstime from tailscale.com/wgengine/magicsock
+      tailscale.com/tstime from tailscale.com/wgengine/magicsock+
 💣    tailscale.com/tstime/mono from tailscale.com/net/tstun+
       tailscale.com/tstime/rate from tailscale.com/wgengine/filter+
       tailscale.com/tsweb/varz from tailscale.com/cmd/tailscaled

@@ -13,19 +13,19 @@ import (
     "fmt"
     "io"
     "log"
+    mrand "math/rand"
     "net/http"
     "os"
     "strconv"
-    "strings"
     "sync"
     "sync/atomic"
     "time"

     "tailscale.com/envknob"
-    "tailscale.com/logtail/backoff"
     "tailscale.com/net/interfaces"
     "tailscale.com/net/netmon"
     "tailscale.com/net/sockstats"
+    "tailscale.com/tstime"
     tslogger "tailscale.com/types/logger"
     "tailscale.com/types/logid"
     "tailscale.com/util/set"
@@ -128,9 +128,6 @@ func NewLogger(cfg Config, logf tslogger.Logf) *Logger {
         cfg.FlushDelayFn = func() time.Duration { return 0 }
     }
-    stdLogf := func(f string, a ...any) {
-        fmt.Fprintf(cfg.Stderr, strings.TrimSuffix(f, "\n")+"\n", a...)
-    }

     var urlSuffix string
     if !cfg.CopyPrivateID.IsZero() {
         urlSuffix = "?copyId=" + cfg.CopyPrivateID.String()
@@ -148,7 +145,6 @@ func NewLogger(cfg Config, logf tslogger.Logf) *Logger {
         sentinel:       make(chan int32, 16),
         flushDelayFn:   cfg.FlushDelayFn,
         timeNow:        cfg.TimeNow,
-        bo:             backoff.NewBackoff("logtail", stdLogf, 30*time.Second),
         metricsDelta:   cfg.MetricsDelta,

         sockstatsLabel: sockstats.LabelLogtailLogger,
@@ -186,7 +182,6 @@ type Logger struct {
     flushPending   atomic.Bool
     sentinel       chan int32
     timeNow        func() time.Time
-    bo             *backoff.Backoff
     zstdEncoder    Encoder
     uploadCancel   func()
     explainedRaw   bool
@@ -373,23 +368,38 @@ func (l *Logger) uploading(ctx context.Context) {
         }
     }

-    for len(body) > 0 {
-        select {
-        case <-ctx.Done():
-            return
-        default:
-        }
-        uploaded, err := l.upload(ctx, body, origlen)
+    var lastError string
+    var numFailures int
+    var firstFailure time.Time
+    for len(body) > 0 && ctx.Err() == nil {
+        retryAfter, err := l.upload(ctx, body, origlen)
         if err != nil {
+            numFailures++
+            firstFailure = time.Now()
+
             if !l.internetUp() {
                 fmt.Fprintf(l.stderr, "logtail: internet down; waiting\n")
                 l.awaitInternetUp(ctx)
                 continue
             }
-            fmt.Fprintf(l.stderr, "logtail: upload: %v\n", err)
-        }
-        l.bo.BackOff(ctx, err)
-        if uploaded {
+
+            // Only print the same message once.
+            if currError := err.Error(); lastError != currError {
+                fmt.Fprintf(l.stderr, "logtail: upload: %v\n", err)
+                lastError = currError
+            }
+
+            // Sleep for the specified retryAfter period,
+            // otherwise default to some random value.
+            if retryAfter <= 0 {
+                retryAfter = time.Duration(30+mrand.Intn(30)) * time.Second
+            }
+            tstime.Sleep(ctx, retryAfter)
+        } else {
+            // Only print a success message after recovery.
+            if numFailures > 0 {
+                fmt.Fprintf(l.stderr, "logtail: upload succeeded after %d failures and %s\n", numFailures, time.Since(firstFailure).Round(time.Second))
+            }
             break
         }
     }
@@ -433,7 +443,7 @@ func (l *Logger) awaitInternetUp(ctx context.Context) {
 // upload uploads body to the log server.
 // origlen indicates the pre-compression body length.
 // origlen of -1 indicates that the body is not compressed.
-func (l *Logger) upload(ctx context.Context, body []byte, origlen int) (uploaded bool, err error) {
+func (l *Logger) upload(ctx context.Context, body []byte, origlen int) (retryAfter time.Duration, err error) {
     const maxUploadTime = 45 * time.Second
     ctx = sockstats.WithSockStats(ctx, l.sockstatsLabel, l.Logf)
     ctx, cancel := context.WithTimeout(ctx, maxUploadTime)
@@ -460,17 +470,16 @@ func (l *Logger) upload(ctx context.Context, body []byte, origlen int) (uploaded
     l.httpDoCalls.Add(1)
     resp, err := l.httpc.Do(req)
     if err != nil {
-        return false, fmt.Errorf("log upload of %d bytes %s failed: %v", len(body), compressedNote, err)
+        return 0, fmt.Errorf("log upload of %d bytes %s failed: %v", len(body), compressedNote, err)
     }
     defer resp.Body.Close()

-    if resp.StatusCode != 200 {
-        uploaded = resp.StatusCode == 400 // the server saved the logs anyway
-        b, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
-        return uploaded, fmt.Errorf("log upload of %d bytes %s failed %d: %q", len(body), compressedNote, resp.StatusCode, b)
+    if resp.StatusCode != http.StatusOK {
+        n, _ := strconv.Atoi(resp.Header.Get("Retry-After"))
+        b, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<10))
+        return time.Duration(n) * time.Second, fmt.Errorf("log upload of %d bytes %s failed %d: %s", len(body), compressedNote, resp.StatusCode, bytes.TrimSpace(b))
     }
-
-    return true, nil
+    return 0, nil
 }

 // Flush uploads all logs to the server. It blocks until complete or there is an
