tstest, tstest/integration, github/workflows: shard integration tests

Over four jobs for now.

Updates #cleanup

Change-Id: Ic2b1a739a454916893945a3f9efc480d6fcbd70b
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
2 years ago · 4dec0c6eb9
parent e6ab7d3c14
commit 4dec0c6eb9
4 changed files with 58 additions and 0 deletions
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -41,6 +41,14 @@ concurrency:
 jobs:
  race-root-integration:
    runs-on: ubuntu-22.04
+    strategy:
+      fail-fast: false # don't abort the entire matrix if one element fails
+      matrix:
+        include:
+          - shard: '1/4'
+          - shard: '2/4'
+          - shard: '3/4'
+          - shard: '4/4'
    steps:
    - name: checkout
      uses: actions/checkout@v4
@@ -48,6 +56,8 @@ jobs:
      run: ./tool/go build -o /tmp/testwrapper ./cmd/testwrapper
    - name: integration tests as root
      run: PATH=$PWD/tool:$PATH /tmp/testwrapper -exec "sudo -E" -race ./tstest/integration/
+      env:
+        TS_TEST_SHARD: ${{ matrix.shard }}

  test:
    strategy:
--- a/cmd/testwrapper/testwrapper.go
+++ b/cmd/testwrapper/testwrapper.go
@@ -85,6 +85,9 @@ func runTests(ctx context.Context, attempt int, pt *packageTests, goTestArgs, te
 		fmt.Println("running", strings.Join(args, " "))
 	}
 	cmd := exec.CommandContext(ctx, "go", args...)
+	if len(pt.Tests) > 0 {
+		cmd.Env = append(os.Environ(), "TS_TEST_SHARD=") // clear test shard; run all tests we say to run
+	}
 	r, err := cmd.StdoutPipe()
 	if err != nil {
 		log.Printf("error creating stdout pipe: %v", err)
--- a/tstest/integration/integration_test.go
+++ b/tstest/integration/integration_test.go
@@ -71,6 +71,7 @@ func TestMain(m *testing.M) {
 // Tests that tailscaled starts up in TUN mode, and also without data races:
 // https://github.com/tailscale/tailscale/issues/7894
 func TestTUNMode(t *testing.T) {
+	tstest.Shard(t)
 	if os.Getuid() != 0 {
 		t.Skip("skipping when not root")
 	}
@@ -90,6 +91,7 @@ func TestTUNMode(t *testing.T) {
 }

 func TestOneNodeUpNoAuth(t *testing.T) {
+	tstest.Shard(t)
 	t.Parallel()
 	env := newTestEnv(t)
 	n1 := newTestNode(t, env)
@@ -107,6 +109,7 @@ func TestOneNodeUpNoAuth(t *testing.T) {
 }

 func TestOneNodeExpiredKey(t *testing.T) {
+	tstest.Shard(t)
 	t.Parallel()
 	env := newTestEnv(t)
 	n1 := newTestNode(t, env)
@@ -143,6 +146,7 @@ func TestOneNodeExpiredKey(t *testing.T) {
 }

 func TestControlKnobs(t *testing.T) {
+	tstest.Shard(t)
 	t.Parallel()
 	env := newTestEnv(t)
 	n1 := newTestNode(t, env)
@@ -173,6 +177,7 @@ func TestControlKnobs(t *testing.T) {
 }

 func TestCollectPanic(t *testing.T) {
+	tstest.Shard(t)
 	t.Parallel()
 	env := newTestEnv(t)
 	n := newTestNode(t, env)
@@ -203,6 +208,7 @@ func TestCollectPanic(t *testing.T) {
 }

 func TestControlTimeLogLine(t *testing.T) {
+	tstest.Shard(t)
 	t.Parallel()
 	env := newTestEnv(t)
 	env.LogCatcher.StoreRawJSON()
@@ -226,6 +232,7 @@ func TestControlTimeLogLine(t *testing.T) {

 // test Issue 2321: Start with UpdatePrefs should save prefs to disk
 func TestStateSavedOnStart(t *testing.T) {
+	tstest.Shard(t)
 	t.Parallel()
 	env := newTestEnv(t)
 	n1 := newTestNode(t, env)
@@ -262,6 +269,7 @@ func TestStateSavedOnStart(t *testing.T) {
 }

 func TestOneNodeUpAuth(t *testing.T) {
+	tstest.Shard(t)
 	t.Parallel()
 	env := newTestEnv(t, configureControl(func(control *testcontrol.Server) {
 		control.RequireAuth = true
@@ -305,6 +313,7 @@ func TestOneNodeUpAuth(t *testing.T) {
 }

 func TestTwoNodes(t *testing.T) {
+	tstest.Shard(t)
 	flakytest.Mark(t, "https://github.com/tailscale/tailscale/issues/3598")
 	t.Parallel()
 	env := newTestEnv(t)
@@ -354,6 +363,7 @@ func TestTwoNodes(t *testing.T) {
 // tests two nodes where the first gets a incremental MapResponse (with only
 // PeersRemoved set) saying that the second node disappeared.
 func TestIncrementalMapUpdatePeersRemoved(t *testing.T) {
+	tstest.Shard(t)
 	flakytest.Mark(t, "https://github.com/tailscale/tailscale/issues/3598")
 	t.Parallel()
 	env := newTestEnv(t)
@@ -438,6 +448,7 @@ func TestIncrementalMapUpdatePeersRemoved(t *testing.T) {
 }

 func TestNodeAddressIPFields(t *testing.T) {
+	tstest.Shard(t)
 	flakytest.Mark(t, "https://github.com/tailscale/tailscale/issues/7008")
 	t.Parallel()
 	env := newTestEnv(t)
@@ -465,6 +476,7 @@ func TestNodeAddressIPFields(t *testing.T) {
 }

 func TestAddPingRequest(t *testing.T) {
+	tstest.Shard(t)
 	t.Parallel()
 	env := newTestEnv(t)
 	n1 := newTestNode(t, env)
@@ -517,6 +529,7 @@ func TestAddPingRequest(t *testing.T) {
 }

 func TestC2NPingRequest(t *testing.T) {
+	tstest.Shard(t)
 	t.Parallel()
 	env := newTestEnv(t)
 	n1 := newTestNode(t, env)
@@ -587,6 +600,7 @@ func TestC2NPingRequest(t *testing.T) {
 // Issue 2434: when "down" (WantRunning false), tailscaled shouldn't
 // be connected to control.
 func TestNoControlConnWhenDown(t *testing.T) {
+	tstest.Shard(t)
 	t.Parallel()
 	env := newTestEnv(t)
 	n1 := newTestNode(t, env)
@@ -628,6 +642,7 @@ func TestNoControlConnWhenDown(t *testing.T) {
 // Issue 2137: make sure Windows tailscaled works with the CLI alone,
 // without the GUI to kick off a Start.
 func TestOneNodeUpWindowsStyle(t *testing.T) {
+	tstest.Shard(t)
 	t.Parallel()
 	env := newTestEnv(t)
 	n1 := newTestNode(t, env)
@@ -646,6 +661,7 @@ func TestOneNodeUpWindowsStyle(t *testing.T) {
 // TestNATPing creates two nodes, n1 and n2, sets up masquerades for both and
 // tries to do bi-directional pings between them.
 func TestNATPing(t *testing.T) {
+	tstest.Shard(t)
 	t.Parallel()
 	for _, v6 := range []bool{false, true} {
 		env := newTestEnv(t)
@@ -773,6 +789,7 @@ func TestNATPing(t *testing.T) {
 }

 func TestLogoutRemovesAllPeers(t *testing.T) {
+	tstest.Shard(t)
 	t.Parallel()
 	env := newTestEnv(t)
 	// Spin up some nodes.
--- a/tstest/tstest.go
+++ b/tstest/tstest.go
@@ -6,6 +6,10 @@ package tstest

 import (
 	"context"
+	"os"
+	"strconv"
+	"strings"
+	"sync/atomic"
 	"testing"
 	"time"

@@ -46,3 +50,27 @@ func WaitFor(maxWait time.Duration, try func() error) error {
 	}
 	return err
 }
+
+var testNum atomic.Int32 // count of Shard calls made in this process while sharding is enabled
+
+// Shard skips t when the TS_TEST_SHARD environment variable is set to "n/m"
+// and this call's zero-based sequence number in the process, mod m, is not n-1.
+// That is, to run with 4 shards, set TS_TEST_SHARD=1/4, ..., TS_TEST_SHARD=4/4
+// for the four jobs. Unset, non-numeric, or zero values disable sharding.
+func Shard(t testing.TB) {
+	e := os.Getenv("TS_TEST_SHARD")
+	a, b, ok := strings.Cut(e, "/")
+	if !ok {
+		return // no "/" in the value (including unset/empty): sharding is off
+	}
+	wantShard, _ := strconv.ParseInt(a, 10, 32) // error ignored: a syntax error yields 0, rejected below
+	shards, _ := strconv.ParseInt(b, 10, 32) // same: a non-numeric m becomes 0
+	if wantShard == 0 || shards == 0 {
+		return
+	}
+
+	shard := ((testNum.Add(1) - 1) % int32(shards)) + 1 // 1-based shard this call lands in, assigned in call order
+	if shard != int32(wantShard) {
+		t.Skipf("skipping shard %d/%d (process has TS_TEST_SHARD=%q)", shard, shards, e)
+	}
+}