#!/usr/bin/env node

/**
 * shard-split.js — Test shard assignment solver
 *
 * Splits Go test packages across N parallel CI runners using timing data
 * from previous runs. Uses a two-tier strategy:
 *
 *   1. "Light" packages (<= HEAVY_MS total runtime): assigned whole to a shard
 *   2. "Heavy" packages (> HEAVY_MS, or listed in KNOWN_HEAVY_PKGS):
 *      individual tests distributed across shards using -run regex filters
 *
 * Timing data sources (in priority order):
 *   - prev-gotestsum.json (JSONL): per-test elapsed times from previous run
 *   - prev-report.xml (JUnit XML): package-level timing (fallback)
 *   - Round-robin: when no timing data exists at all
 *
 * Assignment algorithm: greedy bin-packing (sort by duration desc, assign
 * each item to the shard with lowest current load). Simple and effective
 * for our distribution where 2 packages dominate 84% of runtime.
 *
 * Environment variables:
 *   SHARD_INDEX — this runner's index (0-based, must be < SHARD_TOTAL)
 *   SHARD_TOTAL — total number of shards (>= 1)
 *
 * Input files (in working directory):
 *   all-packages.txt    — newline-separated list of all test packages
 *   prev-gotestsum.json — (optional) JSONL timing data from previous run
 *   prev-report.xml     — (optional) JUnit XML from previous run
 *
 * Output files (in working directory):
 *   shard-te-packages.txt — space-separated TE packages for this shard
 *   shard-ee-packages.txt — space-separated EE packages for this shard
 *   shard-heavy-runs.txt  — heavy package runs, one per line: "pkg REGEX"
 *
 * Exit codes:
 *   0 — assignment written, or all-packages.txt was empty (nothing written)
 *   1 — SHARD_INDEX / SHARD_TOTAL missing or out of range
 */

const fs = require("node:fs");
const { execSync } = require("node:child_process");

// Explicit radix: shard numbers are always decimal.
const SHARD_INDEX = Number.parseInt(process.env.SHARD_INDEX, 10);
const SHARD_TOTAL = Number.parseInt(process.env.SHARD_TOTAL, 10);
const HEAVY_MS = 600000; // 600s (10 min): packages above this get test-level splitting

// Packages that should always be split test-by-test, even on a cold cache.
// Without timing data the splitter falls through to alphabetical round-robin,
// which places these adjacent on the same runner and overwhelms postgres.
// Forcing them heavy lets `go test -list` enumerate their tests so the
// bin-packer can spread them across all shards.
const KNOWN_HEAVY_PKGS = new Set([
  "github.com/mattermost/mattermost/server/v8/channels/api4",
  "github.com/mattermost/mattermost/server/v8/channels/app",
]);

if (
  Number.isNaN(SHARD_INDEX) ||
  Number.isNaN(SHARD_TOTAL) ||
  SHARD_TOTAL < 1
) {
  console.error("ERROR: SHARD_INDEX and SHARD_TOTAL must be set");
  process.exit(1);
}

// Fail fast on an out-of-range index: otherwise shards[SHARD_INDEX] is
// undefined and the script dies with an opaque TypeError only after all the
// parsing and discovery work has already been done.
if (SHARD_INDEX < 0 || SHARD_INDEX >= SHARD_TOTAL) {
  console.error(
    `ERROR: SHARD_INDEX (${SHARD_INDEX}) must be between 0 and ${SHARD_TOTAL - 1}`,
  );
  process.exit(1);
}

// Trim every line individually so CRLF files don't leave "\r" on package names.
const allPkgs = fs
  .readFileSync("all-packages.txt", "utf8")
  .split("\n")
  .map((line) => line.trim())
  .filter(Boolean);

if (allPkgs.length === 0) {
  console.error("WARNING: No test packages found in all-packages.txt");
  process.exit(0);
}

const pkgTimes = {}; // package import path -> ms
const testTimes = {}; // "pkg::TestName" -> ms

// ── Parse gotestsum.json (JSONL) for per-test timing ──
// Each line is a JSON event; we want "pass" events with Elapsed times.
if (fs.existsSync("prev-gotestsum.json")) {
  console.log("::group::Parsing gotestsum.json timing data");
  const lines = fs.readFileSync("prev-gotestsum.json", "utf8").split("\n");
  for (const line of lines) {
    // Cheap substring pre-filter so most lines are never JSON-parsed.
    if (!line.includes('"pass"')) continue;
    try {
      const d = JSON.parse(line);
      // Events without a Test field are package-level; only test events count.
      if (!d.Test || !d.Package) continue;
      const elapsed = Math.round((d.Elapsed || 0) * 1000);

      // Aggregate package time from test pass events
      pkgTimes[d.Package] = (pkgTimes[d.Package] || 0) + elapsed;

      // Top-level test name (use max elapsed for parent vs subtests)
      const top = d.Test.split("/")[0];
      const key = d.Package + "::" + top;
      testTimes[key] = Math.max(testTimes[key] || 0, elapsed);
    } catch {
      // Skip malformed lines
    }
  }
  console.log(
    `gotestsum.json: ${Object.keys(pkgTimes).length} packages, ${Object.keys(testTimes).length} tests`,
  );
  console.log("::endgroup::");
}

// ── Fallback: parse JUnit XML for package-level timing ──
if (Object.keys(pkgTimes).length === 0 && fs.existsSync("prev-report.xml")) {
  console.log("::group::Parsing JUnit XML timing data (fallback)");
  const xml = fs.readFileSync("prev-report.xml", "utf8");
  // Match each <testsuite ...> opening tag. The \b keeps the enclosing
  // <testsuites> wrapper from matching.
  for (const m of xml.matchAll(/<testsuite\b[^>]*>/g)) {
    // \b so that e.g. hostname="..." cannot be mistaken for name="...".
    const name = m[0].match(/\bname="([^"]+)"/)?.[1];
    const time = m[0].match(/\btime="([^"]+)"/)?.[1];
    if (name && time) {
      const ms = Math.round(parseFloat(time) * 1000);
      // A non-numeric time attribute would inject NaN into the sort and the
      // shard loads, so ignore it.
      if (Number.isFinite(ms)) {
        pkgTimes[name] = (pkgTimes[name] || 0) + ms;
      }
    }
  }
  console.log(
    `JUnit XML: ${Object.keys(pkgTimes).length} packages (no per-test data)`,
  );
  console.log("::endgroup::");
}

const hasTimingData = Object.keys(pkgTimes).length > 0;
const hasTestTiming = Object.keys(testTimes).length > 0;

// ── Identify heavy packages ──
// Split at test level for packages above HEAVY_MS (requires per-test timing)
// AND for the KNOWN_HEAVY_PKGS list (which uses go test -list discovery
// to enumerate tests when no timing cache exists).
//
// Both checks gate on allPkgs membership so stale entries from the cached
// pkgTimes (renamed/deleted packages from a prior run) can't end up in
// heavyPkgs — otherwise the post-discovery fallback would emit them as
// whole-package items for nonexistent packages.
const allPkgsSet = new Set(allPkgs);
const heavyPkgs = new Set();
if (hasTestTiming) {
  for (const [pkg, ms] of Object.entries(pkgTimes)) {
    if (ms > HEAVY_MS && allPkgsSet.has(pkg)) heavyPkgs.add(pkg);
  }
}
for (const pkg of allPkgs) {
  if (KNOWN_HEAVY_PKGS.has(pkg)) heavyPkgs.add(pkg);
}

if (heavyPkgs.size > 0) {
  console.log("Heavy packages (test-level splitting):");
  for (const p of heavyPkgs) {
    const t = pkgTimes[p];
    const label = t ? `${(t / 1000).toFixed(0)}s` : "no-timing";
    console.log(`  ${label} ${p.split("/").pop()}`);
  }
}

// ── Build work items ──
// Each item is either a whole package ("P") or a single test from a heavy package ("T")
const items = [];
for (const pkg of allPkgs) {
  if (heavyPkgs.has(pkg)) {
    // Split into individual test items
    const tests = Object.entries(testTimes)
      .filter(([k]) => k.startsWith(pkg + "::"))
      .map(([k, ms]) => ({ ms, type: "T", pkg, test: k.split("::")[1] }));
    if (tests.length > 0) {
      items.push(...tests);
    }
    // If no per-test timing exists, the discovery step below enumerates
    // tests via `go test -list`. A final fallback to whole-package is
    // added after discovery for packages where both lookups failed.
  } else {
    // Unknown duration defaults to 1ms so the package is still assigned.
    items.push({ ms: pkgTimes[pkg] || 1, type: "P", pkg });
  }
}

// ── Discover new/renamed tests in heavy packages ──
// Tests not in the timing cache won't appear in any shard's -run regex,
// silently skipping them. Discover current test names at runtime and
// assign any cache-missing tests to the least-loaded shard.
if (heavyPkgs.size > 0) {
  console.log("::group::Discovering new tests in heavy packages");
  for (const pkg of heavyPkgs) {
    const cachedTests = new Set(
      Object.keys(testTimes)
        .filter((k) => k.startsWith(pkg + "::"))
        .map((k) => k.split("::")[1]),
    );
    try {
      // NOTE(review): pkg is interpolated into a shell command line; this
      // relies on all-packages.txt containing only Go import paths.
      const out = execSync(`go test -list '.*' ${pkg} 2>/dev/null`, {
        encoding: "utf8",
        timeout: 300000,
      });
      // Keep only test function names. Go accepts "Test" alone or "Test"
      // followed by any non-lowercase character, so Test_foo and Test1 are
      // real tests too. Benchmark*/Example*/Fuzz* entries and the trailing
      // "ok ..." line are dropped.
      const currentTests = out
        .split("\n")
        .map((l) => l.trim())
        .filter((l) => /^Test(?:$|[A-Z0-9_])/.test(l));

      let newCount = 0;
      for (const t of currentTests) {
        if (!cachedTests.has(t)) {
          // Assign a small default duration so it gets picked up
          items.push({ ms: 1000, type: "T", pkg, test: t });
          newCount++;
        }
      }
      if (newCount > 0) {
        console.log(
          `  ${pkg.split("/").pop()}: ${newCount} new test(s) not in cache`,
        );
      }
    } catch (e) {
      // go test -list can fail for packages whose TestMain requires a DB
      // connection (e.g. sqlstore) because the GitHub runner cannot reach the
      // docker-compose postgres network. Log a warning and fall back to
      // treating the package as a whole unit rather than failing all shards.
      console.error(
        `::warning::${pkg.split("/").pop()}: go test -list failed — treating as whole package (new tests may be skipped this run). ${e.message}`,
      );
    }
  }

  // Ensure every heavy package has at least one item. A package can reach
  // this point with zero items if it has no per-test timing AND `go test
  // -list` failed (e.g. sqlstore on a cold cache).
  for (const pkg of heavyPkgs) {
    const hasItems = items.some((it) => it.pkg === pkg);
    if (!hasItems) {
      console.log(
        `  ${pkg.split("/").pop()}: no per-test data, running as whole package`,
      );
      items.push({ ms: pkgTimes[pkg] || 1, type: "P", pkg });
    }
  }
  console.log("::endgroup::");
}

// Sort descending by duration for greedy bin-packing
items.sort((a, b) => b.ms - a.ms);

// ── Greedy bin-packing assignment ──
// Per shard: accumulated load in ms, whole packages, and for each heavy
// package the list of test names assigned to this shard.
const shards = Array.from({ length: SHARD_TOTAL }, () => ({
  load: 0,
  whole: [],
  heavy: {},
}));

if (!hasTimingData && heavyPkgs.size === 0) {
  // Round-robin fallback only when we have *no* signal — no timing cache
  // and no known-heavy packages to test-level-split. With heavyPkgs we
  // can still bin-pack: discovered tests (ms=1000 each) drive the
  // distribution and whole-package items (ms=1) fill in evenly.
  console.log("No timing data — using round-robin");
  allPkgs.forEach((pkg, i) => {
    shards[i % SHARD_TOTAL].whole.push(pkg);
  });
} else {
  for (const item of items) {
    // Find shard with minimum current load (ties go to the lowest index)
    const min = shards.reduce(
      (m, s, i) => (s.load < shards[m].load ? i : m),
      0,
    );
    shards[min].load += item.ms;
    if (item.type === "P") {
      shards[min].whole.push(item.pkg);
    } else {
      if (!shards[min].heavy[item.pkg]) shards[min].heavy[item.pkg] = [];
      shards[min].heavy[item.pkg].push(item.test);
    }
  }
}

// ── Report shard assignments ──
console.log("::group::Shard assignment");
for (let i = 0; i < SHARD_TOTAL; i++) {
  const s = shards[i];
  const hRuns = Object.keys(s.heavy).length;
  const hTests = Object.values(s.heavy).reduce((n, a) => n + a.length, 0);
  const marker = i === SHARD_INDEX ? " ← THIS SHARD" : "";
  console.log(
    `Shard ${i}: ${(s.load / 1000).toFixed(1)}s | ${s.whole.length} pkgs` +
      (hRuns > 0 ? `, ${hRuns} heavy splits (${hTests} tests)` : "") +
      marker,
  );
}
console.log("::endgroup::");

// ── Write output for this shard ──
const myShard = shards[SHARD_INDEX];

// Whole packages are split into two files by whether the import path
// contains "/enterprise/" (EE) or not (TE).
const te = myShard.whole.filter((p) => !p.includes("/enterprise/")).join(" ");
const ee = myShard.whole.filter((p) => p.includes("/enterprise/")).join(" ");
fs.writeFileSync("shard-te-packages.txt", te);
fs.writeFileSync("shard-ee-packages.txt", ee);

// Heavy package runs: one line per run as "pkg REGEX"
// REGEX is an alternation of fully anchored test names: ^TestA$|^TestB$
const heavyRuns = Object.entries(myShard.heavy).map(([pkg, tests]) => {
  const regex = tests.map((t) => `^${t}$`).join("|");
  return `${pkg} ${regex}`;
});
fs.writeFileSync("shard-heavy-runs.txt", heavyRuns.join("\n"));

console.log(
  `Light packages: ${myShard.whole.length} (${te.split(" ").filter(Boolean).length} TE, ${ee.split(" ").filter(Boolean).length} EE)`,
);
console.log(`Heavy package runs: ${heavyRuns.length}`);