From 564edae200930434055057e3f0585819b8c32f5b Mon Sep 17 00:00:00 2001
From: Adam Hamrick <adam.hamrick@smartcontract.com>
Date: Thu, 28 May 2026 16:15:11 -0400
Subject: [PATCH 1/9] Add benchmarks and optimize diagnose overhead

---
 .gitignore                                   |  5 +-
 AGENTS.md                                    | 12 ++-
 example_test.go                              |  6 --
 internal/runner/analyze.go                   | 58 ++++++++++--
 internal/runner/diagnose_progress.go         | 95 --------------------
 internal/runner/diagnose_progress_test.go    | 55 ------------
 internal/runner/runner.go                    | 83 +++++------------
 internal/runner/runner_bench_test.go         | 69 ++++++++++++++
 internal/runner/testdata/dummy/dummy_test.go |  5 ++
 justfile                                     | 19 +++-
 10 files changed, 176 insertions(+), 231 deletions(-)
 create mode 100644 internal/runner/runner_bench_test.go
 create mode 100644 internal/runner/testdata/dummy/dummy_test.go
diff --git a/.gitignore b/.gitignore
index e04f680..16357a2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,10 +9,7 @@ diagnose-*/
 
 .env
 .venv/
-
-# Local Phoenix DB for evals/eval.py
-evals/.phoenix/
-__pycache__/
+*.test
 
 # Agents
 .antigravitycli/
diff --git a/AGENTS.md b/AGENTS.md
index a862610..edd5deb 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -18,7 +18,6 @@ go test ./...                 # Test
 - `internal/runner/` — core test execution. `Diagnose` is the main entry point; `diagnoseRunHooks` carries iteration hooks as `func(context.Context) error` fields.
 - `internal/config/` — Cobra flag registry config loading. `config.App` is the unified config struct.
 - `internal/output/` — output printer abstraction. `--ai-output` flag controls format.
-- `internal/repo/` — git/module helpers.
 
 ## Critical decisions
 
@@ -27,3 +26,14 @@ go test ./...                 # Test
 **Public API vs CLI are separate layers.** `hooks.go` / `testrig.go` are the library surface. The CLI in `internal/cmd/` wraps that. Don't put CLI logic in the root package.
 
 **Hook catalog drives CLI flags and docs.** Add lifecycle hooks in [`internal/hooks/catalog.go`](internal/hooks/catalog.go) (`Catalog` slice), then wire `RunOptions.Hook`, public `hooks.go` registrar, and `go generate`. Document timing in godoc on registrars in [`hooks.go`](hooks.go) as `// Name registers a hook to <when>.` `hooks.RegisterPersistentFlags` registers Cobra flags from the catalog.
+
+## Diagnose overhead benchmark
+
+`internal/runner/runner_bench_test.go` defines a side-by-side pair against `internal/runner/testdata/dummy/...`:
+
+- `BenchmarkBaselineGoTest` — raw `go test -json` (the floor).
+- `BenchmarkDiagnose` — one `Diagnose` iteration with `parallel=1`.
+
+The overhead `Diagnose` adds is the **delta** in `ns/op`, `B/op`, and `allocs/op` between the two. Run with `just bench_overhead` (uses `-count=3` for variance). For a memory profile of the diagnose pipeline, `just bench_overhead_profile` writes `out/bench/diagnose_mem_runner.txt` (look for `scanIterationJSONL`, `applyTestEvent`, `buildReportFromAggs`). CPU profile is low-signal — parent blocks on the child subprocess.
+
+Drill down into the parent-only pipeline (no subprocess) via the micro-benches in `internal/runner/analyze_bench_test.go`.
diff --git a/example_test.go b/example_test.go
index cff11f4..a69ae6d 100644
--- a/example_test.go
+++ b/example_test.go
@@ -12,18 +12,12 @@ func ExampleRun() {
 		// GlobalSetup runs once before any tests start.
 		testrig.GlobalSetup(func(_ context.Context) error {
 			fmt.Println("Starting mock background service...")
-			// Simulate starting a dependency, e.g.:
-			// cmd := exec.CommandContext(ctx, "docker", "compose", "up", "-d")
-			// return cmd.Run()
 			return nil
 		}),
 
 		// IterationSetup runs before each diagnose iteration.
 		testrig.IterationSetup(func(_ context.Context) error {
 			fmt.Println("Resetting database state for next iteration...")
-			// Simulate resetting state:
-			// cmd := exec.CommandContext(ctx, "psql", "-c", "TRUNCATE events")
-			// return cmd.Run()
 			return nil
 		}),
 
diff --git a/internal/runner/analyze.go b/internal/runner/analyze.go
index d6a92f3..40a2c66 100644
--- a/internal/runner/analyze.go
+++ b/internal/runner/analyze.go
@@ -14,6 +14,7 @@ import (
 	"sort"
 	"strconv"
 	"strings"
+	"sync"
 	"time"
 
 	"charm.land/lipgloss/v2"
@@ -206,12 +207,18 @@ func (rep *Report) TestGroups() []TestGroup {
 // coupling the parser to the filesystem.
 type LogMap map[testKey]map[int]string
 
+var readerPool = sync.Pool{
+	New: func() any {
+		return bufio.NewReaderSize(nil, 1024*1024)
+	},
+}
+
 // Analyze reads per-iteration test2json streams and classifies tests.
 // Malformed lines are silently skipped (go test can interleave non-JSON).
 func Analyze(iterations []io.Reader, slowThreshold time.Duration) (*Report, LogMap, error) {
 	aggs := make(map[testKey]*aggregate)
 	for i, r := range iterations {
-		if err := scanIterationJSONL(r, i, aggs, nil); err != nil {
+		if err := scanIterationJSONL(r, i, aggs, nil, slowThreshold); err != nil {
 			return nil, nil, err
 		}
 	}
@@ -241,16 +248,35 @@ func (a *aggregate) recordElapsed(iterIdx int, d time.Duration) {
 
 // scanIterationJSONL merges one iteration's JSONL stream into aggs at iterIdx.
 // meta may be nil; when set, records e.g. compile/build failure from FailedBuild on fail events.
-func scanIterationJSONL(r io.Reader, iterIdx int, aggs map[testKey]*aggregate, meta *iterationScanMeta) error {
-	reader := bufio.NewReaderSize(r, 1024*1024)
+func scanIterationJSONL(
+	r io.Reader,
+	iterIdx int,
+	aggs map[testKey]*aggregate,
+	meta *iterationScanMeta,
+	slowThreshold time.Duration,
+) error {
+	reader := readerPool.Get().(*bufio.Reader)
+	reader.Reset(r)
+	defer func() {
+		reader.Reset(nil)
+		readerPool.Put(reader)
+	}()
+
 	for {
-		line, err := reader.ReadBytes('\n')
+		line, err := reader.ReadSlice('\n')
+		if err == bufio.ErrBufferFull {
+			rest, err2 := reader.ReadBytes('\n')
+			line = append(append([]byte(nil), line...), rest...)
+			err = err2
+		}
+
 		if len(line) > 0 && line[0] == '{' {
 			var ev TestEvent
 			if json.Unmarshal(line, &ev) == nil {
-				applyTestEvent(aggs, iterIdx, &ev, meta)
+				applyTestEvent(aggs, iterIdx, &ev, meta, slowThreshold)
 			}
 		}
+
 		if err != nil {
 			if err != io.EOF {
 				return fmt.Errorf("reading iteration %d: %w", iterIdx, err)
@@ -260,7 +286,13 @@ func scanIterationJSONL(r io.Reader, iterIdx int, aggs map[testKey]*aggregate, m
 	}
 }
 
-func applyTestEvent(aggs map[testKey]*aggregate, iterIdx int, ev *TestEvent, meta *iterationScanMeta) {
+func applyTestEvent(
+	aggs map[testKey]*aggregate,
+	iterIdx int,
+	ev *TestEvent,
+	meta *iterationScanMeta,
+	slowThreshold time.Duration,
+) {
 	key := testKey{Package: ev.Package, Test: ev.Test}
 	a := aggs[key]
 	if a == nil {
@@ -271,7 +303,11 @@ func applyTestEvent(aggs map[testKey]*aggregate, iterIdx int, ev *TestEvent, met
 	case "pass":
 		a.passes++
 		a.iterations[iterIdx] = struct{}{}
-		a.recordElapsed(iterIdx, seconds(ev.Elapsed))
+		el := seconds(ev.Elapsed)
+		a.recordElapsed(iterIdx, el)
+		if !a.timedOut && (slowThreshold == 0 || el <= slowThreshold) {
+			delete(a.outputs, iterIdx)
+		}
 	case "fail":
 		if meta != nil && ev.FailedBuild != "" {
 			meta.sawFailedBuild = true
@@ -284,7 +320,11 @@ func applyTestEvent(aggs map[testKey]*aggregate, iterIdx int, ev *TestEvent, met
 		a.skips++
 		a.iterations[iterIdx] = struct{}{}
 		a.skipIters[iterIdx] = true
-		a.recordElapsed(iterIdx, seconds(ev.Elapsed))
+		el := seconds(ev.Elapsed)
+		a.recordElapsed(iterIdx, el)
+		if !a.timedOut {
+			delete(a.outputs, iterIdx)
+		}
 	case "output":
 		if strings.Contains(ev.Output, timeoutPanic) {
 			a.timedOut = true
@@ -559,7 +599,7 @@ func countNamedTestsSkippedInAggs(aggs map[testKey]*aggregate) int {
 func DigestIterationJSONL(r io.Reader, slowThreshold time.Duration) (IterationDigest, error) {
 	aggs := make(map[testKey]*aggregate)
 	var meta iterationScanMeta
-	if err := scanIterationJSONL(r, 0, aggs, &meta); err != nil {
+	if err := scanIterationJSONL(r, 0, aggs, &meta, slowThreshold); err != nil {
 		return IterationDigest{}, err
 	}
 	reattributeTimeouts(aggs, newAggregate)
diff --git a/internal/runner/diagnose_progress.go b/internal/runner/diagnose_progress.go
index cf2d219..9e0a44a 100644
--- a/internal/runner/diagnose_progress.go
+++ b/internal/runner/diagnose_progress.go
@@ -1,13 +1,8 @@
 package runner
 
 import (
-	"context"
-	"encoding/json"
-	"errors"
 	"fmt"
 	"io"
-	"os"
-	"os/exec"
 	"slices"
 	"strings"
 	"sync"
@@ -70,44 +65,6 @@ func packagePatternsFromEnd(args []string) []string {
 	return pkgs
 }
 
-// listTestPackageCount runs `go list -test -e` for the trailing package patterns
-// in go test arguments (see packagePatternsFromEnd). On error or no patterns,
-// returns an error or zero packages.
-func listTestPackageCount(ctx context.Context, repoRoot string, goTestArgs []string) (int, error) {
-	pkgs := packagePatternsFromEnd(goTestArgs)
-	if len(pkgs) == 0 {
-		return 0, errors.New("no package patterns in go test arguments (put packages last, after flags)")
-	}
-	// Binary is fixed ("go"); pkgs come from the user's CLI package patterns by design.
-	//nolint:gosec // G204: forwarded package patterns from CLI invocation
-	cmd := exec.CommandContext(ctx, "go", append([]string{"list", "-test", "-e", "-f", "{{.ImportPath}}"}, pkgs...)...)
-	cmd.Dir = repoRoot
-	cmd.Env = os.Environ()
-	out, err := cmd.Output()
-	if err != nil {
-		return 0, err
-	}
-	n := 0
-	for line := range strings.SplitSeq(string(out), "\n") {
-		if strings.TrimSpace(line) != "" {
-			n++
-		}
-	}
-	if n == 0 {
-		return 0, errors.New("go list returned no packages")
-	}
-	return n, nil
-}
-
-// diagnoseProgress tracks completed packages from a go test -json stream.
-type diagnoseProgress struct {
-	mu         sync.Mutex
-	done       map[string]struct{}
-	lastPkg    string
-	pkgOutcome map[string]string // package import path → pass|fail|skip (package-level events only)
-	total      int               // -1 when denominator is unknown (go list failed or empty)
-}
-
 type parallelDiagnoseProgress struct {
 	mu              sync.Mutex
 	renderMu        sync.Mutex
@@ -138,14 +95,6 @@ func newParallelDiagnoseProgressAt(totalIterations int, poolStartedAt time.Time)
 	}
 }
 
-func newDiagnoseProgress(totalPackages int) *diagnoseProgress {
-	return &diagnoseProgress{
-		done:       make(map[string]struct{}),
-		pkgOutcome: make(map[string]string),
-		total:      totalPackages,
-	}
-}
-
 func (p *parallelDiagnoseProgress) start(iteration int) {
 	if p == nil {
 		return
@@ -214,50 +163,6 @@ func (p *parallelDiagnoseProgress) renderSnapshot(
 	return completed, total, actives, poolElapsed
 }
 
-// onTestJSONLine updates state from one JSONL line. Returns true if the number
-// of completed packages increased (for throttled redraws).
-func (p *diagnoseProgress) onTestJSONLine(line []byte) (completedIncreased bool) {
-	if len(line) == 0 || line[0] != '{' {
-		return false
-	}
-	var ev TestEvent
-	if err := json.Unmarshal(line, &ev); err != nil {
-		return false
-	}
-	if ev.Package != "" {
-		p.mu.Lock()
-		p.lastPkg = ev.Package
-		p.mu.Unlock()
-	}
-	if !isPackageTerminalEvent(&ev) {
-		return false
-	}
-	p.mu.Lock()
-	defer p.mu.Unlock()
-	p.pkgOutcome[ev.Package] = ev.Action
-	before := len(p.done)
-	p.done[ev.Package] = struct{}{}
-	return len(p.done) > before
-}
-
-func isPackageTerminalEvent(ev *TestEvent) bool {
-	if ev.Package == "" || ev.Test != "" {
-		return false
-	}
-	switch ev.Action {
-	case "pass", "fail", "skip":
-		return true
-	default:
-		return false
-	}
-}
-
-func (p *diagnoseProgress) snapshot() (completed int, total int, lastPkg string, outcome string) {
-	p.mu.Lock()
-	defer p.mu.Unlock()
-	return len(p.done), p.total, p.lastPkg, p.pkgOutcome[p.lastPkg]
-}
-
 // progressBracket wraps inner (already styled) in muted square brackets.
 func progressBracket(inner string) string {
 	return termstyle.Muted.Render("[") + inner + termstyle.Muted.Render("]")
diff --git a/internal/runner/diagnose_progress_test.go b/internal/runner/diagnose_progress_test.go
index 9c4cd34..d4188e4 100644
--- a/internal/runner/diagnose_progress_test.go
+++ b/internal/runner/diagnose_progress_test.go
@@ -11,61 +11,6 @@ import (
 	"github.com/smartcontractkit/testrig/internal/output"
 )
 
-func TestDiagnoseProgress_onTestJSONLine_packageTerminal(t *testing.T) {
-	t.Parallel()
-	p := newDiagnoseProgress(2)
-
-	require.False(t, p.onTestJSONLine([]byte(`not json`)))
-	require.False(t, p.onTestJSONLine([]byte(`{"Action":"run","Package":"a/b","Test":"TestX"}`)))
-
-	require.True(t, p.onTestJSONLine([]byte(`{"Action":"pass","Package":"a/b"}`)))
-	c, tot, _, _ := p.snapshot()
-	require.Equal(t, 1, c)
-	require.Equal(t, 2, tot)
-
-	// Duplicate package-level pass must not report a second completion tick.
-	require.False(t, p.onTestJSONLine([]byte(`{"Action":"pass","Package":"a/b"}`)))
-	c, _, _, _ = p.snapshot()
-	require.Equal(t, 1, c)
-
-	require.True(t, p.onTestJSONLine([]byte(`{"Action":"fail","Package":"c/d"}`)))
-	c, _, _, _ = p.snapshot()
-	require.Equal(t, 2, c)
-}
-
-func TestDiagnoseProgress_onTestJSONLine_skipFail(t *testing.T) {
-	t.Parallel()
-	p := newDiagnoseProgress(1)
-	require.True(t, p.onTestJSONLine([]byte(`{"Action":"skip","Package":"p"}`)))
-	c, _, _, _ := p.snapshot()
-	require.Equal(t, 1, c)
-
-	p2 := newDiagnoseProgress(1)
-	require.True(t, p2.onTestJSONLine([]byte(`{"Action":"fail","Package":"p"}`)))
-	c2, _, _, _ := p2.snapshot()
-	require.Equal(t, 1, c2)
-}
-
-func TestDiagnoseProgress_lastPkgUpdates(t *testing.T) {
-	t.Parallel()
-	p := newDiagnoseProgress(10)
-	p.onTestJSONLine([]byte(`{"Action":"run","Package":"x/y","Test":"TestZ"}`))
-	_, _, last, _ := p.snapshot()
-	require.Equal(t, "x/y", last)
-}
-
-func TestDiagnoseProgress_pkgOutcomeOnTerminal(t *testing.T) {
-	t.Parallel()
-	p := newDiagnoseProgress(5)
-	p.onTestJSONLine([]byte(`{"Action":"run","Package":"p/q","Test":"TestZ"}`))
-	_, _, _, out := p.snapshot()
-	require.Empty(t, out)
-	p.onTestJSONLine([]byte(`{"Action":"pass","Package":"p/q"}`))
-	_, _, last, out := p.snapshot()
-	require.Equal(t, "p/q", last)
-	require.Equal(t, "pass", out)
-}
-
 func TestEllipsizeRight(t *testing.T) {
 	t.Parallel()
 	require.Equal(t, "short", ellipsizeRight("short", 10))
diff --git a/internal/runner/runner.go b/internal/runner/runner.go
index 86b9b6a..7f640d6 100644
--- a/internal/runner/runner.go
+++ b/internal/runner/runner.go
@@ -34,6 +34,7 @@ type diagnoseIterationParams struct {
 	Out              *output.Printer
 	ResultsDir       string
 	GoTestArgs       []string
+	ModuleDir        string
 	Iteration        int
 	ShuffleSeed      int64
 	Env              []string
@@ -102,8 +103,6 @@ func Gotestsum(ctx context.Context, conf *config.App, args []string) error {
 // iterSetup and iterTeardown run before/after each iteration. Either may be
 // nil. Teardown runs even when the iteration's go test invocation fails; its
 // error is reported only when the iteration itself succeeded.
-//
-//nolint:gocyclo
 func Diagnose(
 	ctx context.Context,
 	conf *config.App,
@@ -384,6 +383,12 @@ func runDiagnoseIterations(
 	if hooks.runIteration == nil {
 		hooks.runIteration = diagnoseIteration
 	}
+
+	moduleDir, adjustedArgs, err := resolveModuleDir(conf.RepoRoot, goTestArgs)
+	if err != nil {
+		return diagnoseRunState{}, err
+	}
+
 	if hooks.seed == nil {
 		hooks.seed = func() int64 { return rand.Int64N(1<<62) + 1 } //nolint:gosec // G404: non-crypto seed for test shuffle
 	}
@@ -432,7 +437,8 @@ func runDiagnoseIterations(
 		conf:             conf,
 		out:              out,
 		resultsDir:       resultsDir,
-		goTestArgs:       goTestArgs,
+		goTestArgs:       adjustedArgs,
+		moduleDir:        moduleDir,
 		hooks:            hooks,
 		parallel:         parallel,
 		parallelProgress: parallelProgress,
@@ -487,6 +493,7 @@ type diagnoseWorker struct {
 	out              *output.Printer
 	resultsDir       string
 	goTestArgs       []string
+	moduleDir        string
 	hooks            diagnoseRunHooks
 	parallel         int
 	parallelProgress *parallelDiagnoseProgress
@@ -529,6 +536,7 @@ func (w *diagnoseWorker) run(runCtx context.Context, resource diagnoseIterationR
 			Out:              w.out,
 			ResultsDir:       w.resultsDir,
 			GoTestArgs:       w.goTestArgs,
+			ModuleDir:        w.moduleDir,
 			Iteration:        iteration,
 			ShuffleSeed:      seed,
 			Env:              resource.Env,
@@ -1098,16 +1106,12 @@ func (sw *syncedWriter) Write(p []byte) (int, error) {
 
 func diagnoseIteration(ctx context.Context, p diagnoseIterationParams) error {
 	conf, out := p.Conf, p.Out
-	resultsDir, goTestArgs := p.ResultsDir, p.GoTestArgs
+	resultsDir := p.ResultsDir
 	iteration, shuffleSeed := p.Iteration, p.ShuffleSeed
 	env := p.Env
 	liveProgress, parallelProgress := p.LiveProgress, p.ParallelProgress
 	diagnoseRunStart, serialProgressMu := p.DiagnoseRunStart, p.SerialProgressMu
-
-	moduleDir, goTestArgs, err := resolveModuleDir(conf.RepoRoot, goTestArgs)
-	if err != nil {
-		return err
-	}
+	moduleDir, goTestArgs := p.ModuleDir, p.GoTestArgs
 
 	start := time.Now()
 	jsonPath := filepath.Join(resultsDir, fmt.Sprintf("iteration-%d.log.jsonl", iteration))
@@ -1115,7 +1119,11 @@ func diagnoseIteration(ctx context.Context, p diagnoseIterationParams) error {
 	if err != nil {
 		return err
 	}
-	defer func() { _ = resultsFile.Close() }()
+	bw := bufio.NewWriterSize(resultsFile, 128*1024)
+	defer func() {
+		_ = bw.Flush()
+		_ = resultsFile.Close()
+	}()
 
 	args, err := buildDiagnoseArgs(goTestArgs, shuffleSeed)
 	if err != nil {
@@ -1130,29 +1138,19 @@ func diagnoseIteration(ctx context.Context, p diagnoseIterationParams) error {
 	cmd.Cancel = func() error { return cmd.Process.Signal(os.Interrupt) }
 	cmd.WaitDelay = 5 * time.Second
 
+	sw := &syncedWriter{w: bw}
+	cmd.Stdout = sw
+	cmd.Stderr = sw
+
 	if out.AIOutput() {
-		sw := &syncedWriter{w: resultsFile}
-		cmd.Stdout = sw
-		cmd.Stderr = sw
 		return cmd.Run()
 	}
 
-	sw := &syncedWriter{w: resultsFile}
-	cmd.Stderr = sw
-
-	totalPkgs := -1
-	if n, listErr := listTestPackageCount(ctx, moduleDir, goTestArgs); listErr == nil {
-		totalPkgs = n
-	}
-	prog := newDiagnoseProgress(totalPkgs)
 	if parallelProgress != nil {
 		parallelProgress.start(iteration)
 		defer parallelProgress.finish(iteration)
 	}
 
-	pr, pw := io.Pipe()
-	cmd.Stdout = pw
-
 	live := liveProgress && out.LiveInlineProgress()
 	iter, iters := iteration+1, conf.Iterations
 	if liveProgress && !live {
@@ -1175,30 +1173,6 @@ func diagnoseIteration(ctx context.Context, p diagnoseIterationParams) error {
 		)
 	}
 
-	var readWG sync.WaitGroup
-	var scanErr error
-	readWG.Go(func() {
-		r := bufio.NewReaderSize(pr, 1024*1024)
-		for {
-			line, err := r.ReadBytes('\n')
-			if len(line) > 0 {
-				if _, werr := sw.Write(line); werr != nil {
-					break
-				}
-				completedIncreased := prog.onTestJSONLine(line)
-				if completedIncreased && !live {
-					redraw(false)
-				}
-			}
-			if err != nil {
-				if err != io.EOF {
-					scanErr = err
-				}
-				break
-			}
-		}
-	})
-
 	tickDone := make(chan struct{})
 	var tickWG sync.WaitGroup
 	if live {
@@ -1217,24 +1191,13 @@ func diagnoseIteration(ctx context.Context, p diagnoseIterationParams) error {
 		redraw(true)
 	}
 
-	runErr := cmd.Start()
-	started := runErr == nil
-	if started {
-		runErr = cmd.Wait()
-		_ = pw.Close()
-	} else {
-		_ = pw.CloseWithError(runErr)
-	}
-	readWG.Wait()
+	runErr := cmd.Run()
 	close(tickDone)
 	tickWG.Wait()
 
 	if live {
 		out.ClearInline()
 	}
-	if scanErr != nil {
-		return fmt.Errorf("reading go test output: %w", scanErr)
-	}
 	return runErr
 }
 
diff --git a/internal/runner/runner_bench_test.go b/internal/runner/runner_bench_test.go
new file mode 100644
index 0000000..30562e9
--- /dev/null
+++ b/internal/runner/runner_bench_test.go
@@ -0,0 +1,69 @@
+package runner
+
+import (
+	"context"
+	"io"
+	"os/exec"
+	"path/filepath"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+
+	"github.com/smartcontractkit/testrig/internal/config"
+	"github.com/smartcontractkit/testrig/internal/output"
+)
+
+const benchOverheadPackage = "./internal/runner/testdata/dummy/..."
+
+// BenchmarkBaselineGoTest is the floor: raw `go test -json` against the same
+// target Diagnose runs. Subtract its ns/op, B/op, allocs/op from
+// BenchmarkDiagnose to read the overhead Diagnose adds.
+func BenchmarkBaselineGoTest(b *testing.B) {
+	repoRoot, err := filepath.Abs("../..")
+	require.NoError(b, err)
+	ctx := context.Background()
+
+	b.ReportAllocs()
+	for b.Loop() {
+		//nolint:gosec // G204: fixed args against testdata target
+		cmd := exec.CommandContext(ctx, "go", "test", "-json", "-count=1", benchOverheadPackage)
+		cmd.Dir = repoRoot
+		cmd.Stdout = io.Discard
+		cmd.Stderr = io.Discard
+		require.NoError(b, cmd.Run())
+	}
+}
+
+// BenchmarkDiagnose runs one Diagnose iteration against the same target as
+// BenchmarkBaselineGoTest. ns/op minus baseline is the overhead Diagnose adds.
+func BenchmarkDiagnose(b *testing.B) {
+	repoRoot, err := filepath.Abs("../..")
+	require.NoError(b, err)
+
+	out := output.NewForTest(true, io.Discard, io.Discard, false)
+	ctx := context.Background()
+	args := []string{benchOverheadPackage}
+	conf := &config.App{
+		RepoRoot:           repoRoot,
+		Iterations:         1,
+		ParallelIterations: 1,
+		SlowThreshold:      time.Second,
+	}
+
+	b.ReportAllocs()
+	for b.Loop() {
+		require.NoError(b, Diagnose(ctx, conf, out, args, nil, nil))
+	}
+}
+
+func BenchmarkResolveModuleDir(b *testing.B) {
+	repoRoot, err := filepath.Abs("../..")
+	require.NoError(b, err)
+	args := []string{"./internal/runner/..."}
+
+	for b.Loop() {
+		_, _, err := resolveModuleDir(repoRoot, args)
+		require.NoError(b, err)
+	}
+}
diff --git a/internal/runner/testdata/dummy/dummy_test.go b/internal/runner/testdata/dummy/dummy_test.go
new file mode 100644
index 0000000..f400466
--- /dev/null
+++ b/internal/runner/testdata/dummy/dummy_test.go
@@ -0,0 +1,5 @@
+package dummy
+
+import "testing"
+
+func TestDummy(t *testing.T) {}
diff --git a/justfile b/justfile
index dc677a4..3980709 100644
--- a/justfile
+++ b/justfile
@@ -16,7 +16,24 @@ test_race:
 
 # Run benchmarks with memory stats and specific CPU counts
 bench:
-    go test -bench=. -benchmem -run=^$ ./... -cpu=2,4,8
+    go test -bench=. -benchmem -run=^$ ./...
+
+# Compare diagnose overhead vs raw `go test -json` on the dummy target.
+# Overhead = ns/op, B/op, allocs/op delta between BenchmarkDiagnose and BenchmarkBaselineGoTest.
+bench_overhead:
+    go test ./internal/runner/ -run='^$' -bench='^Benchmark(BaselineGoTest|Diagnose)$' -benchmem -count=3
+
+# Memory profile of BenchmarkDiagnose (CPU is low-signal; subprocess-bound).
+bench_overhead_profile:
+    #!/usr/bin/env bash
+    set -euo pipefail
+    out_dir=out/bench
+    mkdir -p "$out_dir"
+    go test ./internal/runner/ -run='^$' -bench='^BenchmarkDiagnose$' -benchmem -count=1 \
+        -memprofile="$out_dir/diagnose_mem.prof" 2>&1 | tee "$out_dir/diagnose_bench.txt"
+    go tool pprof -top -nodecount=30 -focus='testrig/internal/runner' "$out_dir/diagnose_mem.prof" \
+        > "$out_dir/diagnose_mem_runner.txt"
+    echo "Mem hotspots: $out_dir/diagnose_mem_runner.txt"
 
 # Local GoReleaser dry-run (snapshot)
 goreleaser:

From bce4ad5ee96f5e423ba508156fd035df2b0051cd Mon Sep 17 00:00:00 2001
From: Adam Hamrick <adam.hamrick@smartcontract.com>
Date: Thu, 28 May 2026 16:37:53 -0400
Subject: [PATCH 2/9] Use test instead of benchmark

---
 AGENTS.md                            |  11 --
 internal/runner/runner_bench_test.go | 194 +++++++++++++++++++++++++--
 justfile                             |  19 +--
 3 files changed, 183 insertions(+), 41 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index edd5deb..b9ce910 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -26,14 +26,3 @@ go test ./...                 # Test
 **Public API vs CLI are separate layers.** `hooks.go` / `testrig.go` are the library surface. The CLI in `internal/cmd/` wraps that. Don't put CLI logic in the root package.
 
 **Hook catalog drives CLI flags and docs.** Add lifecycle hooks in [`internal/hooks/catalog.go`](internal/hooks/catalog.go) (`Catalog` slice), then wire `RunOptions.Hook`, public `hooks.go` registrar, and `go generate`. Document timing in godoc on registrars in [`hooks.go`](hooks.go) as `// Name registers a hook to <when>.` `hooks.RegisterPersistentFlags` registers Cobra flags from the catalog.
-
-## Diagnose overhead benchmark
-
-`internal/runner/runner_bench_test.go` defines a side-by-side pair against `internal/runner/testdata/dummy/...`:
-
-- `BenchmarkBaselineGoTest` — raw `go test -json` (the floor).
-- `BenchmarkDiagnose` — one `Diagnose` iteration with `parallel=1`.
-
-The overhead `Diagnose` adds is the **delta** in `ns/op`, `B/op`, and `allocs/op` between the two. Run with `just bench_overhead` (uses `-count=3` for variance). For a memory profile of the diagnose pipeline, `just bench_overhead_profile` writes `out/bench/diagnose_mem_runner.txt` (look for `scanIterationJSONL`, `applyTestEvent`, `buildReportFromAggs`). CPU profile is low-signal — parent blocks on the child subprocess.
-
-Drill down into the parent-only pipeline (no subprocess) via the micro-benches in `internal/runner/analyze_bench_test.go`.
diff --git a/internal/runner/runner_bench_test.go b/internal/runner/runner_bench_test.go
index 30562e9..3ad5314 100644
--- a/internal/runner/runner_bench_test.go
+++ b/internal/runner/runner_bench_test.go
@@ -2,10 +2,15 @@ package runner
 
 import (
 	"context"
+	"fmt"
 	"io"
+	"os"
 	"os/exec"
 	"path/filepath"
+	"strings"
+	"sync"
 	"testing"
+	"text/tabwriter"
 	"time"
 
 	"github.com/stretchr/testify/require"
@@ -16,6 +21,74 @@ import (
 
 const benchOverheadPackage = "./internal/runner/testdata/dummy/..."
 
+// baselineWorkload runs the raw `go test -json` floor for one diagnose-equivalent
+// workload: `iterations` invocations against the dummy target, at most `parallel`
+// running concurrently (mirroring how Diagnose schedules iterations across workers).
+func baselineWorkload(ctx context.Context, repoRoot string, iterations, parallel int) error {
+	if parallel < 1 {
+		parallel = 1
+	}
+	sem := make(chan struct{}, parallel)
+	var wg sync.WaitGroup
+	var mu sync.Mutex
+	var firstErr error
+	for range iterations {
+		sem <- struct{}{}
+		wg.Go(func() {
+			defer func() { <-sem }()
+			//nolint:gosec // G204: fixed args against testdata target
+			cmd := exec.CommandContext(ctx, "go", "test", "-json", "-count=1", benchOverheadPackage)
+			cmd.Dir = repoRoot
+			cmd.Stdout = io.Discard
+			cmd.Stderr = io.Discard
+			if err := cmd.Run(); err != nil {
+				mu.Lock()
+				if firstErr == nil {
+					firstErr = err
+				}
+				mu.Unlock()
+			}
+		})
+	}
+	wg.Wait()
+	return firstErr
+}
+
+// diagnoseWorkload runs one Diagnose call against the dummy target with the given
+// iteration count and parallelism. Output is discarded.
+func diagnoseWorkload(ctx context.Context, out *output.Printer, repoRoot string, iterations, parallel int) error {
+	conf := &config.App{
+		RepoRoot:           repoRoot,
+		Iterations:         iterations,
+		ParallelIterations: parallel,
+		SlowThreshold:      time.Second,
+	}
+	return Diagnose(ctx, conf, out, []string{benchOverheadPackage}, nil, nil)
+}
+
+// existingDiagnoseDirs lists the diagnose-* result dirs currently in repoRoot.
+func existingDiagnoseDirs(repoRoot string) []string {
+	matches, _ := filepath.Glob(filepath.Join(repoRoot, "diagnose-*"))
+	return matches
+}
+
+// cleanupNewDiagnoseDirs removes any diagnose-* result dirs created during the
+// benchmark, so repeated runs don't accumulate output dirs in the repo root.
+func cleanupNewDiagnoseDirs(tb testing.TB, repoRoot string) {
+	tb.Helper()
+	before := make(map[string]struct{})
+	for _, d := range existingDiagnoseDirs(repoRoot) {
+		before[d] = struct{}{}
+	}
+	tb.Cleanup(func() {
+		for _, d := range existingDiagnoseDirs(repoRoot) {
+			if _, ok := before[d]; !ok {
+				_ = os.RemoveAll(d)
+			}
+		}
+	})
+}
+
 // BenchmarkBaselineGoTest is the floor: raw `go test -json` against the same
 // target Diagnose runs. Subtract its ns/op, B/op, allocs/op from
 // BenchmarkDiagnose to read the overhead Diagnose adds.
@@ -26,12 +99,7 @@ func BenchmarkBaselineGoTest(b *testing.B) {
 
 	b.ReportAllocs()
 	for b.Loop() {
-		//nolint:gosec // G204: fixed args against testdata target
-		cmd := exec.CommandContext(ctx, "go", "test", "-json", "-count=1", benchOverheadPackage)
-		cmd.Dir = repoRoot
-		cmd.Stdout = io.Discard
-		cmd.Stderr = io.Discard
-		require.NoError(b, cmd.Run())
+		require.NoError(b, baselineWorkload(ctx, repoRoot, 1, 1))
 	}
 }
 
@@ -40,21 +108,119 @@ func BenchmarkBaselineGoTest(b *testing.B) {
 func BenchmarkDiagnose(b *testing.B) {
 	repoRoot, err := filepath.Abs("../..")
 	require.NoError(b, err)
+	cleanupNewDiagnoseDirs(b, repoRoot)
 
 	out := output.NewForTest(true, io.Discard, io.Discard, false)
 	ctx := context.Background()
-	args := []string{benchOverheadPackage}
-	conf := &config.App{
-		RepoRoot:           repoRoot,
-		Iterations:         1,
-		ParallelIterations: 1,
-		SlowThreshold:      time.Second,
-	}
 
 	b.ReportAllocs()
 	for b.Loop() {
-		require.NoError(b, Diagnose(ctx, conf, out, args, nil, nil))
+		require.NoError(b, diagnoseWorkload(ctx, out, repoRoot, 1, 1))
+	}
+}
+
+// overheadConfig is one (iterations, parallel) point in the overhead matrix.
+type overheadConfig struct {
+	iterations int
+	parallel   int
+}
+
+// overheadRow pairs a config with its measured baseline and diagnose results.
+type overheadRow struct {
+	cfg      overheadConfig
+	baseline testing.BenchmarkResult
+	diagnose testing.BenchmarkResult
+}
+
+// overheadMatrix is the set of (iterations, parallel) points measured by
+// BenchmarkDiagnoseOverhead: single run, sequential iterations, then parallel.
+var overheadMatrix = []overheadConfig{
+	{iterations: 1, parallel: 1},
+	{iterations: 4, parallel: 1},
+	{iterations: 4, parallel: 4},
+	{iterations: 8, parallel: 1},
+	{iterations: 8, parallel: 8},
+}
+
+// overheadMatrixEnv gates TestDiagnoseOverhead; it spawns many `go test`
+// subprocesses and is too slow for the normal test run.
+const overheadMatrixEnv = "TESTRIG_BENCH_OVERHEAD"
+
+// TestDiagnoseOverhead measures Diagnose overhead vs the raw `go test` floor
+// across the overheadMatrix and logs a diff table (total overhead and
+// per-iteration overhead). It is a test, not a Benchmark, because it drives
+// testing.Benchmark internally (which deadlocks if called from a benchmark).
+// Run via `just bench_overhead_matrix`; gated behind TESTRIG_BENCH_OVERHEAD.
+//
+//nolint:paralleltest // serial by design: spawns many go test subprocesses and measures wall time.
+func TestDiagnoseOverhead(t *testing.T) {
+	if os.Getenv(overheadMatrixEnv) == "" {
+		t.Skipf("set %s=1 to run the diagnose overhead matrix", overheadMatrixEnv)
+	}
+	repoRoot, err := filepath.Abs("../..")
+	require.NoError(t, err)
+	cleanupNewDiagnoseDirs(t, repoRoot)
+
+	out := output.NewForTest(true, io.Discard, io.Discard, false)
+	ctx := context.Background()
+
+	rows := make([]overheadRow, 0, len(overheadMatrix))
+	for _, cfg := range overheadMatrix {
+		base := testing.Benchmark(func(b *testing.B) {
+			b.ReportAllocs()
+			for b.Loop() {
+				require.NoError(b, baselineWorkload(ctx, repoRoot, cfg.iterations, cfg.parallel))
+			}
+		})
+		require.NotZero(t, base.N, "baseline workload failed for %+v", cfg)
+
+		diag := testing.Benchmark(func(b *testing.B) {
+			b.ReportAllocs()
+			for b.Loop() {
+				require.NoError(b, diagnoseWorkload(ctx, out, repoRoot, cfg.iterations, cfg.parallel))
+			}
+		})
+		require.NotZero(t, diag.N, "diagnose workload failed for %+v", cfg)
+
+		rows = append(rows, overheadRow{cfg: cfg, baseline: base, diagnose: diag})
+	}
+	printDiagnoseOverhead(t, rows)
+}
+
+// roundedDur renders ns as a duration rounded to microseconds for the table.
+func roundedDur(ns int64) string {
+	return time.Duration(ns).Round(time.Microsecond).String()
+}
+
+// overheadDur renders overhead; negative deltas (noise) show as 0.
+func overheadDur(ns int64) string {
+	if ns < 0 {
+		ns = 0
+	}
+	return roundedDur(ns)
+}
+
+// printDiagnoseOverhead logs a table of baseline vs diagnose wall time per config.
+// overhead = diagnose ns/op - baseline ns/op; overhead/iter divides by iterations.
+func printDiagnoseOverhead(t *testing.T, rows []overheadRow) {
+	t.Helper()
+	var sb strings.Builder
+	tw := tabwriter.NewWriter(&sb, 0, 0, 2, ' ', 0)
+	_, _ = fmt.Fprintln(tw, "iters\tparallel\tbaseline\tdiagnose\toverhead\toverhead/iter")
+	for _, r := range rows {
+		overheadNs := r.diagnose.NsPerOp() - r.baseline.NsPerOp()
+		perIterNs := overheadNs / int64(max(r.cfg.iterations, 1))
+		_, _ = fmt.Fprintf(tw, "%d\t%d\t%s\t%s\t%s\t%s\n",
+			r.cfg.iterations,
+			r.cfg.parallel,
+			roundedDur(r.baseline.NsPerOp()),
+			roundedDur(r.diagnose.NsPerOp()),
+			overheadDur(overheadNs),
+			overheadDur(perIterNs),
+		)
 	}
+	_ = tw.Flush()
+	t.Logf("\nDiagnose overhead vs raw go test:\n%s", sb.String())
 }
 
 func BenchmarkResolveModuleDir(b *testing.B) {
diff --git a/justfile b/justfile
index 3980709..46b9c94 100644
--- a/justfile
+++ b/justfile
@@ -18,22 +18,9 @@ test_race:
 bench:
     go test -bench=. -benchmem -run=^$ ./...
 
-# Compare diagnose overhead vs raw `go test -json` on the dummy target.
-# Overhead = ns/op, B/op, allocs/op delta between BenchmarkDiagnose and BenchmarkBaselineGoTest.
-bench_overhead:
-    go test ./internal/runner/ -run='^$' -bench='^Benchmark(BaselineGoTest|Diagnose)$' -benchmem -count=3
-
-# Memory profile of BenchmarkDiagnose (CPU is low-signal; subprocess-bound).
-bench_overhead_profile:
-    #!/usr/bin/env bash
-    set -euo pipefail
-    out_dir=out/bench
-    mkdir -p "$out_dir"
-    go test ./internal/runner/ -run='^$' -bench='^BenchmarkDiagnose$' -benchmem -count=1 \
-        -memprofile="$out_dir/diagnose_mem.prof" 2>&1 | tee "$out_dir/diagnose_bench.txt"
-    go tool pprof -top -nodecount=30 -focus='testrig/internal/runner' "$out_dir/diagnose_mem.prof" \
-        > "$out_dir/diagnose_mem_runner.txt"
-    echo "Mem hotspots: $out_dir/diagnose_mem_runner.txt"
+# Print a diff table of diagnose overhead across iteration counts and parallelism.
+bench_overhead_matrix:
+    TESTRIG_BENCH_OVERHEAD=1 go test ./internal/runner/ -run='^TestDiagnoseOverhead$' -v -timeout=20m
 
 # Local GoReleaser dry-run (snapshot)
 goreleaser:

From 643788b04c41d84bcde3b0958b62360ad57bb9d7 Mon Sep 17 00:00:00 2001
From: Adam Hamrick <adam.hamrick@smartcontract.com>
Date: Thu, 28 May 2026 16:39:49 -0400
Subject: [PATCH 3/9] short

---
 internal/runner/runner_bench_test.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/internal/runner/runner_bench_test.go b/internal/runner/runner_bench_test.go
index 3ad5314..3ff859c 100644
--- a/internal/runner/runner_bench_test.go
+++ b/internal/runner/runner_bench_test.go
@@ -157,6 +157,9 @@ func TestDiagnoseOverhead(t *testing.T) {
 	if os.Getenv(overheadMatrixEnv) == "" {
 		t.Skipf("set %s=1 to run the diagnose overhead matrix", overheadMatrixEnv)
 	}
+	if testing.Short() {
+		t.Skip("skipping diagnose overhead matrix in short mode")
+	}
 	repoRoot, err := filepath.Abs("../..")
 	require.NoError(t, err)
 	cleanupNewDiagnoseDirs(t, repoRoot)

From 05ba2d0bb1e02a14f068c6225a3affd5a412fa2d Mon Sep 17 00:00:00 2001
From: Adam Hamrick <adam.hamrick@smartcontract.com>
Date: Thu, 28 May 2026 16:55:07 -0400
Subject: [PATCH 4/9] include dogfood bench as well

---
 go.mod                               |   2 +-
 internal/runner/runner_bench_test.go | 104 +++++++++++++++++++--------
 justfile                             |  13 +++-
 3 files changed, 86 insertions(+), 33 deletions(-)

diff --git a/go.mod b/go.mod
index 002f704..c56b021 100644
--- a/go.mod
+++ b/go.mod
@@ -15,6 +15,7 @@ require (
 	github.com/spf13/cobra v1.10.2
 	github.com/spf13/pflag v1.0.10
 	github.com/stretchr/testify v1.11.1
+	golang.org/x/sync v0.20.0
 )
 
 require (
@@ -50,7 +51,6 @@ require (
 	github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect
 	golang.org/x/exp v0.0.0-20260410095643-746e56fc9e2f // indirect
 	golang.org/x/mod v0.35.0 // indirect
-	golang.org/x/sync v0.20.0 // indirect
 	golang.org/x/sys v0.45.0 // indirect
 	golang.org/x/term v0.43.0 // indirect
 	golang.org/x/text v0.37.0 // indirect
diff --git a/internal/runner/runner_bench_test.go b/internal/runner/runner_bench_test.go
index 3ff859c..840b34f 100644
--- a/internal/runner/runner_bench_test.go
+++ b/internal/runner/runner_bench_test.go
@@ -14,31 +14,39 @@ import (
 	"time"
 
 	"github.com/stretchr/testify/require"
+	"golang.org/x/sync/semaphore"
 
 	"github.com/smartcontractkit/testrig/internal/config"
 	"github.com/smartcontractkit/testrig/internal/output"
 )
 
-const benchOverheadPackage = "./internal/runner/testdata/dummy/..."
+const (
+	benchDummyTarget   = "./internal/runner/testdata/dummy/..."
+	benchDogfoodTarget = "./..."
+)
 
 // baselineWorkload runs the raw `go test -json` floor for one diagnose-equivalent
-// workload: `iterations` invocations against the dummy target, at most `parallel`
-// running concurrently (mirroring how Diagnose schedules iterations across workers).
-func baselineWorkload(ctx context.Context, repoRoot string, iterations, parallel int) error {
+// workload: `iterations` invocations against target, at most `parallel` running
+// concurrently (mirroring how Diagnose schedules iterations across workers).
+func baselineWorkload(ctx context.Context, repoRoot, target string, iterations, parallel int) error {
 	if parallel < 1 {
 		parallel = 1
 	}
-	sem := make(chan struct{}, parallel)
+	sem := semaphore.NewWeighted(int64(parallel))
 	var wg sync.WaitGroup
 	var mu sync.Mutex
 	var firstErr error
 	for range iterations {
-		sem <- struct{}{}
+		if err := sem.Acquire(ctx, 1); err != nil {
+			wg.Wait()
+			return err
+		}
 		wg.Go(func() {
-			defer func() { <-sem }()
-			//nolint:gosec // G204: fixed args against testdata target
-			cmd := exec.CommandContext(ctx, "go", "test", "-json", "-count=1", benchOverheadPackage)
+			defer sem.Release(1)
+			//nolint:gosec // G204: target is fixed per test (dummy or ./...)
+			cmd := exec.CommandContext(ctx, "go", "test", "-json", "-count=1", target)
 			cmd.Dir = repoRoot
+			cmd.Env = envWithoutKey(os.Environ(), overheadMatrixEnv)
 			cmd.Stdout = io.Discard
 			cmd.Stderr = io.Discard
 			if err := cmd.Run(); err != nil {
@@ -54,16 +62,33 @@ func baselineWorkload(ctx context.Context, repoRoot string, iterations, parallel
 	return firstErr
 }
 
-// diagnoseWorkload runs one Diagnose call against the dummy target with the given
-// iteration count and parallelism. Output is discarded.
-func diagnoseWorkload(ctx context.Context, out *output.Printer, repoRoot string, iterations, parallel int) error {
+// diagnoseWorkload runs one Diagnose call against target with the given iteration
+// count and parallelism. Output is discarded.
+func diagnoseWorkload(
+	ctx context.Context,
+	out *output.Printer,
+	repoRoot, target string,
+	iterations, parallel int,
+) error {
 	conf := &config.App{
 		RepoRoot:           repoRoot,
 		Iterations:         iterations,
 		ParallelIterations: parallel,
 		SlowThreshold:      time.Second,
 	}
-	return Diagnose(ctx, conf, out, []string{benchOverheadPackage}, nil, nil)
+	return Diagnose(ctx, conf, out, []string{target}, nil, nil)
+}
+
+func envWithoutKey(env []string, key string) []string {
+	prefix := key + "="
+	out := make([]string, 0, len(env))
+	for _, e := range env {
+		if e == key || strings.HasPrefix(e, prefix) {
+			continue
+		}
+		out = append(out, e)
+	}
+	return out
 }
 
 // existingDiagnoseDirs lists the diagnose-* result dirs currently in repoRoot.
@@ -99,7 +124,7 @@ func BenchmarkBaselineGoTest(b *testing.B) {
 
 	b.ReportAllocs()
 	for b.Loop() {
-		require.NoError(b, baselineWorkload(ctx, repoRoot, 1, 1))
+		require.NoError(b, baselineWorkload(ctx, repoRoot, benchDummyTarget, 1, 1))
 	}
 }
 
@@ -115,7 +140,7 @@ func BenchmarkDiagnose(b *testing.B) {
 
 	b.ReportAllocs()
 	for b.Loop() {
-		require.NoError(b, diagnoseWorkload(ctx, out, repoRoot, 1, 1))
+		require.NoError(b, diagnoseWorkload(ctx, out, repoRoot, benchDummyTarget, 1, 1))
 	}
 }
 
@@ -142,28 +167,37 @@ var overheadMatrix = []overheadConfig{
 	{iterations: 8, parallel: 8},
 }
 
-// overheadMatrixEnv gates TestDiagnoseOverhead; it spawns many `go test`
+// overheadMatrixEnv gates TestDiagnoseOverhead_*; it spawns many `go test`
 // subprocesses and is too slow for the normal test run.
 const overheadMatrixEnv = "TESTRIG_BENCH_OVERHEAD"
 
-// TestDiagnoseOverhead measures Diagnose overhead vs the raw `go test` floor
-// across the overheadMatrix and logs a diff table (total overhead and
-// per-iteration overhead). It is a test, not a Benchmark, because it drives
-// testing.Benchmark internally (which deadlocks if called from a benchmark).
-// Run via `just bench_overhead_matrix`; gated behind TESTRIG_BENCH_OVERHEAD.
-//
-//nolint:paralleltest // serial by design: spawns many go test subprocesses and measures wall time.
-func TestDiagnoseOverhead(t *testing.T) {
+func skipUnlessDiagnoseOverheadMatrix(t *testing.T) {
+	t.Helper()
 	if os.Getenv(overheadMatrixEnv) == "" {
 		t.Skipf("set %s=1 to run the diagnose overhead matrix", overheadMatrixEnv)
 	}
 	if testing.Short() {
 		t.Skip("skipping diagnose overhead matrix in short mode")
 	}
+}
+
+// runDiagnoseOverheadMatrix measures Diagnose overhead vs the raw `go test` floor
+// across overheadMatrix for target and logs a diff table. It is a test helper, not
+// a Benchmark, because it drives testing.Benchmark internally (which deadlocks if
+// called from a benchmark).
+//
+//nolint:paralleltest // serial by design: spawns many go test subprocesses and measures wall time.
+func runDiagnoseOverheadMatrix(t *testing.T, label, target string) {
+	t.Helper()
+	skipUnlessDiagnoseOverheadMatrix(t)
+
 	repoRoot, err := filepath.Abs("../..")
 	require.NoError(t, err)
 	cleanupNewDiagnoseDirs(t, repoRoot)
 
+	// Child `go test` processes must not see TESTRIG_BENCH_OVERHEAD (dogfood runs ./...).
+	t.Setenv(overheadMatrixEnv, "")
+
 	out := output.NewForTest(true, io.Discard, io.Discard, false)
 	ctx := context.Background()
 
@@ -172,7 +206,7 @@ func TestDiagnoseOverhead(t *testing.T) {
 		base := testing.Benchmark(func(b *testing.B) {
 			b.ReportAllocs()
 			for b.Loop() {
-				require.NoError(b, baselineWorkload(ctx, repoRoot, cfg.iterations, cfg.parallel))
+				require.NoError(b, baselineWorkload(ctx, repoRoot, target, cfg.iterations, cfg.parallel))
 			}
 		})
 		require.NotZero(t, base.N, "baseline workload failed for %+v", cfg)
@@ -180,14 +214,26 @@ func TestDiagnoseOverhead(t *testing.T) {
 		diag := testing.Benchmark(func(b *testing.B) {
 			b.ReportAllocs()
 			for b.Loop() {
-				require.NoError(b, diagnoseWorkload(ctx, out, repoRoot, cfg.iterations, cfg.parallel))
+				require.NoError(b, diagnoseWorkload(ctx, out, repoRoot, target, cfg.iterations, cfg.parallel))
 			}
 		})
 		require.NotZero(t, diag.N, "diagnose workload failed for %+v", cfg)
 
 		rows = append(rows, overheadRow{cfg: cfg, baseline: base, diagnose: diag})
 	}
-	printDiagnoseOverhead(t, rows)
+	printDiagnoseOverhead(t, label, target, rows)
+}
+
+// TestDiagnoseOverhead_Dummy runs the overhead matrix against the tiny dummy package.
+// Run via `just bench_overhead_matrix_dummy`.
+func TestDiagnoseOverhead_Dummy(t *testing.T) {
+	runDiagnoseOverheadMatrix(t, "dummy", benchDummyTarget)
+}
+
+// TestDiagnoseOverhead_Dogfood runs the overhead matrix against the full testrig module (./...).
+// Run via `just bench_overhead_matrix_dogfood`; expect much longer wall time than dummy.
+func TestDiagnoseOverhead_Dogfood(t *testing.T) {
+	runDiagnoseOverheadMatrix(t, "dogfood", benchDogfoodTarget)
 }
 
 // roundedDur renders ns as a duration rounded to microseconds for the table.
@@ -205,7 +251,7 @@ func overheadDur(ns int64) string {
 
 // printDiagnoseOverhead logs a table of baseline vs diagnose wall time per config.
 // overhead = diagnose ns/op - baseline ns/op; overhead/iter divides by iterations.
-func printDiagnoseOverhead(t *testing.T, rows []overheadRow) {
+func printDiagnoseOverhead(t *testing.T, label, target string, rows []overheadRow) {
 	t.Helper()
 	var sb strings.Builder
 	tw := tabwriter.NewWriter(&sb, 0, 0, 2, ' ', 0)
@@ -223,7 +269,7 @@ func printDiagnoseOverhead(t *testing.T, rows []overheadRow) {
 		)
 	}
 	_ = tw.Flush()
-	t.Logf("\nDiagnose overhead vs raw go test:\n%s", sb.String())
+	t.Logf("\nDiagnose overhead vs raw go test (%s, target=%s):\n%s", label, target, sb.String())
 }
 
 func BenchmarkResolveModuleDir(b *testing.B) {
diff --git a/justfile b/justfile
index 46b9c94..7c6c1bc 100644
--- a/justfile
+++ b/justfile
@@ -18,9 +18,16 @@ test_race:
 bench:
     go test -bench=. -benchmem -run=^$ ./...
 
-# Print a diff table of diagnose overhead across iteration counts and parallelism.
-bench_overhead_matrix:
-    TESTRIG_BENCH_OVERHEAD=1 go test ./internal/runner/ -run='^TestDiagnoseOverhead$' -v -timeout=20m
+# Print a diff table of diagnose overhead (dummy package; fast).
+bench_overhead_matrix_dummy:
+    TESTRIG_BENCH_OVERHEAD=1 go test ./internal/runner/ -run='^TestDiagnoseOverhead_Dummy$' -v
+
+# Same matrix against the full testrig module (./...); slow.
+bench_overhead_matrix_dogfood:
+    TESTRIG_BENCH_OVERHEAD=1 go test ./internal/runner/ -run='^TestDiagnoseOverhead_Dogfood$' -v
+
+# Run dummy then dogfood overhead matrices.
+bench_overhead_matrix: bench_overhead_matrix_dummy bench_overhead_matrix_dogfood
 
 # Local GoReleaser dry-run (snapshot)
 goreleaser:

From 938b8f0a6596ba5c68406de4d46205be0d7ff64a Mon Sep 17 00:00:00 2001
From: Adam Hamrick <adam.hamrick@smartcontract.com>
Date: Thu, 28 May 2026 16:59:45 -0400
Subject: [PATCH 5/9] lint

---
 internal/runner/runner_bench_test.go | 4 ++++
 lefthook.yml                         | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/internal/runner/runner_bench_test.go b/internal/runner/runner_bench_test.go
index 840b34f..94808dc 100644
--- a/internal/runner/runner_bench_test.go
+++ b/internal/runner/runner_bench_test.go
@@ -226,12 +226,16 @@ func runDiagnoseOverheadMatrix(t *testing.T, label, target string) {
 
 // TestDiagnoseOverhead_Dummy runs the overhead matrix against the tiny dummy package.
 // Run via `just bench_overhead_matrix_dummy`.
+//
+//nolint:paralleltest // serial by design: spawns many go test subprocesses and measures wall time.
 func TestDiagnoseOverhead_Dummy(t *testing.T) {
 	runDiagnoseOverheadMatrix(t, "dummy", benchDummyTarget)
 }
 
 // TestDiagnoseOverhead_Dogfood runs the overhead matrix against the full testrig module (./...).
 // Run via `just bench_overhead_matrix_dogfood`; expect much longer wall time than dummy.
+//
+//nolint:paralleltest // serial by design: spawns many go test subprocesses and measures wall time.
 func TestDiagnoseOverhead_Dogfood(t *testing.T) {
 	runDiagnoseOverheadMatrix(t, "dogfood", benchDogfoodTarget)
 }
diff --git a/lefthook.yml b/lefthook.yml
index bbbcf28..7d6443d 100644
--- a/lefthook.yml
+++ b/lefthook.yml
@@ -97,5 +97,5 @@ pre-push:
   commands:
     go-short-tests:
       tags: test
-      glob: "*.go"
+      glob: "*.go|go.mod"
       run: go test -short ./...

From 11a3bc390851fd3bb4d02de3b41c33d49c9dcf96 Mon Sep 17 00:00:00 2001
From: Adam Hamrick <adam.hamrick@smartcontract.com>
Date: Fri, 29 May 2026 09:18:11 -0400
Subject: [PATCH 6/9] percentages

---
 AGENTS.md                            |   1 +
 internal/runner/runner.go            |  16 ++-
 internal/runner/runner_bench_test.go | 174 +++++++++++++++++++++++----
 justfile                             |  12 +-
 4 files changed, 168 insertions(+), 35 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index b9ce910..cf388b6 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -3,6 +3,7 @@
 - An easy to import and use test rig for Go projects
 - Can be used as a package or CLI
 - Used to find and fix flaky tests
+- Minimal CPU, RAM, and timing overhead added to test execution
 
 ## Validate changes
 
diff --git a/internal/runner/runner.go b/internal/runner/runner.go
index 7f640d6..fbbd8a8 100644
--- a/internal/runner/runner.go
+++ b/internal/runner/runner.go
@@ -1120,9 +1120,14 @@ func diagnoseIteration(ctx context.Context, p diagnoseIterationParams) error {
 		return err
 	}
 	bw := bufio.NewWriterSize(resultsFile, 128*1024)
+	var retErr error
 	defer func() {
-		_ = bw.Flush()
-		_ = resultsFile.Close()
+		if err := bw.Flush(); err != nil && retErr == nil {
+			retErr = err
+		}
+		if err := resultsFile.Close(); err != nil && retErr == nil {
+			retErr = err
+		}
 	}()
 
 	args, err := buildDiagnoseArgs(goTestArgs, shuffleSeed)
@@ -1143,7 +1148,8 @@ func diagnoseIteration(ctx context.Context, p diagnoseIterationParams) error {
 	cmd.Stderr = sw
 
 	if out.AIOutput() {
-		return cmd.Run()
+		retErr = cmd.Run()
+		return retErr
 	}
 
 	if parallelProgress != nil {
@@ -1191,14 +1197,14 @@ func diagnoseIteration(ctx context.Context, p diagnoseIterationParams) error {
 		redraw(true)
 	}
 
-	runErr := cmd.Run()
+	retErr = cmd.Run()
 	close(tickDone)
 	tickWG.Wait()
 
 	if live {
 		out.ClearInline()
 	}
-	return runErr
+	return retErr
 }
 
 func newRunMeta(
diff --git a/internal/runner/runner_bench_test.go b/internal/runner/runner_bench_test.go
index 94808dc..45d2d96 100644
--- a/internal/runner/runner_bench_test.go
+++ b/internal/runner/runner_bench_test.go
@@ -7,6 +7,7 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"strconv"
 	"strings"
 	"sync"
 	"testing"
@@ -171,6 +172,15 @@ var overheadMatrix = []overheadConfig{
 // subprocesses and is too slow for the normal test run.
 const overheadMatrixEnv = "TESTRIG_BENCH_OVERHEAD"
 
+// overheadMatrixRunsEnv sets how many times each matrix cell is benchmarked
+// before averaging (default 5). Use 3–5 for stabler numbers; 1 for a quick smoke.
+const overheadMatrixRunsEnv = "TESTRIG_BENCH_OVERHEAD_RUNS"
+
+const (
+	overheadMatrixRunsDefault = 5
+	overheadMatrixRunsMax     = 10
+)
+
 func skipUnlessDiagnoseOverheadMatrix(t *testing.T) {
 	t.Helper()
 	if os.Getenv(overheadMatrixEnv) == "" {
@@ -181,12 +191,62 @@ func skipUnlessDiagnoseOverheadMatrix(t *testing.T) {
 	}
 }
 
+func overheadMatrixRuns() int {
+	s := strings.TrimSpace(os.Getenv(overheadMatrixRunsEnv))
+	if s == "" {
+		return overheadMatrixRunsDefault
+	}
+	n, err := strconv.Atoi(s)
+	if err != nil || n < 1 {
+		return overheadMatrixRunsDefault
+	}
+	return min(n, overheadMatrixRunsMax)
+}
+
+// averageBenchmarkResults averages per-op metrics across repeated benchmark runs.
+func averageBenchmarkResults(results []testing.BenchmarkResult) testing.BenchmarkResult {
+	if len(results) == 0 {
+		return testing.BenchmarkResult{}
+	}
+	var ns, bytes, allocs int64
+	for _, r := range results {
+		ns += r.NsPerOp()
+		bytes += r.AllocedBytesPerOp()
+		allocs += r.AllocsPerOp()
+	}
+	n := int64(len(results))
+	return testing.BenchmarkResult{
+		N:         1,
+		T:         time.Duration(ns / n),
+		MemBytes:  uint64(bytes / n),
+		MemAllocs: uint64(allocs / n),
+	}
+}
+
+// repeatBenchmark runs fn runs times and returns the averaged result.
+// phase labels log lines; pass "" to omit per-run logging.
+func repeatBenchmark(t *testing.T, runs int, phase string, fn func() testing.BenchmarkResult) testing.BenchmarkResult {
+	t.Helper()
+	results := make([]testing.BenchmarkResult, runs)
+	for i := range runs {
+		start := time.Now()
+		results[i] = fn()
+		if phase != "" {
+			t.Logf("%s: run %d/%d done (wall %s, %s/op)",
+				phase, i+1, runs, time.Since(start).Round(time.Second), roundedDur(results[i].NsPerOp()))
+		}
+	}
+	avg := averageBenchmarkResults(results)
+	if phase != "" {
+		t.Logf("%s: mean %s/op over %d runs", phase, roundedDur(avg.NsPerOp()), runs)
+	}
+	return avg
+}
+
 // runDiagnoseOverheadMatrix measures Diagnose overhead vs the raw `go test` floor
 // across overheadMatrix for target and logs a diff table. It is a test helper, not
 // a Benchmark, because it drives testing.Benchmark internally (which deadlocks if
 // called from a benchmark).
-//
-//nolint:paralleltest // serial by design: spawns many go test subprocesses and measures wall time.
 func runDiagnoseOverheadMatrix(t *testing.T, label, target string) {
 	t.Helper()
 	skipUnlessDiagnoseOverheadMatrix(t)
@@ -200,32 +260,51 @@ func runDiagnoseOverheadMatrix(t *testing.T, label, target string) {
 
 	out := output.NewForTest(true, io.Discard, io.Discard, false)
 	ctx := context.Background()
-
-	rows := make([]overheadRow, 0, len(overheadMatrix))
-	for _, cfg := range overheadMatrix {
-		base := testing.Benchmark(func(b *testing.B) {
-			b.ReportAllocs()
-			for b.Loop() {
-				require.NoError(b, baselineWorkload(ctx, repoRoot, target, cfg.iterations, cfg.parallel))
-			}
+	runs := overheadMatrixRuns()
+	total := len(overheadMatrix)
+	t.Logf("[%s] overhead matrix: target=%s, %d cells, %d runs/cell (%s overrides)",
+		label, target, total, runs, overheadMatrixRunsEnv)
+
+	rows := make([]overheadRow, 0, total)
+	for cell, cfg := range overheadMatrix {
+		cellLabel := fmt.Sprintf("[%s] cell %d/%d iters=%d parallel=%d",
+			label, cell+1, total, cfg.iterations, cfg.parallel)
+
+		base := repeatBenchmark(t, runs, cellLabel+" baseline", func() testing.BenchmarkResult {
+			r := testing.Benchmark(func(b *testing.B) {
+				b.ReportAllocs()
+				for b.Loop() {
+					require.NoError(b, baselineWorkload(ctx, repoRoot, target, cfg.iterations, cfg.parallel))
+				}
+			})
+			require.NotZero(t, r.N, "baseline workload failed for %+v", cfg)
+			return r
 		})
-		require.NotZero(t, base.N, "baseline workload failed for %+v", cfg)
 
-		diag := testing.Benchmark(func(b *testing.B) {
-			b.ReportAllocs()
-			for b.Loop() {
-				require.NoError(b, diagnoseWorkload(ctx, out, repoRoot, target, cfg.iterations, cfg.parallel))
-			}
+		diag := repeatBenchmark(t, runs, cellLabel+" diagnose", func() testing.BenchmarkResult {
+			r := testing.Benchmark(func(b *testing.B) {
+				b.ReportAllocs()
+				for b.Loop() {
+					require.NoError(b, diagnoseWorkload(ctx, out, repoRoot, target, cfg.iterations, cfg.parallel))
+				}
+			})
+			require.NotZero(t, r.N, "diagnose workload failed for %+v", cfg)
+			return r
 		})
-		require.NotZero(t, diag.N, "diagnose workload failed for %+v", cfg)
+
+		overheadNs := diag.NsPerOp() - base.NsPerOp()
+		t.Logf("%s: done — overhead %s (%s of diagnose; baseline %s, diagnose %s)",
+			cellLabel, overheadDur(overheadNs), overheadPercent(overheadNs, diag.NsPerOp()),
+			roundedDur(base.NsPerOp()), roundedDur(diag.NsPerOp()))
 
 		rows = append(rows, overheadRow{cfg: cfg, baseline: base, diagnose: diag})
 	}
-	printDiagnoseOverhead(t, label, target, rows)
+	printDiagnoseOverhead(t, label, target, runs, rows)
 }
 
 // TestDiagnoseOverhead_Dummy runs the overhead matrix against the tiny dummy package.
-// Run via `just bench_overhead_matrix_dummy`.
+// Run via `just bench_overhead_matrix_dummy`. Each cell is benchmarked 5 times by default
+// and averaged; set TESTRIG_BENCH_OVERHEAD_RUNS (e.g. 3) to tune accuracy vs wall time.
 //
 //nolint:paralleltest // serial by design: spawns many go test subprocesses and measures wall time.
 func TestDiagnoseOverhead_Dummy(t *testing.T) {
@@ -253,27 +332,74 @@ func overheadDur(ns int64) string {
 	return roundedDur(ns)
 }
 
+// overheadPercent is overhead as a share of diagnose runtime (overhead / diagnose).
+func overheadPercent(overheadNs, diagnoseNs int64) string {
+	if diagnoseNs <= 0 {
+		return "n/a"
+	}
+	if overheadNs < 0 {
+		overheadNs = 0
+	}
+	return fmt.Sprintf("%.1f%%", float64(overheadNs)*100/float64(diagnoseNs))
+}
+
 // printDiagnoseOverhead logs a table of baseline vs diagnose wall time per config.
 // overhead = diagnose ns/op - baseline ns/op; overhead/iter divides by iterations.
-func printDiagnoseOverhead(t *testing.T, label, target string, rows []overheadRow) {
+// Each cell is the mean of runs repeated benchmark invocations.
+func printDiagnoseOverhead(t *testing.T, label, target string, runs int, rows []overheadRow) {
 	t.Helper()
 	var sb strings.Builder
 	tw := tabwriter.NewWriter(&sb, 0, 0, 2, ' ', 0)
-	_, _ = fmt.Fprintln(tw, "iters\tparallel\tbaseline\tdiagnose\toverhead\toverhead/iter")
+	_, _ = fmt.Fprintln(tw, "iters\tparallel\tbaseline\tdiagnose\toverhead\toverhead%\toverhead/iter")
 	for _, r := range rows {
 		overheadNs := r.diagnose.NsPerOp() - r.baseline.NsPerOp()
 		perIterNs := overheadNs / int64(max(r.cfg.iterations, 1))
-		_, _ = fmt.Fprintf(tw, "%d\t%d\t%s\t%s\t%s\t%s\n",
+		diagNs := r.diagnose.NsPerOp()
+		_, _ = fmt.Fprintf(tw, "%d\t%d\t%s\t%s\t%s\t%s\t%s\n",
 			r.cfg.iterations,
 			r.cfg.parallel,
 			roundedDur(r.baseline.NsPerOp()),
-			roundedDur(r.diagnose.NsPerOp()),
+			roundedDur(diagNs),
 			overheadDur(overheadNs),
+			overheadPercent(overheadNs, diagNs),
 			overheadDur(perIterNs),
 		)
 	}
 	_ = tw.Flush()
-	t.Logf("\nDiagnose overhead vs raw go test (%s, target=%s):\n%s", label, target, sb.String())
+	t.Logf(`
+--------------------------------------------------------------------------------
+Diagnose overhead vs raw go test (%s, target=%s, %d-run average per cell)
+--------------------------------------------------------------------------------
+%s`,
+		label, target, runs, sb.String())
+}
+
+func TestAverageBenchmarkResults(t *testing.T) {
+	t.Parallel()
+	avg := averageBenchmarkResults([]testing.BenchmarkResult{
+		{N: 10, T: 1_000, MemBytes: 1_000, MemAllocs: 100},
+		{N: 10, T: 3_000, MemBytes: 3_000, MemAllocs: 300},
+	})
+	require.Equal(t, int64(200), avg.NsPerOp())
+	require.Equal(t, int64(200), avg.AllocedBytesPerOp())
+	require.Equal(t, int64(20), avg.AllocsPerOp())
+}
+
+func TestOverheadPercent(t *testing.T) {
+	t.Parallel()
+	require.Equal(t, "20.0%", overheadPercent(20, 100))
+	require.Equal(t, "0.0%", overheadPercent(-5, 100))
+	require.Equal(t, "n/a", overheadPercent(10, 0))
+}
+
+func TestOverheadMatrixRuns(t *testing.T) {
+	require.Equal(t, 5, overheadMatrixRuns())
+	t.Setenv(overheadMatrixRunsEnv, "3")
+	require.Equal(t, 3, overheadMatrixRuns())
+	t.Setenv(overheadMatrixRunsEnv, "99")
+	require.Equal(t, overheadMatrixRunsMax, overheadMatrixRuns())
+	t.Setenv(overheadMatrixRunsEnv, "nope")
+	require.Equal(t, overheadMatrixRunsDefault, overheadMatrixRuns())
 }
 
 func BenchmarkResolveModuleDir(b *testing.B) {
diff --git a/justfile b/justfile
index 7c6c1bc..c129a70 100644
--- a/justfile
+++ b/justfile
@@ -14,19 +14,19 @@ test:
 test_race:
     go tool gotestsum -- -race ./...
 
-# Run benchmarks with memory stats and specific CPU counts
+# Run standard benchmarks
 bench:
     go test -bench=. -benchmem -run=^$ ./...
 
-# Print a diff table of diagnose overhead (dummy package; fast).
+# Diagnose overhead matrix (dummy; fast). 5 runs averaged per cell; TESTRIG_BENCH_OVERHEAD_RUNS=3 to override.
 bench_overhead_matrix_dummy:
-    TESTRIG_BENCH_OVERHEAD=1 go test ./internal/runner/ -run='^TestDiagnoseOverhead_Dummy$' -v
+    TESTRIG_BENCH_OVERHEAD=1 go test ./internal/runner/ -run='^TestDiagnoseOverhead_Dummy$' -count=1 -v
 
-# Same matrix against the full testrig module (./...); slow.
+# Run benchmark to measure diagnose overhead against the full testrig module (./...); slow.
 bench_overhead_matrix_dogfood:
-    TESTRIG_BENCH_OVERHEAD=1 go test ./internal/runner/ -run='^TestDiagnoseOverhead_Dogfood$' -v
+    TESTRIG_BENCH_OVERHEAD=1 go test ./internal/runner/ -run='^TestDiagnoseOverhead_Dogfood$' -count=1 -v
 
-# Run dummy then dogfood overhead matrices.
+# Run benchmarks to measure diagnose overhead for both dummy and dogfood targets.
 bench_overhead_matrix: bench_overhead_matrix_dummy bench_overhead_matrix_dogfood
 
 # Local GoReleaser dry-run (snapshot)

From 143ddde8b21d2ba749cad812de9946ea02a2064e Mon Sep 17 00:00:00 2001
From: Adam Hamrick <adam.hamrick@smartcontract.com>
Date: Fri, 29 May 2026 09:51:49 -0400
Subject: [PATCH 7/9] lint

---
 internal/runner/runner_bench_test.go | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/internal/runner/runner_bench_test.go b/internal/runner/runner_bench_test.go
index 45d2d96..e5c719f 100644
--- a/internal/runner/runner_bench_test.go
+++ b/internal/runner/runner_bench_test.go
@@ -215,11 +215,19 @@ func averageBenchmarkResults(results []testing.BenchmarkResult) testing.Benchmar
 		allocs += r.AllocsPerOp()
 	}
 	n := int64(len(results))
+	avgBytes := bytes / n
+	avgAllocs := allocs / n
+	if avgBytes < 0 {
+		avgBytes = 0
+	}
+	if avgAllocs < 0 {
+		avgAllocs = 0
+	}
 	return testing.BenchmarkResult{
 		N:         1,
 		T:         time.Duration(ns / n),
-		MemBytes:  uint64(bytes / n),
-		MemAllocs: uint64(allocs / n),
+		MemBytes:  uint64(avgBytes),
+		MemAllocs: uint64(avgAllocs),
 	}
 }
 

From 6e1bba077575fb06e97d400492c7700ba336fd66 Mon Sep 17 00:00:00 2001
From: Adam Hamrick <adam.hamrick@smartcontract.com>
Date: Fri, 29 May 2026 09:54:56 -0400
Subject: [PATCH 8/9] lefthook

---
 lefthook.yml | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/lefthook.yml b/lefthook.yml
index 7d6443d..f101a69 100644
--- a/lefthook.yml
+++ b/lefthook.yml
@@ -54,14 +54,19 @@ pre-commit:
 
     golangci-lint:
       tags: linter
-      glob: "*.go"
-      root: ""
-      run: golangci-lint run --fast-only --fix ./...
+      glob:
+        - "*.go"
+        - "**/*.go"
+        - "go.mod"
+        - "go.sum"
+      run: golangci-lint run --fix --fast-only ./...
       stage_fixed: true
 
     go-generate:
       tags: linter
-      glob: "*.go"
+      glob:
+        - "*.go"
+        - "**/*.go"
       run: |
         before=$(git status --porcelain)
         go generate ./...
@@ -75,7 +80,11 @@ pre-commit:
 
     go-mod-tidy:
       tags: linter
-      glob: "go.mod"
+      glob:
+        - "go.mod"
+        - "go.sum"
+        - "*.go"
+        - "**/*.go"
       run: |
         before=$(git status --porcelain)
         while IFS= read -r mod; do

From f975f383d3166b66bcc6c62c1f474b2fafecb9a1 Mon Sep 17 00:00:00 2001
From: Adam Hamrick <adam.hamrick@smartcontract.com>
Date: Fri, 29 May 2026 11:12:34 -0400
Subject: [PATCH 9/9] lefthook

---
 lefthook.yml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/lefthook.yml b/lefthook.yml
index f101a69..3e25b95 100644
--- a/lefthook.yml
+++ b/lefthook.yml
@@ -106,5 +106,9 @@ pre-push:
   commands:
     go-short-tests:
       tags: test
-      glob: "*.go|go.mod"
+      glob:
+        - "*.go"
+        - "**/*.go"
+        - "go.mod"
+        - "go.sum"
       run: go test -short ./...