diff --git a/.gitignore b/.gitignore
index e04f680..16357a2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,10 +9,7 @@ diagnose-*/
 
 .env
 .venv/
-
-# Local Phoenix DB for evals/eval.py
-evals/.phoenix/
-__pycache__/
+*.test
 
 # Agents
 .antigravitycli/
diff --git a/AGENTS.md b/AGENTS.md
index a862610..cf388b6 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -3,6 +3,7 @@
 - An easy to import and use test rig for Go projects
 - Can be used as a package or CLI
 - Used to find and fix flaky tests
+- Minimal CPU, RAM, and timing overhead added to test execution
 
 ## Validate changes
 
@@ -18,7 +19,6 @@ go test ./...                 # Test
 - `internal/runner/` — core test execution. `Diagnose` is the main entry point; `diagnoseRunHooks` carries iteration hooks as `func(context.Context) error` fields.
 - `internal/config/` — Cobra flag registry config loading. `config.App` is the unified config struct.
 - `internal/output/` — output printer abstraction. `--ai-output` flag controls format.
-- `internal/repo/` — git/module helpers.
 
 ## Critical decisions
 
diff --git a/example_test.go b/example_test.go
index cff11f4..a69ae6d 100644
--- a/example_test.go
+++ b/example_test.go
@@ -12,18 +12,12 @@ func ExampleRun() {
 		// GlobalSetup runs once before any tests start.
 		testrig.GlobalSetup(func(_ context.Context) error {
 			fmt.Println("Starting mock background service...")
-			// Simulate starting a dependency, e.g.:
-			// cmd := exec.CommandContext(ctx, "docker", "compose", "up", "-d")
-			// return cmd.Run()
 			return nil
 		}),
 
 		// IterationSetup runs before each diagnose iteration.
 		testrig.IterationSetup(func(_ context.Context) error {
 			fmt.Println("Resetting database state for next iteration...")
-			// Simulate resetting state:
-			// cmd := exec.CommandContext(ctx, "psql", "-c", "TRUNCATE events")
-			// return cmd.Run()
 			return nil
 		}),
 
diff --git a/go.mod b/go.mod
index 002f704..c56b021 100644
--- a/go.mod
+++ b/go.mod
@@ -15,6 +15,7 @@ require (
 	github.com/spf13/cobra v1.10.2
 	github.com/spf13/pflag v1.0.10
 	github.com/stretchr/testify v1.11.1
+	golang.org/x/sync v0.20.0
 )
 
 require (
@@ -50,7 +51,6 @@ require (
 	github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect
 	golang.org/x/exp v0.0.0-20260410095643-746e56fc9e2f // indirect
 	golang.org/x/mod v0.35.0 // indirect
-	golang.org/x/sync v0.20.0 // indirect
 	golang.org/x/sys v0.45.0 // indirect
 	golang.org/x/term v0.43.0 // indirect
 	golang.org/x/text v0.37.0 // indirect
diff --git a/internal/runner/analyze.go b/internal/runner/analyze.go
index d6a92f3..40a2c66 100644
--- a/internal/runner/analyze.go
+++ b/internal/runner/analyze.go
@@ -14,6 +14,7 @@ import (
 	"sort"
 	"strconv"
 	"strings"
+	"sync"
 	"time"
 
 	"charm.land/lipgloss/v2"
@@ -206,12 +207,18 @@ func (rep *Report) TestGroups() []TestGroup {
 // coupling the parser to the filesystem.
 type LogMap map[testKey]map[int]string
 
+var readerPool = sync.Pool{
+	New: func() any {
+		return bufio.NewReaderSize(nil, 1024*1024)
+	},
+}
+
 // Analyze reads per-iteration test2json streams and classifies tests.
 // Malformed lines are silently skipped (go test can interleave non-JSON).
 func Analyze(iterations []io.Reader, slowThreshold time.Duration) (*Report, LogMap, error) {
 	aggs := make(map[testKey]*aggregate)
 	for i, r := range iterations {
-		if err := scanIterationJSONL(r, i, aggs, nil); err != nil {
+		if err := scanIterationJSONL(r, i, aggs, nil, slowThreshold); err != nil {
 			return nil, nil, err
 		}
 	}
@@ -241,16 +248,35 @@ func (a *aggregate) recordElapsed(iterIdx int, d time.Duration) {
 
 // scanIterationJSONL merges one iteration's JSONL stream into aggs at iterIdx.
 // meta may be nil; when set, records e.g. compile/build failure from FailedBuild on fail events.
-func scanIterationJSONL(r io.Reader, iterIdx int, aggs map[testKey]*aggregate, meta *iterationScanMeta) error {
-	reader := bufio.NewReaderSize(r, 1024*1024)
+func scanIterationJSONL(
+	r io.Reader,
+	iterIdx int,
+	aggs map[testKey]*aggregate,
+	meta *iterationScanMeta,
+	slowThreshold time.Duration,
+) error {
+	reader := readerPool.Get().(*bufio.Reader)
+	reader.Reset(r)
+	defer func() {
+		reader.Reset(nil)
+		readerPool.Put(reader)
+	}()
+
 	for {
-		line, err := reader.ReadBytes('\n')
+		line, err := reader.ReadSlice('\n')
+		if err == bufio.ErrBufferFull {
+			head := append([]byte(nil), line...) // copy first: the next read overwrites the buffer line aliases
+			rest, err2 := reader.ReadBytes('\n')
+			line, err = append(head, rest...), err2
+		}
+
 		if len(line) > 0 && line[0] == '{' {
 			var ev TestEvent
 			if json.Unmarshal(line, &ev) == nil {
-				applyTestEvent(aggs, iterIdx, &ev, meta)
+				applyTestEvent(aggs, iterIdx, &ev, meta, slowThreshold)
 			}
 		}
+
 		if err != nil {
 			if err != io.EOF {
 				return fmt.Errorf("reading iteration %d: %w", iterIdx, err)
@@ -260,7 +286,13 @@ func scanIterationJSONL(r io.Reader, iterIdx int, aggs map[testKey]*aggregate, m
 	}
 }
 
-func applyTestEvent(aggs map[testKey]*aggregate, iterIdx int, ev *TestEvent, meta *iterationScanMeta) {
+func applyTestEvent(
+	aggs map[testKey]*aggregate,
+	iterIdx int,
+	ev *TestEvent,
+	meta *iterationScanMeta,
+	slowThreshold time.Duration,
+) {
 	key := testKey{Package: ev.Package, Test: ev.Test}
 	a := aggs[key]
 	if a == nil {
@@ -271,7 +303,11 @@ func applyTestEvent(aggs map[testKey]*aggregate, iterIdx int, ev *TestEvent, met
 	case "pass":
 		a.passes++
 		a.iterations[iterIdx] = struct{}{}
-		a.recordElapsed(iterIdx, seconds(ev.Elapsed))
+		el := seconds(ev.Elapsed)
+		a.recordElapsed(iterIdx, el)
+		if !a.timedOut && (slowThreshold == 0 || el <= slowThreshold) {
+			delete(a.outputs, iterIdx)
+		}
 	case "fail":
 		if meta != nil && ev.FailedBuild != "" {
 			meta.sawFailedBuild = true
@@ -284,7 +320,11 @@ func applyTestEvent(aggs map[testKey]*aggregate, iterIdx int, ev *TestEvent, met
 		a.skips++
 		a.iterations[iterIdx] = struct{}{}
 		a.skipIters[iterIdx] = true
-		a.recordElapsed(iterIdx, seconds(ev.Elapsed))
+		el := seconds(ev.Elapsed)
+		a.recordElapsed(iterIdx, el)
+		if !a.timedOut {
+			delete(a.outputs, iterIdx)
+		}
 	case "output":
 		if strings.Contains(ev.Output, timeoutPanic) {
 			a.timedOut = true
@@ -559,7 +599,7 @@ func countNamedTestsSkippedInAggs(aggs map[testKey]*aggregate) int {
 func DigestIterationJSONL(r io.Reader, slowThreshold time.Duration) (IterationDigest, error) {
 	aggs := make(map[testKey]*aggregate)
 	var meta iterationScanMeta
-	if err := scanIterationJSONL(r, 0, aggs, &meta); err != nil {
+	if err := scanIterationJSONL(r, 0, aggs, &meta, slowThreshold); err != nil {
 		return IterationDigest{}, err
 	}
 	reattributeTimeouts(aggs, newAggregate)
diff --git a/internal/runner/diagnose_progress.go b/internal/runner/diagnose_progress.go
index cf2d219..9e0a44a 100644
--- a/internal/runner/diagnose_progress.go
+++ b/internal/runner/diagnose_progress.go
@@ -1,13 +1,8 @@
 package runner
 
 import (
-	"context"
-	"encoding/json"
-	"errors"
 	"fmt"
 	"io"
-	"os"
-	"os/exec"
 	"slices"
 	"strings"
 	"sync"
@@ -70,44 +65,6 @@ func packagePatternsFromEnd(args []string) []string {
 	return pkgs
 }
 
-// listTestPackageCount runs `go list -test -e` for the trailing package patterns
-// in go test arguments (see packagePatternsFromEnd). On error or no patterns,
-// returns an error or zero packages.
-func listTestPackageCount(ctx context.Context, repoRoot string, goTestArgs []string) (int, error) {
-	pkgs := packagePatternsFromEnd(goTestArgs)
-	if len(pkgs) == 0 {
-		return 0, errors.New("no package patterns in go test arguments (put packages last, after flags)")
-	}
-	// Binary is fixed ("go"); pkgs come from the user's CLI package patterns by design.
-	//nolint:gosec // G204: forwarded package patterns from CLI invocation
-	cmd := exec.CommandContext(ctx, "go", append([]string{"list", "-test", "-e", "-f", "{{.ImportPath}}"}, pkgs...)...)
-	cmd.Dir = repoRoot
-	cmd.Env = os.Environ()
-	out, err := cmd.Output()
-	if err != nil {
-		return 0, err
-	}
-	n := 0
-	for line := range strings.SplitSeq(string(out), "\n") {
-		if strings.TrimSpace(line) != "" {
-			n++
-		}
-	}
-	if n == 0 {
-		return 0, errors.New("go list returned no packages")
-	}
-	return n, nil
-}
-
-// diagnoseProgress tracks completed packages from a go test -json stream.
-type diagnoseProgress struct {
-	mu         sync.Mutex
-	done       map[string]struct{}
-	lastPkg    string
-	pkgOutcome map[string]string // package import path → pass|fail|skip (package-level events only)
-	total      int               // -1 when denominator is unknown (go list failed or empty)
-}
-
 type parallelDiagnoseProgress struct {
 	mu              sync.Mutex
 	renderMu        sync.Mutex
@@ -138,14 +95,6 @@ func newParallelDiagnoseProgressAt(totalIterations int, poolStartedAt time.Time)
 	}
 }
 
-func newDiagnoseProgress(totalPackages int) *diagnoseProgress {
-	return &diagnoseProgress{
-		done:       make(map[string]struct{}),
-		pkgOutcome: make(map[string]string),
-		total:      totalPackages,
-	}
-}
-
 func (p *parallelDiagnoseProgress) start(iteration int) {
 	if p == nil {
 		return
@@ -214,50 +163,6 @@ func (p *parallelDiagnoseProgress) renderSnapshot(
 	return completed, total, actives, poolElapsed
 }
 
-// onTestJSONLine updates state from one JSONL line. Returns true if the number
-// of completed packages increased (for throttled redraws).
-func (p *diagnoseProgress) onTestJSONLine(line []byte) (completedIncreased bool) {
-	if len(line) == 0 || line[0] != '{' {
-		return false
-	}
-	var ev TestEvent
-	if err := json.Unmarshal(line, &ev); err != nil {
-		return false
-	}
-	if ev.Package != "" {
-		p.mu.Lock()
-		p.lastPkg = ev.Package
-		p.mu.Unlock()
-	}
-	if !isPackageTerminalEvent(&ev) {
-		return false
-	}
-	p.mu.Lock()
-	defer p.mu.Unlock()
-	p.pkgOutcome[ev.Package] = ev.Action
-	before := len(p.done)
-	p.done[ev.Package] = struct{}{}
-	return len(p.done) > before
-}
-
-func isPackageTerminalEvent(ev *TestEvent) bool {
-	if ev.Package == "" || ev.Test != "" {
-		return false
-	}
-	switch ev.Action {
-	case "pass", "fail", "skip":
-		return true
-	default:
-		return false
-	}
-}
-
-func (p *diagnoseProgress) snapshot() (completed int, total int, lastPkg string, outcome string) {
-	p.mu.Lock()
-	defer p.mu.Unlock()
-	return len(p.done), p.total, p.lastPkg, p.pkgOutcome[p.lastPkg]
-}
-
 // progressBracket wraps inner (already styled) in muted square brackets.
 func progressBracket(inner string) string {
 	return termstyle.Muted.Render("[") + inner + termstyle.Muted.Render("]")
diff --git a/internal/runner/diagnose_progress_test.go b/internal/runner/diagnose_progress_test.go
index 9c4cd34..d4188e4 100644
--- a/internal/runner/diagnose_progress_test.go
+++ b/internal/runner/diagnose_progress_test.go
@@ -11,61 +11,6 @@ import (
 	"github.com/smartcontractkit/testrig/internal/output"
 )
 
-func TestDiagnoseProgress_onTestJSONLine_packageTerminal(t *testing.T) {
-	t.Parallel()
-	p := newDiagnoseProgress(2)
-
-	require.False(t, p.onTestJSONLine([]byte(`not json`)))
-	require.False(t, p.onTestJSONLine([]byte(`{"Action":"run","Package":"a/b","Test":"TestX"}`)))
-
-	require.True(t, p.onTestJSONLine([]byte(`{"Action":"pass","Package":"a/b"}`)))
-	c, tot, _, _ := p.snapshot()
-	require.Equal(t, 1, c)
-	require.Equal(t, 2, tot)
-
-	// Duplicate package-level pass must not report a second completion tick.
-	require.False(t, p.onTestJSONLine([]byte(`{"Action":"pass","Package":"a/b"}`)))
-	c, _, _, _ = p.snapshot()
-	require.Equal(t, 1, c)
-
-	require.True(t, p.onTestJSONLine([]byte(`{"Action":"fail","Package":"c/d"}`)))
-	c, _, _, _ = p.snapshot()
-	require.Equal(t, 2, c)
-}
-
-func TestDiagnoseProgress_onTestJSONLine_skipFail(t *testing.T) {
-	t.Parallel()
-	p := newDiagnoseProgress(1)
-	require.True(t, p.onTestJSONLine([]byte(`{"Action":"skip","Package":"p"}`)))
-	c, _, _, _ := p.snapshot()
-	require.Equal(t, 1, c)
-
-	p2 := newDiagnoseProgress(1)
-	require.True(t, p2.onTestJSONLine([]byte(`{"Action":"fail","Package":"p"}`)))
-	c2, _, _, _ := p2.snapshot()
-	require.Equal(t, 1, c2)
-}
-
-func TestDiagnoseProgress_lastPkgUpdates(t *testing.T) {
-	t.Parallel()
-	p := newDiagnoseProgress(10)
-	p.onTestJSONLine([]byte(`{"Action":"run","Package":"x/y","Test":"TestZ"}`))
-	_, _, last, _ := p.snapshot()
-	require.Equal(t, "x/y", last)
-}
-
-func TestDiagnoseProgress_pkgOutcomeOnTerminal(t *testing.T) {
-	t.Parallel()
-	p := newDiagnoseProgress(5)
-	p.onTestJSONLine([]byte(`{"Action":"run","Package":"p/q","Test":"TestZ"}`))
-	_, _, _, out := p.snapshot()
-	require.Empty(t, out)
-	p.onTestJSONLine([]byte(`{"Action":"pass","Package":"p/q"}`))
-	_, _, last, out := p.snapshot()
-	require.Equal(t, "p/q", last)
-	require.Equal(t, "pass", out)
-}
-
 func TestEllipsizeRight(t *testing.T) {
 	t.Parallel()
 	require.Equal(t, "short", ellipsizeRight("short", 10))
diff --git a/internal/runner/runner.go b/internal/runner/runner.go
index 86b9b6a..fbbd8a8 100644
--- a/internal/runner/runner.go
+++ b/internal/runner/runner.go
@@ -34,6 +34,7 @@ type diagnoseIterationParams struct {
 	Out              *output.Printer
 	ResultsDir       string
 	GoTestArgs       []string
+	ModuleDir        string
 	Iteration        int
 	ShuffleSeed      int64
 	Env              []string
@@ -102,8 +103,6 @@ func Gotestsum(ctx context.Context, conf *config.App, args []string) error {
 // iterSetup and iterTeardown run before/after each iteration. Either may be
 // nil. Teardown runs even when the iteration's go test invocation fails; its
 // error is reported only when the iteration itself succeeded.
-//
-//nolint:gocyclo
 func Diagnose(
 	ctx context.Context,
 	conf *config.App,
@@ -384,6 +383,12 @@ func runDiagnoseIterations(
 	if hooks.runIteration == nil {
 		hooks.runIteration = diagnoseIteration
 	}
+
+	moduleDir, adjustedArgs, err := resolveModuleDir(conf.RepoRoot, goTestArgs)
+	if err != nil {
+		return diagnoseRunState{}, err
+	}
+
 	if hooks.seed == nil {
 		hooks.seed = func() int64 { return rand.Int64N(1<<62) + 1 } //nolint:gosec // G404: non-crypto seed for test shuffle
 	}
@@ -432,7 +437,8 @@ func runDiagnoseIterations(
 		conf:             conf,
 		out:              out,
 		resultsDir:       resultsDir,
-		goTestArgs:       goTestArgs,
+		goTestArgs:       adjustedArgs,
+		moduleDir:        moduleDir,
 		hooks:            hooks,
 		parallel:         parallel,
 		parallelProgress: parallelProgress,
@@ -487,6 +493,7 @@ type diagnoseWorker struct {
 	out              *output.Printer
 	resultsDir       string
 	goTestArgs       []string
+	moduleDir        string
 	hooks            diagnoseRunHooks
 	parallel         int
 	parallelProgress *parallelDiagnoseProgress
@@ -529,6 +536,7 @@ func (w *diagnoseWorker) run(runCtx context.Context, resource diagnoseIterationR
 			Out:              w.out,
 			ResultsDir:       w.resultsDir,
 			GoTestArgs:       w.goTestArgs,
+			ModuleDir:        w.moduleDir,
 			Iteration:        iteration,
 			ShuffleSeed:      seed,
 			Env:              resource.Env,
@@ -1098,16 +1106,12 @@ func (sw *syncedWriter) Write(p []byte) (int, error) {
 
 func diagnoseIteration(ctx context.Context, p diagnoseIterationParams) error {
 	conf, out := p.Conf, p.Out
-	resultsDir, goTestArgs := p.ResultsDir, p.GoTestArgs
+	resultsDir := p.ResultsDir
 	iteration, shuffleSeed := p.Iteration, p.ShuffleSeed
 	env := p.Env
 	liveProgress, parallelProgress := p.LiveProgress, p.ParallelProgress
 	diagnoseRunStart, serialProgressMu := p.DiagnoseRunStart, p.SerialProgressMu
-
-	moduleDir, goTestArgs, err := resolveModuleDir(conf.RepoRoot, goTestArgs)
-	if err != nil {
-		return err
-	}
+	moduleDir, goTestArgs := p.ModuleDir, p.GoTestArgs
 
 	start := time.Now()
 	jsonPath := filepath.Join(resultsDir, fmt.Sprintf("iteration-%d.log.jsonl", iteration))
@@ -1115,7 +1119,16 @@ func diagnoseIteration(ctx context.Context, p diagnoseIterationParams) error {
 	if err != nil {
 		return err
 	}
-	defer func() { _ = resultsFile.Close() }()
+	bw := bufio.NewWriterSize(resultsFile, 128*1024)
+	var retErr error
+	defer func() {
+		if err := bw.Flush(); err != nil && retErr == nil {
+			retErr = err
+		}
+		if err := resultsFile.Close(); err != nil && retErr == nil {
+			retErr = err
+		}
+	}()
 
 	args, err := buildDiagnoseArgs(goTestArgs, shuffleSeed)
 	if err != nil {
@@ -1130,29 +1143,20 @@ func diagnoseIteration(ctx context.Context, p diagnoseIterationParams) error {
 	cmd.Cancel = func() error { return cmd.Process.Signal(os.Interrupt) }
 	cmd.WaitDelay = 5 * time.Second
 
-	if out.AIOutput() {
-		sw := &syncedWriter{w: resultsFile}
-		cmd.Stdout = sw
-		cmd.Stderr = sw
-		return cmd.Run()
-	}
-
-	sw := &syncedWriter{w: resultsFile}
+	sw := &syncedWriter{w: bw}
+	cmd.Stdout = sw
 	cmd.Stderr = sw
 
-	totalPkgs := -1
-	if n, listErr := listTestPackageCount(ctx, moduleDir, goTestArgs); listErr == nil {
-		totalPkgs = n
+	if out.AIOutput() {
+		retErr = cmd.Run()
+		return retErr
 	}
-	prog := newDiagnoseProgress(totalPkgs)
+
 	if parallelProgress != nil {
 		parallelProgress.start(iteration)
 		defer parallelProgress.finish(iteration)
 	}
 
-	pr, pw := io.Pipe()
-	cmd.Stdout = pw
-
 	live := liveProgress && out.LiveInlineProgress()
 	iter, iters := iteration+1, conf.Iterations
 	if liveProgress && !live {
@@ -1175,30 +1179,6 @@ func diagnoseIteration(ctx context.Context, p diagnoseIterationParams) error {
 		)
 	}
 
-	var readWG sync.WaitGroup
-	var scanErr error
-	readWG.Go(func() {
-		r := bufio.NewReaderSize(pr, 1024*1024)
-		for {
-			line, err := r.ReadBytes('\n')
-			if len(line) > 0 {
-				if _, werr := sw.Write(line); werr != nil {
-					break
-				}
-				completedIncreased := prog.onTestJSONLine(line)
-				if completedIncreased && !live {
-					redraw(false)
-				}
-			}
-			if err != nil {
-				if err != io.EOF {
-					scanErr = err
-				}
-				break
-			}
-		}
-	})
-
 	tickDone := make(chan struct{})
 	var tickWG sync.WaitGroup
 	if live {
@@ -1217,25 +1197,14 @@ func diagnoseIteration(ctx context.Context, p diagnoseIterationParams) error {
 		redraw(true)
 	}
 
-	runErr := cmd.Start()
-	started := runErr == nil
-	if started {
-		runErr = cmd.Wait()
-		_ = pw.Close()
-	} else {
-		_ = pw.CloseWithError(runErr)
-	}
-	readWG.Wait()
+	retErr = cmd.Run()
 	close(tickDone)
 	tickWG.Wait()
 
 	if live {
 		out.ClearInline()
 	}
-	if scanErr != nil {
-		return fmt.Errorf("reading go test output: %w", scanErr)
-	}
-	return runErr
+	return retErr
 }
 
 func newRunMeta(
diff --git a/internal/runner/runner_bench_test.go b/internal/runner/runner_bench_test.go
new file mode 100644
index 0000000..e5c719f
--- /dev/null
+++ b/internal/runner/runner_bench_test.go
@@ -0,0 +1,422 @@
+package runner
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"sync"
+	"testing"
+	"text/tabwriter"
+	"time"
+
+	"github.com/stretchr/testify/require"
+	"golang.org/x/sync/semaphore"
+
+	"github.com/smartcontractkit/testrig/internal/config"
+	"github.com/smartcontractkit/testrig/internal/output"
+)
+
+const (
+	benchDummyTarget   = "./internal/runner/testdata/dummy/..."
+	benchDogfoodTarget = "./..."
+)
+
+// baselineWorkload runs the raw `go test -json` floor for one diagnose-equivalent
+// workload: `iterations` invocations against target, at most `parallel` running
+// concurrently (mirroring how Diagnose schedules iterations across workers).
+func baselineWorkload(ctx context.Context, repoRoot, target string, iterations, parallel int) error {
+	if parallel < 1 {
+		parallel = 1
+	}
+	sem := semaphore.NewWeighted(int64(parallel))
+	var wg sync.WaitGroup
+	var mu sync.Mutex
+	var firstErr error
+	for range iterations {
+		if err := sem.Acquire(ctx, 1); err != nil {
+			wg.Wait()
+			return err
+		}
+		wg.Go(func() {
+			defer sem.Release(1)
+			//nolint:gosec // G204: target is fixed per test (dummy or ./...)
+			cmd := exec.CommandContext(ctx, "go", "test", "-json", "-count=1", target)
+			cmd.Dir = repoRoot
+			cmd.Env = envWithoutKey(os.Environ(), overheadMatrixEnv)
+			cmd.Stdout = io.Discard
+			cmd.Stderr = io.Discard
+			if err := cmd.Run(); err != nil {
+				mu.Lock()
+				if firstErr == nil {
+					firstErr = err
+				}
+				mu.Unlock()
+			}
+		})
+	}
+	wg.Wait()
+	return firstErr
+}
+
+// diagnoseWorkload runs one Diagnose call against target with the given iteration
+// count and parallelism. Output is discarded.
+func diagnoseWorkload(
+	ctx context.Context,
+	out *output.Printer,
+	repoRoot, target string,
+	iterations, parallel int,
+) error {
+	conf := &config.App{
+		RepoRoot:           repoRoot,
+		Iterations:         iterations,
+		ParallelIterations: parallel,
+		SlowThreshold:      time.Second,
+	}
+	return Diagnose(ctx, conf, out, []string{target}, nil, nil)
+}
+
+func envWithoutKey(env []string, key string) []string {
+	prefix := key + "="
+	out := make([]string, 0, len(env))
+	for _, e := range env {
+		if e == key || strings.HasPrefix(e, prefix) {
+			continue
+		}
+		out = append(out, e)
+	}
+	return out
+}
+
+// existingDiagnoseDirs lists the diagnose-* result dirs currently in repoRoot.
+func existingDiagnoseDirs(repoRoot string) []string {
+	matches, _ := filepath.Glob(filepath.Join(repoRoot, "diagnose-*"))
+	return matches
+}
+
+// cleanupNewDiagnoseDirs removes any diagnose-* result dirs created during the
+// benchmark, so repeated runs don't accumulate output dirs in the repo root.
+func cleanupNewDiagnoseDirs(tb testing.TB, repoRoot string) {
+	tb.Helper()
+	before := make(map[string]struct{})
+	for _, d := range existingDiagnoseDirs(repoRoot) {
+		before[d] = struct{}{}
+	}
+	tb.Cleanup(func() {
+		for _, d := range existingDiagnoseDirs(repoRoot) {
+			if _, ok := before[d]; !ok {
+				_ = os.RemoveAll(d)
+			}
+		}
+	})
+}
+
+// BenchmarkBaselineGoTest is the floor: raw `go test -json` against the same
+// target Diagnose runs. Subtract its ns/op, B/op, allocs/op from
+// BenchmarkDiagnose to read the overhead Diagnose adds.
+func BenchmarkBaselineGoTest(b *testing.B) {
+	repoRoot, err := filepath.Abs("../..")
+	require.NoError(b, err)
+	ctx := context.Background()
+
+	b.ReportAllocs()
+	for b.Loop() {
+		require.NoError(b, baselineWorkload(ctx, repoRoot, benchDummyTarget, 1, 1))
+	}
+}
+
+// BenchmarkDiagnose runs one Diagnose iteration against the same target as
+// BenchmarkBaselineGoTest. ns/op minus baseline is the overhead Diagnose adds.
+func BenchmarkDiagnose(b *testing.B) {
+	repoRoot, err := filepath.Abs("../..")
+	require.NoError(b, err)
+	cleanupNewDiagnoseDirs(b, repoRoot)
+
+	out := output.NewForTest(true, io.Discard, io.Discard, false)
+	ctx := context.Background()
+
+	b.ReportAllocs()
+	for b.Loop() {
+		require.NoError(b, diagnoseWorkload(ctx, out, repoRoot, benchDummyTarget, 1, 1))
+	}
+}
+
+// overheadConfig is one (iterations, parallel) point in the overhead matrix.
+type overheadConfig struct {
+	iterations int
+	parallel   int
+}
+
+// overheadRow pairs a config with its measured baseline and diagnose results.
+type overheadRow struct {
+	cfg      overheadConfig
+	baseline testing.BenchmarkResult
+	diagnose testing.BenchmarkResult
+}
+
+// overheadMatrix is the set of (iterations, parallel) points measured by
+// runDiagnoseOverheadMatrix: single run, sequential iterations, then parallel.
+var overheadMatrix = []overheadConfig{
+	{iterations: 1, parallel: 1},
+	{iterations: 4, parallel: 1},
+	{iterations: 4, parallel: 4},
+	{iterations: 8, parallel: 1},
+	{iterations: 8, parallel: 8},
+}
+
+// overheadMatrixEnv gates TestDiagnoseOverhead_*; it spawns many `go test`
+// subprocesses and is too slow for the normal test run.
+const overheadMatrixEnv = "TESTRIG_BENCH_OVERHEAD"
+
+// overheadMatrixRunsEnv sets how many times each matrix cell is benchmarked
+// before averaging (default 5, capped at 10). Use 3–5 for stabler numbers; 1 for a quick smoke.
+const overheadMatrixRunsEnv = "TESTRIG_BENCH_OVERHEAD_RUNS"
+
+const (
+	overheadMatrixRunsDefault = 5
+	overheadMatrixRunsMax     = 10
+)
+
+func skipUnlessDiagnoseOverheadMatrix(t *testing.T) {
+	t.Helper()
+	if os.Getenv(overheadMatrixEnv) == "" {
+		t.Skipf("set %s=1 to run the diagnose overhead matrix", overheadMatrixEnv)
+	}
+	if testing.Short() {
+		t.Skip("skipping diagnose overhead matrix in short mode")
+	}
+}
+
+func overheadMatrixRuns() int {
+	s := strings.TrimSpace(os.Getenv(overheadMatrixRunsEnv))
+	if s == "" {
+		return overheadMatrixRunsDefault
+	}
+	n, err := strconv.Atoi(s)
+	if err != nil || n < 1 {
+		return overheadMatrixRunsDefault
+	}
+	return min(n, overheadMatrixRunsMax)
+}
+
+// averageBenchmarkResults averages per-op metrics across repeated benchmark runs.
+func averageBenchmarkResults(results []testing.BenchmarkResult) testing.BenchmarkResult {
+	if len(results) == 0 {
+		return testing.BenchmarkResult{}
+	}
+	var ns, bytes, allocs int64
+	for _, r := range results {
+		ns += r.NsPerOp()
+		bytes += r.AllocedBytesPerOp()
+		allocs += r.AllocsPerOp()
+	}
+	n := int64(len(results))
+	avgBytes := bytes / n
+	avgAllocs := allocs / n
+	if avgBytes < 0 {
+		avgBytes = 0
+	}
+	if avgAllocs < 0 {
+		avgAllocs = 0
+	}
+	return testing.BenchmarkResult{
+		N:         1,
+		T:         time.Duration(ns / n),
+		MemBytes:  uint64(avgBytes),
+		MemAllocs: uint64(avgAllocs),
+	}
+}
+
+// repeatBenchmark runs fn runs times and returns the averaged result.
+// phase labels log lines; pass "" to omit per-run logging.
+func repeatBenchmark(t *testing.T, runs int, phase string, fn func() testing.BenchmarkResult) testing.BenchmarkResult {
+	t.Helper()
+	results := make([]testing.BenchmarkResult, runs)
+	for i := range runs {
+		start := time.Now()
+		results[i] = fn()
+		if phase != "" {
+			t.Logf("%s: run %d/%d done (wall %s, %s/op)",
+				phase, i+1, runs, time.Since(start).Round(time.Second), roundedDur(results[i].NsPerOp()))
+		}
+	}
+	avg := averageBenchmarkResults(results)
+	if phase != "" {
+		t.Logf("%s: mean %s/op over %d runs", phase, roundedDur(avg.NsPerOp()), runs)
+	}
+	return avg
+}
+
+// runDiagnoseOverheadMatrix measures Diagnose overhead vs the raw `go test` floor
+// across overheadMatrix for target and logs a diff table. It is a test helper, not
+// a Benchmark, because it drives testing.Benchmark internally (which deadlocks if
+// called from a benchmark).
+func runDiagnoseOverheadMatrix(t *testing.T, label, target string) {
+	t.Helper()
+	skipUnlessDiagnoseOverheadMatrix(t)
+
+	repoRoot, err := filepath.Abs("../..")
+	require.NoError(t, err)
+	cleanupNewDiagnoseDirs(t, repoRoot)
+
+	// Child `go test` processes must not see TESTRIG_BENCH_OVERHEAD (dogfood runs ./...).
+	t.Setenv(overheadMatrixEnv, "")
+
+	out := output.NewForTest(true, io.Discard, io.Discard, false)
+	ctx := context.Background()
+	runs := overheadMatrixRuns()
+	total := len(overheadMatrix)
+	t.Logf("[%s] overhead matrix: target=%s, %d cells, %d runs/cell (%s overrides)",
+		label, target, total, runs, overheadMatrixRunsEnv)
+
+	rows := make([]overheadRow, 0, total)
+	for cell, cfg := range overheadMatrix {
+		cellLabel := fmt.Sprintf("[%s] cell %d/%d iters=%d parallel=%d",
+			label, cell+1, total, cfg.iterations, cfg.parallel)
+
+		base := repeatBenchmark(t, runs, cellLabel+" baseline", func() testing.BenchmarkResult {
+			r := testing.Benchmark(func(b *testing.B) {
+				b.ReportAllocs()
+				for b.Loop() {
+					require.NoError(b, baselineWorkload(ctx, repoRoot, target, cfg.iterations, cfg.parallel))
+				}
+			})
+			require.NotZero(t, r.N, "baseline workload failed for %+v", cfg)
+			return r
+		})
+
+		diag := repeatBenchmark(t, runs, cellLabel+" diagnose", func() testing.BenchmarkResult {
+			r := testing.Benchmark(func(b *testing.B) {
+				b.ReportAllocs()
+				for b.Loop() {
+					require.NoError(b, diagnoseWorkload(ctx, out, repoRoot, target, cfg.iterations, cfg.parallel))
+				}
+			})
+			require.NotZero(t, r.N, "diagnose workload failed for %+v", cfg)
+			return r
+		})
+
+		overheadNs := diag.NsPerOp() - base.NsPerOp()
+		t.Logf("%s: done — overhead %s (%s of diagnose; baseline %s, diagnose %s)",
+			cellLabel, overheadDur(overheadNs), overheadPercent(overheadNs, diag.NsPerOp()),
+			roundedDur(base.NsPerOp()), roundedDur(diag.NsPerOp()))
+
+		rows = append(rows, overheadRow{cfg: cfg, baseline: base, diagnose: diag})
+	}
+	printDiagnoseOverhead(t, label, target, runs, rows)
+}
+
+// TestDiagnoseOverhead_Dummy runs the overhead matrix against the tiny dummy package.
+// Run via `just bench_overhead_matrix_dummy`. Each cell is benchmarked 5 times by default
+// and averaged; set TESTRIG_BENCH_OVERHEAD_RUNS (e.g. 3) to tune accuracy vs wall time.
+//
+//nolint:paralleltest // serial by design: spawns many go test subprocesses and measures wall time.
+func TestDiagnoseOverhead_Dummy(t *testing.T) {
+	runDiagnoseOverheadMatrix(t, "dummy", benchDummyTarget)
+}
+
+// TestDiagnoseOverhead_Dogfood runs the overhead matrix against the full testrig module (./...).
+// Run via `just bench_overhead_matrix_dogfood`; expect much longer wall time than dummy.
+//
+//nolint:paralleltest // serial by design: spawns many go test subprocesses and measures wall time.
+func TestDiagnoseOverhead_Dogfood(t *testing.T) {
+	runDiagnoseOverheadMatrix(t, "dogfood", benchDogfoodTarget)
+}
+
+// roundedDur renders ns as a duration rounded to microseconds for the table.
+func roundedDur(ns int64) string {
+	return time.Duration(ns).Round(time.Microsecond).String()
+}
+
+// overheadDur renders overhead; negative deltas (noise) show as 0.
+func overheadDur(ns int64) string {
+	if ns < 0 {
+		ns = 0
+	}
+	return roundedDur(ns)
+}
+
+// overheadPercent is overhead as a share of diagnose runtime (overhead / diagnose).
+func overheadPercent(overheadNs, diagnoseNs int64) string {
+	if diagnoseNs <= 0 {
+		return "n/a"
+	}
+	if overheadNs < 0 {
+		overheadNs = 0
+	}
+	return fmt.Sprintf("%.1f%%", float64(overheadNs)*100/float64(diagnoseNs))
+}
+
+// printDiagnoseOverhead logs a table of baseline vs diagnose wall time per config.
+// overhead = diagnose ns/op - baseline ns/op; overhead/iter divides by iterations.
+// Each cell is the mean of runs repeated benchmark invocations.
+func printDiagnoseOverhead(t *testing.T, label, target string, runs int, rows []overheadRow) {
+	t.Helper()
+	var sb strings.Builder
+	tw := tabwriter.NewWriter(&sb, 0, 0, 2, ' ', 0)
+	_, _ = fmt.Fprintln(tw, "iters\tparallel\tbaseline\tdiagnose\toverhead\toverhead%\toverhead/iter")
+	for _, r := range rows {
+		overheadNs := r.diagnose.NsPerOp() - r.baseline.NsPerOp()
+		perIterNs := overheadNs / int64(max(r.cfg.iterations, 1))
+		diagNs := r.diagnose.NsPerOp()
+		_, _ = fmt.Fprintf(tw, "%d\t%d\t%s\t%s\t%s\t%s\t%s\n",
+			r.cfg.iterations,
+			r.cfg.parallel,
+			roundedDur(r.baseline.NsPerOp()),
+			roundedDur(diagNs),
+			overheadDur(overheadNs),
+			overheadPercent(overheadNs, diagNs),
+			overheadDur(perIterNs),
+		)
+	}
+	_ = tw.Flush()
+	t.Logf(`
+--------------------------------------------------------------------------------
+Diagnose overhead vs raw go test (%s, target=%s, %d-run average per cell)
+--------------------------------------------------------------------------------
+%s`,
+		label, target, runs, sb.String())
+}
+
+func TestAverageBenchmarkResults(t *testing.T) {
+	t.Parallel()
+	avg := averageBenchmarkResults([]testing.BenchmarkResult{
+		{N: 10, T: 1_000, MemBytes: 1_000, MemAllocs: 100},
+		{N: 10, T: 3_000, MemBytes: 3_000, MemAllocs: 300},
+	})
+	require.Equal(t, int64(200), avg.NsPerOp())
+	require.Equal(t, int64(200), avg.AllocedBytesPerOp())
+	require.Equal(t, int64(20), avg.AllocsPerOp())
+}
+
+func TestOverheadPercent(t *testing.T) {
+	t.Parallel()
+	require.Equal(t, "20.0%", overheadPercent(20, 100))
+	require.Equal(t, "0.0%", overheadPercent(-5, 100))
+	require.Equal(t, "n/a", overheadPercent(10, 0))
+}
+
+// TestOverheadMatrixRuns sets the variable in every case (including empty), so a
+// TESTRIG_BENCH_OVERHEAD_RUNS value inherited from the caller cannot break the default check.
+func TestOverheadMatrixRuns(t *testing.T) {
+	def, limit := overheadMatrixRunsDefault, overheadMatrixRunsMax
+	for val, want := range map[string]int{"": def, "3": 3, "99": limit, "nope": def} {
+		t.Setenv(overheadMatrixRunsEnv, val)
+		require.Equal(t, want, overheadMatrixRuns(), "env value %q", val)
+	}
+}
+
+func BenchmarkResolveModuleDir(b *testing.B) {
+	repoRoot, err := filepath.Abs("../..")
+	require.NoError(b, err)
+	args := []string{"./internal/runner/..."}
+
+	for b.Loop() {
+		_, _, err := resolveModuleDir(repoRoot, args)
+		require.NoError(b, err)
+	}
+}
diff --git a/internal/runner/testdata/dummy/dummy_test.go b/internal/runner/testdata/dummy/dummy_test.go
new file mode 100644
index 0000000..f400466
--- /dev/null
+++ b/internal/runner/testdata/dummy/dummy_test.go
@@ -0,0 +1,5 @@
+package dummy
+
+import "testing"
+
+func TestDummy(t *testing.T) {}
diff --git a/justfile b/justfile
index dc677a4..c129a70 100644
--- a/justfile
+++ b/justfile
@@ -14,9 +14,20 @@ test:
 test_race:
     go tool gotestsum -- -race ./...
 
-# Run benchmarks with memory stats and specific CPU counts
+# Run standard benchmarks
 bench:
-    go test -bench=. -benchmem -run=^$ ./... -cpu=2,4,8
+    go test -bench=. -benchmem -run=^$ ./...
+
+# Diagnose overhead matrix (dummy; fast). 5 runs averaged per cell; TESTRIG_BENCH_OVERHEAD_RUNS=3 to override.
+bench_overhead_matrix_dummy:
+    TESTRIG_BENCH_OVERHEAD=1 go test ./internal/runner/ -run='^TestDiagnoseOverhead_Dummy$' -count=1 -v
+
+# Run benchmark to measure diagnose overhead against the full testrig module (./...); slow.
+bench_overhead_matrix_dogfood:
+    TESTRIG_BENCH_OVERHEAD=1 go test ./internal/runner/ -run='^TestDiagnoseOverhead_Dogfood$' -count=1 -v
+
+# Run benchmarks to measure diagnose overhead for both dummy and dogfood targets.
+bench_overhead_matrix: bench_overhead_matrix_dummy bench_overhead_matrix_dogfood
 
 # Local GoReleaser dry-run (snapshot)
 goreleaser:
diff --git a/lefthook.yml b/lefthook.yml
index bbbcf28..3e25b95 100644
--- a/lefthook.yml
+++ b/lefthook.yml
@@ -54,14 +54,19 @@ pre-commit:
 
     golangci-lint:
       tags: linter
-      glob: "*.go"
-      root: ""
-      run: golangci-lint run --fast-only --fix ./...
+      glob:
+        - "*.go"
+        - "**/*.go"
+        - "go.mod"
+        - "go.sum"
+      run: golangci-lint run --fix --fast-only ./...
       stage_fixed: true
 
     go-generate:
       tags: linter
-      glob: "*.go"
+      glob:
+        - "*.go"
+        - "**/*.go"
       run: |
         before=$(git status --porcelain)
         go generate ./...
@@ -75,7 +80,11 @@ pre-commit:
 
     go-mod-tidy:
       tags: linter
-      glob: "go.mod"
+      glob:
+        - "go.mod"
+        - "go.sum"
+        - "*.go"
+        - "**/*.go"
       run: |
         before=$(git status --porcelain)
         while IFS= read -r mod; do
@@ -97,5 +106,9 @@ pre-push:
   commands:
     go-short-tests:
       tags: test
-      glob: "*.go"
+      glob:
+        - "*.go"
+        - "**/*.go"
+        - "go.mod"
+        - "go.sum"
       run: go test -short ./...