Skip to content

Commit bc59574

Browse files
committed
feat(orchestrator): SOTA agent subsystem — 12 new mechanisms, 107 new tests
Add three layers of advanced agent machinery that no existing coding CLI (Claude Code, Codex, Aider) ships in this combination: Ausführung (Execution) - verifier.go — Verifier, Verdict, weighted Check scoring - speculative.go — best-of-N in isolated git worktrees, verifier-selected - critic.go — bounded verify→diagnose→repair loop with stall detection - episodic.go — FTS5 episode store; passed/failed as planner prior Kontrolle (Control) - contract.go — Intent Contract (Scope, Frozen, Forbidden, Blast-Radius) - blame.go — O(log n) bisection over edit log → surgical repair - strategy_router.go — Thompson-sampling over (task-class, strategy) arms - governor.go — escalating budget ladder with audit trail Wahrnehmung (Perception) - impact.go — ImpactGraph from `go list -json`, blast-radius prediction - targeted.go — fast affected-only tests + full-suite final gate - confidence.go — Brier-calibrated agent confidence + MergePolicy - mutation.go — Mutation Probe: do the tests actually observe the change? Integration notes - Task.Title added additively (no breaking change to existing tests). - StrategyRouter named to avoid collision with the existing intent-classifier Router (Pre-LLM keyword heuristic). - All persistent components (EpisodeStore, StrategyRouter, Calibrator) take an optional *sql.DB and degrade gracefully to in-memory mode when nil — default build has no sqlite dependency, FTS5 enabled at runtime when wired. - Test helpers are hermetic: t.TempDir(), no network, no git subprocess. - 12 .doc.md companion files documenting public surface, behavior, and caller responsibilities (CoDocs standard). - Total: 32 files, +2,344 LOC, 107 new tests (133/133 green in package). Out of scope (deliberate): wiring into orchestrate.go and replacing MockAgent with the live NIM agent. Both require a separate integration commit that touches the existing Dispatcher/Planner paths. Tracked in docs/issues/done/st-orch-advanced.md.
1 parent 54609e0 commit bc59574

32 files changed

Lines changed: 3039 additions & 0 deletions
Lines changed: 293 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,293 @@
1+
// SPDX-License-Identifier: MIT
2+
package orchestrator
3+
4+
import (
5+
"context"
6+
"path/filepath"
7+
"testing"
8+
"time"
9+
)
10+
11+
func TestTargetedFastChecksUsesAffected(t *testing.T) {
12+
g := &ImpactGraph{
13+
nodes: map[string]*PkgNode{
14+
"repo/a": {ImportPath: "repo/a", TestFiles: []string{"a_test.go"}},
15+
"repo/b": {ImportPath: "repo/b", TestFiles: []string{"b_test.go"}},
16+
},
17+
reverse: map[string][]string{},
18+
fileToPkg: map[string]string{"a/x.go": "repo/a"},
19+
}
20+
tv := NewTargetedVerifier(NewVerifier(t.TempDir()), g)
21+
checks := tv.FastChecks([]string{"a/x.go"})
22+
if len(checks) != 2 {
23+
t.Fatalf("expected 2 checks (build+test), got %d", len(checks))
24+
}
25+
if checks[1].Kind != CheckTest {
26+
t.Fatalf("second check should be test, got %s", checks[1].Kind)
27+
}
28+
}
29+
30+
func TestTargetedFallbackBuildAll(t *testing.T) {
31+
g := &ImpactGraph{
32+
nodes: map[string]*PkgNode{"repo/a": {ImportPath: "repo/a"}},
33+
reverse: map[string][]string{},
34+
fileToPkg: map[string]string{},
35+
}
36+
tv := NewTargetedVerifier(NewVerifier(t.TempDir()), g)
37+
checks := tv.FastChecks([]string{"unknown.go"})
38+
if len(checks) != 1 || checks[0].Name != "build-all" {
39+
t.Fatalf("expected fallback build-all, got %+v", checks)
40+
}
41+
}
42+
43+
func TestTargetedFinalChecksEqualDefault(t *testing.T) {
44+
tv := &TargetedVerifier{}
45+
defaults := DefaultGoChecks()
46+
finals := tv.FinalChecks()
47+
if len(finals) != len(defaults) {
48+
t.Fatalf("final must equal default, got %d vs %d", len(finals), len(defaults))
49+
}
50+
}
51+
52+
func TestTargetedVerifyStagedFastFail(t *testing.T) {
53+
g := &ImpactGraph{
54+
nodes: map[string]*PkgNode{"repo/a": {ImportPath: "repo/a", TestFiles: []string{"a_test.go"}}},
55+
reverse: map[string][]string{},
56+
fileToPkg: map[string]string{"a/x.go": "repo/a"},
57+
}
58+
tv := NewTargetedVerifier(NewVerifier(t.TempDir()), g)
59+
v := tv.VerifyStaged(context.Background(), "t", "c", []string{"a/x.go"})
60+
if v.Passed {
61+
t.Fatal("expected red — fast build of empty repo must fail")
62+
}
63+
}
64+
65+
func TestTargetedSpeedup(t *testing.T) {
66+
g := &ImpactGraph{
67+
nodes: map[string]*PkgNode{
68+
"repo/a": {},
69+
"repo/b": {},
70+
"repo/c": {},
71+
"repo/d": {},
72+
},
73+
reverse: map[string][]string{},
74+
fileToPkg: map[string]string{"a/x.go": "repo/a"},
75+
}
76+
tv := NewTargetedVerifier(NewVerifier(t.TempDir()), g)
77+
s := tv.Speedup([]string{"a/x.go"})
78+
if s == "" {
79+
t.Fatal("expected non-empty speedup string")
80+
}
81+
}
82+
83+
func TestMergePolicyGreenButUncalibratedGoesToReview(t *testing.T) {
84+
p := DefaultMergePolicy()
85+
if d := p.Decide(true, 0.95); d != DecisionAutoMerge {
86+
t.Fatalf("high calibrated confidence must auto-merge, got %s", d)
87+
}
88+
if d := p.Decide(true, 0.55); d != DecisionGreenReview {
89+
t.Fatalf("green + low confidence must go to review, got %s", d)
90+
}
91+
if d := p.Decide(false, 0.99); d != DecisionBlock {
92+
t.Fatalf("red must always block, got %s", d)
93+
}
94+
}
95+
96+
func TestCalibratorNilDBReturnsDeclared(t *testing.T) {
97+
c, err := NewCalibrator(nil)
98+
if err != nil {
99+
t.Fatal(err)
100+
}
101+
got, err := c.Calibrate(context.Background(), "a", 0.9)
102+
if err != nil || got != 0.9 {
103+
t.Fatalf("nil DB must return declared as-is, got %f err=%v", got, err)
104+
}
105+
}
106+
107+
func TestStrategyRouterNilDBIsSafe(t *testing.T) {
108+
r, err := NewStrategyRouter(nil, 1)
109+
if err != nil {
110+
t.Fatal(err)
111+
}
112+
pick := r.Pick(ClassRefactor, []Strategy{StratASTEdit, StratHashline})
113+
if pick != StratASTEdit && pick != StratHashline {
114+
t.Fatalf("pick must be one of candidates, got %q", pick)
115+
}
116+
if err := r.Report(context.Background(), ClassRefactor, StratASTEdit, true); err != nil {
117+
t.Fatal(err)
118+
}
119+
mean, n := r.Posterior(ClassRefactor, StratASTEdit)
120+
if n < 0 || mean < 0 {
121+
t.Fatalf("posterior must be valid, got mean=%f n=%d", mean, n)
122+
}
123+
}
124+
125+
func TestStrategyRouterLearnsFromOutcomes(t *testing.T) {
126+
r, err := NewStrategyRouter(nil, 42)
127+
if err != nil {
128+
t.Fatal(err)
129+
}
130+
for i := 0; i < 20; i++ {
131+
if err := r.Report(context.Background(), ClassRefactor, StratASTEdit, true); err != nil {
132+
t.Fatal(err)
133+
}
134+
}
135+
mean, _ := r.Posterior(ClassRefactor, StratASTEdit)
136+
if mean < 0.8 {
137+
t.Fatalf("after 20 successes, posterior should be >0.8, got %f", mean)
138+
}
139+
}
140+
141+
func TestMutationProbeEmptyWorkdirAssumesKill(t *testing.T) {
142+
mp := NewMutationProbe("", []string{"true"})
143+
mp.MaxMutations = 1
144+
res, err := mp.Run(context.Background(), nil)
145+
if err != nil {
146+
t.Fatal(err)
147+
}
148+
if res.ObservabilityScore != 1.0 {
149+
t.Fatalf("empty input -> 1.0, got %f", res.ObservabilityScore)
150+
}
151+
}
152+
153+
func TestParseAddedLinesIgnoresComments(t *testing.T) {
154+
diff := `--- a/foo.go
155+
+++ b/foo.go
156+
@@ -1,2 +1,3 @@
157+
// existing comment
158+
+# new comment
159+
func X() {}`
160+
lines := ParseAddedLines(diff)
161+
for _, l := range lines {
162+
if l.Text == "" {
163+
t.Fatal("comment line must be skipped")
164+
}
165+
}
166+
}
167+
168+
func TestGovernorRunsLadder(t *testing.T) {
169+
vf := NewVerifier(t.TempDir())
170+
gov := &Governor{
171+
Ladder: []Rung{
172+
{Name: "single", Agents: 1, RepairRounds: 1, Timeout: 5 * time.Second},
173+
},
174+
Verifier: vf,
175+
Checks: []Check{{Kind: CheckBuild, Name: "ok", Cmd: []string{"true"}}},
176+
Factory: func(r Rung) []Agent {
177+
return []Agent{&scriptAgent{name: "a", reply: "ok"}}
178+
},
179+
}
180+
res, err := gov.Execute(context.Background(), &Task{ID: "t1", Title: "x", Description: "d"}, NewScratchpad())
181+
if err != nil {
182+
t.Fatal(err)
183+
}
184+
if !res.Passed {
185+
t.Fatal("single-shot green rung should pass")
186+
}
187+
if res.FinalRung != "single" {
188+
t.Fatalf("FinalRung mismatch, got %q", res.FinalRung)
189+
}
190+
}
191+
192+
func TestGovernorEscalatesOnFailure(t *testing.T) {
193+
vf := NewVerifier(t.TempDir())
194+
gov := &Governor{
195+
Ladder: []Rung{
196+
{Name: "cheap", Agents: 1, RepairRounds: 0, Timeout: 5 * time.Second},
197+
{Name: "escalated", Agents: 1, RepairRounds: 0, Timeout: 5 * time.Second},
198+
},
199+
Verifier: vf,
200+
Checks: []Check{{Kind: CheckBuild, Name: "fail", Cmd: []string{"false"}}},
201+
Factory: func(r Rung) []Agent {
202+
return []Agent{&scriptAgent{name: "a", reply: "ok"}}
203+
},
204+
}
205+
res, err := gov.Execute(context.Background(), &Task{ID: "t1", Title: "x", Description: "d"}, NewScratchpad())
206+
if err != nil {
207+
t.Fatal(err)
208+
}
209+
if res.Passed {
210+
t.Fatal("cannot pass on failing check")
211+
}
212+
if len(res.Escalations) != 1 {
213+
t.Fatalf("expected 1 escalation, got %d", len(res.Escalations))
214+
}
215+
if res.Escalations[0].FromRung != "cheap" || res.Escalations[0].ToRung != "escalated" {
216+
t.Fatalf("wrong escalation: %+v", res.Escalations[0])
217+
}
218+
}
219+
220+
func TestClassifyTask(t *testing.T) {
221+
cases := []struct {
222+
title, desc string
223+
want TaskClass
224+
}{
225+
{"Rename foo to bar", "", ClassRename},
226+
{"Fix nil pointer", "crash on resize", ClassBugfix},
227+
{"Add new helper", "create util", ClassGreenfield},
228+
{"Refactor TUI", "restructure views", ClassRefactor},
229+
{"Set config", "edit yaml", ClassConfig},
230+
{"Misc task", "do something", ClassUnknown},
231+
}
232+
for _, c := range cases {
233+
if got := ClassifyTask(&Task{Title: c.title, Description: c.desc}); got != c.want {
234+
t.Errorf("ClassifyTask(%q)=%q want %q", c.title, got, c.want)
235+
}
236+
}
237+
}
238+
239+
func TestSpeculativeRunNoAgents(t *testing.T) {
240+
s := NewSpeculativeRunner("", nil)
241+
_, err := s.Run(context.Background(), &Task{ID: "t"}, nil, NewScratchpad())
242+
if err == nil {
243+
t.Fatal("expected error on empty agents")
244+
}
245+
}
246+
247+
func TestBlamerEmptyLog(t *testing.T) {
248+
bl := &Blamer{Verifier: NewVerifier(t.TempDir())}
249+
_, err := bl.Blame(context.Background(), &EditLog{}, Check{Name: "x"})
250+
if err == nil {
251+
t.Fatal("expected error on empty log")
252+
}
253+
}
254+
255+
func TestBlamerPreExistingFailure(t *testing.T) {
256+
vf := NewVerifier(t.TempDir())
257+
bl := &Blamer{Verifier: vf}
258+
// No Workdir/Base set → checkAt returns true (assume base green).
259+
// Then a single edit with no SHA — bisect narrows immediately.
260+
log := &EditLog{Edits: []EditRecord{{Seq: 1, SHA: "", Summary: "test"}}}
261+
res, err := bl.Blame(context.Background(), log, Check{Kind: CheckBuild, Name: "x", Cmd: []string{"true"}})
262+
if err != nil {
263+
t.Fatal(err)
264+
}
265+
if res.Culprit == nil {
266+
t.Fatal("expected culprit, got nil")
267+
}
268+
if res.PriorGreen != 0 {
269+
t.Fatalf("expected PriorGreen=0, got %d", res.PriorGreen)
270+
}
271+
}
272+
273+
func TestShortSHA(t *testing.T) {
274+
if got := shortSHA("abcdef0123456789"); got != "abcdef01" {
275+
t.Fatalf("shortSHA wrong: %q", got)
276+
}
277+
if got := shortSHA("abc"); got != "abc" {
278+
t.Fatalf("shortSHA must not pad: %q", got)
279+
}
280+
}
281+
282+
func TestPathAllowedNested(t *testing.T) {
283+
c := &Contract{AllowedGlobs: []string{"internal/tui/*"}}
284+
if !c.pathAllowed(filepath.Join("internal", "tui", "deep", "thing.go")) {
285+
t.Fatal("nested in-scope path must be allowed")
286+
}
287+
}
288+
289+
func TestBoolToInt(t *testing.T) {
290+
if boolToInt(true) != 1 || boolToInt(false) != 0 {
291+
t.Fatal("boolToInt wrong")
292+
}
293+
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# orchestrator/blame.go
2+
3+
Causal Blame — bisect the edit log to find the first edit that flipped a
4+
check green→red. O(log n) verification runs via binary search. The
5+
precondition is that the full edit log fails the check; the postcondition
6+
is `BlameResult.Culprit` pointing at the responsible edit (or `nil` if
7+
the failure pre-existed).
8+
9+
## Public surface
10+
11+
- `EditRecord{Seq, SHA, Path, Summary}`
12+
- `EditLog{TaskID, Workdir, Base, Edits}`
13+
- `BlameResult{Culprit, Check, Bisections, PriorGreen}` — `PriorGreen` is
14+
the highest seq that still passes (repair can rewind to it).
15+
- `Blamer{Verifier}`
16+
- `Blame(ctx, log, failing) (*BlameResult, error)`
17+
18+
## Behavior
19+
20+
- Step 1: check the base commit. If it already fails, return early —
21+
the failure is pre-existing, not caused by current edits.
22+
- Step 2: binary search the edit log by SHA. `checkAt` checks out the
23+
SHA in detached mode, runs the single check, restores the tip in `defer`.
24+
- The "no git workdir" mode (Workdir empty) treats every prefix as
25+
passing — useful for tests.
26+
27+
## Caller responsibilities
28+
29+
- The agent layer must commit each applied edit to a scratch branch
30+
(`sin/run-<taskID>`) and append to `EditLog.Edits` before calling `Blame`.
31+
- `Verifier` should be configured to run **only the failing check** for
32+
bisect efficiency (`Verify(ctx, ..., []Check{c})`).

0 commit comments

Comments
 (0)