From ef5d2f49d29d924632d06502ffa4446853278ab7 Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Wed, 13 May 2026 13:17:50 +0200 Subject: [PATCH 01/42] initial sync plus first status-/semantic-reconciliation of PLAN vs PLAN from fe-705 --- docs/archive/PLAN_HISTORY.md | 11 + memory-fe-705/PLAN.md | 235 +++++++ memory-fe-705/SPEC.md | 658 ++++++++++++++++++ memory/CARDS.md | 442 ------------ memory/PLAN.md | 270 ++++--- .../components/pending-review-section.tsx | 11 +- 6 files changed, 1031 insertions(+), 596 deletions(-) create mode 100644 memory-fe-705/PLAN.md create mode 100644 memory-fe-705/SPEC.md delete mode 100644 memory/CARDS.md diff --git a/docs/archive/PLAN_HISTORY.md b/docs/archive/PLAN_HISTORY.md index 89ebc6c5..7316d797 100644 --- a/docs/archive/PLAN_HISTORY.md +++ b/docs/archive/PLAN_HISTORY.md @@ -178,3 +178,14 @@ Archived out of `memory/PLAN.md` after side-chat V3.1 closed end-to-end (PR #124 - 2026-05-07 — **Side-chat V2 — Edit / Drill-down / Propose-edge plumbing** (FE-673, PR #97) — added `edit`, `edge`, and `drill-down` patch kinds. Server `classifyEditImpact` returns `none | soft | hard`; soft applies directly with undo, hard returns `deferred: true` placeholder (removed at V3.0 ship). Client: patch-list reducer + three applier factories with real undo handlers. Verified: `npm run verify` (935 tests, 19 new). - 2026-05-04 — **Graph view structured-list peer route** — `/specification/$id/graph` now renders project-wide entities through the structured-list layout with relationship subsections, relation chips, empty state, row controls, and a back-to-chat affordance. Follow-up active-path filtering and spatial canvas remain horizon work. - 2026-05-01 — **Side-chat V1.1 — Explore vertical slice** — end-to-end graph-launched chat interaction shipped: prompt builder, POST `/side-chat` SSE endpoint, popover host, graph-view wiring, SSE consumer, and active-button activation. 
Follow-up refactor collapsed pending assistant text into the message list and extracted `SideChatHost` so activation is a tree-mount fact. + +## 2026-05-13 Sync Archive + +Archived out of `memory/PLAN.md` during `ln-sync` so the live plan keeps only the rolling frontier plus the last three completed items. Entries already archived in the 2026-05-11 sync archive were not duplicated here. + +- [2026-05-08] FE-698 prompt/context follow-up hardening — Candidate-spec prompt scenarios no longer advertise durable changeset submission, prompt scenario artifacts report schema version 2 for the fingerprinted shape, scenario definitions require typed context data, empty prompt assets are cached correctly, context-pack anchors use intent vocabulary, and `context-pack.ts` now remains the public entry point over private scenario-specific context-pack modules. Verified: `npm run verify`. Watch: this is still FE-698 continuation hardening; broader generative quality review and additional scenario probes remain later slices. +- [2026-05-08] FE-698 prompt/context remediation + candidate scenario — Prompt scenario definitions are now discriminated by scenario kind, candidate-spec scenarios render deterministic no-provider proposal artifacts from typed context packs, scenario artifacts include prompt/context fingerprints, server prompt asset copying mirrors current source assets, prompt golden coverage protects production prompt text, and the build-boundary prompt test writes isolated output. Verified: `npm run verify`. Watch: full generative quality review for candidate-spec output remains a later execution/probe slice. +- [2026-05-08] FE-698 scenario execution error hardening — Scenario execution failures now serialize safe deterministic summaries: API-key-like provider errors are redacted, non-Error rejections avoid object dumps, and ordinary errors remain reviewable. Verified: `npm run verify`. 
+- [2026-05-08] FE-698 Anthropic scenario adapter — Added a probe-only Anthropic AI SDK adapter behind the existing `PromptScenarioModelAdapter` seam. Web-research prompt scenarios now map rendered prompts to AI SDK system content and rendered context packs to user prompt content under mocked tests, with unsupported providers rejected before model construction. Verified: `npm run verify`. Watch: this is not the shared AI runtime provider seam; OpenRouter/provider-neutral routing, credential UX, Pi, web tools, CLI/UI, persistence, and Brunch mutations remain out of scope. +- [2026-05-08] FE-698 prompt scenario execution probe — Web-research prompt scenarios can now execute through an injected fakeable model adapter and serialize `succeeded` / `failed` execution results with raw output or deterministic error text, while no-provider artifacts remain deterministic `not-run` snapshots. Structured parsing is explicitly `not-applicable` for this prose-only web-research path. Verified: `npm run verify`. Watch: real provider adapters, Pi, web tools, CLI/UI, persistence, and mutating Brunch handlers remain out of scope for this foundation slice. +- [2026-05-06] Multi-chat substrate + reconciliation needs (FE-697) — `chat` table with one interview chat per spec, nullable `turn.chat_id`, `specification.primary_chat_id`, mirrored `chat.active_turn_id`, plus the `reconciliation_need` queue with directed source/target items, narrow `kind`/`status`, partial unique index on open rows, cascade FK. Spec creation inserts spec + interview chat in one transaction; `advanceHead` is transactional. No user-visible change. Verified: `npm run verify` (673 tests) plus manual fixture playback (39 specs / 81 turns / dual-pointer equivalence). A82 / A83 validated for Phase 1. 
diff --git a/memory-fe-705/PLAN.md b/memory-fe-705/PLAN.md new file mode 100644 index 00000000..407aa54f --- /dev/null +++ b/memory-fe-705/PLAN.md @@ -0,0 +1,235 @@ + + +# Plan + +The interaction model is mature: four-phase interview, interviewer-autonomous question format, phase-agnostic preface cards with workspace exploration, structured review with per-item commenting, observer knowledge extraction, workflow ownership extraction, distribution hardening, graph view's structured-list peer route, and the first relation-first observer capture seam all ship as working product. In this stack, downstack FE-697 supplies the multi-chat substrate (chat containers + `reconciliation_need` queue), and FE-698 supplies the prompt/context scenario substrate from `main`. Side-chat V2 plumbing — `edit` / `edge` / `drill-down` patch kinds with server route, reducer, and undo-capable appliers — is branch-complete on FE-673 (PR #97) but ships without its user-facing Edit-mode trigger, and the V2 hard-impact branch returns a `deferred: true` placeholder banner. The live frontier is **side-chat V3.0**, which removes that placeholder by routing hard-impact apply through the new `reconciliation_need` queue. + +The May 2026 intent-spec, multi-chat, changeset-ledger, prompt/context, agent-mutation, and strategy design notes are reconciled into one direction. `docs/design/MULTI_CHAT.md` is the downstack phase-one substrate for this stack. `docs/design/SIDE_CHAT.md` describes side-chat V1 / V2 / V3.0 / V3.1 / V4 phasing on top of that substrate, with §13 mapping each user-surface version onto a substrate phase. `docs/design/PATCH_LEDGER.md` remains historical deeper design pressure for semantic mutation history, but canonical future-facing vocabulary is `changeset` / `change`; `docs/design/INTENT_SPEC_EVOLUTION.md` carries the broader synthesis. 
The product-layer ontology trajectory is split out as `docs/design/INTENT_GRAPH_SEMANTICS.md` (canonical reference for the FE-700 frontier), `docs/design/BEHAVIORAL_KERNELS.md` (canonical reference for the FE-702 kernel probes), and `docs/design/SPEC_EVOLUTION_STRATEGIES.md` (chat-local strategies, candidate bundles, graph-review oracle, and concern/dependency map). The dev-layer self-tooling trajectory — the `ln-*` skill family, the proposed file-backed spec registry, and the long-horizon convergence between dev and product ontologies — lives in `docs/design/DEV_WORKFLOW_EVOLUTION.md`. Older portability work remains a future-facing boundary map rather than a live roadmap item until a hosted, remote, or adapter-backed substrate becomes a product goal. + + +## Active + +1. **Side-chat V3.0 — hard-impact edit cascade through `reconciliation_need`** — drop the V2 deferred banner; on hard-impact `propose_edit` apply, server enumerates incident `knowledge_edge` rows under typed relation policy (Path 1 from MULTI_CHAT.md §5.1) and opens one `reconciliation_need` per affected pair; client surfaces those rows as a `Pending review` section in `patch-list-overlay.tsx` with per-row accept-on-target / edit-target / dismiss actions. V3.0 groups needs mechanically (by `kind` and relation type); agent-grouped resolution is V3.1 horizon work. + - Why now / unlocks: downstack FE-697 supplies the queue table for this stack; the FE-674 planning sync (PR #110) reconciled SIDE_CHAT.md §5.3 / §8 / §9 / §13 and SPEC.md (Acceptance Criterion 7, A88, D146, I113) against the substrate; the V2 deferred banner is the highest-visibility user gap. Without V3.0, FE-697's queue has no reader and V2 hard-impact stays an empty promise. + - Recommended shape: ship as a small queue of scope cards inside this one frontier item (track in `memory/CARDS.md` if needed). 
Suggested order — (a) un-stub `SideChatPopover` Edit-mode button so V2 plumbing is reachable from the UI at all; (b) server `openReconciliationNeedsForItemChange()` + lifecycle endpoint for resolution; (c) `edit-applier` rewrite to drop the `deferred: true` shape and surface needs into side-chat state; (d) overlay `Pending review` section + per-row resolution actions; (e) verification — `edit-applier.test.ts`, `reconciliation-need.test.ts`, `patch-list-overlay.test.tsx`, F6 fixture matrix (leaf, 2-downstream, 5+-downstream, in-active-review-set, mixed kinds). + - Linear: FE-674. + - Traceability: Acceptance Criterion 7; Requirement 10; A48, A71, A83, A88, A93; D80, D135, D137, D138, D146, D150; I111, I113, I117. + - Design doc: `docs/design/SIDE_CHAT.md` §5.3, §9, §13; `docs/design/MULTI_CHAT.md` §5. + +## Next + +2. **Intent graph semantics + relation-policy directionality foundation** — refine the ontology and relation policy so the graph can represent invariants, examples/counterexamples, constraint subtypes, narrowed decisions, witness strength, checkability gaps, and operational edge behavior as source/destination material for future generative features. + - Linear: FE-700. + - Work type: semantic substrate / high-coordination foundation. + - Why now / unlocks: candidate generation, behavioral kernels, graph review, scenario-options acceleration, architect proposals, direct-edit cascade, and downstream verification-aware decomposition all need a sharper semantic target than the current exploration/review ontology. This is the next substrate layer most likely to collide with parallel work, so it should land before broadening graph-review or reconciliation behavior. 
+ - Recommended shape: add `invariant` and `example` as first-class durable kinds; subtype examples (positive / negative / edge-case / trace / not-relevant); narrow `decision`; enrich `constraint`, `criterion`, and `invariant` subtypes; add `checkability` and `witness strength`; introduce the five-family relation taxonomy and negative relations; add edge epistemic metadata; and make relation-policy directionality explicit (`canonicalSentence`, `inverseSentence`, source-change behavior, target-change behavior) rather than inferring cascade from raw edge direction. Leave room for contrastive-kernel artifacts such as `alternative`, `question`, `ambiguity`, and `candidate`, but keep them proposal-local unless probes prove they need durable top-level kinds. + - Verification approach: corpus/fixture observer probes comparing old vs refined ontology; relation-policy unit tests for mixed-direction relations; graph-review manual assessment for precision/noise; context-pack probe outputs must show authority, witness, relation support, and directionality labels. + - Parallelization note: this is the semantic-layer lane; keep strategy probes artifact-only until this stabilizes. + - Traceability: Requirement 38; A77, A78, A80, A81, A84, A93; D134, D136, D137, D139, D140, D150; I117. + - Design docs: `docs/design/INTENT_GRAPH_SEMANTICS.md`; `docs/design/SPEC_EVOLUTION_STRATEGIES.md` §Relation directionality; `docs/design/INTENT_SPEC_EVOLUTION.md`. + +3. **Semantic changeset ledger + proposal-turn staleness** — introduce the semantic history spine that separates graph mutation history from conversational turn ancestry. + - Linear: FE-701. + - Work type: persistence / mutation substrate. + - Why now / unlocks: scenario bundle acceptance, direct-edit atomicity, accepted-with-issues flows, stale proposal detection, graph-review repairs, and future architect/reconciliation agents all need a durable semantic mutation boundary. 
Without it, productized scenario-options can stay probe-only but cannot safely commit candidate bundles. + - Recommended shape: add `changeset` / `change` as canonical schema and operation vocabulary; track `specification.latest_changeset_id`; stamp new turns with `opened_at_changeset_id` / `base_changeset_id`; connect `reconciliation_need.caused_by_changeset_id`; keep proposals/findings as turn-owned artifacts until accepted; ensure only `accept` applies a proposal changeset; and treat a changeset as the smallest atomic unit that preserves semantic coherence. Do not add a first-class `procedure_run` table unless lifecycle/retry/cancel or multi-turn operation grouping demands it. + - Verification approach: DB atomicity tests for changeset + changes + reconciliation_need writes, staleness tests for open proposal turns across multi-chat changes, capability/transition tests proving non-accept actions cannot mutate graph truth. + - Parallelization note: can proceed after/alongside FE-700 if relation-policy interfaces are agreed, but avoid product candidate acceptance until this lands. + - Traceability: Requirements 39, 42, 44; A71, A79, A92; D135, D138, D145, D149; I116. + - Design docs: `docs/design/PATCH_LEDGER.md` (historical filename; future vocabulary is changeset/change); `docs/design/SPEC_EVOLUTION_STRATEGIES.md` §Semantic history and proposal turns. + +4. **Graph-review oracle + scenario-options probes** — build the internal critique path and artifact-only candidate bundle probes before product UI. + - Linear: FE-702 for graph-review / scenario probes; FE-649 and FE-640 remain productization children under FE-698 where relevant. + - Work type: strategy/proposal artifact + oracle probe. + - Why now / unlocks: product wants first-turn strategy choice and a mid-interview `speed this up` affordance, but engineering needs graph-review critique to make generated candidate bundles credible. 
This lane can advance in parallel with FE-700 if it stays artifact-only and does not commit canonical graph truth. + - Recommended shape: define candidate graph bundle and graph-review finding artifacts; add a graph-review prompt/context pack and rubric covering coherence, fixed-premise respect, coverage, tradeoff honesty, checkability, granularity, scenario fidelity, epistemic labels, provenance, and downstream usefulness; generate 2–3 scenario options that complete the current direction from context-packed accepted graph truth; run fast gates before display and deeper async critique/refine/repair as probe artifacts; classify candidate readiness as `draft` / `reviewing` / `reviewed_clean` / `reviewed_with_issues` / `blocked`; keep broader graph-review issues turn-owned rather than adding a `graph_issue` table. + - Verification approach: scenario-runner fixtures, FE-705 JSONL-generated completed-spec fixtures, raw output review, structured parse validation, qualitative scorecards, and comparison against drilldown-produced graphs. Middle/outer-loop oracle design should decide when fixture candidates become golden. + - Parallelization note: good isolation lane for work that should avoid schema-heavy relation migrations; use `scripts/agent-probes` and context-pack modules rather than product UI. + - Traceability: Requirements 20, 21, 31, 32, 40, 41, 43, 44; A67, A68, A80, A85, A87, A89, A90, A91; D126, D127, D139, D141, D147, D151, D152; I114, I118. + - Design docs: `docs/design/SPEC_EVOLUTION_STRATEGIES.md`; `docs/design/BEHAVIORAL_KERNELS.md`; `docs/design/INTENT_GRAPH_SEMANTICS.md`; `docs/design/AGENT_MUTATION_SURFACE.md`. + +5. **Continuous workspace / phase-addressable interview surface** — cumulative center pane with realized phase sections, one chat runtime per specification, sidebar section navigation, scroll/focus behavior, and the single actionable frontier preserved at the current reachable phase. 
+ - Why now / unlocks: workflow read/write ownership is extracted (FE-616); the multi-chat substrate (FE-697) ships chat containers below the specification, so continuous workspace can adopt one visible runtime without smuggling in a second durable workflow model. Bumped behind V3.0 and the semantic substrate because V3.0 closes a visible V2 gap while FE-700/FE-701 define the graph/turn semantics that multi-strategy work will need. + - Traceability: A58; D86, D87, D110, D113, D114; I24, I102. + - Design doc: `docs/design/CONTINUOUS_WORKSPACE_HYBRID.md`. + + +## Horizon + +### Intent graph and reconciliation + +- **Relation-first observer capture enrichment** — after the next ontology/relation-policy probes, broaden observer relationship extraction across the refined ontology where edge support and operational participation are understood. + - Recommended shape: keep `runObserver()` as the public turn-owned seam, but feed it scenario-specific context packs and validate output through the relation-policy registry. The FE-639 first cut has landed; remaining work should be driven by corpus/manual proving. + - Depends on: prompt/context substrate; intent graph semantics + progressive checkability foundation. + - Traceability: Requirements 30, 38, 40; A66, A81, A84; D125, D136, D137, D139, D140; I109. + +- **Architect / generator loop** — autonomous agent that iterates over the intent graph and proposes semantic changes for HITL review through the same future changeset / reconciliation pathway as user-driven edits. + - Recommended shape: keep productized architect proposals behind multi-chat + reconciliation + semantic changesets; use the scenario substrate for shadow/proposal-only probes first. + - Traceability: A73, A85, A87; D139, D141; depends on chat containers + reconciliation needs and semantic changeset ledger. 
+ +- **Side-chat V3.1 — agent-grouped reconciliation resolution** — once V3.0 ships, a reconciliation agent reads the `reconciliation_need` queue and reclassifies open needs into auto-confirm (review-only items, one-click resolve), auto-edit (mechanical text replacements applied through the standard edit pipeline), and substantive (judgment required, walk inside the side-chat panel using pinned-context conversation). Maps onto MULTI_CHAT.md Phase 3. + - Why later: V3.0 satisfies Acceptance Criterion 7 mechanically; agent grouping is value-add, not gap-closing. Hold until V3.0's mechanical grouping reveals whether substantive items get lost in a flat list (A88 validation). + - Depends on: V3.0 ship; reconciliation agent prompt + grouping policy. + - Traceability: Requirement 10; A48, A88; D135, D137, D138, D146. + - Design doc: `docs/design/SIDE_CHAT.md` §5.3 (V3.1), §9. + +### User-facing capabilities + +- **First-run provider setup** — deferred out of FE-698. Make missing LLM credentials visible on the dashboard, add a shared AI runtime provider seam for interviewer / observer model construction, support UI-entered keys through XDG-compliant user auth state, and evaluate whether OpenRouter should become the preferred onboarding provider while preserving Anthropic-specific capabilities or explicit degradation. + - Linear: FE-633 covers the OpenRouter/default-provider part; dashboard credential UX + XDG key storage may need a sibling issue if split from provider proving. + - Recommended shape: prove the provider resolver first with current Anthropic behavior, then spike OpenRouter against tool use, structured output, and reasoning/thinking options before making it the default. The dashboard should expose credential status without leaking secret values and offer setup before the user starts a specification. + - Traceability: Requirements 34, 35, 36; A74, A75; D130, D131, D132; I106. 
+ +- **Workspace hygiene / `.brunch/` gitignore assist** — detect whether generated local state is already ignored and, with explicit confirmation, add an idempotent `.gitignore` entry or create `.gitignore` when absent. + - Linear: FE-648. + - Recommended shape: keep this as a deterministic local mutation with preview/confirmation semantics; it can ship independently, but the dashboard is the natural surface because it already explains workspace binding and first-run setup. + - Traceability: Requirement 37; A76; D133; I107. + +- **Productized web research capability** — web search and page-fetch tools as interviewer-invoked context gathering, surfaced as preface cards after the scenario substrate proves query framing, tool ergonomics, and provisional-context handling. + - Linear: FE-649. + - Depends on: prompt/context scenario substrate and web-research probe. + - Traceability: Requirements 20, 21, 40, 41; D99, D112, D139, D142. + +- **Dashboard result summaries and completeness metrics** — progress visibility across specifications. + +- **Two-axis interview framing** — adapt interviewer setup and questioning to the full `greenfield <> brownfield` by `end-to-end build <> incremental feature` matrix instead of treating partial-scope work as a special case. + - Linear: FE-638. + - Traceability: Requirement 29; A65; D124. + +- **Productized scenario-options / candidate-spec completion assist** — replace skip-only remainder handling with first-turn strategy choice and a mid-interview `speed this up` side-chat that generates 2–3 reviewed candidate graph bundles with tradeoffs, completing the current direction by default. + - Depends on: prompt/context scenario substrate; intent graph semantics + relation-policy directionality; graph-review oracle; changeset ledger for canonical acceptance. + - Traceability: Requirements 31, 40, 44; A67, A77, A78, A85, A90, A91; D126, D134, D136, D139, D148, D151, D152; I118. 
+ +- **Progressive detail / recursive deflation** — support broad-pass interviewing with explicit next-level-of-detail actions rather than one uniform depth-first drill-down. + - Linear: FE-637. + - Recommended shape: pair ordinary grounding/design question turns with a turn-owned breadth-skeleton artifact that makes current coverage visible and exposes a structured detail reaction (`deepen this area`, `continue broad pass`, `sufficient for now`). The chosen reaction should steer the next same-phase frontier turn instead of introducing a separate detail workflow. + - First cut should optimize for `broad question -> choose one area to deepen next -> focused successor question -> refreshed breadth skeleton`, while keeping the same detail-focus intent reusable later from chat or graph surfaces. + - Traceability: Requirement 32; A67, A68; D127. + +- **Spatial canvas layout for graph view** — add the spatial DAG layout as a second layout choice inside graph mode, alongside the structured-list route. Same projection seam, same intent contract; only the layout strategy changes. + - Recommended shape: a layout switch inside the existing `/specification/$id/graph` route that transforms the same `EntitiesData` projection into a spatial scene with viewport / selection / focus / path-highlighting. First cut should optimize for `select node -> inspect -> launch refinement` through the multi-chat substrate. + - Depends on: graph view structured-list ship. Richer node actions depend on multi-chat / reconciliation rather than the old side-chat conceptual roadmap. + - Traceability: Requirement 33; A69; D128. + +- **Graph view active-path render filter + scope toggle** — render only active-path items by default in graph view, with a `Show all` toggle in the header that flips to the full whole-spec set. Both subsets project from the same in-memory `mode=project-wide` data; no second fetch. + - Depends on: server data-layer change for active-path membership exposure. 
+ - Traceability: Requirement 33; D128, D129; I102. + +### Infrastructure / tooling + +- **Server mini-library compartmentalization** — consider renaming and organizing growing server seams into plural public roots with same-named private subtrees, especially around fixtures, context packs, prompts, scenario runner, entity APIs, and agent APIs. + - Status: refactor idea captured for later, not current work and not a migration commitment. + - Candidate shape: `fixtures.ts` + `fixtures/`, `context-packs.ts` + `context-packs/`, `prompts.ts` + `prompts/` with prompt snapshots colocated under `prompts/__snapshots__/`, `scenario-runner.ts` + `scenario-runner/`, `entity-apis.ts` + `entity-apis/*-route.ts`, and `agent-apis.ts` + `agent-apis/` containing tool/capability-registry subtrees. + - Rationale: make public mini-library boundaries and private implementation compartments more obvious as FE-698 prompt/context and future agent API seams grow. + +- **Structured development spec registry** — prototype file-backed canonical spec records, deterministic checks, generated markdown views, and task-local slices for Brunch's own development workflow (the `ln-*` skill family). + - Status: design horizon, not a migration commitment. Self-tooling experiment for the dev layer; not part of the product roadmap. + - Recommended shape: follow the `memory/spec/{schema,records,generated,tools}/` trajectory and the 5-step migration path (stable IDs → sidecar files → stop editing generated md → `spec:check` in the verify gate → task-local slices). First-adopter candidate: a bounded sub-area such as the multi-chat substrate's records, not the full SPEC. + - Traceability: D134. + - Design doc: `docs/design/DEV_WORKFLOW_EVOLUTION.md` (canonical reference, including the three-layer framing and convergence question); `docs/design/INTENT_SPEC_EVOLUTION.md` (broader synthesis context). 
+ +- **Portability boundaries** — split durable store/read-model, interview session runtime, and workspace capability provider if Brunch targets hosted, remote, embedded, or sandbox-backed operation. + - Status: deferred. Some enabling seams already exist (query domains, workflow projector, no persisted `cwd` on specifications), but adapter-backed portability is not on the live roadmap. + - Deep design source: `docs/design/PORTABILITY_BOUNDARIES.md`. + +- **Agent-native CLI adapter** — future CLI-addressability should project the agent capability contract registry rather than wrap routes or ORM scripts by hand. + - Status: design input captured, not current work. + - Recommended shape: generate or mechanically validate commands from capability contracts; enforce conventional verbs/flags (`get`, `list`, `--json`, `--force`, `--wait`), non-interactive defaults, bounded JSON output, enumerated errors, structured `brunch agent-context` introspection, and a recoverable async job ledger. Durable writes still route through Brunch-owned mutation handlers. + - Traceability: A89; D143, D147. + +- Headless interview driver for scripted end-to-end probes. +- MCP server adapter for core operations. +- Git-friendly file-based persistence representation for diffable exported specs. +- Typed fixture-builder convergence for happy-path tests. + +## Recently Completed + +- [2026-05-12] FE-705 probe harness hardening refactor — Hardened the external agent-probe harness around JSONL process failures, artifact failure reporting, explicit turn budgets, and fixture-candidate structure/readiness checks. Process transports now settle pending requests on protocol errors, malformed output, process exit, stderr, write failure, or timeout; process-backed runs serialize failure artifacts; scripted/process probes accept a configurable turn budget defaulting to two; and fixture reports distinguish `parseReady`, `structureReady`, and normalization debt. 
Verified: targeted probe-harness tests and `npm run verify`. +- [2026-05-12] FE-705 fixture-candidate normalization checkpoint — Added a fixture-candidate inspector for probe artifact directories. It validates `artifact-bundle.json`, `summary.json`, `raw-jsonl.ndjson`, `final-chat.json`, expected preserved workspace state, completed vs error-run status, and normalization debt for timestamps/durations, temp paths, environment metadata, generated wording, resource ids, and provider-specific error redaction. Verified: targeted test, real-provider packaged smoke artifact `/tmp/brunch-llm-user-smoke-OVPjPG`, and `npm run verify`. +- [2026-05-12] FE-705 opt-in packaged LLM-as-user smoke helper — Added a fake-tested smoke helper that runs `npm run build`, drives the default packaged `node bin/brunch.js agent` command with the model-backed user policy, preserves workspace state, and returns/prints JSON-safe summaries with redacted failure artifacts. Verified: targeted tests, real-provider smoke (`turnsAnswered: 2`, final frontier `answered`, artifact dir `/tmp/brunch-llm-user-smoke-OVPjPG`), and `npm run verify`. +- [2026-05-12] FE-705 model-backed LLM-as-user policy — Added a fakeable model-backed simulated-user policy for the external probe runner. It renders strict JSON prompts from scenario brief, active question, options, and prior Q/A; parses free-text and option-selection responses into `turn.submitResponse` payloads; records prompt/raw-output/parse-status events in artifact bundles; and reports invalid model output as structured probe errors. Verified: targeted tests and `npm run verify`. +- [2026-05-12] FE-705 user-simulator policy interface — Added an injectable probe response policy that receives the scenario brief, current `chat.read` projection, active turn, and prior answered turns; the scripted answer path now runs through that policy seam, and policy failures become structured probe errors. Verified: targeted test and `npm run verify`. 
+- [2026-05-12] FE-705 probe workspace fixture preservation — Added opt-in `preserveWorkspaceState` support for process-backed probes: run results and artifact bundles now record the temp workspace cwd, and enabled runs copy the workspace `.brunch/` state into `workspace-state/` under the artifact directory while disabled runs keep the existing minimal artifact set. Verified: targeted test and `npm run verify`. +- [2026-05-12] FE-705 probe-runner scripts harness boundary — Moved the external probe runner out of `src/server` into `scripts/agent-probes`, expanded formatter/lint/test/type-check coverage to include `scripts/`, and updated the boundary guard around the script-side harness so it cannot import DB, capability dispatch/registry, schema, core, route-transition, or turn-response authority modules directly. Verified: targeted test and `npm run verify`. +- [2026-05-12] FE-705 probe runner import-boundary guard — Added a static boundary test proving the probe-runner module does not import DB, capability dispatch/registry, schema, core, route-transition, or turn-response authority modules directly; the existing capability/JSONL tests continue to cover the server-owned mutation path. Verified: targeted test and `npm run check`. +- [2026-05-12] FE-705 probe artifact schema and safe summaries — Hardened probe-runner artifacts with schema-versioned bundles, command sequences, raw JSONL transcripts, parsed events, non-secret environment metadata, compact question/answer summaries, duration, and deterministic redacted errors. Verified: targeted test and `npm run check`. Watch: remaining runner boundary guard should mechanically prevent direct DB/handler imports. +- [2026-05-12] FE-705 process-backed probe runner — Added a process JSONL transport plus temp-workspace runner path around the packaged `node bin/brunch.js agent` boundary. 
The runner can spawn through an injected process adapter, drive the scripted two-turn probe, and write raw JSONL, final chat, and summary artifacts outside `.brunch/`. Verified: targeted test and `npm run check`. Watch: next slice should harden the artifact schema/redaction before treating output as fixture-candidate material. +- [2026-05-12] FE-705 probe runner JSONL client — Added a provider-free scripted probe-runner core over an injected JSONL transport. It drives `spec.create → chat.getPrimary → chat.ensureReady → chat.read → turn.submitResponse → chat.read → chat.ensureReady → chat.read → turn.submitResponse → chat.read`, supports free-text and option-selection responses from `chat.read`, and reports structured errors without DB/handler imports. Verified: targeted test and `npm run check`. Watch: next slice still needs a process-backed temp-workspace runner and artifact writes. +- [2026-05-11] FE-705 real-provider readiness smoke hardening — Hardened `chat.ensureReady` for live provider use: initial generation now uses a non-empty runtime prompt, readiness question persistence falls back from plain text to structured ask-question parts to the turn row written by tool execution, and the manual temp-workspace JSONL smoke reaches a second answerable frontier with JSONL-only output. Verified: targeted tests, real-provider smoke, and `npm run verify`. Watch: next FE-705 slice can add `turn.get` or start the proof-of-life probe runner. +- [2026-05-11] FE-705 agent turn response submission — Added executable `turn.submitResponse` with explicit chat/turn ownership checks, shared turn-response payload validation, delegation to `submitTurnResponseTransition`, and agent-facing read projection that points answered turns back to `chat.ensureReady`. JSONL tests prove `spec.create → chat.getPrimary → chat.ensureReady → turn.submitResponse → chat.read` over explicit ids. Verified: `npm run verify`. Superseded by the live readiness smoke hardening above. 
+- [2026-05-11] FE-705 generated chat readiness — `chat.ensureReady` now turns an empty generated frontier into an answerable `awaiting_response` frontier by invoking a fakeable interviewer generation boundary, persisting fallback question text plus assistant parts, and preserving idempotence for already-answerable turns. JSONL tests prove `spec.create → chat.getPrimary → chat.ensureReady → chat.read` returns an answerable turn through explicit ids. Verified: `npm run verify`. Superseded by the turn-response submission slice above. +- [2026-05-11] FE-705 deterministic chat readiness — Added `chat.ensureReady` as a runtime-replay JSONL capability that materialized a kickoff-ready primary chat into a persisted empty frontier turn without invoking LLM/provider generation. The handler resolved explicit `chatId` ownership, used the existing phase-entry transition seam, mirrored the active head through spec/chat state, and was idempotent when a frontier already existed. Verified: `npm run verify`. Superseded by the generated chat readiness slice above. +- [2026-05-11] FE-705 primary chat read projection — Added read-only `chat.getPrimary` and `chat.read` agent capabilities. JSONL clients can now create a spec, discover its primary interview chat, and read a compact Brunch-owned chat projection with spec/chat identity, visible active-path turns, frontier state, and neutral next-command hints. Verified: `npm run verify`. Watch: this is read-only; next FE-705 work still needs readiness/generation and turn-response mutation before an external LLM-as-user probe can drive the interview. +- [2026-05-11] FE-705 agent JSONL lifecycle proof — Added `brunch agent` as a long-lived JSONL capability session, with executable `spec.create` and `spec.getStatus` contracts routed through Brunch-owned handlers rather than Express routes or ORM scripts. 
The packaged CLI can create a real local specification and read it back by explicit `specId`; malformed JSON, unknown capabilities, and schema-invalid inputs return typed error envelopes. Verified: `npm run verify`. Watch: next FE-705 slices still need chat readiness / turn response capabilities and the external LLM-as-user probe runner. +- [2026-05-11] FE-698 reconciliation context-pack slice — Added a proposal-only reconciliation prompt/context scenario that renders open reconciliation needs with source/target anchors, reason/status, prompt/context fingerprints, and read-only capability metadata. This is substrate-only: no FE-674 need lifecycle endpoint, overlay action, side-chat reducer, or durable mutation behavior. Verified: `npm run verify`. Watch: next FE-698 work can move to broader read-only/proposal-only probes and the Pi adapter spike without treating this pack as a resolution agent. +- [2026-05-08] FE-674 planning sync — reconciled `docs/design/SIDE_CHAT.md` §5.3 / §8 / §9 / §13 against the downstack FE-697 substrate; SPEC.md adds A88 (Path 1 sufficiency without agent), D146 (cascade routes through `reconciliation_need`, `deferred: true` apply contract removed at V3.0 ship), I113 (apply opens at least one need per typed dependency edge), and rewrites Acceptance Criterion 7. Doc-only, no `src/` touched. PR #110 stacked on FE-704. +- [2026-05-08] FE-698 prompt/context follow-up hardening — Candidate-spec prompt scenarios no longer advertise durable changeset submission, prompt scenario artifacts report schema version 2 for the fingerprinted shape, scenario definitions require typed context data, empty prompt assets are cached correctly, context-pack anchors use intent vocabulary, and `context-pack.ts` now remains the public entry point over private scenario-specific context-pack modules. Verified: `npm run verify`. Watch: this is still FE-698 continuation hardening; broader generative quality review and additional scenario probes remain later slices. 
+- [2026-05-08] FE-698 prompt/context remediation + candidate scenario — Prompt scenario definitions are now discriminated by scenario kind, candidate-spec scenarios render deterministic no-provider proposal artifacts from typed context packs, scenario artifacts include prompt/context fingerprints, server prompt asset copying mirrors current source assets, prompt golden coverage protects production prompt text, and the build-boundary prompt test writes isolated output. Verified: `npm run verify`. Watch: full generative quality review for candidate-spec output remains a later execution/probe slice. +- [2026-05-08] FE-698 scenario execution error hardening — Scenario execution failures now serialize safe deterministic summaries: API-key-like provider errors are redacted, non-Error rejections avoid object dumps, and ordinary errors remain reviewable. Verified: `npm run verify`. +- [2026-05-08] FE-698 Anthropic scenario adapter — Added a probe-only Anthropic AI SDK adapter behind the existing `PromptScenarioModelAdapter` seam. Web-research prompt scenarios now map rendered prompts to AI SDK system content and rendered context packs to user prompt content under mocked tests, with unsupported providers rejected before model construction. Verified: `npm run verify`. Watch: this is not the shared AI runtime provider seam; OpenRouter/provider-neutral routing, credential UX, Pi, web tools, CLI/UI, persistence, and Brunch mutations remain out of scope. +- [2026-05-08] FE-698 prompt scenario execution probe — Web-research prompt scenarios can now execute through an injected fakeable model adapter and serialize `succeeded` / `failed` execution results with raw output or deterministic error text, while no-provider artifacts remain deterministic `not-run` snapshots. Structured parsing is explicitly `not-applicable` for this prose-only web-research path. Verified: `npm run verify`. 
Watch: real provider adapters, Pi, web tools, CLI/UI, persistence, and mutating Brunch handlers remain out of scope for this foundation slice. +- [2026-05-07] FE-698 prompt/context foundation slices — Packaged markdown prompt registry + observer and web-research context-pack foundations + scenario runner capture skeleton/composition + agent mutation-surface audit + capability registry metadata. Server interviewer, observer, side-chat, and web-research role prompts now load from markdown assets through a typed prompt registry; observer capture and web-research probes render typed scenario-specific context packs; seeded prompt scenarios compose production prompts with typed context-pack output into deterministic no-provider probe artifacts; and scenario artifacts can declare validated Brunch capability contracts. Review fixes moved observer prompt composition into a pure module and made prompt scenario prompt sources explicit. The agent mutation-surface audit inventories current and projected agent-originated write paths as input to later handler slices. Verified: `npm run verify` for code slices; audit verified by code-search/document consistency. This is a completed foundation within FE-698, not retirement of the whole FE-698 frontier; the live continuation remains in `Next`. +- [2026-05-07] Side-chat V2 — Edit / Drill-down / Propose-edge plumbing (FE-673, PR #97) — added `edit`, `edge`, and `drill-down` patch kinds. Server `classifyEditImpact` returns `none | soft | hard`; soft applies directly with undo, hard returns `deferred: true` placeholder. Client: patch-list reducer + three applier factories with real undo handlers. Verified: `npm run verify` (935 tests, 19 new). Watch: `SideChatPopover` Edit button stays `disabled` and hard-impact deferred banner is live until V3.0 lands. 
+- [2026-05-06] Multi-chat substrate + reconciliation needs (FE-697) — `chat` table with one interview chat per spec, nullable `turn.chat_id`, `specification.primary_chat_id`, mirrored `chat.active_turn_id`, plus the `reconciliation_need` queue with directed source/target items, narrow `kind`/`status`, partial unique index on open rows, cascade FK. Spec creation inserts spec + interview chat in one transaction; `advanceHead` is transactional. No user-visible change. Verified: `npm run verify` (673 tests) plus manual fixture playback (39 specs / 81 turns / dual-pointer equivalence). A82 / A83 validated for Phase 1. +- [2026-05-01] Side-chat V1.1 — Explore vertical slice. End-to-end graph-launched chat interaction shipped: prompt builder, POST `/side-chat` SSE endpoint, popover host, graph-view wiring, SSE consumer, and active-button activation. Follow-up refactor collapsed pending assistant text into the message list and extracted `SideChatHost` so activation is a tree-mount fact. This is complete implementation history; future conceptual work is multi-chat / reconciliation, not Side-chat V2/V3. +- [2026-05-04] Graph view structured-list peer route — `/specification/$id/graph` now renders project-wide entities through the structured-list layout with relationship subsections, relation chips, empty state, row controls, and a back-to-chat affordance. Follow-up active-path filtering and spatial canvas remain horizon work. Verified: `npm run verify` in the FE-643 slice family. +- [2026-04-30] FE-650 streamed question cache promotion — `ask_question` tool execution now advances the active frontier, returns the acknowledged turn id, interviewer streams emit a post-finalize `frontier-turn-ready` event, and the client promotes that streamed question into the specification bundle query cache before refetch reconciliation. Verified: `npm run verify` plus dev-mode manual retry; the formerly visible inert-card gap is improved. 
Watch: if residual scroll jumps persist, inspect remaining pane-wide rerender boundaries around workspace stream projection. +- [2026-04-30] FE-639 relation-first observer capture first cut — eligible answered turns now enter one background observer-capture backlog, observer prompts use compact existing-knowledge anchors, observer output persists validated graph-delta relationship candidates, and accepted review grounding refs reuse the same conservative relation policy. Verified: `npm run verify`. Watch: A66 remains open until corpus/manual graph-review proves edge precision and density are useful. +- [2026-04-27] Runtime JSON payload hardening — Express API parsing now accepts chat-sized request bodies above the default parser ceiling and returns a JSON 413 response instead of Express HTML when a payload exceeds the app limit. Verified: `npm run verify`. Watch: if real chat requests still exceed the 5 MB limit, investigate client history / tool-result pruning rather than only raising the ceiling. +- [2026-04-24] Distribution hardening release path — `package.json` now declares the Node 22+ engine floor, explicit shipped files, and public scoped publish config; `npm run release` drives release-it at repo root, rebuilds and dry-runs the packaged artifact, and documents npm auth prerequisites. Verified: `npm run verify`. Watch: CI trusted publishing is still intentionally out of scope. 
+ +Older history: `docs/archive/PLAN_HISTORY.md` + +## Dependencies + +```text +TRACK A — Semantic substrate (highest coordination) +multi-chat-substrate + reconciliation-needs (completed) + ├──→ intent graph semantics + relation-policy directionality (next, FE-700) + │ ├──→ relation-first observer enrichment (horizon, after ontology/policy probes) + │ ├──→ robust direct-edit / reconciliation cascade (active V3.0 uses mechanical subset) + │ └──→ graph-review oracle can become semantically meaningful + └──→ semantic changeset ledger + proposal-turn staleness (next, FE-701) + ├──→ canonical scenario bundle acceptance + ├──→ direct-edit atomicity with caused_by_changeset_id + ├──→ stale open proposal detection + └──→ architect-loop / verifier/import mutation provenance + +TRACK B — Strategy probes / graph-review / candidate bundles (parallelizable if artifact-only) +prompt/context scenario substrate foundation (completed) + ├──→ agent-capability-CLI + LLM-as-user fixture probe (branch-complete FE-705) + │ └──→ golden completed-spec fixture curation (probe output) + ├──→ graph-review oracle + scenario-options probes (next, FE-702) + │ ├──→ behavioral-kernel targeted-case probes + │ ├──→ candidate bundle readiness gates + │ ├──→ async review/refine/repair worker shape + │ └──→ productized scenario-options / speed-this-up side-chat (horizon) + ├──→ productized web research capability (horizon) + └──→ post-spec oracle/decomposition frontier (probe/future product) + +TRACK C — Graph/workspace surfaces +graph-view-structured-list (completed) + ├──→ active-path-filter-and-scope-toggle (horizon, blocked on server data-layer) + ├──→ spatial-canvas-layout (horizon) + └──→ side-chat-V2-plumbing (completed, FE-673 PR #97) + └──→ side-chat-V3.0-cascade-through-reconciliation_need (active, FE-674) + └──→ side-chat-V3.1-agent-grouped-resolution (horizon) + +TRACK D — Workspace shell / UX +multi-chat-substrate (completed) + ├──→ continuous-workspace (next after active/substrate 
pressure) + ├──→ first-turn strategy choice (horizon, after strategy artifacts prove) + └──→ scenario-options side-chat (horizon, after graph-review + changesets) + +UNBLOCKED / LOWER-COORDINATION HORIZON +first-run provider setup (needs provider spike / scope) +workspace hygiene gitignore assist (bounded, dashboard-surface candidate) +web-research tools (gate ready, needs tool impl) +dashboard metrics +two-axis interview framing +progressive detail / recursive deflation +structured development spec registry (tooling experiment) +portability boundaries (deferred until substrate goal exists) +``` diff --git a/memory-fe-705/SPEC.md b/memory-fe-705/SPEC.md new file mode 100644 index 00000000..4e6188d1 --- /dev/null +++ b/memory-fe-705/SPEC.md @@ -0,0 +1,658 @@ + + +# Brunch v2 — Spec Elicitation Tool + +## Concept & Goal + +Brunch is an AI-guided spec elicitation tool that turns natural-language goals into structured specifications through a four-phase interview: + +- **grounding** — goals, terms, context, constraints +- **design** — commitments and tradeoffs +- **requirements** — capability review and gap-finding +- **criteria** — verification coverage + +An interviewer agent conducts the conversation. A separate observer agent extracts typed intent items from each answered turn and links them into an intent graph. The interviewer may also invoke context-gathering capabilities when it lacks enough orientation for the next move; their visible outputs appear in the stream as preface cards. The workspace stream is turn-centered rather than message-shaped: durable conversational turns provide the branch-bearing lineage spine, while projected control cards, phase markers, and activity cards frame them. An open phase should always bottom out in one visible next action — a projected kickoff card, actionable frontier turn, visible generation state, projected recovery card, or closed-phase handoff / completion control. 
+ +Brunch is strongest while certainty is still being formed: when the real work is clarifying the target, surfacing commitments, and making unresolvedness legible before downstream implementation decomposition takes over. Its output is a calibrated handoff, not fake closure — a truthful starting point for implementation that makes visible what is known, chosen, constrained, required, and still open. Export is therefore built from the active path's accepted review outputs plus reviewed knowledge, not from laundering unresolved uncertainty into a prematurely final document. + +The product direction is from **planning specs** toward **intent specs**. Planning and downstream work sequencing remain useful projections, but Brunch's source artifact should preserve meaning first: what the user commits to, what properties define correctness, which examples or counterexamples disambiguate the intent, which assumptions remain open, what evidence has been accepted, and where ambiguity is explicitly unresolved. Because future agent features and post-spec handoff flows should consume the graph rather than a single transcript, Brunch needs explicit prompt/context engineering: scenario-specific graph context packs, reusable prompt doctrines, and lightweight prompt probes before UI surfaces are committed. + +Brunch operates inside a **workspace**: the cwd-backed software context whose local `.brunch/` directory stores one or more specifications. Grounding supports two strategies: **elicitation-first** for greenfield work and **analysis-first** for brownfield work. Brownfield grounding begins with read-only workspace analysis that produces a visible preface card (grounding brief), and the interviewer may gather more context via preface cards in any phase when it needs orientation. + +Post-launch, Brunch should support specification work across two axes rather than one: `greenfield <> brownfield` and `end-to-end build <> incremental feature`. 
That means the interview cannot assume one long whole-product drill-down. It should be able to start broad, deepen recursively where needed, synthesize candidate directions when the user wants help filling in the gaps, and let the intent graph itself become a working surface for refinement instead of only a sidebar summary. + +## Constraints & Non-goals + +- Anthropic direct is the current runtime implementation; near-term provider work may add OpenRouter or provider-neutral routing, but Brunch remains user-supplied-key / no hosted inference account for now. +- No collaborative editing. +- No explicit document-ingestion UX in V1. +- No hard turn-tree branching UX in V1; revisit operates through knowledge-graph edit mode + secondary threads instead. +- No automatic cascade deletion; downstream effects are surfaced and re-resolved explicitly. +- No task-planning surface; Brunch elicits specs, it does not plan implementation work for the user. +- No downstream execution-management workflow in V1; Brunch ends at the handoff boundary rather than owning implementation after export. Verification-aware decomposition and orchestration are a future product frontier to probe through agent-harness experiments before any UI commitment. +- No general-purpose inline document editor in review phases; requirements and criteria review stay recommendation-led with lightweight user comments for revision. +- No offline-first or multi-tab sync layer; the current system stays server-authoritative and local-first. + +## Requirements + +1. `npx brunch` in a project directory with configured supported LLM provider credentials opens a working app in the browser with state in local `.brunch/`. +2. Starting a new specification asks only for the specification name before entering the workspace; greenfield / brownfield grounding strategy is then chosen through grounding entry states inside the specification workspace. +3. 
Brownfield grounding can use read-only workspace analysis to ground the opening flow and the first substantive question. +4. Structured responses support turn-appropriate option selections or explicit action submissions, an explicit `none of the above` path where relevant, and one attached response note. The interviewer autonomously chooses whether to include options on each question based on conversational trajectory; grounding accepts either a free-text response or one-or-more selected options, with the response note optional when an option is selected and required only for the `none of the above` path. Design preserves the current selection-required gate with a structural "none of the above" path. A single turn may carry multiple assistant-part artifacts (e.g. a preface card followed by a question card, or a revision card followed by a review set) rendered as stacked cards with one unified response submission. +5. Users can see thinking, tool usage, and streaming progress in real time; if live-only artifacts are shown, replay keeps concise durable activity metadata (at minimum elapsed thinking time plus a coarse tool-use summary / placeholder seam) instead of dropping them completely. +6. The observer extracts typed intent items and intent edges from answered turns. +7. The accumulated knowledge layer and readiness state stay visible during the interview. +8. Each workflow mode has deterministic closeability plus a separate readiness signal. +9. Phase close records summary text and closure basis. +10. Users can revisit knowledge through edit mode, cascade preview, and a secondary thread. +11. Requirements review synthesizes a candidate requirement set from the knowledge layer, presents stable item reference codes, supports per-item commenting through an inline comment toggle on each item, and resolves through explicit `accept review` / `request changes` submission with per-item comments plus one optional global review note. +12. 
Criteria review synthesizes a candidate verification set from accepted requirements plus the knowledge layer, presents stable item reference codes, and supports the same per-item commenting and full-set review seam. +13. Export is available only when workflow closure, accepted review outputs, and staleness rules are satisfied. +14. Closing and reopening the browser resumes the specification from persisted state. +15. The dashboard shows multiple specifications / elicitation runs within one `.brunch/` directory. +16. Partial-scope elicitation works for a feature or bounded sub-area, not just whole-workspace greenfield specs. +17. Each phase exposes an explicit kickoff, frontier, recovery, handoff, or completion affordance; the UI must not strand the user with a bare generic composer as the only visible action. +18. Open interview phases default to a projected kickoff card, the current frontier turn, a visible generation state, or a projected recovery affordance when the frontier is missing, and closed phases terminate in a projected handoff or completion artifact at the bottom of the workspace stream. +19. The first phase is grounding in both product language and canonical workflow identifiers. +20. The interviewer may invoke context-gathering capabilities such as workspace analysis in any phase when the workspace directory is available; their outputs appear as visible preface cards paired with question cards within the same turn. +21. Preface cards are provisional context rendered as turn-internal artifacts paired with a question card within the same turn, so the observer captures from the whole validated unit (preface context + question + user response) rather than from unvalidated provisional content alone. +22. 
Grounding and elicitation persist only the durable exploration ontology (`goal`, `term`, `context`, `constraint`, `decision`, `assumption`); `non-goal` is represented as a `constraint` subtype, and requirements / criteria become durable only through accepted review outputs. +23. The knowledge ontology is defined once and projected consistently through schema, shared registries, observer prompts, API types, fixtures, and UI copy so kind semantics do not drift across layers. +24. Each phase section in the workspace stream opens with a phase section header that states the phase purpose and what kinds of knowledge are captured there, projected from workflow state rather than persisted as a turn. +25. When a user requests changes on a review set, the interviewer regenerates the full set as a successor review turn; revisions stack in the turn lineage but visually only the current revision renders live with a version badge, while prior revisions collapse to compact answered-turn summaries. A revision card (changelog + version badge) renders above the review set card within the same successor turn. +26. The homepage surfaces workspace (CWD) binding so the user understands that listed specifications and the "new specification" affordance are scoped to the current project directory. +27. The grounding interviewer prompt uses a hint-guided priority-ordered topic list (concept, users/audience, existing constraints, scope boundaries) with example question shapes rather than generating questions from scratch, keeping thinking budget low and generation lightweight. +28. Observer capture treats the full turn — including any turn-internal preface card or revision card plus the question or review set plus the user response — as one atomic validated unit for knowledge extraction. +29. 
Grounding captures both workspace novelty (`greenfield` / `brownfield`) and delivery posture (`end-to-end build` / `incremental feature`), and interviewer behavior adapts to any point in that matrix rather than assuming a whole-product greenfield interview. +30. Observer extraction treats typed relationships as first-class across the ontology and records them whenever they can be reasonably traced from a turn or accepted review state, while abstaining when support is weak. Relationship extraction must stay prompt-budgeted: existing entities should be presented as compact identity anchors, not full Markdown inventories or graph dumps. +31. Users can request a turn-owned candidate-spec set during grounding or design instead of only skipping the remainder of a phase; each candidate direction includes implications, tradeoffs, likely generated knowledge, and what it rules out, and the user can accept a direction, request refinement, reject, or regenerate candidates. Accepting a candidate direction may steer the next interview move and materialize intent items, but does not itself close the phase. +32. Interview detail can proceed as a progressive broad-pass-to-detail flow with explicit `next level of detail` actions, rather than only as one monolithic linear drill-down. +33. Graph view is a first-class alternative to chat view, accessed as a peer route, and projects the intent graph as a navigable workspace with visible relationship topology and supports launching refinement side-chats from graph selections. The first ship is a structured-list layout; a spatial canvas layout follows as a layout switch inside graph mode. +34. First-run setup detects missing expected LLM provider credentials before the user starts a specification, makes the missing-key state visible on the dashboard, and offers a guided setup path rather than requiring README / shell-env debugging. +35. 
If Brunch accepts an API key through the UI, it stores credentials outside the project workspace in XDG-compliant user auth/config state; project `.env` files and `.brunch/` never become the default secret-storage target. +36. LLM provider configuration is owned by a shared AI runtime provider seam, so interviewer and observer model creation do not encode direct provider imports or environment-variable reads as product truth. That seam must preserve provider-specific capabilities such as Anthropic thinking / reasoning options or degrade them explicitly. +37. Workspace hygiene detects whether the local `.brunch/` directory is git-ignored and, with explicit user confirmation, can add an idempotent `.gitignore` entry, creating `.gitignore` when absent. +38. The product ontology should expand beyond the current exploration + review kinds to support `invariant` and `example` as first-class durable knowledge kinds, with observer prompts and promotion rules that distinguish descriptive context, constraints, decisions, assumptions, requirements, invariants, criteria, and examples without treating every answer as a decision. +39. Specifications can own multiple durable chat containers below the specification, with turns gradually moving toward chat ownership while preserving current spec-scoped compatibility during transition. The same substrate records directed `reconciliation_need` process debt when changed intent items may affect other graph truth; semantic intent edges remain separate (currently persisted as `knowledge_edge` rows during transition). +40. Prompt and context engineering are first-class server subsystems: prompts and reusable policy doctrines live as inspectable markdown assets, while typed context-pack builders derive scenario-specific intent-graph renderings for interviewer, observer, research, candidate synthesis, behavioral kernels, reconciliation, architect, and downstream decomposition probes. +41. 
Agent-heavy future capabilities can be tested before product UI exists through a lightweight scenario substrate that runs prompt/context packs against seeded graphs or transcript fixtures, captures raw and structured outputs, and supports harness comparison. Scenario execution may use the existing Anthropic API key or fake adapters for probes, but first-run provider setup, credential storage, OpenRouter defaulting, and the shared production AI runtime seam belong to the provider setup frontier. Pi may be evaluated as a lower-level agent harness, especially for tool experiments and pre-UI probes, but Brunch product authority over durable workflow, replay, graph mutation, and reconciliation remains explicit. +42. Agent-originated mutations of Brunch data use one typed server-owned mutation surface regardless of caller. Internal interviewer/observer flows, scenario probes, CLI/TUI harnesses, Pi or other harness adapters, and future external agents may not mutate durable Brunch state by calling the ORM directly; they must invoke stable mutation handlers with input/output schemas, authority metadata, replay policy, and reconciliation/changeset-ledger semantics. Read-only capability contracts may share the same registry shape, but the hard invariant is single-entry mutation authority. +43. A local agent capability CLI can expose Brunch-owned capability contracts over long-lived JSONL stdin/stdout so an external probe runner or harness can drive the real specification flow without privileged ORM access. The CLI is an adapter over capability contracts, not a separate product API: calls carry explicit resource identifiers, read commands distinguish structured `get` / `list` data from agent-facing `read` projections with affordance hints, and mutating commands stay small and procedural around spec lifecycle requests, chat readiness, and turn response submission. 
The LLM-as-user scenario brief, model choice, fixture curation, and probe artifacts belong to an external probe runner that talks to the CLI like any other agent. +44. Specifications can evolve through multiple chat-local strategies rather than one global interviewer mode. A chat's first frontier turn may offer or declare its strategy (`step_by_step`, `scenario_options`, `targeted_cases`, `graph_review`, `reconciliation`), and every active/resumable chat should have at most one open assistant/system-first frontier turn waiting for a user completion action. Proposal turns use normalized completion semantics (`accept`, `reject`, `revise`, `ask_followup`, `defer`, `regenerate`); only acceptance of a proposal turn may apply that proposal's semantic changeset. Mid-interview acceleration should branch into a side-chat / strategy chat that completes the current direction from context-packed graph truth, while graph-review critique remains the internal oracle for judging and repairing generated candidate bundles. 
+ +## Assumptions + + + +| # | Assumption | Confidence | Status | Depends on | Validation approach | +| --- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | ------ | ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| A15 | The LLM can offer useful coarse readiness and closure recommendations, but closure authority must remain explainable and user-legible rather than model-owned. | medium | open | D65, D66 | Manual comparison of model recommendations vs user judgment across varied projects. | +| A20 | Users experience observer capture as responsive when every eligible answered turn enters one turn-owned background capture backlog instead of blocking chat stream completion, while replay still attaches capture status and results to the originating turn. | medium | open | D22, D96, D113, D123 | Measure stream completion timing, backlog draining, and replay clarity across grounding, design, requirements, and criteria turns. | +| A48 | Knowledge-graph edges are sufficient to drive accurate cascade preview for revisit work. | medium | open | D50, D80 | Structural cascade tests plus manual judgment about scope. | +| A49 | A modal secondary thread can resolve revisit implications without forcing a full interview restart. 
| medium | open | D80 | Manual revisit walkthrough once the thread lifecycle lands. | +| A51 | Grounding plus design remain legible if the primary input surface is the workspace-owned card family — durable turn cards for substantive elicitation plus projected control cards for structural affordances — rather than a persistent global composer. | medium | open | D89, D93, D94, D110 | Manual walkthroughs on grounding, design, and resumed states plus story review of entry / handoff patterns. | +| A53 | Concise durable activity summaries are sufficient to preserve transcript trust for live thinking/tool artifacts without persisting hidden reasoning or raw tool results. | medium | open | D93, D112 | Manual replay/reload walkthroughs on streamed turns once transcript activity summaries land. | +| A54 | An open phase can reliably project a kickoff control card, current frontier turn, visible generation state, or projected recovery card on first render without requiring the user to bootstrap the phase by typing into a generic composer. | medium | open | D89, D94, D95, D110 | Manual walkthroughs on kickoff-ready, design-active, review-active, and recovery states. | +| A55 | Trailing observer capture remains trustworthy if waiting/applying state stays attached to the answered turn and deferred completion writes back through that turn's identity rather than the current frontier. | medium | open | D96, D113, D123 | Manual timing walkthroughs plus reload/resume tests on seeded turns with known deferred observer work. | +| A57 | A specification-scoped lifecycle seam — whether implemented as a lightweight runtime supervisor, router-integrated service, or chart-backed helper — can own duplicate-safe automatic phase entry / continue, late-event suppression, and route-independent in-flight operation identity without introducing a second durable workflow model or a general runtime-operations ledger. 
| medium | open | D113 | Prototype the lifecycle seam on auto-present / recovery / force-close edges; if duplicate-submit or restart truth remains ambiguous, revisit whether the seam needs stronger runtime machinery or more durable coordination. | +| A58 | A cumulative workspace can preserve phase legibility and workflow honesty if realized sections stay visible as historical record, future sections do not render until reachable, and section focus remains navigation-only state rather than redefining durable workflow truth, reachability, or the single actionable frontier. | medium | open | D86, D110, D113, D114 | Prototype the cumulative workspace against future-phase deep-link redirects, scroll/focus transitions, close-to-next-phase motion, and resume/reload walkthroughs; if unrealized-phase routing or single-frontier clarity drifts, keep the current per-phase rendering boundary. | +| A59 | Interviewer-autonomous question format — where the model chooses whether to include options based on conversational trajectory rather than rigid phase rules — produces better grounding conversations than mandating free-text-only, because the interviewer naturally starts open-ended and adds suggestive options as the user's thinking narrows. The observer can interpret option selections phase-appropriately (resonance in grounding, commitment in design) without schema changes. | medium | open | D89, D110, Requirement 4 | Manual walkthroughs across greenfield and brownfield grounding comparing interviewer-chosen format vs phase-mandated format; check whether observer captures stay coherent when the same selection structure carries different semantic weight by phase. | +| A60 | A concise phase section header (purpose + captured knowledge kinds) is sufficient to orient the user at phase entry without requiring a longer onboarding flow or tutorial card. | medium | open | D116 | Manual walkthroughs on fresh specifications; check whether users understand what the phase expects of them. 
| +| A63 | Hint-guided grounding prompts produce meaningfully adapted questions rather than degenerating into rote template output across different projects. | medium | open | Requirement 27 | Manual greenfield walkthroughs across varied project types; compare question quality against the current unconstrained prompt. | +| A64 | Replacing coarse `router.invalidate()` with query-owned invalidation boundaries eliminates the scroll-jank cascade without introducing coordination complexity or stale-data bugs; the near-term boundary may be one specification bundle domain plus a separate entities domain rather than a fake finer split. | medium | open | D121 | Prototype the staged bundle + entities decomposition and measure scroll stability plus data freshness during observer updates. | +| A65 | The interviewer can adapt usefully to the full `greenfield <> brownfield` by `end-to-end build <> incremental feature` matrix without making kickoff feel bureaucratic or over-parameterized. | medium | open | D124 | Manual walkthroughs across all four corners of the matrix, including partial brownfield feature work. | +| A66 | Relation-first observer capture will improve revisit, export grounding, and graph-view utility without flooding the graph with speculative or low-value edges. | medium | open | D50, D125 | Run post-FE-639 observer corpus probes plus manual graph/export review focused on edge precision, coverage, and visible usefulness. | +| A67 | Users who are tired, rushed, or under-informed will converge faster by reacting to synthesized candidate directions than by continuing a long direct interview or force-closing early. | medium | open | D126, D127 | Manual user-flow comparison between direct questioning, skip-close, and candidate-spec reaction flows. | +| A68 | Broad-pass interviewing followed by explicit deepen-detail actions will preserve coherence better than a single depth-first drill-down while still producing export-worthy specifications. 
| medium | open | D127 | Prototype broad-pass-first flows and compare resulting knowledge completeness and user comprehension. | +| A69 | A graph-centric refinement surface can launch side-chats without splitting durable specification truth, so chat view and graph view stay two projections over one evolving graph. | medium | open | D128, D114 | Prototype graph-launched refinement with reload/resume checks to ensure side-chat state and graph state stay coherent. | +| A70 | The structured-list graph-view layout provides standalone enumeration value beyond relationship density: users benefit from seeing all intent items grouped by kind even when most have no edges yet, and graceful degradation (collapse the relations footer when zero edges) keeps the view honest while relation-first observer capture matures. | medium | open | A66, D128, D129 | Manual walkthroughs at low and high edge density once the structured list ships; check whether the layout still feels valuable when most items have empty relations footers, and whether observer-density growth visibly improves the view over time. | +| A71 | Semantic mutations will eventually need a changeset-ledger history distinct from conversational turn ancestry, but the first implementation should prove chat containers and reconciliation needs before committing the full ledger shape. | medium | open | D135 | Build chat containers plus reconciliation needs first; revisit whether turn-linked provenance remains sufficient before adding full semantic changesets. | +| A72 | Intent items can carry version history without breaking the active-path durable-truth contract: each version is the result of an applied semantic mutation, prior versions are queryable for diff / comparison / audit, and the active-path projection always reflects the latest version for each item. 
| low | future | A71, D135 | Prototype item versioning behind the changeset ledger; verify that revisit cascades, span-anchored annotations, and soft-edit audit trails behave correctly across versions. | +| A73 | Autonomous architect / generator loops can propose useful graph mutations only after human-driven multi-chat and reconciliation surfaces prove the shared mutation pipeline. | low | future | A71, D135 | Run architect proposals in shadow mode after multi-chat / reconciliation seams stabilize, then compare proposed changes against user-driven edits. | +| A74 | OpenRouter may reduce first-run friction for Brunch's likely users compared with requiring direct Anthropic keys, but model capability parity and AI SDK support need proof before making it the default provider path. This is provider-setup work, not a default FE-698 prompt/context substrate task. | medium | open | D130, D131 | In the first-run provider setup frontier, spike provider configuration against interviewer/observer calls, especially model naming, structured output, tool use, and reasoning/thinking support. | +| A75 | XDG-compliant user-scoped auth/config storage is acceptable for UI-entered API keys and safer than writing secrets to the project workspace, while environment variables remain useful for automation and CI. | medium | open | D130, D132 | Prototype key save/load/delete precedence and inspect OS/XDG paths; manual first-run walkthrough verifies users understand where the key is stored. | +| A76 | Users will accept Brunch editing `.gitignore` when the action is explicit, previewable, and idempotent; doing so should reduce accidental commits of `.brunch/` without feeling like surprising repo mutation. | high | open | D133 | Unit-test ignore detection / append behavior and manual dashboard walkthrough with absent, present, and already-covering `.gitignore` states. 
| +| A77 | Progressive checkability will improve generated specs more than a binary "formal / not formal" framing, because the weakest sufficient witness may be prose, example, test, runtime contract, invariant, proof obligation, or explicit unresolved ambiguity depending on the intent item. | medium | open | D134 | Prototype intent-item-to-witness review on a small corpus and compare whether users can validate meaning without being forced into formal-methods terminology. | +| A78 | Adding `invariant` and `example` as product ontology candidates will make intent drift easier to detect without overwhelming early interviews, provided examples carry subtypes such as positive, negative / counterexample, edge-case, and not-relevant rather than expanding into many top-level kinds. | medium | open | D134 | Run transcript probes for examples, counterexamples, not-relevant cases, and state/transition rules; check whether items improve export and review quality or create noisy capture. | +| A79 | Once semantic truth can change through graph edits, side-chats, reconciliation, verifier feedback, or implementation feedback, turn ancestry alone will be insufficient as the semantic history spine. | medium | open | D135 | Prototype chat containers and reconciliation needs before full patch history; revisit if turn-linked provenance remains enough for first-class graph editing. | +| A80 | Behavioral kernels can generate higher-yield disambiguating questions than generic elicitation prompts, but only if kernels stay as interviewer / architect / wizard machinery that emits checkable artifacts rather than user-visible formalism. | low | open | D134 | Try state/lifecycle and containment/topology prototypes first, and compare question value against current prompt-only interviewing. 
| +| A81 | Knowledge edges can carry intent semantics without becoming too noisy only if relation policy distinguishes semantic relations from reconciliation needs, and distinguishes display edges, cascade-participating edges, export-relevant edges, staleness-producing edges, and low-confidence suggestions. | medium | open | D137 | Design relation-policy semantics before broad observer edge expansion; test low- and high-density graphs for user trust and operational noise. | +| A82 | A soft dual-pointer migration can introduce chat containers without destabilizing current spec-scoped reads: `turn.specification_id` and `specification.active_turn_id` can remain temporarily while `turn.chat_id`, `specification.primary_chat_id`, and `chat.active_turn_id` become the future ownership path. A separate `active_chat_id` is deferred until multiple active chat surfaces need an explicit UI-level pointer. | medium | validated | D138 | Validated by FE-697: `chat-substrate.test.ts` proves read-path equivalence (`spec.active_turn_id === spec.primary_chat → chat.active_turn_id`) and parent-chat consistency; `npm run verify` shows no regression in the existing interview flow. | +| A83 | A minimal item-to-item `reconciliation_need` table is enough for the first queue if it carries narrow kind/status values plus nullable provenance placeholders, and if future relation targets / changeset provenance can extend the shape without renaming the concept. | medium | validated | D137, D138 | Validated by FE-697 for the Phase 1 substrate: `reconciliation-need.test.ts` proves the partial unique index, lifecycle, cascade, and multi-kind-per-pair behaviors. Forward extensibility (changeset provenance, relation-targeted needs) remains untested until the changeset ledger lands. | +| A88 | Path 1 deterministic enumeration over existing `knowledge_edge` rows incident on a changed knowledge item produces a useful cascade preview without requiring the reconciliation agent. 
Mechanical grouping by `reconciliation_need.kind` plus relation type is enough for the user to walk a hard-edit cascade in V3.0; agent-grouped resolution (auto-confirm / auto-edit / substantive) is V3.1 work that can ship later without reshaping the queue. | medium | open | D135, D137, D138, D146 | Manual hard-edit walkthroughs across the side-chat V3.0 fixture matrix (leaf, 2-downstream, 5+-downstream, in-active-review-set, mixed `supersedes` / `needs_confirmation`). Check whether the mechanical grouping reads as actionable or whether substantive items get lost. If users skip needs without resolving, V3.1 agent work moves up the priority list. | +| A84 | Scenario-specific graph context packs can replace transcript-as-default prompt context without losing conversational nuance, provided packs preserve authority, provenance, unresolvedness, relation neighborhoods, and recency where relevant. | medium | open | D139, D140 | Build prompt/context probes over seeded graphs and compare generated observer, interviewer, candidate, and oracle/decomposition outputs against transcript-heavy baselines. | +| A85 | A lightweight prompt scenario substrate will let Brunch validate LLM-heavy product directions faster than building UI first, if it captures rendered prompts, context packs, model settings, raw outputs, structured parses, and human review notes as repeatable artifacts. | medium | open | D139 | Run multi-scenario prompt probes for observer ontology, behavioral kernels, candidate-spec assist, and downstream oracle/decomposition before productizing their UI. | +| A86 | Pi can serve as a useful pre-UI agent harness or tool-spike backend without forcing Brunch to adopt Pi as its production agent runtime, as long as integration remains adapter-shaped and Brunch-owned authority/replay/mutation semantics stay outside the harness. 
| low | open | D142 | Spike Pi SDK or RPC with in-memory sessions, custom tools, controlled prompts, and Brunch graph context packs; evaluate event capture, tool ergonomics, provider handling, packaging, and isolation. | +| A87 | Verification-aware post-spec decomposition can be explored as agent scenarios before it is a Brunch product surface: intent graph truth plus progressive checkability can feed design alternatives, oracle strategy, execution slices, and orchestration constraints. | low | future | D141 | Prototype decomposition and oracle-design probes inspired by `ln-design` and `ln-oracles`; compare outputs for traceability to requirements, invariants, examples, criteria, and blind spots. | +| A89 | A long-lived local JSONL agent capability CLI can drive the real Brunch interview flow well enough for external LLM-as-user probes to produce credible completed specification fixtures, while keeping product resources explicit in every call and using ambient process state only for runtime plumbing such as DB handles, provider config, and in-flight generation bookkeeping. | medium | open | D143, D147, Requirement 43 | Prototype the minimal `brunch agent` JSONL loop over capability contracts, then run small LLM-as-user scenarios end-to-end through `chat.ensureReady`, `chat.read`, `turn.submitResponse`, `spec.requestPhaseClosure`, and `spec.requestExport`. Validate that probe logs are replayable, no probe code imports DB/product handlers directly, and no durable operation ledger is needed for the first readiness semantics. | +| A90 | Users who ask to speed up a long interview will prefer a side-chat that generates 2–3 reviewed scenario options completing the current direction over continuing the primary drilldown, provided existing accepted graph truth is treated as fixed premise by default. 
| medium | open | D126, D148, D151, Requirement 44 | Probe scenario-options against drilldown fixtures and run manual flow review: do users understand the tradeoff profiles, preserve trust in prior answers, and return to the primary interview when generated options disappoint? | +| A91 | Graph-review critique can make scenario-generated candidate bundles safe enough for product use without requiring perfect one-shot generation, if candidate readiness distinguishes `reviewed_clean`, `reviewed_with_issues`, and `blocked`, and if accepted-with-issues immediately opens durable follow-on review work. | medium | open | D151, D152, Requirement 44 | Run candidate bundle probes with graph-review scoring and human review; verify accepted-with-issues flows create a graph-review frontier or appropriate reconciliation needs rather than hiding defects. | +| A92 | A conservative global staleness rule for open proposal turns — stale when `specification.latest_changeset_id` differs from `turn.opened_at_changeset_id` — is acceptable before neighborhood-level staleness calculation exists. | medium | open | D149, I116 | Exercise multi-chat proposal flows where another chat applies a changeset while a proposal remains open; check whether regeneration prompts feel safe rather than noisy. | +| A93 | Relation-policy directionality lookup is safer than trying to force all useful intent-edge verbs into one dependency direction, because graph edges must serve display, context packs, export trace, reconciliation, critique, verification, candidate generation, and explanation. | medium | open | D137, D150 | In FE-700, define canonical/inverse sentences and source/target change behavior for each relation; test direct-edit and hard-impact cascade against mixed-direction relations. | + +## Decisions + + + +22. 
**Observer-result sync is turn-owned and background by default** — eligible answered turns enter one turn-owned observer capture backlog after durable turn finalization, and chat stream completion must not wait on extraction. Capture may still surface results through the existing turn-owned `data-observer-result` artifact once available, but the runtime path should be normalized around the `/api/specifications/:id/turns/:turnId/observer-capture` seam rather than split between inline stream-blocking capture and deferred capture. This preserves one durable workflow model: durable truth remains the answered turn plus any persisted observer result part, not a separate workflow store or ledger. + +50. **Knowledge relationships live behind one typed graph seam** — persisted graph edges are first-class and drive dependency, derivation, and revisit behavior. +65. **Phase outcomes are explicit durable records** — workflow status, closeability, readiness, and closure provenance project from durable phase outcomes on the active path. +66. **Interviewer-recommended and user-forced closes share one transcript-friendly seam** — one phase-close transport handles both paths, with explicit closure basis. +80. **Intent-graph revisit replaces hard turn-tree branching for V1** — revisit starts from edit mode on intent items, traces cascade through intent edges, and resolves through a secondary thread. **Updated 2026-05-07 (D135):** the older modal secondary-thread and side-chat V2/V3 persistence shapes are superseded by the multi-chat + reconciliation-need direction; the user-facing revisit/cascade goal remains live. **Chat-level branching note:** the no-turn-tree-branching invariant remains in force at the *turn* level, but multiple chats per spec are explicitly allowed at the *chat* level once the multi-chat substrate lands. 
Branching at the chat level is not user-surfaced as a generic `branch this thread` affordance by default; it manifests through graph-anchored refinement / reconciliation surfaces. +86. **The client is organized by phase-addressable routing and three concentric layout shells** — AppLayout, SpecificationWorkspaceLayout, and ViewLayout own the user-facing route structure. Interview phases remain router-addressable for deep links, gating, and sibling route composition even if the center pane later renders them inside one continuous workspace surface. +87. **Layout-level data ownership partitions invalidation** — the specification bundle and entity collections subscribe through separately owned query domains / route surfaces instead of one monolithic refresh boundary, so entity refreshes do not remount or tear down the transcript-owning surface. +89. **Primary grounding/design input is workspace-owned and card-owned** — substantive elicitation in grounding and design proceeds through durable turn cards inside the workspace stream, while structural phase-entry, recovery, and handoff affordances project as control cards in that same stream; the global bottom composer is not the canonical input seam. Preface cards accept optional comment + continue, while question cards collect substantive answers. Depends on: A51. Supersedes: —. +93. **Replay for elicitation phases is turn-centered, not message-shaped** — completed interview turns collapse into answered-turn records that summarize the offer, the structured user response, and the capture status, while phase markers, projected control cards, and activity cards render as stream elements around those turns rather than as ordinary chat bubbles. Depends on: A51, A53, D110. Supersedes: —. +94. **Phase progression is frontier-anchored** — every open phase bottoms out in exactly one visible next action: a projected kickoff card, actionable frontier turn, visible generation state, or projected recovery card. 
Accepting a frontier turn durably creates its successor turn, successor generation avoids closed-without-frontier gaps, and recovery is a structural fallback that appears whenever an open phase lacks a valid frontier rather than another generative turn that must itself be created. Closure proposals remain durable proposal-shaped turns on the active path; accepting one confirms phase closure and opens the next phase into its projected entry state, while rejecting one keeps the phase open and requires a same-phase successor frontier. If a phase is closed, the stream bottoms out in a handoff or completion control. Depends on: A51, A54. Supersedes: —. +95. **Structural control affordances project from workflow state rather than masquerading as ordinary turns** — kickoff, recovery, and end-of-phase affordances derive from workflow state, phase outcomes, and neighboring turn anchors instead of from incidental copy or mandatory durable turn rows. Any durable implementation seam used to help project them must be treated as transitional and must not redefine their product meaning as authored conversational turns. Depends on: D65, D94, D110. Supersedes: `why`-based kickoff/recovery sentinels and the earlier persisted-turn-kind framing. +96. **Observer capture trails interviewer progression through one turn-owned backlog** — interviewer completion may unlock the next turn before observer capture finishes, and that should be the default lifecycle for all eligible phases rather than a grounding/design exception. Any trailing observer state remains attached to the just-answered turn card instead of surfacing as a free-floating transcript row; observer-result transport carries the originating turn identity so late capture can hydrate back into that same card. Depends on: A20, A53, A55. Supersedes: inline stream-blocking observer capture as a normal completion path. + +110. 
**The workspace stream is a merged read model, not identical to the turn tree** — active-path durable conversational turns remain the only branch-bearing lineage spine; durable non-turn workflow facts such as phase outcomes anchor themselves to turn ids for provenance, ordering, and invalidation; projected control cards, phase markers, and activity cards derive from workflow state plus nearby anchors instead of requiring their own turn rows. Depends on: D65, D89, D93, D94, D96. Supersedes: the implicit equivalence between rendered cards and persisted turns. + +111. **The app is seed-first and migration-light until the data model settles** — prioritize one truthful read-model contract plus up-to-date seeded scenarios over compatibility for legacy local rows. Durable authority comes from active-path substantive turns, `phaseOutcome`, workflow state, and the current canonical record/phase identifiers; projected kickoff / recovery / handoff affordances must be derived from those facts rather than preserved as canonical control-turn rows. Transitional seams may survive briefly as internal submit plumbing, but new server reads, client renders, fixtures, and happy-path tests must not depend on legacy aliases or adaptation layers as product truth. When a naming or persistence cutover lands — including `project` → `specification` and `scope` → `grounding` — destructive reseed is preferred over spending time on migration logic for unstable local data. Depends on: D95, D110. Supersedes: the implicit bias toward preserving legacy control-row compatibility during the cutover. + +112. **Turn-artifact persistence is server-owned and interviewer-shaped** — durable review-set, preface-card, activity-summary, and phase-summary artifacts materialize from interviewer output through one server helper, so the chat-runtime finalize path acts as orchestration glue instead of reconstructing artifact semantics ad hoc. 
Replay, accepted-review materialization, and seeded walkthroughs therefore consume the same persisted artifact contract the interviewer produced. Depends on: D93, D96, D110. Supersedes: the ownership split where runtime finalization re-derived grounding/review artifacts outside one authoritative persistence seam. + +113. **Phase lifecycle side effects are specification-scoped, not route-scoped** — durable workflow truth, landing reconciliation, and routed read-model projection remain authoritative; they do **not** move into a second client-side workflow store. The router continues to own navigation, loader/query subscription, and rendering of the derived read model. A separate specification-scoped lifecycle seam owns only the ephemeral process concerns that routes are poor at holding correctly: one-shot automatic phase entry / continue, in-flight operation identity, duplicate-submit suppression, cancellation, stale-event rejection, and capture-backlog reseeding after hydration. That seam may be implemented as a lightweight runtime supervisor, router-integrated service, or chart-backed helper, but its implementation is intentionally left open; what is decided here is the ownership boundary, not a mandatory framework. Constraints: (1) no second durable workflow model or general runtime-operations ledger by default, (2) no independent client authority over phase status, landing truth, or handoff/completion semantics, (3) no route-local `useEffect` or remount-tied behavior as the trusted owner of lifecycle effects like auto-present, and (4) any lifecycle helper must consume durable truth and emit idempotent, ignorable side effects rather than redefine product state. Depends on: D87, D94, D95, D96, D110, D112. Supersedes: route-local auto-present / continue effects as a trusted lifecycle seam. + +116. 
**Each phase section opens with a projected phase section header** — a non-turn, non-durable stream artifact that states the phase purpose and what kinds of knowledge are captured there. The header is projected from workflow state and phase metadata (similar to phase markers) and re-projects on hydration. Content is phase-specific: grounding explains goals/terms/context/constraints, elicitation explains design decisions, requirements explains review, criteria explains verification. Depends on: A60, D110. Supersedes: —. + +121. **Client data ownership migrates from coarse loader invalidation to query-owned domains** — the near-term authoritative boundary is one specification bundle seam for workflow state, landing state, and turns, plus a separately invalidable entities domain scoped to the specification. Mutations and SSE events invalidate only the owned query key. `ask_question` tool execution persists the frontier question/options, advances the active head, and returns the acknowledged turn id; the client may then patch the bundle query cache from that tool output while the following bundle fetch remains authoritative reconciliation. The router loader becomes a thin shell that primes or guards those domains instead of owning the read model, and finer core/turn split work waits for a real server ownership boundary rather than a fake cache-key split over one payload. Depends on: A64, D87. Supersedes: monolithic `router.invalidate()` after every mutation. + +123. **Runtime proving uses a lightweight lifecycle seam with observer backlog, not a second workflow store** — every eligible answered turn should be able to unlock successor interactivity as soon as interviewer generation is durably ready, while observer capture for the answered turn runs afterward through a turn-owned `/api/specifications/:id/turns/:turnId/observer-capture` seam. 
The client lifecycle may keep only ephemeral capture state (`waiting`, `applying`, `retry/backlog identity`) and reseed unfinished capture from durable turns after hydration/reload; durable authority remains the persisted turn plus its observer result part. Current constraint: server-side dedupe is process-local, so restart recovery depends on reseeding from turns that still need observer capture rather than on a durable runtime-operations ledger. Depends on: D22, D96, D113. Supersedes: the mixed inline/deferred observer finish boundary. + +114. **Continuous workspace rendering and phase addressability are separate concerns** — the interview center pane may render one cumulative workspace stream whose realized grounding, design, requirements, and criteria sections remain visible as the workflow advances, while the router continues to preserve deep links, gating, and sibling-route composition. A workspace-level controller may own one chat session, cross-section projection, focus / scroll behavior, and close-to-next-phase motion without turning focus state into a second durable workflow model. Phase routes act as focus addresses into that shared surface rather than distinct transcript owners: navigating to a realized phase focuses and scrolls to its section, while direct navigation to an unrealized future phase redirects to the current reachable phase instead of rendering placeholder content.
Constraints: (1) one chat runtime per specification, not one per rendered phase, (2) only realized sections render in the cumulative center pane, so future phases do not project empty shells before they become reachable, (3) exactly one actionable frontier remains at the bottom of the current reachable section while prior sections are replay-only record, (4) focused section state must not redefine durable workflow truth or landing truth, (5) graph view is a peer route to the cumulative workspace surface — phase-independent, accessed via direct navigation, but rendered inside the outer specification shell so phase-sidebar continuity and top-bar consistency remain, and (6) output remains a separate route because it is not part of the interview timeline. Depends on: A58, D86, D87, D110, D113. Supersedes: the assumption that each phase route must own a distinct rendered transcript surface. + +124. **Interview framing is two-axis, not novelty-only** — the interviewer should orient itself with both workspace novelty (`greenfield` / `brownfield`) and delivery posture (`end-to-end build` / `incremental feature`). Partial-scope work is therefore a first-class interview shape rather than just a greenfield/brownfield footnote. Depends on: A65. Supersedes: the implicit single-axis framing around grounding strategy plus partial-scope support. + +125. **Observer capture is a prompt-budgeted graph-delta seam** — `runObserver()` remains the public capture boundary, but its internal output includes a generic graph delta: per-kind item collections plus a compact top-level relationship-candidate set that can reference existing entities by `knowledge_item.id` and same-turn provisional items by `{ kind, index }`. Existing-entity identity is the database id; any kind metadata in prompts is only display/validation context, never part of the lookup key. 
The server owns provisional-reference resolution after persistence, validates candidate edges through one typed relation-policy registry, and writes only supported edges, preferring abstention over speculative edge inflation. Accepted review sets reuse the same relation policy when materializing requirements or criteria so review-authoritative entities can add edges without a second durable graph model. Observer prompts avoid full entity tables and existing graph topology by default; future enrichment should use compact anchor inventories, phase/relevance filtering, and corpus/manual graph review before adding more context. Depends on: A66, D50, D112, D123. Supersedes: the decision/assumption-only relationship extraction instruction in the current observer seam. + +126. **Recognition-first assists synthesize proposals through turn-owned candidate direction sets** — grounding, design, and future architect / wizard-style modes may project user actions like `fill in the rest for me`, compare broad directions, or propose typologies. These invoke interviewer-authored candidate direction set artifacts on ordinary durable turns rather than extending force-close semantics. Each set presents concrete options with implications, tradeoffs, likely generated knowledge, and what each direction rules out. The user responds through a structured reaction seam (`accept-direction`, `refine`, `reject`, or `regenerate`). Accepting a direction may materialize goals, constraints, assumptions, decisions, requirements, invariants, and examples as accepted or proposed-from-selection; rejecting a direction may still create intent evidence through negative / not-relevant examples, `non_goal` constraints, or `rules_out` relations. Criteria are generated later unless the selected bundle includes concrete witness cases. This reuses the existing turn-artifact / workflow model instead of adding a second durable workflow machine. Depends on: A67, A77, A78, D66, D94, D112, D134. 
Supersedes: skip-only close as the sole user-legible fallback for low-patience or low-information phases, and supersedes treating candidate-spec assist as only a phase-shortening tool. + +127. **Interview detail should flow through a turn-owned breadth skeleton and detail-focus reaction** — grounding and design may pair an ordinary question turn with an interviewer-authored breadth-skeleton artifact that makes the current broad-pass map visible, marks which areas are already sufficient for now, and offers explicit `next level of detail` affordances. The user reacts by choosing whether to deepen one named area now, continue the broad pass, or leave an area sufficient-for-now; that reaction steers the next same-phase frontier turn without creating a second durable workflow or topic-tree store. Recursive follow-up remains ordinary focused turns on the same active path, and any future chat or graph affordance should emit the same detail-focus intent against shared specification truth rather than inventing a parallel deepening model. Depends on: A67, A68, D94, D112, D113. Supersedes: the implicit assumption that every frontier turn advances by the same depth granularity. + +128. **Graph view becomes an actionable workspace mode through a projection-first, intent-emitting seam** — graph mode should project shared entity truth into a relationship-aware scene with visible edge topology and own only ephemeral graph-local interaction state such as viewport, selection, focus, and path highlighting. It must not create a second durable workflow or edit-state model. Node-level actions emit intents into the existing workspace lifecycle so refinement side-chats, revisit, and future edit flows still materialize through turn-owned artifacts, projected control cards, and the same durable specification truth used by chat view. 
The common-case interaction should optimize for `select node -> inspect -> launch focused refinement`, with broader multi-select or edit overlays layered on later. The first ship layout is a **structured list**: kind-grouped item rows with a relations footer (Outgoing / Incoming subsections of relation chips), `referenceCode`-based hash anchors for cross-item navigation, hover-card previews on chips, soft-truncation at 6 chips per direction, and a per-row action rail reserving one disabled `chat-with` placeholder for future intent emission. A **spatial canvas** layout follows as a layout switch inside graph mode without changing the projection seam or action contract. The empty state is a minimal orientation card linking to the current reachable phase rather than empty kind sections; an explicit `Back to chat` affordance returns to the user's last phase route. Depends on: A69, A70, D50, D80, D113, D114. Supersedes: graph view as a placeholder grouped list surface, and graph view as a sibling layout mode under `_view`. + +129. **Graph view's structured-list scope decouples data fetching from default render** — graph view always fetches the `whole-spec` entities so chip targets always resolve, but defaults the rendered row set to active-path items so toggling between chat view and graph view does not silently widen the user's working scope. A `Show all` toggle expands the rendered set to the full data already in memory; no second fetch and no scope-dependent loading. **Phased shipping:** the structured-list ship lands the whole-spec fetch portion of this contract first; the active-path render filter and `Show all` toggle ship in a follow-up frontier item (see `memory/PLAN.md` Horizon: *Graph view active-path filter + scope toggle*) once per-item active-path membership is exposed in the entities API or derived through a stable client-side seam. Until then the structured list renders all `whole-spec` items by default, so the toggle would be a no-op and is not surfaced. 
Depends on: A70, D87, D121, D128. Supersedes: render scope and fetch scope coupled through the existing `mode=active-path | project-wide` query parameter. + +130. **First-run setup becomes a product surface, not README-only configuration** — the dashboard should expose provider credential status before specification creation and route users toward setup when no supported key is available. CLI logs and README env instructions can remain, but they are no longer the only supported onboarding path. Depends on: A74, A75. Supersedes: `ANTHROPIC_API_KEY` in project `.env` as the sole user-facing setup contract. +131. **Provider access moves behind one AI runtime provider seam** — interviewer and observer model construction should consume a shared provider/model resolver instead of importing Anthropic directly and reading model environment variables in each caller. The seam may keep Anthropic as the current implementation while testing OpenRouter as the preferred onboarding provider, but provider choice must be explicit and testable. Depends on: A74. Supersedes: direct Anthropic imports in interviewer/observer code as product truth. +132. **UI-entered credentials are user-scoped auth state, not workspace state** — if the app collects an API key, it writes to an XDG-compliant user auth/config location, never to `.brunch/` or the project `.env` by default. Existing environment-variable configuration remains supported as an override path for scripted use. Depends on: A75. Supersedes: project-local `.env` as the only persistent setup mechanism. +133. **`.brunch/` gitignore support is confirm-gated deterministic workspace mutation** — Brunch may inspect the workspace repository and offer to add `.brunch/` to `.gitignore`, but it must not mutate repository files without explicit confirmation. The mutation should be idempotent, preserve existing file content, and create `.gitignore` only when the user accepts. Depends on: A76. 
Supersedes: relying solely on user memory / docs to ignore the generated workspace directory. + +134. **Brunch specs evolve toward recognition-first intent graphs with progressive checkability** — the product direction is to preserve meaning as typed intent items, semantic edges, examples / counterexamples, verification witnesses, unresolved ambiguity, and user validation status rather than treating the spec as a planning document or prose inventory. Requirements and criteria remain distinct product items for now: a requirement is a commitment and a criterion is an oracle / witness. `invariant` and `example` should become first-class product ontology kinds, with positive, negative / counterexample, edge-case, and not-relevant examples represented as subtypes rather than separate top-level kinds. A shared `Property`-like intent primitive remains a design candidate rather than a committed storage or UI surface. Behavioral kernels are hidden interviewer / architect / wizard machinery for surfacing latent state, containment, authority, concurrency, migration, and evidence questions while emitting the weakest useful checkable artifact for the intent item. Depends on: A77, A78, A80, D50, D125, Requirement 38. Supersedes: the implicit framing that requirements / criteria review is the terminal semantic model of product intent. + +135. **Semantic mutation history should split from conversational turn history when graph editing becomes first-class** — turns remain conversational provenance and replay; the intent graph remains current semantic truth; a future changeset ledger records semantic mutation history; and reconciliation needs record semantic debt caused by changes that may stale existing graph truth. 
The first implementation should follow the multi-chat substrate in D138: chat containers plus durable reconciliation needs before a full changeset ledger, keeping turn-linked provenance and legacy spec-scoped pointers as compatibility while making room for changeset-backed provenance later. User-direct-edit mode should be allowed to land a committed group of intent-item changes immediately, synchronously create reconciliation needs from existing dependency and historical relations, then queue an asynchronous observer pass that may immediately add newly implied intent edges and additional reconciliation needs as a later interpretive-structure changeset. That observer pass may not silently rewrite, retire, or weaken existing accepted intent; content changes that require judgment go through reconciliation review. This explicitly reshapes the older revisit-session draft: revisit / cascade remains a product capability, but `revisit_session` is no longer the preferred persistence foundation once multiple chats, direct graph edits, and reconciliation review sets are in scope. Depends on: A71, A79, D80, D110, D112, D125, D128, D134, D138. Supersedes: turn ancestry as the only plausible semantic history spine, and the `docs/archive/design/REVISIT_MODULE.md` table shape as canonical persistence design. + +136. **Observer ontology should classify intent items by modality, not answer shape** — observer capture should distinguish value / outcome items (`goal`), descriptive items (`context`), boundary items (`constraint`), uncertainty items (`assumption`), choice items (`decision`), obligation items (`requirement`), preservation items (`invariant`), oracle items (`criterion`), and concrete witness items (`example`). `Decision` should narrow to chosen directions among plausible alternatives with durable consequences; `constraint` should remain top-level but gain subtypes such as `non_goal`, `scope`, `technical`, `policy`, `resource`, `compatibility`, and `environmental`. 
Generic `context` should be promoted when the content carries stronger semantics: success condition -> requirement or invariant, solution boundary -> constraint, uncertain material belief -> assumption, chosen alternative -> decision, mere interpretation aid -> context. Depends on: D134, Requirement 38. Supersedes: treating all user commitments or selected options as decisions by default. + +137. **Intent edges are semantic relations, while reconciliation needs are process debt** — intent-item kinds say what semantic units exist; intent-edge kinds say how items justify, constrain, depend on, refine, illustrate, and verify one another. A negative example is intent content; a boundary relation such as `rules_out`, `excludes`, or `counterexample_for` is intent semantics; a `reconciliation_need` is directed process obligation saying existing semantic truth may require renewed judgment because a change, contradiction, verifier result, or historical premise may affect it. The observer and future graph tools should provide edge-local neighborhoods around active intent items, but not every inferred edge should drive cascade, staleness, export explanation, criteria generation, or reconciliation. Relation policy should classify edge support (`explicit`, strong inference, weak candidate) and operational participation before relation-first capture broadens beyond today's limited edge set. Observer-created interpretive structure may land immediately when it adds supported edges, examples, or reconciliation needs; rewriting accepted intent remains reconciliation-review work. Depends on: A66, A81, D50, D125, D128, D134, D135, D138. Supersedes: treating graph edges as only display infrastructure, and also supersedes treating every visible edge as equally authoritative process truth or work queue state. + +138. 
**Multi-chat substrate is the first concrete persistence slice before the full changeset ledger** — add `chat`, nullable `turn.chat_id`, `specification.primary_chat_id`, mirrored `chat.active_turn_id`, and a minimal `reconciliation_need` table while keeping legacy `turn.specification_id` and `specification.active_turn_id` during transition. Do not add `active_chat_id` in phase one; `primary_chat_id -> chat.active_turn_id` covers the interview head until multiple active chat surfaces need their own pointer. New writes populate both legacy and chat pointers; application assertions preserve same-spec and same-chat ancestry; later cleanup can make chat ownership canonical and remove the legacy pointers. `reconciliation_need` uses directed item-to-item source / target fields, narrow `kind` and `status`, free-text reason, immediate `caused_by_turn_id`, and nullable `caused_by_changeset_id` as a future changeset-ledger placeholder. This supersedes older side-chat substrate assumptions and makes `docs/design/MULTI_CHAT.md` the concrete phase-one design while `docs/design/PATCH_LEDGER.md` remains historical deeper semantic mutation history. Depends on: A71, A82, A83, D135, D137, Requirement 39. Supersedes: implementing multi-chat by preserving an in-memory-only side-chat patch list as the durable substrate, and supersedes naming the process-debt table `reconciliation_edge`. +139. **Prompt/context scenario substrate is a first-class foundation** — Brunch should externalize server-side prompts and reusable agent doctrines into inspectable markdown assets, load and compose them through a typed server seam, and introduce context-pack builders that render the current intent graph for a specific generative scenario rather than letting each call site hand-roll prompt context. The same substrate should support lightweight prompt probes over seeded graphs and transcripts before UI surfaces are built. 
A prompt scenario composes prompt + context pack + model settings + capability inventory + captured output/review for evaluation; it must not become the owner of prompt doctrine, context semantics, credential UX, or the shared production AI runtime. Depends on: A84, A85, D134, D136, D137, Requirement 40, Requirement 41. Supersedes: scattered TypeScript prompt strings and transcript-dump context as the default mechanism for new agent features. +140. **Intent graph context packs are scenario-specific semantic briefings** — a context pack is an explicit rendering of graph truth, workflow state, relevant provenance, unresolved ambiguity, relation neighborhoods, and authority labels for one agent task. Packs should exist for observer capture, next-question generation, candidate-spec synthesis, criteria/witness generation, web research query framing, reconciliation review, architect proposals, and downstream decomposition/oracle probes. They should be bounded, ranked, and typed rather than raw graph dumps. Depends on: A84, D125, D134, D137, D138, Requirement 40. Supersedes: assuming the active chat transcript is the canonical prompt context after multi-chat. +141. **Post-spec decomposition remains a probe frontier, not a committed Brunch UI** — the next-after-spec direction is to derive design alternatives, oracle strategy, execution slices, and verification-aware orchestration constraints from the intent graph and its checkability implications. This should first run through the prompt/context scenario substrate, borrowing cognitive patterns from `ln-design` and `ln-oracles`, before deciding whether it belongs inside Brunch or a successor product. Depends on: A87, D134, D139, D140, Requirement 41. Supersedes: treating export prose as the only meaningful handoff target. +142. 
**Pi is a candidate harness adapter, not current product runtime truth** — Pi may be evaluated via SDK or RPC as the first lower-level agent harness for prompt probes, web/tool experiments, and future decomposition scenarios because it already provides sessions, custom tools, provider support, event streams, and embedding modes. Brunch should not assume Pi owns product workflow, durable replay, intent-graph mutation authority, reconciliation review, or credential UX unless a later spike proves and explicitly adopts those boundaries. Depends on: A86, D139, Requirement 41. Supersedes: deciding the web-research tool spike only at the individual tool API level. +143. **Brunch owns the agent mutation surface; harnesses adapt it as tools** — Any mutation of durable Brunch data initiated by an agent must route through Brunch-owned mutation handlers, not direct ORM access or harness-specific tool implementations. Those handlers define the product operation: stable id, input/output schemas, description, authority class, replay policy, and reconciliation/changeset-ledger behavior. AI SDK, Pi, CLI/TUI, or future adapters may expose the handlers as tools, but adapters only translate transport and tool shape; they do not define mutation authority. Read-only capabilities can use the same contract registry for consistency, but the binding rule is that agent-originated writes enter through one server-owned surface. Depends on: Requirement 42, D138, D139, D142. Supersedes: defining separate mutating tool surfaces inside each agent harness or letting agent flows bypass application handlers to call the ORM. +144. **Intent graph vocabulary supersedes knowledge graph vocabulary** — Canonical product vocabulary is `intent graph`, made of `intent items` and `intent edges`. 
Current schema/code may still use `knowledge_item` and `knowledge_edge` as implementation names during transition, but new planning, agent capability contracts, context packs, operation ids, and user-facing design should prefer intent vocabulary unless referring to current persistence/API names. `Claim` may remain an explanatory generic for natural-language content, but it is not a product/schema noun. Depends on: D134, D136, D137. Supersedes: using `knowledge graph`, `knowledge item`, `knowledge edge`, or `claim` as future-facing product nouns. +145. **Changeset/change supersedes patch/patch_change** — Semantic mutation history uses `changeset` for one submitted semantic mutation bundle and `change` for one atomic mutation inside it. `Patch` and `patch_change` remain historical design-doc vocabulary and may appear in older file names, but new schema, capability contracts, operation ids, and planning language should use `changeset` / `change` unless this decision is explicitly reversed. Depends on: D135, D138, D143. Supersedes: treating naming as open between patch and changeset. +146. **Hard-impact edit cascade reads from the `reconciliation_need` queue, not from REVISIT walk state** — when a hard-impact `propose_edit` patch applies, the server enumerates `knowledge_edge` rows incident on the changed item under typed relation policy and opens one `reconciliation_need` row per affected pair (Path 1 from `docs/design/MULTI_CHAT.md` §5.1). The patch list overlay is the canonical resolution surface: open needs render as a `Pending review` section alongside staged patches, with per-row accept-on-target / edit-target / dismiss actions. The V2 `deferred: true` apply response and the "Hard impact — coming in V3 cascade preview" banner are removed at V3.0 ship. V3.0 groups needs mechanically by `kind` and relation type; agent-grouped resolution (auto-confirm / auto-edit / substantive) is V3.1 work and does not block V3.0. 
Side-chat thread persistence is not a V3.0 prerequisite — threads stay in-memory until MULTI_CHAT.md Phase 2. Depends on: A71, A83, A88, D80, D135, D137, D138. Supersedes: hard-edit deferral with a placeholder banner, the modal secondary-thread walk in `docs/archive/design/REVISIT_MODULE.md`, and the SIDE_CHAT.md V3 prose that pre-dated the multi-chat substrate. +147. **The local agent CLI is a long-lived JSONL adapter over Brunch capability contracts** — CLI-addressability should first ship as a `brunch agent`-style local process that speaks request/response JSONL over stdin/stdout, dispatches Brunch-owned capability contracts, and keeps all product resources explicit in input payloads. The adapter may hold ambient runtime plumbing such as a DB connection, provider config, and in-flight interviewer / observer generation bookkeeping, but it must not hold hidden selected spec/chat/turn handles as command semantics. Read capabilities use `list` / `get` for structured read-model data and `read` for agent-facing projections with allowed response shapes and next-command hints. Mutations stay capability-first and surface-lazy: add only contracts needed by real probe/tool use, with an initial surface around `spec.create`, `spec.getStatus`, `spec.requestPhaseClosure(specId, phaseId?)`, `spec.requestExport`, `chat.getPrimary`, `chat.ensureReady(chatId?, timeoutMs?)`, `chat.read`, `turn.get`, and `turn.submitResponse(chatId, turnId?, response)`. `chat.ensureReady` is the idempotent synchronization/recovery command: it may trigger continuation when a chat lacks a usable next turn, can block up to a bounded timeout, and returns a derived state such as `generating`, `awaiting_response`, `idle_no_frontier`, `closed`, or `error` without requiring a durable runtime-operations ledger in the first cut. 
The LLM-as-user probe runner is a separate client of this JSONL adapter, not part of the capability server and not allowed to import DB/product handlers directly; its scenario briefs, model policy, generated transcripts, and curated golden fixture bundles are probe artifacts, not Brunch authority. Suggested module boundary: `src/server/capabilities.ts` plus `src/server/capabilities/` own contracts, schemas, handlers, and dispatch; `src/server/agent-jsonl.ts` plus `src/server/agent-jsonl/` own only protocol/session/transport; `scripts/agent-probes/` owns the outer LLM-as-user loop and artifact writing as development harness code through a JSONL client. Depends on: Requirement 43, A89, D139, D140, D143. Supersedes: treating the CLI as hand-written route wrappers, direct ORM scripts, a one-shot TUI with hidden ambient selection, or a probe runner that bypasses the same mutation surface future agents must use. +148. **Spec evolution strategies are chat-local, turn-mediated process state** — strategy is not specification-level semantic truth. A chat may be established through a first assistant/system frontier turn that offers or declares a strategy such as `step_by_step`, `scenario_options`, `targeted_cases`, `graph_review`, or `reconciliation`; globally triggered flows may create/reuse a pre-strategized chat whose first turn is the procedure kickoff. A chat can technically change strategy through later turns, but explicit switch UX is deferred. Tactical sub-strategies are allowed inside a chat, but broad mid-interview acceleration should branch into a side-chat/strategy chat rather than mutate the primary interview chat in place. Depends on: Requirement 39, Requirement 44, D138. Supersedes: treating the interviewer as one global mode per specification. +149. 
**Changesets are the atomic semantic mutation boundary, while proposal turns are not mutations until accepted** — a graph-review finding, candidate bundle, or reconciliation suggestion is the assistant/system half of an open frontier turn until the user responds. Only `accept` applies a proposal turn's semantic changeset; `revise`, `ask_followup`, `regenerate`, `defer`, and ordinary `reject` produce successor/process state rather than direct graph mutation. A changeset is the smallest semantic mutation unit that preserves coherence, and may record direct edits, candidate acceptance, reconciliation resolutions, opened reconciliation needs, or future verifier/import results. Turns should stamp the latest applied changeset id at creation (`opened_at_changeset_id` / `base_changeset_id`) so open proposals can be conservatively marked stale when the specification advances. Depends on: A71, A79, A92, D135, D145, Requirement 44. Supersedes: treating agent proposals or review findings as durable semantic truth before user/action acceptance. +150. **Relation policy owns operational directionality for intent edges** — relation names should be semantically clear, but code must not infer cascade or reconciliation behavior from raw edge source/target direction. Each relation kind declares a canonical sentence, inverse display sentence, operational-axis participation, and source-change / target-change behavior. Direct edit and hard-impact cascade enumerate incident accepted edges, then ask relation policy which endpoint, if any, receives a `reconciliation_need`. FE-700 may break current `depends_on` / `derived_from` / `constrains` / `verifies` records while expanding the ontology, but should not force every useful edge verb into one dependency direction at the expense of display, prompt context, export trace, critique, verification, candidate generation, or explanation. Depends on: A81, A88, A93, D137, D146. 
Supersedes: assuming outgoing edges from the changed item are the cascade direction. +151. **Scenario-options acceleration is product-facing, but graph review is its safety oracle** — the first user-visible alternative to long drilldown should likely be a first-turn strategy choice or mid-interview `speed this up` side-chat that generates 2–3 candidate bundles completing the current direction from context-packed accepted graph truth. Candidate bundles present named tradeoff profiles and are accepted as coherent units, not item-by-item pick lists. Fast gates (parse/schema/fixed-premise/no-obvious-conflict/tradeoff summary) can run before display; deeper graph review, coverage, checkability, provenance, and repair/refinement can run asynchronously. Depends on: A67, A84, A85, A90, A91, D126, D139, D140, D148, Requirement 31, Requirement 44. Supersedes: treating candidate-spec assist as a skip/force-close helper or as one-shot generation that can be committed without critique. +152. **Graph review and reconciliation are separate graph operations** — reconciliation is repair-oriented process debt from a known disturbance (`reconciliation_need`), while graph review is quality-oriented critique over any graph for weakness, genericity, low support, missing coverage, weak checkability, poor provenance, or maturity gaps. Broader review findings start as turn-owned structured artifacts; `reconciliation_need` remains the only first-class problem table until review issues require independent querying, filtering, badges, assignment, or lifecycle. Candidates may be accepted with represented issues if accepting also opens a graph-review frontier or appropriate process-debt records. Depends on: A91, D137, D149, D151, Requirement 44. Supersedes: overloading reconciliation as the umbrella for all graph intelligence or blocking useful imperfect specs until every review issue is repaired. 
+ +## Interaction Stream Model + +The center column is a **merged stream projection** over multiple artifact families. The turn tree remains the authority for conversational lineage and branching, but the rendered stream is intentionally richer than the tree itself. + +| Artifact family | Durable | Branch-bearing | Current examples | Ordering / invalidation rule | +| ------------------------- | ------- | -------------- | -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- | +| Conversational turn cards | yes | yes | grounding question, design question, review proposal, closure proposal, answered-turn replay | Ordered by the active-path turn chain; branch membership comes from `parent_turn_id`. | +| Anchored workflow facts | yes | no | phase outcome | Stored outside the turn table but anchored to turn ids for provenance; if an anchor falls off the active path, the fact is superseded or hidden. | +| Projected control cards | no | no | kickoff, recovery, proceed / go-to-frontier affordances | Derived from workflow state plus nearby anchors; they re-project on hydration and may disappear / reappear without needing their own durable row. | +| Activity cards | mixed | no | visible generation state, persisted activity summary, trailing observer state | Derived from runtime state or replay summaries adjacent to a turn or control boundary; they do not become branch nodes. | +| Phase markers | no | no | phase start, phase closed | Projected from workflow position and anchored workflow facts such as phase outcomes; they annotate the stream without entering the turn tree. | +| Phase section headers | no | no | grounding purpose + knowledge kinds | Projected from workflow state and phase metadata at the top of each phase section; re-project on hydration. 
| + +This model is deliberately asymmetric: only conversational turns participate in the linked-list lineage model, while the other artifact families either anchor to that lineage or project from it. A rendered card therefore does not imply a persisted turn row, and a persisted durable record does not need to masquerade as a turn to belong in the stream. + +The ordering rule is: active-path turns provide the spine, anchored workflow facts attach to points on that spine, and projected control / activity / phase-marker elements are injected relative to workflow state and those anchors. The invalidation rule is: if a durable non-turn record is anchored to a turn that leaves the active path, the record must be superseded or hidden rather than left floating as if it still belonged to the trusted branch. + +## Layout Architecture + +### Top Bar + +| Element | Content | Position | +| ------------------ | ---------------------------- | --------------------- | +| Logo | Placeholder (TBD) | left | +| App name + version | "Brunch v{version}" | left, after logo | +| Separator | Pipe character | left, after version | +| Tagline | "AI-guided spec elicitation" | left, after separator | +| Working directory | `cwd` in mono | right-aligned | + +Height: `h-10` (40px). Version injected at build time from `package.json`. + +### Three-Pane Layout + +Below the top bar, three vertical panes fill the remaining viewport height. Each pane has a sticky-positioned header and a scrollable body using ScrollArea. + +#### Left Pane — Specification Navigation Sidebar + +**Sticky header:** +- "< Back to Workspace" navigation link +- Read-only specification name (set at creation, not editable) + +**Body — Phase stepper / section navigator:** +A vertical timeline with connecting line (blue for completed segments, gray for future). It remains strictly sequential for workflow truth, but it may behave as a section-jump / scroll-spy surface inside one continuous workspace transcript. 
Each phase item shows: + +| Phase | Internal key | Label | +| ----- | -------------- | ------------------- | +| 1 | `grounding` | Grounding | +| 2 | `design` | Elicitation | +| 3 | `requirements` | Requirements | +| 4 | `criteria` | Acceptance Criteria | +| 5 | *(route only)* | Output | + +Per-phase metadata: status (colored: Closed / In-Progress / Unstarted), readiness band (when in-progress), turn count. Closed phases and the current reachable phase are selectable; future phases may remain visible but locked. Output appears conditionally when all phases are closed. + +#### Center Pane — Chat Transcript + +**Sticky header:** +- "Phase N/M – [Phase Name]" for the currently focused section or current reachable phase — positional progress label +- Status text (colored) +- Turn count +- Readiness band (when in-progress) +- Close Phase button (right-aligned, in-progress only, gated by closeability, triggers confirmation) +- Status badge replaces button when phase is closed + +**Body (chat view):** +- One continuous workspace scroll surface that may be segmented into phase sections rather than remounted per phase +- Each phase section opens with a projected phase section header stating the phase purpose and captured knowledge kinds +- Closed phases replay their phase markers and answered / compacted turn cards as prior sections +- The current reachable phase owns the only actionable bottom artifact +- Activity cards and visible generation state stay attached to their section / turn anchors while the next generative turn is being created +- Active bottom artifact: projected kickoff control card, durable frontier turn card (grounding/question/review/closure proposal), or projected recovery card +- Artifact-specific controls + +**Body (closed phase):** +- Answered question cards +- Phase-closure marker plus any activity cards +- "Proceed to [next phase]" or equivalent handoff control card at bottom + +Scroll container: ChatScroll (ScrollArea + useStickToBottom). 
+ +#### Right Pane — Knowledge Graph Sidebar + +**Sticky header:** +- "Knowledge Graph" title +- Item count + connection count + +**Body — Grouped intent items:** + +| Group label | Kinds | Visible | +| ----------------------- | -------------------------------------------------------- | ------- | +| Goals | goal, context, constraint (including `non-goal` subtype) | yes | +| Assumptions & Decisions | assumption, decision | yes | +| Requirements | requirement | yes | +| Acceptance Criteria | criterion | yes | +| *(hidden)* | term | no | + +Items render as compact DrawerCard instances: code + content in header, edge/dependency reference codes as drawer-peek summary when edges exist, plain card otherwise. + +### Design Tokens + +**Typography scale** (11px–16px, no sizes outside this range): + +| Token | Size | Usage | +| -------------- | ---- | -------------------------------------- | +| `text-xxs` | 11px | Impact badges, tag labels | +| `text-xs` | 12px | Secondary text, metadata | +| `text-xs-plus` | 13px | Secondary body, explanatory text | +| `text-sm` | 14px | Body text | +| `text-sm-plus` | 15px | Card headings, collapsed question text | +| `text-base` | 16px | Section headings | + +Question card titles use arbitrary `text-[17px]` above the scale for emphasis. + +**Font weights**: normal (400), medium (500), semibold (600). No bold (700+). + +**Color tokens**: + +| Token | Hex | Usage | +| ------ | ------- | ------------------------------- | +| `ink` | #202020 | Primary text | +| `sub` | #5b5b5b | Subtitles, secondary text | +| `hint` | #a6a6a6 | Placeholders, inactive elements | +| `rule` | #e3e3e3 | Borders, dividers | +| `wash` | #f0f0f0 | Ghost fills, tracks | +| `tint` | #fafafa | Subtle background | + +**Accent blue** (interactive elements, recommendations, progress): +- Primary: `#2070e6` +- Gradient top: `#3484fa` +- Ring/border: `#1060d6` + +**Shadow tokens**: `--shadow-card`, `--shadow-ring`, `--shadow-card-ring`. 
+ +**Card structure pattern** (DrawerCard): outer `rounded-xl border border-rule bg-tint` shell, inner white header with `-m-px` border overlap trick and `shadow-card`, tinted drawer body below. + +## Critical Invariants + + + +Each row in this table is a **formalization candidate** ascending the progressive-checkability ladder: the `Invariant` column states the property in human-readable form, `Protected by` names the *current oracle* (its present rung on the ladder — typically a regression test today), and `Proves` ties the property back to the requirements or decisions it preserves. Stronger oracles (state-machine model, runtime contract, proof obligation) are deliberate future moves recorded in `docs/design/INTENT_GRAPH_SEMANTICS.md` rather than expanded inline here. + +| # | Invariant | Protected by | Proves | +| ---- | --------- | ------------ | ------ | +| I4 | Vite proxy routing and the runtime backend-port seam stay aligned through one explicit configuration path. | `runtime-config.test.ts` | Requirement 1 | +| I17 | Data Part schema validation remains confined to true LLM / HTTP boundaries rather than mirrored internal seams. | `parts.test.ts` | Requirement 4 | +| I24 | Interview hydration, streaming projection, controller orchestration, mutation transport, phase-scoped rendering, and successor-frontier continuity remain stable through the routed interview surface, including concise durable activity summaries for replay, projected kickoff/recovery/handoff controls, preface-card replay and continue affordances, landing-only grounding-strategy kickoff submission, turn-owned submit/interviewer-processing, visible generation states, anchored phase-boundary projection, and trailing observer attachment. 
| `InterviewView.test.tsx`, `-workspace-stream-projector.test.ts`, `transcript-parity.test.tsx`, `-interview-data.test.ts`, `-interview-controller.test.tsx`, `app.test.ts`, `client-mutation.test.ts`, `task.test.tsx` | D86, D87, D93, D94, D95, D96, D110, D113 | +| I44 | Structured turn responses round-trip through persistence, hydration, projection, and UI affordance state without collapsing back to scalar semantics. | `turn-response.test.ts`, `context.test.ts`, `InterviewView.test.tsx` | Requirement 4 | +| I48 | Canonical knowledge kinds persist with provenance and project through typed entity collections, stable per-kind reference codes, turn-linked capture projection, and graph edges without ontology drift. | `db.test.ts`, `core.test.ts`, `knowledge.test.ts`, `EntitySidebar.test.tsx`, `InterviewView.test.tsx`, `GraphView.test.tsx` | D50, Requirements 22, 23 | +| I54 | Phase-aware capture preserves the committed ontology boundary: grounding / elicitation persist only durable exploration knowledge, accepted review outputs materialize durable requirements / criteria, and both seams survive persistence, turn-linked replay hydration, and UI refresh without breaking sync. | `observer.test.ts`, `context.test.ts`, `app.test.ts`, `InterviewView.test.tsx` | D95, D112, Requirements 22, 23 | +| I72 | Explicit phase outcomes project shared workflow status, closeability, readiness, closure basis, and closed-phase boundary markers through one durable seam. | `phase-close.test.ts`, `db.test.ts`, `app.test.ts` | D65, D66, D110 | +| I87 | Requirements and criteria review ground themselves in their respective inventories, persist interviewer-owned review metadata on the review turn itself, project stable review-set reference codes, submit lightweight full-set review replies by semantic action rather than assumed option order, and carry accepted review outputs into downstream workflow without leaving dead frontier states. 
| `interview.test.ts`, `db.test.ts`, `app.test.ts`, `InterviewView.test.tsx`, `project-state-turn.test.ts` | D94, D112 | +| I100 | `.brunch/` workspace resolution, compiled package-bin startup from the packed install artifact, built-client serving, actual bound URL reporting, same-workspace runtime ownership, chat-sized JSON request parsing, and JSON-shaped payload-too-large failures stay correct in local-first distribution. | `project.test.ts`, `launcher.test.ts`, `cli.test.ts`, `runtime-config.test.ts`, `app.test.ts` | Requirement 1 | +| I101 | Grounding strategy and workspace-backed context gathering persist through schema, API, interviewer configuration, and observer context; preface-card assistant metadata round-trips through persistence/projection, and preface cards stay provisional rather than directly mutating durable knowledge. | `db.test.ts`, `interview.test.ts`, `app.test.ts`, `context.test.ts`, `observer.test.ts`, `parts.test.ts`, `project-state-turn.test.ts`, `ProjectList.test.tsx` | D112, Requirements 3, 20, 21 | +| I102 | File-route generation, directory-based nesting, the three-shell route architecture, and phase addressability remain the runtime routing source of truth; graph view stays code-split. | `router.test.tsx`, `build-boundary.test.ts`, `GraphView.test.tsx` | D86 | +| I103 | Trusted fixture state comes only from TypeScript builders or direct DB setup; walkthrough seeds stay builder-owned, observer probes seed directly without a second fixture format, and seeded scenarios remain resumable/exportable through that one surviving fixture model. | `corpus.test.ts`, `walkthrough.test.ts`, `seed.test.ts` | Requirements 13, 14, 15 | +| I104 | Interviewer-owned turn artifacts materialize through one persistence seam, so runtime review metadata, preface cards, activity summaries, phase summaries, and seeded brownfield replay all round-trip without route-specific reconstruction drift. 
| `turn-artifacts.test.ts`, `app.test.ts`, `walkthrough.test.ts` | D93, D96, D112 | +| I105 | Grounding/design structured-response turns can unlock the next frontier before observer capture finishes, while deferred capture stays keyed to the answered turn, reseeds from durable turns after reload, and avoids stale completion attachment. | `-interview-controller.test.tsx`, `app.test.ts` | D96, D113, D123 | +| I106 | Provider credential discovery, precedence, dashboard status, and model-provider resolution stay explicit without exposing raw secret values through `/api/config`, logs, persisted specification state, or client-visible payloads. | planned: `runtime-config.test.ts`, `app.test.ts`, `ProjectList.test.tsx` | Requirements 34, 35, 36; D130, D131, D132 | +| I107 | `.brunch/` gitignore hygiene is idempotent and confirmation-gated: existing ignore coverage is detected, missing entries are appended only after user confirmation, and absent `.gitignore` files are created only through that same accepted action. | planned: `project-gitignore.test.ts`, `app.test.ts`, `ProjectList.test.tsx` | Requirement 37; D133 | +| I108 | Observer capture no longer blocks chat stream completion for any eligible answered turn; capture backlog state is re-derived from durable turns, drains through the turn-owned observer-capture endpoint, and persists results back onto the originating turn. | planned: `app.test.ts`, `-interview-controller.test.tsx` | D22, D96, D123 | +| I109 | Observer prompts remain compact as relation extraction widens: existing knowledge is passed as id/kind/content-preview anchors with bounded length, graph-delta candidates resolve only through validated `knowledge_item.id` or same-turn provisional references, and accepted review grounding refs reuse the same relation policy. 
| `context.test.ts`, `observer.test.ts`, `db.test.ts`, `app.test.ts` | Requirement 30; D50, D125 | +| I110 | Workflow read truth and workflow write truth stay behind named seams: durable snapshots project through `projectWorkflowState`, while turn-response, chat-route, phase-intent, and phase-close mutations apply through transition/runtime helpers instead of transport handlers owning workflow semantics. | `workflow-projector.test.ts`, `turn-response-transition.test.ts`, `chat-route-transition.test.ts`, `phase-close.test.ts`, `app.test.ts` | D110, D113, D123 | +| I111 | Multi-chat substrate preserves one interview chat per specification, keeps legacy and chat-derived active heads equivalent during transition, guarantees each turn's `chat_id` belongs to the same specification as its legacy `specification_id`, scopes parent turns to the same chat, and deduplicates simultaneously open reconciliation needs for the same source / target / kind without conflating them with semantic `knowledge_edge` rows. | `chat-substrate.test.ts`, `reconciliation-need.test.ts`, `db.test.ts` | Requirement 39; A82, A83; D137, D138 | +| I112 | Prompt/context scenarios render from packaged markdown prompts and typed context-pack builders rather than scattered inline prompt strings; probe artifacts include deterministic rendered prompt/context fingerprints, prompt asset packaging mirrors current source assets at build time, and production prompt text has reviewable golden coverage without requiring product UI. 
| `prompt-loader.test.ts`, `prompt-build-boundary.test.ts`, `prompt-golden.test.ts`, `context-pack.test.ts`, `scenario-runner.test.ts` | Requirements 40, 41; D139, D140 | +| I113 | Hard-impact `propose_edit` apply opens at least one `reconciliation_need` per existing typed dependency edge incident on the changed knowledge item (relations: `depends_on`, `derived_from`, `constrains`, `refines`, `verifies`), records `caused_by_turn_id` provenance, deduplicates against the partial unique index, and never returns `deferred: true` from the apply contract; resolutions transition `open → resolved` idempotently. | planned: `edit-applier.test.ts`, `reconciliation-need.test.ts`, `patch-list-overlay.test.tsx`, `app.test.ts` | Acceptance Criterion 7; A88; D135, D137, D138, D146 | +| I114 | The agent capability CLI remains an adapter over Brunch capability contracts: JSONL calls validate explicit resource ids and schemas, mutating calls dispatch through server-owned capability handlers rather than ORM/route bypasses, `read` projections provide affordance hints without importing scenario briefs, and the probe runner exercises the surface only through a JSONL client. | planned: `capabilities/*.test.ts`, `agent-jsonl.test.ts`, `probe-runner.test.ts` | Requirements 42, 43; A89; D143, D147 | +| I115 | Each active/resumable chat has at most one open assistant/system-first frontier turn; user responses complete that turn through normalized proposal/response semantics, and strategy is chat-local process state rather than specification-level semantic truth. | planned: `chat-substrate.test.ts`, `turn-response-transition.test.ts`, `capabilities.test.ts` | Requirement 44; D138, D148 | +| I116 | Open proposal turns are stamped with the latest applied changeset id at creation and are conservatively stale when the specification's latest changeset advances before completion; stale proposals refresh/regenerate rather than applying against unknown graph state. 
| planned: `changeset.test.ts`, `turn-response-transition.test.ts`, `app.test.ts` | A92; D149 | +| I117 | Reconciliation/direct-edit cascade never infers affected endpoints from raw edge direction alone; it consults relation policy source-change / target-change behavior over incident accepted edges. | planned: `knowledge-relationship-policy.test.ts`, `edit-impact.test.ts`, `reconciliation-need.test.ts` | A93; D137, D146, D150 | +| I118 | Scenario-option candidate bundles can only become canonical by accepting a coherent bundle changeset; accepted-with-issues candidates must also create durable follow-on review/process debt so known weaknesses are not hidden. | planned: `scenario-runner.test.ts`, `turn-artifacts.test.ts`, `changeset.test.ts` | A90, A91; D151, D152 | + +## Lexicon + +### Core terms + +| Term | Definition | +| ---- | ---------- | +| **workspace** | The cwd-backed software context whose local `.brunch/` directory stores specifications and runtime state. | +| **prompt/context scenario substrate** | The server-side and test-harness foundation for loading markdown prompts, composing reusable doctrines, deriving typed intent-graph context packs, and running prompt probes before UI commitment. It is not the provider credential/setup system or shared production AI runtime. | +| **context pack** | A scenario-specific semantic briefing derived from intent graph truth, workflow state, provenance, unresolvedness, relation neighborhoods, and authority labels for one agent task. It is bounded and typed, not a raw graph or transcript dump. | +| **progressive checkability** | The discipline of representing intent items at the weakest useful witness level today — prose, example, counterexample, criterion, executable test, runtime invariant, state/transition property, or formal model — while preserving paths toward stronger witnesses where valuable. 
| +| **behavioral kernel** | Hidden interviewer / architect machinery that recognizes recurring correctness patterns such as lifecycle, containment, authority, concurrency, migration, and evidence, then elicits checkable artifacts without exposing formalism as product ceremony. | +| **scenario runner** | A lightweight pre-UI harness that runs a selected prompt scenario against fixtures, context packs, tools, and model settings, then records outputs for qualitative and structural review. Execution adapters translate this harness input into a concrete fake/model/harness call; they do not define Brunch semantics, credential UX, provider resolution, or mutation authority. | +| **agent capability CLI** | A local machine-facing CLI adapter, initially a long-lived JSONL stdin/stdout process, that exposes Brunch-owned capability contracts to external agents and probe runners without defining its own product API or mutation authority. | +| **JSONL capability session** | The request/response transport between an external harness and `brunch agent`: every call includes an id, capability id, and explicit input resource identifiers; the process may keep DB/provider/in-flight runtime handles internally, but selected spec/chat/turn targets are not hidden ambient state. | +| **probe runner** | An external client of the agent capability CLI that supplies scenario briefs, calls an LLM-as-user, drives Brunch through capability calls, and writes generated transcript/spec/export/graph artifacts for human curation. It must not import Brunch DB or product handlers directly. | +| **read projection** | An agent-facing read output that summarizes Brunch-known state and compatible next actions or response shapes. It differs from `get` / `list` reads, which return structured entity/read-model data for assertions, fixture capture, and tooling. | +| **agent mutation surface** | The Brunch-owned typed handler layer for any durable data mutation initiated by an agent, internal or external. 
It is the only write entry point agents may use; handlers own schemas, authority, replay behavior, and reconciliation/changeset-ledger semantics rather than letting agents call the ORM directly. | +| **agent capability contract** | A Brunch-owned typed contract addressable by agents or harnesses, with a stable id, description, input/output schemas, authority class, and replay policy. Read-only capabilities and mutating handlers can share this registry shape, but mutating contracts must route through the agent mutation surface. | +| **tool adapter** | A provider- or harness-specific projection of an agent capability contract into a concrete tool format such as AI SDK tools, Pi tools, CLI/TUI commands, or a future external-agent API. Adapters translate shape and transport while preserving Brunch-owned authority semantics. | +| **authority class** | The contract metadata that says whether an agent capability is read-only, proposal-only, or commits durable product truth, and therefore which replay, reconciliation, and mutation boundaries govern it. | +| **AI runtime provider** | The shared server seam that resolves the configured LLM provider, model names, API-key source, and provider-specific options for interviewer and observer calls. | +| **provider credential status** | The app-visible setup state indicating whether a supported LLM key is available, which source supplied it, and what user action is needed, without exposing the secret value itself. | +| **XDG auth state** | User-scoped configuration / credential storage outside the project workspace, used for API keys entered through Brunch UI when implemented. | +| **workspace hygiene affordance** | A confirm-gated local repository action that helps keep generated Brunch state such as `.brunch/` out of version control without silently mutating the workspace. | +| **specification** | One elicitation run within a workspace. 
Browser routes, HTTP paths, shared transport contracts, and durable DB/storage should all use canonical `specification` terms. | +| **project** *(legacy term)* | A deprecated older name for a specification record. Remove it rather than preserving it as a long-term compatibility seam. | +| **workspace stream** | The merged center-column read model composed from active-path turns, anchored workflow facts, projected control cards, phase markers, and activity cards. | +| **specification runtime** | The live lifecycle owner for one specification: it reconciles durable truth into the current landing, owns in-flight interviewer / successor / capture orchestration, and rejects stale lifecycle outputs that routes must not treat as their own authority. | +| **turn** | One persisted authored conversational interaction, with typed offer/reply parts and parent linkage. Today the primary interview active path still provides the main lineage spine; the multi-chat substrate is moving turn ownership toward chat-scoped chains. Questions, review proposals, closure proposals, and future side-chat turns use this seam. | +| **turn kind** *(current internal seam)* | The current persisted implementation field on a turn (`question`, `kickoff`, `recovery`). It may help project control state today, but kickoff / recovery are product-level structural affordances rather than durable authored turn categories. | +| **turn card** | The user-facing rendering of a durable conversational turn inside the workspace stream. | +| **anchored workflow fact** | A durable non-turn record whose validity is anchored to one or more turns on the active path. `phaseOutcome` is the canonical current example. | +| **projected control card** | A workflow affordance derived from durable state rather than authored conversational content. Kickoff, recovery, and proceed / handoff controls live here. 
| +| **kickoff card** | A projected phase-entry control card that appears whenever an open phase is in entry-pending state and requires an explicit user action before substantive interviewer progression begins. | +| **frontier turn** | The single actionable durable conversational turn currently at the bottom of an open phase when the phase is in substantive elicitation rather than structural control. In multi-chat strategy flows, each active/resumable chat has at most one open frontier turn; the specification can have multiple open frontier turns across different chats. | +| **chat-local strategy** | Process state that determines how one chat advances the spec: step-by-step drilldown, scenario options, targeted cases, graph review, or reconciliation. It is established or declared through a chat's frontier turn and is not durable product truth about the specification. | +| **proposal turn** | An assistant/system-first frontier turn that offers a candidate bundle, graph-review finding, reconciliation suggestion, or other proposed action. It is not a semantic mutation until the user completes it, usually by accepting, revising, asking follow-up, deferring, regenerating, or rejecting. | +| **preface card** | A turn-internal artifact that presents provisional context from interviewer-invoked context gathering, rendered above a paired question card within the same turn. The observer captures from the whole turn (preface context + question + user response) as one validated unit rather than from the preface card alone. Available in any phase when the workspace directory is present. Implementation: `preface` / `PrefaceCard` / `present_preface` tool / `data-preface` part. Renders as a simple `bg-tint` rounded box with italic subdued text, not as a DrawerCard. | +| **question card** | A turn card that asks a structured interviewer question and expects a substantive user response. 
| +| **review turn** | A full-set requirements or criteria review interaction that offers a synthesized candidate list with stable reference codes, supports per-item commenting (inline comment toggle on each item) plus one optional global review note, and persists its own `reviewActions` / `reviewSet` metadata on the turn. On `request changes`, the successor review turn carries a revision card above the new review set. | +| **closure turn** | A durable proposal turn whose offer proposes closing a phase and whose reply explicitly accepts or rejects that proposal. Accepting it confirms the phase outcome on that same turn and advances the workflow into the next phase's projected entry state. | +| **recovery card** | A projected control card that appears whenever an open phase lacks a valid actionable frontier and offers the user a repair path without requiring a separately generated recovery turn. | +| **active turn** | The live frontier turn currently awaiting substantive user completion inside the workspace. Structural control cards such as kickoff and recovery are not active turns. | +| **answered-turn card** | The compact replay form of a completed elicitation turn, summarizing the offer, the structured response, and the turn-owned capture status. | +| **response note** | The single attached text field on a structured user response; it may explain selections, annotate a review, add missing context, or redirect the interviewer. | +| **grounding** | The first phase of a specification, aimed at establishing enough orientation to proceed into design. It is both the product term and the canonical workflow key. | +| **grounding strategy** | The method used to reach grounding sufficiency: elicitation-first (`greenfield`) or analysis-first (`brownfield`). | +| **delivery posture** | The second interview-orientation axis: `end-to-end build` for whole-system creation or reshaping, versus `incremental feature` for bounded change inside an existing or emerging system. 
| +| **grounding brief** | The concise visible summary surfaced on a preface card after context gathering during grounding. | +| **grounding sufficiency** | The threshold at which the interviewer has enough stable orientation to begin design. | +| **recognition-first elicitation** | The strategy of helping users converge by reacting to concrete possibilities, tradeoffs, examples, and ruled-out directions rather than requiring them to author intent from scratch. | +| **candidate direction** | An agent-synthesized possible specification direction offered when the user asks Brunch to fill in the rest, compare options, or react to proposed typologies. It includes rationale, implications, tradeoffs, likely generated knowledge, and what it rules out. | +| **candidate graph bundle** | The coherent commit/review unit produced by scenario-options flows: a named scenario with tradeoff profile, generated intent items and edges, required core items, optional/swappable items, known risks, graph-review findings, provenance labels, and commit preconditions. It should be accepted or revised as a bundle rather than item-by-item unless semantic closure can be proved. | +| **scenario options** | A chat-local strategy that generates 2–3 candidate graph bundles completing the current direction from context-packed accepted graph truth, then uses graph review to gate clean acceptance or acceptance with represented issues. User-facing labels may be "Show me strong options" or "Speed this up". | +| **targeted cases** | A chat-local strategy based on behavioral kernels: the interviewer asks contrastive domain cases, and user classifications emit checkable artifacts such as decisions, invariants, criteria, examples, and counterexamples. | +| **candidate-spec set** | A turn-owned interviewer artifact in grounding or design that presents one or more candidate directions for reaction-driven refinement. 
It is analogous to a review set in being a persisted artifact on the turn, but it proposes possible directions rather than reviewing a synthesized inventory. The newer candidate graph bundle framing is the coherent graph-level version of this artifact. | +| **candidate-spec reaction** | The structured user response to a candidate-spec set, choosing whether to accept a direction, request refinement of one candidate, or regenerate a fresh set. It steers the next interview move without directly closing the phase. | +| **breadth skeleton** | A turn-owned interviewer artifact used during a progressive detail pass that summarizes the current broad-pass map, highlights areas that remain shallow, and offers explicit deepening targets. | +| **detail focus** | The selected area or lens for the next recursive follow-up pass. It scopes the next same-phase frontier turn without becoming a separate workflow state or durable topic tree. | +| **detail reaction** | The structured user response to a breadth skeleton, choosing whether to deepen a specific area now, continue broad coverage, or leave an area sufficient for now. | +| **progressive detail pass** | An interview shape that establishes broad structure first, then offers explicit `next level of detail` actions to deepen selected areas recursively rather than drilling to maximum depth immediately. | +| **review set** | A synthesized candidate list used in requirements or criteria review, presented with stable reference codes, supporting per-item commenting, and resolved through `accept review` or `request changes` with per-item comments plus one optional global review note. | +| **review revision** | A successor review set generated after `request changes`, carrying a revision card (changelog + version badge) as a turn-internal artifact above the new review set card. Prior revisions collapse to compact answered-turn summaries. 
| +| **revision card** | A turn-internal artifact on a review revision turn that summarizes what changed from the prior version and displays a version badge (v2, v3, etc.), paralleling how preface cards sit above question cards. | +| **per-item comment** | An inline comment placed on a specific item in a review set via a comment toggle, forming part of the structured change-request payload alongside the optional global review note. | +| **accepted review set** | The terminal accepted review output for a review phase; this is the authoritative carry-forward set for later review and export seams, and any accepted requirement / criterion items derive their authority from membership in this set. | +| **phase entry state** | The workspace state shown when a projected kickoff card is the current bottom-of-phase affordance. | +| **landing reconciliation** | The pure derivation from the durable specification snapshot into the one truthful visible bottom artifact for hydration/restart, plus any pending capture backlog the runtime must re-seed. | +| **observer capture backlog** | The ephemeral specification-scoped queue of answered turns that still need deferred observer capture. It is re-derived from durable turns with a persisted response but no turn-owned observer result, then drained by the runtime lifecycle once a successor frontier exists. | +| **phase handoff state** | The workspace state shown when a phase is complete and a projected handoff / completion control card is the current bottom-of-phase affordance. | +| **control marker** | A transcript-visible workspace event such as interview start, resume, or confirmation that is not rendered as a normal user chat bubble. | +| **phase marker** | A projected boundary annotation in the workspace stream, such as phase start or phase closed, derived from workflow position or anchored workflow facts. 
| +| **turn capture status** | The per-turn state describing what the observer has captured already, is still capturing, or failed to capture from that answered turn. | +| **active path** | The trusted chain from HEAD to root in the primary interview chat. Side-chats are sibling chat chains under the same specification, not branches of this active path. | +| **phase / mode** | One workflow stage: `grounding` *(label: Grounding)*, `design` *(label: Elicitation)*, `requirements` *(label: Requirements)*, or `criteria` *(label: Acceptance Criteria)*. | +| **phase outcome** | Durable closure artifact for a phase, including summary and closure basis. | +| **closure basis** | Whether a confirmed phase close came from interviewer recommendation or explicit user-forced closure. | +| **closeability** | Deterministic minimum bar for whether the user may close a phase now. | +| **readiness band** | Coarse descriptive signal (`low`, `medium`, `high`) separate from closeability. | +| **review action** | The explicit submit path on a review turn: `accept review` or `request changes`; the action gives any attached review note its meaning. | +| **exploration knowledge** | Durable knowledge captured during grounding or elicitation: `goal`, `term`, `context`, `constraint`, `decision`, and `assumption`. | +| **context** | Descriptive situational truth, actors, workflows, repo facts, or bounded area under discussion that would remain true even if the specification paused tomorrow. Promote context when it carries stronger semantics: success condition -> requirement / invariant, solution boundary -> constraint, uncertain material belief -> assumption, chosen alternative -> decision. | +| **constraint** | A durable boundary on acceptable scope or solution space. Planned subtypes include `non_goal`, `scope`, `technical`, `policy`, `resource`, `compatibility`, and `environmental`. 
| +| **non-goal** | A `constraint` subtype expressing an explicit exclusion from the current specification scope. | +| **decision** | A chosen direction among plausible alternatives, with durable consequences for future design, implementation, or interpretation. Not every user answer or option selection is a decision. | +| **assumption** | A durable material belief supporting a direction or decision that could later prove false. | +| **intent graph** | Canonical product term for Brunch's semantic substrate: typed intent items, intent edges, examples / counterexamples, validation status, and semantic mutation state. Chat and graph views are projections over this truth; reconciliation needs are process state attached to the graph, not intent content. Supersedes `knowledge graph` as future-facing product vocabulary. | +| **intent item** | Canonical product term for one durable typed semantic unit in the intent graph. Current schema/code may still persist these as `knowledge_item` rows during transition. Use `knowledge item` only when referring to current implementation names. | +| **intent edge** | Canonical product term for one durable typed semantic relation between intent items. Current schema/code may still persist these as `knowledge_edge` rows during transition. Use `knowledge edge` only when referring to current implementation names. | +| **knowledge item / knowledge edge** | Legacy implementation names for current persistence/API records backing intent items and intent edges. Avoid these in new product concepts, capability contracts, and operation ids unless referring to existing code or database schema. | +| **progressive checkability** | The stance that each intent item should receive the weakest sufficient witness: human review, concrete example, counterexample, regression test, runtime contract, state-machine rule, invariant, proof obligation, or explicit unresolved ambiguity. 
| +| **property** *(candidate ontology)* | A normalized intent primitive that requirements could commit to and criteria could observe. It is a design candidate, not a committed storage or UI surface. | +| **invariant** *(planned ontology kind)* | A property that must remain true across relevant states, transitions, executions, versions, or semantic revisions. | +| **example** *(planned ontology kind)* | A concrete scenario, trace, input/output, edge case, approved example, rejected example, not-relevant label, or counterexample that disambiguates or witnesses intent. Expected subtypes include positive, negative / counterexample, edge-case, and not-relevant. | +| **edge-local neighborhood** | The focused relation context around one intent item: incoming and outgoing intent edges with nearby item summaries, support strength, and relation semantics. Used by interviewer / observer prompts and graph refinement instead of dumping all grouped knowledge. | +| **behavioral kernel** | Reusable interviewer machinery for one class of latent correctness question, such as state/lifecycle, containment, authority, concurrency, transactionality, migration, or evidence. Kernels are not user-facing formalism by default. | +| **intent spec** | The complementary framing to a planning spec: a specification optimized for preserving and validating meaning rather than sequencing downstream work. Carries typed intent items, examples and counterexamples, witness strength, unresolved ambiguity, and validation status. The intent graph is the durable substrate; an intent spec is the human-facing projection of that graph. Contrast with `planning spec`. | +| **planning spec** | A specification optimized for downstream work sequencing — what to build, what scope is in or out, which slices follow. Brunch's product direction is for planning to remain a useful projection from the intent graph rather than the source artifact. 
| +| **checkability** | A typed field on an intent item describing the strongest oracle that currently witnesses it, drawn from the progressive-checkability ladder: `human_review` / `example` / `counterexample` / `regression_test` / `runtime_contract` / `state_machine_rule` / `invariant` / `proof_obligation` / `unresolved_ambiguity`. The discipline is `progressive checkability`; the field is `checkability`. | +| **witness strength** | The breadth of an intent item's oracle coverage, distinct from which oracle exists. "Checked on three examples" and "proved for all reachable states" can both be `checkability: invariant`, but they have very different `strength`. The pairing forces honesty about what is actually verified. | +| **formalization candidate** | A Brunch-internal intent item that is worth promoting along the progressive-checkability ladder. Critical invariants are formalization candidates: each one states a property currently witnessed by a regression test, with stronger oracles (state-machine model, runtime contract, proof obligation) as deliberate future moves rather than implicit expectations. | +| **disambiguating example** | An `example` whose primary purpose is to settle ambiguity between plausible interpretations of a requirement, invariant, or decision. Linked through the `disambiguates` relation. Generalizes the TiCoder move beyond test cases: the interviewer generates cases where interpretations diverge, and the user's classification settles the meaning. | +| **spec drift** | A divergence between an intent item's recorded meaning and the artifact (criterion, generated requirement, candidate spec, export bundle, or downstream implementation behavior) meant to satisfy it. Surfaced in human terms — "original intent vs generated behavior vs potential mismatch" — so the user can validate meaning at the point where it could have changed, rather than after the divergence has been laundered into a final document. 
| +| **relation family** | One of five semantic groupings that organize the relation kinds in the intent graph: `justification`, `dependency`, `boundary`, `refinement`, and `verification`. Distinct from the relation `kind` itself; a single kind belongs to exactly one family. Drives prompt grouping, default policy, and observer classification heuristics. | +| **relation policy** | The per-relation, per-axis registry that decides whether each edge participates in `visible`, `cascade`, `export_trace`, `staleness`, `reconciliation`, `criteria_help`, or `weak_suggestion` capabilities. Replaces the implicit assumption that every edge is equally authoritative. Gated by edge `support` (`explicit` / `strong_inference` / `weak_candidate`) and `status` (`proposed` / `accepted` / `rejected` / `stale`). It also owns operational directionality: source-change and target-change behavior must be explicit rather than inferred from raw edge direction. | +| **graph review** | A quality-oriented graph operation that critiques any intent graph for weakness, genericity, low support, missing coverage, weak checkability, poor provenance, or maturity gaps. Distinct from reconciliation, which repairs known process debt after a disturbance. | +| **graph-review finding** | A turn-owned structured artifact produced by graph review. It may later lead to a changeset if accepted, but it is not itself semantic truth or process debt unless represented through a follow-on turn, changeset, or reconciliation need. | +| **structured list** | The first-ship graph-view layout: kind-grouped item rows with a relations footer of Outgoing / Incoming relation chips. Item-first; relationships visible inline. It currently renders the whole-spec entity set because D129 ships the whole-spec fetch first; the intended default becomes active-path items over whole-spec data once the active-path membership seam and `Show all` toggle land. 
| +| **spatial canvas** | A deferred future graph-view layout where intent items render as nodes with visible edges in a 2D scene. Shares the projection seam and intent contract of D128 with the structured-list layout. | +| **relation chip** | A compact UI element representing one intent-edge endpoint inside a relations footer, carrying the target item's reference code and content snippet. Hover reveals a preview card; click navigates to the target item via hash anchor. | +| **relations footer** | The grouped Outgoing / Incoming subsections beneath an item row in the structured list, listing relation chips for that item's incoming and outgoing edges. Soft-truncates at 6 chips per direction with an inline `+N more` expander; collapses to nothing when an item has zero edges. | +| **action rail** | The per-row right-aligned slot in graph view's structured list reserved for node-level action affordances. Actions emit intents into the existing workspace lifecycle rather than owning their own state. The first ship reserves the slot with one disabled `chat-with` placeholder. | +| **secondary thread** | Modal revisit conversation anchored to a primary-path turn and used to resolve cascade implications. | +| **needs-revisit** | Flag meaning an item is affected by upstream invalidation and must be explicitly resolved before the specification is whole again. | +| **chat** *(planned persistence seam)* | A conversation container inside one specification. The primary interview, side-chats, reconciliation chats, verifier feedback, and review discussions may all own turns without owning semantic truth directly. Phase one adds the table and transitional pointers before making chat ownership canonical. | +| **changeset** *(future persistence seam)* | Canonical term for one submitted semantic mutation bundle against the intent graph. It records what changed and why, separate from the conversational turn that may have initiated it. 
A changeset is the smallest atomic unit that preserves graph coherence; proposals/findings become changesets only when accepted or otherwise acted on. Supersedes `patch` as the future-facing schema/contract noun. | +| **change** *(future persistence seam)* | Canonical term for one atomic semantic mutation inside a changeset, such as `intentItem.create`, `intentItem.updateContent`, `intentEdge.create`, or `intentEdge.delete`. Supersedes `patch_change`. | +| **patch / patch_change** | Historical design-doc vocabulary for changeset/change. Avoid in new schema, capability contracts, and operation ids unless referring to older docs or source-control-style analogy. | +| **reconciliation need** *(planned persistence seam)* | Durable semantic debt saying existing intent-graph truth may require renewed judgment because an upstream item, relation, verifier, contradiction, or historical premise changed. Phase one stores directed item-to-item needs with narrow kind/status and provenance placeholders; later phases may add relation targets and changeset-backed cause/resolution. It is process state, not an intent edge or intent content. | +| **DrawerCard** | Shared card primitive with header/summary/children slots that supports static, summary-peeking, and toggleable (minimized ↔ maximized) render modes. A `locked` prop disables toggle for controlled-state cards. | +| **ChatScroll** | Composite scroll container that wires Radix ScrollArea (custom scrollbar) with `useStickToBottom` (auto-scroll-to-bottom + scroll-down indicator). Used for the center pane transcript. | +| **phase stepper** | The vertical timeline navigation in the left sidebar showing phases as sequential steps with connecting line, status, readiness, and turn count. | +| **phase addressability** | The ability to deep-link, gate, and focus interview phases through router state even when the center pane renders one continuous sectioned workspace. 
| +| **knowledge group** | A display-level grouping of knowledge kinds for the sidebar, defined by a hard-coded registry that maps kinds to group labels and visibility. | +| **output view** | The terminal route available when all phases are closed, providing specification summary and markdown export. Not a workflow phase. | +| **activity card** | A projected runtime or replay artifact adjacent to a turn or phase boundary, such as visible generation state, coarse interviewer activity summary, or trailing observer status. It is not a branch-bearing conversational turn. | +| **activity placeholder** | The compact replayable presentation of an activity card between turn cards, showing elapsed thinking time and a coarse tool-use summary for the interviewer without exposing hidden reasoning or raw tool payloads. | +| **phase section header** | A projected, non-durable artifact at the top of each phase section that states the phase purpose and what kinds of knowledge are captured there. Re-projects from workflow state on hydration. | +| **grounding question** | A free-text-first question format used during grounding that presents the question, a why explanation, and a response note field without requiring option selections. Distinct from the option-selection format used in elicitation. | +| **turn-internal artifact** | An assistant-part artifact rendered as its own visual card within a turn but sharing the turn's single response submission. Preface cards and revision cards are turn-internal artifacts that render above their paired question or review set card. | +| **query domain** | An independently invalidable TanStack Query scope within a specification. The current live ownership target is one specification bundle domain (`workflow`, `landing`, `turns`) plus a separate entities domain; finer splits should follow real server ownership boundaries rather than outrunning them. 
| + +### Boundary terms + +| Term | Definition | +| -------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------- | +| **greenfield** | A grounding strategy for a new concept or under-specified area where the system grounds primarily through elicitation. | +| **brownfield** | A grounding strategy for work inside an existing codebase where the system grounds through analysis, then interrogation. | +| **end-to-end build** | A delivery posture where the specification is shaping a whole system, workflow, or major slice from entry to outcome. | +| **incremental feature** | A delivery posture where the specification is shaping a bounded change inside an existing or partially established system. | +| **context-gathering capability** | An interviewer-invoked capability such as workspace analysis or future web research that gathers provisional orientation for the next move. | +| **BrunchUIMessage** | Typed UI message contract spanning validation, persistence, SSE streaming, and hydration. | +| **Data Part** | Typed custom message part used for structured input and domain-specific assistant output. | +| **context builder** | Typed projection from specification state into inference context for interviewer, observer, or closure logic. | +| **walkthrough scenario** | Named trusted fixture scenario used to seed a resumable manual-inspection workspace. | + +## Verification Design + +### Verification Commands + +| Step | Check | Command | +| ---- | ----------------- | ------------------- | +| 1 | Formatting | `npm run fmt:check` | +| 2 | Lint + type check | `npm run lint` | +| 3 | Unit tests | `npm run test` | +| 4 | Build | `npm run build` | +| all | Full gate | `npm run verify` | + +### Verification Policy + +Every meaningful code change should pass `npm run fix` in the inner loop and `npm run verify` before commit. 
Slices that touch the user-facing boundary should also stay manually walkthrough-able via the local app. + +### Verification Stance + +- Verification is first-class work; this wave stays **manual-heavy by deliberate choice**, not by accident. +- **Inner loop** proves structural validity, boundary safety, and non-destructive behavior. +- **Middle loop** proves replay, refresh-boundary ownership, and explicit state projection where cheap automated checks can remove bad degrees of freedom. +- **Outer loop** is the authority for brownfield grounding quality, transcript legibility, waiting-state clarity, and phase-layout differentiation. +- Outer-loop UI review uses a **dramaturgical see-and-inspect** posture: judge whether the product stages its state transitions legibly for a human, not just whether bytes round-trip. + +### Diagnostic Assessment + +| Dimension | Score | Notes | Change trigger | +| --------------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------- | +| Observability | partial | Persistence, DB state, TypeScript seed builders, and route seams are visible in text, but the most important failures in this wave still present as browser-visible transcript disappearance, waiting-state ambiguity, and layout legibility issues. | Promote instrumentation if manual browser inspection cannot explain refresh or lock behavior confidently. | +| Reproducibility | partial | TypeScript scenario builders and direct observer probes give a strong base, but brownfield kickoff quality still varies by repo shape and live refresh behavior is not yet represented by a canonical replay matrix. 
| Promote a stronger corpus or replay harness if ad hoc brownfield/manual checks stop being trustworthy. | +| Controllability | partial | The agent can iterate on fixtures, stories, and structural tests autonomously, but the core acceptance signals for this wave remain human judgment calls. | Raise controllability only if manual review becomes the bottleneck or repeated ambiguity blocks progress. | + +### Oracle Strategy by Loop Tier + +| Tier | Oracle families | What they prove | Main targets | +| ------ | ------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------- | +| Inner | Schema validation, type-aware linting, focused unit/integration tests, negative-space regressions | Boundaries remain type-safe; persistence and transport seams do not silently collapse; obvious bad failures are caught cheaply. | I4, I17, I24, I44, I48, I54, I72, I87, I100, I101, I102, I103 | +| Middle | Round-trip / replay oracles for seeded projects, hydration, export, and resume | Seeded or persisted state can be loaded, projected, re-rendered, and exported without losing required semantic markers. | Requirements 13, 14, 15; I24, I44, I100, I103 | +| Middle | Route/query ownership integration oracles | Observer updates and response mutations refresh only their owned surfaces instead of tearing down unrelated transcript state. | Requirements 5, 7, 14; A20, A64; I24, I54, I102 | +| Middle | Explicit state-model oracles for in-flight UI states | Every major in-flight mode is named, projectable, and visibly representable instead of collapsing into one opaque loading bit. 
| Requirement 5; I24, I44 | +| Outer | Fixture-backed manual walkthroughs on seeded scenarios | Walkthrough fixtures are useful enough to inspect phase transitions, export output, resume behavior, and missing-view discovery. | Requirements 13, 14, 15; I100, I103 | +| Outer | Brownfield kickoff walkthroughs on real repos, evaluated qualitatively | Kickoff yields durable useful knowledge and a grounded first question for feature-area work, without needing a fully automated quality score. | Requirements 3, 16; A63; I101 | +| Outer | Dramaturgical story and transcript review | Phase differentiation, transcript artifact legibility, and waiting-state clarity are judged as staged user experience rather than just structural output. | Requirement 5; A15, A51, A53, A54 | + +### Design Notes + +- **Legible replay fidelity beats exact replay fidelity for now** — hydrated transcripts may use placeholders or summary markers to indicate that reasoning or tool activity happened at a point in the conversation, even if the full original content is not persisted. +- **Turn-first replay now beats message-first replay** — for grounding/design, the replay unit should trend toward completed turns plus one live unresolved turn, not alternating assistant/user chat bubbles and stream markers. +- **Brownfield kickoff has a deliberately modest proof bar** — this wave only needs durable useful knowledge plus a grounded first question, not a fully proven grounding bundle before design can proceed. +- **Waiting states should become an explicit vocabulary in code** — the user-facing contract is that each major in-flight mode is visibly represented; deep lock/wait introspection is diagnostic scaffolding, not yet a product requirement. +- **Manual verification is intentionally lightweight** — no heavyweight scripted walkthrough protocol yet; use seeded scenarios and see-and-inspect review rather than bureaucratic checklists. 
+- **Kickoff strategy comparison stays qualitative unless proven insufficient** — if the brownfield mode fork remains ambiguous after manual repo comparisons, promote that question to a spike with a stronger comparison harness. +- **Graph-view fixture matrix is project-shareable infrastructure** — named scenario builders (`emptySpec`, `singleItemNoEdges`, `crossPhaseDecisionLink`, `denseGoalAnchor`, `activePathDivergence`, plus an explicit `compareLowVsHighEdgeDensity` for A70) underwrite both inner-loop component tests and outer-loop manual walkthroughs. Reusable beyond graph view as similar visualization slices land. + +### Acknowledged Blind Spots + +| Blind spot | Reason | Current mitigation | Revisit trigger | +| ------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------- | +| Qualitative interviewer and kickoff quality across many repo shapes | Chosen manual-first; no broad brownfield corpus or score harness yet | Manual brownfield walkthroughs on representative repos | Brownfield regressions recur or kickoff strategy debates cannot be resolved qualitatively | +| Transcript trust and readability after hydration | Exact replay of all reasoning/tool detail is intentionally deferred | Legible placeholders/summary markers plus manual transcript review | Users still cannot understand what happened after replay despite visible markers | +| Actual lock/wait causality in the UI | Instrumentation is not yet the primary investment | Require explicit visible in-flight states and inspect browser behavior manually | Manual inspection cannot explain a repeated perceived lock or disappearance bug | +| Story quality and phase differentiation | Design quality is not executable in a 
trustworthy way yet | Story variants reviewed against seeded walkthrough findings | Story/app drift grows or design disagreement blocks implementation | +| Observer latency and layout refresh freshness | No explicit latency budget or perf gate yet | Runtime observation during manual sessions | A20 shows recurring latency or coarse refresh pain | +| Revisit UX and secondary-thread adequacy | That seam is still future work | Keep structural coverage on graph/persistence seams only | Revisit work moves from horizon into the active frontier | +| Real browser scroll behavior under JSDOM | `scrollIntoView` is shimmed in JSDOM — component tests cannot prove real scroll happens after chip click | Outer-loop manual walkthrough explicitly checks scroll-into-view + highlight on chip click | Reports of chip click "doing nothing" or scroll behaving inconsistently across browsers | +| Hover-card timing and popover positioning feel | Animation delay and placement perception are not text-observable | Outer-loop manual review with shadcn defaults (~300ms open, ~150ms close) | Users report flicker, misplaced popovers, or unintended dismissal | +| Mobile / touch / keyboard-only ergonomics for relation chips | HoverCard pattern is mouse-biased; long-press fallback is designed but has no automated test surface | Manual walkthrough on touch device once per slice family | Touch users report missing or undiscoverable preview | +| Performance under large intent graphs | No render or memory budget yet; relation-first observer expansion (A66) will increase edge density | Defer until specs with hundreds of items + dense edges become common | Render lag visible on representative manual walkthroughs | +| Cross-session "Back to chat" target persistence | sessionStorage clears on tab close so the deep-linked entry to graph view has no remembered chat origin | Falls back to current reachable phase via workflow state | Users report "Back to chat" landing in the wrong phase after a fresh tab | +| 
Visual regression infrastructure | Manual-heavy stance accepted across the project; no Chromatic/Playwright-screenshot seam yet | Outer-loop manual walkthrough on the named graph-view fixture scenarios | Three or more visual regressions caught only after merge | + +### Current Coverage + +| File | Protects | +| ----------------------------------------------------------------- | ------------------------------ | +| `db.test.ts` | I48, I72, I101 | +| `core.test.ts` | I48 | +| `app.test.ts` | I24, I54, I72, I87, I101, I104 | +| `context.test.ts` | I44, I54 | +| `observer.test.ts` | I48, I54 | +| `parts.test.ts` | I17, I101 | +| `project-state-turn.test.ts` | I24, I44, I87, I101 | +| `task.test.tsx` | I24 | +| `EntitySidebar.test.tsx` | I48 | +| `InterviewView.test.tsx` | I24, I44, I48, I54, I72 | +| `-interview-controller.test.tsx` | I24, I105 | +| `-workspace-stream-projector.test.ts` | I24 | +| `transcript-parity.test.tsx` | I24 | +| `interview.test.ts` | I87, I101 | +| `turn-artifacts.test.ts` | I104 | +| `phase-close.test.ts` | I72 | +| `router.test.tsx` | I102 | +| `GraphView.test.tsx` | I48, I102 | +| `project.test.ts` / `launcher.test.ts` / `runtime-config.test.ts` | I4, I100 | +| `corpus.test.ts` / `walkthrough.test.ts` / `seed.test.ts` | I103 | + +## Acceptance Criteria + +1. `npx brunch` can start from a workspace directory with local-first persistence in `.brunch/`. +2. Greenfield and brownfield grounding both work, with brownfield able to start from workspace analysis and converge into the same grounding phase purpose. +3. Structured turns support rich responses without losing semantic fidelity. +4. The knowledge layer stays visible, typed, and linked through graph relationships. +5. Phase closeability, readiness, and closure provenance stay legible to the user. +6. Requirements and criteria review remain explicit, lightweight, durable at the turn level, and export-relevant. +7. 
Revisit can invalidate knowledge, surface cascade through the `reconciliation_need` queue, and re-resolve through the patch list — no separate modal or secondary-thread surface. +8. The routed UI stays stable across dashboard, phase views, sidebar knowledge, and graph view. +9. Resume works from persisted state. +10. The verification gate passes. +11. Grounding/design use workspace-owned turn cards for substantive elicitation, requirements/criteria use full-set review turns, and structural kickoff / recovery / handoff / completion affordances project without a bare generic composer. +12. Hydrated transcripts preserve interviewer-side structure plus stable durable activity summaries for any live-only artifacts that were shown during streaming, including elapsed thinking time and a coarse tool-use summary / placeholder seam. +13. Open phases bottom-load a projected kickoff card, the current frontier turn, a visible generation state, or a projected recovery card; completed elicitation turns replay as answered-turn records, and closed phases bottom-load a projected handoff or completion artifact. +14. Preface cards render as turn-internal artifacts paired with question cards, so the observer captures from the whole validated turn rather than from unvalidated provisional content alone. +15. Grounding and elicitation persist only the durable exploration ontology, with `non-goal` represented as a `constraint` subtype rather than a separate top-level kind. +16. Observer prompt, shared kind registry, schema / API types, fixtures, and UI copy describe the same ontology and accepted-review semantics without per-layer language drift. +17. The interview can orient itself anywhere in the `greenfield <> brownfield` by `end-to-end build <> incremental feature` matrix without forcing whole-project assumptions. +18. 
Observer capture records intent edges broadly enough that most durable intent items link to upstream or downstream context whenever that relation is reasonably traceable. +19. Users who cannot complete a long interview can request candidate directions with explained tradeoffs and refine by reacting to them. +20. The interview can stop at a broad pass and deepen selected areas incrementally through explicit next-detail actions. +21. Graph view renders the intent graph as a navigable workspace with visible edges and node-launched refinement flows, not just a grouped list. +22. First-run setup makes missing provider credentials visible and recoverable from the dashboard without requiring users to hand-edit project `.env` files. +23. Brunch can help users keep `.brunch/` out of version control through an explicit, idempotent `.gitignore` confirmation flow. diff --git a/memory/CARDS.md b/memory/CARDS.md deleted file mode 100644 index 90855cb5..00000000 --- a/memory/CARDS.md +++ /dev/null @@ -1,442 +0,0 @@ - - -# Scope cards — V3.1 + node-edit completion - -The frontier (PLAN.md §Next item 2) is **Side-chat V3.1 — agent-grouped reconciliation resolution**. The user's direction (T-019e08b9 conversation) folds in two adjacent improvements that close out node editing on the cascade surface: showing the source diff inline (Card A) and an Edit-target affordance per need (Card B). These compose against the V3.0 seam and are independent of the V3.1 agent. - -Cards 1-3 are done and live in settled seams. Card 4 is a Figma-aligned visual polish pass over those surfaces (chat panel, staged-patches strip, Pending review section, direct-edit toolbar) that lands before the V3.1 agent UI builds on top — the agent's per-row status chips and proposal-diff actions reuse the polished `DiffPopover` and toolbar contracts that Card 4 introduces. Card 5 is the V3.1 agent backend (next, full scope card). 
Cards 6-7 (V3.1 client UI + bulk actions) are NOT queued yet — they depend on `ln-oracles` settling the LLM verification strategy and on what slice 5 actually feels like. - ---- - -## 1 — Source-content snapshots on `reconciliation_need` (server) — `done` - -### Objective - -Each `reconciliation_need` row carries the source item's content snapshot from immediately before and after the edit that opened it, so downstream surfaces (Pending review row, V3.1 agent pre-image) can render or reason about the actual change without re-querying mutable item history. - -### Acceptance Criteria - -- ✓ `reconciliation_need` table has two new nullable columns: `source_previous_content TEXT`, `source_current_content TEXT`. -- ✓ `OpenReconciliationNeedInput` accepts and persists `sourcePreviousContent` and `sourceCurrentContent`; `openReconciliationNeed` writes them through. -- ✓ `handleApplyEdit` (edit-route hard path) passes the existing `previousContent` and the just-applied `parsed.data.content` into every `openReconciliationNeedIfAbsent` call. -- ✓ `ReconciliationNeedRecord` (shared type) gains the two fields so the client query exposes them. -- ✓ `GET /api/specifications/:id/reconciliation-needs` returns the new fields without breaking existing test payload assertions. -- ✓ Existing partial-unique-index dedupe still applies — re-applying the same edit does not open new rows or overwrite snapshots. -- ✓ Migration `0018_reconciliation_need_source_snapshots.sql` is generated via the standard drizzle pipeline. - -### Verification Approach - -- Inner: extend `cascade-producer.test.ts` / `reconciliation-need.test.ts` for the new columns; extend `edit-route.test.ts` to assert snapshots arrive on opened needs; extend `reconciliation-needs-route.test.ts` for the response shape. `npm run verify`. - -### Promotion checklist - -- [ ] Requirement change? **No** — extends existing Requirement 10 surface, no new requirement. -- [ ] Assumption change? 
**No** — A88 is preserved; A80 explicitly pre-authorizes extending the queue table with provenance fields. -- [ ] Non-trivial design decision? **No** — D139 already routes cascade through `reconciliation_need`; this just attaches the source delta to the queue row instead of re-deriving it. -- [ ] New seam-level invariant? **No** — snapshots are advisory render data, not load-bearing state. `caused_by_turn_id` already exists for stronger provenance. -- [ ] Crosses >2 major seams? **No** — schema + producer + shared type + route response. -- [ ] First touch in unfamiliar seam? **No** — same code paths as PR #115/#116. -- [ ] Cannot name containing seam from live docs? **No** — `cascade-producer.ts`, `db.openReconciliationNeed*`, `reconciliation-needs-route.ts`, SPEC.md A80/A88 + I112/I113. - -→ Stays light. - ---- - -## 2 — Source diff rendered inline on each Pending review row (client) — `done` - -### Objective - -Each row in `<PendingReviewSection>` shows the source item's before/after as a `<ContentDiff>` so the user can read what changed without leaving the cascade surface. - -### Acceptance Criteria - -- ✓ When `source_previous_content` and `source_current_content` are both present and non-equal, the row renders a `<ContentDiff>` block under the source/target reference line. -- ✓ When either snapshot is null (legacy rows opened before Card 1), the row renders today's bare layout — no diff block, no error. -- ✓ Diff styling reuses FE-665's `<ContentDiff>` component verbatim; no new diff library, no duplicated tokenization. -- ✓ The diff block does not push the Resolve button below the fold of the overlay's normal scroll height — verified by snapshot of an existing test fixture. -- ✓ A small "Source change" label sits above the diff so it's not confused with the (future) target diff. 
- -### Verification Approach - -- Inner: extend `pending-review-section` component test (or add one) using the existing fixture in `__tests__/reconciliation-need-fixtures.ts` extended with snapshots; assert `<ContentDiff>` renders only when snapshots are present and non-equal. -- Outer: manual walkthrough of an apply that opens 4 needs — the diff should make the source change legible at a glance. - -### Promotion checklist - -- [ ] All seven items: **No**. Pure UI composition over an existing component using fields just added in Card 1. No new decisions, no new seams. - -→ Stays light. - ---- - -## 3 — "Edit target" affordance per Pending review row (client + reuse) — `done` - -### Objective - -Each row gets an "Edit target" button that expands an inline textarea pre-filled with the target item's current content; saving runs through the existing edit pipeline (`PATCH /knowledge-items/:id`) and then resolves the need (`POST /api/specifications/:id/reconciliation-needs/:needId/resolve`). - -### Acceptance Criteria - -- ✓ Each row exposes `[ Edit target ]` alongside the existing `[ Resolve ]`. -- ✓ Clicking Edit target expands an inline textarea with the target item's current content; ⌘↵ saves, esc cancels. -- ✓ Save calls `PATCH /knowledge-items/:id` (or whichever existing edit-route endpoint FE-657 uses) with content + rationale; on success, calls the existing resolve endpoint and refetches the needs query. -- ✓ If the edit returns `impact === 'hard'` with new opened needs, the new needs surface immediately in the same Pending review section — re-entrant cascade works without a page reload. -- ✓ While save is in flight, both Edit target's Save button and the row's Resolve are disabled. -- ✓ Existing per-row Resolve behavior is unchanged when the editor is collapsed. 
-- ✓ The target's current content used to pre-fill the textarea comes from a single source of truth — either the existing knowledge-items query already mounted on the overlay surface, or a newly threaded field on `ReconciliationNeedRecord`. Pick one and document the choice in the commit body. - -### Verification Approach - -- Inner: extend the pending-review section test to drive the inline-edit flow against a mocked edit endpoint; assert the resolve endpoint is called only after the edit succeeds; assert re-entrant cascade rows appear after save. -- Outer: manual walkthrough — open a hard apply that creates needs, edit one target inline, confirm the cascade rerenders and Resolve clears the row. - -### Promotion checklist - -- [ ] Requirement change? **No** — already implicit in Requirement 10's HITL contract. -- [ ] Assumption change? **No**. -- [ ] Non-trivial design decision? **Possibly** — choosing whether the target content is read from the items query vs threaded onto `ReconciliationNeedRecord`. Both are reversible. Not promoting unless the chosen direction surfaces a durable invariant. -- [ ] New seam-level invariant? **No**. -- [ ] Crosses >2 major seams? **No** — UI + existing edit-route + existing resolve endpoint. -- [ ] First touch in unfamiliar seam? **No**. -- [ ] Cannot name containing seam? **No** — `pending-review-section.tsx`, FE-657 inline-edit pattern, V2 edit-route. - -→ Stays light. 
- ---- - -## 4 — Side-chat / pending-review polish — Figma alignment + `DiffPopover` primitive (client) — `done` - -### Objective - -Bring the V3.0 side-chat panel, staged-patches strip, pending-review section, and direct-edit toolbar in line with the HASH-SgAI Figma design language (file `nTw9n0blCJm1j9t22Jo72d`, node `969:13119`) and Linear-chat minimal-chrome conventions: kind-accent tints replace ad-hoc grays, a shared `DiffPopover` primitive replaces inline diff expanders, action chrome shrinks toward icon-only ghost shapes, and FE-only vocabulary unifies as `Note` / `Edit mode`. No backend or contract changes. The four sub-sections form a cohesive pass — split during build is allowed, but the kind-accent tint system, button shapes, and vocabulary must land together to feel coherent. - -### Acceptance Criteria - -**S1 — Vocabulary + chat-panel chrome** (`side-chat-popover.tsx`) - -- ✓ FE-only string changes, no type renames: - - Annotate button label `Annotate` → `Note`; aria `Annotate item` → `Add a note`. - - Annotation composer aria `Annotation composer` → `Note composer`; summary placeholder `Summary` → `Title`; body placeholder `Note body` → `Details`. - - Edit-mode button label `Edit` → `Edit mode` (off) / `Edit on` (active); tooltip → `Toggle edit mode — your messages propose changes for review`. - - Promote-from-drawer aria `Add … to chat context` → `Add … to context`. - - Patch kinds (`'annotate' | 'edit' | 'edge' | 'drill-down'`) and `mode` prop values stay unchanged at the type level. -- ✓ Top-right floating header buttons (layout-toggle, close) shrink from 24×24 to 20×20 ghost. -- ✓ The current right-side action row above the input (`[Annotate] [Edit]`) is removed. Both actions move: - - `+ note` becomes a 24×24 ghost icon button inside the input card's left action row, next to the disabled `+` attach button (`NotebookPen` icon, label only on hover/aria). 
- - Notes(N) drawer button (rendered when `existingAnnotations.length > 0`) sits to its right in the same input-card left action row. Drawer still opens upward as a popover. -- ✓ `Edit mode` becomes a thin strip rendered **below** the input card (separate full-width row, ~28px tall): `[PencilLine icon] Edit mode [toggle pill on right]`. When `mode === 'edit'`: strip bg `${kindAccent}10`, input placeholder swaps to `Suggest an edit…`, toggle pill reads `Edit on`. -- ✓ When `kindAccent` is null (untyped pinned item), all kind-accent tint values fall back to `#5424ff` so existing untyped-kind behavior is preserved. - -**S2 — Staged-patches strip + diff color system + `DiffPopover`** - -- ✓ Staged-patches strip background flips from `bg-wash/60` to `${kindAccent}0a` with `${kindAccent}1f` border. Row hover bg `${kindAccent}05`. -- ✓ Per-row layout: `[kind chip with kind-specific lucide icon] [truncated title] [↗ view diff chip] [impact chip] [× discard]`. - - Kind chip rendered on every staged patch (note / edit / edge / drill-down), 10px font, kind-accent-tinted bg. - - `[↗ view diff]` chip rendered only when `kind === 'edit' && currentContent !== newContent`. Click opens `<DiffPopover>` anchored to this chip. - - Today's inline `<ContentDiff>` expander is removed. - - Discard `×` shrinks to 14×14 ghost icon (`X` lucide), opacity 0 by default, opacity 1 on row hover or focus-within. - - Impact chip moves to the right of the title (not next to kind chip). -- ✓ Footer Undo / Apply become 28×28 icon-only: - - Undo: `Undo2` lucide, ghost (no bg), hover bg `${kindAccent}14`. Aria `Undo last change`. Hidden when `!canUndo`. - - Apply: `Check` lucide, solid `${kindAccent}` bg, white icon, ring shadow. Aria `Apply N change(s)`. Stays the eye-anchor. -- ✓ The current `Saving change…` status moves into a small inline label adjacent to Apply (right-aligned), not its own row. -- ✓ A new component `<DiffPopover>` lands at `src/client/components/diff-popover.tsx`: - - Props: `{ open, onClose, anchor, before, after, title, kindChip?, kindAccent? }`. - - Floating popover, no backdrop dim. Click-outside and ESC close. - - Viewport-aware absolute positioning (above by default, below if no space above). No floating-ui dependency. - - Max-width 480px. Header bar `[kind chip] [title (truncated)] [✕]` over `${kindAccent}10` bg. Body `<ContentDiff>`. Container border `${kindAccent}1f`. -- ✓ `<ContentDiff>`'s inline tint colors (warm-amber removed, cool-blue added) are unchanged — the visual diff signal stays decoupled from kind-accent so it remains universally readable. - -**S3 — `PendingReviewSection` redesign** (`pending-review-section.tsx`) - -- ✓ Strip background softens from `rgba(255,219,168,0.35)` to `rgba(255,219,168,0.18)`. -- ✓ Strip header becomes `[AlertCircle, amber] N pending reviews` — count + icon, no chevron, no resolve-all. -- ✓ Per-row layout: - ``` - ┃ [supersedes|confirm chip with Replace|CheckCircle2 icon] #ID · {target excerpt} - ┃ from #ID was edited [↗ view source diff] - ┃ [✎ edit] [✓ resolve] - ``` - - Left vertical bar (`┃`) is 2px wide, `rgba(255,219,168,0.6)` neutral-amber for v1 (target-item-kind enrichment is the deferred follow-up below). 
- Title shows raw `#ID` followed by `·` and the first ~80 chars of `target_current_content`, single-line truncate. - - Sub-line `from #ID was edited` rendered only when `source_previous_content` and `source_current_content` are both present and differ. The `[↗ view source diff]` chip opens `<DiffPopover>` (same primitive from S2). - - Today's inline `<ContentDiff>` block under the row is removed; the diff is reachable only through the chip. - - Action row: `[✎ edit]` and `[✓ resolve]` 24×24, opacity 0.6 default, opacity 1 on row hover or focus-within. Edit is ghost. Resolve is solid `${kindAccent}` (target-kind-accent fallback to neutral amber when target kind unknown for v1). Tooltips and aria carry the labels. -- ✓ State icons: - - Resolving in-flight: `Check` swaps to `Loader2` spinner. - - Saving in-flight (inline edit form): `Check` swaps to `Loader2` spinner. -- ✓ Inline edit form (when `editDrafts` has the row's id) wraps textarea + Cancel/Save in `${kindAccent}10`-tinted card with `${kindAccent}1f` border. Cancel and Save shapes match S4's direct-edit toolbar contract. -- ✓ Existing data-attribute selectors (`data-need-id`, `data-need-kind`, `data-edit-target-form`) stay so the existing tests still resolve rows. - -**S4 — Direct-edit toolbar** (`ItemEditTextarea` in `routes/specification/$id/-structured-list-view.tsx`) - -- ✓ Textarea drops `shadow-[var(--shadow-card)]` and the heavy `border-rule`. Border `border-[${kindAccent}1f]`; focus ring `${kindAccent}33` at 2px (down from `ring-3`). Background stays `bg-background`. -- ✓ Cancel becomes icon-only — drop the word `Cancel`, keep the `X` icon, set `aria-label="Cancel edit"` and `title="Cancel"`. -- ✓ Save loses the hard-coded blue gradient and `ring-1 ring-[#1060d6]`. Becomes small kind-accent-solid (`bg-[${kindAccent}]`), white text, `Check` icon + word `Save`, `size="xs"` retained. Disabled state: `opacity-40`, no special bg. -- ✓ Keyboard hint row (`⌘↵ save · esc cancel`) stays unchanged. 
- -- ✓ Vertical footprint shrinks ~6px versus today (the dropped shadow and thinner ring carry the savings). -- ✓ The same toolbar contract is reused inside `PendingReviewSection`'s inline edit form — same Cancel / Save composition, same kindAccent ring derivation, same disabled-state recipe. - -**Cross-section invariants** - -- ✓ Kind-accent values are derived everywhere from the existing `kindAccentHex` map in `knowledge-card.tsx`. No new color tokens; tints are computed inline via hex+alpha string concatenation following the existing precedent in `side-chat-popover.tsx`. - -- ✓ All four surfaces share the same `kindAccent` fallback (`#5424ff`) when a kind cannot be determined. -- ✓ `npm run verify` passes with all existing component tests, plus extensions for the new primitive and the relabeled affordances. - -### Verification Approach - -- **Inner**: - - `side-chat-popover.test.tsx`: extend to assert (a) Note/Edit-mode label changes, (b) `+ note` button lives inside the input card's left action row, (c) Edit-mode strip renders below the input card with the toggle reflecting `mode`, (d) Undo/Apply are icon-only, (e) staged-patch rows expose `[↗ view diff]` chip when content differs, (f) discard `×` only visible on row hover/focus. - - `pending-review-section.test.tsx`: extend to assert (a) `[↗ view source diff]` chip opens `<DiffPopover>` instead of inline `<ContentDiff>`, (b) per-row `[✎ edit]` and `[✓ resolve]` icon buttons render with correct ARIA, (c) inline edit form uses the new toolbar shape (icon-only Cancel, kindAccent Save), (d) `Loader2` spinner replaces `Check` during in-flight states. - - New `diff-popover.test.tsx`: rendering with/without `kindChip`, ESC closes, click-outside closes, viewport-aware position falls back to below when no space above, focus management. - - Structured-list-view tests: extend `ItemEditTextarea` assertions to confirm icon-only Cancel and small kindAccent Save (no blue gradient), keyboard hints unchanged. 
- - `npm run verify` (lint + format + tests + build). -- **Outer**: manual walkthrough — open side-chat on items of three different kinds, stage and diff-popover-inspect a few edits, apply, undo. Trigger a hard cascade, walk the Pending review surface, source-diff popover, edit-target inline, save, see re-entrant cascade rerender. Direct-edit a row from the structured-list view, confirm the toolbar feels thin and ⌘↵ flow is unchanged. - -### Promotion checklist - -- [ ] Requirement change? **No** — pure visual + vocabulary polish over already-shipped V3.0 surfaces. No new product capability. -- [ ] Assumption change? **No** — A88 (Path 1 sufficiency) and A80 (HITL contract) untouched. -- [ ] Non-trivial design decision? **Possibly** — the `DiffPopover` primitive's shape (anchored vs modal, kindAccent vs neutral chrome) is a small reusable contract. Reversible if a future surface needs a different popover shape. -- [ ] New seam-level invariant? **No** — kind-accent tints are render-time derived, not stored. -- [ ] Crosses >2 major seams? **No** — four components in `src/client/`, one new primitive, no server / shared / contract changes. -- [ ] First touch in unfamiliar seam? **No**. -- [ ] Cannot name containing seam from live docs? **No** — `side-chat-popover.tsx`, `pending-review-section.tsx`, `content-diff.tsx`, `structured-list-view.tsx#ItemEditTextarea`, all in PR #115/#116/#117 territory. - -→ Stays light. - -### Polish follow-up — reference-code & target-kind enrichment on the listing endpoint (deferred) - -Card 4's S3 keeps raw `#ID` references and a neutral-amber row left bar because the current `GET /api/specifications/:id/reconciliation-needs` payload does not carry `target_reference_code`, `target_title`, `source_reference_code`, or `target_item_kind`. 
A small follow-up card (~30 lines in `reconciliation-needs-route.ts` plus a join per row) can enrich these fields, after which: - -- The Pending review row title flips from `#12 · {excerpt}` to `AS-12 · {excerpt}`. -- The sub-line flips from `from #9 was edited` to `from AS-9 was edited`. -- The row left bar derives its color from the target's `kindAccentHex` instead of the v1 neutral amber. -- Resolve button bg derives from target-kind-accent. - -Queue this only after Card 4 ships and the v1 polish has corpus signal. - ---- - -## 5 — V3.1 agent backend (schema + classifier + run-agent endpoint) — `done` (full scope card) - -### Target Behavior - -`POST /api/specifications/:id/reconciliation-needs/run-agent` classifies every open `reconciliation_need` row in the given specification whose `agent_status` is `null`, persisting one of `{auto-confirm, auto-edit, substantive}` plus an optional text proposal per row, while transitioning each row through `null → queued → classifying → classified | failed`. - -### Boundary Crossings - -``` -→ POST /api/specifications/:id/reconciliation-needs/run-agent (route) -→ handleRunReconciliationAgent (server/reconciliation-agent-route.ts, new) -→ list open + agent_status=null needs (db.ts; existing query, new filter) -→ enrich each need with sourceItem + targetItem (existing getKnowledgeItem) -→ classifyNeed(need, sourceItem, targetItem, getRelationKind, llm) → { classification, proposal? 
} (server/reconciliation-agent.ts, new pure function) - → loadPrompt('reconciliation-classifier') (prompt-loader; new asset src/server/prompts/reconciliation-classifier.md) - → generateText({ model, system, prompt }) on the AI SDK adapter already used by side-chat-route - → parse single-shot response into label + optional proposal -→ updateReconciliationNeedAgentFields(needId, { agent_status, agent_classification, agent_proposal }) (db.ts; new helper, transitions one row at a time) -→ 200 OK { specId, ranAt, classifiedCount, failedCount } (route response) -``` - -### Risks and Assumptions - -``` -- RISK: LLM returns a label outside the three-value vocabulary - → MITIGATION: classifyNeed validates against the literal union; on parse failure, transition to 'failed' with the parser error message persisted into agent_proposal as 'Parse error: ...'. - -- RISK: classifying N needs in a single request with a synchronous LLM call blocks the route past the typical proxy timeout when N is large - → MITIGATION: V3.1 first cut runs in-process with a per-need iteration so partial progress persists; the route returns once the loop completes. Single-digit open-need counts per spec (same as the N+1 caveat in Card 3) keep this acceptable for the MVP. Promote to a queue substrate (BullMQ / pg-boss / inline scheduler) only if outer-loop walkthroughs surface user-visible blocking. - -- RISK: Re-running the agent against rows already classified clobbers prior classification - → MITIGATION: route filters strictly on agent_status IS NULL; per-row Re-run (slice 6) re-sets a single row to null first, so the re-run path stays explicit and per-need. - -- ASSUMPTION: The lifecycle (null → queued → classifying → classified | failed) plus the three-label vocabulary is enough seam to support slices 6-7 (status chips, action buttons) without further schema change. → VALIDATE: build slice 6 against the schema as-is; if a new column appears in slice 6 (e.g. 
confidence score, retry count), promote that as an A### at slice-6 scoping. → memory/SPEC.md §Assumptions A88 (Path 1 sufficiency) is the umbrella; this is a sub-assumption under it. - -- ASSUMPTION: Single-shot LLM call (one prompt → one structured response, no tool use, no multi-turn) is sufficient classification quality for the three-label decision when the prompt has source previous + current content (Card 1) and target current content (Card 3) in context. → VALIDATE: the middle-loop golden-fixture corpus (see Verification Approach) is the only oracle that proves this; if classification is unstable across runs at temperature 0, promote to multi-shot or add confidence scoring as a follow-up slice. → memory/SPEC.md §Acknowledged Blind Spots row "V3.1 classifier multi-run determinism" already names this; current mitigation is the per-need Re-run button shipping in slice 6. -``` - -No spike required — both LLM seam (`generateText` via the existing AI SDK adapter) and the prompt registry (`prompt-loader` + markdown assets) are already in production use. The classifier is novel only in *what* it classifies, not *how* it talks to the model. 
- -### Acceptance Criteria - -``` -✓ schema: reconciliation_need.test.ts — three new nullable columns (agent_status TEXT, agent_classification TEXT, agent_proposal TEXT) round-trip through openReconciliationNeed* and the listing query; defaults are all null on existing rows -✓ schema: migration 0019_reconciliation_need_agent_columns.sql is hand-written + journal entry added (per HANDOFF.md non-TTY caveat); structural test asserts column presence -✓ classifier (state-machine, stubbed LLM): reconciliation-agent.test.ts — happy path null → queued → classifying → classified with label='auto-confirm' on a leaf need; auto-edit returns a non-null proposal; substantive returns null proposal -✓ classifier (state-machine, stubbed LLM): reconciliation-agent.test.ts — failure path null → queued → classifying → failed when the stub throws; agent_classification stays null; agent_proposal carries the error message -✓ classifier (state-machine, stubbed LLM): reconciliation-agent.test.ts — invalid label from the stub transitions to failed with a 'Parse error: ...' 
proposal; agent_classification stays null -✓ classifier (pure): reconciliation-agent.test.ts — classifyNeed is pure: same (need, source, target, relationKind) input + stubbed LLM returning the same string yields the same { classification, proposal } output -✓ route: reconciliation-agent-route.test.ts — POST .../run-agent returns 200 with { classifiedCount, failedCount } and persists agent_status/classification on every previously-null open need; rows already classified stay untouched -✓ route: reconciliation-agent-route.test.ts — POST .../run-agent on a spec with zero open needs returns 200 with { classifiedCount: 0, failedCount: 0 } -✓ route: reconciliation-agent-route.test.ts — POST .../run-agent on a missing or non-owned spec returns the same 404 / 403 shape as the existing reconciliation-needs route (auth parity) -✓ wire: reconciliation-needs-route.test.ts — GET .../reconciliation-needs response now exposes agent_status, agent_classification, agent_proposal on every row; existing test fixtures stay typesafe by adding null defaults to makeNeed -✓ wire: ReconciliationNeedRecord (shared type) gains the three fields with doc-comments naming the lifecycle and label vocabulary -``` - -### Verification Approach - -``` -- Inner: deterministic state-machine tests over the lifecycle with a stubbed classifier (per SPEC.md row 553); structural unit tests for new schema columns + classifyNeed purity; route-level tests for the run-agent endpoint and the listing-endpoint wire-shape change. `npm run verify`. - -- Middle: golden-fixture corpus of (source change, target content, relation kind) → expected classification tuples, evaluated against the live AI SDK adapter behind a recorded-or-live model. **Seed bootstrap (this slice ships the seed; the corpus harness itself is built incrementally as classification probes lands)**: - 1. (no semantic source change, target unchanged, depends_on) → auto-confirm - 2. 
(rename "user" → "customer" in source, target verbatim references "user", refines) → auto-edit, proposal replaces "user" with "customer" in target text - 3. (constraint loosened in source, target encodes the older constraint, constrains) → substantive, proposal null (judgment required) - 4. (added counterexample to source, target unaffected, illustrates) → auto-confirm - 5. (verifier replaced in source, target derives_from old verifier, derived_from) → substantive - Per SPEC.md §Verification Design row 554, the corpus lives outside `npm run verify` (recorded-or-live model adapter). The five seed tuples land as a test-resources directory next to reconciliation-agent.test.ts so slice 6/7 can extend them; the harness that runs the corpus against the live adapter is its own slice (not this one). This slice's middle-loop deliverable is **the seed corpus + the prompt asset that the corpus exercises**, not the runner. - -- Outer: deferred to after slice 7 (UI actions land), per SPEC.md row 555. The walkthrough on dense specs validating A88 is the only ring that says whether grouping helps. -``` - -### Promotion notes - -- New invariant lands as **I114** in SPEC.md §Invariants: lifecycle + label vocabulary + structural recoverability (`agent_proposal` text-only, never auto-applied; `failed` is reachable from `classifying` and is recoverable via per-need Re-run in slice 6). Add I114 row to SPEC.md during build (per ln-scope traceability rule for full cards). The "planned I114" placeholder already in SPEC.md rows 553-554 gets replaced with the live id. -- No new D### unless the in-process loop turns out to be the wrong shape under outer-loop walkthrough; per HANDOFF.md it stays a deliberate MVP choice with a documented promotion trigger. -- A88 stays open — this slice does **not** validate it; slice 7 outer-loop walkthrough does. 
- ---- - -## 6 — V3.1 agent client UI — Run agent + status chips + per-row Re-run (client + small server seam) — `next` (full scope card) - -### Target Behavior - -A user with open `reconciliation_need` rows can trigger the V3.1 classifier from the Pending review header, see each row's live classification state as a chip (`null` / `queued` / `classifying` / `auto-confirm` / `auto-edit` / `substantive` / `failed`), and re-run classification on any single row from its action rail — all within the existing `<PendingReviewSection>` polished surface, with no per-class action behavior yet (those land in Card 7). - -### Boundary Crossings - -``` -→ `<PendingReviewSection>` header (src/client/components/pending-review-section.tsx — Card 4 polished surface) -→ `<RunAgentButton>` (NEW small component; lives in header next to the existing kind-counts row) -→ POST /api/specifications/:id/reconciliation-needs/run-agent (existing, Card 5) -→ useSpecificationOpenReconciliationNeeds query (existing; verify it returns agent_* fields per Card 5 slice 4) -→ conditional refetchInterval (1000ms while ANY need.agent_status ∈ {'queued','classifying'}, otherwise off) -→ Per-row `<ClassificationChip>` (NEW; seven variants; reuses kind-chip pattern from Card 4) -→ Per-row [↻ Re-run] action button (visible only when agent_status ∈ {'classified','failed'}) -→ POST /api/specifications/:id/reconciliation-needs/:needId/reset-agent (NEW minimal route; idempotent agent_status → null + immediate POST run-agent for that row) -→ db.resetReconciliationNeedAgentFields(needId) (NEW one-liner helper; clears agent_status / agent_classification / agent_proposal on one row) -→ Optimistic chip swap on Run / Re-run click (chip flips to 'queued' immediately; polling reconciles) -``` - -### Risks and Assumptions - -- RISK: 1-second polling against the listing endpoint while several needs classify in parallel could hammer the server → MITIGATION: conditional polling — `refetchInterval: 1000` ONLY when at least one open need is `queued` or `classifying`; idle state has no polling. 
Single-digit need counts per spec (existing N+1 caveat from Card 3) keep this cheap. If outer-loop reveals load issues, drop to 2000ms or switch to event-stream notification (a Card 6.5 follow-up, not pre-scoped). -- RISK: Per-row Re-run requires resetting `agent_status` before the run-agent route's `IS NULL` filter will re-process the row → MITIGATION: dedicated `POST .../:needId/reset-agent` endpoint that clears the three agent_* fields on one row in a single statement, then calls the same classifier pipeline used by the spec-level run-agent route (factor the inner loop into a shared helper if cleanest). One transaction; idempotent. -- RISK: "Run agent" button state must reflect global agent state (running / idle / partial) and disable correctly to prevent double-runs → MITIGATION: derive button state from query data (`hasInflight = some need has agent_status ∈ {queued, classifying}`); the spec-level run-agent route is already idempotent on rows where `agent_status IS NOT NULL`, so duplicate clicks during polling are harmless but disabling avoids confusing UX. -- ASSUMPTION: The GET listing endpoint already exposes `agent_status`, `agent_classification`, `agent_proposal` per Card 5 slice 4 [2026-05-08]. → VALIDATE: read `src/server/reconciliation-needs-route.ts` listing path before implementing; if not exposed, this is a one-line extension to the response shape and the `ReconciliationNeedRecord` shared type. → memory/SPEC.md I114 (lifecycle persistence). -- ASSUMPTION: Seven chip variants (null / queued / classifying / auto-confirm / auto-edit / substantive / failed) cover all states the user sees today; the `failed` chip carries enough information (icon + label + tooltip showing `agent_proposal` error text) for the user to decide whether to Re-run without opening a separate panel. → VALIDATE: outer-loop walkthrough on dense graphs; if `failed` rows need richer detail, add a hover-popover in Card 7 along with the auto-edit DiffPopover. 
-- ASSUMPTION: Card 6 alone is enough surface to validate A88 (does grouping help legibility?). → VALIDATE: outer-loop walkthrough on a dense real spec immediately after build; the qualitative read is whether the chip vocabulary makes the queue actionable at a glance vs. the V3.0 flat list with per-row Resolve. → memory/SPEC.md A88. If A88 invalidates, route through `/ln-spec` before scoping Card 7. - -### Acceptance Criteria - -``` -✓ pending-review-section.test.tsx — Run agent button renders in the section header when ≥1 open need has agent_status=null; clicking dispatches exactly one POST .../run-agent call -✓ pending-review-section.test.tsx — Run agent button is disabled (and tooltip explains) while any open need has agent_status ∈ {'queued','classifying'}; re-enabled when all rows reach a terminal state -✓ pending-review-section.test.tsx — progress strip ("Agent: M of N classified") renders only while in-flight; counter derives from agent_status states; hidden at rest -✓ pending-review-section.test.tsx — each row renders a ClassificationChip matching agent_status (seven fixtures for null / queued / classifying / auto-confirm / auto-edit / substantive / failed); failed chip shows agent_proposal tooltip on hover -✓ pending-review-section.test.tsx — per-row Re-run button visible only when agent_status ∈ {'classified','failed'}; click sends POST .../:needId/reset-agent; resulting chip transitions queued → classifying → terminal within the polling window -✓ specification-open-reconciliation-needs.test.tsx — refetchInterval is 1000ms when any need is queued/classifying; falsy when all needs are terminal or null (no polling at rest) -✓ reconciliation-needs-route.test.ts — new POST .../:needId/reset-agent: 200 + classifier dispatch on valid open need; 404 on unknown need; 200 + no-op when row is already null; auth parity with existing /resolve route -✓ classification-chip.test.tsx — pure render snapshot of all six variants; accessibility labels are present and 
distinguishable -✓ npm run verify — no unrelated regressions in existing pending-review-section, reconciliation-needs-route, or query tests -``` - -### Verification Approach - -``` -- Inner: component test extensions in pending-review-section.test.tsx for the seven UI cases above; query-layer test for refetchInterval gating; route-level test for the new reset-agent endpoint; standalone ClassificationChip variant tests. `npm run verify` covers all. -- Middle: not applicable — no LLM judgment changes; classifier prompt unchanged from Card 5. The middle-loop golden-fixture corpus seeded in Card 5 stays as-is. -- Outer: manual walkthrough — open a spec with a dense knowledge graph (≥10 items with mixed typed edges); make a hard-impact edit that opens 4–6 needs across both `supersedes` and `needs_confirmation` kinds; click Run agent; observe chips cycle through queued → classifying → terminal within ~5s; re-run two rows manually; ASSESS A88: can the user interpret the classification pattern without coaching? Is the queue more actionable than V3.0's flat list with per-row Resolve? Capture qualitative notes — these are the A88 signal that gates Card 7 scoping. -``` - -### Promotion checklist - -- [ ] Requirement change? **No** — Requirement 10 already names the HITL contract. -- [x] Assumption change? **Maybe** — A88's outer-loop validation lands here. Build does not change A88; the walkthrough either upholds it, refines it (e.g., "grouping helps but only with class-count summary in header"), or invalidates it (e.g., "users miss substantive needs in a mixed-chip list"). No SPEC update at scope time; route through `/ln-spec` post-walkthrough only if invalidated. -- [x] Non-trivial design decision? **Yes** — (a) chip vocabulary (six variants + tooltip-driven failure detail), (b) per-row reset endpoint shape (`POST /reset-agent` vs. generic `PATCH /agent-status` mutation), (c) polling cadence + conditionality. 
All three are reversible inside the route + component contracts; document choices in the commit body. No D### needed unless walkthrough reveals a load-bearing constraint. -- [ ] New seam-level invariant? **No** — I114 (classifier lifecycle) already established by Card 5; Card 6 surfaces it without changing it. The `agent_proposal` text-only / never-auto-applied invariant remains untouched (Card 7 will lean on it for `auto-edit` Apply). -- [x] Crosses >2 major seams? **Yes** — listing route (verify-only), new reset-agent route, query hook, component surface, three new sub-components. Justifies full scope. -- [ ] First touch in unfamiliar seam? **No** — same code paths as Card 4 polish + Card 5 backend. -- [ ] Cannot name containing seam? **No** — `pending-review-section.tsx`, `useSpecificationOpenReconciliationNeeds`, `reconciliation-needs-route.ts`. - -→ Stays full scope on (b) + (c) + multi-seam crossing. SPEC.md unchanged at scope time; reconcile only if A88 walkthrough invalidates. - ---- - -## 7 — V3.1 per-class actions + bulk (client + listing extension) — `done` (full scope card) - -### Target Behavior - -Each classified row exposes the action appropriate to its classification (`auto-confirm` → Confirm; `auto-edit` → Apply suggested / Skip with a `` preview; `substantive` → Open side-chat) and the section header exposes two bulk actions ("Confirm all (N)" and "Apply all suggested (N)") that iterate client-side over the existing per-row endpoints, closing V3.1's user-facing surface end-to-end. 
- -### Boundary Crossings - -``` -→ (src/client/components/pending-review-section.tsx — Card 6 surface) -→ Per-row action rail (new buttons mounted next to existing Resolve / Edit / Re-run) - ├── auto-confirm → Confirm → resolveReconciliationNeedRequest (existing) - ├── auto-edit → Apply → editKnowledgeItemRequest({ content: agent_proposal }) + resolveReconciliationNeedRequest - ├── auto-edit → Skip → resolveReconciliationNeedRequest (existing) - ├── auto-edit → View proposal → opens with target current → agent_proposal - └── substantive → Open side-chat → useSideChat().openFor({ kind, id, referenceCode, content }) -→ Header bulk row (next to Run agent / progress strip) - ├── Confirm all (N) → iterate auto-confirm rows → Promise.allSettled(resolveReconciliationNeedRequest…) - └── Apply all suggested (N) → iterate auto-edit rows with proposals → Promise.allSettled(editKnowledgeItemRequest + resolveReconciliationNeedRequest…) -→ GET /api/specifications/:id/reconciliation-needs listing endpoint extension - └── ReconciliationNeedView gains target_item_kind + target_reference_code (closes Card 4 deferred follow-up) -→ ReconciliationNeedRecord (shared type) gains the two read-time fields -``` - -### Risks and Assumptions - -- RISK: Bulk client-side iteration can fire N concurrent PATCHes against the same spec, which may stress the server's edit-route under hard-impact cascade (one bulk Apply could open many needs) → MITIGATION: serialize bulk operations (await each request before starting the next) rather than parallelize; the user-perceived latency is dominated by network round-trips, not parallelism, and serialization keeps cascade-opening predictable. Trade-off documented in commit body. 
-- RISK: Apply suggested writes the raw `agent_proposal` into the target item without giving the user a chance to edit it → MITIGATION: the View-proposal `` lets the user preview before clicking Apply; if Card 7 walkthrough surfaces that the user wants to edit-before-apply, promote that to a follow-up card (would re-use Card 3's inline-textarea machinery seeded with `agent_proposal`). -- RISK: Substantive Open-side-chat opens an ephemeral conversation that disappears on refresh (V4a side-chat persistence isn't shipped yet) → MITIGATION: accepted for V3.1; the side-chat is anchored to the target item with its current content as pinned context, which is enough for one substantive walk. V4a persistence makes the same affordance durable without changing the entry contract. -- RISK: The listing-endpoint extension touches a shared response shape (`ReconciliationNeedView`, `ReconciliationNeedRecord`) consumed by existing tests and component fixtures → MITIGATION: both new fields are nullable on the shared type; existing fixtures (`reconciliation-need-fixtures.ts`) default both to null, so older tests stay green. The Confirm / Apply / Open-side-chat buttons all need-check for non-null before enabling. -- ASSUMPTION: `useSideChat()` returning `null` (no SideChatHost mounted) is the right gate to hide the Open-side-chat button. → VALIDATE: outer-loop walkthrough on a route that has SideChatHost; render unit test that asserts the button hides when the context is null. -- ASSUMPTION: Confirm-all / Apply-all-suggested are scoped per-classification (not "everything the agent classified") because the actions are semantically distinct. → VALIDATE: outer-loop walkthrough; if users want a single "apply everything" affordance, that's a follow-up after seeing the multi-button feel. 
- -### Acceptance Criteria - -``` -✓ reconciliation-needs-route.test.ts — GET .../reconciliation-needs response now includes target_item_kind and target_reference_code on every row (defaults null when target item missing) -✓ pending-review-section.test.tsx — auto-confirm row exposes Confirm button; click calls resolveReconciliationNeedRequest once -✓ pending-review-section.test.tsx — auto-edit row exposes View-proposal + Apply + Skip; View opens the existing DiffPopover with target current vs agent_proposal; Apply calls editKnowledgeItemRequest with content=agent_proposal then resolveReconciliationNeedRequest; Skip calls resolveReconciliationNeedRequest only -✓ pending-review-section.test.tsx — substantive row exposes Open side-chat button; click invokes useSideChat().openFor with the target item's kind / id / referenceCode / content -✓ pending-review-section.test.tsx — substantive row hides Open-side-chat when useSideChat() returns null (no host mounted) -✓ pending-review-section.test.tsx — header exposes "Confirm all (N)" only when ≥1 auto-confirm row exists; click resolves each auto-confirm row serially -✓ pending-review-section.test.tsx — header exposes "Apply all suggested (N)" only when ≥1 auto-edit row with non-null agent_proposal exists; click applies each in sequence -✓ pending-review-section.test.tsx — auto-edit rows lacking agent_proposal are excluded from "Apply all suggested" iteration but still expose Skip / Resolve individually -✓ npm run verify — no regressions across server route tests, query tests, component tests -``` - -### Verification Approach - -``` -- Inner: pending-review-section.test.tsx for the per-class action affordances, bulk header visibility, and dispatch correctness; reconciliation-needs-route.test.ts for the listing-extension fields -- Middle: not applicable — no LLM judgment changes; classifier output (auto-confirm / auto-edit / substantive) is the input contract, not the output -- Outer: manual walkthrough — open a spec with 
mixed classifications (≥2 auto-confirm, ≥2 auto-edit, ≥1 substantive); use per-row actions on a few; use one bulk action; observe the row leaves the section atomically on each resolve. THIS is the second A88 signal: do the action affordances close the loop legibly, or do users hesitate on which button to use? Capture qualitative notes — this is where V3.1 ends. -``` - -### Promotion checklist - -- [ ] Requirement change? **No** — Requirement 10 already names the HITL contract for accept-on-target / edit-target / dismiss. -- [x] Assumption change? **Yes** — A88 (Path 1 sufficiency without agent) gets its second outer-loop validation here; together with Card 6's walkthrough, this is the canonical signal on whether agent grouping helps. No SPEC update at scope time; route through `/ln-spec` post-walkthrough only if invalidated. -- [x] Non-trivial design decision? **Yes** — (a) bulk semantics (serialize vs parallelize), (b) Apply path bypasses the inline-edit affordance (raw application of agent_proposal), (c) substantive handoff ships without persistence (V4a-blocked but functional). All reversible. Document in commit bodies; no D###. -- [ ] New seam-level invariant? **No** — reuses Card 5's I114 and Card 6's polling contract. -- [x] Crosses >2 major seams? **Yes** — listing route extension, shared type, component, side-chat context. Full scope. -- [ ] First touch in unfamiliar seam? **No** — same code paths as Cards 3 / 5 / 6. -- [ ] Cannot name containing seam? **No**. - -→ Full scope card. SPEC.md unchanged at scope time. After Card 7 + walkthrough lands, run `/ln-sync` to retire CARDS.md (frontier exhausted) and update PLAN.md's §Recently Completed. - ---- - -## Not yet queued - -(Nothing remaining for FE-674 after Card 7. Next frontier is the V4a side-chat persistence promotion already in PLAN.md §Next item 3 — re-scope via `/ln-scope` once V3.1 closes and the §349 anchor decision is made.) 
diff --git a/memory/PLAN.md b/memory/PLAN.md index 9c069c7f..1c3380d7 100644 --- a/memory/PLAN.md +++ b/memory/PLAN.md @@ -4,87 +4,69 @@ # Plan -The interaction model is mature: four-phase interview, interviewer-autonomous question format, phase-agnostic preface cards with workspace exploration, structured review with per-item commenting, observer knowledge extraction, workflow ownership extraction, distribution hardening, graph view's structured-list peer route, the first relation-first observer capture seam, the multi-chat substrate (chat containers + `reconciliation_need` queue), **side-chat V3.0 — hard-impact cascade through `reconciliation_need`**, and **side-chat V3.1 — agent-grouped reconciliation resolution** all ship as working product. V3.1 closes the V3.x arc: the reconciliation classifier writes `auto-confirm` / `auto-edit` / `substantive` per row and the Pending review surface renders chips + per-class actions + bulk Confirm-all / Apply-all-suggested. The live frontier is now **continuous workspace**, the phase-addressable interview surface that adopts one visible runtime per specification. +The interaction model is mature: four-phase interview, interviewer-autonomous question format, phase-agnostic preface cards with workspace exploration, structured review with per-item commenting, observer knowledge extraction, workflow ownership extraction, distribution hardening, graph view's structured-list peer route, the first relation-first observer capture seam, the multi-chat substrate (chat containers + `reconciliation_need` queue), **side-chat V3.0 — hard-impact cascade through `reconciliation_need`**, and **side-chat V3.1 — agent-grouped reconciliation resolution** all ship as working product. V3.1 closes the V3.x arc: the reconciliation classifier writes `auto-confirm` / `auto-edit` / `substantive` per row and the Pending review surface renders chips + per-class actions + bulk Confirm-all / Apply-all-suggested. 
-The May 2026 intent-spec, multi-chat, changeset-ledger, prompt/context, and agent-mutation design notes are reconciled into one direction. `docs/design/MULTI_CHAT.md` is the substrate document. `docs/design/SIDE_CHAT.md` describes side-chat V1 / V2 / V3.0 / V3.1 / V4 phasing on top of that substrate, with §13 mapping each user-surface version onto a substrate phase. `docs/design/PATCH_LEDGER.md` remains historical deeper design pressure for semantic mutation history, but canonical future-facing vocabulary is `changeset` / `change`; `docs/design/INTENT_SPEC_EVOLUTION.md` carries the broader synthesis. The product-layer ontology trajectory is split out as `docs/design/INTENT_GRAPH_SEMANTICS.md` (canonical reference for the FE-700 frontier) and `docs/design/BEHAVIORAL_KERNELS.md` (canonical reference for the FE-702 kernel probes). The dev-layer self-tooling trajectory — the `ln-*` skill family, the proposed file-backed spec registry, and the long-horizon convergence between dev and product ontologies — lives in `docs/design/DEV_WORKFLOW_EVOLUTION.md`. Older portability work remains a future-facing boundary map rather than a live roadmap item until a hosted, remote, or adapter-backed substrate becomes a product goal. +The next product arc is a **continuous conversational workspace** plus a stronger semantic/generative substrate. Continuous workspace is already active in parallel: it gives the chat runtime a stable phase-addressable host. The FE-705 branch contributes an integration substrate — a local agent capability CLI and external LLM-as-user probe harness — that should be reconciled into main before graph-review and scenario-options work depends on generated completed-spec fixtures. After that, the highest-coordination work is the intent-graph semantic model and semantic changeset ledger; lower-coordination provider, gitignore, and web-research work can proceed in parallel. 
+The May 2026 intent-spec, multi-chat, changeset-ledger, prompt/context, and agent-mutation design notes are reconciled into one direction. `docs/design/MULTI_CHAT.md` is the substrate document. `docs/design/SIDE_CHAT.md` describes side-chat V1 / V2 / V3.0 / V3.1 / V4 phasing on top of that substrate. `docs/design/PATCH_LEDGER.md` remains historical deeper design pressure for semantic mutation history, but canonical future-facing vocabulary is `changeset` / `change`; `docs/design/INTENT_SPEC_EVOLUTION.md` carries the broader synthesis. The product-layer ontology trajectory is split out as `docs/design/INTENT_GRAPH_SEMANTICS.md` (canonical reference for FE-700) and `docs/design/BEHAVIORAL_KERNELS.md` (kernel probes). FE-705's branch-local strategy/proposal notes add scenario options, graph-review oracle, chat-local strategies, and concern/dependency mapping; those notes should become a canonical design doc when the branch is integrated. The dev-layer self-tooling trajectory lives in `docs/design/DEV_WORKFLOW_EVOLUTION.md`. ## Active 1. **Continuous workspace / phase-addressable interview surface** — cumulative center pane with realized phase sections, one chat runtime per specification, sidebar section navigation, scroll/focus behavior, and the single actionable frontier preserved at the current reachable phase. - - Why now / unlocks: workflow read/write ownership is extracted (FE-616); the multi-chat substrate ships chat containers below the specification, so continuous workspace can adopt one visible runtime without smuggling in a second durable workflow model. Side-chat V3.0 + V3.1 just closed, so the cascade surface is stable; no remaining V2/V3 placeholder blocks the workspace work. + - Why now / unlocks: workflow read/write ownership is extracted (FE-616); the multi-chat substrate ships chat containers below the specification, so continuous workspace can adopt one visible runtime without smuggling in a second durable workflow model. 
Side-chat V3.0 + V3.1 just closed, so the cascade surface is stable; no remaining V2/V3 placeholder blocks the workspace work. This is being handled in parallel with the FE-705 reconciliation lane. - Traceability: A58; D86, D87, D110, D113, D114; I24, I102. - - Design doc: `docs/design/CONTINUOUS_WORKSPACE_HYBRID.md`. + - Design doc: `docs/design/CONTINUOUS_WORKSPACE_HYBRID.md`; umbrella synthesis in `docs/design/CONVERSATIONAL_WORKSPACE_RUNTIME.md`. ## Next -2. **Side-chat persistence — V4a (multi-chat Phase 2 substrate)** — side-chat client persists its turns into the existing `chat` / `turn` tables with `chat.kind='side_chat'`, loads prior side-chat sessions on remount, and surfaces an "Old chats" affordance per pinned item / spec. Phase 1 substrate (FE-697, [2026-05-06]) already shipped the `chat` table, nullable `turn.chat_id`, and `specification.primary_chat_id`; nothing schema-side blocks this — only the client write path and a per-spec session listing remain. This is the V4a half of FE-675; V4b (item versioning + branched exploration) stays in Horizon, gated on FE-701. - - Why later: side-chat threads stay in-memory through V3 by design (SIDE_CHAT.md §5.3); applied patches and `reconciliation_need` rows already persist independently. With V3.1 closed and the cascade surface settled, V4a becomes the next user-facing surface to light up — but Card 1 (server-side persistence) and Cards 2+ both depend on MULTI_CHAT.md §349's open question (anchor field on `chat` row vs deferred `chat_focus` table); route through `/ln-spec` or `/ln-spike` before scoping Cards 2+. - - Linear: FE-675 (umbrella; per-substrate phase note on FE-675 rather than a new sub-ticket). - - Traceability: Requirement 39; A82, A83; D138. - - Design doc: `docs/design/MULTI_CHAT.md` §10 Phase 2; `docs/design/SIDE_CHAT.md` §9 V4 row (V4a half only). - -3. 
**Prompt/context scenario substrate (FE-698 continuation)** — continue the FE-698 substrate after the foundation slices: packaged prompt assets, the first observer context-pack path, deterministic no-provider scenario capture, and the agent mutation-surface audit are complete, but FE-698 still has live follow-up work. - - Linear: FE-698. Pi harness spike: FE-635. - - Status: partially complete, not retired. Completed foundation: prompt registry + markdown prompt loading, observer-capture and web-research context-pack composition, scenario runner capture skeleton / seeded snapshots, prompt-source explicitness, mutation-surface audit / terminology cleanup, capability registry metadata surfaced in scenario artifacts, fake-adapter web-research scenario execution capture, a probe-only Anthropic AI SDK scenario adapter, and safe scenario execution error summaries. Outstanding FE-698 follow-up: more context-pack scenarios beyond observer capture / web research, broader read-only/proposal-only harness execution probes, and/or the Pi adapter spike. OpenRouter/default-provider setup is deferred to the first-run provider setup frontier. - - Why now / unlocks: multi-chat removes the single transcript spine as default agent context, while ontology, observer, candidate-spec, web research, behavioral-kernel, architect, and post-spec decomposition work all need shared prompt/context machinery. This prevents every future agent feature from inventing its own prompt-context hack and lets LLM-heavy flows be tested before UI work. - - Recommended shape: define the next FE-698 slice around one of the remaining prompt/context seams. Likely candidates: additional context-pack scenarios for next-question, candidate-spec, web research, reconciliation, architect, or decomposition probes; a narrow execution-probe path using the existing Anthropic API key / fake adapters; or the FE-635 Pi SDK/RPC spike. 
Keep provider credential UX, shared production AI runtime/provider resolution, execution adapters as product truth, and durable mutating handlers out of scope. The key rule is that future agent-originated writes must go through Brunch-owned handlers rather than direct ORM access. Registry naming should follow `docs/design/AGENT_MUTATION_SURFACE.md`: product nouns plus semantic verbs, with intent-graph mutations converging on `changeset.submit` / `changeset.apply` and atomic `change` variants rather than many ad hoc mutating tools. - - Verification approach: inner-loop prompt-loader/context-pack unit tests plus seeded scenario snapshots; middle-loop multi-run prompt probes should be designed before judging generative quality. - - Traceability: Requirements 40, 41, 42; A84, A85, A86, A87; D139, D140, D141, D142, D143; I112. - - Design docs: `docs/design/INTENT_SPEC_EVOLUTION.md`; `docs/design/MULTI_CHAT.md`; `docs/design/AGENT_MUTATION_SURFACE.md` (agent-originated mutation audit and registry input); Pi SDK docs as spike input. - -4. **Intent graph semantics + progressive checkability foundation** — refine the ontology and relation policy so the graph can represent invariants, examples/counterexamples, constraint subtypes, narrowed decisions, witness strength, and checkability gaps as source/destination material for future generative features. +2. **FE-705 integration — agent capability CLI + LLM-as-user fixture probe** — integrate the branch-complete local `brunch agent` JSONL capability adapter and external probe runner so agents can drive the real Brunch interview flow through Brunch-owned contracts rather than privileged ORM access. + - Linear: FE-705. Pi comparison remains FE-635 after this seam has a real Brunch use case to compare against. + - Status: branch-complete off main; not treated as shipped in main until the FE-705 implementation is rebased and verified. 
The canonical plan records it as the near-term integration substrate because later graph-review/scenario-options probes need credible completed-spec fixtures.
+ - Why now / unlocks: prompt/context and graph-review probes need realistic graph/transcript fixtures, but hand-authoring those fixtures is chicken-and-egg. A JSONL capability adapter lets an external LLM-as-user drive the real lifecycle through the same mutation authority future agents must use, pressure-testing tool-call vocabulary, chat readiness, resource identity, fixture curation, and import-boundary discipline.
+ - Recommended shape: preserve the branch's split between server-owned capability contracts and script-side probe harness. The adapter exposes explicit resource-id calls (`spec.create`, `chat.getPrimary`, `chat.ensureReady`, `chat.read`, `turn.submitResponse`, and follow-on lifecycle/export operations as scoped); the probe runner owns scenario briefs, model-backed simulated-user policy, artifact bundles, fixture-candidate inspection, and workspace-state preservation. Keep browser automation, product UI, provider credential UX, shared production provider routing, and durable runtime-operation ledgers out of the integration slice.
+ - Verification approach: contract/dispatcher tests, JSONL protocol/session tests, import-boundary tests proving the probe runner uses only the JSONL client/process boundary, fake process tests, opt-in real-provider smoke, and fixture-candidate structure/readiness checks.
+ - Traceability: Requirement 43; A89; D143, D147; I114. Also protects Requirements 40, 41, 42 by making prompt/context and mutation-surface probes executable through a real adapter.
+ - Design docs: `docs/design/AGENT_MUTATION_SURFACE.md`; `docs/design/INTENT_SPEC_EVOLUTION.md`; FE-705 branch artifacts until rebased.
+
+3. 
**Intent graph semantics + relation-policy directionality foundation** — refine the ontology and relation policy so the graph can represent invariants, examples/counterexamples, constraint subtypes, narrowed decisions, witness strength, checkability gaps, and operational edge behavior as source/destination material for future generative features. - Linear: FE-700. - - Why now / unlocks: candidate generation, behavioral kernels, architect proposals, and downstream verification-aware decomposition need a sharper semantic target than the current exploration/review ontology. - - Recommended shape: add `invariant` and `example` as first-class durable kinds; subtype examples (positive / negative / edge-case / trace / not-relevant); narrow `decision` per the decision-capture criteria; enrich `constraint` subtypes (non_goal / scope / technical / policy / resource / compatibility / environmental); add `criterion` subtypes (acceptance / test / manual_review / runtime_check / proof / observability) and `invariant` subtypes (state / transition / authority / provenance / consistency / security / data_integrity); add `checkability` and `witness strength` fields on intent items per the progressive-checkability ladder; introduce the five-family relation taxonomy (justification / dependency / boundary / refinement / verification) plus first-class negative relations (`rules_out`, `counterexample_for`); add edge epistemic metadata (`support`, `status`, `provenanceTurnId`, `rationale`); land a relation-policy registry whose axes distinguish `visible`, `cascade`, `export_trace`, `staleness`, `reconciliation`, `criteria_help`, and `weak_suggestion` participation. Full enumerations and worked examples in `docs/design/INTENT_GRAPH_SEMANTICS.md`. - - Verification approach: corpus/fixture observer probes comparing old vs refined ontology; graph-review manual assessment for precision/noise; context-pack probe outputs must show authority and witness labels. 
+ - Why now / unlocks: candidate generation, behavioral kernels, graph review, scenario-options acceleration, architect proposals, direct-edit cascade, and downstream verification-aware decomposition all need a sharper semantic target than the current exploration/review ontology. This is the semantic-layer lane most likely to collide with parallel work, so it should land before broadening observer enrichment or committing generated candidate bundles. + - Recommended shape: add `invariant` and `example` as first-class durable kinds; subtype examples; narrow `decision`; enrich `constraint`, `criterion`, and `invariant` subtypes; add `checkability` and witness strength; introduce the five-family relation taxonomy and negative relations; add edge epistemic metadata; and make relation-policy directionality explicit (`canonicalSentence`, `inverseSentence`, source-change behavior, target-change behavior) rather than inferring cascade from raw edge direction. Leave room for contrastive-kernel artifacts such as `alternative`, `question`, `ambiguity`, and `candidate`, but keep them proposal-local unless probes prove they need durable top-level kinds. + - Verification approach: corpus/fixture observer probes comparing old vs refined ontology; relation-policy unit tests for mixed-direction relations; graph-review manual assessment for precision/noise; context-pack probe outputs must show authority, witness, relation support, and directionality labels. - Traceability: Requirement 38; A77, A78, A80, A81, A84; D134, D136, D137, D139, D140. - - Design docs: `docs/design/INTENT_GRAPH_SEMANTICS.md` (canonical reference); `docs/design/INTENT_SPEC_EVOLUTION.md` (broader synthesis context). - -5. **Generative prompt probes before UI** — use the scenario substrate to prototype web research, behavioral kernels, candidate-spec completion, and post-spec design/oracle/decomposition flows against intent-graph fixtures before committing product surfaces. 
- - Linear: FE-702 for post-spec decomposition probes; FE-649 and FE-640 are productization children under FE-698. - - Why now / unlocks: proves whether progressive checkability and graph-first context can be taught to agents, and de-risks the next generation of UI features. - - Recommended shape: start with one web-research context/query scenario, the first three behavioral kernels (`state & lifecycle`, `containment & topology`, `authority & capability`) per the v0.1 kernel ontology, candidate-spec set generation, and exploratory oracle/decomposition scenarios inspired by `.agents/skills/ln-design/` and `.agents/skills/ln-oracles/`. Each kernel probe should follow the kernel-card structure (detection signals, contrastive question templates, artifact schema, validators) and emit typed intent items / intent edges per `docs/design/INTENT_GRAPH_SEMANTICS.md`. Outputs remain probe artifacts or proposal-only structures, not committed graph mutations. - - Verification approach: scenario-runner fixtures, raw output review, structured parse validation, and qualitative scorecards before product UI. - - Traceability: Requirements 20, 21, 31, 32, 40, 41; A67, A68, A80, A85, A87; D126, D127, D139, D141. - - Design docs: `docs/design/BEHAVIORAL_KERNELS.md` (kernel ontology + cards); `docs/design/INTENT_GRAPH_SEMANTICS.md` (artifact target). - - -## Horizon - -### Intent graph and reconciliation - -- **Semantic changeset ledger** — make semantic mutations first-class once non-primary surfaces can change intent-graph truth. - - Linear: FE-701. - - Recommended shape: one `changeset` contains one or more atomic `change` records. Use `changeset` / `change` as canonical schema and operation vocabulary; `patch` / `patch_change` remain historical design-doc terms only. Connect `reconciliation_need.caused_by_changeset_id` once changesets exist. - - Depends on: multi-chat substrate + reconciliation needs; prompt/context context packs for reconciliation scenarios. 
- - Traceability: A71, A82, A83; D135, D138, D140. - - Design doc: `docs/design/PATCH_LEDGER.md` (historical file name; future vocabulary is changeset/change). - -- **Relation-first observer capture enrichment** — after the next ontology/relation-policy probes, broaden observer relationship extraction across the refined ontology where edge support and operational participation are understood. - - Recommended shape: keep `runObserver()` as the public turn-owned seam, but feed it scenario-specific context packs and validate output through the relation-policy registry. The FE-639 first cut has landed; remaining work should be driven by corpus/manual proving. - - Depends on: prompt/context substrate; intent graph semantics + progressive checkability foundation. - - Traceability: Requirements 30, 38, 40; A66, A81, A84; D125, D136, D137, D139, D140; I109. - -- **Architect / generator loop** — autonomous agent that iterates over the intent graph and proposes semantic changes for HITL review through the same future changeset / reconciliation pathway as user-driven edits. - - Recommended shape: keep productized architect proposals behind multi-chat + reconciliation + semantic changesets; use the scenario substrate for shadow/proposal-only probes first. - - Traceability: A73, A85, A87; D139, D141; depends on chat containers + reconciliation needs and semantic changeset ledger. - -- **Side-chat V4b — item versioning + branched exploration** — once the patch ledger lands, item versioning unblocks dangling-annotation repair and soft-edit audit; branched exploration lets drill-downs / past-turn edits / revisits coexist with the original chain. FE-675 V4b half. - - Depends on: FE-701 patch ledger; V4a side-chat persistence (Next item 2). - - Traceability: A72, A73, A85; D139, D141. - - Design doc: `docs/design/SIDE_CHAT.md` §9 V4 row (V4b half). 
- -### User-facing capabilities + - Design docs: `docs/design/INTENT_GRAPH_SEMANTICS.md`; `docs/design/INTENT_SPEC_EVOLUTION.md`; FE-705 strategy/proposal notes for relation directionality. + +4. **Semantic changeset ledger + proposal-turn staleness** — introduce the semantic history spine that separates graph mutation history from conversational turn ancestry. + - Linear: FE-701. + - Status: not complete in main. Current code has `reconciliation_need`, side-chat apply behavior, and the V3.1 classifier lifecycle, but no first-class `changeset` / `change` ledger and no durable proposal-turn staleness semantics. + - Why now / unlocks: scenario bundle acceptance, direct-edit atomicity, accepted-with-issues flows, stale proposal detection, graph-review repairs, side-chat V4b item versioning, and future architect/reconciliation agents all need a durable semantic mutation boundary. Without it, productized scenario-options can stay probe-only but cannot safely commit candidate bundles. + - Recommended shape: add `changeset` / `change` as canonical schema and operation vocabulary; track the latest semantic changeset per specification; stamp proposal turns with base/opened changeset identity; connect `reconciliation_need.caused_by_changeset_id`; keep proposals/findings as turn-owned artifacts until accepted; ensure only `accept` applies a proposal changeset; and treat a changeset as the smallest atomic unit that preserves semantic coherence. + - Verification approach: DB atomicity tests for changeset + changes + reconciliation_need writes, staleness tests for open proposal turns across multi-chat changes, capability/transition tests proving non-accept actions cannot mutate graph truth. + - Traceability: Requirements 39, 42, 44; A71, A79; D135, D138, D143. + - Design doc: `docs/design/PATCH_LEDGER.md` (historical filename; future vocabulary is changeset/change); FE-705 strategy/proposal notes for semantic history and proposal turns. + +5. 
**Graph-review oracle + scenario-options probes** — build the internal critique path and artifact-only candidate bundle probes before product UI. + - Linear: FE-702 for graph-review / scenario probes; FE-649 and FE-640 remain productization children under FE-698 where relevant. + - Why now / unlocks: product wants first-turn strategy choice and mid-interview acceleration, but engineering needs graph-review critique to make generated candidate bundles credible. This lane can advance in parallel with FE-700 if it stays artifact-only and does not commit canonical graph truth. + - Recommended shape: define candidate graph bundle and graph-review finding artifacts; add a graph-review prompt/context pack and rubric covering coherence, fixed-premise respect, coverage, tradeoff honesty, checkability, granularity, scenario fidelity, epistemic labels, provenance, and downstream usefulness; generate 2–3 scenario options that complete the current direction from context-packed accepted graph truth; run fast gates before display and deeper async critique/refine/repair as probe artifacts; classify candidate readiness as `draft` / `reviewing` / `reviewed_clean` / `reviewed_with_issues` / `blocked`; keep broader graph-review issues turn-owned rather than adding a `graph_issue` table. + - Verification approach: scenario-runner fixtures, FE-705 JSONL-generated completed-spec fixtures, raw output review, structured parse validation, qualitative scorecards, and comparison against drilldown-produced graphs. Middle/outer-loop oracle design should decide when fixture candidates become golden. + - Traceability: Requirements 20, 21, 31, 32, 40, 41, 43, 44; A67, A68, A80, A85, A87, A89; D126, D127, D139, D141, D147. + - Design docs: `docs/design/BEHAVIORAL_KERNELS.md`; `docs/design/INTENT_GRAPH_SEMANTICS.md`; `docs/design/AGENT_MUTATION_SURFACE.md`; FE-705 strategy/proposal notes. + +6. 
**Productized scenario-options / candidate-spec completion assist** — replace skip-only remainder handling with first-turn strategy choice and a mid-interview `speed this up` path that generates reviewed candidate graph bundles with tradeoffs, completing the current direction by default. + - Why later: product UI waits on graph-review probes, FE-700 semantics, and FE-701 changesets. Until then, scenario-options remain artifact/proposal-only. + - Scope relationship: this likely absorbs or reshapes **two-axis interview framing** and **progressive detail / recursive deflation** because first-turn strategy and speed-up paths are where those distinctions become actionable. The broader **architect / generator loop** remains related but not fully subsumed; autonomous graph mutation proposals through changeset/reconciliation stay a later capability unless deliberately narrowed into this surface. + - Depends on: FE-705 fixture substrate, prompt/context substrate, intent graph semantics + relation-policy directionality, graph-review oracle, and changeset ledger for canonical acceptance. + - Traceability: Requirements 31, 40, 44; A67, A77, A78, A85; D126, D134, D136, D139. + +## Parallel / low-conflict candidates - **First-run provider setup** — make missing LLM credentials visible on the dashboard, add a shared AI runtime provider seam for interviewer / observer model construction, support UI-entered keys through XDG-compliant user auth state, and evaluate whether OpenRouter should become the preferred onboarding provider while preserving Anthropic-specific capabilities or explicit degradation. - Linear: FE-633 covers the OpenRouter/default-provider part; dashboard credential UX + XDG key storage may need a sibling issue if split from provider proving. - - Recommended shape: prove the provider resolver first with current Anthropic behavior, then spike OpenRouter against tool use, structured output, and reasoning/thinking options before making it the default. 
The dashboard should expose credential status without leaking secret values and offer setup before the user starts a specification. - Traceability: Requirements 34, 35, 36; A74, A75; D130, D131, D132; I106. - **Workspace hygiene / `.brunch/` gitignore assist** — detect whether generated local state is already ignored and, with explicit confirmation, add an idempotent `.gitignore` entry or create `.gitignore` when absent. - Linear: FE-648. - - Recommended shape: keep this as a deterministic local mutation with preview/confirmation semantics; it can ship independently, but the dashboard is the natural surface because it already explains workspace binding and first-run setup. - Traceability: Requirement 37; A76; D133; I107. - **Productized web research capability** — web search and page-fetch tools as interviewer-invoked context gathering, surfaced as preface cards after the scenario substrate proves query framing, tool ergonomics, and provisional-context handling. @@ -92,114 +74,106 @@ The May 2026 intent-spec, multi-chat, changeset-ledger, prompt/context, and agen - Depends on: prompt/context scenario substrate and web-research probe. - Traceability: Requirements 20, 21, 40, 41; D99, D112, D139, D142. -- **Dashboard result summaries and completeness metrics** — progress visibility across specifications. +## Horizon -- **Two-axis interview framing** — adapt interviewer setup and questioning to the full `greenfield <> brownfield` by `end-to-end build <> incremental feature` matrix instead of treating partial-scope work as a special case. - - Linear: FE-638. - - Traceability: Requirement 29; A65; D124. +### Semantic and generative follow-through -- **Productized candidate-spec completion assist** — replace skip-only remainder handling with a `fill in the rest for me` path that generates candidate specs, implications, tradeoffs, and likely typed knowledge for reaction-based refinement after prompt probes prove useful output. 
- - Depends on: prompt/context scenario substrate; intent graph semantics + progressive checkability foundation; candidate-spec generation probe. - - Traceability: Requirement 31, 40; A67, A77, A78, A85; D126, D134, D136, D139. +- **Relation-first observer capture enrichment** — the first cut is shipped; enrichment waits for FE-700 relation policy so observer output can broaden across the refined ontology without flooding the graph. + - Depends on: intent graph semantics + relation-policy directionality; prompt/context substrate. + - Traceability: Requirements 30, 38, 40; A66, A81, A84; D125, D136, D137, D139, D140; I109. -- **Progressive detail / recursive deflation** — support broad-pass interviewing with explicit next-level-of-detail actions rather than one uniform depth-first drill-down. - - Linear: FE-637. - - Recommended shape: pair ordinary grounding/design question turns with a turn-owned breadth-skeleton artifact that makes current coverage visible and exposes a structured detail reaction (`deepen this area`, `continue broad pass`, `sufficient for now`). The chosen reaction should steer the next same-phase frontier turn instead of introducing a separate detail workflow. - - First cut should optimize for `broad question -> choose one area to deepen next -> focused successor question -> refreshed breadth skeleton`, while keeping the same detail-focus intent reusable later from chat or graph surfaces. - - Traceability: Requirement 32; A67, A68; D127. +- **Architect / generator loop** — autonomous agent that iterates over the intent graph and proposes semantic changes for HITL review through the same future changeset / reconciliation pathway as user-driven edits. + - Status: related to scenario-options but broader. Keep productized architect proposals behind multi-chat + reconciliation + semantic changesets; use the scenario substrate for shadow/proposal-only probes first. + - Traceability: A73, A85, A87; D139, D141. 
+ +- **Server mini-library compartmentalization** — refactor growing server seams into plural public roots with same-named private subtrees where FE-698 / FE-705 pressure has made boundaries too implicit. + - Status: near-term refactor candidate after FE-705 integration, not product roadmap work. + - Candidate shape: `fixtures.ts` + `fixtures/`, `context-packs.ts` + `context-packs/`, `prompts.ts` + `prompts/`, `scenario-runner.ts` + `scenario-runner/`, `entity-apis.ts` + route submodules, and `agent-apis.ts` + capability/protocol subtrees. -- **Spatial canvas layout for graph view** — add the spatial DAG layout as a second layout choice inside graph mode, alongside the structured-list route. Same projection seam, same intent contract; only the layout strategy changes. - - Recommended shape: a layout switch inside the existing `/specification/$id/graph` route that transforms the same `EntitiesData` projection into a spatial scene with viewport / selection / focus / path-highlighting. First cut should optimize for `select node -> inspect -> launch refinement` through the multi-chat substrate. - - Depends on: graph view structured-list ship. Richer node actions depend on multi-chat / reconciliation rather than the old side-chat conceptual roadmap. - - Traceability: Requirement 33; A69; D128. +### Side-chat follow-on -- **Graph view active-path render filter + scope toggle** — render only active-path items by default in graph view, with a `Show all` toggle in the header that flips to the full whole-spec set. Both subsets project from the same in-memory `mode=project-wide` data; no second fetch. - - Depends on: server data-layer change for active-path membership exposure. - - Traceability: Requirement 33; D128, D129; I102. 
+- **Side-chat persistence — V4a (multi-chat Phase 2 substrate)** — side-chat client persists its turns into the existing `chat` / `turn` tables with `chat.kind='side_chat'`, loads prior side-chat sessions on remount, and surfaces an "Old chats" affordance per pinned item / spec. + - Status: deprioritized below continuous workspace and semantic/generative substrate. Phase 1 substrate already ships schema support; the remaining decision is the anchor model (`chat` row anchor fields vs deferred `chat_focus` table). + - Linear: FE-675 (umbrella; V4a half). + - Traceability: Requirement 39; A82, A83; D138. + - Design docs: `docs/design/MULTI_CHAT.md` §10 Phase 2; `docs/design/SIDE_CHAT.md` §9 V4 row. -### Infrastructure / tooling +- **Side-chat V4b — item versioning + branched exploration** — once the changeset ledger lands, item versioning unblocks dangling-annotation repair and soft-edit audit; branched exploration lets drill-downs / past-turn edits / revisits coexist with the original chain. + - Depends on: semantic changeset ledger; V4a side-chat persistence. + - Traceability: A72, A73, A85; D139, D141. -- **Structured development spec registry** — prototype file-backed canonical spec records, deterministic checks, generated markdown views, and task-local slices for Brunch's own development workflow (the `ln-*` skill family). - - Status: design horizon, not a migration commitment. Self-tooling experiment for the dev layer; not part of the product roadmap. - - Recommended shape: follow the `memory/spec/{schema,records,generated,tools}/` trajectory and the 5-step migration path (stable IDs → sidecar files → stop editing generated md → `spec:check` in the verify gate → task-local slices). First-adopter candidate: a bounded sub-area such as the multi-chat substrate's records, not the full SPEC. - - Traceability: D134. 
- - Design doc: `docs/design/DEV_WORKFLOW_EVOLUTION.md` (canonical reference, including the three-layer framing and convergence question); `docs/design/INTENT_SPEC_EVOLUTION.md` (broader synthesis context). +### Lower-priority / unclear product surface + +- **Dashboard result summaries and completeness metrics** — progress visibility across specifications. +- **Spatial canvas layout for graph view** — add the spatial DAG layout as a second layout choice inside graph mode, alongside the structured-list route. +- **Graph view active-path render filter + scope toggle** — render only active-path items by default in graph view, with a `Show all` toggle. +- **MCP server adapter for core operations** — future adapter over capability contracts, not direct ORM / route wrappers. +- **Git-friendly file-based persistence representation for diffable exported specs**. +- **Typed fixture-builder convergence for happy-path tests**. + +### Meta / deferred boundaries + +- **Structured development spec registry** — prototype file-backed canonical spec records, deterministic checks, generated markdown views, and task-local slices for Brunch's own development workflow. + - Meaning: self-tooling experiment for Brunch's development process, not product functionality. It would make `memory/SPEC.md` / `memory/PLAN.md` generated views over structured records to reduce drift and merge conflicts. + - Status: design horizon, not a migration commitment. + - Design doc: `docs/design/DEV_WORKFLOW_EVOLUTION.md`. - **Portability boundaries** — split durable store/read-model, interview session runtime, and workspace capability provider if Brunch targets hosted, remote, embedded, or sandbox-backed operation. - - Status: deferred. Some enabling seams already exist (query domains, workflow projector, no persisted `cwd` on specifications), but adapter-backed portability is not on the live roadmap. + - Meaning: future architecture boundary map for non-local deployments or adapter-backed execution. 
Deferred until hosted/remote/sandbox operation becomes a product goal. - Deep design source: `docs/design/PORTABILITY_BOUNDARIES.md`. -- Headless interview driver for scripted end-to-end probes. -- MCP server adapter for core operations. -- Git-friendly file-based persistence representation for diffable exported specs. -- Typed fixture-builder convergence for happy-path tests. ## Recently Completed - [2026-05-11] **Side-chat V3.1 — agent-grouped reconciliation resolution** (FE-674, PR #124 + downstack) — closes the V3.x arc end-to-end. Server: `POST /api/specifications/:id/reconciliation-needs/run-agent` (spec-level classifier loop) and `POST /api/specifications/:id/reconciliation-needs/:needId/reset-agent` (per-row Re-run) walk every awaiting open need through I114's `null → queued → classifying → classified | failed` lifecycle; agent_classification persists one of `auto-confirm` / `auto-edit` / `substantive`; agent_proposal carries an optional text suggestion. Client: `` renders six visual variants per row; `` in the Pending review header with conditional 1s polling while any need is in flight; per-row Re-run on classified/failed rows; per-class action buttons (`auto-confirm` → Confirm, `auto-edit` → View proposal + Apply + Skip, `substantive` → Open side-chat via `useSideChat().openFor`); bulk Confirm-all (N) and Apply-all-suggested (N) iterate serially over existing per-row endpoints. Listing endpoint extended with `target_item_kind` + `target_reference_code` to feed the Open-side-chat handoff. Verified: `npm run verify` 1178 / 1179 pass (one unrelated `side-chat-route` flake). **Watch**: A88 outer-loop walkthrough has not yet happened — empirical signal on whether agent grouping helps legibility vs V3.0's flat list remains open; capture qualitative notes during the next manual walkthrough on a dense spec. 
- [2026-05-11] FE-698 reconciliation context-pack slice — Added a proposal-only reconciliation prompt/context scenario that renders open reconciliation needs with source/target anchors, reason/status, prompt/context fingerprints, and read-only capability metadata. This is substrate-only: no FE-674 need lifecycle endpoint, overlay action, side-chat reducer, or durable mutation behavior. Verified: `npm run verify`. Watch: next FE-698 work can move to broader read-only/proposal-only probes and the Pi adapter spike without treating this pack as a resolution agent. - [2026-05-08] **Side-chat V3.0 — hard-impact cascade through `reconciliation_need`** (FE-674, PR #115 + #116 + #117) — three-card stack closes V3.0. Card 1 (PR #115): server `cascade-producer` + `getDownstreamEdges` + `openReconciliationNeedIfAbsent`; hard-impact apply mutates the source and opens one need per typed dependency edge; response shape adds `openedNeedIds`; partial-unique-index dedupe. Card 2 (PR #116): drop deferred banner; new `GET /api/specifications/:id/reconciliation-needs` endpoint and `useSpecificationOpenReconciliationNeeds` query; patch-list overlay renders a Pending review section listing open needs with kind chip and source/target references. Card 3 (PR #117): idempotent `POST /api/specifications/:id/reconciliation-needs/:needId/resolve` endpoint and per-row Resolve button; mutation pending state disables the button mid-flight. Verified: `npm run verify` (1063 tests, 0 lint warnings). Watch: A88 (Path 1 sufficiency without agent) is partially validated mechanically — full validation depends on outer-loop walkthrough on dense graphs. V3.1 (agent-grouped resolution) shipped 2026-05-11; richer per-row kinds beyond single Resolve are V3.1. SIDE_CHAT.md §9 updated to reflect the V3.0 single-action shape. 
-- [2026-05-08] FE-674 planning sync — reconciled `docs/design/SIDE_CHAT.md` §5.3 / §8 / §9 / §13 against the downstack FE-697 substrate; SPEC.md adds A88 (Path 1 sufficiency without agent), D146 (cascade routes through `reconciliation_need`, `deferred: true` apply contract removed at V3.0 ship), I113 (apply opens at least one need per typed dependency edge), and rewrites Acceptance Criterion 7. Doc-only, no `src/` touched. PR #110 stacked on FE-704. -- [2026-05-08] FE-698 prompt/context follow-up hardening — Candidate-spec prompt scenarios no longer advertise durable changeset submission, prompt scenario artifacts report schema version 2 for the fingerprinted shape, scenario definitions require typed context data, empty prompt assets are cached correctly, context-pack anchors use intent vocabulary, and `context-pack.ts` now remains the public entry point over private scenario-specific context-pack modules. Verified: `npm run verify`. Watch: this is still FE-698 continuation hardening; broader generative quality review and additional scenario probes remain later slices. -- [2026-05-08] FE-698 prompt/context remediation + candidate scenario — Prompt scenario definitions are now discriminated by scenario kind, candidate-spec scenarios render deterministic no-provider proposal artifacts from typed context packs, scenario artifacts include prompt/context fingerprints, server prompt asset copying mirrors current source assets, prompt golden coverage protects production prompt text, and the build-boundary prompt test writes isolated output. Verified: `npm run verify`. Watch: full generative quality review for candidate-spec output remains a later execution/probe slice. -- [2026-05-08] FE-698 scenario execution error hardening — Scenario execution failures now serialize safe deterministic summaries: API-key-like provider errors are redacted, non-Error rejections avoid object dumps, and ordinary errors remain reviewable. Verified: `npm run verify`. 
-- [2026-05-08] FE-698 Anthropic scenario adapter — Added a probe-only Anthropic AI SDK adapter behind the existing `PromptScenarioModelAdapter` seam. Web-research prompt scenarios now map rendered prompts to AI SDK system content and rendered context packs to user prompt content under mocked tests, with unsupported providers rejected before model construction. Verified: `npm run verify`. Watch: this is not the shared AI runtime provider seam; OpenRouter/provider-neutral routing, credential UX, Pi, web tools, CLI/UI, persistence, and Brunch mutations remain out of scope. -- [2026-05-08] FE-698 prompt scenario execution probe — Web-research prompt scenarios can now execute through an injected fakeable model adapter and serialize `succeeded` / `failed` execution results with raw output or deterministic error text, while no-provider artifacts remain deterministic `not-run` snapshots. Structured parsing is explicitly `not-applicable` for this prose-only web-research path. Verified: `npm run verify`. Watch: real provider adapters, Pi, web tools, CLI/UI, persistence, and mutating Brunch handlers remain out of scope for this foundation slice. -- [2026-05-07] FE-698 prompt/context foundation slices — Packaged markdown prompt registry + observer and web-research context-pack foundations + scenario runner capture skeleton/composition + agent mutation-surface audit + capability registry metadata. Server interviewer, observer, side-chat, and web-research role prompts now load from markdown assets through a typed prompt registry; observer capture and web-research probes render typed scenario-specific context packs; seeded prompt scenarios compose production prompts with typed context-pack output into deterministic no-provider probe artifacts; and scenario artifacts can declare validated Brunch capability contracts. Review fixes moved observer prompt composition into a pure module and made prompt scenario prompt sources explicit. 
The agent mutation-surface audit inventories current and projected agent-originated write paths as input to later handler slices. Verified: `npm run verify` for code slices; audit verified by code-search/document consistency. This is a completed foundation within FE-698, not retirement of the whole FE-698 frontier; the live continuation remains in `Next`. -- [2026-05-07] Side-chat V2 — Edit / Drill-down / Propose-edge plumbing (FE-673, PR #97) — added `edit`, `edge`, and `drill-down` patch kinds. Server `classifyEditImpact` returns `none | soft | hard`; soft applies directly with undo, hard returns `deferred: true` placeholder (removed at V3.0 ship). Client: patch-list reducer + three applier factories with real undo handlers. Verified: `npm run verify` (935 tests, 19 new). Watch: `SideChatPopover` Edit-mode reachability and cascade UX evolve with continuous workspace; V3.0 removed the hard-impact deferred banner. -- [2026-05-06] Multi-chat substrate + reconciliation needs (FE-697) — `chat` table with one interview chat per spec, nullable `turn.chat_id`, `specification.primary_chat_id`, mirrored `chat.active_turn_id`, plus the `reconciliation_need` queue with directed source/target items, narrow `kind`/`status`, partial unique index on open rows, cascade FK. Spec creation inserts spec + interview chat in one transaction; `advanceHead` is transactional. No user-visible change. Verified: `npm run verify` (673 tests) plus manual fixture playback (39 specs / 81 turns / dual-pointer equivalence). A82 / A83 validated for Phase 1. -- [2026-05-01] Side-chat V1.1 — Explore vertical slice. End-to-end graph-launched chat interaction shipped: prompt builder, POST `/side-chat` SSE endpoint, popover host, graph-view wiring, SSE consumer, and active-button activation. Follow-up refactor collapsed pending assistant text into the message list and extracted `SideChatHost` so activation is a tree-mount fact. 
-- [2026-05-04] Graph view structured-list peer route — `/specification/$id/graph` now renders project-wide entities through the structured-list layout with relationship subsections, relation chips, empty state, row controls, and a back-to-chat affordance. Follow-up active-path filtering and spatial canvas remain horizon work. Verified: `npm run verify` in the FE-643 slice family. - Older history: `docs/archive/PLAN_HISTORY.md` ## Dependencies ```text -TRACK A — Agent/semantic substrate +TRACK A — Workspace shell (parallel colleague lane) +continuous-workspace / phase-addressable interview surface (active) + ├──→ stable host for side-chat persistence and strategy chats + └──→ workspace-aware graph / structured-list peer routes + +TRACK B — Agent fixture substrate (FE-705 integration lane) +prompt/context scenario substrate foundation (completed) + └──→ agent capability CLI + LLM-as-user fixture probe (next, branch-complete off main) + ├──→ generated completed-spec fixture candidates + ├──→ graph-review oracle + scenario-options probes + └──→ Pi harness comparison (future, FE-635) + +TRACK C — Semantic substrate (highest coordination) multi-chat-substrate + reconciliation-needs (completed) - ├──→ prompt/context scenario substrate (completed) - │ ├──→ intent graph semantics + progressive checkability (next) - │ ├──→ generative prompt probes before UI (next) - │ │ ├──→ productized web research capability (horizon) - │ │ ├──→ productized candidate-spec completion assist (horizon) - │ │ └──→ post-spec oracle/decomposition frontier (probe/future product) - │ └──→ continuous-workspace (active, independent UI track but graph-context aware) - └──→ semantic-changeset ledger (horizon) - ├──→ relation-first observer enrichment (horizon, after ontology/policy probes) - └──→ architect-loop (horizon, proposal-only until changeset/reconciliation path) - -TRACK B — Graph/workspace surfaces -graph-view-structured-list (completed) - ├──→ active-path-filter-and-scope-toggle (horizon, 
blocked on server data-layer) - ├──→ spatial-canvas-layout (horizon) - └──→ multi-chat-substrate + reconciliation-needs (completed) - ├──→ side-chat-V2-plumbing (completed, FE-673 PR #97) - │ └──→ side-chat-V3.0-cascade-through-reconciliation_need (completed, FE-674) - │ └──→ side-chat-V3.1-agent-grouped-resolution (completed, FE-674 PR #124) - │ └──→ side-chat-persistence-V4a (next, FE-675 V4a half) - └──→ semantic-changeset ledger (horizon) - └──→ side-chat-V4b-item-versioning-+-branched-exploration (horizon, FE-675 V4b half) - -TRACK B — Infrastructure -multi-chat-substrate (completed) - ├──→ semantic-changeset ledger (horizon) - └──→ continuous-workspace (next) - - - -UNBLOCKED HORIZON -first-run provider setup (needs provider spike / scope) -workspace hygiene gitignore assist (bounded, dashboard-surface candidate) -intent-spec ontology + progressive checkability (needs probe) -relation-first observer capture (first cut complete, needs enrichment proving) -knowledge-edge semantics policy (discussion/design before observer expansion) -web-research tools (gate ready, needs tool impl) + ├──→ intent graph semantics + relation-policy directionality (next, FE-700) + │ ├──→ relation-first observer enrichment (horizon, first cut already shipped) + │ ├──→ robust direct-edit / reconciliation cascade policy + │ └──→ graph-review oracle can become semantically meaningful + └──→ semantic changeset ledger + proposal-turn staleness (next, FE-701) + ├──→ canonical scenario bundle acceptance + ├──→ direct-edit atomicity with caused_by_changeset_id + ├──→ stale open proposal detection + └──→ architect-loop / verifier/import mutation provenance + +TRACK D — Strategy probes and product acceleration +FE-705 fixtures + FE-700 semantics + └──→ graph-review oracle + scenario-options probes (next, artifact-only) + └──→ productized scenario-options / candidate-spec completion assist (after changesets) + ├──→ absorbs / reshapes two-axis interview framing + └──→ absorbs / reshapes 
progressive detail / recursive deflation + +TRACK E — Low-conflict parallel work +first-run provider setup +workspace hygiene gitignore assist +productized web research capability + +LOWER-PRIORITY / DEFERRED +side-chat persistence V4a / V4b +spatial graph layout + active-path filter dashboard metrics -two-axis interview framing -progressive detail / recursive deflation -revisit / edit-mode (reshaped by reconciliation needs + changeset ledger) -structured development spec registry (tooling experiment) -portability boundaries (deferred until substrate goal exists) +MCP adapter / file-based persistence / typed fixture builders +structured development spec registry +portability boundaries ``` - diff --git a/src/client/components/pending-review-section.tsx b/src/client/components/pending-review-section.tsx index 25f3b4d8..7bb3627a 100644 --- a/src/client/components/pending-review-section.tsx +++ b/src/client/components/pending-review-section.tsx @@ -4,17 +4,16 @@ // a per-row Resolve button. Driven by useSpecificationOpenReconciliationNeeds; // returns null when the queue is empty so the parent overlay can skip rendering. // -// V3.1 will add agent grouping (auto-confirm / auto-edit / substantive) and a -// substantive-walk surface; that work expands inside this component without -// affecting the patch-list-overlay's other regions. +// V3.1 adds agent grouping (auto-confirm / auto-edit / substantive), +// per-row agent actions, and bulk resolution while preserving the +// patch-list-overlay's surrounding staged-change regions. // // Card 4 polish: source diff is no longer rendered inline. Each row shows a // "↗ view source diff" chip that opens a . Action buttons shrink // to icon-only ghost (Edit) + small kind-accent solid (Resolve). The inline // edit form reuses the same toolbar contract as ItemEditTextarea (icon-only -// Cancel + small kind-accent Save). 
Until the listing endpoint is enriched -// with target_item_kind, the row left bar and Resolve fill use a neutral -// amber as a kind-accent fallback (deferred follow-up card). +// Cancel + small kind-accent Save). Rows use target_item_kind when present, +// with neutral amber as the nullable-kind fallback. import { Check, From 1c551c39512fc4158d793b361f22f3d78fa69e8e Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Wed, 13 May 2026 13:25:17 +0200 Subject: [PATCH 02/42] reconcile SPEC, and delete fe-705 reference copies --- memory-fe-705/PLAN.md | 235 --------------- memory-fe-705/SPEC.md | 658 ------------------------------------------ memory/SPEC.md | 30 +- 3 files changed, 26 insertions(+), 897 deletions(-) delete mode 100644 memory-fe-705/PLAN.md delete mode 100644 memory-fe-705/SPEC.md diff --git a/memory-fe-705/PLAN.md b/memory-fe-705/PLAN.md deleted file mode 100644 index 407aa54f..00000000 --- a/memory-fe-705/PLAN.md +++ /dev/null @@ -1,235 +0,0 @@ - - -# Plan - -The interaction model is mature: four-phase interview, interviewer-autonomous question format, phase-agnostic preface cards with workspace exploration, structured review with per-item commenting, observer knowledge extraction, workflow ownership extraction, distribution hardening, graph view's structured-list peer route, and the first relation-first observer capture seam all ship as working product. In this stack, downstack FE-697 supplies the multi-chat substrate (chat containers + `reconciliation_need` queue), and FE-698 supplies the prompt/context scenario substrate from `main`. Side-chat V2 plumbing — `edit` / `edge` / `drill-down` patch kinds with server route, reducer, and undo-capable appliers — is branch-complete on FE-673 (PR #97) but ships without its user-facing Edit-mode trigger, and the V2 hard-impact branch returns a `deferred: true` placeholder banner. 
The live frontier is **side-chat V3.0**, which removes that placeholder by routing hard-impact apply through the new `reconciliation_need` queue. - -The May 2026 intent-spec, multi-chat, changeset-ledger, prompt/context, agent-mutation, and strategy design notes are reconciled into one direction. `docs/design/MULTI_CHAT.md` is the downstack phase-one substrate for this stack. `docs/design/SIDE_CHAT.md` describes side-chat V1 / V2 / V3.0 / V3.1 / V4 phasing on top of that substrate, with §13 mapping each user-surface version onto a substrate phase. `docs/design/PATCH_LEDGER.md` remains historical deeper design pressure for semantic mutation history, but canonical future-facing vocabulary is `changeset` / `change`; `docs/design/INTENT_SPEC_EVOLUTION.md` carries the broader synthesis. The product-layer ontology trajectory is split out as `docs/design/INTENT_GRAPH_SEMANTICS.md` (canonical reference for the FE-700 frontier), `docs/design/BEHAVIORAL_KERNELS.md` (canonical reference for the FE-702 kernel probes), and `docs/design/SPEC_EVOLUTION_STRATEGIES.md` (chat-local strategies, candidate bundles, graph-review oracle, and concern/dependency map). The dev-layer self-tooling trajectory — the `ln-*` skill family, the proposed file-backed spec registry, and the long-horizon convergence between dev and product ontologies — lives in `docs/design/DEV_WORKFLOW_EVOLUTION.md`. Older portability work remains a future-facing boundary map rather than a live roadmap item until a hosted, remote, or adapter-backed substrate becomes a product goal. - - -## Active - -1. 
**Side-chat V3.0 — hard-impact edit cascade through `reconciliation_need`** — drop the V2 deferred banner; on hard-impact `propose_edit` apply, server enumerates incident `knowledge_edge` rows under typed relation policy (Path 1 from MULTI_CHAT.md §5.1) and opens one `reconciliation_need` per affected pair; client surfaces those rows as a `Pending review` section in `patch-list-overlay.tsx` with per-row accept-on-target / edit-target / dismiss actions. V3.0 groups needs mechanically (by `kind` and relation type); agent-grouped resolution is V3.1 horizon work. - - Why now / unlocks: downstack FE-697 supplies the queue table for this stack; the FE-674 planning sync (PR #110) reconciled SIDE_CHAT.md §5.3 / §8 / §9 / §13 and SPEC.md (Acceptance Criterion 7, A88, D146, I113) against the substrate; the V2 deferred banner is the highest-visibility user gap. Without V3.0, FE-697's queue has no reader and V2 hard-impact stays an empty promise. - - Recommended shape: ship as a small queue of scope cards inside this one frontier item (track in `memory/CARDS.md` if needed). Suggested order — (a) un-stub `SideChatPopover` Edit-mode button so V2 plumbing is reachable from the UI at all; (b) server `openReconciliationNeedsForItemChange()` + lifecycle endpoint for resolution; (c) `edit-applier` rewrite to drop the `deferred: true` shape and surface needs into side-chat state; (d) overlay `Pending review` section + per-row resolution actions; (e) verification — `edit-applier.test.ts`, `reconciliation-need.test.ts`, `patch-list-overlay.test.tsx`, F6 fixture matrix (leaf, 2-downstream, 5+-downstream, in-active-review-set, mixed kinds). - - Linear: FE-674. - - Traceability: Acceptance Criterion 7; Requirement 10; A48, A71, A83, A88, A93; D80, D135, D137, D138, D146, D150; I111, I113, I117. - - Design doc: `docs/design/SIDE_CHAT.md` §5.3, §9, §13; `docs/design/MULTI_CHAT.md` §5. - -## Next - -2. 
**Intent graph semantics + relation-policy directionality foundation** — refine the ontology and relation policy so the graph can represent invariants, examples/counterexamples, constraint subtypes, narrowed decisions, witness strength, checkability gaps, and operational edge behavior as source/destination material for future generative features. - - Linear: FE-700. - - Work type: semantic substrate / high-coordination foundation. - - Why now / unlocks: candidate generation, behavioral kernels, graph review, scenario-options acceleration, architect proposals, direct-edit cascade, and downstream verification-aware decomposition all need a sharper semantic target than the current exploration/review ontology. This is the next substrate layer most likely to collide with parallel work, so it should land before broadening graph-review or reconciliation behavior. - - Recommended shape: add `invariant` and `example` as first-class durable kinds; subtype examples (positive / negative / edge-case / trace / not-relevant); narrow `decision`; enrich `constraint`, `criterion`, and `invariant` subtypes; add `checkability` and `witness strength`; introduce the five-family relation taxonomy and negative relations; add edge epistemic metadata; and make relation-policy directionality explicit (`canonicalSentence`, `inverseSentence`, source-change behavior, target-change behavior) rather than inferring cascade from raw edge direction. Leave room for contrastive-kernel artifacts such as `alternative`, `question`, `ambiguity`, and `candidate`, but keep them proposal-local unless probes prove they need durable top-level kinds. - - Verification approach: corpus/fixture observer probes comparing old vs refined ontology; relation-policy unit tests for mixed-direction relations; graph-review manual assessment for precision/noise; context-pack probe outputs must show authority, witness, relation support, and directionality labels. 
- - Parallelization note: this is the semantic-layer lane; keep strategy probes artifact-only until this stabilizes. - - Traceability: Requirement 38; A77, A78, A80, A81, A84, A93; D134, D136, D137, D139, D140, D150; I117. - - Design docs: `docs/design/INTENT_GRAPH_SEMANTICS.md`; `docs/design/SPEC_EVOLUTION_STRATEGIES.md` §Relation directionality; `docs/design/INTENT_SPEC_EVOLUTION.md`. - -3. **Semantic changeset ledger + proposal-turn staleness** — introduce the semantic history spine that separates graph mutation history from conversational turn ancestry. - - Linear: FE-701. - - Work type: persistence / mutation substrate. - - Why now / unlocks: scenario bundle acceptance, direct-edit atomicity, accepted-with-issues flows, stale proposal detection, graph-review repairs, and future architect/reconciliation agents all need a durable semantic mutation boundary. Without it, productized scenario-options can stay probe-only but cannot safely commit candidate bundles. - - Recommended shape: add `changeset` / `change` as canonical schema and operation vocabulary; track `specification.latest_changeset_id`; stamp new turns with `opened_at_changeset_id` / `base_changeset_id`; connect `reconciliation_need.caused_by_changeset_id`; keep proposals/findings as turn-owned artifacts until accepted; ensure only `accept` applies a proposal changeset; and treat a changeset as the smallest atomic unit that preserves semantic coherence. Do not add a first-class `procedure_run` table unless lifecycle/retry/cancel or multi-turn operation grouping demands it. - - Verification approach: DB atomicity tests for changeset + changes + reconciliation_need writes, staleness tests for open proposal turns across multi-chat changes, capability/transition tests proving non-accept actions cannot mutate graph truth. - - Parallelization note: can proceed after/alongside FE-700 if relation-policy interfaces are agreed, but avoid product candidate acceptance until this lands. 
- - Traceability: Requirements 39, 42, 44; A71, A79, A92; D135, D138, D145, D149; I116. - - Design docs: `docs/design/PATCH_LEDGER.md` (historical filename; future vocabulary is changeset/change); `docs/design/SPEC_EVOLUTION_STRATEGIES.md` §Semantic history and proposal turns. - -4. **Graph-review oracle + scenario-options probes** — build the internal critique path and artifact-only candidate bundle probes before product UI. - - Linear: FE-702 for graph-review / scenario probes; FE-649 and FE-640 remain productization children under FE-698 where relevant. - - Work type: strategy/proposal artifact + oracle probe. - - Why now / unlocks: product wants first-turn strategy choice and a mid-interview `speed this up` affordance, but engineering needs graph-review critique to make generated candidate bundles credible. This lane can advance in parallel with FE-700 if it stays artifact-only and does not commit canonical graph truth. - - Recommended shape: define candidate graph bundle and graph-review finding artifacts; add a graph-review prompt/context pack and rubric covering coherence, fixed-premise respect, coverage, tradeoff honesty, checkability, granularity, scenario fidelity, epistemic labels, provenance, and downstream usefulness; generate 2–3 scenario options that complete the current direction from context-packed accepted graph truth; run fast gates before display and deeper async critique/refine/repair as probe artifacts; classify candidate readiness as `draft` / `reviewing` / `reviewed_clean` / `reviewed_with_issues` / `blocked`; keep broader graph-review issues turn-owned rather than adding a `graph_issue` table. - - Verification approach: scenario-runner fixtures, FE-705 JSONL-generated completed-spec fixtures, raw output review, structured parse validation, qualitative scorecards, and comparison against drilldown-produced graphs. Middle/outer-loop oracle design should decide when fixture candidates become golden. 
- - Parallelization note: good isolation lane for work that should avoid schema-heavy relation migrations; use `scripts/agent-probes` and context-pack modules rather than product UI. - - Traceability: Requirements 20, 21, 31, 32, 40, 41, 43, 44; A67, A68, A80, A85, A87, A89, A90, A91; D126, D127, D139, D141, D147, D151, D152; I114, I118. - - Design docs: `docs/design/SPEC_EVOLUTION_STRATEGIES.md`; `docs/design/BEHAVIORAL_KERNELS.md`; `docs/design/INTENT_GRAPH_SEMANTICS.md`; `docs/design/AGENT_MUTATION_SURFACE.md`. - -5. **Continuous workspace / phase-addressable interview surface** — cumulative center pane with realized phase sections, one chat runtime per specification, sidebar section navigation, scroll/focus behavior, and the single actionable frontier preserved at the current reachable phase. - - Why now / unlocks: workflow read/write ownership is extracted (FE-616); the multi-chat substrate (FE-697) ships chat containers below the specification, so continuous workspace can adopt one visible runtime without smuggling in a second durable workflow model. Bumped behind V3.0 and the semantic substrate because V3.0 closes a visible V2 gap while FE-700/FE-701 define the graph/turn semantics that multi-strategy work will need. - - Traceability: A58; D86, D87, D110, D113, D114; I24, I102. - - Design doc: `docs/design/CONTINUOUS_WORKSPACE_HYBRID.md`. - - -## Horizon - -### Intent graph and reconciliation - -- **Relation-first observer capture enrichment** — after the next ontology/relation-policy probes, broaden observer relationship extraction across the refined ontology where edge support and operational participation are understood. - - Recommended shape: keep `runObserver()` as the public turn-owned seam, but feed it scenario-specific context packs and validate output through the relation-policy registry. The FE-639 first cut has landed; remaining work should be driven by corpus/manual proving. 
- - Depends on: prompt/context substrate; intent graph semantics + progressive checkability foundation. - - Traceability: Requirements 30, 38, 40; A66, A81, A84; D125, D136, D137, D139, D140; I109. - -- **Architect / generator loop** — autonomous agent that iterates over the intent graph and proposes semantic changes for HITL review through the same future changeset / reconciliation pathway as user-driven edits. - - Recommended shape: keep productized architect proposals behind multi-chat + reconciliation + semantic changesets; use the scenario substrate for shadow/proposal-only probes first. - - Traceability: A73, A85, A87; D139, D141; depends on chat containers + reconciliation needs and semantic changeset ledger. - -- **Side-chat V3.1 — agent-grouped reconciliation resolution** — once V3.0 ships, a reconciliation agent reads the `reconciliation_need` queue and reclassifies open needs into auto-confirm (review-only items, one-click resolve), auto-edit (mechanical text replacements applied through the standard edit pipeline), and substantive (judgment required, walk inside the side-chat panel using pinned-context conversation). Maps onto MULTI_CHAT.md Phase 3. - - Why later: V3.0 satisfies Acceptance Criterion 7 mechanically; agent grouping is value-add, not gap-closing. Hold until V3.0's mechanical grouping reveals whether substantive items get lost in a flat list (A88 validation). - - Depends on: V3.0 ship; reconciliation agent prompt + grouping policy. - - Traceability: Requirement 10; A48, A88; D135, D137, D138, D146. - - Design doc: `docs/design/SIDE_CHAT.md` §5.3 (V3.1), §9. - -### User-facing capabilities - -- **First-run provider setup** — deferred out of FE-698. 
Make missing LLM credentials visible on the dashboard, add a shared AI runtime provider seam for interviewer / observer model construction, support UI-entered keys through XDG-compliant user auth state, and evaluate whether OpenRouter should become the preferred onboarding provider while preserving Anthropic-specific capabilities or explicit degradation. - - Linear: FE-633 covers the OpenRouter/default-provider part; dashboard credential UX + XDG key storage may need a sibling issue if split from provider proving. - - Recommended shape: prove the provider resolver first with current Anthropic behavior, then spike OpenRouter against tool use, structured output, and reasoning/thinking options before making it the default. The dashboard should expose credential status without leaking secret values and offer setup before the user starts a specification. - - Traceability: Requirements 34, 35, 36; A74, A75; D130, D131, D132; I106. - -- **Workspace hygiene / `.brunch/` gitignore assist** — detect whether generated local state is already ignored and, with explicit confirmation, add an idempotent `.gitignore` entry or create `.gitignore` when absent. - - Linear: FE-648. - - Recommended shape: keep this as a deterministic local mutation with preview/confirmation semantics; it can ship independently, but the dashboard is the natural surface because it already explains workspace binding and first-run setup. - - Traceability: Requirement 37; A76; D133; I107. - -- **Productized web research capability** — web search and page-fetch tools as interviewer-invoked context gathering, surfaced as preface cards after the scenario substrate proves query framing, tool ergonomics, and provisional-context handling. - - Linear: FE-649. - - Depends on: prompt/context scenario substrate and web-research probe. - - Traceability: Requirements 20, 21, 40, 41; D99, D112, D139, D142. - -- **Dashboard result summaries and completeness metrics** — progress visibility across specifications. 
- -- **Two-axis interview framing** — adapt interviewer setup and questioning to the full `greenfield <> brownfield` by `end-to-end build <> incremental feature` matrix instead of treating partial-scope work as a special case. - - Linear: FE-638. - - Traceability: Requirement 29; A65; D124. - -- **Productized scenario-options / candidate-spec completion assist** — replace skip-only remainder handling with first-turn strategy choice and a mid-interview `speed this up` side-chat that generates 2–3 reviewed candidate graph bundles with tradeoffs, completing the current direction by default. - - Depends on: prompt/context scenario substrate; intent graph semantics + relation-policy directionality; graph-review oracle; changeset ledger for canonical acceptance. - - Traceability: Requirements 31, 40, 44; A67, A77, A78, A85, A90, A91; D126, D134, D136, D139, D148, D151, D152; I118. - -- **Progressive detail / recursive deflation** — support broad-pass interviewing with explicit next-level-of-detail actions rather than one uniform depth-first drill-down. - - Linear: FE-637. - - Recommended shape: pair ordinary grounding/design question turns with a turn-owned breadth-skeleton artifact that makes current coverage visible and exposes a structured detail reaction (`deepen this area`, `continue broad pass`, `sufficient for now`). The chosen reaction should steer the next same-phase frontier turn instead of introducing a separate detail workflow. - - First cut should optimize for `broad question -> choose one area to deepen next -> focused successor question -> refreshed breadth skeleton`, while keeping the same detail-focus intent reusable later from chat or graph surfaces. - - Traceability: Requirement 32; A67, A68; D127. - -- **Spatial canvas layout for graph view** — add the spatial DAG layout as a second layout choice inside graph mode, alongside the structured-list route. Same projection seam, same intent contract; only the layout strategy changes. 
- - Recommended shape: a layout switch inside the existing `/specification/$id/graph` route that transforms the same `EntitiesData` projection into a spatial scene with viewport / selection / focus / path-highlighting. First cut should optimize for `select node -> inspect -> launch refinement` through the multi-chat substrate. - - Depends on: graph view structured-list ship. Richer node actions depend on multi-chat / reconciliation rather than the old side-chat conceptual roadmap. - - Traceability: Requirement 33; A69; D128. - -- **Graph view active-path render filter + scope toggle** — render only active-path items by default in graph view, with a `Show all` toggle in the header that flips to the full whole-spec set. Both subsets project from the same in-memory `mode=project-wide` data; no second fetch. - - Depends on: server data-layer change for active-path membership exposure. - - Traceability: Requirement 33; D128, D129; I102. - -### Infrastructure / tooling - -- **Server mini-library compartmentalization** — consider renaming and organizing growing server seams into plural public roots with same-named private subtrees, especially around fixtures, context packs, prompts, scenario runner, entity APIs, and agent APIs. - - Status: refactor idea captured for later, not current work and not a migration commitment. - - Candidate shape: `fixtures.ts` + `fixtures/`, `context-packs.ts` + `context-packs/`, `prompts.ts` + `prompts/` with prompt snapshots colocated under `prompts/__snapshots__/`, `scenario-runner.ts` + `scenario-runner/`, `entity-apis.ts` + `entity-apis/*-route.ts`, and `agent-apis.ts` + `agent-apis/` containing tool/capability-registry subtrees. - - Rationale: make public mini-library boundaries and private implementation compartments more obvious as FE-698 prompt/context and future agent API seams grow. 
- -- **Structured development spec registry** — prototype file-backed canonical spec records, deterministic checks, generated markdown views, and task-local slices for Brunch's own development workflow (the `ln-*` skill family). - - Status: design horizon, not a migration commitment. Self-tooling experiment for the dev layer; not part of the product roadmap. - - Recommended shape: follow the `memory/spec/{schema,records,generated,tools}/` trajectory and the 5-step migration path (stable IDs → sidecar files → stop editing generated md → `spec:check` in the verify gate → task-local slices). First-adopter candidate: a bounded sub-area such as the multi-chat substrate's records, not the full SPEC. - - Traceability: D134. - - Design doc: `docs/design/DEV_WORKFLOW_EVOLUTION.md` (canonical reference, including the three-layer framing and convergence question); `docs/design/INTENT_SPEC_EVOLUTION.md` (broader synthesis context). - -- **Portability boundaries** — split durable store/read-model, interview session runtime, and workspace capability provider if Brunch targets hosted, remote, embedded, or sandbox-backed operation. - - Status: deferred. Some enabling seams already exist (query domains, workflow projector, no persisted `cwd` on specifications), but adapter-backed portability is not on the live roadmap. - - Deep design source: `docs/design/PORTABILITY_BOUNDARIES.md`. - -- **Agent-native CLI adapter** — future CLI-addressability should project the agent capability contract registry rather than wrap routes or ORM scripts by hand. - - Status: design input captured, not current work. - - Recommended shape: generate or mechanically validate commands from capability contracts; enforce conventional verbs/flags (`get`, `list`, `--json`, `--force`, `--wait`), non-interactive defaults, bounded JSON output, enumerated errors, structured `brunch agent-context` introspection, and a recoverable async job ledger. Durable writes still route through Brunch-owned mutation handlers. 
- - Traceability: A89; D143, D147. - -- Headless interview driver for scripted end-to-end probes. -- MCP server adapter for core operations. -- Git-friendly file-based persistence representation for diffable exported specs. -- Typed fixture-builder convergence for happy-path tests. - -## Recently Completed - -- [2026-05-12] FE-705 probe harness hardening refactor — Hardened the external agent-probe harness around JSONL process failures, artifact failure reporting, explicit turn budgets, and fixture-candidate structure/readiness checks. Process transports now settle pending requests on protocol errors, malformed output, process exit, stderr, write failure, or timeout; process-backed runs serialize failure artifacts; scripted/process probes accept a configurable turn budget defaulting to two; and fixture reports distinguish `parseReady`, `structureReady`, and normalization debt. Verified: targeted probe-harness tests and `npm run verify`. -- [2026-05-12] FE-705 fixture-candidate normalization checkpoint — Added a fixture-candidate inspector for probe artifact directories. It validates `artifact-bundle.json`, `summary.json`, `raw-jsonl.ndjson`, `final-chat.json`, expected preserved workspace state, completed vs error-run status, and normalization debt for timestamps/durations, temp paths, environment metadata, generated wording, resource ids, and provider-specific error redaction. Verified: targeted test, real-provider packaged smoke artifact `/tmp/brunch-llm-user-smoke-OVPjPG`, and `npm run verify`. -- [2026-05-12] FE-705 opt-in packaged LLM-as-user smoke helper — Added a fake-tested smoke helper that runs `npm run build`, drives the default packaged `node bin/brunch.js agent` command with the model-backed user policy, preserves workspace state, and returns/prints JSON-safe summaries with redacted failure artifacts. 
Verified: targeted tests, real-provider smoke (`turnsAnswered: 2`, final frontier `answered`, artifact dir `/tmp/brunch-llm-user-smoke-OVPjPG`), and `npm run verify`. -- [2026-05-12] FE-705 model-backed LLM-as-user policy — Added a fakeable model-backed simulated-user policy for the external probe runner. It renders strict JSON prompts from scenario brief, active question, options, and prior Q/A; parses free-text and option-selection responses into `turn.submitResponse` payloads; records prompt/raw-output/parse-status events in artifact bundles; and reports invalid model output as structured probe errors. Verified: targeted tests and `npm run verify`. -- [2026-05-12] FE-705 user-simulator policy interface — Added an injectable probe response policy that receives the scenario brief, current `chat.read` projection, active turn, and prior answered turns; the scripted answer path now runs through that policy seam, and policy failures become structured probe errors. Verified: targeted test and `npm run verify`. -- [2026-05-12] FE-705 probe workspace fixture preservation — Added opt-in `preserveWorkspaceState` support for process-backed probes: run results and artifact bundles now record the temp workspace cwd, and enabled runs copy the workspace `.brunch/` state into `workspace-state/` under the artifact directory while disabled runs keep the existing minimal artifact set. Verified: targeted test and `npm run verify`. -- [2026-05-12] FE-705 probe-runner scripts harness boundary — Moved the external probe runner out of `src/server` into `scripts/agent-probes`, expanded formatter/lint/test/type-check coverage to include `scripts/`, and updated the boundary guard around the script-side harness so it cannot import DB, capability dispatch/registry, schema, core, route-transition, or turn-response authority modules directly. Verified: targeted test and `npm run verify`. 
-- [2026-05-12] FE-705 probe runner import-boundary guard — Added a static boundary test proving the probe-runner module does not import DB, capability dispatch/registry, schema, core, route-transition, or turn-response authority modules directly; the existing capability/JSONL tests continue to cover the server-owned mutation path. Verified: targeted test and `npm run check`. -- [2026-05-12] FE-705 probe artifact schema and safe summaries — Hardened proof-runner artifacts with schema-versioned bundles, command sequences, raw JSONL transcripts, parsed events, non-secret environment metadata, compact question/answer summaries, duration, and deterministic redacted errors. Verified: targeted test and `npm run check`. Watch: remaining runner boundary guard should mechanically prevent direct DB/handler imports. -- [2026-05-12] FE-705 process-backed probe runner — Added a process JSONL transport plus temp-workspace runner path around the packaged `node bin/brunch.js agent` boundary. The runner can spawn through an injected process adapter, drive the scripted two-turn probe, and write raw JSONL, final chat, and summary artifacts outside `.brunch/`. Verified: targeted test and `npm run check`. Watch: next slice should harden the artifact schema/redaction before treating output as fixture-candidate material. -- [2026-05-12] FE-705 probe runner JSONL client — Added a provider-free scripted probe-runner core over an injected JSONL transport. It drives `spec.create → chat.getPrimary → chat.ensureReady → chat.read → turn.submitResponse → chat.read → chat.ensureReady → chat.read → turn.submitResponse → chat.read`, supports free-text and option-selection responses from `chat.read`, and reports structured errors without DB/handler imports. Verified: targeted test and `npm run check`. Watch: next slice still needs a process-backed temp-workspace runner and artifact writes. 
-- [2026-05-11] FE-705 real-provider readiness smoke hardening — Hardened `chat.ensureReady` for live provider use: initial generation now uses a non-empty runtime prompt, readiness question persistence falls back from plain text to structured ask-question parts to the turn row written by tool execution, and the manual temp-workspace JSONL smoke reaches a second answerable frontier with JSONL-only output. Verified: targeted tests, real-provider smoke, and `npm run verify`. Watch: next FE-705 slice can add `turn.get` or start the proof-of-life probe runner. -- [2026-05-11] FE-705 agent turn response submission — Added executable `turn.submitResponse` with explicit chat/turn ownership checks, shared turn-response payload validation, delegation to `submitTurnResponseTransition`, and agent-facing read projection that points answered turns back to `chat.ensureReady`. JSONL tests prove `spec.create → chat.getPrimary → chat.ensureReady → turn.submitResponse → chat.read` over explicit ids. Verified: `npm run verify`. Superseded by the live readiness smoke hardening above. -- [2026-05-11] FE-705 generated chat readiness — `chat.ensureReady` now turns an empty generated frontier into an answerable `awaiting_response` frontier by invoking a fakeable interviewer generation boundary, persisting fallback question text plus assistant parts, and preserving idempotence for already-answerable turns. JSONL tests prove `spec.create → chat.getPrimary → chat.ensureReady → chat.read` returns an answerable turn through explicit ids. Verified: `npm run verify`. Superseded by the turn-response submission slice above. -- [2026-05-11] FE-705 deterministic chat readiness — Added `chat.ensureReady` as a runtime-replay JSONL capability that materialized a kickoff-ready primary chat into a persisted empty frontier turn without invoking LLM/provider generation. 
The handler resolved explicit `chatId` ownership, used the existing phase-entry transition seam, mirrored the active head through spec/chat state, and was idempotent when a frontier already existed. Verified: `npm run verify`. Superseded by the generated chat readiness slice above. -- [2026-05-11] FE-705 primary chat read projection — Added read-only `chat.getPrimary` and `chat.read` agent capabilities. JSONL clients can now create a spec, discover its primary interview chat, and read a compact Brunch-owned chat projection with spec/chat identity, visible active-path turns, frontier state, and neutral next-command hints. Verified: `npm run verify`. Watch: this is read-only; next FE-705 work still needs readiness/generation and turn-response mutation before an external LLM-as-user probe can drive the interview. -- [2026-05-11] FE-705 agent JSONL lifecycle proof — Added `brunch agent` as a long-lived JSONL capability session, with executable `spec.create` and `spec.getStatus` contracts routed through Brunch-owned handlers rather than Express routes or ORM scripts. The packaged CLI can create a real local specification and read it back by explicit `specId`; malformed JSON, unknown capabilities, and schema-invalid inputs return typed error envelopes. Verified: `npm run verify`. Watch: next FE-705 slices still need chat readiness / turn response capabilities and the external LLM-as-user probe runner. -- [2026-05-11] FE-698 reconciliation context-pack slice — Added a proposal-only reconciliation prompt/context scenario that renders open reconciliation needs with source/target anchors, reason/status, prompt/context fingerprints, and read-only capability metadata. This is substrate-only: no FE-674 need lifecycle endpoint, overlay action, side-chat reducer, or durable mutation behavior. Verified: `npm run verify`. Watch: next FE-698 work can move to broader read-only/proposal-only probes and the Pi adapter spike without treating this pack as a resolution agent. 
-- [2026-05-08] FE-674 planning sync — reconciled `docs/design/SIDE_CHAT.md` §5.3 / §8 / §9 / §13 against the downstack FE-697 substrate; SPEC.md adds A88 (Path 1 sufficiency without agent), D146 (cascade routes through `reconciliation_need`, `deferred: true` apply contract removed at V3.0 ship), I113 (apply opens at least one need per typed dependency edge), and rewrites Acceptance Criterion 7. Doc-only, no `src/` touched. PR #110 stacked on FE-704. -- [2026-05-08] FE-698 prompt/context follow-up hardening — Candidate-spec prompt scenarios no longer advertise durable changeset submission, prompt scenario artifacts report schema version 2 for the fingerprinted shape, scenario definitions require typed context data, empty prompt assets are cached correctly, context-pack anchors use intent vocabulary, and `context-pack.ts` now remains the public entry point over private scenario-specific context-pack modules. Verified: `npm run verify`. Watch: this is still FE-698 continuation hardening; broader generative quality review and additional scenario probes remain later slices. -- [2026-05-08] FE-698 prompt/context remediation + candidate scenario — Prompt scenario definitions are now discriminated by scenario kind, candidate-spec scenarios render deterministic no-provider proposal artifacts from typed context packs, scenario artifacts include prompt/context fingerprints, server prompt asset copying mirrors current source assets, prompt golden coverage protects production prompt text, and the build-boundary prompt test writes isolated output. Verified: `npm run verify`. Watch: full generative quality review for candidate-spec output remains a later execution/probe slice. -- [2026-05-08] FE-698 scenario execution error hardening — Scenario execution failures now serialize safe deterministic summaries: API-key-like provider errors are redacted, non-Error rejections avoid object dumps, and ordinary errors remain reviewable. Verified: `npm run verify`. 
-- [2026-05-08] FE-698 Anthropic scenario adapter — Added a probe-only Anthropic AI SDK adapter behind the existing `PromptScenarioModelAdapter` seam. Web-research prompt scenarios now map rendered prompts to AI SDK system content and rendered context packs to user prompt content under mocked tests, with unsupported providers rejected before model construction. Verified: `npm run verify`. Watch: this is not the shared AI runtime provider seam; OpenRouter/provider-neutral routing, credential UX, Pi, web tools, CLI/UI, persistence, and Brunch mutations remain out of scope. -- [2026-05-08] FE-698 prompt scenario execution probe — Web-research prompt scenarios can now execute through an injected fakeable model adapter and serialize `succeeded` / `failed` execution results with raw output or deterministic error text, while no-provider artifacts remain deterministic `not-run` snapshots. Structured parsing is explicitly `not-applicable` for this prose-only web-research path. Verified: `npm run verify`. Watch: real provider adapters, Pi, web tools, CLI/UI, persistence, and mutating Brunch handlers remain out of scope for this foundation slice. -- [2026-05-07] FE-698 prompt/context foundation slices — Packaged markdown prompt registry + observer and web-research context-pack foundations + scenario runner capture skeleton/composition + agent mutation-surface audit + capability registry metadata. Server interviewer, observer, side-chat, and web-research role prompts now load from markdown assets through a typed prompt registry; observer capture and web-research probes render typed scenario-specific context packs; seeded prompt scenarios compose production prompts with typed context-pack output into deterministic no-provider probe artifacts; and scenario artifacts can declare validated Brunch capability contracts. Review fixes moved observer prompt composition into a pure module and made prompt scenario prompt sources explicit. 
The agent mutation-surface audit inventories current and projected agent-originated write paths as input to later handler slices. Verified: `npm run verify` for code slices; audit verified by code-search/document consistency. This is a completed foundation within FE-698, not retirement of the whole FE-698 frontier; the live continuation remains in `Next`. -- [2026-05-07] Side-chat V2 — Edit / Drill-down / Propose-edge plumbing (FE-673, PR #97) — added `edit`, `edge`, and `drill-down` patch kinds. Server `classifyEditImpact` returns `none | soft | hard`; soft applies directly with undo, hard returns `deferred: true` placeholder. Client: patch-list reducer + three applier factories with real undo handlers. Verified: `npm run verify` (935 tests, 19 new). Watch: `SideChatPopover` Edit button stays `disabled` and hard-impact deferred banner is live until V3.0 lands. -- [2026-05-06] Multi-chat substrate + reconciliation needs (FE-697) — `chat` table with one interview chat per spec, nullable `turn.chat_id`, `specification.primary_chat_id`, mirrored `chat.active_turn_id`, plus the `reconciliation_need` queue with directed source/target items, narrow `kind`/`status`, partial unique index on open rows, cascade FK. Spec creation inserts spec + interview chat in one transaction; `advanceHead` is transactional. No user-visible change. Verified: `npm run verify` (673 tests) plus manual fixture playback (39 specs / 81 turns / dual-pointer equivalence). A82 / A83 validated for Phase 1. -- [2026-05-01] Side-chat V1.1 — Explore vertical slice. End-to-end graph-launched chat interaction shipped: prompt builder, POST `/side-chat` SSE endpoint, popover host, graph-view wiring, SSE consumer, and active-button activation. Follow-up refactor collapsed pending assistant text into the message list and extracted `SideChatHost` so activation is a tree-mount fact. This is complete implementation history; future conceptual work is multi-chat / reconciliation, not Side-chat V2/V3. 
-- [2026-05-04] Graph view structured-list peer route — `/specification/$id/graph` now renders project-wide entities through the structured-list layout with relationship subsections, relation chips, empty state, row controls, and a back-to-chat affordance. Follow-up active-path filtering and spatial canvas remain horizon work. Verified: `npm run verify` in the FE-643 slice family. -- [2026-04-30] FE-650 streamed question cache promotion — `ask_question` tool execution now advances the active frontier, returns the acknowledged turn id, interviewer streams emit a post-finalize `frontier-turn-ready` event, and the client promotes that streamed question into the specification bundle query cache before refetch reconciliation. Verified: `npm run verify` plus dev-mode manual retry; the formerly visible inert-card gap is improved. Watch: if residual scroll jumps persist, inspect remaining pane-wide rerender boundaries around workspace stream projection. -- [2026-04-30] FE-639 relation-first observer capture first cut — eligible answered turns now enter one background observer-capture backlog, observer prompts use compact existing-knowledge anchors, observer output persists validated graph-delta relationship candidates, and accepted review grounding refs reuse the same conservative relation policy. Verified: `npm run verify`. Watch: A66 remains open until corpus/manual graph-review proves edge precision and density are useful. -- [2026-04-27] Runtime JSON payload hardening — Express API parsing now accepts chat-sized request bodies above the default parser ceiling and returns a JSON 413 response instead of Express HTML when a payload exceeds the app limit. Verified: `npm run verify`. Watch: if real chat requests still exceed the 5 MB limit, investigate client history / tool-result pruning rather than only raising the ceiling. 
-- [2026-04-24] Distribution hardening release path — `package.json` now declares the Node 22+ engine floor, explicit shipped files, and public scoped publish config; `npm run release` drives release-it at repo root, rebuilds and dry-runs the packaged artifact, and documents npm auth prerequisites. Verified: `npm run verify`. Watch: CI trusted publishing is still intentionally out of scope. - -Older history: `docs/archive/PLAN_HISTORY.md` - -## Dependencies - -```text -TRACK A — Semantic substrate (highest coordination) -multi-chat-substrate + reconciliation-needs (completed) - ├──→ intent graph semantics + relation-policy directionality (next, FE-700) - │ ├──→ relation-first observer enrichment (horizon, after ontology/policy probes) - │ ├──→ robust direct-edit / reconciliation cascade (active V3.0 uses mechanical subset) - │ └──→ graph-review oracle can become semantically meaningful - └──→ semantic changeset ledger + proposal-turn staleness (next, FE-701) - ├──→ canonical scenario bundle acceptance - ├──→ direct-edit atomicity with caused_by_changeset_id - ├──→ stale open proposal detection - └──→ architect-loop / verifier/import mutation provenance - -TRACK B — Strategy probes / graph-review / candidate bundles (parallelizable if artifact-only) -prompt/context scenario substrate foundation (completed) - ├──→ agent-capability-CLI + LLM-as-user fixture probe (branch-complete FE-705) - │ └──→ golden completed-spec fixture curation (probe output) - ├──→ graph-review oracle + scenario-options probes (next, FE-702) - │ ├──→ behavioral-kernel targeted-case probes - │ ├──→ candidate bundle readiness gates - │ ├──→ async review/refine/repair worker shape - │ └──→ productized scenario-options / speed-this-up side-chat (horizon) - ├──→ productized web research capability (horizon) - └──→ post-spec oracle/decomposition frontier (probe/future product) - -TRACK C — Graph/workspace surfaces -graph-view-structured-list (completed) - ├──→ active-path-filter-and-scope-toggle 
(horizon, blocked on server data-layer) - ├──→ spatial-canvas-layout (horizon) - └──→ side-chat-V2-plumbing (completed, FE-673 PR #97) - └──→ side-chat-V3.0-cascade-through-reconciliation_need (active, FE-674) - └──→ side-chat-V3.1-agent-grouped-resolution (horizon) - -TRACK D — Workspace shell / UX -multi-chat-substrate (completed) - ├──→ continuous-workspace (next after active/substrate pressure) - ├──→ first-turn strategy choice (horizon, after strategy artifacts prove) - └──→ scenario-options side-chat (horizon, after graph-review + changesets) - -UNBLOCKED / LOWER-COORDINATION HORIZON -first-run provider setup (needs provider spike / scope) -workspace hygiene gitignore assist (bounded, dashboard-surface candidate) -web-research tools (gate ready, needs tool impl) -dashboard metrics -two-axis interview framing -progressive detail / recursive deflation -structured development spec registry (tooling experiment) -portability boundaries (deferred until substrate goal exists) -``` diff --git a/memory-fe-705/SPEC.md b/memory-fe-705/SPEC.md deleted file mode 100644 index 4e6188d1..00000000 --- a/memory-fe-705/SPEC.md +++ /dev/null @@ -1,658 +0,0 @@ - - -# Brunch v2 — Spec Elicitation Tool - -## Concept & Goal - -Brunch is an AI-guided spec elicitation tool that turns natural-language goals into structured specifications through a four-phase interview: - -- **grounding** — goals, terms, context, constraints -- **design** — commitments and tradeoffs -- **requirements** — capability review and gap-finding -- **criteria** — verification coverage - -An interviewer agent conducts the conversation. A separate observer agent extracts typed intent items from each answered turn and links them into an intent graph. The interviewer may also invoke context-gathering capabilities when it lacks enough orientation for the next move; their visible outputs appear in the stream as preface cards. 
The workspace stream is turn-centered rather than message-shaped: durable conversational turns provide the branch-bearing lineage spine, while projected control cards, phase markers, and activity cards frame them. An open phase should always bottom out in one visible next action — a projected kickoff card, actionable frontier turn, visible generation state, projected recovery card, or closed-phase handoff / completion control. - -Brunch is strongest while certainty is still being formed: when the real work is clarifying the target, surfacing commitments, and making unresolvedness legible before downstream implementation decomposition takes over. Its output is a calibrated handoff, not fake closure — a truthful starting point for implementation that makes visible what is known, chosen, constrained, required, and still open. Export is therefore built from the active path's accepted review outputs plus reviewed knowledge, not from laundering unresolved uncertainty into a prematurely final document. - -The product direction is from **planning specs** toward **intent specs**. Planning and downstream work sequencing remain useful projections, but Brunch's source artifact should preserve meaning first: what the user commits to, what properties define correctness, which examples or counterexamples disambiguate the intent, which assumptions remain open, what evidence has been accepted, and where ambiguity is explicitly unresolved. Because future agent features and post-spec handoff flows should consume the graph rather than a single transcript, Brunch needs explicit prompt/context engineering: scenario-specific graph context packs, reusable prompt doctrines, and lightweight prompt probes before UI surfaces are committed. - -Brunch operates inside a **workspace**: the cwd-backed software context whose local `.brunch/` directory stores one or more specifications. 
Grounding supports two strategies: **elicitation-first** for greenfield work and **analysis-first** for brownfield work. Brownfield grounding begins with read-only workspace analysis that produces a visible preface card (grounding brief), and the interviewer may gather more context via preface cards in any phase when it needs orientation. - -Post-launch, Brunch should support specification work across two axes rather than one: `greenfield <> brownfield` and `end-to-end build <> incremental feature`. That means the interview cannot assume one long whole-product drill-down. It should be able to start broad, deepen recursively where needed, synthesize candidate directions when the user wants help filling in the gaps, and let the intent graph itself become a working surface for refinement instead of only a sidebar summary. - -## Constraints & Non-goals - -- Anthropic direct is the current runtime implementation; near-term provider work may add OpenRouter or provider-neutral routing, but Brunch remains user-supplied-key / no hosted inference account for now. -- No collaborative editing. -- No explicit document-ingestion UX in V1. -- No hard turn-tree branching UX in V1; revisit operates through knowledge-graph edit mode + secondary threads instead. -- No automatic cascade deletion; downstream effects are surfaced and re-resolved explicitly. -- No task-planning surface; Brunch elicits specs, it does not plan implementation work for the user. -- No downstream execution-management workflow in V1; Brunch ends at the handoff boundary rather than owning implementation after export. Verification-aware decomposition and orchestration are a future product frontier to probe through agent-harness experiments before any UI commitment. -- No general-purpose inline document editor in review phases; requirements and criteria review stay recommendation-led with lightweight user comments for revision. 
-- No offline-first or multi-tab sync layer; the current system stays server-authoritative and local-first. - -## Requirements - -1. `npx brunch` in a project directory with configured supported LLM provider credentials opens a working app in the browser with state in local `.brunch/`. -2. Starting a new specification asks only for the specification name before entering the workspace; greenfield / brownfield grounding strategy is then chosen through grounding entry states inside the specification workspace. -3. Brownfield grounding can use read-only workspace analysis to ground the opening flow and the first substantive question. -4. Structured responses support turn-appropriate option selections or explicit action submissions, an explicit `none of the above` path where relevant, and one attached response note. The interviewer autonomously chooses whether to include options on each question based on conversational trajectory; grounding accepts either a free-text response or one-or-more selected options, with the response note optional when an option is selected and required only for the `none of the above` path. Design preserves the current selection-required gate with a structural "none of the above" path. A single turn may carry multiple assistant-part artifacts (e.g. a preface card followed by a question card, or a revision card followed by a review set) rendered as stacked cards with one unified response submission. -5. Users can see thinking, tool usage, and streaming progress in real time; if live-only artifacts are shown, replay keeps concise durable activity metadata (at minimum elapsed thinking time plus a coarse tool-use summary / placeholder seam) instead of dropping them completely. -6. The observer extracts typed intent items and intent edges from answered turns. -7. The accumulated knowledge layer and readiness state stay visible during the interview. -8. Each workflow mode has deterministic closeability plus a separate readiness signal. -9. 
Phase close records summary text and closure basis. -10. Users can revisit knowledge through edit mode, cascade preview, and a secondary thread. -11. Requirements review synthesizes a candidate requirement set from the knowledge layer, presents stable item reference codes, supports per-item commenting through an inline comment toggle on each item, and resolves through explicit `accept review` / `request changes` submission with per-item comments plus one optional global review note. -12. Criteria review synthesizes a candidate verification set from accepted requirements plus the knowledge layer, presents stable item reference codes, and supports the same per-item commenting and full-set review seam. -13. Export is available only when workflow closure, accepted review outputs, and staleness rules are satisfied. -14. Closing and reopening the browser resumes the specification from persisted state. -15. The dashboard shows multiple specifications / elicitation runs within one `.brunch/` directory. -16. Partial-scope elicitation works for a feature or bounded sub-area, not just whole-workspace greenfield specs. -17. Each phase exposes an explicit kickoff, frontier, recovery, handoff, or completion affordance; the UI must not strand the user with a bare generic composer as the only visible action. -18. Open interview phases default to a projected kickoff card, the current frontier turn, a visible generation state, or a projected recovery affordance when the frontier is missing, and closed phases terminate in a projected handoff or completion artifact at the bottom of the workspace stream. -19. The first phase is grounding in both product language and canonical workflow identifiers. -20. The interviewer may invoke context-gathering capabilities such as workspace analysis in any phase when the workspace directory is available; their outputs appear as visible preface cards paired with question cards within the same turn. -21. 
Preface cards are provisional context rendered as turn-internal artifacts paired with a question card within the same turn, so the observer captures from the whole validated unit (preface context + question + user response) rather than from unvalidated provisional content alone. -22. Grounding and elicitation persist only the durable exploration ontology (`goal`, `term`, `context`, `constraint`, `decision`, `assumption`); `non-goal` is represented as a `constraint` subtype, and requirements / criteria become durable only through accepted review outputs. -23. The knowledge ontology is defined once and projected consistently through schema, shared registries, observer prompts, API types, fixtures, and UI copy so kind semantics do not drift across layers. -24. Each phase section in the workspace stream opens with a phase section header that states the phase purpose and what kinds of knowledge are captured there, projected from workflow state rather than persisted as a turn. -25. When a user requests changes on a review set, the interviewer regenerates the full set as a successor review turn; revisions stack in the turn lineage but visually only the current revision renders live with a version badge, while prior revisions collapse to compact answered-turn summaries. A revision card (changelog + version badge) renders above the review set card within the same successor turn. -26. The homepage surfaces workspace (CWD) binding so the user understands that listed specifications and the "new specification" affordance are scoped to the current project directory. -27. The grounding interviewer prompt uses a hint-guided priority-ordered topic list (concept, users/audience, existing constraints, scope boundaries) with example question shapes rather than generating questions from scratch, keeping thinking budget low and generation lightweight. -28. 
Observer capture treats the full turn — including any turn-internal preface card or revision card plus the question or review set plus the user response — as one atomic validated unit for knowledge extraction. -29. Grounding captures both workspace novelty (`greenfield` / `brownfield`) and delivery posture (`end-to-end build` / `incremental feature`), and interviewer behavior adapts to any point in that matrix rather than assuming a whole-product greenfield interview. -30. Observer extraction treats typed relationships as first-class across the ontology and records them whenever they can be reasonably traced from a turn or accepted review state, while abstaining when support is weak. Relationship extraction must stay prompt-budgeted: existing entities should be presented as compact identity anchors, not full Markdown inventories or graph dumps. -31. Users can request a turn-owned candidate-spec set during grounding or design instead of only skipping the remainder of a phase; each candidate direction includes implications, tradeoffs, likely generated knowledge, and what it rules out, and the user can accept a direction, request refinement, reject, or regenerate candidates. Accepting a candidate direction may steer the next interview move and materialize intent items, but does not itself close the phase. -32. Interview detail can proceed as a progressive broad-pass-to-detail flow with explicit `next level of detail` actions, rather than only as one monolithic linear drill-down. -33. Graph view is a first-class alternative to chat view, accessed as a peer route, and projects the intent graph as a navigable workspace with visible relationship topology and supports launching refinement side-chats from graph selections. The first ship is a structured-list layout; a spatial canvas layout follows as a layout switch inside graph mode. -34. 
First-run setup detects missing expected LLM provider credentials before the user starts a specification, makes the missing-key state visible on the dashboard, and offers a guided setup path rather than requiring README / shell-env debugging. -35. If Brunch accepts an API key through the UI, it stores credentials outside the project workspace in XDG-compliant user auth/config state; project `.env` files and `.brunch/` never become the default secret-storage target. -36. LLM provider configuration is owned by a shared AI runtime provider seam, so interviewer and observer model creation do not encode direct provider imports or environment-variable reads as product truth. That seam must preserve provider-specific capabilities such as Anthropic thinking / reasoning options or degrade them explicitly. -37. Workspace hygiene detects whether the local `.brunch/` directory is git-ignored and, with explicit user confirmation, can add an idempotent `.gitignore` entry, creating `.gitignore` when absent. -38. The product ontology should expand beyond the current exploration + review kinds to support `invariant` and `example` as first-class durable knowledge kinds, with observer prompts and promotion rules that distinguish descriptive context, constraints, decisions, assumptions, requirements, invariants, criteria, and examples without treating every answer as a decision. -39. Specifications can own multiple durable chat containers below the specification, with turns gradually moving toward chat ownership while preserving current spec-scoped compatibility during transition. The same substrate records directed `reconciliation_need` process debt when changed intent items may affect other graph truth; semantic intent edges remain separate (currently persisted as `knowledge_edge` rows during transition). -40. 
Prompt and context engineering are first-class server subsystems: prompts and reusable policy doctrines live as inspectable markdown assets, while typed context-pack builders derive scenario-specific intent-graph renderings for interviewer, observer, research, candidate synthesis, behavioral kernels, reconciliation, architect, and downstream decomposition probes. -41. Agent-heavy future capabilities can be tested before product UI exists through a lightweight scenario substrate that runs prompt/context packs against seeded graphs or transcript fixtures, captures raw and structured outputs, and supports harness comparison. Scenario execution may use the existing Anthropic API key or fake adapters for probes, but first-run provider setup, credential storage, OpenRouter defaulting, and the shared production AI runtime seam belong to the provider setup frontier. Pi may be evaluated as a lower-level agent harness, especially for tool experiments and pre-UI probes, but Brunch product authority over durable workflow, replay, graph mutation, and reconciliation remains explicit. -42. Agent-originated mutations of Brunch data use one typed server-owned mutation surface regardless of caller. Internal interviewer/observer flows, scenario probes, CLI/TUI harnesses, Pi or other harness adapters, and future external agents may not mutate durable Brunch state by calling the ORM directly; they must invoke stable mutation handlers with input/output schemas, authority metadata, replay policy, and reconciliation/changeset-ledger semantics. Read-only capability contracts may share the same registry shape, but the hard invariant is single-entry mutation authority. -43. A local agent capability CLI can expose Brunch-owned capability contracts over long-lived JSONL stdin/stdout so an external probe runner or harness can drive the real specification flow without privileged ORM access. 
The CLI is an adapter over capability contracts, not a separate product API: calls carry explicit resource identifiers, read commands distinguish structured `get` / `list` data from agent-facing `read` projections with affordance hints, and mutating commands stay small and procedural around spec lifecycle requests, chat readiness, and turn response submission. The LLM-as-user scenario brief, model choice, fixture curation, and probe artifacts belong to an external probe runner that talks to the CLI like any other agent. -44. Specifications can evolve through multiple chat-local strategies rather than one global interviewer mode. A chat's first frontier turn may offer or declare its strategy (`step_by_step`, `scenario_options`, `targeted_cases`, `graph_review`, `reconciliation`), and every active/resumable chat should have at most one open assistant/system-first frontier turn waiting for a user completion action. Proposal turns use normalized completion semantics (`accept`, `reject`, `revise`, `ask_followup`, `defer`, `regenerate`); only acceptance of a proposal turn may apply that proposal's semantic changeset. Mid-interview acceleration should branch into a side-chat / strategy chat that completes the current direction from context-packed graph truth, while graph-review critique remains the internal oracle for judging and repairing generated candidate bundles. 
- -## Assumptions - - - -| # | Assumption | Confidence | Status | Depends on | Validation approach | -| --- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | ------ | ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| A15 | The LLM can offer useful coarse readiness and closure recommendations, but closure authority must remain explainable and user-legible rather than model-owned. | medium | open | D65, D66 | Manual comparison of model recommendations vs user judgment across varied projects. | -| A20 | Users experience observer capture as responsive when every eligible answered turn enters one turn-owned background capture backlog instead of blocking chat stream completion, while replay still attaches capture status and results to the originating turn. | medium | open | D22, D96, D113, D123 | Measure stream completion timing, backlog draining, and replay clarity across grounding, design, requirements, and criteria turns. | -| A48 | Knowledge-graph edges are sufficient to drive accurate cascade preview for revisit work. | medium | open | D50, D80 | Structural cascade tests plus manual judgment about scope. | -| A49 | A modal secondary thread can resolve revisit implications without forcing a full interview restart. 
| medium | open | D80 | Manual revisit walkthrough once the thread lifecycle lands. | -| A51 | Grounding plus design remain legible if the primary input surface is the workspace-owned card family — durable turn cards for substantive elicitation plus projected control cards for structural affordances — rather than a persistent global composer. | medium | open | D89, D93, D94, D110 | Manual walkthroughs on grounding, design, and resumed states plus story review of entry / handoff patterns. | -| A53 | Concise durable activity summaries are sufficient to preserve transcript trust for live thinking/tool artifacts without persisting hidden reasoning or raw tool results. | medium | open | D93, D112 | Manual replay/reload walkthroughs on streamed turns once transcript activity summaries land. | -| A54 | An open phase can reliably project a kickoff control card, current frontier turn, visible generation state, or projected recovery card on first render without requiring the user to bootstrap the phase by typing into a generic composer. | medium | open | D89, D94, D95, D110 | Manual walkthroughs on kickoff-ready, design-active, review-active, and recovery states. | -| A55 | Trailing observer capture remains trustworthy if waiting/applying state stays attached to the answered turn and deferred completion writes back through that turn's identity rather than the current frontier. | medium | open | D96, D113, D123 | Manual timing walkthroughs plus reload/resume tests on seeded turns with known deferred observer work. | -| A57 | A specification-scoped lifecycle seam — whether implemented as a lightweight runtime supervisor, router-integrated service, or chart-backed helper — can own duplicate-safe automatic phase entry / continue, late-event suppression, and route-independent in-flight operation identity without introducing a second durable workflow model or a general runtime-operations ledger. 
| medium | open | D113 | Prototype the lifecycle seam on auto-present / recovery / force-close edges; if duplicate-submit or restart truth remains ambiguous, revisit whether the seam needs stronger runtime machinery or more durable coordination. | -| A58 | A cumulative workspace can preserve phase legibility and workflow honesty if realized sections stay visible as historical record, future sections do not render until reachable, and section focus remains navigation-only state rather than redefining durable workflow truth, reachability, or the single actionable frontier. | medium | open | D86, D110, D113, D114 | Prototype the cumulative workspace against future-phase deep-link redirects, scroll/focus transitions, close-to-next-phase motion, and resume/reload walkthroughs; if unrealized-phase routing or single-frontier clarity drifts, keep the current per-phase rendering boundary. | -| A59 | Interviewer-autonomous question format — where the model chooses whether to include options based on conversational trajectory rather than rigid phase rules — produces better grounding conversations than mandating free-text-only, because the interviewer naturally starts open-ended and adds suggestive options as the user's thinking narrows. The observer can interpret option selections phase-appropriately (resonance in grounding, commitment in design) without schema changes. | medium | open | D89, D110, Requirement 4 | Manual walkthroughs across greenfield and brownfield grounding comparing interviewer-chosen format vs phase-mandated format; check whether observer captures stay coherent when the same selection structure carries different semantic weight by phase. | -| A60 | A concise phase section header (purpose + captured knowledge kinds) is sufficient to orient the user at phase entry without requiring a longer onboarding flow or tutorial card. | medium | open | D116 | Manual walkthroughs on fresh specifications; check whether users understand what the phase expects of them. 
| -| A63 | Hint-guided grounding prompts produce meaningfully adapted questions rather than degenerating into rote template output across different projects. | medium | open | Requirement 27 | Manual greenfield walkthroughs across varied project types; compare question quality against the current unconstrained prompt. | -| A64 | Replacing coarse `router.invalidate()` with query-owned invalidation boundaries eliminates the scroll-jank cascade without introducing coordination complexity or stale-data bugs; the near-term boundary may be one specification bundle domain plus a separate entities domain rather than a fake finer split. | medium | open | D121 | Prototype the staged bundle + entities decomposition and measure scroll stability plus data freshness during observer updates. | -| A65 | The interviewer can adapt usefully to the full `greenfield <> brownfield` by `end-to-end build <> incremental feature` matrix without making kickoff feel bureaucratic or over-parameterized. | medium | open | D124 | Manual walkthroughs across all four corners of the matrix, including partial brownfield feature work. | -| A66 | Relation-first observer capture will improve revisit, export grounding, and graph-view utility without flooding the graph with speculative or low-value edges. | medium | open | D50, D125 | Run post-FE-639 observer corpus probes plus manual graph/export review focused on edge precision, coverage, and visible usefulness. | -| A67 | Users who are tired, rushed, or under-informed will converge faster by reacting to synthesized candidate directions than by continuing a long direct interview or force-closing early. | medium | open | D126, D127 | Manual user-flow comparison between direct questioning, skip-close, and candidate-spec reaction flows. | -| A68 | Broad-pass interviewing followed by explicit deepen-detail actions will preserve coherence better than a single depth-first drill-down while still producing export-worthy specifications. 
| medium | open | D127 | Prototype broad-pass-first flows and compare resulting knowledge completeness and user comprehension. | -| A69 | A graph-centric refinement surface can launch side-chats without splitting durable specification truth, so chat view and graph view stay two projections over one evolving graph. | medium | open | D128, D114 | Prototype graph-launched refinement with reload/resume checks to ensure side-chat state and graph state stay coherent. | -| A70 | The structured-list graph-view layout provides standalone enumeration value beyond relationship density: users benefit from seeing all intent items grouped by kind even when most have no edges yet, and graceful degradation (collapse the relations footer when zero edges) keeps the view honest while relation-first observer capture matures. | medium | open | A66, D128, D129 | Manual walkthroughs at low and high edge density once the structured list ships; check whether the layout still feels valuable when most items have empty relations footers, and whether observer-density growth visibly improves the view over time. | -| A71 | Semantic mutations will eventually need a changeset-ledger history distinct from conversational turn ancestry, but the first implementation should prove chat containers and reconciliation needs before committing the full ledger shape. | medium | open | D135 | Build chat containers plus reconciliation needs first; revisit whether turn-linked provenance remains sufficient before adding full semantic changesets. | -| A72 | Intent items can carry version history without breaking the active-path durable-truth contract: each version is the result of an applied semantic mutation, prior versions are queryable for diff / comparison / audit, and the active-path projection always reflects the latest version for each item. 
| low | future | A71, D135 | Prototype item versioning behind the changeset ledger; verify that revisit cascades, span-anchored annotations, and soft-edit audit trails behave correctly across versions. | -| A73 | Autonomous architect / generator loops can propose useful graph mutations only after human-driven multi-chat and reconciliation surfaces prove the shared mutation pipeline. | low | future | A71, D135 | Run architect proposals in shadow mode after multi-chat / reconciliation seams stabilize, then compare proposed changes against user-driven edits. | -| A74 | OpenRouter may reduce first-run friction for Brunch's likely users compared with requiring direct Anthropic keys, but model capability parity and AI SDK support need proof before making it the default provider path. This is provider-setup work, not a default FE-698 prompt/context substrate task. | medium | open | D130, D131 | In the first-run provider setup frontier, spike provider configuration against interviewer/observer calls, especially model naming, structured output, tool use, and reasoning/thinking support. | -| A75 | XDG-compliant user-scoped auth/config storage is acceptable for UI-entered API keys and safer than writing secrets to the project workspace, while environment variables remain useful for automation and CI. | medium | open | D130, D132 | Prototype key save/load/delete precedence and inspect OS/XDG paths; manual first-run walkthrough verifies users understand where the key is stored. | -| A76 | Users will accept Brunch editing `.gitignore` when the action is explicit, previewable, and idempotent; doing so should reduce accidental commits of `.brunch/` without feeling like surprising repo mutation. | high | open | D133 | Unit-test ignore detection / append behavior and manual dashboard walkthrough with absent, present, and already-covering `.gitignore` states. 
| -| A77 | Progressive checkability will improve generated specs more than a binary "formal / not formal" framing, because the weakest sufficient witness may be prose, example, test, runtime contract, invariant, proof obligation, or explicit unresolved ambiguity depending on the intent item. | medium | open | D134 | Prototype intent-item-to-witness review on a small corpus and compare whether users can validate meaning without being forced into formal-methods terminology. | -| A78 | Adding `invariant` and `example` as product ontology candidates will make intent drift easier to detect without overwhelming early interviews, provided examples carry subtypes such as positive, negative / counterexample, edge-case, and not-relevant rather than expanding into many top-level kinds. | medium | open | D134 | Run transcript probes for examples, counterexamples, not-relevant cases, and state/transition rules; check whether items improve export and review quality or create noisy capture. | -| A79 | Once semantic truth can change through graph edits, side-chats, reconciliation, verifier feedback, or implementation feedback, turn ancestry alone will be insufficient as the semantic history spine. | medium | open | D135 | Prototype chat containers and reconciliation needs before full patch history; revisit if turn-linked provenance remains enough for first-class graph editing. | -| A80 | Behavioral kernels can generate higher-yield disambiguating questions than generic elicitation prompts, but only if kernels stay as interviewer / architect / wizard machinery that emits checkable artifacts rather than user-visible formalism. | low | open | D134 | Try state/lifecycle and containment/topology prototypes first, and compare question value against current prompt-only interviewing. 
| -| A81 | Knowledge edges can carry intent semantics without becoming too noisy only if relation policy distinguishes semantic relations from reconciliation needs, and distinguishes display edges, cascade-participating edges, export-relevant edges, staleness-producing edges, and low-confidence suggestions. | medium | open | D137 | Design relation-policy semantics before broad observer edge expansion; test low- and high-density graphs for user trust and operational noise. | -| A82 | A soft dual-pointer migration can introduce chat containers without destabilizing current spec-scoped reads: `turn.specification_id` and `specification.active_turn_id` can remain temporarily while `turn.chat_id`, `specification.primary_chat_id`, and `chat.active_turn_id` become the future ownership path. A separate `active_chat_id` is deferred until multiple active chat surfaces need an explicit UI-level pointer. | medium | validated | D138 | Validated by FE-697: `chat-substrate.test.ts` proves read-path equivalence (`spec.active_turn_id === spec.primary_chat → chat.active_turn_id`) and parent-chat consistency; `npm run verify` shows no regression in the existing interview flow. | -| A83 | A minimal item-to-item `reconciliation_need` table is enough for the first queue if it carries narrow kind/status values plus nullable provenance placeholders, and if future relation targets / changeset provenance can extend the shape without renaming the concept. | medium | validated | D137, D138 | Validated by FE-697 for the Phase 1 substrate: `reconciliation-need.test.ts` proves the partial unique index, lifecycle, cascade, and multi-kind-per-pair behaviors. Forward extensibility (changeset provenance, relation-targeted needs) remains untested until the changeset ledger lands. | -| A88 | Path 1 deterministic enumeration over existing `knowledge_edge` rows incident on a changed knowledge item produces a useful cascade preview without requiring the reconciliation agent. 
Mechanical grouping by `reconciliation_need.kind` plus relation type is enough for the user to walk a hard-edit cascade in V3.0; agent-grouped resolution (auto-confirm / auto-edit / substantive) is V3.1 work that can ship later without reshaping the queue. | medium | open | D135, D137, D138, D146 | Manual hard-edit walkthroughs across the side-chat V3.0 fixture matrix (leaf, 2-downstream, 5+-downstream, in-active-review-set, mixed `supersedes` / `needs_confirmation`). Check whether the mechanical grouping reads as actionable or whether substantive items get lost. If users skip needs without resolving, V3.1 agent work moves up the priority list. | -| A84 | Scenario-specific graph context packs can replace transcript-as-default prompt context without losing conversational nuance, provided packs preserve authority, provenance, unresolvedness, relation neighborhoods, and recency where relevant. | medium | open | D139, D140 | Build prompt/context probes over seeded graphs and compare generated observer, interviewer, candidate, and oracle/decomposition outputs against transcript-heavy baselines. | -| A85 | A lightweight prompt scenario substrate will let Brunch validate LLM-heavy product directions faster than building UI first, if it captures rendered prompts, context packs, model settings, raw outputs, structured parses, and human review notes as repeatable artifacts. | medium | open | D139 | Run multi-scenario prompt probes for observer ontology, behavioral kernels, candidate-spec assist, and downstream oracle/decomposition before productizing their UI. | -| A86 | Pi can serve as a useful pre-UI agent harness or tool-spike backend without forcing Brunch to adopt Pi as its production agent runtime, as long as integration remains adapter-shaped and Brunch-owned authority/replay/mutation semantics stay outside the harness. 
| low | open | D142 | Spike Pi SDK or RPC with in-memory sessions, custom tools, controlled prompts, and Brunch graph context packs; evaluate event capture, tool ergonomics, provider handling, packaging, and isolation. | -| A87 | Verification-aware post-spec decomposition can be explored as agent scenarios before it is a Brunch product surface: intent graph truth plus progressive checkability can feed design alternatives, oracle strategy, execution slices, and orchestration constraints. | low | future | D141 | Prototype decomposition and oracle-design probes inspired by `ln-design` and `ln-oracles`; compare outputs for traceability to requirements, invariants, examples, criteria, and blind spots. | -| A89 | A long-lived local JSONL agent capability CLI can drive the real Brunch interview flow well enough for external LLM-as-user probes to produce credible completed specification fixtures, while keeping product resources explicit in every call and using ambient process state only for runtime plumbing such as DB handles, provider config, and in-flight generation bookkeeping. | medium | open | D143, D147, Requirement 43 | Prototype the minimal `brunch agent` JSONL loop over capability contracts, then run small LLM-as-user scenarios end-to-end through `chat.ensureReady`, `chat.read`, `turn.submitResponse`, `spec.requestPhaseClosure`, and `spec.requestExport`. Validate that probe logs are replayable, no probe code imports DB/product handlers directly, and no durable operation ledger is needed for the first readiness semantics. | -| A90 | Users who ask to speed up a long interview will prefer a side-chat that generates 2–3 reviewed scenario options completing the current direction over continuing the primary drilldown, provided existing accepted graph truth is treated as fixed premise by default. 
| medium | open | D126, D148, D151, Requirement 44 | Probe scenario-options against drilldown fixtures and run manual flow review: do users understand the tradeoff profiles, preserve trust in prior answers, and return to the primary interview when generated options disappoint? | -| A91 | Graph-review critique can make scenario-generated candidate bundles safe enough for product use without requiring perfect one-shot generation, if candidate readiness distinguishes `reviewed_clean`, `reviewed_with_issues`, and `blocked`, and if accepted-with-issues immediately opens durable follow-on review work. | medium | open | D151, D152, Requirement 44 | Run candidate bundle probes with graph-review scoring and human review; verify accepted-with-issues flows create a graph-review frontier or appropriate reconciliation needs rather than hiding defects. | -| A92 | A conservative global staleness rule for open proposal turns — stale when `specification.latest_changeset_id` differs from `turn.opened_at_changeset_id` — is acceptable before neighborhood-level staleness calculation exists. | medium | open | D149, I116 | Exercise multi-chat proposal flows where another chat applies a changeset while a proposal remains open; check whether regeneration prompts feel safe rather than noisy. | -| A93 | Relation-policy directionality lookup is safer than trying to force all useful intent-edge verbs into one dependency direction, because graph edges must serve display, context packs, export trace, reconciliation, critique, verification, candidate generation, and explanation. | medium | open | D137, D150 | In FE-700, define canonical/inverse sentences and source/target change behavior for each relation; test direct-edit and hard-impact cascade against mixed-direction relations. | - -## Decisions - - - -22. 
**Observer-result sync is turn-owned and background by default** — eligible answered turns enter one turn-owned observer capture backlog after durable turn finalization, and chat stream completion must not wait on extraction. Capture may still surface results through the existing turn-owned `data-observer-result` artifact once available, but the runtime path should be normalized around the `/api/specifications/:id/turns/:turnId/observer-capture` seam rather than split between inline stream-blocking capture and deferred capture. This preserves one durable workflow model: durable truth remains the answered turn plus any persisted observer result part, not a separate workflow store or ledger. - -50. **Knowledge relationships live behind one typed graph seam** — persisted graph edges are first-class and drive dependency, derivation, and revisit behavior. -65. **Phase outcomes are explicit durable records** — workflow status, closeability, readiness, and closure provenance project from durable phase outcomes on the active path. -66. **Interviewer-recommended and user-forced closes share one transcript-friendly seam** — one phase-close transport handles both paths, with explicit closure basis. -80. **Intent-graph revisit replaces hard turn-tree branching for V1** — revisit starts from edit mode on intent items, traces cascade through intent edges, and resolves through a secondary thread. **Updated 2026-05-07 (D135):** the older modal secondary-thread and side-chat V2/V3 persistence shapes are superseded by the multi-chat + reconciliation-need direction; the user-facing revisit/cascade goal remains live. **Chat-level branching note:** the no-turn-tree-branching invariant remains in force at the *turn* level, but multiple chats per spec are explicitly allowed at the *chat* level once the multi-chat substrate lands. 
Branching at the chat level is not user-surfaced as a generic `branch this thread` affordance by default; it manifests through graph-anchored refinement / reconciliation surfaces. -86. **The client is organized by phase-addressable routing and three concentric layout shells** — AppLayout, SpecificationWorkspaceLayout, and ViewLayout own the user-facing route structure. Interview phases remain router-addressable for deep links, gating, and sibling route composition even if the center pane later renders them inside one continuous workspace surface. -87. **Layout-level data ownership partitions invalidation** — the specification bundle and entity collections subscribe through separately owned query domains / route surfaces instead of one monolithic refresh boundary, so entity refreshes do not remount or tear down the transcript-owning surface. -89. **Primary grounding/design input is workspace-owned and card-owned** — substantive elicitation in grounding and design proceeds through durable turn cards inside the workspace stream, while structural phase-entry, recovery, and handoff affordances project as control cards in that same stream; the global bottom composer is not the canonical input seam. Preface cards accept optional comment + continue, while question cards collect substantive answers. Depends on: A51. Supersedes: —. -93. **Replay for elicitation phases is turn-centered, not message-shaped** — completed interview turns collapse into answered-turn records that summarize the offer, the structured user response, and the capture status, while phase markers, projected control cards, and activity cards render as stream elements around those turns rather than as ordinary chat bubbles. Depends on: A51, A53, D110. Supersedes: —. -94. **Phase progression is frontier-anchored** — every open phase bottoms out in exactly one visible next action: a projected kickoff card, actionable frontier turn, visible generation state, or projected recovery card. 
Accepting a frontier turn durably creates its successor turn, successor generation avoids closed-without-frontier gaps, and recovery is a structural fallback that appears whenever an open phase lacks a valid frontier rather than another generative turn that must itself be created. Closure proposals remain durable proposal-shaped turns on the active path; accepting one confirms phase closure and opens the next phase into its projected entry state, while rejecting one keeps the phase open and requires a same-phase successor frontier. If a phase is closed, the stream bottoms out in a handoff or completion control. Depends on: A51, A54. Supersedes: —. -95. **Structural control affordances project from workflow state rather than masquerading as ordinary turns** — kickoff, recovery, and end-of-phase affordances derive from workflow state, phase outcomes, and neighboring turn anchors instead of from incidental copy or mandatory durable turn rows. Any durable implementation seam used to help project them must be treated as transitional and must not redefine their product meaning as authored conversational turns. Depends on: D65, D94, D110. Supersedes: `why`-based kickoff/recovery sentinels and the earlier persisted-turn-kind framing. -96. **Observer capture trails interviewer progression through one turn-owned backlog** — interviewer completion may unlock the next turn before observer capture finishes, and that should be the default lifecycle for all eligible phases rather than a grounding/design exception. Any trailing observer state remains attached to the just-answered turn card instead of surfacing as a free-floating transcript row; observer-result transport carries the originating turn identity so late capture can hydrate back into that same card. Depends on: A20, A53, A55. Supersedes: inline stream-blocking observer capture as a normal completion path. - -110. 
**The workspace stream is a merged read model, not identical to the turn tree** — active-path durable conversational turns remain the only branch-bearing lineage spine; durable non-turn workflow facts such as phase outcomes anchor themselves to turn ids for provenance, ordering, and invalidation; projected control cards, phase markers, and activity cards derive from workflow state plus nearby anchors instead of requiring their own turn rows. Depends on: D65, D89, D93, D94, D96. Supersedes: the implicit equivalence between rendered cards and persisted turns. - -111. **The app is seed-first and migration-light until the data model settles** — prioritize one truthful read-model contract plus up-to-date seeded scenarios over compatibility for legacy local rows. Durable authority comes from active-path substantive turns, `phaseOutcome`, workflow state, and the current canonical record/phase identifiers; projected kickoff / recovery / handoff affordances must be derived from those facts rather than preserved as canonical control-turn rows. Transitional seams may survive briefly as internal submit plumbing, but new server reads, client renders, fixtures, and happy-path tests must not depend on legacy aliases or adaptation layers as product truth. When a naming or persistence cutover lands — including `project` → `specification` and `scope` → `grounding` — destructive reseed is preferred over spending time on migration logic for unstable local data. Depends on: D95, D110. Supersedes: the implicit bias toward preserving legacy control-row compatibility during the cutover. - -112. **Turn-artifact persistence is server-owned and interviewer-shaped** — durable review-set, preface-card, activity-summary, and phase-summary artifacts materialize from interviewer output through one server helper, so the chat-runtime finalize path acts as orchestration glue instead of reconstructing artifact semantics ad hoc. 
Replay, accepted-review materialization, and seeded walkthroughs therefore consume the same persisted artifact contract the interviewer produced. Depends on: D93, D96, D110. Supersedes: the ownership split where runtime finalization re-derived grounding/review artifacts outside one authoritative persistence seam. - -113. **Phase lifecycle side effects are specification-scoped, not route-scoped** — durable workflow truth, landing reconciliation, and routed read-model projection remain authoritative; they do **not** move into a second client-side workflow store. The router continues to own navigation, loader/query subscription, and rendering of the derived read model. A separate specification-scoped lifecycle seam owns only the ephemeral process concerns that routes are poor at holding correctly: one-shot automatic phase entry / continue, in-flight operation identity, duplicate-submit suppression, cancellation, stale-event rejection, and capture-backlog reseeding after hydration. That seam may be implemented as a lightweight runtime supervisor, router-integrated service, or chart-backed helper, but its implementation is intentionally left open; what is decided here is the ownership boundary, not a mandatory framework. Constraints: (1) no second durable workflow model or general runtime-operations ledger by default, (2) no independent client authority over phase status, landing truth, or handoff/completion semantics, (3) no route-local `useEffect` or remount-tied behavior as the trusted owner of lifecycle effects like auto-present, and (4) any lifecycle helper must consume durable truth and emit idempotent, ignorable side effects rather than redefine product state. Depends on: D87, D94, D95, D96, D110, D112. Supersedes: route-local auto-present / continue effects as a trusted lifecycle seam. - -116. 
**Each phase section opens with a projected phase section header** — a non-turn, non-durable stream artifact that states the phase purpose and what kinds of knowledge are captured there. The header is projected from workflow state and phase metadata (similar to phase markers) and re-projects on hydration. Content is phase-specific: grounding explains goals/terms/context/constraints, elicitation explains design decisions, requirements explains review, criteria explains verification. Depends on: A60, D110. Supersedes: —. - -121. **Client data ownership migrates from coarse loader invalidation to query-owned domains** — the near-term authoritative boundary is one specification bundle seam for workflow state, landing state, and turns, plus a separately invalidable entities domain scoped to the specification. Mutations and SSE events invalidate only the owned query key. `ask_question` tool execution persists the frontier question/options, advances the active head, and returns the acknowledged turn id; the client may then patch the bundle query cache from that tool output while the following bundle fetch remains authoritative reconciliation. The router loader becomes a thin shell that primes or guards those domains instead of owning the read model, and finer core/turn split work waits for a real server ownership boundary rather than a fake cache-key split over one payload. Depends on: A64, D87. Supersedes: monolithic `router.invalidate()` after every mutation. - -123. **Runtime proving uses a lightweight lifecycle seam with observer backlog, not a second workflow store** — every eligible answered turn should be able to unlock successor interactivity as soon as interviewer generation is durably ready, while observer capture for the answered turn runs afterward through a turn-owned `/api/specifications/:id/turns/:turnId/observer-capture` seam. 
The client lifecycle may keep only ephemeral capture state (`waiting`, `applying`, retry/backlog identity) and reseed unfinished capture from durable turns after hydration/reload; durable authority remains the persisted turn plus its observer result part. Current constraint: server-side dedupe is process-local, so restart recovery depends on reseeding from turns that still need observer capture rather than on a durable runtime-operations ledger. Depends on: D22, D96, D113. Supersedes: the mixed inline/deferred observer finish boundary. - -114. **Continuous workspace rendering and phase addressability are separate concerns** — the interview center pane may render one cumulative workspace stream whose realized grounding, design, requirements, and criteria sections remain visible as the workflow advances, while the router continues to preserve deep links, gating, and sibling-route composition. A workspace-level controller may own one chat session, cross-section projection, focus / scroll behavior, and close-to-next-phase motion without turning focus state into a second durable workflow model. Phase routes act as focus addresses into that shared surface rather than distinct transcript owners: navigating to a realized phase focuses and scrolls to its section, while direct navigation to an unrealized future phase redirects to the current reachable phase instead of rendering placeholder content. 
Constraints: (1) one chat runtime per specification, not one per rendered phase, (2) only realized sections render in the cumulative center pane, so future phases do not project empty shells before they become reachable, (3) exactly one actionable frontier remains at the bottom of the current reachable section while prior sections are replay-only record, (4) focused section state must not redefine durable workflow truth or landing truth, (5) graph view is a peer route to the cumulative workspace surface — phase-independent, accessed via direct navigation, but rendered inside the outer specification shell so phase-sidebar continuity and top-bar consistency remain, and (6) output remains a separate route because it is not part of the interview timeline. Depends on: A58, D86, D87, D110, D113. Supersedes: the assumption that each phase route must own a distinct rendered transcript surface. - -124. **Interview framing is two-axis, not novelty-only** — the interviewer should orient itself with both workspace novelty (`greenfield` / `brownfield`) and delivery posture (`end-to-end build` / `incremental feature`). Partial-scope work is therefore a first-class interview shape rather than just a greenfield/brownfield footnote. Depends on: A65. Supersedes: the implicit single-axis framing around grounding strategy plus partial-scope support. - -125. **Observer capture is a prompt-budgeted graph-delta seam** — `runObserver()` remains the public capture boundary, but its internal output includes a generic graph delta: per-kind item collections plus a compact top-level relationship-candidate set that can reference existing entities by `knowledge_item.id` and same-turn provisional items by `{ kind, index }`. Existing-entity identity is the database id; any kind metadata in prompts is only display/validation context, never part of the lookup key. 
The server owns provisional-reference resolution after persistence, validates candidate edges through one typed relation-policy registry, and writes only supported edges, preferring abstention over speculative edge inflation. Accepted review sets reuse the same relation policy when materializing requirements or criteria so review-authoritative entities can add edges without a second durable graph model. Observer prompts avoid full entity tables and existing graph topology by default; future enrichment should use compact anchor inventories, phase/relevance filtering, and corpus/manual graph review before adding more context. Depends on: A66, D50, D112, D123. Supersedes: the decision/assumption-only relationship extraction instruction in the current observer seam. - -126. **Recognition-first assists synthesize proposals through turn-owned candidate direction sets** — grounding, design, and future architect / wizard-style modes may project user actions like `fill in the rest for me`, compare broad directions, or propose typologies. These invoke interviewer-authored candidate direction set artifacts on ordinary durable turns rather than extending force-close semantics. Each set presents concrete options with implications, tradeoffs, likely generated knowledge, and what each direction rules out. The user responds through a structured reaction seam (`accept-direction`, `refine`, `reject`, or `regenerate`). Accepting a direction may materialize goals, constraints, assumptions, decisions, requirements, invariants, and examples as accepted or proposed-from-selection; rejecting a direction may still create intent evidence through negative / not-relevant examples, `non_goal` constraints, or `rules_out` relations. Criteria are generated later unless the selected bundle includes concrete witness cases. This reuses the existing turn-artifact / workflow model instead of adding a second durable workflow machine. Depends on: A67, A77, A78, D66, D94, D112, D134. 
Supersedes: skip-only close as the sole user-legible fallback for low-patience or low-information phases, and supersedes treating candidate-spec assist as only a phase-shortening tool. - -127. **Interview detail should flow through a turn-owned breadth skeleton and detail-focus reaction** — grounding and design may pair an ordinary question turn with an interviewer-authored breadth-skeleton artifact that makes the current broad-pass map visible, marks which areas are already sufficient for now, and offers explicit `next level of detail` affordances. The user reacts by choosing whether to deepen one named area now, continue the broad pass, or leave an area sufficient-for-now; that reaction steers the next same-phase frontier turn without creating a second durable workflow or topic-tree store. Recursive follow-up remains ordinary focused turns on the same active path, and any future chat or graph affordance should emit the same detail-focus intent against shared specification truth rather than inventing a parallel deepening model. Depends on: A67, A68, D94, D112, D113. Supersedes: the implicit assumption that every frontier turn advances by the same depth granularity. - -128. **Graph view becomes an actionable workspace mode through a projection-first, intent-emitting seam** — graph mode should project shared entity truth into a relationship-aware scene with visible edge topology and own only ephemeral graph-local interaction state such as viewport, selection, focus, and path highlighting. It must not create a second durable workflow or edit-state model. Node-level actions emit intents into the existing workspace lifecycle so refinement side-chats, revisit, and future edit flows still materialize through turn-owned artifacts, projected control cards, and the same durable specification truth used by chat view. 
The common-case interaction should optimize for `select node -> inspect -> launch focused refinement`, with broader multi-select or edit overlays layered on later. The first ship layout is a **structured list**: kind-grouped item rows with a relations footer (Outgoing / Incoming subsections of relation chips), `referenceCode`-based hash anchors for cross-item navigation, hover-card previews on chips, soft-truncation at 6 chips per direction, and a per-row action rail reserving one disabled `chat-with` placeholder for future intent emission. A **spatial canvas** layout follows as a layout switch inside graph mode without changing the projection seam or action contract. The empty state is a minimal orientation card linking to the current reachable phase rather than empty kind sections; an explicit `Back to chat` affordance returns to the user's last phase route. Depends on: A69, A70, D50, D80, D113, D114. Supersedes: graph view as a placeholder grouped list surface, and graph view as a sibling layout mode under `_view`. - -129. **Graph view's structured-list scope decouples data fetching from default render** — graph view always fetches the `whole-spec` entities so chip targets always resolve, but defaults the rendered row set to active-path items so toggling between chat view and graph view does not silently widen the user's working scope. A `Show all` toggle expands the rendered set to the full data already in memory; no second fetch and no scope-dependent loading. **Phased shipping:** the structured-list ship lands the whole-spec fetch portion of this contract first; the active-path render filter and `Show all` toggle ship in a follow-up frontier item (see `memory/PLAN.md` Horizon: *Graph view active-path filter + scope toggle*) once per-item active-path membership is exposed in the entities API or derived through a stable client-side seam. Until then the structured list renders all `whole-spec` items by default, so the toggle would be a no-op and is not surfaced. 
Depends on: A70, D87, D121, D128. Supersedes: render scope and fetch scope coupled through the existing `mode=active-path | project-wide` query parameter. - -130. **First-run setup becomes a product surface, not README-only configuration** — the dashboard should expose provider credential status before specification creation and route users toward setup when no supported key is available. CLI logs and README env instructions can remain, but they are no longer the only supported onboarding path. Depends on: A74, A75. Supersedes: `ANTHROPIC_API_KEY` in project `.env` as the sole user-facing setup contract. -131. **Provider access moves behind one AI runtime provider seam** — interviewer and observer model construction should consume a shared provider/model resolver instead of importing Anthropic directly and reading model environment variables in each caller. The seam may keep Anthropic as the current implementation while testing OpenRouter as the preferred onboarding provider, but provider choice must be explicit and testable. Depends on: A74. Supersedes: direct Anthropic imports in interviewer/observer code as product truth. -132. **UI-entered credentials are user-scoped auth state, not workspace state** — if the app collects an API key, it writes to an XDG-compliant user auth/config location, never to `.brunch/` or the project `.env` by default. Existing environment-variable configuration remains supported as an override path for scripted use. Depends on: A75. Supersedes: project-local `.env` as the only persistent setup mechanism. -133. **`.brunch/` gitignore support is confirm-gated deterministic workspace mutation** — Brunch may inspect the workspace repository and offer to add `.brunch/` to `.gitignore`, but it must not mutate repository files without explicit confirmation. The mutation should be idempotent, preserve existing file content, and create `.gitignore` only when the user accepts. Depends on: A76. 
Supersedes: relying solely on user memory / docs to ignore the generated workspace directory. - -134. **Brunch specs evolve toward recognition-first intent graphs with progressive checkability** — the product direction is to preserve meaning as typed intent items, semantic edges, examples / counterexamples, verification witnesses, unresolved ambiguity, and user validation status rather than treating the spec as a planning document or prose inventory. Requirements and criteria remain distinct product items for now: a requirement is a commitment and a criterion is an oracle / witness. `invariant` and `example` should become first-class product ontology kinds, with positive, negative / counterexample, edge-case, and not-relevant examples represented as subtypes rather than separate top-level kinds. A shared `Property`-like intent primitive remains a design candidate rather than a committed storage or UI surface. Behavioral kernels are hidden interviewer / architect / wizard machinery for surfacing latent state, containment, authority, concurrency, migration, and evidence questions while emitting the weakest useful checkable artifact for the intent item. Depends on: A77, A78, A80, D50, D125, Requirement 38. Supersedes: the implicit framing that requirements / criteria review is the terminal semantic model of product intent. - -135. **Semantic mutation history should split from conversational turn history when graph editing becomes first-class** — turns remain conversational provenance and replay; the intent graph remains current semantic truth; a future changeset ledger records semantic mutation history; and reconciliation needs record semantic debt caused by changes that may stale existing graph truth. 
The first implementation should follow the multi-chat substrate in D138: chat containers plus durable reconciliation needs before a full changeset ledger, keeping turn-linked provenance and legacy spec-scoped pointers as compatibility while making room for changeset-backed provenance later. User-direct-edit mode should be allowed to land a committed group of intent-item changes immediately, synchronously create reconciliation needs from existing dependency and historical relations, then queue an asynchronous observer pass that may immediately add newly implied intent edges and additional reconciliation needs as a later interpretive-structure changeset. That observer pass may not silently rewrite, retire, or weaken existing accepted intent; content changes that require judgment go through reconciliation review. This explicitly reshapes the older revisit-session draft: revisit / cascade remains a product capability, but `revisit_session` is no longer the preferred persistence foundation once multiple chats, direct graph edits, and reconciliation review sets are in scope. Depends on: A71, A79, D80, D110, D112, D125, D128, D134, D138. Supersedes: turn ancestry as the only plausible semantic history spine, and the `docs/archive/design/REVISIT_MODULE.md` table shape as canonical persistence design. - -136. **Observer ontology should classify intent items by modality, not answer shape** — observer capture should distinguish value / outcome items (`goal`), descriptive items (`context`), boundary items (`constraint`), uncertainty items (`assumption`), choice items (`decision`), obligation items (`requirement`), preservation items (`invariant`), oracle items (`criterion`), and concrete witness items (`example`). `Decision` should narrow to chosen directions among plausible alternatives with durable consequences; `constraint` should remain top-level but gain subtypes such as `non_goal`, `scope`, `technical`, `policy`, `resource`, `compatibility`, and `environmental`. 
Generic `context` should be promoted when the content carries stronger semantics: success condition -> requirement or invariant, solution boundary -> constraint, uncertain material belief -> assumption, chosen alternative -> decision, mere interpretation aid -> context. Depends on: D134, Requirement 38. Supersedes: treating all user commitments or selected options as decisions by default. - -137. **Intent edges are semantic relations, while reconciliation needs are process debt** — intent-item kinds say what semantic units exist; intent-edge kinds say how items justify, constrain, depend on, refine, illustrate, and verify one another. A negative example is intent content; a boundary relation such as `rules_out`, `excludes`, or `counterexample_for` is intent semantics; a `reconciliation_need` is directed process obligation saying existing semantic truth may require renewed judgment because a change, contradiction, verifier result, or historical premise may affect it. The observer and future graph tools should provide edge-local neighborhoods around active intent items, but not every inferred edge should drive cascade, staleness, export explanation, criteria generation, or reconciliation. Relation policy should classify edge support (`explicit`, strong inference, weak candidate) and operational participation before relation-first capture broadens beyond today's limited edge set. Observer-created interpretive structure may land immediately when it adds supported edges, examples, or reconciliation needs; rewriting accepted intent remains reconciliation-review work. Depends on: A66, A81, D50, D125, D128, D134, D135, D138. Supersedes: treating graph edges as only display infrastructure, and also supersedes treating every visible edge as equally authoritative process truth or work queue state. - -138. 
**Multi-chat substrate is the first concrete persistence slice before the full changeset ledger** — add `chat`, nullable `turn.chat_id`, `specification.primary_chat_id`, mirrored `chat.active_turn_id`, and a minimal `reconciliation_need` table while keeping legacy `turn.specification_id` and `specification.active_turn_id` during transition. Do not add `active_chat_id` in phase one; `primary_chat_id -> chat.active_turn_id` covers the interview head until multiple active chat surfaces need their own pointer. New writes populate both legacy and chat pointers; application assertions preserve same-spec and same-chat ancestry; later cleanup can make chat ownership canonical and remove the legacy pointers. `reconciliation_need` uses directed item-to-item source / target fields, narrow `kind` and `status`, free-text reason, immediate `caused_by_turn_id`, and nullable `caused_by_changeset_id` as a future changeset-ledger placeholder. This supersedes older side-chat substrate assumptions and makes `docs/design/MULTI_CHAT.md` the concrete phase-one design while `docs/design/PATCH_LEDGER.md` remains historical deeper semantic mutation history. Depends on: A71, A82, A83, D135, D137, Requirement 39. Supersedes: implementing multi-chat by preserving an in-memory-only side-chat patch list as the durable substrate, and supersedes naming the process-debt table `reconciliation_edge`. -139. **Prompt/context scenario substrate is a first-class foundation** — Brunch should externalize server-side prompts and reusable agent doctrines into inspectable markdown assets, load and compose them through a typed server seam, and introduce context-pack builders that render the current intent graph for a specific generative scenario rather than letting each call site hand-roll prompt context. The same substrate should support lightweight prompt probes over seeded graphs and transcripts before UI surfaces are built. 
A prompt scenario composes prompt + context pack + model settings + capability inventory + captured output/review for evaluation; it must not become the owner of prompt doctrine, context semantics, credential UX, or the shared production AI runtime. Depends on: A84, A85, D134, D136, D137, Requirement 40, Requirement 41. Supersedes: scattered TypeScript prompt strings and transcript-dump context as the default mechanism for new agent features. -140. **Intent graph context packs are scenario-specific semantic briefings** — a context pack is an explicit rendering of graph truth, workflow state, relevant provenance, unresolved ambiguity, relation neighborhoods, and authority labels for one agent task. Packs should exist for observer capture, next-question generation, candidate-spec synthesis, criteria/witness generation, web research query framing, reconciliation review, architect proposals, and downstream decomposition/oracle probes. They should be bounded, ranked, and typed rather than raw graph dumps. Depends on: A84, D125, D134, D137, D138, Requirement 40. Supersedes: assuming the active chat transcript is the canonical prompt context after multi-chat. -141. **Post-spec decomposition remains a probe frontier, not a committed Brunch UI** — the next-after-spec direction is to derive design alternatives, oracle strategy, execution slices, and verification-aware orchestration constraints from the intent graph and its checkability implications. This should first run through the prompt/context scenario substrate, borrowing cognitive patterns from `ln-design` and `ln-oracles`, before deciding whether it belongs inside Brunch or a successor product. Depends on: A87, D134, D139, D140, Requirement 41. Supersedes: treating export prose as the only meaningful handoff target. -142. 
**Pi is a candidate harness adapter, not current product runtime truth** — Pi may be evaluated via SDK or RPC as the first lower-level agent harness for prompt probes, web/tool experiments, and future decomposition scenarios because it already provides sessions, custom tools, provider support, event streams, and embedding modes. Brunch should not assume Pi owns product workflow, durable replay, intent-graph mutation authority, reconciliation review, or credential UX unless a later spike proves and explicitly adopts those boundaries. Depends on: A86, D139, Requirement 41. Supersedes: deciding the web-research tool spike only at the individual tool API level. -143. **Brunch owns the agent mutation surface; harnesses adapt it as tools** — Any mutation of durable Brunch data initiated by an agent must route through Brunch-owned mutation handlers, not direct ORM access or harness-specific tool implementations. Those handlers define the product operation: stable id, input/output schemas, description, authority class, replay policy, and reconciliation/changeset-ledger behavior. AI SDK, Pi, CLI/TUI, or future adapters may expose the handlers as tools, but adapters only translate transport and tool shape; they do not define mutation authority. Read-only capabilities can use the same contract registry for consistency, but the binding rule is that agent-originated writes enter through one server-owned surface. Depends on: Requirement 42, D138, D139, D142. Supersedes: defining separate mutating tool surfaces inside each agent harness or letting agent flows bypass application handlers to call the ORM. -144. **Intent graph vocabulary supersedes knowledge graph vocabulary** — Canonical product vocabulary is `intent graph`, made of `intent items` and `intent edges`. 
Current schema/code may still use `knowledge_item` and `knowledge_edge` as implementation names during transition, but new planning, agent capability contracts, context packs, operation ids, and user-facing design should prefer intent vocabulary unless referring to current persistence/API names. `Claim` may remain an explanatory generic for natural-language content, but it is not a product/schema noun. Depends on: D134, D136, D137. Supersedes: using `knowledge graph`, `knowledge item`, `knowledge edge`, or `claim` as future-facing product nouns. -145. **Changeset/change supersedes patch/patch_change** — Semantic mutation history uses `changeset` for one submitted semantic mutation bundle and `change` for one atomic mutation inside it. `Patch` and `patch_change` remain historical design-doc vocabulary and may appear in older file names, but new schema, capability contracts, operation ids, and planning language should use `changeset` / `change` unless this decision is explicitly reversed. Depends on: D135, D138, D143. Supersedes: treating naming as open between patch and changeset. -146. **Hard-impact edit cascade reads from the `reconciliation_need` queue, not from REVISIT walk state** — when a hard-impact `propose_edit` patch applies, the server enumerates `knowledge_edge` rows incident on the changed item under typed relation policy and opens one `reconciliation_need` row per affected pair (Path 1 from `docs/design/MULTI_CHAT.md` §5.1). The patch list overlay is the canonical resolution surface: open needs render as a `Pending review` section alongside staged patches, with per-row accept-on-target / edit-target / dismiss actions. The V2 `deferred: true` apply response and the "Hard impact — coming in V3 cascade preview" banner are removed at V3.0 ship. V3.0 groups needs mechanically by `kind` and relation type; agent-grouped resolution (auto-confirm / auto-edit / substantive) is V3.1 work and does not block V3.0. 
Side-chat thread persistence is not a V3.0 prerequisite — threads stay in-memory until MULTI_CHAT.md Phase 2. Depends on: A71, A83, A88, D80, D135, D137, D138. Supersedes: hard-edit deferral with a placeholder banner, the modal secondary-thread walk in `docs/archive/design/REVISIT_MODULE.md`, and the SIDE_CHAT.md V3 prose that pre-dated the multi-chat substrate. -147. **The local agent CLI is a long-lived JSONL adapter over Brunch capability contracts** — CLI-addressability should first ship as a `brunch agent`-style local process that speaks request/response JSONL over stdin/stdout, dispatches Brunch-owned capability contracts, and keeps all product resources explicit in input payloads. The adapter may hold ambient runtime plumbing such as a DB connection, provider config, and in-flight interviewer / observer generation bookkeeping, but it must not hold hidden selected spec/chat/turn handles as command semantics. Read capabilities use `list` / `get` for structured read-model data and `read` for agent-facing projections with allowed response shapes and next-command hints. Mutations stay capability-first and surface-lazy: add only contracts needed by real probe/tool use, with an initial surface around `spec.create`, `spec.getStatus`, `spec.requestPhaseClosure(specId, phaseId?)`, `spec.requestExport`, `chat.getPrimary`, `chat.ensureReady(chatId?, timeoutMs?)`, `chat.read`, `turn.get`, and `turn.submitResponse(chatId, turnId?, response)`. `chat.ensureReady` is the idempotent synchronization/recovery command: it may trigger continuation when a chat lacks a usable next turn, can block up to a bounded timeout, and returns a derived state such as `generating`, `awaiting_response`, `idle_no_frontier`, `closed`, or `error` without requiring a durable runtime-operations ledger in the first cut. 
The LLM-as-user probe runner is a separate client of this JSONL adapter, not part of the capability server and not allowed to import DB/product handlers directly; its scenario briefs, model policy, generated transcripts, and curated golden fixture bundles are probe artifacts, not Brunch authority. Suggested module boundary: `src/server/capabilities.ts` plus `src/server/capabilities/` own contracts, schemas, handlers, and dispatch; `src/server/agent-jsonl.ts` plus `src/server/agent-jsonl/` own only protocol/session/transport; `scripts/agent-probes/` owns the outer LLM-as-user loop and artifact writing as development harness code through a JSONL client. Depends on: Requirement 43, A89, D139, D140, D143. Supersedes: treating the CLI as hand-written route wrappers, direct ORM scripts, a one-shot TUI with hidden ambient selection, or a probe runner that bypasses the same mutation surface future agents must use. -148. **Spec evolution strategies are chat-local, turn-mediated process state** — strategy is not specification-level semantic truth. A chat may be established through a first assistant/system frontier turn that offers or declares a strategy such as `step_by_step`, `scenario_options`, `targeted_cases`, `graph_review`, or `reconciliation`; globally triggered flows may create/reuse a pre-strategized chat whose first turn is the procedure kickoff. A chat can technically change strategy through later turns, but explicit switch UX is deferred. Tactical sub-strategies are allowed inside a chat, but broad mid-interview acceleration should branch into a side-chat/strategy chat rather than mutate the primary interview chat in place. Depends on: Requirement 39, Requirement 44, D138. Supersedes: treating the interviewer as one global mode per specification. -149. 
**Changesets are the atomic semantic mutation boundary, while proposal turns are not mutations until accepted** — a graph-review finding, candidate bundle, or reconciliation suggestion is the assistant/system half of an open frontier turn until the user responds. Only `accept` applies a proposal turn's semantic changeset; `revise`, `ask_followup`, `regenerate`, `defer`, and ordinary `reject` produce successor/process state rather than direct graph mutation. A changeset is the smallest semantic mutation unit that preserves coherence, and may record direct edits, candidate acceptance, reconciliation resolutions, opened reconciliation needs, or future verifier/import results. Turns should stamp the latest applied changeset id at creation (`opened_at_changeset_id` / `base_changeset_id`) so open proposals can be conservatively marked stale when the specification advances. Depends on: A71, A79, A92, D135, D145, Requirement 44. Supersedes: treating agent proposals or review findings as durable semantic truth before user/action acceptance. -150. **Relation policy owns operational directionality for intent edges** — relation names should be semantically clear, but code must not infer cascade or reconciliation behavior from raw edge source/target direction. Each relation kind declares a canonical sentence, inverse display sentence, operational-axis participation, and source-change / target-change behavior. Direct edit and hard-impact cascade enumerate incident accepted edges, then ask relation policy which endpoint, if any, receives a `reconciliation_need`. FE-700 may break current `depends_on` / `derived_from` / `constrains` / `verifies` records while expanding the ontology, but should not force every useful edge verb into one dependency direction at the expense of display, prompt context, export trace, critique, verification, candidate generation, or explanation. Depends on: A81, A88, A93, D137, D146. 
Supersedes: assuming outgoing edges from the changed item are the cascade direction. -151. **Scenario-options acceleration is product-facing, but graph review is its safety oracle** — the first user-visible alternative to long drilldown should likely be a first-turn strategy choice or mid-interview `speed this up` side-chat that generates 2–3 candidate bundles completing the current direction from context-packed accepted graph truth. Candidate bundles present named tradeoff profiles and are accepted as coherent units, not item-by-item pick lists. Fast gates (parse/schema/fixed-premise/no-obvious-conflict/tradeoff summary) can run before display; deeper graph review, coverage, checkability, provenance, and repair/refinement can run asynchronously. Depends on: A67, A84, A85, A90, A91, D126, D139, D140, D148, Requirement 31, Requirement 44. Supersedes: treating candidate-spec assist as a skip/force-close helper or as one-shot generation that can be committed without critique. -152. **Graph review and reconciliation are separate graph operations** — reconciliation is repair-oriented process debt from a known disturbance (`reconciliation_need`), while graph review is quality-oriented critique over any graph for weakness, genericity, low support, missing coverage, weak checkability, poor provenance, or maturity gaps. Broader review findings start as turn-owned structured artifacts; `reconciliation_need` remains the only first-class problem table until review issues require independent querying, filtering, badges, assignment, or lifecycle. Candidates may be accepted with represented issues if accepting also opens a graph-review frontier or appropriate process-debt records. Depends on: A91, D137, D149, D151, Requirement 44. Supersedes: overloading reconciliation as the umbrella for all graph intelligence or blocking useful imperfect specs until every review issue is repaired. 
- -## Interaction Stream Model - -The center column is a **merged stream projection** over multiple artifact families. The turn tree remains the authority for conversational lineage and branching, but the rendered stream is intentionally richer than the tree itself. - -| Artifact family | Durable | Branch-bearing | Current examples | Ordering / invalidation rule | -| ------------------------- | ------- | -------------- | -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- | -| Conversational turn cards | yes | yes | grounding question, design question, review proposal, closure proposal, answered-turn replay | Ordered by the active-path turn chain; branch membership comes from `parent_turn_id`. | -| Anchored workflow facts | yes | no | phase outcome | Stored outside the turn table but anchored to turn ids for provenance; if an anchor falls off the active path, the fact is superseded or hidden. | -| Projected control cards | no | no | kickoff, recovery, proceed / go-to-frontier affordances | Derived from workflow state plus nearby anchors; they re-project on hydration and may disappear / reappear without needing their own durable row. | -| Activity cards | mixed | no | visible generation state, persisted activity summary, trailing observer state | Derived from runtime state or replay summaries adjacent to a turn or control boundary; they do not become branch nodes. | -| Phase markers | no | no | phase start, phase closed | Projected from workflow position and anchored workflow facts such as phase outcomes; they annotate the stream without entering the turn tree. | -| Phase section headers | no | no | grounding purpose + knowledge kinds | Projected from workflow state and phase metadata at the top of each phase section; re-project on hydration. 
| - -This model is deliberately asymmetric: only conversational turns participate in the linked-list lineage model, while the other artifact families either anchor to that lineage or project from it. A rendered card therefore does not imply a persisted turn row, and a persisted durable record does not need to masquerade as a turn to belong in the stream. - -The ordering rule is: active-path turns provide the spine, anchored workflow facts attach to points on that spine, and projected control / activity / phase-marker elements are injected relative to workflow state and those anchors. The invalidation rule is: if a durable non-turn record is anchored to a turn that leaves the active path, the record must be superseded or hidden rather than left floating as if it still belonged to the trusted branch. - -## Layout Architecture - -### Top Bar - -| Element | Content | Position | -| ------------------ | ---------------------------- | --------------------- | -| Logo | Placeholder (TBD) | left | -| App name + version | "Brunch v{version}" | left, after logo | -| Separator | Pipe character | left, after version | -| Tagline | "AI-guided spec elicitation" | left, after separator | -| Working directory | `cwd` in mono | right-aligned | - -Height: `h-10` (40px). Version injected at build time from `package.json`. - -### Three-Pane Layout - -Below the top bar, three vertical panes fill the remaining viewport height. Each pane has a sticky-positioned header and a scrollable body using ScrollArea. - -#### Left Pane — Specification Navigation Sidebar - -**Sticky header:** -- "< Back to Workspace" navigation link -- Read-only specification name (set at creation, not editable) - -**Body — Phase stepper / section navigator:** -A vertical timeline with connecting line (blue for completed segments, gray for future). It remains strictly sequential for workflow truth, but it may behave as a section-jump / scroll-spy surface inside one continuous workspace transcript. 
Each phase item shows: - -| Phase | Internal key | Label | -| ----- | -------------- | ------------------- | -| 1 | `grounding` | Grounding | -| 2 | `design` | Elicitation | -| 3 | `requirements` | Requirements | -| 4 | `criteria` | Acceptance Criteria | -| 5 | *(route only)* | Output | - -Per-phase metadata: status (colored: Closed / In-Progress / Unstarted), readiness band (when in-progress), turn count. Closed phases and the current reachable phase are selectable; future phases may remain visible but locked. Output appears conditionally when all phases are closed. - -#### Center Pane — Chat Transcript - -**Sticky header:** -- "Phase N/M – [Phase Name]" for the currently focused section or current reachable phase — positional progress label -- Status text (colored) -- Turn count -- Readiness band (when in-progress) -- Close Phase button (right-aligned, in-progress only, gated by closeability, triggers confirmation) -- Status badge replaces button when phase is closed - -**Body (chat view):** -- One continuous workspace scroll surface that may be segmented into phase sections rather than remounted per phase -- Each phase section opens with a projected phase section header stating the phase purpose and captured knowledge kinds -- Closed phases replay their phase markers and answered / compacted turn cards as prior sections -- The current reachable phase owns the only actionable bottom artifact -- Activity cards and visible generation state stay attached to their section / turn anchors while the next generative turn is being created -- Active bottom artifact: projected kickoff control card, durable frontier turn card (grounding/question/review/closure proposal), or projected recovery card -- Artifact-specific controls - -**Body (closed phase):** -- Answered question cards -- Phase-closure marker plus any activity cards -- "Proceed to [next phase]" or equivalent handoff control card at bottom - -Scroll container: ChatScroll (ScrollArea + useStickToBottom). 
- -#### Right Pane — Knowledge Graph Sidebar - -**Sticky header:** -- "Knowledge Graph" title -- Item count + connection count - -**Body — Grouped intent items:** - -| Group label | Kinds | Visible | -| ----------------------- | -------------------------------------------------------- | ------- | -| Goals | goal, context, constraint (including `non-goal` subtype) | yes | -| Assumptions & Decisions | assumption, decision | yes | -| Requirements | requirement | yes | -| Acceptance Criteria | criterion | yes | -| *(hidden)* | term | no | - -Items render as compact DrawerCard instances: code + content in header, edge/dependency reference codes as drawer-peek summary when edges exist, plain card otherwise. - -### Design Tokens - -**Typography scale** (11px–16px, no sizes outside this range): - -| Token | Size | Usage | -| -------------- | ---- | -------------------------------------- | -| `text-xxs` | 11px | Impact badges, tag labels | -| `text-xs` | 12px | Secondary text, metadata | -| `text-xs-plus` | 13px | Secondary body, explanatory text | -| `text-sm` | 14px | Body text | -| `text-sm-plus` | 15px | Card headings, collapsed question text | -| `text-base` | 16px | Section headings | - -Question card titles use arbitrary `text-[17px]` above the scale for emphasis. - -**Font weights**: normal (400), medium (500), semibold (600). No bold (700+). - -**Color tokens**: - -| Token | Hex | Usage | -| ------ | ------- | ------------------------------- | -| `ink` | #202020 | Primary text | -| `sub` | #5b5b5b | Subtitles, secondary text | -| `hint` | #a6a6a6 | Placeholders, inactive elements | -| `rule` | #e3e3e3 | Borders, dividers | -| `wash` | #f0f0f0 | Ghost fills, tracks | -| `tint` | #fafafa | Subtle background | - -**Accent blue** (interactive elements, recommendations, progress): -- Primary: `#2070e6` -- Gradient top: `#3484fa` -- Ring/border: `#1060d6` - -**Shadow tokens**: `--shadow-card`, `--shadow-ring`, `--shadow-card-ring`. 
- -**Card structure pattern** (DrawerCard): outer `rounded-xl border border-rule bg-tint` shell, inner white header with `-m-px` border overlap trick and `shadow-card`, tinted drawer body below. - -## Critical Invariants - - - -Each row in this table is a **formalization candidate** ascending the progressive-checkability ladder: the `Invariant` column states the property in human-readable form, `Protected by` names the *current oracle* (its present rung on the ladder — typically a regression test today), and `Proves` ties the property back to the requirements or decisions it preserves. Stronger oracles (state-machine model, runtime contract, proof obligation) are deliberate future moves recorded in `docs/design/INTENT_GRAPH_SEMANTICS.md` rather than expanded inline here. - -| # | Invariant | Protected by | Proves | -| ---- | --------- | ------------ | ------ | -| I4 | Vite proxy routing and the runtime backend-port seam stay aligned through one explicit configuration path. | `runtime-config.test.ts` | Requirement 1 | -| I17 | Data Part schema validation remains confined to true LLM / HTTP boundaries rather than mirrored internal seams. | `parts.test.ts` | Requirement 4 | -| I24 | Interview hydration, streaming projection, controller orchestration, mutation transport, phase-scoped rendering, and successor-frontier continuity remain stable through the routed interview surface, including concise durable activity summaries for replay, projected kickoff/recovery/handoff controls, preface-card replay and continue affordances, landing-only grounding-strategy kickoff submission, turn-owned submit/interviewer-processing, visible generation states, anchored phase-boundary projection, and trailing observer attachment. 
| `InterviewView.test.tsx`, `-workspace-stream-projector.test.ts`, `transcript-parity.test.tsx`, `-interview-data.test.ts`, `-interview-controller.test.tsx`, `app.test.ts`, `client-mutation.test.ts`, `task.test.tsx` | D86, D87, D93, D94, D95, D96, D110, D113 | -| I44 | Structured turn responses round-trip through persistence, hydration, projection, and UI affordance state without collapsing back to scalar semantics. | `turn-response.test.ts`, `context.test.ts`, `InterviewView.test.tsx` | Requirement 4 | -| I48 | Canonical knowledge kinds persist with provenance and project through typed entity collections, stable per-kind reference codes, turn-linked capture projection, and graph edges without ontology drift. | `db.test.ts`, `core.test.ts`, `knowledge.test.ts`, `EntitySidebar.test.tsx`, `InterviewView.test.tsx`, `GraphView.test.tsx` | D50, Requirements 22, 23 | -| I54 | Phase-aware capture preserves the committed ontology boundary: grounding / elicitation persist only durable exploration knowledge, accepted review outputs materialize durable requirements / criteria, and both seams survive persistence, turn-linked replay hydration, and UI refresh without breaking sync. | `observer.test.ts`, `context.test.ts`, `app.test.ts`, `InterviewView.test.tsx` | D95, D112, Requirements 22, 23 | -| I72 | Explicit phase outcomes project shared workflow status, closeability, readiness, closure basis, and closed-phase boundary markers through one durable seam. | `phase-close.test.ts`, `db.test.ts`, `app.test.ts` | D65, D66, D110 | -| I87 | Requirements and criteria review ground themselves in their respective inventories, persist interviewer-owned review metadata on the review turn itself, project stable review-set reference codes, submit lightweight full-set review replies by semantic action rather than assumed option order, and carry accepted review outputs into downstream workflow without leaving dead frontier states. 
| `interview.test.ts`, `db.test.ts`, `app.test.ts`, `InterviewView.test.tsx`, `project-state-turn.test.ts` | D94, D112 | -| I100 | `.brunch/` workspace resolution, compiled package-bin startup from the packed install artifact, built-client serving, actual bound URL reporting, same-workspace runtime ownership, chat-sized JSON request parsing, and JSON-shaped payload-too-large failures stay correct in local-first distribution. | `project.test.ts`, `launcher.test.ts`, `cli.test.ts`, `runtime-config.test.ts`, `app.test.ts` | Requirement 1 | -| I101 | Grounding strategy and workspace-backed context gathering persist through schema, API, interviewer configuration, and observer context; preface-card assistant metadata round-trips through persistence/projection, and preface cards stay provisional rather than directly mutating durable knowledge. | `db.test.ts`, `interview.test.ts`, `app.test.ts`, `context.test.ts`, `observer.test.ts`, `parts.test.ts`, `project-state-turn.test.ts`, `ProjectList.test.tsx` | D112, Requirements 3, 20, 21 | -| I102 | File-route generation, directory-based nesting, the three-shell route architecture, and phase addressability remain the runtime routing source of truth; graph view stays code-split. | `router.test.tsx`, `build-boundary.test.ts`, `GraphView.test.tsx` | D86 | -| I103 | Trusted fixture state comes only from TypeScript builders or direct DB setup; walkthrough seeds stay builder-owned, observer probes seed directly without a second fixture format, and seeded scenarios remain resumable/exportable through that one surviving fixture model. | `corpus.test.ts`, `walkthrough.test.ts`, `seed.test.ts` | Requirements 13, 14, 15 | -| I104 | Interviewer-owned turn artifacts materialize through one persistence seam, so runtime review metadata, preface cards, activity summaries, phase summaries, and seeded brownfield replay all round-trip without route-specific reconstruction drift. 
| `turn-artifacts.test.ts`, `app.test.ts`, `walkthrough.test.ts` | D93, D96, D112 | -| I105 | Grounding/design structured-response turns can unlock the next frontier before observer capture finishes, while deferred capture stays keyed to the answered turn, reseeds from durable turns after reload, and avoids stale completion attachment. | `-interview-controller.test.tsx`, `app.test.ts` | D96, D113, D123 | -| I106 | Provider credential discovery, precedence, dashboard status, and model-provider resolution stay explicit without exposing raw secret values through `/api/config`, logs, persisted specification state, or client-visible payloads. | planned: `runtime-config.test.ts`, `app.test.ts`, `ProjectList.test.tsx` | Requirements 34, 35, 36; D130, D131, D132 | -| I107 | `.brunch/` gitignore hygiene is idempotent and confirmation-gated: existing ignore coverage is detected, missing entries are appended only after user confirmation, and absent `.gitignore` files are created only through that same accepted action. | planned: `project-gitignore.test.ts`, `app.test.ts`, `ProjectList.test.tsx` | Requirement 37; D133 | -| I108 | Observer capture no longer blocks chat stream completion for any eligible answered turn; capture backlog state is re-derived from durable turns, drains through the turn-owned observer-capture endpoint, and persists results back onto the originating turn. | planned: `app.test.ts`, `-interview-controller.test.tsx` | D22, D96, D123 | -| I109 | Observer prompts remain compact as relation extraction widens: existing knowledge is passed as id/kind/content-preview anchors with bounded length, graph-delta candidates resolve only through validated `knowledge_item.id` or same-turn provisional references, and accepted review grounding refs reuse the same relation policy. 
| `context.test.ts`, `observer.test.ts`, `db.test.ts`, `app.test.ts` | Requirement 30; D50, D125 | -| I110 | Workflow read truth and workflow write truth stay behind named seams: durable snapshots project through `projectWorkflowState`, while turn-response, chat-route, phase-intent, and phase-close mutations apply through transition/runtime helpers instead of transport handlers owning workflow semantics. | `workflow-projector.test.ts`, `turn-response-transition.test.ts`, `chat-route-transition.test.ts`, `phase-close.test.ts`, `app.test.ts` | D110, D113, D123 | -| I111 | Multi-chat substrate preserves one interview chat per specification, keeps legacy and chat-derived active heads equivalent during transition, guarantees each turn's `chat_id` belongs to the same specification as its legacy `specification_id`, scopes parent turns to the same chat, and deduplicates simultaneously open reconciliation needs for the same source / target / kind without conflating them with semantic `knowledge_edge` rows. | `chat-substrate.test.ts`, `reconciliation-need.test.ts`, `db.test.ts` | Requirement 39; A82, A83; D137, D138 | -| I112 | Prompt/context scenarios render from packaged markdown prompts and typed context-pack builders rather than scattered inline prompt strings; probe artifacts include deterministic rendered prompt/context fingerprints, prompt asset packaging mirrors current source assets at build time, and production prompt text has reviewable golden coverage without requiring product UI. 
| `prompt-loader.test.ts`, `prompt-build-boundary.test.ts`, `prompt-golden.test.ts`, `context-pack.test.ts`, `scenario-runner.test.ts` | Requirements 40, 41; D139, D140 | -| I113 | Hard-impact `propose_edit` apply opens at least one `reconciliation_need` per existing typed dependency edge incident on the changed knowledge item (relations: `depends_on`, `derived_from`, `constrains`, `refines`, `verifies`), records `caused_by_turn_id` provenance, deduplicates against the partial unique index, and never returns `deferred: true` from the apply contract; resolutions transition `open → resolved` idempotently. | planned: `edit-applier.test.ts`, `reconciliation-need.test.ts`, `patch-list-overlay.test.tsx`, `app.test.ts` | Acceptance Criterion 7; A88; D135, D137, D138, D146 | -| I114 | The agent capability CLI remains an adapter over Brunch capability contracts: JSONL calls validate explicit resource ids and schemas, mutating calls dispatch through server-owned capability handlers rather than ORM/route bypasses, `read` projections provide affordance hints without importing scenario briefs, and the probe runner exercises the surface only through a JSONL client. | planned: `capabilities/*.test.ts`, `agent-jsonl.test.ts`, `probe-runner.test.ts` | Requirements 42, 43; A89; D143, D147 | -| I115 | Each active/resumable chat has at most one open assistant/system-first frontier turn; user responses complete that turn through normalized proposal/response semantics, and strategy is chat-local process state rather than specification-level semantic truth. | planned: `chat-substrate.test.ts`, `turn-response-transition.test.ts`, `capabilities.test.ts` | Requirement 44; D138, D148 | -| I116 | Open proposal turns are stamped with the latest applied changeset id at creation and are conservatively stale when the specification's latest changeset advances before completion; stale proposals refresh/regenerate rather than applying against unknown graph state. 
| planned: `changeset.test.ts`, `turn-response-transition.test.ts`, `app.test.ts` | A92; D149 | -| I117 | Reconciliation/direct-edit cascade never infers affected endpoints from raw edge direction alone; it consults relation policy source-change / target-change behavior over incident accepted edges. | planned: `knowledge-relationship-policy.test.ts`, `edit-impact.test.ts`, `reconciliation-need.test.ts` | A93; D137, D146, D150 | -| I118 | Scenario-option candidate bundles can only become canonical by accepting a coherent bundle changeset; accepted-with-issues candidates must also create durable follow-on review/process debt so known weaknesses are not hidden. | planned: `scenario-runner.test.ts`, `turn-artifacts.test.ts`, `changeset.test.ts` | A90, A91; D151, D152 | - -## Lexicon - -### Core terms - -| Term | Definition | -| ---- | ---------- | -| **workspace** | The cwd-backed software context whose local `.brunch/` directory stores specifications and runtime state. | -| **prompt/context scenario substrate** | The server-side and test-harness foundation for loading markdown prompts, composing reusable doctrines, deriving typed intent-graph context packs, and running prompt probes before UI commitment. It is not the provider credential/setup system or shared production AI runtime. | -| **context pack** | A scenario-specific semantic briefing derived from intent graph truth, workflow state, provenance, unresolvedness, relation neighborhoods, and authority labels for one agent task. It is bounded and typed, not a raw graph or transcript dump. | -| **progressive checkability** | The discipline of representing intent items at the weakest useful witness level today — prose, example, counterexample, criterion, executable test, runtime invariant, state/transition property, or formal model — while preserving paths toward stronger witnesses where valuable. 
| -| **behavioral kernel** | Hidden interviewer / architect machinery that recognizes recurring correctness patterns such as lifecycle, containment, authority, concurrency, migration, and evidence, then elicits checkable artifacts without exposing formalism as product ceremony. | -| **scenario runner** | A lightweight pre-UI harness that runs a selected prompt scenario against fixtures, context packs, tools, and model settings, then records outputs for qualitative and structural review. Execution adapters translate this harness input into a concrete fake/model/harness call; they do not define Brunch semantics, credential UX, provider resolution, or mutation authority. | -| **agent capability CLI** | A local machine-facing CLI adapter, initially a long-lived JSONL stdin/stdout process, that exposes Brunch-owned capability contracts to external agents and probe runners without defining its own product API or mutation authority. | -| **JSONL capability session** | The request/response transport between an external harness and `brunch agent`: every call includes an id, capability id, and explicit input resource identifiers; the process may keep DB/provider/in-flight runtime handles internally, but selected spec/chat/turn targets are not hidden ambient state. | -| **probe runner** | An external client of the agent capability CLI that supplies scenario briefs, calls an LLM-as-user, drives Brunch through capability calls, and writes generated transcript/spec/export/graph artifacts for human curation. It must not import Brunch DB or product handlers directly. | -| **read projection** | An agent-facing read output that summarizes Brunch-known state and compatible next actions or response shapes. It differs from `get` / `list` reads, which return structured entity/read-model data for assertions, fixture capture, and tooling. | -| **agent mutation surface** | The Brunch-owned typed handler layer for any durable data mutation initiated by an agent, internal or external. 
It is the only write entry point agents may use; handlers own schemas, authority, replay behavior, and reconciliation/changeset-ledger semantics rather than letting agents call the ORM directly. | -| **agent capability contract** | A Brunch-owned typed contract addressable by agents or harnesses, with a stable id, description, input/output schemas, authority class, and replay policy. Read-only capabilities and mutating handlers can share this registry shape, but mutating contracts must route through the agent mutation surface. | -| **tool adapter** | A provider- or harness-specific projection of an agent capability contract into a concrete tool format such as AI SDK tools, Pi tools, CLI/TUI commands, or a future external-agent API. Adapters translate shape and transport while preserving Brunch-owned authority semantics. | -| **authority class** | The contract metadata that says whether an agent capability is read-only, proposal-only, or commits durable product truth, and therefore which replay, reconciliation, and mutation boundaries govern it. | -| **AI runtime provider** | The shared server seam that resolves the configured LLM provider, model names, API-key source, and provider-specific options for interviewer and observer calls. | -| **provider credential status** | The app-visible setup state indicating whether a supported LLM key is available, which source supplied it, and what user action is needed, without exposing the secret value itself. | -| **XDG auth state** | User-scoped configuration / credential storage outside the project workspace, used for API keys entered through Brunch UI when implemented. | -| **workspace hygiene affordance** | A confirm-gated local repository action that helps keep generated Brunch state such as `.brunch/` out of version control without silently mutating the workspace. | -| **specification** | One elicitation run within a workspace. 
Browser routes, HTTP paths, shared transport contracts, and durable DB/storage should all use canonical `specification` terms. | -| **project** *(legacy term)* | A deprecated older name for a specification record. Remove it rather than preserving it as a long-term compatibility seam. | -| **workspace stream** | The merged center-column read model composed from active-path turns, anchored workflow facts, projected control cards, phase markers, and activity cards. | -| **specification runtime** | The live lifecycle owner for one specification: it reconciles durable truth into the current landing, owns in-flight interviewer / successor / capture orchestration, and rejects stale lifecycle outputs that routes must not treat as their own authority. | -| **turn** | One persisted authored conversational interaction, with typed offer/reply parts and parent linkage. Today the primary interview active path still provides the main lineage spine; the multi-chat substrate is moving turn ownership toward chat-scoped chains. Questions, review proposals, closure proposals, and future side-chat turns use this seam. | -| **turn kind** *(current internal seam)* | The current persisted implementation field on a turn (`question`, `kickoff`, `recovery`). It may help project control state today, but kickoff / recovery are product-level structural affordances rather than durable authored turn categories. | -| **turn card** | The user-facing rendering of a durable conversational turn inside the workspace stream. | -| **anchored workflow fact** | A durable non-turn record whose validity is anchored to one or more turns on the active path. `phaseOutcome` is the canonical current example. | -| **projected control card** | A workflow affordance derived from durable state rather than authored conversational content. Kickoff, recovery, and proceed / handoff controls live here. 
| -| **kickoff card** | A projected phase-entry control card that appears whenever an open phase is in entry-pending state and requires an explicit user action before substantive interviewer progression begins. | -| **frontier turn** | The single actionable durable conversational turn currently at the bottom of an open phase when the phase is in substantive elicitation rather than structural control. In multi-chat strategy flows, each active/resumable chat has at most one open frontier turn; the specification can have multiple open frontier turns across different chats. | -| **chat-local strategy** | Process state that determines how one chat advances the spec: step-by-step drilldown, scenario options, targeted cases, graph review, or reconciliation. It is established or declared through a chat's frontier turn and is not durable product truth about the specification. | -| **proposal turn** | An assistant/system-first frontier turn that offers a candidate bundle, graph-review finding, reconciliation suggestion, or other proposed action. It is not a semantic mutation until the user completes it, usually by accepting, revising, asking follow-up, deferring, regenerating, or rejecting. | -| **preface card** | A turn-internal artifact that presents provisional context from interviewer-invoked context gathering, rendered above a paired question card within the same turn. The observer captures from the whole turn (preface context + question + user response) as one validated unit rather than from the preface card alone. Available in any phase when the workspace directory is present. Implementation: `preface` / `PrefaceCard` / `present_preface` tool / `data-preface` part. Renders as a simple `bg-tint` rounded box with italic subdued text, not as a DrawerCard. | -| **question card** | A turn card that asks a structured interviewer question and expects a substantive user response. 
| -| **review turn** | A full-set requirements or criteria review interaction that offers a synthesized candidate list with stable reference codes, supports per-item commenting (inline comment toggle on each item) plus one optional global review note, and persists its own `reviewActions` / `reviewSet` metadata on the turn. On `request changes`, the successor review turn carries a revision card above the new review set. | -| **closure turn** | A durable proposal turn whose offer proposes closing a phase and whose reply explicitly accepts or rejects that proposal. Accepting it confirms the phase outcome on that same turn and advances the workflow into the next phase's projected entry state. | -| **recovery card** | A projected control card that appears whenever an open phase lacks a valid actionable frontier and offers the user a repair path without requiring a separately generated recovery turn. | -| **active turn** | The live frontier turn currently awaiting substantive user completion inside the workspace. Structural control cards such as kickoff and recovery are not active turns. | -| **answered-turn card** | The compact replay form of a completed elicitation turn, summarizing the offer, the structured response, and the turn-owned capture status. | -| **response note** | The single attached text field on a structured user response; it may explain selections, annotate a review, add missing context, or redirect the interviewer. | -| **grounding** | The first phase of a specification, aimed at establishing enough orientation to proceed into design. It is both the product term and the canonical workflow key. | -| **grounding strategy** | The method used to reach grounding sufficiency: elicitation-first (`greenfield`) or analysis-first (`brownfield`). | -| **delivery posture** | The second interview-orientation axis: `end-to-end build` for whole-system creation or reshaping, versus `incremental feature` for bounded change inside an existing or emerging system. 
| -| **grounding brief** | The concise visible summary surfaced on a preface card after context gathering during grounding. | -| **grounding sufficiency** | The threshold at which the interviewer has enough stable orientation to begin design. | -| **recognition-first elicitation** | The strategy of helping users converge by reacting to concrete possibilities, tradeoffs, examples, and ruled-out directions rather than requiring them to author intent from scratch. | -| **candidate direction** | An agent-synthesized possible specification direction offered when the user asks Brunch to fill in the rest, compare options, or react to proposed typologies. It includes rationale, implications, tradeoffs, likely generated knowledge, and what it rules out. | -| **candidate graph bundle** | The coherent commit/review unit produced by scenario-options flows: a named scenario with tradeoff profile, generated intent items and edges, required core items, optional/swappable items, known risks, graph-review findings, provenance labels, and commit preconditions. It should be accepted or revised as a bundle rather than item-by-item unless semantic closure can be proved. | -| **scenario options** | A chat-local strategy that generates 2–3 candidate graph bundles completing the current direction from context-packed accepted graph truth, then uses graph review to gate clean acceptance or acceptance with represented issues. User-facing labels may be "Show me strong options" or "Speed this up". | -| **targeted cases** | A chat-local strategy based on behavioral kernels: the interviewer asks contrastive domain cases, and user classifications emit checkable artifacts such as decisions, invariants, criteria, examples, and counterexamples. | -| **candidate-spec set** | A turn-owned interviewer artifact in grounding or design that presents one or more candidate directions for reaction-driven refinement. 
It is analogous to a review set in being a persisted artifact on the turn, but it proposes possible directions rather than reviewing a synthesized inventory. The newer candidate graph bundle framing is the coherent graph-level version of this artifact. | -| **candidate-spec reaction** | The structured user response to a candidate-spec set, choosing whether to accept a direction, request refinement of one candidate, or regenerate a fresh set. It steers the next interview move without directly closing the phase. | -| **breadth skeleton** | A turn-owned interviewer artifact used during a progressive detail pass that summarizes the current broad-pass map, highlights areas that remain shallow, and offers explicit deepening targets. | -| **detail focus** | The selected area or lens for the next recursive follow-up pass. It scopes the next same-phase frontier turn without becoming a separate workflow state or durable topic tree. | -| **detail reaction** | The structured user response to a breadth skeleton, choosing whether to deepen a specific area now, continue broad coverage, or leave an area sufficient for now. | -| **progressive detail pass** | An interview shape that establishes broad structure first, then offers explicit `next level of detail` actions to deepen selected areas recursively rather than drilling to maximum depth immediately. | -| **review set** | A synthesized candidate list used in requirements or criteria review, presented with stable reference codes, supporting per-item commenting, and resolved through `accept review` or `request changes` with per-item comments plus one optional global review note. | -| **review revision** | A successor review set generated after `request changes`, carrying a revision card (changelog + version badge) as a turn-internal artifact above the new review set card. Prior revisions collapse to compact answered-turn summaries. 
| -| **revision card** | A turn-internal artifact on a review revision turn that summarizes what changed from the prior version and displays a version badge (v2, v3, etc.), paralleling how preface cards sit above question cards. | -| **per-item comment** | An inline comment placed on a specific item in a review set via a comment toggle, forming part of the structured change-request payload alongside the optional global review note. | -| **accepted review set** | The terminal accepted review output for a review phase; this is the authoritative carry-forward set for later review and export seams, and any accepted requirement / criterion items derive their authority from membership in this set. | -| **phase entry state** | The workspace state shown when a projected kickoff card is the current bottom-of-phase affordance. | -| **landing reconciliation** | The pure derivation from durable specification snapshot into the one truthful visible bottom artifact for hydration/restart, plus any pending capture backlog the runtime must re-seed. | -| **observer capture backlog** | The ephemeral specification-scoped queue of answered turns that still need deferred observer capture. It is re-derived from durable turns with a persisted response but no turn-owned observer result, then drained by the runtime lifecycle once a successor frontier exists. | -| **phase handoff state** | The workspace state shown when a phase is complete and a projected handoff / completion control card is the current bottom-of-phase affordance. | -| **control marker** | A transcript-visible workspace event such as interview start, resume, or confirmation that is not rendered as a normal user chat bubble. | -| **phase marker** | A projected boundary annotation in the workspace stream, such as phase start or phase closed, derived from workflow position or anchored workflow facts. 
| -| **turn capture status** | The per-turn state describing what the observer has captured already, is still capturing, or failed to capture from that answered turn. | -| **active path** | The trusted chain from HEAD to root in the primary interview chat. Side-chats are sibling chat chains under the same specification, not branches of this active path. | -| **phase / mode** | One workflow stage: `grounding` *(label: Grounding)*, `design` *(label: Elicitation)*, `requirements` *(label: Requirements)*, or `criteria` *(label: Acceptance Criteria)*. | -| **phase outcome** | Durable closure artifact for a phase, including summary and closure basis. | -| **closure basis** | Whether a confirmed phase close came from interviewer recommendation or explicit user-forced closure. | -| **closeability** | Deterministic minimum bar for whether the user may close a phase now. | -| **readiness band** | Coarse descriptive signal (`low`, `medium`, `high`) separate from closeability. | -| **review action** | The explicit submit path on a review turn: `accept review` or `request changes`; the action gives any attached review note its meaning. | -| **exploration knowledge** | Durable knowledge captured during grounding or elicitation: `goal`, `term`, `context`, `constraint`, `decision`, and `assumption`. | -| **context** | Descriptive situational truth, actors, workflows, repo facts, or bounded area under discussion that would remain true even if the specification paused tomorrow. Promote context when it carries stronger semantics: success condition -> requirement / invariant, solution boundary -> constraint, uncertain material belief -> assumption, chosen alternative -> decision. | -| **constraint** | A durable boundary on acceptable scope or solution space. Planned subtypes include `non_goal`, `scope`, `technical`, `policy`, `resource`, `compatibility`, and `environmental`. 
| -| **non-goal** | A `constraint` subtype expressing an explicit exclusion from the current specification scope. | -| **decision** | A chosen direction among plausible alternatives, with durable consequences for future design, implementation, or interpretation. Not every user answer or option selection is a decision. | -| **assumption** | A durable material belief supporting a direction or decision that could later prove false. | -| **intent graph** | Canonical product term for Brunch's semantic substrate: typed intent items, intent edges, examples / counterexamples, validation status, and semantic mutation state. Chat and graph views are projections over this truth; reconciliation needs are process state attached to the graph, not intent content. Supersedes `knowledge graph` as future-facing product vocabulary. | -| **intent item** | Canonical product term for one durable typed semantic unit in the intent graph. Current schema/code may still persist these as `knowledge_item` rows during transition. Use `knowledge item` only when referring to current implementation names. | -| **intent edge** | Canonical product term for one durable typed semantic relation between intent items. Current schema/code may still persist these as `knowledge_edge` rows during transition. Use `knowledge edge` only when referring to current implementation names. | -| **knowledge item / knowledge edge** | Legacy implementation names for current persistence/API records backing intent items and intent edges. Avoid these in new product concepts, capability contracts, and operation ids unless referring to existing code or database schema. | -| **progressive checkability** | The stance that each intent item should receive the weakest sufficient witness: human review, concrete example, counterexample, regression test, runtime contract, state-machine rule, invariant, proof obligation, or explicit unresolved ambiguity. 
| -| **property** *(candidate ontology)* | A normalized intent primitive that requirements could commit to and criteria could observe. It is a design candidate, not a committed storage or UI surface. | -| **invariant** *(planned ontology kind)* | A property that must remain true across relevant states, transitions, executions, versions, or semantic revisions. | -| **example** *(planned ontology kind)* | A concrete scenario, trace, input/output, edge case, approved example, rejected example, not-relevant label, or counterexample that disambiguates or witnesses intent. Expected subtypes include positive, negative / counterexample, edge-case, and not-relevant. | -| **edge-local neighborhood** | The focused relation context around one intent item: incoming and outgoing intent edges with nearby item summaries, support strength, and relation semantics. Used by interviewer / observer prompts and graph refinement instead of dumping all grouped knowledge. | -| **behavioral kernel** | Reusable interviewer machinery for one class of latent correctness question, such as state/lifecycle, containment, authority, concurrency, transactionality, migration, or evidence. Kernels are not user-facing formalism by default. | -| **intent spec** | The complementary framing to a planning spec: a specification optimized for preserving and validating meaning rather than sequencing downstream work. Carries typed intent items, examples and counterexamples, witness strength, unresolved ambiguity, and validation status. The intent graph is the durable substrate; an intent spec is the human-facing projection of that graph. Contrast with `planning spec`. | -| **planning spec** | A specification optimized for downstream work sequencing — what to build, what scope is in or out, which slices follow. Brunch's product direction is for planning to remain a useful projection from the intent graph rather than the source artifact. 
| -| **checkability** | A typed field on an intent item describing the strongest oracle that currently witnesses it, drawn from the progressive-checkability ladder: `human_review` / `example` / `counterexample` / `regression_test` / `runtime_contract` / `state_machine_rule` / `invariant` / `proof_obligation` / `unresolved_ambiguity`. The discipline is `progressive checkability`; the field is `checkability`. | -| **witness strength** | The breadth of an intent item's oracle coverage, distinct from which oracle exists. "Checked on three examples" and "proved for all reachable states" can both be `checkability: invariant`, but they have very different `strength`. The pairing forces honesty about what is actually verified. | -| **formalization candidate** | A Brunch-internal intent item that is worth promoting along the progressive-checkability ladder. Critical invariants are formalization candidates: each one states a property currently witnessed by a regression test, with stronger oracles (state-machine model, runtime contract, proof obligation) as deliberate future moves rather than implicit expectations. | -| **disambiguating example** | An `example` whose primary purpose is to settle ambiguity between plausible interpretations of a requirement, invariant, or decision. Linked through the `disambiguates` relation. Generalizes the TiCoder move beyond test cases: the interviewer generates cases where interpretations diverge, and the user's classification settles the meaning. | -| **spec drift** | A divergence between an intent item's recorded meaning and the artifact (criterion, generated requirement, candidate spec, export bundle, or downstream implementation behavior) meant to satisfy it. Surfaced in human terms — "original intent vs generated behavior vs potential mismatch" — so the user can validate meaning at the point where it could have changed, rather than after the divergence has been laundered into a final document. 
| -| **relation family** | One of five semantic groupings that organize the relation kinds in the intent graph: `justification`, `dependency`, `boundary`, `refinement`, and `verification`. Distinct from the relation `kind` itself; a single kind belongs to exactly one family. Drives prompt grouping, default policy, and observer classification heuristics. | -| **relation policy** | The per-relation, per-axis registry that decides whether each edge participates in `visible`, `cascade`, `export_trace`, `staleness`, `reconciliation`, `criteria_help`, or `weak_suggestion` capabilities. Replaces the implicit assumption that every edge is equally authoritative. Gated by edge `support` (`explicit` / `strong_inference` / `weak_candidate`) and `status` (`proposed` / `accepted` / `rejected` / `stale`). It also owns operational directionality: source-change and target-change behavior must be explicit rather than inferred from raw edge direction. | -| **graph review** | A quality-oriented graph operation that critiques any intent graph for weakness, genericity, low support, missing coverage, weak checkability, poor provenance, or maturity gaps. Distinct from reconciliation, which repairs known process debt after a disturbance. | -| **graph-review finding** | A turn-owned structured artifact produced by graph review. It may later lead to a changeset if accepted, but it is not itself semantic truth or process debt unless represented through a follow-on turn, changeset, or reconciliation need. | -| **structured list** | The first-ship graph-view layout: kind-grouped item rows with a relations footer of Outgoing / Incoming relation chips. Item-first; relationships visible inline. It currently renders the whole-spec entity set because D129 ships the whole-spec fetch first; the intended default becomes active-path items over whole-spec data once the active-path membership seam and `Show all` toggle land. 
| -| **spatial canvas** | A deferred future graph-view layout where intent items render as nodes with visible edges in a 2D scene. Shares the projection seam and intent contract of D128 with the structured-list layout. | -| **relation chip** | A compact UI element representing one intent-edge endpoint inside a relations footer, carrying the target item's reference code and content snippet. Hover reveals a preview card; click navigates to the target item via hash anchor. | -| **relations footer** | The grouped Outgoing / Incoming subsections beneath an item row in the structured list, listing relation chips for that item's incoming and outgoing edges. Soft-truncates at 6 chips per direction with an inline `+N more` expander; collapses to nothing when an item has zero edges. | -| **action rail** | The per-row right-aligned slot in graph view's structured list reserved for node-level action affordances. Actions emit intents into the existing workspace lifecycle rather than owning their own state. The first ship reserves the slot with one disabled `chat-with` placeholder. | -| **secondary thread** | Modal revisit conversation anchored to a primary-path turn and used to resolve cascade implications. | -| **needs-revisit** | Flag meaning an item is affected by upstream invalidation and must be explicitly resolved before the specification is whole again. | -| **chat** *(planned persistence seam)* | A conversation container inside one specification. The primary interview, side-chats, reconciliation chats, verifier feedback, and review discussions may all own turns without owning semantic truth directly. Phase one adds the table and transitional pointers before making chat ownership canonical. | -| **changeset** *(future persistence seam)* | Canonical term for one submitted semantic mutation bundle against the intent graph. It records what changed and why, separate from the conversational turn that may have initiated it. 
A changeset is the smallest atomic unit that preserves graph coherence; proposals/findings become changesets only when accepted or otherwise acted on. Supersedes `patch` as the future-facing schema/contract noun. | -| **change** *(future persistence seam)* | Canonical term for one atomic semantic mutation inside a changeset, such as `intentItem.create`, `intentItem.updateContent`, `intentEdge.create`, or `intentEdge.delete`. Supersedes `patch_change`. | -| **patch / patch_change** | Historical design-doc vocabulary for changeset/change. Avoid in new schema, capability contracts, and operation ids unless referring to older docs or source-control-style analogy. | -| **reconciliation need** *(planned persistence seam)* | Durable semantic debt saying existing intent-graph truth may require renewed judgment because an upstream item, relation, verifier, contradiction, or historical premise changed. Phase one stores directed item-to-item needs with narrow kind/status and provenance placeholders; later phases may add relation targets and changeset-backed cause/resolution. It is process state, not an intent edge or intent content. | -| **DrawerCard** | Shared card primitive with header/summary/children slots that supports static, summary-peeking, and toggleable (minimized ↔ maximized) render modes. A `locked` prop disables toggle for controlled-state cards. | -| **ChatScroll** | Composite scroll container that wires Radix ScrollArea (custom scrollbar) with `useStickToBottom` (auto-scroll-to-bottom + scroll-down indicator). Used for the center pane transcript. | -| **phase stepper** | The vertical timeline navigation in the left sidebar showing phases as sequential steps with connecting line, status, readiness, and turn count. | -| **phase addressability** | The ability to deep-link, gate, and focus interview phases through router state even when the center pane renders one continuous sectioned workspace. 
| -| **knowledge group** | A display-level grouping of knowledge kinds for the sidebar, defined by a hard-coded registry that maps kinds to group labels and visibility. | -| **output view** | The terminal route available when all phases are closed, providing specification summary and markdown export. Not a workflow phase. | -| **activity card** | A projected runtime or replay artifact adjacent to a turn or phase boundary, such as visible generation state, coarse interviewer activity summary, or trailing observer status. It is not a branch-bearing conversational turn. | -| **activity placeholder** | The compact replayable presentation of an activity card between turn cards, showing elapsed thinking time and a coarse tool-use summary for the interviewer without exposing hidden reasoning or raw tool payloads. | -| **phase section header** | A projected, non-durable artifact at the top of each phase section that states the phase purpose and what kinds of knowledge are captured there. Re-projects from workflow state on hydration. | -| **grounding question** | A free-text-first question format used during grounding that presents the question, a why explanation, and a response note field without requiring option selections. Distinct from the option-selection format used in elicitation. | -| **turn-internal artifact** | An assistant-part artifact rendered as its own visual card within a turn but sharing the turn's single response submission. Preface cards and revision cards are turn-internal artifacts that render above their paired question or review set card. | -| **query domain** | An independently invalidable TanStack Query scope within a specification. The current live ownership target is one specification bundle domain (`workflow`, `landing`, `turns`) plus a separate entities domain; finer splits should follow real server ownership boundaries rather than outrunning them. 
| - -### Boundary terms - -| Term | Definition | -| -------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------- | -| **greenfield** | A grounding strategy for a new concept or under-specified area where the system grounds primarily through elicitation. | -| **brownfield** | A grounding strategy for work inside an existing codebase where the system grounds through analysis, then interrogation. | -| **end-to-end build** | A delivery posture where the specification is shaping a whole system, workflow, or major slice from entry to outcome. | -| **incremental feature** | A delivery posture where the specification is shaping a bounded change inside an existing or partially established system. | -| **context-gathering capability** | An interviewer-invoked capability such as workspace analysis or future web research that gathers provisional orientation for the next move. | -| **BrunchUIMessage** | Typed UI message contract spanning validation, persistence, SSE streaming, and hydration. | -| **Data Part** | Typed custom message part used for structured input and domain-specific assistant output. | -| **context builder** | Typed projection from specification state into inference context for interviewer, observer, or closure logic. | -| **walkthrough scenario** | Named trusted fixture scenario used to seed a resumable manual-inspection workspace. | - -## Verification Design - -### Verification Commands - -| Step | Check | Command | -| ---- | ----------------- | ------------------- | -| 1 | Formatting | `npm run fmt:check` | -| 2 | Lint + type check | `npm run lint` | -| 3 | Unit tests | `npm run test` | -| 4 | Build | `npm run build` | -| all | Full gate | `npm run verify` | - -### Verification Policy - -Every meaningful code change should pass `npm run fix` in the inner loop and `npm run verify` before commit. 
Slices that touch the user-facing boundary should also stay manually walkthrough-able via the local app. - -### Verification Stance - -- Verification is first-class work; this wave stays **manual-heavy by deliberate choice**, not by accident. -- **Inner loop** proves structural validity, boundary safety, and non-destructive behavior. -- **Middle loop** proves replay, refresh-boundary ownership, and explicit state projection where cheap automated checks can remove bad degrees of freedom. -- **Outer loop** is the authority for brownfield grounding quality, transcript legibility, waiting-state clarity, and phase-layout differentiation. -- Outer-loop UI review uses a **dramaturgical see-and-inspect** posture: judge whether the product stages its state transitions legibly for a human, not just whether bytes round-trip. - -### Diagnostic Assessment - -| Dimension | Score | Notes | Change trigger | -| --------------- | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------- | -| Observability | partial | Persistence, DB state, TypeScript seed builders, and route seams are visible in text, but the most important failures in this wave still present as browser-visible transcript disappearance, waiting-state ambiguity, and layout legibility issues. | Promote instrumentation if manual browser inspection cannot explain refresh or lock behavior confidently. | -| Reproducibility | partial | TypeScript scenario builders and direct observer probes give a strong base, but brownfield kickoff quality still varies by repo shape and live refresh behavior is not yet represented by a canonical replay matrix. 
| Promote a stronger corpus or replay harness if ad hoc brownfield/manual checks stop being trustworthy. | -| Controllability | partial | The agent can iterate on fixtures, stories, and structural tests autonomously, but the core acceptance signals for this wave remain human judgment calls. | Raise controllability only if manual review becomes the bottleneck or repeated ambiguity blocks progress. | - -### Oracle Strategy by Loop Tier - -| Tier | Oracle families | What they prove | Main targets | -| ------ | ------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------- | -| Inner | Schema validation, type-aware linting, focused unit/integration tests, negative-space regressions | Boundaries remain type-safe; persistence and transport seams do not silently collapse; obvious bad failures are caught cheaply. | I4, I17, I24, I44, I48, I54, I72, I87, I100, I101, I102, I103 | -| Middle | Round-trip / replay oracles for seeded projects, hydration, export, and resume | Seeded or persisted state can be loaded, projected, re-rendered, and exported without losing required semantic markers. | Requirements 13, 14, 15; I24, I44, I100, I103 | -| Middle | Route/query ownership integration oracles | Observer updates and response mutations refresh only their owned surfaces instead of tearing down unrelated transcript state. | Requirements 5, 7, 14; A20, A64; I24, I54, I102 | -| Middle | Explicit state-model oracles for in-flight UI states | Every major in-flight mode is named, projectable, and visibly representable instead of collapsing into one opaque loading bit. 
| Requirement 5; I24, I44 | -| Outer | Fixture-backed manual walkthroughs on seeded scenarios | Walkthrough fixtures are useful enough to inspect phase transitions, export output, resume behavior, and missing-view discovery. | Requirements 13, 14, 15; I100, I103 | -| Outer | Brownfield kickoff walkthroughs on real repos, evaluated qualitatively | Kickoff yields durable useful knowledge and a grounded first question for feature-area work, without needing a fully automated quality score. | Requirements 3, 16; A63; I101 | -| Outer | Dramaturgical story and transcript review | Phase differentiation, transcript artifact legibility, and waiting-state clarity are judged as staged user experience rather than just structural output. | Requirement 5; A15, A51, A53, A54 | - -### Design Notes - -- **Legible replay fidelity beats exact replay fidelity for now** — hydrated transcripts may use placeholders or summary markers to indicate that reasoning or tool activity happened at a point in the conversation, even if the full original content is not persisted. -- **Turn-first replay now beats message-first replay** — for grounding/design, the replay unit should trend toward completed turns plus one live unresolved turn, not alternating assistant/user chat bubbles and stream markers. -- **Brownfield kickoff has a deliberately modest proof bar** — this wave only needs durable useful knowledge plus a grounded first question, not a fully proven grounding bundle before design can proceed. -- **Waiting states should become an explicit vocabulary in code** — the user-facing contract is that each major in-flight mode is visibly represented; deep lock/wait introspection is diagnostic scaffolding, not yet a product requirement. -- **Manual verification is intentionally lightweight** — no heavyweight scripted walkthrough protocol yet; use seeded scenarios and see-and-inspect review rather than bureaucratic checklists. 
-- **Kickoff strategy comparison stays qualitative unless proven insufficient** — if the brownfield mode fork remains ambiguous after manual repo comparisons, promote that question to a spike with a stronger comparison harness. -- **Graph-view fixture matrix is project-shareable infrastructure** — named scenario builders (`emptySpec`, `singleItemNoEdges`, `crossPhaseDecisionLink`, `denseGoalAnchor`, `activePathDivergence`, plus an explicit `compareLowVsHighEdgeDensity` for A70) underwrite both inner-loop component tests and outer-loop manual walkthroughs. Reusable beyond graph view as similar visualization slices land. - -### Acknowledged Blind Spots - -| Blind spot | Reason | Current mitigation | Revisit trigger | -| ------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------- | -| Qualitative interviewer and kickoff quality across many repo shapes | Chosen manual-first; no broad brownfield corpus or score harness yet | Manual brownfield walkthroughs on representative repos | Brownfield regressions recur or kickoff strategy debates cannot be resolved qualitatively | -| Transcript trust and readability after hydration | Exact replay of all reasoning/tool detail is intentionally deferred | Legible placeholders/summary markers plus manual transcript review | Users still cannot understand what happened after replay despite visible markers | -| Actual lock/wait causality in the UI | Instrumentation is not yet the primary investment | Require explicit visible in-flight states and inspect browser behavior manually | Manual inspection cannot explain a repeated perceived lock or disappearance bug | -| Story quality and phase differentiation | Design quality is not executable in a 
trustworthy way yet | Story variants reviewed against seeded walkthrough findings | Story/app drift grows or design disagreement blocks implementation | -| Observer latency and layout refresh freshness | No explicit latency budget or perf gate yet | Runtime observation during manual sessions | A20 shows recurring latency or coarse refresh pain | -| Revisit UX and secondary-thread adequacy | That seam is still future work | Keep structural coverage on graph/persistence seams only | Revisit work moves from horizon into the active frontier | -| Real browser scroll behavior under JSDOM | `scrollIntoView` is shimmed in JSDOM — component tests cannot prove real scroll happens after chip click | Outer-loop manual walkthrough explicitly checks scroll-into-view + highlight on chip click | Reports of chip click "doing nothing" or scroll behaving inconsistently across browsers | -| Hover-card timing and popover positioning feel | Animation delay and placement perception are not text-observable | Outer-loop manual review with shadcn defaults (~300ms open, ~150ms close) | Users report flicker, misplaced popovers, or unintended dismissal | -| Mobile / touch / keyboard-only ergonomics for relation chips | HoverCard pattern is mouse-biased; long-press fallback is designed but has no automated test surface | Manual walkthrough on touch device once per slice family | Touch users report missing or undiscoverable preview | -| Performance under large intent graphs | No render or memory budget yet; relation-first observer expansion (A66) will increase edge density | Defer until specs with hundreds of items + dense edges become common | Render lag visible on representative manual walkthroughs | -| Cross-session "Back to chat" target persistence | sessionStorage clears on tab close so the deep-linked entry to graph view has no remembered chat origin | Falls back to current reachable phase via workflow state | Users report "Back to chat" landing in the wrong phase after a fresh tab | -| 
Visual regression infrastructure | Manual-heavy stance accepted across the project; no Chromatic/Playwright-screenshot seam yet | Outer-loop manual walkthrough on the named graph-view fixture scenarios | Three or more visual regressions caught only after merge | - -### Current Coverage - -| File | Protects | -| ----------------------------------------------------------------- | ------------------------------ | -| `db.test.ts` | I48, I72, I101 | -| `core.test.ts` | I48 | -| `app.test.ts` | I24, I54, I72, I87, I101, I104 | -| `context.test.ts` | I44, I54 | -| `observer.test.ts` | I48, I54 | -| `parts.test.ts` | I17, I101 | -| `project-state-turn.test.ts` | I24, I44, I87, I101 | -| `task.test.tsx` | I24 | -| `EntitySidebar.test.tsx` | I48 | -| `InterviewView.test.tsx` | I24, I44, I48, I54, I72 | -| `-interview-controller.test.tsx` | I24, I105 | -| `-workspace-stream-projector.test.ts` | I24 | -| `transcript-parity.test.tsx` | I24 | -| `interview.test.ts` | I87, I101 | -| `turn-artifacts.test.ts` | I104 | -| `phase-close.test.ts` | I72 | -| `router.test.tsx` | I102 | -| `GraphView.test.tsx` | I48, I102 | -| `project.test.ts` / `launcher.test.ts` / `runtime-config.test.ts` | I4, I100 | -| `corpus.test.ts` / `walkthrough.test.ts` / `seed.test.ts` | I103 | - -## Acceptance Criteria - -1. `npx brunch` can start from a workspace directory with local-first persistence in `.brunch/`. -2. Greenfield and brownfield grounding both work, with brownfield able to start from workspace analysis and converge into the same grounding phase purpose. -3. Structured turns support rich responses without losing semantic fidelity. -4. The knowledge layer stays visible, typed, and linked through graph relationships. -5. Phase closeability, readiness, and closure provenance stay legible to the user. -6. Requirements and criteria review remain explicit, lightweight, durable at the turn level, and export-relevant. -7. 
Revisit can invalidate knowledge, surface cascade through the `reconciliation_need` queue, and re-resolve through the patch list — no separate modal or secondary-thread surface. -8. The routed UI stays stable across dashboard, phase views, sidebar knowledge, and graph view. -9. Resume works from persisted state. -10. The verification gate passes. -11. Grounding/design use workspace-owned turn cards for substantive elicitation, requirements/criteria use full-set review turns, and structural kickoff / recovery / handoff / completion affordances project without a bare generic composer. -12. Hydrated transcripts preserve interviewer-side structure plus stable durable activity summaries for any live-only artifacts that were shown during streaming, including elapsed thinking time and a coarse tool-use summary / placeholder seam. -13. Open phases bottom-load a projected kickoff card, the current frontier turn, a visible generation state, or a projected recovery card; completed elicitation turns replay as answered-turn records, and closed phases bottom-load a projected handoff or completion artifact. -14. Preface cards render as turn-internal artifacts paired with question cards, so the observer captures from the whole validated turn rather than from unvalidated provisional content alone. -15. Grounding and elicitation persist only the durable exploration ontology, with `non-goal` represented as a `constraint` subtype rather than a separate top-level kind. -16. Observer prompt, shared kind registry, schema / API types, fixtures, and UI copy describe the same ontology and accepted-review semantics without per-layer language drift. -17. The interview can orient itself anywhere in the `greenfield <> brownfield` by `end-to-end build <> incremental feature` matrix without forcing whole-project assumptions. -18. 
Observer capture records intent edges broadly enough that most durable intent items link to upstream or downstream context whenever that relation is reasonably traceable. -19. Users who cannot complete a long interview can request candidate directions with explained tradeoffs and refine by reacting to them. -20. The interview can stop at a broad pass and deepen selected areas incrementally through explicit next-detail actions. -21. Graph view renders the intent graph as a navigable workspace with visible edges and node-launched refinement flows, not just a grouped list. -22. First-run setup makes missing provider credentials visible and recoverable from the dashboard without requiring users to hand-edit project `.env` files. -23. Brunch can help users keep `.brunch/` out of version control through an explicit, idempotent `.gitignore` confirmation flow. diff --git a/memory/SPEC.md b/memory/SPEC.md index 2931a640..8bc60464 100644 --- a/memory/SPEC.md +++ b/memory/SPEC.md @@ -90,6 +90,8 @@ Post-launch, Brunch should support specification work across two axes rather tha 40. Prompt and context engineering are first-class server subsystems: prompts and reusable policy doctrines live as inspectable markdown assets, while typed context-pack builders derive scenario-specific intent-graph renderings for interviewer, observer, research, candidate synthesis, behavioral kernels, reconciliation, architect, and downstream decomposition probes. 41. Agent-heavy future capabilities can be tested before product UI exists through a lightweight scenario substrate that runs prompt/context packs against seeded graphs or transcript fixtures, captures raw and structured outputs, and supports harness comparison. Scenario execution may use the existing Anthropic API key or fake adapters for probes, but first-run provider setup, credential storage, OpenRouter defaulting, and the shared production AI runtime seam belong to the provider setup frontier. 
Pi may be evaluated as a lower-level agent harness, especially for tool experiments and pre-UI probes, but Brunch product authority over durable workflow, replay, graph mutation, and reconciliation remains explicit. 42. Agent-originated mutations of Brunch data use one typed server-owned mutation surface regardless of caller. Internal interviewer/observer flows, scenario probes, CLI/TUI harnesses, Pi or other harness adapters, and future external agents may not mutate durable Brunch state by calling the ORM directly; they must invoke stable mutation handlers with input/output schemas, authority metadata, replay policy, and reconciliation/changeset-ledger semantics. Read-only capability contracts may share the same registry shape, but the hard invariant is single-entry mutation authority. +43. A local agent capability CLI can expose Brunch-owned capability contracts over long-lived JSONL stdin/stdout so an external probe runner or harness can drive the real specification flow without privileged ORM access. The CLI is an adapter over capability contracts, not a separate product API: calls carry explicit resource identifiers, read commands distinguish structured `get` / `list` data from agent-facing `read` projections with affordance hints, and mutating commands stay small and procedural around spec lifecycle requests, chat readiness, and turn response submission. The LLM-as-user scenario brief, model choice, fixture curation, and probe artifacts belong to an external probe runner that talks to the CLI like any other agent. +44. Specifications can evolve through multiple chat-local strategies rather than one global interviewer mode. A chat's first frontier turn may offer or declare its strategy (`step_by_step`, `scenario_options`, `targeted_cases`, `graph_review`, `reconciliation`), and every active/resumable chat should have at most one open assistant/system-first frontier turn waiting for a user completion action. 
Proposal turns use normalized completion semantics (`accept`, `reject`, `revise`, `ask_followup`, `defer`, `regenerate`); only acceptance of a proposal turn may apply that proposal's semantic changeset. Mid-interview acceleration should branch into a side-chat / strategy chat that completes the current direction from context-packed graph truth, while graph-review critique remains the internal oracle for judging and repairing generated candidate bundles. ## Assumptions @@ -135,7 +137,11 @@ Post-launch, Brunch should support specification work across two axes rather tha | A85 | A lightweight prompt scenario substrate will let Brunch validate LLM-heavy product directions faster than building UI first, if it captures rendered prompts, context packs, model settings, raw outputs, structured parses, and human review notes as repeatable artifacts. | medium | open | D139 | Run multi-scenario prompt probes for observer ontology, behavioral kernels, candidate-spec assist, and downstream oracle/decomposition before productizing their UI. | | A86 | Pi can serve as a useful pre-UI agent harness or tool-spike backend without forcing Brunch to adopt Pi as its production agent runtime, as long as integration remains adapter-shaped and Brunch-owned authority/replay/mutation semantics stay outside the harness. | low | open | D142 | Spike Pi SDK or RPC with in-memory sessions, custom tools, controlled prompts, and Brunch graph context packs; evaluate event capture, tool ergonomics, provider handling, packaging, and isolation. | | A87 | Verification-aware post-spec decomposition can be explored as agent scenarios before it is a Brunch product surface: intent graph truth plus progressive checkability can feed design alternatives, oracle strategy, execution slices, and orchestration constraints. 
| low | future | D141 | Prototype decomposition and oracle-design probes inspired by `ln-design` and `ln-oracles`; compare outputs for traceability to requirements, invariants, examples, criteria, and blind spots. | -| A89 | Making Brunch CLI-addressable will be valuable only if the CLI is generated or validated from the same Brunch-owned agent capability contracts that power other adapters; a hand-written CLI surface is likely to drift on verbs, flags, schemas, introspection, async recovery, and mutation authority. | medium | future | D143, D147 | Before productizing a CLI, prototype a narrow generated/contract-checked command set and audit it against agent-native constraints: non-interactive execution, uniform JSON, bounded output, enumerated errors, idempotent/recoverable mutations, stable vocabulary, machine-readable introspection, and explicit profile/delivery/feedback seams. | +| A89 | A long-lived local JSONL agent capability CLI can drive the real Brunch interview flow well enough for external LLM-as-user probes to produce credible completed specification fixtures, while keeping product resources explicit in every call and using ambient process state only for runtime plumbing such as DB handles, provider config, and in-flight generation bookkeeping. | medium | open | D143, D147, Requirement 43 | Prototype the minimal `brunch agent` JSONL loop over capability contracts, then run small LLM-as-user scenarios end-to-end through `chat.ensureReady`, `chat.read`, `turn.submitResponse`, `spec.requestPhaseClosure`, and `spec.requestExport`. Validate that probe logs are replayable, no probe code imports DB/product handlers directly, and no durable operation ledger is needed for the first readiness semantics. 
| +| A90 | Users who ask to speed up a long interview will prefer a side-chat that generates 2–3 reviewed scenario options completing the current direction over continuing the primary drilldown, provided existing accepted graph truth is treated as fixed premise by default. | medium | open | D126, D148, D151, Requirement 44 | Probe scenario-options against drilldown fixtures and run manual flow review: do users understand the tradeoff profiles, preserve trust in prior answers, and return to the primary interview when generated options disappoint? | +| A91 | Graph-review critique can make scenario-generated candidate bundles safe enough for product use without requiring perfect one-shot generation, if candidate readiness distinguishes `reviewed_clean`, `reviewed_with_issues`, and `blocked`, and if accepted-with-issues immediately opens durable follow-on review work. | medium | open | D151, D152, Requirement 44 | Run candidate bundle probes with graph-review scoring and human review; verify accepted-with-issues flows create a graph-review frontier or appropriate reconciliation needs rather than hiding defects. | +| A92 | A conservative global staleness rule for open proposal turns — stale when `specification.latest_changeset_id` differs from `turn.opened_at_changeset_id` — is acceptable before neighborhood-level staleness calculation exists. | medium | open | D149, I116 | Exercise multi-chat proposal flows where another chat applies a changeset while a proposal remains open; check whether regeneration prompts feel safe rather than noisy. | +| A93 | Relation-policy directionality lookup is safer than trying to force all useful intent-edge verbs into one dependency direction, because graph edges must serve display, context packs, export trace, reconciliation, critique, verification, candidate generation, and explanation. 
| medium | open | D137, D150 | In FE-700, define canonical/inverse sentences and source/target change behavior for each relation; test direct-edit and hard-impact cascade against mixed-direction relations. | ## Decisions @@ -205,7 +211,12 @@ Post-launch, Brunch should support specification work across two axes rather tha 144. **Intent graph vocabulary supersedes knowledge graph vocabulary** — Canonical product vocabulary is `intent graph`, made of `intent items` and `intent edges`. Current schema/code may still use `knowledge_item` and `knowledge_edge` as implementation names during transition, but new planning, agent capability contracts, context packs, operation ids, and user-facing design should prefer intent vocabulary unless referring to current persistence/API names. `Claim` may remain an explanatory generic for natural-language content, but it is not a product/schema noun. Depends on: D134, D136, D137. Supersedes: using `knowledge graph`, `knowledge item`, `knowledge edge`, or `claim` as future-facing product nouns. 145. **Changeset/change supersedes patch/patch_change** — Semantic mutation history uses `changeset` for one submitted semantic mutation bundle and `change` for one atomic mutation inside it. `Patch` and `patch_change` remain historical design-doc vocabulary and may appear in older file names, but new schema, capability contracts, operation ids, and planning language should use `changeset` / `change` unless this decision is explicitly reversed. Depends on: D135, D138, D143. Supersedes: treating naming as open between patch and changeset. 146. **Hard-impact edit cascade reads from the `reconciliation_need` queue, not from REVISIT walk state** — when a hard-impact `propose_edit` patch applies, the server enumerates `knowledge_edge` rows incident on the changed item under typed relation policy and opens one `reconciliation_need` row per affected pair (Path 1 from `docs/design/MULTI_CHAT.md` §5.1). 
The patch list overlay is the canonical resolution surface: open needs render as a `Pending review` section alongside staged patches, with per-row accept-on-target / edit-target / dismiss actions. The V2 `deferred: true` apply response and the "Hard impact — coming in V3 cascade preview" banner are removed at V3.0 ship. V3.0 groups needs mechanically by `kind` and relation type; agent-grouped resolution (auto-confirm / auto-edit / substantive) is V3.1 work and does not block V3.0. Side-chat thread persistence is not a V3.0 prerequisite — threads stay in-memory until MULTI_CHAT.md Phase 2. Depends on: A71, A83, A88, D80, D135, D137, D138. Supersedes: hard-edit deferral with a placeholder banner, the modal secondary-thread walk in `docs/archive/design/REVISIT_MODULE.md`, and the SIDE_CHAT.md V3 prose that pre-dated the multi-chat substrate. -147. **A future Brunch CLI should be an agent-native adapter over capability contracts, not a separate product API** — CLI-addressability belongs as a transport/tool adapter generated from or mechanically checked against Brunch-owned agent capability contracts. The CLI should inherit stable operation ids, input/output schemas, authority classes, replay policy, and reconciliation/changeset semantics from the contract registry, then project them through conventional agent-friendly CLI vocabulary (`get`, `list`, `create`, `update`, `delete`, `--json`, `--force`, `--limit`, `--dry-run`, `--wait`) with CI checks for banned aliases. A CLI adapter should default to non-interactive execution, send data to stdout and diagnostics to stderr, bound list/log output, enumerate valid values in validation errors, expose structured introspection (`brunch agent-context`) plus workflow guidance, and treat async/submitting operations as recoverable through `--wait` and a local job ledger. 
Durable Brunch writes still enter only through D143 mutation handlers; CLI profiles, delivery sinks, and feedback commands are adapter conveniences, not alternate state authority. Depends on: D139, D140, D143, A89. Supersedes: treating a future CLI as hand-written wrappers around routes or direct ORM scripts. +147. **The local agent CLI is a long-lived JSONL adapter over Brunch capability contracts** — CLI-addressability should first ship as a `brunch agent`-style local process that speaks request/response JSONL over stdin/stdout, dispatches Brunch-owned capability contracts, and keeps all product resources explicit in input payloads. The adapter may hold ambient runtime plumbing such as a DB connection, provider config, and in-flight interviewer / observer generation bookkeeping, but it must not hold hidden selected spec/chat/turn handles as command semantics. Read capabilities use `list` / `get` for structured read-model data and `read` for agent-facing projections with allowed response shapes and next-command hints. Mutations stay capability-first and surface-lazy: add only contracts needed by real probe/tool use, with an initial surface around `spec.create`, `spec.getStatus`, `spec.requestPhaseClosure(specId, phaseId?)`, `spec.requestExport`, `chat.getPrimary`, `chat.ensureReady(chatId?, timeoutMs?)`, `chat.read`, `turn.get`, and `turn.submitResponse(chatId, turnId?, response)`. `chat.ensureReady` is the idempotent synchronization/recovery command: it may trigger continuation when a chat lacks a usable next turn, can block up to a bounded timeout, and returns a derived state such as `generating`, `awaiting_response`, `idle_no_frontier`, `closed`, or `error` without requiring a durable runtime-operations ledger in the first cut. 
The LLM-as-user probe runner is a separate client of this JSONL adapter, not part of the capability server and not allowed to import DB/product handlers directly; its scenario briefs, model policy, generated transcripts, and curated golden fixture bundles are probe artifacts, not Brunch authority. Suggested module boundary: `src/server/capabilities.ts` plus `src/server/capabilities/` own contracts, schemas, handlers, and dispatch; `src/server/agent-jsonl.ts` plus `src/server/agent-jsonl/` own only protocol/session/transport; `scripts/agent-probes/` owns the outer LLM-as-user loop and artifact writing as development harness code through a JSONL client. Depends on: Requirement 43, A89, D139, D140, D143. Supersedes: treating the CLI as hand-written route wrappers, direct ORM scripts, a one-shot TUI with hidden ambient selection, or a probe runner that bypasses the same mutation surface future agents must use. +148. **Spec evolution strategies are chat-local, turn-mediated process state** — strategy is not specification-level semantic truth. A chat may be established through a first assistant/system frontier turn that offers or declares a strategy such as `step_by_step`, `scenario_options`, `targeted_cases`, `graph_review`, or `reconciliation`; globally triggered flows may create/reuse a pre-strategized chat whose first turn is the procedure kickoff. A chat can technically change strategy through later turns, but explicit switch UX is deferred. Tactical sub-strategies are allowed inside a chat, but broad mid-interview acceleration should branch into a side-chat/strategy chat rather than mutate the primary interview chat in place. Depends on: Requirement 39, Requirement 44, D138. Supersedes: treating the interviewer as one global mode per specification. +149. 
**Changesets are the atomic semantic mutation boundary, while proposal turns are not mutations until accepted** — a graph-review finding, candidate bundle, or reconciliation suggestion is the assistant/system half of an open frontier turn until the user responds. Only `accept` applies a proposal turn's semantic changeset; `revise`, `ask_followup`, `regenerate`, `defer`, and ordinary `reject` produce successor/process state rather than direct graph mutation. A changeset is the smallest semantic mutation unit that preserves coherence, and may record direct edits, candidate acceptance, reconciliation resolutions, opened reconciliation needs, or future verifier/import results. Turns should stamp the latest applied changeset id at creation (`opened_at_changeset_id` / `base_changeset_id`) so open proposals can be conservatively marked stale when the specification advances. Depends on: A71, A79, A92, D135, D145, Requirement 44. Supersedes: treating agent proposals or review findings as durable semantic truth before user/action acceptance. +150. **Relation policy owns operational directionality for intent edges** — relation names should be semantically clear, but code must not infer cascade or reconciliation behavior from raw edge source/target direction. Each relation kind declares a canonical sentence, inverse display sentence, operational-axis participation, and source-change / target-change behavior. Direct edit and hard-impact cascade enumerate incident accepted edges, then ask relation policy which endpoint, if any, receives a `reconciliation_need`. FE-700 may break current `depends_on` / `derived_from` / `constrains` / `verifies` records while expanding the ontology, but should not force every useful edge verb into one dependency direction at the expense of display, prompt context, export trace, critique, verification, candidate generation, or explanation. Depends on: A81, A88, A93, D137, D146. 
Supersedes: assuming outgoing edges from the changed item are the cascade direction. +151. **Scenario-options acceleration is product-facing, but graph review is its safety oracle** — the first user-visible alternative to long drilldown should likely be a first-turn strategy choice or mid-interview `speed this up` side-chat that generates 2–3 candidate bundles completing the current direction from context-packed accepted graph truth. Candidate bundles present named tradeoff profiles and are accepted as coherent units, not item-by-item pick lists. Fast gates (parse/schema/fixed-premise/no-obvious-conflict/tradeoff summary) can run before display; deeper graph review, coverage, checkability, provenance, and repair/refinement can run asynchronously. Depends on: A67, A84, A85, A90, A91, D126, D139, D140, D148, Requirement 31, Requirement 44. Supersedes: treating candidate-spec assist as a skip/force-close helper or as one-shot generation that can be committed without critique. +152. **Graph review and reconciliation are separate graph operations** — reconciliation is repair-oriented process debt from a known disturbance (`reconciliation_need`), while graph review is quality-oriented critique over any graph for weakness, genericity, low support, missing coverage, weak checkability, poor provenance, or maturity gaps. Broader review findings start as turn-owned structured artifacts; `reconciliation_need` remains the only first-class problem table until review issues require independent querying, filtering, badges, assignment, or lifecycle. Candidates may be accepted with represented issues if accepting also opens a graph-review frontier or appropriate process-debt records. Depends on: A91, D137, D149, D151, Requirement 44. Supersedes: overloading reconciliation as the umbrella for all graph intelligence or blocking useful imperfect specs until every review issue is repaired. 
## Interaction Stream Model @@ -373,6 +384,11 @@ Each row in this table is a **formalization candidate** ascending the progressiv | I112 | Prompt/context scenarios render from packaged markdown prompts and typed context-pack builders rather than scattered inline prompt strings; probe artifacts include deterministic rendered prompt/context fingerprints, prompt asset packaging mirrors current source assets at build time, and production prompt text has reviewable golden coverage without requiring product UI. | `prompt-loader.test.ts`, `prompt-build-boundary.test.ts`, `prompt-golden.test.ts`, `context-pack.test.ts`, `scenario-runner.test.ts` | Requirements 40, 41; D139, D140 | | I113 | Hard-impact `propose_edit` apply opens at least one `reconciliation_need` per existing typed dependency edge incident on the changed knowledge item (relations: `depends_on`, `derived_from`, `constrains`, `refines`, `verifies`), records `caused_by_turn_id` provenance, deduplicates against the partial unique index, and never returns `deferred: true` from the apply contract; resolutions transition `open → resolved` idempotently. | planned: `edit-applier.test.ts`, `reconciliation-need.test.ts`, `patch-list-overlay.test.tsx`, `app.test.ts` | Acceptance Criterion 7; A88; D135, D137, D138, D146 | | I114 | The reconciliation classifier (V3.1 `run-agent` route + `classifyNeed`) walks every awaiting open `reconciliation_need` row through the lifecycle `null → queued → classifying → classified \| failed`, persists exactly one of `auto-confirm` / `auto-edit` / `substantive` into `agent_classification` on `classified`, and writes the parser error or thrown message into `agent_proposal` on `failed`; `agent_proposal` is text-only and is never auto-applied by the server (resolution actions remain user-initiated per slice 6), so an invalid label or hallucinated proposal stays recoverable via per-row Re-run. 
| `reconciliation-agent.test.ts`, `reconciliation-agent-route.test.ts`, `reconciliation-need.test.ts`, `reconciliation-needs-route.test.ts` | Requirement 10; A88; D139 | +| I115 | The agent capability CLI remains an adapter over Brunch capability contracts: JSONL calls validate explicit resource ids and schemas, mutating calls dispatch through server-owned capability handlers rather than ORM/route bypasses, `read` projections provide affordance hints without importing scenario briefs, and the probe runner exercises the surface only through a JSONL client. | planned: `capabilities/*.test.ts`, `agent-jsonl.test.ts`, `probe-runner.test.ts` | Requirements 42, 43; A89; D143, D147 | +| I116 | Each active/resumable chat has at most one open assistant/system-first frontier turn; user responses complete that turn through normalized proposal/response semantics, and strategy is chat-local process state rather than specification-level semantic truth. | planned: `chat-substrate.test.ts`, `turn-response-transition.test.ts`, `capabilities.test.ts` | Requirement 44; D138, D148 | +| I117 | Open proposal turns are stamped with the latest applied changeset id at creation and are conservatively stale when the specification's latest changeset advances before completion; stale proposals refresh/regenerate rather than applying against unknown graph state. | planned: `changeset.test.ts`, `turn-response-transition.test.ts`, `app.test.ts` | A92; D149 | +| I118 | Reconciliation/direct-edit cascade never infers affected endpoints from raw edge direction alone; it consults relation policy source-change / target-change behavior over incident accepted edges. 
| planned: `knowledge-relationship-policy.test.ts`, `edit-impact.test.ts`, `reconciliation-need.test.ts` | A93; D137, D146, D150 | +| I119 | Scenario-option candidate bundles can only become canonical by accepting a coherent bundle changeset; accepted-with-issues candidates must also create durable follow-on review/process debt so known weaknesses are not hidden. | planned: `scenario-runner.test.ts`, `turn-artifacts.test.ts`, `changeset.test.ts` | A90, A91; D151, D152 | ## Lexicon @@ -388,6 +404,9 @@ Each row in this table is a **formalization candidate** ascending the progressiv | **scenario runner** | A lightweight pre-UI harness that runs a selected prompt scenario against fixtures, context packs, tools, and model settings, then records outputs for qualitative and structural review. Execution adapters translate this harness input into a concrete fake/model/harness call; they do not define Brunch semantics, credential UX, provider resolution, or mutation authority. | | **agent mutation surface** | The Brunch-owned typed handler layer for any durable data mutation initiated by an agent, internal or external. It is the only write entry point agents may use; handlers own schemas, authority, replay behavior, and reconciliation/changeset-ledger semantics rather than letting agents call the ORM directly. | | **agent capability contract** | A Brunch-owned typed contract addressable by agents or harnesses, with a stable id, description, input/output schemas, authority class, and replay policy. Read-only capabilities and mutating handlers can share this registry shape, but mutating contracts must route through the agent mutation surface. | +| **agent capability CLI** | A local machine-facing CLI adapter, initially a long-lived JSONL stdin/stdout process, that exposes Brunch-owned capability contracts to external agents and probe runners without defining its own product API or mutation authority. 
| +| **JSONL capability session** | The request/response transport between an external harness and `brunch agent`: every call includes an id, capability id, and explicit input resource identifiers; the process may keep DB/provider/in-flight runtime handles internally, but selected spec/chat/turn targets are not hidden ambient state. | +| **probe runner** | An external client of the agent capability CLI that supplies scenario briefs, calls an LLM-as-user, drives Brunch through capability calls, and writes generated transcript/spec/export/graph artifacts for human curation. It must not import Brunch DB or product handlers directly. | | **tool adapter** | A provider- or harness-specific projection of an agent capability contract into a concrete tool format such as AI SDK tools, Pi tools, CLI/TUI commands, or a future external-agent API. Adapters translate shape and transport while preserving Brunch-owned authority semantics. | | **authority class** | The contract metadata that says whether an agent capability is read-only, proposal-only, or commits durable product truth, and therefore which replay, reconciliation, and mutation boundaries govern it. | | **AI runtime provider** | The shared server seam that resolves the configured LLM provider, model names, API-key source, and provider-specific options for interviewer and observer calls. | @@ -405,6 +424,7 @@ Each row in this table is a **formalization candidate** ascending the progressiv | **projected control card** | A workflow affordance derived from durable state rather than authored conversational content. Kickoff, recovery, and proceed / handoff controls live here. | | **kickoff card** | A projected phase-entry control card that appears whenever an open phase is in entry-pending state and requires an explicit user action before substantive interviewer progression begins. 
| | **frontier turn** | The single actionable durable conversational turn currently at the bottom of an open phase when the phase is in substantive elicitation rather than structural control. | +| **proposal turn** | An assistant/system-first frontier turn that offers a candidate bundle, graph-review finding, reconciliation suggestion, or other proposed action. It is not a semantic mutation until the user completes it, usually by accepting, revising, asking follow-up, deferring, regenerating, or rejecting. | | **preface card** | A turn-internal artifact that presents provisional context from interviewer-invoked context gathering, rendered above a paired question card within the same turn. The observer captures from the whole turn (preface context + question + user response) as one validated unit rather than from the preface card alone. Available in any phase when the workspace directory is present. Implementation: `preface` / `PrefaceCard` / `present_preface` tool / `data-preface` part. Renders as a simple `bg-tint` rounded box with italic subdued text, not as a DrawerCard. | | **question card** | A turn card that asks a structured interviewer question and expects a substantive user response. | | **review turn** | A full-set requirements or criteria review interaction that offers a synthesized candidate list with stable reference codes, supports per-item commenting (inline comment toggle on each item) plus one optional global review note, and persists its own `reviewActions` / `reviewSet` metadata on the turn. On `request changes`, the successor review turn carries a revision card above the new review set. | @@ -421,6 +441,7 @@ Each row in this table is a **formalization candidate** ascending the progressiv | **recognition-first elicitation** | The strategy of helping users converge by reacting to concrete possibilities, tradeoffs, examples, and ruled-out directions rather than requiring them to author intent from scratch. 
| | **candidate direction** | An agent-synthesized possible specification direction offered when the user asks Brunch to fill in the rest, compare options, or react to proposed typologies. It includes rationale, implications, tradeoffs, likely generated knowledge, and what it rules out. | | **candidate-spec set** | A turn-owned interviewer artifact in grounding or design that presents one or more candidate directions for reaction-driven refinement. It is analogous to a review set in being a persisted artifact on the turn, but it proposes possible directions rather than reviewing a synthesized inventory. | +| **candidate graph bundle** | The coherent commit/review unit produced by scenario-options flows: a named scenario with tradeoff profile, generated intent items and edges, required core items, optional/swappable items, known risks, graph-review findings, provenance labels, and commit preconditions. It should be accepted or revised as a bundle rather than item-by-item unless semantic closure can be proved. | | **candidate-spec reaction** | The structured user response to a candidate-spec set, choosing whether to accept a direction, request refinement of one candidate, or regenerate a fresh set. It steers the next interview move without directly closing the phase. | | **breadth skeleton** | A turn-owned interviewer artifact used during a progressive detail pass that summarizes the current broad-pass map, highlights areas that remain shallow, and offers explicit deepening targets. | | **detail focus** | The selected area or lens for the next recursive follow-up pass. It scopes the next same-phase frontier turn without becoming a separate workflow state or durable topic tree. | @@ -469,7 +490,8 @@ Each row in this table is a **formalization candidate** ascending the progressiv | **disambiguating example** | An `example` whose primary purpose is to settle ambiguity between plausible interpretations of a requirement, invariant, or decision. 
Linked through the `disambiguates` relation. Generalizes the TiCoder move beyond test cases: the interviewer generates cases where interpretations diverge, and the user's classification settles the meaning. | | **spec drift** | A divergence between an intent item's recorded meaning and the artifact (criterion, generated requirement, candidate spec, export bundle, or downstream implementation behavior) meant to satisfy it. Surfaced in human terms — "original intent vs generated behavior vs potential mismatch" — so the user can validate meaning at the point where it could have changed, rather than after the divergence has been laundered into a final document. | | **relation family** | One of five semantic groupings that organize the relation kinds in the intent graph: `justification`, `dependency`, `boundary`, `refinement`, and `verification`. Distinct from the relation `kind` itself; a single kind belongs to exactly one family. Drives prompt grouping, default policy, and observer classification heuristics. | -| **relation policy** | The per-relation, per-axis registry that decides whether each edge participates in `visible`, `cascade`, `export_trace`, `staleness`, `reconciliation`, `criteria_help`, or `weak_suggestion` capabilities. Replaces the implicit assumption that every edge is equally authoritative. Gated by edge `support` (`explicit` / `strong_inference` / `weak_candidate`) and `status` (`proposed` / `accepted` / `rejected` / `stale`). | +| **relation policy** | The per-relation, per-axis registry that decides whether each edge participates in `visible`, `cascade`, `export_trace`, `staleness`, `reconciliation`, `criteria_help`, or `weak_suggestion` capabilities. Replaces the implicit assumption that every edge is equally authoritative. Gated by edge `support` (`explicit` / `strong_inference` / `weak_candidate`) and `status` (`proposed` / `accepted` / `rejected` / `stale`). 
It also owns operational directionality: source-change and target-change behavior must be explicit rather than inferred from raw edge direction. | +| **graph-review finding** | A turn-owned structured artifact produced by graph review. It may later lead to a changeset if accepted, but it is not itself semantic truth or process debt unless represented through a follow-on turn, changeset, or reconciliation need. | | **structured list** | The first-ship graph-view layout: kind-grouped item rows with a relations footer of Outgoing / Incoming relation chips. Item-first; relationships visible inline. It currently renders the whole-spec entity set because D129 ships the whole-spec fetch first; the intended default becomes active-path items over whole-spec data once the active-path membership seam and `Show all` toggle land. | | **spatial canvas** | A deferred future graph-view layout where intent items render as nodes with visible edges in a 2D scene. Shares the projection seam and intent contract of D128 with the structured-list layout. | | **relation chip** | A compact UI element representing one intent-edge endpoint inside a relations footer, carrying the target item's reference code and content snippet. Hover reveals a preview card; click navigates to the target item via hash anchor. | @@ -478,7 +500,7 @@ Each row in this table is a **formalization candidate** ascending the progressiv | **secondary thread** | Modal revisit conversation anchored to a primary-path turn and used to resolve cascade implications. | | **needs-revisit** | Flag meaning an item is affected by upstream invalidation and must be explicitly resolved before the specification is whole again. | | **chat** *(planned persistence seam)* | A conversation container inside one specification. The primary interview, side-chats, reconciliation chats, verifier feedback, and review discussions may all own turns without owning semantic truth directly. 
Phase one adds the table and transitional pointers before making chat ownership canonical. | -| **changeset** *(future persistence seam)* | Canonical term for one submitted semantic mutation bundle against the intent graph. It records what changed and why, separate from the conversational turn that may have initiated it. Supersedes `patch` as the future-facing schema/contract noun. | +| **changeset** *(future persistence seam)* | Canonical term for one submitted semantic mutation bundle against the intent graph. It records what changed and why, separate from the conversational turn that may have initiated it. A changeset is the smallest atomic unit that preserves graph coherence; proposals/findings become changesets only when accepted or otherwise acted on. Supersedes `patch` as the future-facing schema/contract noun. | | **change** *(future persistence seam)* | Canonical term for one atomic semantic mutation inside a changeset, such as `intentItem.create`, `intentItem.updateContent`, `intentEdge.create`, or `intentEdge.delete`. Supersedes `patch_change`. | | **patch / patch_change** | Historical design-doc vocabulary for changeset/change. Avoid in new schema, capability contracts, and operation ids unless referring to older docs or source-control-style analogy. | | **reconciliation need** *(planned persistence seam)* | Durable semantic debt saying existing intent-graph truth may require renewed judgment because an upstream item, relation, verifier, contradiction, or historical premise changed. Phase one stores directed item-to-item needs with narrow kind/status and provenance placeholders; later phases may add relation targets and changeset-backed cause/resolution. It is process state, not an intent edge or intent content. 
| From 0deb7754f76c71669693e06409b1cdfa4dbe7d53 Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Mon, 11 May 2026 16:40:55 +0200 Subject: [PATCH 03/42] FE-705: Add agent JSONL lifecycle capabilities --- src/server/agent-jsonl.test.ts | 100 ++++++++++++++++++++ src/server/agent-jsonl.ts | 89 ++++++++++++++++++ src/server/capabilities.test.ts | 74 +++++++++++++++ src/server/capabilities.ts | 125 +++++++++++++++++++++++++ src/server/capability-registry.test.ts | 14 +++ src/server/capability-registry.ts | 42 ++++++++- src/server/cli.test.ts | 44 ++++++++- src/server/cli.ts | 33 +++++-- 8 files changed, 509 insertions(+), 12 deletions(-) create mode 100644 src/server/agent-jsonl.test.ts create mode 100644 src/server/agent-jsonl.ts create mode 100644 src/server/capabilities.test.ts create mode 100644 src/server/capabilities.ts diff --git a/src/server/agent-jsonl.test.ts b/src/server/agent-jsonl.test.ts new file mode 100644 index 00000000..d1ede131 --- /dev/null +++ b/src/server/agent-jsonl.test.ts @@ -0,0 +1,100 @@ +import { mkdtempSync, rmSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { PassThrough } from 'node:stream'; + +import { afterEach, describe, expect, it } from 'vitest'; + +import { runAgentJsonlSession } from './agent-jsonl.js'; +import { createDb, type DB } from './db.js'; + +describe('agent JSONL session', () => { + const tempDirs: string[] = []; + let db: DB | null = null; + + afterEach(() => { + db?.$client.close(); + db = null; + for (const dir of tempDirs.splice(0)) { + rmSync(dir, { recursive: true, force: true }); + } + }); + + function createTempDb(): DB { + const dir = mkdtempSync(join(tmpdir(), 'brunch-agent-jsonl-')); + tempDirs.push(dir); + db = createDb(join(dir, 'brunch.db')); + return db; + } + + async function runSession(lines: string[]) { + const input = new PassThrough(); + const output = new PassThrough(); + const chunks: string[] = []; + output.on('data', (chunk) => 
chunks.push(chunk.toString())); + + const session = runAgentJsonlSession({ db: createTempDb(), input, output }); + for (const line of lines) { + input.write(`${line}\n`); + } + input.end(); + await session; + + return chunks + .join('') + .trim() + .split('\n') + .filter(Boolean) + .map((line) => JSON.parse(line) as unknown); + } + + it('creates and reads a real specification over JSONL without ambient selection', async () => { + const responses = await runSession([ + JSON.stringify({ id: 'create-1', capability: 'spec.create', input: { name: 'JSONL spec' } }), + JSON.stringify({ id: 'read-1', capability: 'spec.getStatus', input: { specId: 1 } }), + ]); + + expect(responses).toEqual([ + expect.objectContaining({ + id: 'create-1', + ok: true, + output: expect.objectContaining({ specId: 1 }), + }), + expect.objectContaining({ + id: 'read-1', + ok: true, + output: expect.objectContaining({ + specification: expect.objectContaining({ id: 1, name: 'JSONL spec' }), + }), + }), + ]); + }); + + it('returns typed error envelopes and keeps processing after recoverable errors', async () => { + const responses = await runSession([ + '{not json', + JSON.stringify({ id: 'unknown-1', capability: 'spec.delete', input: {} }), + JSON.stringify({ id: 'invalid-1', capability: 'spec.create', input: { name: '' } }), + JSON.stringify({ id: 'create-2', capability: 'spec.create', input: { name: 'Still works' } }), + ]); + + expect(responses).toEqual([ + expect.objectContaining({ + id: null, + ok: false, + error: expect.objectContaining({ code: 'invalid_json' }), + }), + expect.objectContaining({ + id: 'unknown-1', + ok: false, + error: expect.objectContaining({ code: 'unknown_capability' }), + }), + expect.objectContaining({ + id: 'invalid-1', + ok: false, + error: expect.objectContaining({ code: 'invalid_input' }), + }), + expect.objectContaining({ id: 'create-2', ok: true, output: expect.objectContaining({ specId: 1 }) }), + ]); + }); +}); diff --git a/src/server/agent-jsonl.ts 
b/src/server/agent-jsonl.ts new file mode 100644 index 00000000..f93dbd2c --- /dev/null +++ b/src/server/agent-jsonl.ts @@ -0,0 +1,89 @@ +import { createInterface } from 'node:readline/promises'; +import type { Readable, Writable } from 'node:stream'; + +import { z } from 'zod'; + +import { CapabilityDispatchError, dispatchCapability } from './capabilities.js'; +import type { DB } from './db.js'; + +const agentJsonlRequestSchema = z.object({ + id: z.string().min(1), + capability: z.string().min(1), + input: z.unknown().optional(), +}); + +export interface AgentJsonlSessionOptions { + db: DB; + input: Readable; + output: Writable; +} + +type AgentJsonlResponse = + | { id: string; ok: true; output: unknown } + | { id: string | null; ok: false; error: { code: string; message: string } }; + +function writeResponse(output: Writable, response: AgentJsonlResponse): void { + output.write(`${JSON.stringify(response)}\n`); +} + +function toErrorResponse(id: string | null, code: string, message: string): AgentJsonlResponse { + return { + id, + ok: false, + error: { code, message }, + }; +} + +function getRecoverableErrorCode(error: unknown): string { + if (error instanceof CapabilityDispatchError) { + return error.code; + } + return 'handler_failed'; +} + +export async function runAgentJsonlSession({ db, input, output }: AgentJsonlSessionOptions): Promise { + const lines = createInterface({ input, crlfDelay: Infinity }); + + for await (const line of lines) { + if (line.trim() === '') { + continue; + } + + let rawRequest: unknown; + try { + rawRequest = JSON.parse(line); + } catch { + writeResponse(output, toErrorResponse(null, 'invalid_json', 'Invalid JSONL request')); + continue; + } + + const parsedRequest = agentJsonlRequestSchema.safeParse(rawRequest); + const requestId = + rawRequest && typeof rawRequest === 'object' && 'id' in rawRequest && typeof rawRequest.id === 'string' + ? 
rawRequest.id + : null; + + if (!parsedRequest.success) { + writeResponse(output, toErrorResponse(requestId, 'invalid_request', 'Invalid JSONL request envelope')); + continue; + } + + try { + const result = await dispatchCapability({ + db, + capability: parsedRequest.data.capability, + input: parsedRequest.data.input, + }); + writeResponse(output, { id: parsedRequest.data.id, ok: true, output: result }); + } catch (error) { + writeResponse( + output, + toErrorResponse( + parsedRequest.data.id, + getRecoverableErrorCode(error), + error instanceof Error ? error.message : 'Capability dispatch failed', + ), + ); + } + } +} diff --git a/src/server/capabilities.test.ts b/src/server/capabilities.test.ts new file mode 100644 index 00000000..4ab325df --- /dev/null +++ b/src/server/capabilities.test.ts @@ -0,0 +1,74 @@ +import { mkdtempSync, rmSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; + +import { afterEach, describe, expect, it } from 'vitest'; + +import { dispatchCapability } from './capabilities.js'; +import { createDb, listSpecifications, type DB } from './db.js'; + +describe('agent capabilities', () => { + const tempDirs: string[] = []; + let db: DB | null = null; + + afterEach(() => { + db?.$client.close(); + db = null; + for (const dir of tempDirs.splice(0)) { + rmSync(dir, { recursive: true, force: true }); + } + }); + + function createTempDb(): DB { + const dir = mkdtempSync(join(tmpdir(), 'brunch-capabilities-')); + tempDirs.push(dir); + db = createDb(join(dir, 'brunch.db')); + return db; + } + + it('dispatches spec.create through a Brunch-owned handler', async () => { + const result = await dispatchCapability({ + db: createTempDb(), + capability: 'spec.create', + input: { name: 'Agent-made spec' }, + }); + + expect(result).toMatchObject({ + specId: expect.any(Number), + specification: expect.objectContaining({ name: 'Agent-made spec' }), + }); + expect(listSpecifications(db!)).toHaveLength(1); + }); + + 
it('dispatches spec.getStatus using an explicit spec id', async () => { + const activeDb = createTempDb(); + const created = await dispatchCapability({ + db: activeDb, + capability: 'spec.create', + input: { name: 'Readable spec' }, + }); + + const result = await dispatchCapability({ + db: activeDb, + capability: 'spec.getStatus', + input: { specId: created.specId }, + }); + + expect(result).toMatchObject({ + specification: expect.objectContaining({ id: created.specId, name: 'Readable spec' }), + workflow: expect.objectContaining({ + phases: expect.objectContaining({ grounding: expect.any(Object) }), + }), + }); + }); + + it('rejects schema-invalid capability input before calling handlers', async () => { + await expect( + dispatchCapability({ + db: createTempDb(), + capability: 'spec.create', + input: { name: '' }, + }), + ).rejects.toThrow('Invalid input for capability spec.create'); + }); +}); diff --git a/src/server/capabilities.ts b/src/server/capabilities.ts new file mode 100644 index 00000000..7d02ee13 --- /dev/null +++ b/src/server/capabilities.ts @@ -0,0 +1,125 @@ +import { z } from 'zod'; + +import { getCapabilityContract, type CapabilityId } from './capability-registry.js'; +import { createNewSpecification, getSpecificationState } from './core.js'; +import type { DB } from './db.js'; + +const specCreateInputSchema = z.object({ + name: z.string().trim().min(1), + mode: z.enum(['greenfield', 'brownfield']).optional(), +}); + +const specGetStatusInputSchema = z.object({ + specId: z.number().int().positive(), +}); + +const capabilityInputSchemas = { + 'spec.create': specCreateInputSchema, + 'spec.getStatus': specGetStatusInputSchema, +} as const; + +export class CapabilityDispatchError extends Error { + constructor( + message: string, + public readonly code: 'unknown_capability' | 'invalid_input' | 'handler_failed', + ) { + super(message); + this.name = 'CapabilityDispatchError'; + } +} + +export interface CapabilityDispatchContext { + db: DB; +} + +export 
interface DispatchCapabilityInput extends CapabilityDispatchContext { + capability: string; + input: unknown; +} + +type SpecCreateInput = z.infer; +type SpecGetStatusInput = z.infer; +type SpecCreateOutput = ReturnType; +type SpecGetStatusOutput = ReturnType; + +function parseSpecCreateInput(input: unknown): SpecCreateInput { + const parsed = specCreateInputSchema.safeParse(input); + if (!parsed.success) { + throw new CapabilityDispatchError('Invalid input for capability spec.create', 'invalid_input'); + } + return parsed.data; +} + +function parseSpecGetStatusInput(input: unknown): SpecGetStatusInput { + const parsed = specGetStatusInputSchema.safeParse(input); + if (!parsed.success) { + throw new CapabilityDispatchError('Invalid input for capability spec.getStatus', 'invalid_input'); + } + return parsed.data; +} + +function assertExecutableCapability( + capability: string, +): asserts capability is keyof typeof capabilityInputSchemas { + try { + getCapabilityContract(capability as CapabilityId); + } catch { + throw new CapabilityDispatchError(`Unknown capability ${capability}`, 'unknown_capability'); + } + + if (!(capability in capabilityInputSchemas)) { + throw new CapabilityDispatchError( + `Capability ${capability} has no executable handler`, + 'unknown_capability', + ); + } +} + +function createSpecificationFromCapability(db: DB, input: SpecCreateInput) { + const specification = createNewSpecification( + db, + input.name, + input.mode === 'brownfield' ? 
{ mode: input.mode } : {}, + ); + return { + specId: specification.id, + specification, + }; +} + +function getSpecificationStatusFromCapability(db: DB, input: SpecGetStatusInput) { + const state = getSpecificationState(db, input.specId); + if (!state) { + throw new CapabilityDispatchError(`Specification ${input.specId} not found`, 'handler_failed'); + } + return state; +} + +export function dispatchCapability(input: { + db: DB; + capability: 'spec.create'; + input: unknown; +}): Promise; +export function dispatchCapability(input: { + db: DB; + capability: 'spec.getStatus'; + input: unknown; +}): Promise; +export function dispatchCapability(input: DispatchCapabilityInput): Promise; +export async function dispatchCapability({ + db, + capability, + input, +}: DispatchCapabilityInput): Promise { + assertExecutableCapability(capability); + + if (capability === 'spec.create') { + return createSpecificationFromCapability(db, parseSpecCreateInput(input)); + } + + if (capability === 'spec.getStatus') { + return getSpecificationStatusFromCapability(db, parseSpecGetStatusInput(input)); + } + + throw new CapabilityDispatchError('Capability has no executable handler', 'unknown_capability'); +} diff --git a/src/server/capability-registry.test.ts b/src/server/capability-registry.test.ts index 60bb1bcb..8c12951b 100644 --- a/src/server/capability-registry.test.ts +++ b/src/server/capability-registry.test.ts @@ -41,6 +41,18 @@ describe('capability registry', () => { id: 'changeset.submit', authority: 'proposal_only', }), + expect.objectContaining({ + id: 'spec.create', + authority: 'commit_truth', + inputSchema: 'spec.create.input.v1', + outputSchema: 'spec.create.output.v1', + }), + expect.objectContaining({ + id: 'spec.getStatus', + authority: 'read_only', + inputSchema: 'spec.getStatus.input.v1', + outputSchema: 'spec.getStatus.output.v1', + }), ]); }); @@ -49,6 +61,8 @@ describe('capability registry', () => { id: 'workspace.readFile', authority: 'read_only', summary: 'Read a 
file from the workspace context.', + inputSchema: 'workspace.readFile.input.v1', + outputSchema: 'workspace.readFile.output.v1', handler: null, }); }); diff --git a/src/server/capability-registry.ts b/src/server/capability-registry.ts index 9000cac2..809d514e 100644 --- a/src/server/capability-registry.ts +++ b/src/server/capability-registry.ts @@ -14,15 +14,19 @@ export type CapabilityId = | 'intentGraph.validateEdge' | 'scenario.render' | 'observer.captureTurnIntent' - | 'changeset.submit'; + | 'changeset.submit' + | 'spec.create' + | 'spec.getStatus'; export interface CapabilityContract { id: CapabilityId; authority: CapabilityAuthority; summary: string; + inputSchema: string; + outputSchema: string; /** - * Capability contracts are metadata only for now. Runtime handlers and adapter - * tool projections must be introduced explicitly in later slices. + * Capability contracts carry transport-safe metadata here. Executable handlers + * live behind the capability dispatcher so adapters do not own product semantics. 
*/ handler: null; } @@ -32,48 +36,80 @@ const capabilityContracts = [ id: 'workspace.readFile', authority: 'read_only', summary: 'Read a file from the workspace context.', + inputSchema: 'workspace.readFile.input.v1', + outputSchema: 'workspace.readFile.output.v1', handler: null, }, { id: 'workspace.search', authority: 'read_only', summary: 'Search workspace files without mutating project or Brunch state.', + inputSchema: 'workspace.search.input.v1', + outputSchema: 'workspace.search.output.v1', handler: null, }, { id: 'web.search', authority: 'read_only', summary: 'Search the web for current external context without mutating Brunch state.', + inputSchema: 'web.search.input.v1', + outputSchema: 'web.search.output.v1', handler: null, }, { id: 'web.fetchPage', authority: 'read_only', summary: 'Fetch a web page for research context without mutating Brunch state.', + inputSchema: 'web.fetchPage.input.v1', + outputSchema: 'web.fetchPage.output.v1', handler: null, }, { id: 'intentGraph.validateEdge', authority: 'read_only', summary: 'Validate an intent graph edge against relation policy without mutating graph truth.', + inputSchema: 'intentGraph.validateEdge.input.v1', + outputSchema: 'intentGraph.validateEdge.output.v1', handler: null, }, { id: 'scenario.render', authority: 'read_only', summary: 'Render prompt scenario inputs into a reviewable probe artifact.', + inputSchema: 'scenario.render.input.v1', + outputSchema: 'scenario.render.output.v1', handler: null, }, { id: 'observer.captureTurnIntent', authority: 'commit_truth', summary: 'Capture supported intent items and edges from a validated turn.', + inputSchema: 'observer.captureTurnIntent.input.v1', + outputSchema: 'observer.captureTurnIntent.output.v1', handler: null, }, { id: 'changeset.submit', authority: 'proposal_only', summary: 'Submit proposed semantic graph changes for later validation and application.', + inputSchema: 'changeset.submit.input.v1', + outputSchema: 'changeset.submit.output.v1', + handler: 
null, + }, + { + id: 'spec.create', + authority: 'commit_truth', + summary: 'Create a new Brunch specification in the local project store.', + inputSchema: 'spec.create.input.v1', + outputSchema: 'spec.create.output.v1', + handler: null, + }, + { + id: 'spec.getStatus', + authority: 'read_only', + summary: 'Read the current workflow and active-path projection for an explicit specification id.', + inputSchema: 'spec.getStatus.input.v1', + outputSchema: 'spec.getStatus.output.v1', handler: null, }, ] as const satisfies readonly CapabilityContract[]; diff --git a/src/server/cli.test.ts b/src/server/cli.test.ts index bf211a19..35a42dbb 100644 --- a/src/server/cli.test.ts +++ b/src/server/cli.test.ts @@ -62,12 +62,13 @@ function runCommand( args: string[], cwd: string, env: NodeJS.ProcessEnv = process.env, + input?: string, ): Promise { return new Promise((resolve, reject) => { const child = spawn(command, args, { cwd, env, - stdio: ['ignore', 'pipe', 'pipe'], + stdio: [input === undefined ? 
'ignore' : 'pipe', 'pipe', 'pipe'], }); let stdout = ''; @@ -79,6 +80,9 @@ function runCommand( child.stderr?.on('data', (chunk) => { stderr += chunk.toString(); }); + if (input !== undefined) { + child.stdin?.end(input); + } child.once('error', reject); child.once('close', (code) => { resolve({ code, stdout, stderr }); @@ -117,8 +121,13 @@ async function packBuiltPackage(): Promise<{ filePaths: string[]; installedRoot: }; } -function runCli(args: string[], cwd: string, env: NodeJS.ProcessEnv = process.env): Promise { - return runCommand(process.execPath, [getInstalledBinEntrypoint(), ...args], cwd, env); +function runCli( + args: string[], + cwd: string, + env: NodeJS.ProcessEnv = process.env, + input?: string, +): Promise { + return runCommand(process.execPath, [getInstalledBinEntrypoint(), ...args], cwd, env, input); } describe('published CLI entrypoint', () => { @@ -183,6 +192,35 @@ describe('published CLI entrypoint', () => { expect(result.stdout).toContain('Usage: brunch'); }); + it('runs the packaged agent JSONL session without launching the web UI', async () => { + const workspaceCwd = makeTempDir('brunch-agent-workspace-'); + const input = `${JSON.stringify({ + id: 'create-1', + capability: 'spec.create', + input: { name: 'Packaged agent spec' }, + })}\n${JSON.stringify({ id: 'read-1', capability: 'spec.getStatus', input: { specId: 1 } })}\n`; + + const result = await runCli(['agent'], workspaceCwd, process.env, input); + const responses = result.stdout + .trim() + .split('\n') + .filter(Boolean) + .map((line) => JSON.parse(line) as unknown); + + expect(result.code).toBe(0); + expect(result.stderr).toBe(''); + expect(responses).toEqual([ + expect.objectContaining({ id: 'create-1', ok: true, output: expect.objectContaining({ specId: 1 }) }), + expect.objectContaining({ + id: 'read-1', + ok: true, + output: expect.objectContaining({ + specification: expect.objectContaining({ id: 1, name: 'Packaged agent spec' }), + }), + }), + ]); + }); + it('dry-runs the 
release flow against the packaged npm artifact seam', async () => { const result = await runCommand( 'npm', diff --git a/src/server/cli.ts b/src/server/cli.ts index afad1ea2..2b6dbf86 100644 --- a/src/server/cli.ts +++ b/src/server/cli.ts @@ -1,21 +1,42 @@ #!/usr/bin/env node +import { runAgentJsonlSession } from './agent-jsonl.js'; +import { createDb } from './db.js'; import { launch } from './launcher.js'; +import { resolveBrunchProject } from './project.js'; import { loadLocalEnvFile } from './runtime-config.js'; -const args = new Set(process.argv.slice(2)); +const rawArgs = process.argv.slice(2); +const args = new Set(rawArgs); const launchCwd = process.env.BRUNCH_LAUNCH_CWD || process.cwd(); loadLocalEnvFile(launchCwd); if (args.has('--help') || args.has('-h') || args.has('help')) { - console.log('Usage: brunch'); + console.log('Usage: brunch [agent]'); console.log(''); console.log('Launch the Brunch web UI in the current project directory.'); + console.log(''); + console.log('Commands:'); + console.log(' agent Run a JSONL capability session on stdin/stdout.'); process.exit(0); } -launch(launchCwd).catch((error) => { - console.error('Failed to start brunch:', error); - process.exit(1); -}); +if (rawArgs[0] === 'agent') { + const project = resolveBrunchProject(launchCwd); + const db = createDb(project.dbPath); + runAgentJsonlSession({ db, input: process.stdin, output: process.stdout }) + .then(() => { + db.$client.close(); + }) + .catch((error) => { + db.$client.close(); + console.error('Failed to run brunch agent session:', error); + process.exit(1); + }); +} else { + launch(launchCwd).catch((error) => { + console.error('Failed to start brunch:', error); + process.exit(1); + }); +} From 27e28e51e251f6dd6deeacce14c1b38ae1963939 Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Mon, 11 May 2026 16:50:01 +0200 Subject: [PATCH 04/42] FE-705: Add agent chat read capabilities --- src/server/agent-jsonl.test.ts | 57 +++++++++ src/server/capabilities.test.ts | 92 
++++++++++++++- src/server/capabilities.ts | 153 +++++++++++++++++++++++++ src/server/capability-registry.test.ts | 12 ++ src/server/capability-registry.ts | 20 +++- 5 files changed, 330 insertions(+), 4 deletions(-) diff --git a/src/server/agent-jsonl.test.ts b/src/server/agent-jsonl.test.ts index d1ede131..f820dfa4 100644 --- a/src/server/agent-jsonl.test.ts +++ b/src/server/agent-jsonl.test.ts @@ -70,6 +70,63 @@ describe('agent JSONL session', () => { ]); }); + it('creates and reads the primary chat projection over JSONL using explicit ids', async () => { + const responses = await runSession([ + JSON.stringify({ id: 'create-1', capability: 'spec.create', input: { name: 'JSONL chat spec' } }), + JSON.stringify({ id: 'primary-1', capability: 'chat.getPrimary', input: { specId: 1 } }), + JSON.stringify({ id: 'chat-1', capability: 'chat.read', input: { chatId: 1 } }), + ]); + + expect(responses).toEqual([ + expect.objectContaining({ id: 'create-1', ok: true, output: expect.objectContaining({ specId: 1 }) }), + expect.objectContaining({ + id: 'primary-1', + ok: true, + output: { specId: 1, chatId: 1, kind: 'interview', activeTurnId: null }, + }), + expect.objectContaining({ + id: 'chat-1', + ok: true, + output: expect.objectContaining({ + specification: { id: 1, name: 'JSONL chat spec', mode: 'greenfield' }, + chat: { id: 1, specificationId: 1, kind: 'interview', activeTurnId: null }, + frontier: { state: 'idle_no_frontier', phase: 'grounding', turnId: null }, + nextCommands: [{ capability: 'chat.ensureReady', input: { chatId: 1 } }], + }), + }), + ]); + }); + + it('returns typed chat read errors without crashing the session', async () => { + const responses = await runSession([ + JSON.stringify({ id: 'missing-chat', capability: 'chat.read', input: { chatId: 999 } }), + JSON.stringify({ id: 'invalid-chat', capability: 'chat.read', input: { chatId: 0 } }), + JSON.stringify({ + id: 'create-after-chat-errors', + capability: 'spec.create', + input: { name: 'Still works' 
}, + }), + ]); + + expect(responses).toEqual([ + expect.objectContaining({ + id: 'missing-chat', + ok: false, + error: expect.objectContaining({ code: 'handler_failed' }), + }), + expect.objectContaining({ + id: 'invalid-chat', + ok: false, + error: expect.objectContaining({ code: 'invalid_input' }), + }), + expect.objectContaining({ + id: 'create-after-chat-errors', + ok: true, + output: expect.objectContaining({ specId: 1 }), + }), + ]); + }); + it('returns typed error envelopes and keeps processing after recoverable errors', async () => { const responses = await runSession([ '{not json', diff --git a/src/server/capabilities.test.ts b/src/server/capabilities.test.ts index 4ab325df..a98a63da 100644 --- a/src/server/capabilities.test.ts +++ b/src/server/capabilities.test.ts @@ -5,7 +5,7 @@ import { join } from 'node:path'; import { afterEach, describe, expect, it } from 'vitest'; import { dispatchCapability } from './capabilities.js'; -import { createDb, listSpecifications, type DB } from './db.js'; +import { advanceHead, createDb, createTurn, listSpecifications, type DB } from './db.js'; describe('agent capabilities', () => { const tempDirs: string[] = []; @@ -62,10 +62,96 @@ describe('agent capabilities', () => { }); }); - it('rejects schema-invalid capability input before calling handlers', async () => { + it('dispatches chat.getPrimary for an explicit spec id', async () => { + const activeDb = createTempDb(); + const created = await dispatchCapability({ + db: activeDb, + capability: 'spec.create', + input: { name: 'Chat owner' }, + }); + + const result = await dispatchCapability({ + db: activeDb, + capability: 'chat.getPrimary', + input: { specId: created.specId }, + }); + + expect(result).toEqual({ + specId: created.specId, + chatId: expect.any(Number), + kind: 'interview', + activeTurnId: null, + }); + }); + + it('dispatches chat.read as a compact agent-facing projection with next-command hints', async () => { + const activeDb = createTempDb(); + const 
created = await dispatchCapability({ + db: activeDb, + capability: 'spec.create', + input: { name: 'Chat readable' }, + }); + const turn = createTurn(activeDb, created.specId, { + parent_turn_id: null, + phase: 'grounding', + question: 'What are you trying to build?', + answer: null, + assistant_parts: null, + user_parts: null, + }); + advanceHead(activeDb, created.specId, turn.id); + const primary = await dispatchCapability({ + db: activeDb, + capability: 'chat.getPrimary', + input: { specId: created.specId }, + }); + + const result = await dispatchCapability({ + db: activeDb, + capability: 'chat.read', + input: { chatId: primary.chatId }, + }); + + expect(result).toEqual({ + specification: { id: created.specId, name: 'Chat readable', mode: 'greenfield' }, + chat: { + id: primary.chatId, + specificationId: created.specId, + kind: 'interview', + activeTurnId: turn.id, + }, + frontier: { state: 'awaiting_response', phase: 'grounding', turnId: turn.id }, + turns: [ + { + id: turn.id, + phase: 'grounding', + kind: 'question', + question: 'What are you trying to build?', + answer: null, + isResolution: false, + options: [], + capturedItems: [], + }, + ], + nextCommands: [ + { capability: 'turn.submitResponse', input: { chatId: primary.chatId, turnId: turn.id } }, + ], + }); + }); + + it('rejects unknown chat ids and schema-invalid capability input before calling handlers', async () => { + const activeDb = createTempDb(); + await expect( + dispatchCapability({ + db: activeDb, + capability: 'chat.read', + input: { chatId: 999 }, + }), + ).rejects.toThrow('Chat 999 not found'); + await expect( dispatchCapability({ - db: createTempDb(), + db: activeDb, capability: 'spec.create', input: { name: '' }, }), diff --git a/src/server/capabilities.ts b/src/server/capabilities.ts index 7d02ee13..d7841805 100644 --- a/src/server/capabilities.ts +++ b/src/server/capabilities.ts @@ -1,8 +1,10 @@ +import { eq } from 'drizzle-orm'; import { z } from 'zod'; import { 
getCapabilityContract, type CapabilityId } from './capability-registry.js'; import { createNewSpecification, getSpecificationState } from './core.js'; import type { DB } from './db.js'; +import * as schema from './schema.js'; const specCreateInputSchema = z.object({ name: z.string().trim().min(1), @@ -13,9 +15,19 @@ const specGetStatusInputSchema = z.object({ specId: z.number().int().positive(), }); +const chatGetPrimaryInputSchema = z.object({ + specId: z.number().int().positive(), +}); + +const chatReadInputSchema = z.object({ + chatId: z.number().int().positive(), +}); + const capabilityInputSchemas = { 'spec.create': specCreateInputSchema, 'spec.getStatus': specGetStatusInputSchema, + 'chat.getPrimary': chatGetPrimaryInputSchema, + 'chat.read': chatReadInputSchema, } as const; export class CapabilityDispatchError extends Error { @@ -39,8 +51,12 @@ export interface DispatchCapabilityInput extends CapabilityDispatchContext { type SpecCreateInput = z.infer; type SpecGetStatusInput = z.infer; +type ChatGetPrimaryInput = z.infer; +type ChatReadInput = z.infer; type SpecCreateOutput = ReturnType; type SpecGetStatusOutput = ReturnType; +type ChatGetPrimaryOutput = ReturnType; +type ChatReadOutput = ReturnType; function parseSpecCreateInput(input: unknown): SpecCreateInput { const parsed = specCreateInputSchema.safeParse(input); @@ -58,6 +74,22 @@ function parseSpecGetStatusInput(input: unknown): SpecGetStatusInput { return parsed.data; } +function parseChatGetPrimaryInput(input: unknown): ChatGetPrimaryInput { + const parsed = chatGetPrimaryInputSchema.safeParse(input); + if (!parsed.success) { + throw new CapabilityDispatchError('Invalid input for capability chat.getPrimary', 'invalid_input'); + } + return parsed.data; +} + +function parseChatReadInput(input: unknown): ChatReadInput { + const parsed = chatReadInputSchema.safeParse(input); + if (!parsed.success) { + throw new CapabilityDispatchError('Invalid input for capability chat.read', 'invalid_input'); + } + 
return parsed.data; +} + function assertExecutableCapability( capability: string, ): asserts capability is keyof typeof capabilityInputSchemas { @@ -95,6 +127,109 @@ function getSpecificationStatusFromCapability(db: DB, input: SpecGetStatusInput) return state; } +function getPrimaryChatFromCapability(db: DB, input: ChatGetPrimaryInput) { + const specification = db + .select({ + id: schema.specification.id, + primary_chat_id: schema.specification.primary_chat_id, + }) + .from(schema.specification) + .where(eq(schema.specification.id, input.specId)) + .get(); + + if (!specification) { + throw new CapabilityDispatchError(`Specification ${input.specId} not found`, 'handler_failed'); + } + if (!specification.primary_chat_id) { + throw new CapabilityDispatchError(`Specification ${input.specId} has no primary chat`, 'handler_failed'); + } + + const chat = db + .select({ + id: schema.chat.id, + specification_id: schema.chat.specification_id, + kind: schema.chat.kind, + active_turn_id: schema.chat.active_turn_id, + }) + .from(schema.chat) + .where(eq(schema.chat.id, specification.primary_chat_id)) + .get(); + + if (!chat || chat.specification_id !== input.specId) { + throw new CapabilityDispatchError( + `Primary chat for specification ${input.specId} not found`, + 'handler_failed', + ); + } + + return { + specId: input.specId, + chatId: chat.id, + kind: chat.kind, + activeTurnId: chat.active_turn_id, + }; +} + +function getChatById(db: DB, chatId: number) { + return db + .select({ + id: schema.chat.id, + specification_id: schema.chat.specification_id, + kind: schema.chat.kind, + active_turn_id: schema.chat.active_turn_id, + }) + .from(schema.chat) + .where(eq(schema.chat.id, chatId)) + .get(); +} + +function readChatFromCapability(db: DB, input: ChatReadInput) { + const chat = getChatById(db, input.chatId); + if (!chat) { + throw new CapabilityDispatchError(`Chat ${input.chatId} not found`, 'handler_failed'); + } + + const state = getSpecificationState(db, 
chat.specification_id); + if (!state) { + throw new CapabilityDispatchError(`Specification ${chat.specification_id} not found`, 'handler_failed'); + } + + const currentPhase = state.workflow.phases.grounding.status === 'closed' ? 'design' : 'grounding'; + const activeTurn = state.turns.find((turn) => turn.id === chat.active_turn_id) ?? null; + const frontier = activeTurn + ? { state: 'awaiting_response' as const, phase: activeTurn.phase, turnId: activeTurn.id } + : { state: 'idle_no_frontier' as const, phase: currentPhase, turnId: null }; + const nextCommands = activeTurn + ? [{ capability: 'turn.submitResponse', input: { chatId: chat.id, turnId: activeTurn.id } }] + : [{ capability: 'chat.ensureReady', input: { chatId: chat.id } }]; + + return { + specification: { + id: state.specification.id, + name: state.specification.name, + mode: state.specification.mode, + }, + chat: { + id: chat.id, + specificationId: chat.specification_id, + kind: chat.kind, + activeTurnId: chat.active_turn_id, + }, + frontier, + turns: state.turns.map((turn) => ({ + id: turn.id, + phase: turn.phase, + kind: turn.turn_kind ?? 'question', + question: turn.question, + answer: turn.answer, + isResolution: Boolean(turn.is_resolution), + options: turn.options ?? [], + capturedItems: turn.captured_items ?? 
[], + })), + nextCommands, + }; +} + export function dispatchCapability(input: { db: DB; capability: 'spec.create'; @@ -105,6 +240,16 @@ export function dispatchCapability(input: { capability: 'spec.getStatus'; input: unknown; }): Promise; +export function dispatchCapability(input: { + db: DB; + capability: 'chat.getPrimary'; + input: unknown; +}): Promise; +export function dispatchCapability(input: { + db: DB; + capability: 'chat.read'; + input: unknown; +}): Promise; export function dispatchCapability(input: DispatchCapabilityInput): Promise; export async function dispatchCapability({ db, @@ -121,5 +266,13 @@ export async function dispatchCapability({ return getSpecificationStatusFromCapability(db, parseSpecGetStatusInput(input)); } + if (capability === 'chat.getPrimary') { + return getPrimaryChatFromCapability(db, parseChatGetPrimaryInput(input)); + } + + if (capability === 'chat.read') { + return readChatFromCapability(db, parseChatReadInput(input)); + } + throw new CapabilityDispatchError('Capability has no executable handler', 'unknown_capability'); } diff --git a/src/server/capability-registry.test.ts b/src/server/capability-registry.test.ts index 8c12951b..ac627993 100644 --- a/src/server/capability-registry.test.ts +++ b/src/server/capability-registry.test.ts @@ -53,6 +53,18 @@ describe('capability registry', () => { inputSchema: 'spec.getStatus.input.v1', outputSchema: 'spec.getStatus.output.v1', }), + expect.objectContaining({ + id: 'chat.getPrimary', + authority: 'read_only', + inputSchema: 'chat.getPrimary.input.v1', + outputSchema: 'chat.getPrimary.output.v1', + }), + expect.objectContaining({ + id: 'chat.read', + authority: 'read_only', + inputSchema: 'chat.read.input.v1', + outputSchema: 'chat.read.output.v1', + }), ]); }); diff --git a/src/server/capability-registry.ts b/src/server/capability-registry.ts index 809d514e..5a2d9d46 100644 --- a/src/server/capability-registry.ts +++ b/src/server/capability-registry.ts @@ -16,7 +16,9 @@ export type 
CapabilityId = | 'observer.captureTurnIntent' | 'changeset.submit' | 'spec.create' - | 'spec.getStatus'; + | 'spec.getStatus' + | 'chat.getPrimary' + | 'chat.read'; export interface CapabilityContract { id: CapabilityId; @@ -112,6 +114,22 @@ const capabilityContracts = [ outputSchema: 'spec.getStatus.output.v1', handler: null, }, + { + id: 'chat.getPrimary', + authority: 'read_only', + summary: 'Read the primary interview chat identity for an explicit specification id.', + inputSchema: 'chat.getPrimary.input.v1', + outputSchema: 'chat.getPrimary.output.v1', + handler: null, + }, + { + id: 'chat.read', + authority: 'read_only', + summary: 'Read a compact agent-facing projection for an explicit chat id.', + inputSchema: 'chat.read.input.v1', + outputSchema: 'chat.read.output.v1', + handler: null, + }, ] as const satisfies readonly CapabilityContract[]; const capabilityContractsById = new Map( From 696c8b63bbe653b0916ecb352055ea386481802e Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Mon, 11 May 2026 16:57:25 +0200 Subject: [PATCH 05/42] FE-705: Add deterministic chat readiness --- src/server/agent-jsonl.test.ts | 40 +++++++++++ src/server/capabilities.test.ts | 90 +++++++++++++++++++++++- src/server/capabilities.ts | 96 +++++++++++++++++++++++++- src/server/capability-registry.test.ts | 6 ++ src/server/capability-registry.ts | 11 ++- 5 files changed, 239 insertions(+), 4 deletions(-) diff --git a/src/server/agent-jsonl.test.ts b/src/server/agent-jsonl.test.ts index f820dfa4..8498acf7 100644 --- a/src/server/agent-jsonl.test.ts +++ b/src/server/agent-jsonl.test.ts @@ -97,10 +97,45 @@ describe('agent JSONL session', () => { ]); }); + it('ensures chat readiness and then reads the active frontier over JSONL', async () => { + const responses = await runSession([ + JSON.stringify({ id: 'create-1', capability: 'spec.create', input: { name: 'JSONL ready spec' } }), + JSON.stringify({ id: 'primary-1', capability: 'chat.getPrimary', input: { specId: 1 } }), + JSON.stringify({ 
id: 'ready-1', capability: 'chat.ensureReady', input: { chatId: 1 } }), + JSON.stringify({ id: 'chat-1', capability: 'chat.read', input: { chatId: 1 } }), + ]); + + expect(responses).toEqual([ + expect.objectContaining({ id: 'create-1', ok: true, output: expect.objectContaining({ specId: 1 }) }), + expect.objectContaining({ id: 'primary-1', ok: true, output: expect.objectContaining({ chatId: 1 }) }), + expect.objectContaining({ + id: 'ready-1', + ok: true, + output: expect.objectContaining({ + chatId: 1, + specId: 1, + state: 'needs_generation', + turnId: 1, + }), + }), + expect.objectContaining({ + id: 'chat-1', + ok: true, + output: expect.objectContaining({ + chat: { id: 1, specificationId: 1, kind: 'interview', activeTurnId: 1 }, + frontier: { state: 'needs_generation', phase: 'grounding', turnId: 1 }, + turns: [expect.objectContaining({ id: 1, phase: 'grounding', question: '', answer: null })], + nextCommands: [{ capability: 'turn.submitResponse', input: { chatId: 1, turnId: 1 } }], + }), + }), + ]); + }); + it('returns typed chat read errors without crashing the session', async () => { const responses = await runSession([ JSON.stringify({ id: 'missing-chat', capability: 'chat.read', input: { chatId: 999 } }), JSON.stringify({ id: 'invalid-chat', capability: 'chat.read', input: { chatId: 0 } }), + JSON.stringify({ id: 'missing-ready', capability: 'chat.ensureReady', input: { chatId: 999 } }), JSON.stringify({ id: 'create-after-chat-errors', capability: 'spec.create', @@ -119,6 +154,11 @@ describe('agent JSONL session', () => { ok: false, error: expect.objectContaining({ code: 'invalid_input' }), }), + expect.objectContaining({ + id: 'missing-ready', + ok: false, + error: expect.objectContaining({ code: 'handler_failed' }), + }), expect.objectContaining({ id: 'create-after-chat-errors', ok: true, diff --git a/src/server/capabilities.test.ts b/src/server/capabilities.test.ts index a98a63da..bffa520c 100644 --- a/src/server/capabilities.test.ts +++ 
b/src/server/capabilities.test.ts @@ -5,7 +5,15 @@ import { join } from 'node:path'; import { afterEach, describe, expect, it } from 'vitest'; import { dispatchCapability } from './capabilities.js'; -import { advanceHead, createDb, createTurn, listSpecifications, type DB } from './db.js'; +import { + advanceHead, + createDb, + createTurn, + getActivePath, + getSpecification, + listSpecifications, + type DB, +} from './db.js'; describe('agent capabilities', () => { const tempDirs: string[] = []; @@ -139,6 +147,78 @@ describe('agent capabilities', () => { }); }); + it('dispatches chat.ensureReady by materializing a deterministic empty frontier', async () => { + const activeDb = createTempDb(); + const created = await dispatchCapability({ + db: activeDb, + capability: 'spec.create', + input: { name: 'Ready spec' }, + }); + const primary = await dispatchCapability({ + db: activeDb, + capability: 'chat.getPrimary', + input: { specId: created.specId }, + }); + + const result = await dispatchCapability({ + db: activeDb, + capability: 'chat.ensureReady', + input: { chatId: primary.chatId }, + }); + const activePath = getActivePath(activeDb, created.specId); + + expect(result).toEqual({ + chatId: primary.chatId, + specId: created.specId, + state: 'needs_generation', + turnId: expect.any(Number), + nextCommands: [{ capability: 'chat.read', input: { chatId: primary.chatId } }], + }); + expect(activePath).toHaveLength(1); + expect(activePath[0]).toMatchObject({ + id: result.turnId, + phase: 'grounding', + question: '', + answer: null, + }); + expect(getSpecification(activeDb, created.specId)?.active_turn_id).toBe(result.turnId); + expect( + await dispatchCapability({ + db: activeDb, + capability: 'chat.getPrimary', + input: { specId: created.specId }, + }), + ).toMatchObject({ activeTurnId: result.turnId }); + }); + + it('keeps chat.ensureReady idempotent when a frontier already exists', async () => { + const activeDb = createTempDb(); + const created = await 
dispatchCapability({ + db: activeDb, + capability: 'spec.create', + input: { name: 'Idempotent readiness' }, + }); + const primary = await dispatchCapability({ + db: activeDb, + capability: 'chat.getPrimary', + input: { specId: created.specId }, + }); + + const first = await dispatchCapability({ + db: activeDb, + capability: 'chat.ensureReady', + input: { chatId: primary.chatId }, + }); + const second = await dispatchCapability({ + db: activeDb, + capability: 'chat.ensureReady', + input: { chatId: primary.chatId }, + }); + + expect(second).toEqual(first); + expect(getActivePath(activeDb, created.specId)).toHaveLength(1); + }); + it('rejects unknown chat ids and schema-invalid capability input before calling handlers', async () => { const activeDb = createTempDb(); await expect( @@ -149,6 +229,14 @@ describe('agent capabilities', () => { }), ).rejects.toThrow('Chat 999 not found'); + await expect( + dispatchCapability({ + db: activeDb, + capability: 'chat.ensureReady', + input: { chatId: 999 }, + }), + ).rejects.toThrow('Chat 999 not found'); + await expect( dispatchCapability({ db: activeDb, diff --git a/src/server/capabilities.ts b/src/server/capabilities.ts index d7841805..8e5d5d03 100644 --- a/src/server/capabilities.ts +++ b/src/server/capabilities.ts @@ -2,7 +2,8 @@ import { eq } from 'drizzle-orm'; import { z } from 'zod'; import { getCapabilityContract, type CapabilityId } from './capability-registry.js'; -import { createNewSpecification, getSpecificationState } from './core.js'; +import { applyChatRouteTransition } from './chat-route-transition.js'; +import { createNewSpecification, finalizeTurn, getSpecificationState } from './core.js'; import type { DB } from './db.js'; import * as schema from './schema.js'; @@ -23,11 +24,16 @@ const chatReadInputSchema = z.object({ chatId: z.number().int().positive(), }); +const chatEnsureReadyInputSchema = z.object({ + chatId: z.number().int().positive(), +}); + const capabilityInputSchemas = { 'spec.create': 
specCreateInputSchema, 'spec.getStatus': specGetStatusInputSchema, 'chat.getPrimary': chatGetPrimaryInputSchema, 'chat.read': chatReadInputSchema, + 'chat.ensureReady': chatEnsureReadyInputSchema, } as const; export class CapabilityDispatchError extends Error { @@ -53,10 +59,12 @@ type SpecCreateInput = z.infer; type SpecGetStatusInput = z.infer; type ChatGetPrimaryInput = z.infer; type ChatReadInput = z.infer; +type ChatEnsureReadyInput = z.infer; type SpecCreateOutput = ReturnType; type SpecGetStatusOutput = ReturnType; type ChatGetPrimaryOutput = ReturnType; type ChatReadOutput = ReturnType; +type ChatEnsureReadyOutput = ReturnType; function parseSpecCreateInput(input: unknown): SpecCreateInput { const parsed = specCreateInputSchema.safeParse(input); @@ -90,6 +98,14 @@ function parseChatReadInput(input: unknown): ChatReadInput { return parsed.data; } +function parseChatEnsureReadyInput(input: unknown): ChatEnsureReadyInput { + const parsed = chatEnsureReadyInputSchema.safeParse(input); + if (!parsed.success) { + throw new CapabilityDispatchError('Invalid input for capability chat.ensureReady', 'invalid_input'); + } + return parsed.data; +} + function assertExecutableCapability( capability: string, ): asserts capability is keyof typeof capabilityInputSchemas { @@ -183,6 +199,10 @@ function getChatById(db: DB, chatId: number) { .get(); } +function getReadyStateForTurn(turn: { question: string; answer: string | null }) { + return turn.answer === null && turn.question.trim() === '' ? 'needs_generation' : 'awaiting_response'; +} + function readChatFromCapability(db: DB, input: ChatReadInput) { const chat = getChatById(db, input.chatId); if (!chat) { @@ -197,7 +217,7 @@ function readChatFromCapability(db: DB, input: ChatReadInput) { const currentPhase = state.workflow.phases.grounding.status === 'closed' ? 'design' : 'grounding'; const activeTurn = state.turns.find((turn) => turn.id === chat.active_turn_id) ?? null; const frontier = activeTurn - ? 
{ state: 'awaiting_response' as const, phase: activeTurn.phase, turnId: activeTurn.id } + ? { state: getReadyStateForTurn(activeTurn), phase: activeTurn.phase, turnId: activeTurn.id } : { state: 'idle_no_frontier' as const, phase: currentPhase, turnId: null }; const nextCommands = activeTurn ? [{ capability: 'turn.submitResponse', input: { chatId: chat.id, turnId: activeTurn.id } }] @@ -230,6 +250,69 @@ function readChatFromCapability(db: DB, input: ChatReadInput) { }; } +function ensureChatReadyFromCapability(db: DB, input: ChatEnsureReadyInput) { + const chat = getChatById(db, input.chatId); + if (!chat) { + throw new CapabilityDispatchError(`Chat ${input.chatId} not found`, 'handler_failed'); + } + + const state = getSpecificationState(db, chat.specification_id); + if (!state) { + throw new CapabilityDispatchError(`Specification ${chat.specification_id} not found`, 'handler_failed'); + } + + const activeTurn = state.turns.find((turn) => turn.id === chat.active_turn_id) ?? null; + if (activeTurn) { + return { + chatId: chat.id, + specId: chat.specification_id, + state: getReadyStateForTurn(activeTurn), + turnId: activeTurn.id, + nextCommands: [{ capability: 'chat.read', input: { chatId: chat.id } }], + }; + } + + const landing = state.landing; + if (!landing || landing.kind === 'frontier-turn') { + throw new CapabilityDispatchError( + `Chat ${chat.id} is not ready for deterministic entry`, + 'handler_failed', + ); + } + + const request = + landing.kind === 'kickoff' + ? 
{ kind: 'phase-entry' as const, phase: landing.phase } + : { kind: 'phase-continue' as const, phase: landing.phase }; + const transition = applyChatRouteTransition( + { db, specificationId: chat.specification_id }, + { + kind: 'phase-entry', + request, + }, + ); + + if (!transition.ok) { + throw new CapabilityDispatchError(transition.message, 'handler_failed'); + } + if (transition.kind !== 'interviewer-turn') { + throw new CapabilityDispatchError( + `Chat ${chat.id} did not produce an interviewer frontier`, + 'handler_failed', + ); + } + + finalizeTurn(db, chat.specification_id, transition.prepared.turn.id); + + return { + chatId: chat.id, + specId: chat.specification_id, + state: 'needs_generation' as const, + turnId: transition.prepared.turn.id, + nextCommands: [{ capability: 'chat.read', input: { chatId: chat.id } }], + }; +} + export function dispatchCapability(input: { db: DB; capability: 'spec.create'; @@ -250,6 +333,11 @@ export function dispatchCapability(input: { capability: 'chat.read'; input: unknown; }): Promise; +export function dispatchCapability(input: { + db: DB; + capability: 'chat.ensureReady'; + input: unknown; +}): Promise; export function dispatchCapability(input: DispatchCapabilityInput): Promise; export async function dispatchCapability({ db, @@ -274,5 +362,9 @@ export async function dispatchCapability({ return readChatFromCapability(db, parseChatReadInput(input)); } + if (capability === 'chat.ensureReady') { + return ensureChatReadyFromCapability(db, parseChatEnsureReadyInput(input)); + } + throw new CapabilityDispatchError('Capability has no executable handler', 'unknown_capability'); } diff --git a/src/server/capability-registry.test.ts b/src/server/capability-registry.test.ts index ac627993..95c6c211 100644 --- a/src/server/capability-registry.test.ts +++ b/src/server/capability-registry.test.ts @@ -65,6 +65,12 @@ describe('capability registry', () => { inputSchema: 'chat.read.input.v1', outputSchema: 'chat.read.output.v1', }), + 
expect.objectContaining({ + id: 'chat.ensureReady', + authority: 'runtime_replay', + inputSchema: 'chat.ensureReady.input.v1', + outputSchema: 'chat.ensureReady.output.v1', + }), ]); }); diff --git a/src/server/capability-registry.ts b/src/server/capability-registry.ts index 5a2d9d46..8be180d0 100644 --- a/src/server/capability-registry.ts +++ b/src/server/capability-registry.ts @@ -18,7 +18,8 @@ export type CapabilityId = | 'spec.create' | 'spec.getStatus' | 'chat.getPrimary' - | 'chat.read'; + | 'chat.read' + | 'chat.ensureReady'; export interface CapabilityContract { id: CapabilityId; @@ -130,6 +131,14 @@ const capabilityContracts = [ outputSchema: 'chat.read.output.v1', handler: null, }, + { + id: 'chat.ensureReady', + authority: 'runtime_replay', + summary: 'Materialize deterministic chat readiness for an explicit chat id without generation.', + inputSchema: 'chat.ensureReady.input.v1', + outputSchema: 'chat.ensureReady.output.v1', + handler: null, + }, ] as const satisfies readonly CapabilityContract[]; const capabilityContractsById = new Map( From 32a65e982971ffa837985838e5335a9c38368b4c Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Mon, 11 May 2026 17:10:30 +0200 Subject: [PATCH 06/42] FE-705: Generate agent chat readiness --- src/server/agent-jsonl.test.ts | 23 +++++-- src/server/agent-jsonl.ts | 15 ++++- src/server/capabilities.test.ts | 39 ++++++++--- src/server/capabilities.ts | 113 ++++++++++++++++++++++++++++++-- 4 files changed, 169 insertions(+), 21 deletions(-) diff --git a/src/server/agent-jsonl.test.ts b/src/server/agent-jsonl.test.ts index 8498acf7..12a41496 100644 --- a/src/server/agent-jsonl.test.ts +++ b/src/server/agent-jsonl.test.ts @@ -33,7 +33,15 @@ describe('agent JSONL session', () => { const chunks: string[] = []; output.on('data', (chunk) => chunks.push(chunk.toString())); - const session = runAgentJsonlSession({ db: createTempDb(), input, output }); + const session = runAgentJsonlSession({ + db: createTempDb(), + input, + output, + 
generateAnswerableFrontier: async () => ({ + question: 'What are you trying to build?', + assistantParts: [{ type: 'text', text: 'What are you trying to build?' }], + }), + }); for (const line of lines) { input.write(`${line}\n`); } @@ -114,7 +122,7 @@ describe('agent JSONL session', () => { output: expect.objectContaining({ chatId: 1, specId: 1, - state: 'needs_generation', + state: 'awaiting_response', turnId: 1, }), }), @@ -123,8 +131,15 @@ describe('agent JSONL session', () => { ok: true, output: expect.objectContaining({ chat: { id: 1, specificationId: 1, kind: 'interview', activeTurnId: 1 }, - frontier: { state: 'needs_generation', phase: 'grounding', turnId: 1 }, - turns: [expect.objectContaining({ id: 1, phase: 'grounding', question: '', answer: null })], + frontier: { state: 'awaiting_response', phase: 'grounding', turnId: 1 }, + turns: [ + expect.objectContaining({ + id: 1, + phase: 'grounding', + question: expect.stringMatching(/What are you trying to build/), + answer: null, + }), + ], nextCommands: [{ capability: 'turn.submitResponse', input: { chatId: 1, turnId: 1 } }], }), }), diff --git a/src/server/agent-jsonl.ts b/src/server/agent-jsonl.ts index f93dbd2c..9fbd0fdd 100644 --- a/src/server/agent-jsonl.ts +++ b/src/server/agent-jsonl.ts @@ -3,7 +3,11 @@ import type { Readable, Writable } from 'node:stream'; import { z } from 'zod'; -import { CapabilityDispatchError, dispatchCapability } from './capabilities.js'; +import { + CapabilityDispatchError, + dispatchCapability, + type GenerateAnswerableFrontier, +} from './capabilities.js'; import type { DB } from './db.js'; const agentJsonlRequestSchema = z.object({ @@ -16,6 +20,7 @@ export interface AgentJsonlSessionOptions { db: DB; input: Readable; output: Writable; + generateAnswerableFrontier?: GenerateAnswerableFrontier; } type AgentJsonlResponse = @@ -41,7 +46,12 @@ function getRecoverableErrorCode(error: unknown): string { return 'handler_failed'; } -export async function runAgentJsonlSession({ db, 
input, output }: AgentJsonlSessionOptions): Promise { +export async function runAgentJsonlSession({ + db, + input, + output, + generateAnswerableFrontier, +}: AgentJsonlSessionOptions): Promise { const lines = createInterface({ input, crlfDelay: Infinity }); for await (const line of lines) { @@ -73,6 +83,7 @@ export async function runAgentJsonlSession({ db, input, output }: AgentJsonlSess db, capability: parsedRequest.data.capability, input: parsedRequest.data.input, + generateAnswerableFrontier, }); writeResponse(output, { id: parsedRequest.data.id, ok: true, output: result }); } catch (error) { diff --git a/src/server/capabilities.test.ts b/src/server/capabilities.test.ts index bffa520c..590dd778 100644 --- a/src/server/capabilities.test.ts +++ b/src/server/capabilities.test.ts @@ -2,7 +2,7 @@ import { mkdtempSync, rmSync } from 'node:fs'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; -import { afterEach, describe, expect, it } from 'vitest'; +import { afterEach, describe, expect, it, vi } from 'vitest'; import { dispatchCapability } from './capabilities.js'; import { @@ -147,8 +147,12 @@ describe('agent capabilities', () => { }); }); - it('dispatches chat.ensureReady by materializing a deterministic empty frontier', async () => { + it('dispatches chat.ensureReady by generating an answerable frontier', async () => { const activeDb = createTempDb(); + const generateAnswerableFrontier = vi.fn(async () => ({ + question: 'What are you trying to build?', + assistantParts: [{ type: 'text' as const, text: 'What are you trying to build?' 
}], + })); const created = await dispatchCapability({ db: activeDb, capability: 'spec.create', @@ -164,35 +168,50 @@ describe('agent capabilities', () => { db: activeDb, capability: 'chat.ensureReady', input: { chatId: primary.chatId }, + generateAnswerableFrontier, }); const activePath = getActivePath(activeDb, created.specId); expect(result).toEqual({ chatId: primary.chatId, specId: created.specId, - state: 'needs_generation', + state: 'awaiting_response', turnId: expect.any(Number), nextCommands: [{ capability: 'chat.read', input: { chatId: primary.chatId } }], }); + expect(generateAnswerableFrontier).toHaveBeenCalledOnce(); expect(activePath).toHaveLength(1); expect(activePath[0]).toMatchObject({ id: result.turnId, phase: 'grounding', - question: '', + question: 'What are you trying to build?', answer: null, }); + expect(activePath[0]?.assistant_parts).toBe( + JSON.stringify([{ type: 'text', text: 'What are you trying to build?' }]), + ); expect(getSpecification(activeDb, created.specId)?.active_turn_id).toBe(result.turnId); expect( await dispatchCapability({ db: activeDb, - capability: 'chat.getPrimary', - input: { specId: created.specId }, + capability: 'chat.read', + input: { chatId: primary.chatId }, }), - ).toMatchObject({ activeTurnId: result.turnId }); + ).toMatchObject({ + frontier: { state: 'awaiting_response', phase: 'grounding', turnId: result.turnId }, + turns: [expect.objectContaining({ question: 'What are you trying to build?' 
})], + nextCommands: [ + { capability: 'turn.submitResponse', input: { chatId: primary.chatId, turnId: result.turnId } }, + ], + }); }); - it('keeps chat.ensureReady idempotent when a frontier already exists', async () => { + it('keeps chat.ensureReady idempotent when an answerable frontier already exists', async () => { const activeDb = createTempDb(); + const generateAnswerableFrontier = vi.fn(async () => ({ + question: 'What should we clarify first?', + assistantParts: [{ type: 'text' as const, text: 'What should we clarify first?' }], + })); const created = await dispatchCapability({ db: activeDb, capability: 'spec.create', @@ -208,14 +227,18 @@ describe('agent capabilities', () => { db: activeDb, capability: 'chat.ensureReady', input: { chatId: primary.chatId }, + generateAnswerableFrontier, }); const second = await dispatchCapability({ db: activeDb, capability: 'chat.ensureReady', input: { chatId: primary.chatId }, + generateAnswerableFrontier, }); expect(second).toEqual(first); + expect(second.state).toBe('awaiting_response'); + expect(generateAnswerableFrontier).toHaveBeenCalledOnce(); expect(getActivePath(activeDb, created.specId)).toHaveLength(1); }); diff --git a/src/server/capabilities.ts b/src/server/capabilities.ts index 8e5d5d03..4aae1485 100644 --- a/src/server/capabilities.ts +++ b/src/server/capabilities.ts @@ -1,11 +1,18 @@ +import { readUIMessageStream } from 'ai'; import { eq } from 'drizzle-orm'; import { z } from 'zod'; +import { extractTextFromMessage, type BrunchUIMessage } from '@/shared/chat.js'; + import { getCapabilityContract, type CapabilityId } from './capability-registry.js'; import { applyChatRouteTransition } from './chat-route-transition.js'; -import { createNewSpecification, finalizeTurn, getSpecificationState } from './core.js'; -import type { DB } from './db.js'; +import { createNewSpecification, finalizeTurn, getSpecificationState, type TurnWithOptions } from './core.js'; +import type { DB, Turn } from './db.js'; +import { 
getTurn, updateTurn } from './db.js'; +import { persistFallbackQuestionText, streamInterviewer } from './interview.js'; +import { serializeParts, type AssistantPart } from './parts.js'; import * as schema from './schema.js'; +import { materializeTurnArtifacts } from './turn-artifacts.js'; const specCreateInputSchema = z.object({ name: z.string().trim().min(1), @@ -46,8 +53,25 @@ export class CapabilityDispatchError extends Error { } } +export interface GeneratedAnswerableFrontier { + question: string; + assistantParts: AssistantPart[]; +} + +export interface GenerateAnswerableFrontierInput { + db: DB; + turn: Turn; + activePath: TurnWithOptions[]; + userMessage: string; +} + +export type GenerateAnswerableFrontier = ( + input: GenerateAnswerableFrontierInput, +) => Promise; + export interface CapabilityDispatchContext { db: DB; + generateAnswerableFrontier?: GenerateAnswerableFrontier; } export interface DispatchCapabilityInput extends CapabilityDispatchContext { @@ -64,7 +88,7 @@ type SpecCreateOutput = ReturnType; type SpecGetStatusOutput = ReturnType; type ChatGetPrimaryOutput = ReturnType; type ChatReadOutput = ReturnType; -type ChatEnsureReadyOutput = ReturnType; +type ChatEnsureReadyOutput = Awaited>; function parseSpecCreateInput(input: unknown): SpecCreateInput { const parsed = specCreateInputSchema.safeParse(input); @@ -203,6 +227,53 @@ function getReadyStateForTurn(turn: { question: string; answer: string | null }) return turn.answer === null && turn.question.trim() === '' ? 
'needs_generation' : 'awaiting_response'; } +async function generateAnswerableFrontierWithInterviewer({ + db, + turn, + activePath, + userMessage, +}: GenerateAnswerableFrontierInput): Promise { + const startedAt = Date.now(); + const interviewer = await streamInterviewer(db, turn, activePath, userMessage, turn.phase); + const stream = interviewer.toUIMessageStream({ + sendReasoning: true, + sendFinish: false, + }); + let responseMessage: BrunchUIMessage | null = null; + for await (const message of readUIMessageStream({ stream })) { + responseMessage = message; + } + await interviewer.finishReason; + + if (!responseMessage) { + throw new Error(`Interviewer did not generate content for turn ${turn.id}`); + } + + const question = extractTextFromMessage(responseMessage); + const assistantParts = materializeTurnArtifacts({ + phase: turn.phase, + responseMessage, + elapsedMs: Date.now() - startedAt, + }); + + return { question, assistantParts }; +} + +async function persistGeneratedAnswerableFrontier( + db: DB, + turn: Turn, + generated: GeneratedAnswerableFrontier, +): Promise { + if (generated.question.trim() === '') { + throw new Error(`Interviewer generated an empty question for turn ${turn.id}`); + } + + persistFallbackQuestionText(db, turn.id, generated.question); + updateTurn(db, turn.id, { + assistant_parts: serializeParts(generated.assistantParts), + }); +} + function readChatFromCapability(db: DB, input: ChatReadInput) { const chat = getChatById(db, input.chatId); if (!chat) { @@ -250,7 +321,11 @@ function readChatFromCapability(db: DB, input: ChatReadInput) { }; } -function ensureChatReadyFromCapability(db: DB, input: ChatEnsureReadyInput) { +async function ensureChatReadyFromCapability( + db: DB, + input: ChatEnsureReadyInput, + generateAnswerableFrontier: GenerateAnswerableFrontier = generateAnswerableFrontierWithInterviewer, +) { const chat = getChatById(db, input.chatId); if (!chat) { throw new CapabilityDispatchError(`Chat ${input.chatId} not found`, 
'handler_failed'); @@ -263,10 +338,25 @@ function ensureChatReadyFromCapability(db: DB, input: ChatEnsureReadyInput) { const activeTurn = state.turns.find((turn) => turn.id === chat.active_turn_id) ?? null; if (activeTurn) { + const activeState = getReadyStateForTurn(activeTurn); + if (activeState === 'needs_generation') { + const persistedActiveTurn = getTurn(db, activeTurn.id); + if (!persistedActiveTurn) { + throw new CapabilityDispatchError(`Turn ${activeTurn.id} not found`, 'handler_failed'); + } + const generated = await generateAnswerableFrontier({ + db, + turn: persistedActiveTurn, + activePath: state.turns, + userMessage: '', + }); + await persistGeneratedAnswerableFrontier(db, persistedActiveTurn, generated); + } + return { chatId: chat.id, specId: chat.specification_id, - state: getReadyStateForTurn(activeTurn), + state: 'awaiting_response' as const, turnId: activeTurn.id, nextCommands: [{ capability: 'chat.read', input: { chatId: chat.id } }], }; @@ -303,11 +393,18 @@ function ensureChatReadyFromCapability(db: DB, input: ChatEnsureReadyInput) { } finalizeTurn(db, chat.specification_id, transition.prepared.turn.id); + const generated = await generateAnswerableFrontier({ + db, + turn: transition.prepared.turn, + activePath: transition.prepared.activePath, + userMessage: '', + }); + await persistGeneratedAnswerableFrontier(db, transition.prepared.turn, generated); return { chatId: chat.id, specId: chat.specification_id, - state: 'needs_generation' as const, + state: 'awaiting_response' as const, turnId: transition.prepared.turn.id, nextCommands: [{ capability: 'chat.read', input: { chatId: chat.id } }], }; @@ -337,12 +434,14 @@ export function dispatchCapability(input: { db: DB; capability: 'chat.ensureReady'; input: unknown; + generateAnswerableFrontier?: GenerateAnswerableFrontier; }): Promise; export function dispatchCapability(input: DispatchCapabilityInput): Promise; export async function dispatchCapability({ db, capability, input, + 
generateAnswerableFrontier, }: DispatchCapabilityInput): Promise { assertExecutableCapability(capability); @@ -363,7 +462,7 @@ export async function dispatchCapability({ } if (capability === 'chat.ensureReady') { - return ensureChatReadyFromCapability(db, parseChatEnsureReadyInput(input)); + return ensureChatReadyFromCapability(db, parseChatEnsureReadyInput(input), generateAnswerableFrontier); } throw new CapabilityDispatchError('Capability has no executable handler', 'unknown_capability'); From 642d57d5253c03ed49fb03d7430fd7882d3084a8 Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Mon, 11 May 2026 17:25:38 +0200 Subject: [PATCH 07/42] FE-705: Add agent turn response capability --- src/server/agent-jsonl.test.ts | 43 +++++++++ src/server/capabilities.test.ts | 109 ++++++++++++++++++++++ src/server/capabilities.ts | 123 ++++++++++++++++++++++++- src/server/capability-registry.test.ts | 6 ++ src/server/capability-registry.ts | 13 ++- 5 files changed, 287 insertions(+), 7 deletions(-) diff --git a/src/server/agent-jsonl.test.ts b/src/server/agent-jsonl.test.ts index 12a41496..db78cfcb 100644 --- a/src/server/agent-jsonl.test.ts +++ b/src/server/agent-jsonl.test.ts @@ -146,6 +146,49 @@ describe('agent JSONL session', () => { ]); }); + it('submits a turn response and reads the answered turn over JSONL', async () => { + const responses = await runSession([ + JSON.stringify({ id: 'create-1', capability: 'spec.create', input: { name: 'JSONL response spec' } }), + JSON.stringify({ id: 'primary-1', capability: 'chat.getPrimary', input: { specId: 1 } }), + JSON.stringify({ id: 'ready-1', capability: 'chat.ensureReady', input: { chatId: 1 } }), + JSON.stringify({ + id: 'response-1', + capability: 'turn.submitResponse', + input: { + chatId: 1, + turnId: 1, + response: { kind: 'free-text', freeText: 'A local spec elicitation tool' }, + }, + }), + JSON.stringify({ id: 'chat-1', capability: 'chat.read', input: { chatId: 1 } }), + ]); + + expect(responses).toEqual([ + 
expect.objectContaining({ id: 'create-1', ok: true, output: expect.objectContaining({ specId: 1 }) }), + expect.objectContaining({ id: 'primary-1', ok: true, output: expect.objectContaining({ chatId: 1 }) }), + expect.objectContaining({ id: 'ready-1', ok: true, output: expect.objectContaining({ turnId: 1 }) }), + expect.objectContaining({ + id: 'response-1', + ok: true, + output: expect.objectContaining({ + chatId: 1, + specId: 1, + turnId: 1, + response: { ok: true }, + }), + }), + expect.objectContaining({ + id: 'chat-1', + ok: true, + output: expect.objectContaining({ + frontier: { state: 'answered', phase: 'grounding', turnId: 1 }, + turns: [expect.objectContaining({ id: 1, answer: 'A local spec elicitation tool' })], + nextCommands: [{ capability: 'chat.ensureReady', input: { chatId: 1 } }], + }), + }), + ]); + }); + it('returns typed chat read errors without crashing the session', async () => { const responses = await runSession([ JSON.stringify({ id: 'missing-chat', capability: 'chat.read', input: { chatId: 999 } }), diff --git a/src/server/capabilities.test.ts b/src/server/capabilities.test.ts index 590dd778..5213e630 100644 --- a/src/server/capabilities.test.ts +++ b/src/server/capabilities.test.ts @@ -11,6 +11,7 @@ import { createTurn, getActivePath, getSpecification, + getTurn, listSpecifications, type DB, } from './db.js'; @@ -242,6 +243,106 @@ describe('agent capabilities', () => { expect(getActivePath(activeDb, created.specId)).toHaveLength(1); }); + it('dispatches turn.submitResponse through the existing turn-response transition', async () => { + const activeDb = createTempDb(); + const created = await dispatchCapability({ + db: activeDb, + capability: 'spec.create', + input: { name: 'Respondable spec' }, + }); + const primary = await dispatchCapability({ + db: activeDb, + capability: 'chat.getPrimary', + input: { specId: created.specId }, + }); + const ready = await dispatchCapability({ + db: activeDb, + capability: 'chat.ensureReady', + input: { 
chatId: primary.chatId }, + generateAnswerableFrontier: async () => ({ + question: 'What are you trying to build?', + assistantParts: [{ type: 'text' as const, text: 'What are you trying to build?' }], + }), + }); + + const result = await dispatchCapability({ + db: activeDb, + capability: 'turn.submitResponse', + input: { + chatId: primary.chatId, + turnId: ready.turnId, + response: { kind: 'free-text', freeText: 'A local spec elicitation tool' }, + }, + }); + + expect(result).toEqual({ + chatId: primary.chatId, + specId: created.specId, + turnId: ready.turnId, + response: { ok: true }, + nextCommands: [{ capability: 'chat.read', input: { chatId: primary.chatId } }], + }); + expect(getTurn(activeDb, ready.turnId)?.answer).toBe('A local spec elicitation tool'); + expect(getTurn(activeDb, ready.turnId)?.user_parts).toContain('data-turn-response'); + await expect( + dispatchCapability({ + db: activeDb, + capability: 'chat.read', + input: { chatId: primary.chatId }, + }), + ).resolves.toMatchObject({ + frontier: { state: 'answered', phase: 'grounding', turnId: ready.turnId }, + turns: [expect.objectContaining({ id: ready.turnId, answer: 'A local spec elicitation tool' })], + nextCommands: [{ capability: 'chat.ensureReady', input: { chatId: primary.chatId } }], + }); + }); + + it('rejects turn.submitResponse for turns outside the explicit chat', async () => { + const activeDb = createTempDb(); + const first = await dispatchCapability({ + db: activeDb, + capability: 'spec.create', + input: { name: 'First spec' }, + }); + const second = await dispatchCapability({ + db: activeDb, + capability: 'spec.create', + input: { name: 'Second spec' }, + }); + const firstChat = await dispatchCapability({ + db: activeDb, + capability: 'chat.getPrimary', + input: { specId: first.specId }, + }); + const secondChat = await dispatchCapability({ + db: activeDb, + capability: 'chat.getPrimary', + input: { specId: second.specId }, + }); + const secondReady = await dispatchCapability({ + db: 
activeDb, + capability: 'chat.ensureReady', + input: { chatId: secondChat.chatId }, + generateAnswerableFrontier: async () => ({ + question: 'What are you trying to build?', + assistantParts: [{ type: 'text' as const, text: 'What are you trying to build?' }], + }), + }); + + await expect( + dispatchCapability({ + db: activeDb, + capability: 'turn.submitResponse', + input: { + chatId: firstChat.chatId, + turnId: secondReady.turnId, + response: { kind: 'free-text', freeText: 'Wrong owner' }, + }, + }), + ).rejects.toThrow(`Turn ${secondReady.turnId} does not belong to chat ${firstChat.chatId}`); + expect(getTurn(activeDb, secondReady.turnId)?.answer).toBeNull(); + }); + it('rejects unknown chat ids and schema-invalid capability input before calling handlers', async () => { const activeDb = createTempDb(); await expect( @@ -260,6 +361,14 @@ describe('agent capabilities', () => { }), ).rejects.toThrow('Chat 999 not found'); + await expect( + dispatchCapability({ + db: activeDb, + capability: 'turn.submitResponse', + input: { chatId: 1, turnId: 1, response: { kind: 'free-text', freeText: '' } }, + }), + ).rejects.toThrow('Invalid input for capability turn.submitResponse'); + await expect( dispatchCapability({ db: activeDb, diff --git a/src/server/capabilities.ts b/src/server/capabilities.ts index 4aae1485..4284a2e7 100644 --- a/src/server/capabilities.ts +++ b/src/server/capabilities.ts @@ -2,6 +2,7 @@ import { readUIMessageStream } from 'ai'; import { eq } from 'drizzle-orm'; import { z } from 'zod'; +import { submitTurnResponseRequestSchema } from '@/shared/api-types.js'; import { extractTextFromMessage, type BrunchUIMessage } from '@/shared/chat.js'; import { getCapabilityContract, type CapabilityId } from './capability-registry.js'; @@ -13,6 +14,7 @@ import { persistFallbackQuestionText, streamInterviewer } from './interview.js'; import { serializeParts, type AssistantPart } from './parts.js'; import * as schema from './schema.js'; import { materializeTurnArtifacts 
} from './turn-artifacts.js'; +import { submitTurnResponseTransition } from './turn-response-transition.js'; const specCreateInputSchema = z.object({ name: z.string().trim().min(1), @@ -35,12 +37,19 @@ const chatEnsureReadyInputSchema = z.object({ chatId: z.number().int().positive(), }); +const turnSubmitResponseInputSchema = z.object({ + chatId: z.number().int().positive(), + turnId: z.number().int().positive(), + response: submitTurnResponseRequestSchema, +}); + const capabilityInputSchemas = { 'spec.create': specCreateInputSchema, 'spec.getStatus': specGetStatusInputSchema, 'chat.getPrimary': chatGetPrimaryInputSchema, 'chat.read': chatReadInputSchema, 'chat.ensureReady': chatEnsureReadyInputSchema, + 'turn.submitResponse': turnSubmitResponseInputSchema, } as const; export class CapabilityDispatchError extends Error { @@ -84,11 +93,13 @@ type SpecGetStatusInput = z.infer; type ChatGetPrimaryInput = z.infer; type ChatReadInput = z.infer; type ChatEnsureReadyInput = z.infer; +type TurnSubmitResponseInput = z.infer; type SpecCreateOutput = ReturnType; type SpecGetStatusOutput = ReturnType; type ChatGetPrimaryOutput = ReturnType; type ChatReadOutput = ReturnType; type ChatEnsureReadyOutput = Awaited>; +type TurnSubmitResponseOutput = ReturnType; function parseSpecCreateInput(input: unknown): SpecCreateInput { const parsed = specCreateInputSchema.safeParse(input); @@ -130,6 +141,14 @@ function parseChatEnsureReadyInput(input: unknown): ChatEnsureReadyInput { return parsed.data; } +function parseTurnSubmitResponseInput(input: unknown): TurnSubmitResponseInput { + const parsed = turnSubmitResponseInputSchema.safeParse(input); + if (!parsed.success) { + throw new CapabilityDispatchError('Invalid input for capability turn.submitResponse', 'invalid_input'); + } + return parsed.data; +} + function assertExecutableCapability( capability: string, ): asserts capability is keyof typeof capabilityInputSchemas { @@ -224,7 +243,10 @@ function getChatById(db: DB, chatId: number) { 
} function getReadyStateForTurn(turn: { question: string; answer: string | null }) { - return turn.answer === null && turn.question.trim() === '' ? 'needs_generation' : 'awaiting_response'; + if (turn.answer !== null) { + return 'answered'; + } + return turn.question.trim() === '' ? 'needs_generation' : 'awaiting_response'; } async function generateAnswerableFrontierWithInterviewer({ @@ -290,9 +312,10 @@ function readChatFromCapability(db: DB, input: ChatReadInput) { const frontier = activeTurn ? { state: getReadyStateForTurn(activeTurn), phase: activeTurn.phase, turnId: activeTurn.id } : { state: 'idle_no_frontier' as const, phase: currentPhase, turnId: null }; - const nextCommands = activeTurn - ? [{ capability: 'turn.submitResponse', input: { chatId: chat.id, turnId: activeTurn.id } }] - : [{ capability: 'chat.ensureReady', input: { chatId: chat.id } }]; + const nextCommands = + activeTurn && frontier.state === 'awaiting_response' + ? [{ capability: 'turn.submitResponse', input: { chatId: chat.id, turnId: activeTurn.id } }] + : [{ capability: 'chat.ensureReady', input: { chatId: chat.id } }]; return { specification: { @@ -321,6 +344,43 @@ function readChatFromCapability(db: DB, input: ChatReadInput) { }; } +function submitTurnResponseFromCapability(db: DB, input: TurnSubmitResponseInput) { + const chat = getChatById(db, input.chatId); + if (!chat) { + throw new CapabilityDispatchError(`Chat ${input.chatId} not found`, 'handler_failed'); + } + + const turn = getTurn(db, input.turnId); + if (!turn) { + throw new CapabilityDispatchError(`Turn ${input.turnId} not found`, 'handler_failed'); + } + if (turn.chat_id !== chat.id || turn.specification_id !== chat.specification_id) { + throw new CapabilityDispatchError( + `Turn ${input.turnId} does not belong to chat ${input.chatId}`, + 'handler_failed', + ); + } + + const response = submitTurnResponseTransition({ + db, + specificationId: chat.specification_id, + turnId: turn.id, + request: input.response, + }); + + if 
(!response.ok) { + throw new CapabilityDispatchError(response.message, 'handler_failed'); + } + + return { + chatId: chat.id, + specId: chat.specification_id, + turnId: turn.id, + response, + nextCommands: [{ capability: 'chat.read', input: { chatId: chat.id } }], + }; +} + async function ensureChatReadyFromCapability( db: DB, input: ChatEnsureReadyInput, @@ -339,6 +399,16 @@ async function ensureChatReadyFromCapability( const activeTurn = state.turns.find((turn) => turn.id === chat.active_turn_id) ?? null; if (activeTurn) { const activeState = getReadyStateForTurn(activeTurn); + if (activeState === 'awaiting_response') { + return { + chatId: chat.id, + specId: chat.specification_id, + state: 'awaiting_response' as const, + turnId: activeTurn.id, + nextCommands: [{ capability: 'chat.read', input: { chatId: chat.id } }], + }; + } + if (activeState === 'needs_generation') { const persistedActiveTurn = getTurn(db, activeTurn.id); if (!persistedActiveTurn) { @@ -351,13 +421,47 @@ async function ensureChatReadyFromCapability( userMessage: '', }); await persistGeneratedAnswerableFrontier(db, persistedActiveTurn, generated); + + return { + chatId: chat.id, + specId: chat.specification_id, + state: 'awaiting_response' as const, + turnId: activeTurn.id, + nextCommands: [{ capability: 'chat.read', input: { chatId: chat.id } }], + }; + } + + const answeredText = activeTurn.answer ?? 
''; + const transition = applyChatRouteTransition( + { db, specificationId: chat.specification_id }, + { + kind: 'continue', + reply: { text: answeredText, parts: [] }, + }, + ); + if (!transition.ok) { + throw new CapabilityDispatchError(transition.message, 'handler_failed'); + } + if (transition.kind !== 'interviewer-turn') { + throw new CapabilityDispatchError( + `Chat ${chat.id} did not produce an interviewer frontier`, + 'handler_failed', + ); } + finalizeTurn(db, chat.specification_id, transition.prepared.turn.id); + const generated = await generateAnswerableFrontier({ + db, + turn: transition.prepared.turn, + activePath: transition.prepared.activePath, + userMessage: answeredText, + }); + await persistGeneratedAnswerableFrontier(db, transition.prepared.turn, generated); return { chatId: chat.id, specId: chat.specification_id, state: 'awaiting_response' as const, - turnId: activeTurn.id, + turnId: transition.prepared.turn.id, nextCommands: [{ capability: 'chat.read', input: { chatId: chat.id } }], }; } @@ -436,6 +540,11 @@ export function dispatchCapability(input: { input: unknown; generateAnswerableFrontier?: GenerateAnswerableFrontier; }): Promise; +export function dispatchCapability(input: { + db: DB; + capability: 'turn.submitResponse'; + input: unknown; +}): Promise; export function dispatchCapability(input: DispatchCapabilityInput): Promise; export async function dispatchCapability({ db, @@ -465,5 +574,9 @@ export async function dispatchCapability({ return ensureChatReadyFromCapability(db, parseChatEnsureReadyInput(input), generateAnswerableFrontier); } + if (capability === 'turn.submitResponse') { + return submitTurnResponseFromCapability(db, parseTurnSubmitResponseInput(input)); + } + throw new CapabilityDispatchError('Capability has no executable handler', 'unknown_capability'); } diff --git a/src/server/capability-registry.test.ts b/src/server/capability-registry.test.ts index 95c6c211..35de001a 100644 --- a/src/server/capability-registry.test.ts 
+++ b/src/server/capability-registry.test.ts @@ -71,6 +71,12 @@ describe('capability registry', () => { inputSchema: 'chat.ensureReady.input.v1', outputSchema: 'chat.ensureReady.output.v1', }), + expect.objectContaining({ + id: 'turn.submitResponse', + authority: 'commit_truth', + inputSchema: 'turn.submitResponse.input.v1', + outputSchema: 'turn.submitResponse.output.v1', + }), ]); }); diff --git a/src/server/capability-registry.ts b/src/server/capability-registry.ts index 8be180d0..ac50b20f 100644 --- a/src/server/capability-registry.ts +++ b/src/server/capability-registry.ts @@ -19,7 +19,8 @@ export type CapabilityId = | 'spec.getStatus' | 'chat.getPrimary' | 'chat.read' - | 'chat.ensureReady'; + | 'chat.ensureReady' + | 'turn.submitResponse'; export interface CapabilityContract { id: CapabilityId; @@ -134,11 +135,19 @@ const capabilityContracts = [ { id: 'chat.ensureReady', authority: 'runtime_replay', - summary: 'Materialize deterministic chat readiness for an explicit chat id without generation.', + summary: 'Ensure an explicit chat has an answerable generated frontier.', inputSchema: 'chat.ensureReady.input.v1', outputSchema: 'chat.ensureReady.output.v1', handler: null, }, + { + id: 'turn.submitResponse', + authority: 'commit_truth', + summary: 'Submit a structured response to an explicit chat turn.', + inputSchema: 'turn.submitResponse.input.v1', + outputSchema: 'turn.submitResponse.output.v1', + handler: null, + }, ] as const satisfies readonly CapabilityContract[]; const capabilityContractsById = new Map( From 61f2ca26fd66caca31593d7ddc724b56d5b17dcb Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Mon, 11 May 2026 17:38:37 +0200 Subject: [PATCH 08/42] FE-705: Harden agent readiness smoke --- src/server/capabilities.test.ts | 24 +++++++++++++++++++----- src/server/capabilities.ts | 31 +++++++++++++++++++++++++------ 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/src/server/capabilities.test.ts b/src/server/capabilities.test.ts index 
5213e630..bc0e5bff 100644 --- a/src/server/capabilities.test.ts +++ b/src/server/capabilities.test.ts @@ -151,8 +151,21 @@ describe('agent capabilities', () => { it('dispatches chat.ensureReady by generating an answerable frontier', async () => { const activeDb = createTempDb(); const generateAnswerableFrontier = vi.fn(async () => ({ - question: 'What are you trying to build?', - assistantParts: [{ type: 'text' as const, text: 'What are you trying to build?' }], + question: '', + assistantParts: [ + { + type: 'tool-ask_question' as const, + toolCallId: 'question-1', + state: 'output-available' as const, + input: { + question: 'What are you trying to build?', + why: 'Grounding starts with the user goal.', + impact: 'high' as const, + options: [], + }, + output: { ok: true as const, turnId: 1, optionCount: 0 }, + }, + ], })); const created = await dispatchCapability({ db: activeDb, @@ -181,6 +194,9 @@ describe('agent capabilities', () => { nextCommands: [{ capability: 'chat.read', input: { chatId: primary.chatId } }], }); expect(generateAnswerableFrontier).toHaveBeenCalledOnce(); + expect(generateAnswerableFrontier).toHaveBeenCalledWith( + expect.objectContaining({ userMessage: 'Begin the grounding interview.' }), + ); expect(activePath).toHaveLength(1); expect(activePath[0]).toMatchObject({ id: result.turnId, @@ -188,9 +204,7 @@ describe('agent capabilities', () => { question: 'What are you trying to build?', answer: null, }); - expect(activePath[0]?.assistant_parts).toBe( - JSON.stringify([{ type: 'text', text: 'What are you trying to build?' 
}]), - ); + expect(activePath[0]?.assistant_parts).toContain('tool-ask_question'); expect(getSpecification(activeDb, created.specId)?.active_turn_id).toBe(result.turnId); expect( await dispatchCapability({ diff --git a/src/server/capabilities.ts b/src/server/capabilities.ts index 4284a2e7..42d0e65a 100644 --- a/src/server/capabilities.ts +++ b/src/server/capabilities.ts @@ -3,7 +3,7 @@ import { eq } from 'drizzle-orm'; import { z } from 'zod'; import { submitTurnResponseRequestSchema } from '@/shared/api-types.js'; -import { extractTextFromMessage, type BrunchUIMessage } from '@/shared/chat.js'; +import { extractTextFromMessage, structuredQuestionSchema, type BrunchUIMessage } from '@/shared/chat.js'; import { getCapabilityContract, type CapabilityId } from './capability-registry.js'; import { applyChatRouteTransition } from './chat-route-transition.js'; @@ -242,6 +242,8 @@ function getChatById(db: DB, chatId: number) { .get(); } +const INITIAL_INTERVIEWER_PROMPT = 'Begin the grounding interview.'; + function getReadyStateForTurn(turn: { question: string; answer: string | null }) { if (turn.answer !== null) { return 'answered'; @@ -271,26 +273,43 @@ async function generateAnswerableFrontierWithInterviewer({ throw new Error(`Interviewer did not generate content for turn ${turn.id}`); } - const question = extractTextFromMessage(responseMessage); const assistantParts = materializeTurnArtifacts({ phase: turn.phase, responseMessage, elapsedMs: Date.now() - startedAt, }); + const question = + extractTextFromMessage(responseMessage) || extractQuestionFromAssistantParts(assistantParts); return { question, assistantParts }; } +function extractQuestionFromAssistantParts(parts: AssistantPart[]): string { + const askQuestionPart = parts.find( + (part): part is Extract => + part.type === 'tool-ask_question' && 'input' in part, + ); + if (!askQuestionPart) { + return ''; + } + + const parsedInput = structuredQuestionSchema.safeParse(askQuestionPart.input); + return 
parsedInput.success ? parsedInput.data.question : ''; +} + async function persistGeneratedAnswerableFrontier( db: DB, turn: Turn, generated: GeneratedAnswerableFrontier, ): Promise { - if (generated.question.trim() === '') { + const currentQuestion = getTurn(db, turn.id)?.question ?? ''; + const question = + generated.question || extractQuestionFromAssistantParts(generated.assistantParts) || currentQuestion; + if (question.trim() === '') { throw new Error(`Interviewer generated an empty question for turn ${turn.id}`); } - persistFallbackQuestionText(db, turn.id, generated.question); + persistFallbackQuestionText(db, turn.id, question); updateTurn(db, turn.id, { assistant_parts: serializeParts(generated.assistantParts), }); @@ -418,7 +437,7 @@ async function ensureChatReadyFromCapability( db, turn: persistedActiveTurn, activePath: state.turns, - userMessage: '', + userMessage: INITIAL_INTERVIEWER_PROMPT, }); await persistGeneratedAnswerableFrontier(db, persistedActiveTurn, generated); @@ -501,7 +520,7 @@ async function ensureChatReadyFromCapability( db, turn: transition.prepared.turn, activePath: transition.prepared.activePath, - userMessage: '', + userMessage: INITIAL_INTERVIEWER_PROMPT, }); await persistGeneratedAnswerableFrontier(db, transition.prepared.turn, generated); From 9855375632be7e625ab153a14daaca73bd003dba Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Tue, 12 May 2026 10:29:20 +0200 Subject: [PATCH 09/42] FE-705: Add scripted probe runner core --- memory/CARDS.md | 142 +++++++++++++++++++++ src/server/probe-runner.test.ts | 170 ++++++++++++++++++++++++++ src/server/probe-runner.ts | 210 ++++++++++++++++++++++++++++++++ 3 files changed, 522 insertions(+) create mode 100644 memory/CARDS.md create mode 100644 src/server/probe-runner.test.ts create mode 100644 src/server/probe-runner.ts diff --git a/memory/CARDS.md b/memory/CARDS.md new file mode 100644 index 00000000..ef29625e --- /dev/null +++ b/memory/CARDS.md @@ -0,0 +1,142 @@ +# FE-705 scope cards — 
proof-of-life JSONL probe runner + +> Prepared by `ln-scope` on 2026-05-11. These cards stay under the existing FE-705 frontier item and branch (`ln/fe-705-agent-capability-cli`). They are sub-slices, not new Linear issues or branches. Do not add `turn.get`, phase closure/export, or LLM-as-user until this queue proves where the runner bottleneck is. + +## Orientation + +- Containing seam: FE-705 agent capability CLI / probe-runner seam governed by `memory/SPEC.md` Requirement 43, A89, D147, and I114. +- Relevant frontier item: `memory/PLAN.md` Next item 2, **Agent capability CLI + LLM-as-user fixture probe**; the JSONL adapter is working through two real-provider turns, and the next proof is an external runner over JSONL. +- Volatile handoff state: `HANDOFF.md` says no card is live; Card 6 passed `npm run verify` and a manual Anthropic-backed temp-workspace JSONL smoke reached a second answerable frontier. +- Main open risk: the probe runner must become an external client with reviewable artifacts without quietly importing DB/capability handlers or growing into speculative LLM-as-user / phase-export scope. + +## Card 7 — Probe runner JSONL client and scripted user policy + +**Status:** done + +### Target Behavior + +A probe-runner core can drive the first two interview responses through an injected JSONL transport using only `chat.read` projections and deterministic scripted answers. + +### Boundary Crossings + +```text +→ probe runner scenario command sequence +→ JSONL client / injected transport boundary +→ scripted user response policy +→ parsed Brunch read projection / response request payloads +``` + +### Risks and Assumptions + +- RISK: The runner accidentally couples to current `chat.read` object internals instead of the agent-facing projection contract → MITIGATION: centralize projection parsing in a tiny typed client module and keep policy tests fixture-shaped around observable `frontier`, `turns`, `options`, and `nextCommands` fields. 
+- RISK: Deterministic scripted answers cannot handle option-bearing turns → MITIGATION: support both free-text answers and option-position selection from the first implementation, with tests covering both response shapes. +- ASSUMPTION: `chat.read` is broad enough for a first scripted runner without adding `turn.get` → VALIDATE: tests construct payloads from `chat.read` alone and no card in this queue adds `turn.get` → `memory/SPEC.md` §Assumptions A89. + +### Acceptance Criteria + +✓ `probe-runner.test.ts` — a fake JSONL transport receives `spec.create → chat.getPrimary → chat.ensureReady → chat.read → turn.submitResponse → chat.read → chat.ensureReady → chat.read` in order. +✓ `probe-runner.test.ts` — the scripted policy submits free-text for an open question and an option selection payload when `chat.read` exposes options. +✓ `probe-runner.test.ts` — the runner reports `turnsAnswered: 2`, final frontier state, and structured errors without importing Brunch DB or capability dispatch modules. + +### Verification Approach + +- Inner: unit / fake-transport interaction oracle — proves sequencing, response construction, and error propagation without a provider. +- Middle: import-shape/code-boundary oracle — confirms the runner core is client-side over JSONL concepts, not a DB/handler caller. + +## Card 8 — Process-backed temp-workspace proof runner + +**Status:** queued + +### Target Behavior + +A local probe runner can launch the packaged `brunch agent` process in an isolated temp workspace and persist a minimal proof-of-life artifact bundle. 
+ +### Boundary Crossings + +```text +→ probe runner process adapter +→ child process stdin/stdout JSONL session +→ packaged Brunch CLI agent command +→ temp workspace `.brunch/` runtime state +→ probe output artifact directory outside `.brunch/` +``` + +### Risks and Assumptions + +- RISK: Real-provider availability makes automated tests flaky → MITIGATION: test process plumbing with a fake child process or injected spawn adapter, and keep compiled CLI / Anthropic smoke opt-in unless credentials are present. +- RISK: Temp workspace cleanup or output paths leak `.brunch/` internals into curated artifacts → MITIGATION: write artifacts to an explicit output directory outside the temp workspace state directory and assert artifact paths do not point inside `.brunch/`. +- ASSUMPTION: The packaged `bin/brunch.js agent` remains the right process boundary for proof-of-life use → VALIDATE: an opt-in/manual smoke command launches that binary in a temp cwd and reaches a second answerable frontier → `memory/SPEC.md` §Assumptions A89. + +### Acceptance Criteria + +✓ `probe-runner.test.ts` — process-backed runner uses an injected spawn/process adapter to write JSONL requests and parse JSONL responses. +✓ `probe-runner.test.ts` — a run creates an isolated workspace cwd and writes raw request/response JSONL, final `chat.read` projection, and run summary outside `.brunch/`. +✓ opt-in smoke command/documented invocation — when provider credentials and built package output are available, the runner reaches a second answerable frontier through `bin/brunch.js agent`. + +### Verification Approach + +- Inner: fake child-process oracle — proves process lifecycle, JSONL parsing, artifact paths, and cleanup semantics deterministically. +- Middle: opt-in real-provider smoke — proves the external process boundary against the compiled CLI without making CI depend on credentials. 
+ +## Card 9 — Probe artifact schema and safe summaries + +**Status:** queued + +### Objective + +Probe runs produce deterministic, reviewable artifact bundles that are safe to inspect, compare, and eventually curate into fixture candidates. + +### Acceptance Criteria + +✓ Artifact schema records scenario name/brief, command sequence, raw JSONL transcript, parsed events, final chat projection, run summary, errors, duration, and non-secret environment metadata. +✓ Failure summaries redact API-key-like values and avoid provider stack/internal object dumps while preserving useful failure class and message context. +✓ Successful summaries identify final frontier state and compact question/answer pairs from the driven turns. +✓ Tests cover successful and failed artifact rendering without launching a real provider. + +### Verification Approach + +- Inner: artifact serialization/redaction tests. +- Middle: snapshot-like deterministic summary oracle for success and failure bundles. + +### Promotion checklist + +- [ ] Does this change a requirement? +- [ ] Does this create, retire, or invalidate an assumption? +- [ ] Does this make or reverse a non-trivial design decision? +- [ ] Does this establish a new seam-level invariant? +- [ ] Does it cross more than two major seams? +- [ ] Is this the first touch in an unfamiliar seam from a fresh thread? +- [ ] Can you not name the containing seam or current rationale from the live docs? + +Result: stays light. This hardens artifacts inside the already-established FE-705 probe-runner seam and does not change durable product requirements or capability authority. + +## Card 10 — Probe runner import-boundary guard + +**Status:** queued + +### Objective + +The probe-runner module boundary is mechanically guarded so runner code can only exercise Brunch through the JSONL client/process wrapper. 
+ +### Acceptance Criteria + +✓ Boundary test fails if probe-runner modules import `src/server/db`, `src/server/capabilities`, `src/server/capability-registry`, server handlers, or ORM schema modules directly. +✓ Allowed imports are documented in the test or module boundary helper: Node process/fs/path utilities, probe-runner private modules, and the JSONL client/process wrapper surface. +✓ Existing capability and JSONL tests continue to prove server-owned handlers remain the mutation authority. + +### Verification Approach + +- Inner: static import-boundary test plus existing capability / agent-jsonl unit tests. +- Middle: `npm run verify` gate. + +### Promotion checklist + +- [ ] Does this change a requirement? +- [ ] Does this create, retire, or invalidate an assumption? +- [ ] Does this make or reverse a non-trivial design decision? +- [ ] Does this establish a new seam-level invariant? +- [ ] Does it cross more than two major seams? +- [ ] Is this the first touch in an unfamiliar seam from a fresh thread? +- [ ] Can you not name the containing seam or current rationale from the live docs? + +Result: stays light. It enforces I114 for the new runner code without changing the I114 invariant itself. 
diff --git a/src/server/probe-runner.test.ts b/src/server/probe-runner.test.ts new file mode 100644 index 00000000..53a6da05 --- /dev/null +++ b/src/server/probe-runner.test.ts @@ -0,0 +1,170 @@ +import { describe, expect, it } from 'vitest'; + +import { runScriptedProbe, type JsonlTransport, type ProbeJsonlRequest } from './probe-runner.js'; + +describe('probe runner', () => { + it('drives two interview responses through an injected JSONL transport', async () => { + const requests: ProbeJsonlRequest[] = []; + const transport: JsonlTransport = { + async send(request) { + requests.push(request); + if (request.capability === 'spec.create') { + return { id: request.id, ok: true, output: { specId: 1 } }; + } + if (request.capability === 'chat.getPrimary') { + return { + id: request.id, + ok: true, + output: { specId: 1, chatId: 10, kind: 'interview', activeTurnId: null }, + }; + } + if (request.id === 'ready-1') { + return { + id: request.id, + ok: true, + output: { chatId: 10, specId: 1, state: 'awaiting_response', turnId: 100 }, + }; + } + if (request.id === 'read-1') { + return { + id: request.id, + ok: true, + output: { + frontier: { state: 'awaiting_response', phase: 'grounding', turnId: 100 }, + turns: [{ id: 100, question: 'What are you building?', answer: null, options: [] }], + nextCommands: [{ capability: 'turn.submitResponse', input: { chatId: 10, turnId: 100 } }], + }, + }; + } + if (request.id === 'answer-1') { + return { id: request.id, ok: true, output: { response: { ok: true } } }; + } + if (request.id === 'read-2') { + return { + id: request.id, + ok: true, + output: { + frontier: { state: 'answered', phase: 'grounding', turnId: 100 }, + turns: [ + { id: 100, question: 'What are you building?', answer: 'A probeable spec tool', options: [] }, + ], + nextCommands: [{ capability: 'chat.ensureReady', input: { chatId: 10 } }], + }, + }; + } + if (request.id === 'ready-2') { + return { + id: request.id, + ok: true, + output: { chatId: 10, specId: 1, state: 
'awaiting_response', turnId: 101 }, + }; + } + if (request.id === 'read-3') { + return { + id: request.id, + ok: true, + output: { + frontier: { state: 'awaiting_response', phase: 'grounding', turnId: 101 }, + turns: [ + { id: 100, question: 'What are you building?', answer: 'A probeable spec tool', options: [] }, + { + id: 101, + question: 'What should be specified first?', + answer: null, + options: [ + { id: 1, position: 0, content: 'Acceptance criteria' }, + { id: 2, position: 1, content: 'API shape' }, + ], + }, + ], + nextCommands: [{ capability: 'turn.submitResponse', input: { chatId: 10, turnId: 101 } }], + }, + }; + } + if (request.id === 'answer-2') { + return { id: request.id, ok: true, output: { response: { ok: true } } }; + } + if (request.id === 'read-4') { + return { + id: request.id, + ok: true, + output: { + frontier: { state: 'answered', phase: 'grounding', turnId: 101 }, + turns: [ + { id: 100, question: 'What are you building?', answer: 'A probeable spec tool', options: [] }, + { + id: 101, + question: 'What should be specified first?', + answer: 'Acceptance criteria', + options: [], + }, + ], + nextCommands: [{ capability: 'chat.ensureReady', input: { chatId: 10 } }], + }, + }; + } + return { id: request.id, ok: false, error: { code: 'unexpected', message: request.id } }; + }, + }; + + const result = await runScriptedProbe({ + transport, + scenario: { name: 'proof', specName: 'Probe proof' }, + scriptedAnswers: ['A probeable spec tool'], + }); + + expect(requests.map((request) => request.capability)).toEqual([ + 'spec.create', + 'chat.getPrimary', + 'chat.ensureReady', + 'chat.read', + 'turn.submitResponse', + 'chat.read', + 'chat.ensureReady', + 'chat.read', + 'turn.submitResponse', + 'chat.read', + ]); + expect(requests[4]).toMatchObject({ + id: 'answer-1', + input: { chatId: 10, turnId: 100, response: { kind: 'free-text', freeText: 'A probeable spec tool' } }, + }); + expect(requests[8]).toMatchObject({ + id: 'answer-2', + input: { chatId: 
10, turnId: 101, response: { kind: 'select-options', positions: [0] } }, + }); + expect(result.summary).toMatchObject({ turnsAnswered: 2, finalFrontierState: 'answered' }); + expect(result.errors).toEqual([]); + }); + + it('returns structured errors from failed JSONL responses', async () => { + const transport: JsonlTransport = { + async send(request) { + if (request.capability === 'spec.create') { + return { id: request.id, ok: true, output: { specId: 1 } }; + } + return { + id: request.id, + ok: false, + error: { code: 'handler_failed', message: 'Chat 10 not found' }, + }; + }, + }; + + const result = await runScriptedProbe({ + transport, + scenario: { name: 'failure', specName: 'Failure proof' }, + scriptedAnswers: [], + }); + + expect(result.summary.turnsAnswered).toBe(0); + expect(result.errors).toEqual([ + { + requestId: 'primary', + capability: 'chat.getPrimary', + code: 'handler_failed', + message: 'Chat 10 not found', + }, + ]); + }); +}); diff --git a/src/server/probe-runner.ts b/src/server/probe-runner.ts new file mode 100644 index 00000000..b3f4adf3 --- /dev/null +++ b/src/server/probe-runner.ts @@ -0,0 +1,210 @@ +export interface ProbeJsonlRequest { + id: string; + capability: string; + input?: unknown; +} + +export type ProbeJsonlResponse = + | { id: string; ok: true; output: unknown } + | { id: string | null; ok: false; error: { code: string; message: string } }; + +export interface JsonlTransport { + send(request: ProbeJsonlRequest): Promise; +} + +export interface ScriptedProbeScenario { + name: string; + specName: string; +} + +export interface ProbeRunError { + requestId: string; + capability: string; + code: string; + message: string; +} + +export interface ProbeRunSummary { + turnsAnswered: number; + finalFrontierState: string | null; +} + +export interface ProbeRunResult { + scenario: ScriptedProbeScenario; + requests: ProbeJsonlRequest[]; + responses: ProbeJsonlResponse[]; + finalChat: AgentChatReadProjection | null; + summary: 
ProbeRunSummary; + errors: ProbeRunError[]; +} + +interface SpecCreateOutput { + specId: number; +} + +interface ChatGetPrimaryOutput { + chatId: number; +} + +interface AgentChatReadProjection { + frontier: { state: string; turnId: number | null }; + turns: AgentChatTurn[]; + nextCommands?: AgentNextCommand[]; +} + +interface AgentChatTurn { + id: number; + question: string; + answer: string | null; + options?: AgentTurnOption[]; +} + +interface AgentTurnOption { + position: number; + content: string; +} + +interface AgentNextCommand { + capability: string; + input?: unknown; +} + +interface RunScriptedProbeOptions { + transport: JsonlTransport; + scenario: ScriptedProbeScenario; + scriptedAnswers: string[]; +} + +export async function runScriptedProbe({ + transport, + scenario, + scriptedAnswers, +}: RunScriptedProbeOptions): Promise { + const state: ProbeRunResult = { + scenario, + requests: [], + responses: [], + finalChat: null, + summary: { turnsAnswered: 0, finalFrontierState: null }, + errors: [], + }; + + const created = await sendExpectingOutput(state, transport, { + id: 'create', + capability: 'spec.create', + input: { name: scenario.specName }, + }); + if (!created) { + return state; + } + + const primary = await sendExpectingOutput(state, transport, { + id: 'primary', + capability: 'chat.getPrimary', + input: { specId: created.specId }, + }); + if (!primary) { + return state; + } + + for (let turnIndex = 0; turnIndex < 2; turnIndex += 1) { + const ready = await sendExpectingOutput(state, transport, { + id: `ready-${turnIndex + 1}`, + capability: 'chat.ensureReady', + input: { chatId: primary.chatId }, + }); + if (!ready) { + return state; + } + + const readyRead = await sendExpectingOutput(state, transport, { + id: `read-${turnIndex * 2 + 1}`, + capability: 'chat.read', + input: { chatId: primary.chatId }, + }); + if (!readyRead) { + return state; + } + state.finalChat = readyRead; + state.summary.finalFrontierState = readyRead.frontier.state; + + 
const activeTurn = getActiveTurn(readyRead); + if (!activeTurn) { + state.errors.push({ + requestId: `read-${turnIndex * 2 + 1}`, + capability: 'chat.read', + code: 'no_answerable_turn', + message: 'chat.read did not expose an awaiting-response frontier turn', + }); + return state; + } + + const submit = await sendExpectingOutput(state, transport, { + id: `answer-${turnIndex + 1}`, + capability: 'turn.submitResponse', + input: { + chatId: primary.chatId, + turnId: activeTurn.id, + response: buildScriptedResponse(activeTurn, scriptedAnswers[turnIndex]), + }, + }); + if (!submit) { + return state; + } + state.summary.turnsAnswered += 1; + + const afterAnswerRead = await sendExpectingOutput(state, transport, { + id: `read-${turnIndex * 2 + 2}`, + capability: 'chat.read', + input: { chatId: primary.chatId }, + }); + if (!afterAnswerRead) { + return state; + } + state.finalChat = afterAnswerRead; + state.summary.finalFrontierState = afterAnswerRead.frontier.state; + } + + return state; +} + +async function sendExpectingOutput( + state: ProbeRunResult, + transport: JsonlTransport, + request: ProbeJsonlRequest, +): Promise { + state.requests.push(request); + const response = await transport.send(request); + state.responses.push(response); + + if (!response.ok) { + state.errors.push({ + requestId: request.id, + capability: request.capability, + code: response.error.code, + message: response.error.message, + }); + return null; + } + + return response.output as T; +} + +function getActiveTurn(read: AgentChatReadProjection): AgentChatTurn | null { + if (read.frontier.state !== 'awaiting_response' || read.frontier.turnId === null) { + return null; + } + return read.turns.find((turn) => turn.id === read.frontier.turnId) ?? 
null; +} + +function buildScriptedResponse(turn: AgentChatTurn, scriptedAnswer: string | undefined) { + const firstOption = turn.options?.[0]; + if (firstOption) { + return { kind: 'select-options' as const, positions: [firstOption.position] }; + } + + return { + kind: 'free-text' as const, + freeText: scriptedAnswer?.trim() || `Scripted response to: ${turn.question}`, + }; +} From 0e7242cd358d496444355fd59d884131f5f6e0c9 Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Tue, 12 May 2026 10:35:39 +0200 Subject: [PATCH 10/42] FE-705: Add process-backed probe runner --- memory/CARDS.md | 3 +- src/server/probe-runner.test.ts | 188 +++++++++++++++++++++++++++++++- src/server/probe-runner.ts | 121 ++++++++++++++++++++ 3 files changed, 309 insertions(+), 3 deletions(-) diff --git a/memory/CARDS.md b/memory/CARDS.md index ef29625e..a9d4a632 100644 --- a/memory/CARDS.md +++ b/memory/CARDS.md @@ -45,7 +45,7 @@ A probe-runner core can drive the first two interview responses through an injec ## Card 8 — Process-backed temp-workspace proof runner -**Status:** queued +**Status:** done ### Target Behavior @@ -72,6 +72,7 @@ A local probe runner can launch the packaged `brunch agent` process in an isolat ✓ `probe-runner.test.ts` — process-backed runner uses an injected spawn/process adapter to write JSONL requests and parse JSONL responses. ✓ `probe-runner.test.ts` — a run creates an isolated workspace cwd and writes raw request/response JSONL, final `chat.read` projection, and run summary outside `.brunch/`. ✓ opt-in smoke command/documented invocation — when provider credentials and built package output are available, the runner reaches a second answerable frontier through `bin/brunch.js agent`. 
+ - Manual invocation shape for a future smoke wrapper: build first, then call `runProcessBackedProbe()` with the default command (`node bin/brunch.js agent`), an explicit `outputDir`, and a temp workspace created by the runner; this is intentionally not a CI command until provider credentials are controlled. ### Verification Approach diff --git a/src/server/probe-runner.test.ts b/src/server/probe-runner.test.ts index 53a6da05..8dfc1cee 100644 --- a/src/server/probe-runner.test.ts +++ b/src/server/probe-runner.test.ts @@ -1,8 +1,33 @@ -import { describe, expect, it } from 'vitest'; +import { mkdtempSync, readFileSync, rmSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; -import { runScriptedProbe, type JsonlTransport, type ProbeJsonlRequest } from './probe-runner.js'; +import { afterEach, describe, expect, it } from 'vitest'; + +import { + createProcessJsonlTransport, + runProcessBackedProbe, + runScriptedProbe, + type JsonlTransport, + type ProbeJsonlRequest, + type SpawnedJsonlProcess, +} from './probe-runner.js'; describe('probe runner', () => { + const tempDirs: string[] = []; + + afterEach(() => { + for (const dir of tempDirs.splice(0)) { + rmSync(dir, { recursive: true, force: true }); + } + }); + + function makeTempDir(prefix: string): string { + const dir = mkdtempSync(join(tmpdir(), prefix)); + tempDirs.push(dir); + return dir; + } + it('drives two interview responses through an injected JSONL transport', async () => { const requests: ProbeJsonlRequest[] = []; const transport: JsonlTransport = { @@ -137,6 +162,65 @@ describe('probe runner', () => { expect(result.errors).toEqual([]); }); + it('uses a process JSONL transport to write requests and parse responses', async () => { + const written: string[] = []; + let onStdoutData: ((chunk: string) => void) | null = null; + const process: SpawnedJsonlProcess = { + writeStdin(line) { + written.push(line); + const request = JSON.parse(line) as ProbeJsonlRequest; + 
onStdoutData?.( + `${JSON.stringify({ id: request.id, ok: true, output: { echoed: request.capability } })}\n`, + ); + }, + endStdin() {}, + onStdoutData(listener) { + onStdoutData = listener; + }, + }; + + const transport = createProcessJsonlTransport(process); + const response = await transport.send({ + id: 'create', + capability: 'spec.create', + input: { name: 'Probe' }, + }); + + expect(written).toEqual([ + JSON.stringify({ id: 'create', capability: 'spec.create', input: { name: 'Probe' } }), + ]); + expect(response).toEqual({ id: 'create', ok: true, output: { echoed: 'spec.create' } }); + }); + + it('creates an isolated workspace and writes minimal probe artifacts outside .brunch', async () => { + const outputDir = makeTempDir('brunch-probe-output-'); + const spawnedCwds: string[] = []; + + const result = await runProcessBackedProbe({ + scenario: { name: 'process-proof', specName: 'Process proof' }, + scriptedAnswers: ['A temp-workspace probe'], + outputDir, + spawnProcess({ cwd }) { + spawnedCwds.push(cwd); + return createFakeAgentProcess(); + }, + }); + + expect(result.summary).toMatchObject({ turnsAnswered: 2, finalFrontierState: 'answered' }); + expect(spawnedCwds).toHaveLength(1); + expect(spawnedCwds[0]).toContain('brunch-probe-workspace-'); + expect(outputDir).not.toContain(`${spawnedCwds[0]}/.brunch`); + + const rawJsonl = readFileSync(join(outputDir, 'raw-jsonl.ndjson'), 'utf8'); + const finalChat = JSON.parse(readFileSync(join(outputDir, 'final-chat.json'), 'utf8')) as unknown; + const summary = JSON.parse(readFileSync(join(outputDir, 'summary.json'), 'utf8')) as unknown; + + expect(rawJsonl).toContain('"direction":"request"'); + expect(rawJsonl).toContain('"direction":"response"'); + expect(finalChat).toMatchObject({ frontier: { state: 'answered' } }); + expect(summary).toMatchObject({ turnsAnswered: 2, finalFrontierState: 'answered' }); + }); + it('returns structured errors from failed JSONL responses', async () => { const transport: JsonlTransport 
= { async send(request) { @@ -168,3 +252,103 @@ describe('probe runner', () => { ]); }); }); + +function createFakeAgentProcess(): SpawnedJsonlProcess { + let onStdoutData: ((chunk: string) => void) | null = null; + + return { + writeStdin(line) { + const request = JSON.parse(line) as ProbeJsonlRequest; + const response = getFakeAgentResponse(request); + onStdoutData?.(`${JSON.stringify(response)}\n`); + }, + endStdin() {}, + onStdoutData(listener) { + onStdoutData = listener; + }, + }; +} + +function getFakeAgentResponse(request: ProbeJsonlRequest) { + if (request.capability === 'spec.create') { + return { id: request.id, ok: true, output: { specId: 1 } }; + } + if (request.capability === 'chat.getPrimary') { + return { + id: request.id, + ok: true, + output: { specId: 1, chatId: 10, kind: 'interview', activeTurnId: null }, + }; + } + if (request.capability === 'chat.ensureReady') { + const turnId = request.id === 'ready-1' ? 100 : 101; + return { + id: request.id, + ok: true, + output: { chatId: 10, specId: 1, state: 'awaiting_response', turnId }, + }; + } + if (request.id === 'read-1') { + return { + id: request.id, + ok: true, + output: { + frontier: { state: 'awaiting_response', phase: 'grounding', turnId: 100 }, + turns: [{ id: 100, question: 'What are you building?', answer: null, options: [] }], + nextCommands: [{ capability: 'turn.submitResponse', input: { chatId: 10, turnId: 100 } }], + }, + }; + } + if (request.id === 'read-2') { + return { + id: request.id, + ok: true, + output: { + frontier: { state: 'answered', phase: 'grounding', turnId: 100 }, + turns: [ + { id: 100, question: 'What are you building?', answer: 'A temp-workspace probe', options: [] }, + ], + nextCommands: [{ capability: 'chat.ensureReady', input: { chatId: 10 } }], + }, + }; + } + if (request.id === 'read-3') { + return { + id: request.id, + ok: true, + output: { + frontier: { state: 'awaiting_response', phase: 'grounding', turnId: 101 }, + turns: [ + { id: 100, question: 'What are 
you building?', answer: 'A temp-workspace probe', options: [] }, + { + id: 101, + question: 'What should be specified first?', + answer: null, + options: [{ id: 1, position: 0, content: 'Acceptance criteria' }], + }, + ], + nextCommands: [{ capability: 'turn.submitResponse', input: { chatId: 10, turnId: 101 } }], + }, + }; + } + if (request.id === 'read-4') { + return { + id: request.id, + ok: true, + output: { + frontier: { state: 'answered', phase: 'grounding', turnId: 101 }, + turns: [ + { id: 100, question: 'What are you building?', answer: 'A temp-workspace probe', options: [] }, + { + id: 101, + question: 'What should be specified first?', + answer: 'Acceptance criteria', + options: [], + }, + ], + nextCommands: [{ capability: 'chat.ensureReady', input: { chatId: 10 } }], + }, + }; + } + return { id: request.id, ok: true, output: { response: { ok: true } } }; +} diff --git a/src/server/probe-runner.ts b/src/server/probe-runner.ts index b3f4adf3..ce5fe0fd 100644 --- a/src/server/probe-runner.ts +++ b/src/server/probe-runner.ts @@ -1,3 +1,8 @@ +import { spawn } from 'node:child_process'; +import { mkdirSync, mkdtempSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join, resolve } from 'node:path'; + export interface ProbeJsonlRequest { id: string; capability: string; @@ -12,6 +17,23 @@ export interface JsonlTransport { send(request: ProbeJsonlRequest): Promise; } +export interface SpawnedJsonlProcess { + writeStdin(line: string): void; + endStdin(): void; + onStdoutData(listener: (chunk: string) => void): void; + onStderrData?(listener: (chunk: string) => void): void; + onExit?(listener: (code: number | null) => void): void; +} + +export interface ProbeProcessSpawnOptions { + cwd: string; + command: string; + args: string[]; + env: NodeJS.ProcessEnv; +} + +export type ProbeProcessSpawner = (options: ProbeProcessSpawnOptions) => SpawnedJsonlProcess; + export interface ScriptedProbeScenario { name: string; specName: string; @@ 
-75,6 +97,69 @@ interface RunScriptedProbeOptions { scriptedAnswers: string[]; } +export interface ProcessBackedProbeOptions { + scenario: ScriptedProbeScenario; + scriptedAnswers: string[]; + outputDir: string; + spawnProcess?: ProbeProcessSpawner; + command?: string; + args?: string[]; + env?: NodeJS.ProcessEnv; +} + +export async function runProcessBackedProbe({ + scenario, + scriptedAnswers, + outputDir, + spawnProcess = spawnBrunchAgentProcess, + command = process.execPath, + args = [resolve('bin/brunch.js'), 'agent'], + env = process.env, +}: ProcessBackedProbeOptions): Promise { + const workspaceCwd = mkdtempSync(join(tmpdir(), 'brunch-probe-workspace-')); + const spawned = spawnProcess({ cwd: workspaceCwd, command, args, env }); + const transport = createProcessJsonlTransport(spawned); + + try { + const result = await runScriptedProbe({ transport, scenario, scriptedAnswers }); + writeProbeArtifacts(outputDir, result); + return result; + } finally { + spawned.endStdin(); + } +} + +export function createProcessJsonlTransport(process: SpawnedJsonlProcess): JsonlTransport { + let buffer = ''; + const pending = new Map void>(); + + process.onStdoutData((chunk) => { + buffer += chunk; + let newlineIndex = buffer.indexOf('\n'); + while (newlineIndex >= 0) { + const line = buffer.slice(0, newlineIndex).trim(); + buffer = buffer.slice(newlineIndex + 1); + if (line !== '') { + const response = JSON.parse(line) as ProbeJsonlResponse; + if (response.id) { + pending.get(response.id)?.(response); + pending.delete(response.id); + } + } + newlineIndex = buffer.indexOf('\n'); + } + }); + + return { + send(request) { + return new Promise((resolveResponse) => { + pending.set(request.id, resolveResponse); + process.writeStdin(JSON.stringify(request)); + }); + }, + }; +} + export async function runScriptedProbe({ transport, scenario, @@ -208,3 +293,39 @@ function buildScriptedResponse(turn: AgentChatTurn, scriptedAnswer: string | und freeText: scriptedAnswer?.trim() || 
`Scripted response to: ${turn.question}`, }; } + +function writeProbeArtifacts(outputDir: string, result: ProbeRunResult): void { + mkdirSync(outputDir, { recursive: true }); + const rawJsonl = result.requests + .flatMap((request, index) => [ + { direction: 'request', payload: request }, + { direction: 'response', payload: result.responses[index] ?? null }, + ]) + .map((entry) => JSON.stringify(entry)) + .join('\n'); + + writeFileSync(join(outputDir, 'raw-jsonl.ndjson'), `${rawJsonl}\n`); + writeFileSync(join(outputDir, 'final-chat.json'), `${JSON.stringify(result.finalChat, null, 2)}\n`); + writeFileSync(join(outputDir, 'summary.json'), `${JSON.stringify(result.summary, null, 2)}\n`); +} + +function spawnBrunchAgentProcess({ cwd, command, args, env }: ProbeProcessSpawnOptions): SpawnedJsonlProcess { + const child = spawn(command, args, { cwd, env, stdio: ['pipe', 'pipe', 'pipe'] }); + return { + writeStdin(line) { + child.stdin.write(`${line}\n`); + }, + endStdin() { + child.stdin.end(); + }, + onStdoutData(listener) { + child.stdout.on('data', (chunk) => listener(chunk.toString())); + }, + onStderrData(listener) { + child.stderr.on('data', (chunk) => listener(chunk.toString())); + }, + onExit(listener) { + child.on('exit', listener); + }, + }; +} From ef3b2eabb2958c84eff582f7204c0b82dcbcbe9c Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Tue, 12 May 2026 10:41:01 +0200 Subject: [PATCH 11/42] FE-705: Harden probe artifacts --- memory/CARDS.md | 2 +- src/server/probe-runner.test.ts | 60 ++++++++++++++++- src/server/probe-runner.ts | 110 ++++++++++++++++++++++++++------ 3 files changed, 151 insertions(+), 21 deletions(-) diff --git a/memory/CARDS.md b/memory/CARDS.md index a9d4a632..2ab6a5ef 100644 --- a/memory/CARDS.md +++ b/memory/CARDS.md @@ -81,7 +81,7 @@ A local probe runner can launch the packaged `brunch agent` process in an isolat ## Card 9 — Probe artifact schema and safe summaries -**Status:** queued +**Status:** done ### Objective diff --git 
a/src/server/probe-runner.test.ts b/src/server/probe-runner.test.ts index 8dfc1cee..ea6d9180 100644 --- a/src/server/probe-runner.test.ts +++ b/src/server/probe-runner.test.ts @@ -214,11 +214,69 @@ describe('probe runner', () => { const rawJsonl = readFileSync(join(outputDir, 'raw-jsonl.ndjson'), 'utf8'); const finalChat = JSON.parse(readFileSync(join(outputDir, 'final-chat.json'), 'utf8')) as unknown; const summary = JSON.parse(readFileSync(join(outputDir, 'summary.json'), 'utf8')) as unknown; + const bundle = JSON.parse(readFileSync(join(outputDir, 'artifact-bundle.json'), 'utf8')) as unknown; expect(rawJsonl).toContain('"direction":"request"'); expect(rawJsonl).toContain('"direction":"response"'); expect(finalChat).toMatchObject({ frontier: { state: 'answered' } }); - expect(summary).toMatchObject({ turnsAnswered: 2, finalFrontierState: 'answered' }); + expect(summary).toMatchObject({ + turnsAnswered: 2, + finalFrontierState: 'answered', + questionAnswers: [ + { question: 'What are you building?', answer: 'A temp-workspace probe' }, + { question: 'What should be specified first?', answer: 'Acceptance criteria' }, + ], + }); + expect(bundle).toMatchObject({ + schemaVersion: 1, + scenario: { name: 'process-proof', brief: null }, + commandSequence: expect.arrayContaining(['spec.create', 'chat.getPrimary', 'chat.ensureReady']), + environment: { platform: process.platform, arch: process.arch }, + }); + }); + + it('redacts secret-like failure summaries without provider stack dumps', async () => { + const transport: JsonlTransport = { + async send(request) { + if (request.capability === 'spec.create') { + return { id: request.id, ok: true, output: { specId: 1 } }; + } + return { + id: request.id, + ok: false, + error: { + code: 'handler_failed', + message: + 'Provider failed with ANTHROPIC_API_KEY=sk-ant-secret-value\n at internal/provider.ts:1', + }, + }; + }, + }; + + const result = await runScriptedProbe({ + transport, + scenario: { name: 'redaction', specName: 
'Redaction proof', brief: 'check safe artifacts' }, + scriptedAnswers: [], + }); + + expect(result.errors).toEqual([ + { + requestId: 'primary', + capability: 'chat.getPrimary', + code: 'handler_failed', + message: 'Provider failed with ANTHROPIC_API_KEY=[redacted]', + }, + ]); + expect(result.summary).toMatchObject({ + errors: [ + { + requestId: 'primary', + capability: 'chat.getPrimary', + code: 'handler_failed', + message: 'Provider failed with ANTHROPIC_API_KEY=[redacted]', + }, + ], + }); }); it('returns structured errors from failed JSONL responses', async () => { diff --git a/src/server/probe-runner.ts b/src/server/probe-runner.ts index ce5fe0fd..f314894a 100644 --- a/src/server/probe-runner.ts +++ b/src/server/probe-runner.ts @@ -37,6 +37,7 @@ export type ProbeProcessSpawner = (options: ProbeProcessSpawnOptions) => Spawned export interface ScriptedProbeScenario { name: string; specName: string; + brief?: string; } export interface ProbeRunError { @@ -46,9 +47,32 @@ export interface ProbeRunError { message: string; } +export interface ProbeQuestionAnswer { + question: string; + answer: string; +} + export interface ProbeRunSummary { turnsAnswered: number; finalFrontierState: string | null; + durationMs: number; + questionAnswers: ProbeQuestionAnswer[]; + errors: ProbeRunError[]; +} + +export interface ProbeArtifactBundle { + schemaVersion: 1; + scenario: { name: string; brief: string | null; specName: string }; + commandSequence: string[]; + rawJsonlTranscript: Array<{ + direction: 'request' | 'response'; + payload: ProbeJsonlRequest | ProbeJsonlResponse | null; + }>; + parsedEvents: Array<{ index: number; request: ProbeJsonlRequest; response: ProbeJsonlResponse | null }>; + finalChat: AgentChatReadProjection | null; + summary: ProbeRunSummary; + errors: ProbeRunError[]; + environment: { nodeVersion: string; platform: NodeJS.Platform; arch: string }; } export interface ProbeRunResult { @@ -165,12 +189,13 @@ export async function runScriptedProbe({ scenario, 
scriptedAnswers, }: RunScriptedProbeOptions): Promise { + const startedAt = Date.now(); const state: ProbeRunResult = { scenario, requests: [], responses: [], finalChat: null, - summary: { turnsAnswered: 0, finalFrontierState: null }, + summary: { turnsAnswered: 0, finalFrontierState: null, durationMs: 0, questionAnswers: [], errors: [] }, errors: [], }; @@ -180,7 +205,7 @@ export async function runScriptedProbe({ input: { name: scenario.specName }, }); if (!created) { - return state; + return finishRun(state, startedAt); } const primary = await sendExpectingOutput(state, transport, { @@ -189,7 +214,7 @@ export async function runScriptedProbe({ input: { specId: created.specId }, }); if (!primary) { - return state; + return finishRun(state, startedAt); } for (let turnIndex = 0; turnIndex < 2; turnIndex += 1) { @@ -199,7 +224,7 @@ export async function runScriptedProbe({ input: { chatId: primary.chatId }, }); if (!ready) { - return state; + return finishRun(state, startedAt); } const readyRead = await sendExpectingOutput(state, transport, { @@ -208,7 +233,7 @@ export async function runScriptedProbe({ input: { chatId: primary.chatId }, }); if (!readyRead) { - return state; + return finishRun(state, startedAt); } state.finalChat = readyRead; state.summary.finalFrontierState = readyRead.frontier.state; @@ -221,7 +246,7 @@ export async function runScriptedProbe({ code: 'no_answerable_turn', message: 'chat.read did not expose an awaiting-response frontier turn', }); - return state; + return finishRun(state, startedAt); } const submit = await sendExpectingOutput(state, transport, { @@ -234,7 +259,7 @@ export async function runScriptedProbe({ }, }); if (!submit) { - return state; + return finishRun(state, startedAt); } state.summary.turnsAnswered += 1; @@ -244,13 +269,13 @@ export async function runScriptedProbe({ input: { chatId: primary.chatId }, }); if (!afterAnswerRead) { - return state; + return finishRun(state, startedAt); } state.finalChat = afterAnswerRead; 
state.summary.finalFrontierState = afterAnswerRead.frontier.state; } - return state; + return finishRun(state, startedAt); } async function sendExpectingOutput( @@ -267,7 +292,7 @@ async function sendExpectingOutput( requestId: request.id, capability: request.capability, code: response.error.code, - message: response.error.message, + message: sanitizeProbeErrorMessage(response.error.message), }); return null; } @@ -294,19 +319,66 @@ function buildScriptedResponse(turn: AgentChatTurn, scriptedAnswer: string | und }; } +function finishRun(state: ProbeRunResult, startedAt: number): ProbeRunResult { + state.summary.durationMs = Date.now() - startedAt; + state.summary.errors = state.errors; + state.summary.questionAnswers = extractQuestionAnswers(state.finalChat); + return state; +} + +function extractQuestionAnswers(finalChat: AgentChatReadProjection | null): ProbeQuestionAnswer[] { + return ( + finalChat?.turns + .filter((turn) => turn.answer !== null) + .map((turn) => ({ question: turn.question, answer: turn.answer ?? '' })) ?? [] + ); +} + +function sanitizeProbeErrorMessage(message: string): string { + return message + .split('\n')[0] + .replace(/(ANTHROPIC_API_KEY=)[^\s]+/gi, '$1[redacted]') + .replace(/(OPENAI_API_KEY=)[^\s]+/gi, '$1[redacted]') + .replace(/sk-[a-z0-9_-]+/gi, '[redacted]') + .slice(0, 300); +} + +export function buildProbeArtifactBundle(result: ProbeRunResult): ProbeArtifactBundle { + const rawJsonlTranscript = result.requests.flatMap((request, index) => [ + { direction: 'request' as const, payload: request }, + { direction: 'response' as const, payload: result.responses[index] ?? null }, + ]); + + return { + schemaVersion: 1, + scenario: { + name: result.scenario.name, + brief: result.scenario.brief ?? 
null, + specName: result.scenario.specName, + }, + commandSequence: result.requests.map((request) => request.capability), + rawJsonlTranscript, + parsedEvents: result.requests.map((request, index) => ({ + index, + request, + response: result.responses[index] ?? null, + })), + finalChat: result.finalChat, + summary: result.summary, + errors: result.errors, + environment: { nodeVersion: process.version, platform: process.platform, arch: process.arch }, + }; +} + function writeProbeArtifacts(outputDir: string, result: ProbeRunResult): void { mkdirSync(outputDir, { recursive: true }); - const rawJsonl = result.requests - .flatMap((request, index) => [ - { direction: 'request', payload: request }, - { direction: 'response', payload: result.responses[index] ?? null }, - ]) - .map((entry) => JSON.stringify(entry)) - .join('\n'); + const bundle = buildProbeArtifactBundle(result); + const rawJsonl = bundle.rawJsonlTranscript.map((entry) => JSON.stringify(entry)).join('\n'); + writeFileSync(join(outputDir, 'artifact-bundle.json'), `${JSON.stringify(bundle, null, 2)}\n`); writeFileSync(join(outputDir, 'raw-jsonl.ndjson'), `${rawJsonl}\n`); - writeFileSync(join(outputDir, 'final-chat.json'), `${JSON.stringify(result.finalChat, null, 2)}\n`); - writeFileSync(join(outputDir, 'summary.json'), `${JSON.stringify(result.summary, null, 2)}\n`); + writeFileSync(join(outputDir, 'final-chat.json'), `${JSON.stringify(bundle.finalChat, null, 2)}\n`); + writeFileSync(join(outputDir, 'summary.json'), `${JSON.stringify(bundle.summary, null, 2)}\n`); } function spawnBrunchAgentProcess({ cwd, command, args, env }: ProbeProcessSpawnOptions): SpawnedJsonlProcess { From 58076e1b4eb3b0e737c75c62287b941f8cdd0491 Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Tue, 12 May 2026 10:42:35 +0200 Subject: [PATCH 12/42] FE-705: Guard probe runner imports --- memory/CARDS.md | 143 -------------------------------- src/server/probe-runner.test.ts | 18 ++++ 2 files changed, 18 insertions(+), 143 deletions(-) 
delete mode 100644 memory/CARDS.md diff --git a/memory/CARDS.md b/memory/CARDS.md deleted file mode 100644 index 2ab6a5ef..00000000 --- a/memory/CARDS.md +++ /dev/null @@ -1,143 +0,0 @@ -# FE-705 scope cards — proof-of-life JSONL probe runner - -> Prepared by `ln-scope` on 2026-05-11. These cards stay under the existing FE-705 frontier item and branch (`ln/fe-705-agent-capability-cli`). They are sub-slices, not new Linear issues or branches. Do not add `turn.get`, phase closure/export, or LLM-as-user until this queue proves where the runner bottleneck is. - -## Orientation - -- Containing seam: FE-705 agent capability CLI / probe-runner seam governed by `memory/SPEC.md` Requirement 43, A89, D147, and I114. -- Relevant frontier item: `memory/PLAN.md` Next item 2, **Agent capability CLI + LLM-as-user fixture probe**; the JSONL adapter is working through two real-provider turns, and the next proof is an external runner over JSONL. -- Volatile handoff state: `HANDOFF.md` says no card is live; Card 6 passed `npm run verify` and a manual Anthropic-backed temp-workspace JSONL smoke reached a second answerable frontier. -- Main open risk: the probe runner must become an external client with reviewable artifacts without quietly importing DB/capability handlers or growing into speculative LLM-as-user / phase-export scope. - -## Card 7 — Probe runner JSONL client and scripted user policy - -**Status:** done - -### Target Behavior - -A probe-runner core can drive the first two interview responses through an injected JSONL transport using only `chat.read` projections and deterministic scripted answers. 
- -### Boundary Crossings - -```text -→ probe runner scenario command sequence -→ JSONL client / injected transport boundary -→ scripted user response policy -→ parsed Brunch read projection / response request payloads -``` - -### Risks and Assumptions - -- RISK: The runner accidentally couples to current `chat.read` object internals instead of the agent-facing projection contract → MITIGATION: centralize projection parsing in a tiny typed client module and keep policy tests fixture-shaped around observable `frontier`, `turns`, `options`, and `nextCommands` fields. -- RISK: Deterministic scripted answers cannot handle option-bearing turns → MITIGATION: support both free-text answers and option-position selection from the first implementation, with tests covering both response shapes. -- ASSUMPTION: `chat.read` is broad enough for a first scripted runner without adding `turn.get` → VALIDATE: tests construct payloads from `chat.read` alone and no card in this queue adds `turn.get` → `memory/SPEC.md` §Assumptions A89. - -### Acceptance Criteria - -✓ `probe-runner.test.ts` — a fake JSONL transport receives `spec.create → chat.getPrimary → chat.ensureReady → chat.read → turn.submitResponse → chat.read → chat.ensureReady → chat.read` in order. -✓ `probe-runner.test.ts` — the scripted policy submits free-text for an open question and an option selection payload when `chat.read` exposes options. -✓ `probe-runner.test.ts` — the runner reports `turnsAnswered: 2`, final frontier state, and structured errors without importing Brunch DB or capability dispatch modules. - -### Verification Approach - -- Inner: unit / fake-transport interaction oracle — proves sequencing, response construction, and error propagation without a provider. -- Middle: import-shape/code-boundary oracle — confirms the runner core is client-side over JSONL concepts, not a DB/handler caller. 
- -## Card 8 — Process-backed temp-workspace proof runner - -**Status:** done - -### Target Behavior - -A local probe runner can launch the packaged `brunch agent` process in an isolated temp workspace and persist a minimal proof-of-life artifact bundle. - -### Boundary Crossings - -```text -→ probe runner process adapter -→ child process stdin/stdout JSONL session -→ packaged Brunch CLI agent command -→ temp workspace `.brunch/` runtime state -→ probe output artifact directory outside `.brunch/` -``` - -### Risks and Assumptions - -- RISK: Real-provider availability makes automated tests flaky → MITIGATION: test process plumbing with a fake child process or injected spawn adapter, and keep compiled CLI / Anthropic smoke opt-in unless credentials are present. -- RISK: Temp workspace cleanup or output paths leak `.brunch/` internals into curated artifacts → MITIGATION: write artifacts to an explicit output directory outside the temp workspace state directory and assert artifact paths do not point inside `.brunch/`. -- ASSUMPTION: The packaged `bin/brunch.js agent` remains the right process boundary for proof-of-life use → VALIDATE: an opt-in/manual smoke command launches that binary in a temp cwd and reaches a second answerable frontier → `memory/SPEC.md` §Assumptions A89. - -### Acceptance Criteria - -✓ `probe-runner.test.ts` — process-backed runner uses an injected spawn/process adapter to write JSONL requests and parse JSONL responses. -✓ `probe-runner.test.ts` — a run creates an isolated workspace cwd and writes raw request/response JSONL, final `chat.read` projection, and run summary outside `.brunch/`. -✓ opt-in smoke command/documented invocation — when provider credentials and built package output are available, the runner reaches a second answerable frontier through `bin/brunch.js agent`. 
- - Manual invocation shape for a future smoke wrapper: build first, then call `runProcessBackedProbe()` with the default command (`node bin/brunch.js agent`), an explicit `outputDir`, and a temp workspace created by the runner; this is intentionally not a CI command until provider credentials are controlled. - -### Verification Approach - -- Inner: fake child-process oracle — proves process lifecycle, JSONL parsing, artifact paths, and cleanup semantics deterministically. -- Middle: opt-in real-provider smoke — proves the external process boundary against the compiled CLI without making CI depend on credentials. - -## Card 9 — Probe artifact schema and safe summaries - -**Status:** done - -### Objective - -Probe runs produce deterministic, reviewable artifact bundles that are safe to inspect, compare, and eventually curate into fixture candidates. - -### Acceptance Criteria - -✓ Artifact schema records scenario name/brief, command sequence, raw JSONL transcript, parsed events, final chat projection, run summary, errors, duration, and non-secret environment metadata. -✓ Failure summaries redact API-key-like values and avoid provider stack/internal object dumps while preserving useful failure class and message context. -✓ Successful summaries identify final frontier state and compact question/answer pairs from the driven turns. -✓ Tests cover successful and failed artifact rendering without launching a real provider. - -### Verification Approach - -- Inner: artifact serialization/redaction tests. -- Middle: snapshot-like deterministic summary oracle for success and failure bundles. - -### Promotion checklist - -- [ ] Does this change a requirement? -- [ ] Does this create, retire, or invalidate an assumption? -- [ ] Does this make or reverse a non-trivial design decision? -- [ ] Does this establish a new seam-level invariant? -- [ ] Does it cross more than two major seams? -- [ ] Is this the first touch in an unfamiliar seam from a fresh thread? 
-- [ ] Can you not name the containing seam or current rationale from the live docs? - -Result: stays light. This hardens artifacts inside the already-established FE-705 probe-runner seam and does not change durable product requirements or capability authority. - -## Card 10 — Probe runner import-boundary guard - -**Status:** queued - -### Objective - -The probe-runner module boundary is mechanically guarded so runner code can only exercise Brunch through the JSONL client/process wrapper. - -### Acceptance Criteria - -✓ Boundary test fails if probe-runner modules import `src/server/db`, `src/server/capabilities`, `src/server/capability-registry`, server handlers, or ORM schema modules directly. -✓ Allowed imports are documented in the test or module boundary helper: Node process/fs/path utilities, probe-runner private modules, and the JSONL client/process wrapper surface. -✓ Existing capability and JSONL tests continue to prove server-owned handlers remain the mutation authority. - -### Verification Approach - -- Inner: static import-boundary test plus existing capability / agent-jsonl unit tests. -- Middle: `npm run verify` gate. - -### Promotion checklist - -- [ ] Does this change a requirement? -- [ ] Does this create, retire, or invalidate an assumption? -- [ ] Does this make or reverse a non-trivial design decision? -- [ ] Does this establish a new seam-level invariant? -- [ ] Does it cross more than two major seams? -- [ ] Is this the first touch in an unfamiliar seam from a fresh thread? -- [ ] Can you not name the containing seam or current rationale from the live docs? - -Result: stays light. It enforces I114 for the new runner code without changing the I114 invariant itself. 
diff --git a/src/server/probe-runner.test.ts b/src/server/probe-runner.test.ts index ea6d9180..4f52a89a 100644 --- a/src/server/probe-runner.test.ts +++ b/src/server/probe-runner.test.ts @@ -279,6 +279,24 @@ describe('probe runner', () => { }); }); + it('guards the probe-runner import boundary from server mutation authority modules', () => { + const source = readFileSync(new URL('./probe-runner.ts', import.meta.url), 'utf8'); + const forbiddenImports = [ + './db.js', + './capabilities.js', + './capability-registry.js', + './schema.js', + './core.js', + './chat-route-transition.js', + './turn-response-transition.js', + ]; + + for (const forbiddenImport of forbiddenImports) { + expect(source).not.toContain(`from '${forbiddenImport}'`); + expect(source).not.toContain(`from "${forbiddenImport}"`); + } + }); + it('returns structured errors from failed JSONL responses', async () => { const transport: JsonlTransport = { async send(request) { From 068839140d7509d2568eaceae6fd3a38f9f385ed Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Tue, 12 May 2026 10:59:19 +0200 Subject: [PATCH 13/42] add the d3k skill, as potential better solution than agent-tail --- .agents/skills/d3k/SKILL.md | 145 ++++++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 .agents/skills/d3k/SKILL.md diff --git a/.agents/skills/d3k/SKILL.md b/.agents/skills/d3k/SKILL.md new file mode 100644 index 00000000..4407f0e5 --- /dev/null +++ b/.agents/skills/d3k/SKILL.md @@ -0,0 +1,145 @@ +--- +name: "d3k" +description: "d3k assistant for debugging web apps" +--- + +# d3k Commands + +d3k captures browser and server logs in a unified log file. 
Use these commands: + +## Viewing Errors and Logs + +```bash +d3k errors # Show recent errors (browser + server combined) +d3k errors --context # Show errors + user actions that preceded them +d3k errors -n 20 # Show last 20 errors + +d3k logs # Show recent logs (browser + server combined) +d3k logs --type browser # Browser logs only +d3k logs --type server # Server logs only +``` + +## Other Commands + +```bash +d3k fix # Deep analysis of application errors +d3k fix --focus build # Focus on build errors + +d3k crawl # Discover app URLs +d3k crawl --depth all # Exhaustive crawl +``` + +## Browser Interaction + +`d3k agent-browser` auto-connects to the active session's browser via CDP: + +```bash +d3k agent-browser open http://localhost:3000/page +d3k agent-browser snapshot -i # Get element refs (@e1, @e2) +d3k agent-browser click @e2 +d3k agent-browser fill @e3 "text" +d3k agent-browser screenshot /tmp/shot.png +``` + +To target a different browser, run `d3k agent-browser connect ` first. + +## Codex Fresh Browser/Profile Startup + +Use this workflow when the user asks Codex to start d3k with a fresh browser/profile. + +1. Close any stale `agent-browser` daemon before launching with `--profile`. Otherwise `agent-browser` will reuse the existing daemon and print `--profile ignored`. + ```bash + d3k agent-browser close --all + ``` + +2. Start the app through d3k in `servers-only` mode and keep that command running. In Codex, this is more reliable than asking d3k to launch the browser itself when a fresh profile is required. + ```bash + d3k --no-agent --no-skills --servers-only --command "npm run dev -- -H 127.0.0.1 -p 3000" --port 3000 --startup-timeout 90 --no-tui + ``` + + Adjust the package-manager command and port for the project. Prefer `--command` over `--script` when passing framework flags. For npm scripts, put flags after `--`; otherwise tools like Next.js can interpret the port as a project directory. + +3. 
Verify the server before opening more browser windows: + ```bash + curl -I http://127.0.0.1:3000 + ``` + +4. Open the fresh profile as a separate browser step: + ```bash + d3k agent-browser --profile /tmp/d3k-fresh-profile --headed open http://127.0.0.1:3000 + ``` + +5. Sanity-check the opened page: + ```bash + d3k agent-browser get title + d3k agent-browser snapshot -i + d3k errors + ``` + +Practical rules: + +- Prefer `127.0.0.1` for this workflow. If `localhost` hangs or flips between IPv4/IPv6 behavior, do not keep retrying browser launches. +- If `curl -I` hangs, the server is wedged even if the port appears occupied; restart the d3k server process before opening a browser. +- In `servers-only` mode there is no d3k-monitored CDP browser. Use regular `d3k agent-browser` commands, not `d3k cdp-port`. +- In sandboxed agent environments, rerun local-network checks and `agent-browser` opens outside the sandbox when sandbox networking blocks access to `127.0.0.1`. + +## Browser Tool Choice + +Use `agent-browser` for browser work. + +Practical rule: + +- Need to drive the same monitored browser session: use `agent-browser`. +- Examples: + +```bash +d3k agent-browser snapshot -i +d3k agent-browser click @e2 +``` + +To make d3k prefer one locally when it launches helper browser commands, use: + +```bash +d3k --browser-tool agent-browser +``` + +## Fix Workflow + +1. `d3k errors --context` - See errors and what triggered them +2. Fix the code +3. `d3k agent-browser open ` then `d3k agent-browser click @e1` to replay +4. `d3k errors` - Verify fix worked + +## Creating PRs with Before/After Screenshots + +When creating a PR for visual changes, **always capture before/after screenshots** to show the impact: + +1. **Before making changes**, screenshot the production site: + ```bash + d3k agent-browser open https://production-url.com/affected-page + d3k agent-browser screenshot /tmp/before.png + ``` + +2. 
**After making changes**, screenshot localhost: + ```bash + d3k agent-browser open http://localhost:3000/affected-page + d3k agent-browser screenshot /tmp/after.png + ``` + +3. **Or use the tooling API** to capture multiple routes at once: + ``` + capture_before_after_screenshots( + productionUrl: "https://myapp.vercel.app", + routes: ["/", "/about", "/contact"] + ) + ``` + +4. **Include in PR description** using markdown: + ```markdown + ### Visual Comparison + | Route | Before | After | + |-------|--------|-------| + | `/` | ![Before](before.png) | ![After](after.png) | + ``` + + Upload screenshots by dragging them into the GitHub PR description. From 5ff5d03b69f1817ad59ae898bfb6911c620bc71b Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Tue, 12 May 2026 11:03:28 +0200 Subject: [PATCH 14/42] Move probe runner to scripts harness --- memory/CARDS.md | 236 ++++++++++++++++++ package.json | 8 +- .../agent-probes}/probe-runner.test.ts | 25 +- .../agent-probes}/probe-runner.ts | 0 tsconfig.json | 2 +- vite.config.ts | 2 +- 6 files changed, 258 insertions(+), 15 deletions(-) create mode 100644 memory/CARDS.md rename {src/server => scripts/agent-probes}/probe-runner.test.ts (95%) rename {src/server => scripts/agent-probes}/probe-runner.ts (100%) diff --git a/memory/CARDS.md b/memory/CARDS.md new file mode 100644 index 00000000..87045632 --- /dev/null +++ b/memory/CARDS.md @@ -0,0 +1,236 @@ +# FE-705 scope cards — fixture-capable LLM-as-user probe path + +> Prepared by `ln-scope` on 2026-05-12 and revised after the packaged-boundary proof. These cards stay under the existing FE-705 frontier item and branch (`ln/fe-705-agent-capability-cli`). They are sub-slices, not new Linear issues or branches. The goal is a tracer-bullet path from external JSONL runner → preserved local DB fixture candidate → minimal model-backed user simulation, without adding phase closure/export or changing product UI. 
+ +## Orientation for a new thread + +- Start by reading `memory/SPEC.md`, `memory/PLAN.md`, and this file. There is currently no `HANDOFF.md`. +- Containing seam: FE-705 agent capability CLI / external probe-runner seam governed by `memory/SPEC.md` Requirement 43, A89, D147, and I114. +- Relevant frontier item: `memory/PLAN.md` Next item 2, **Agent capability CLI + LLM-as-user fixture probe**. Keep all cards on branch `ln/fe-705-agent-capability-cli`; do not create a new Linear issue or Graphite branch for these sub-slices. +- Current repo state at scoping time: branch is ahead of origin with four FE-705 probe-runner commits; only known unrelated dirty state is untracked `.agents/skills/d3k/`, which should be left alone. +- What has already been proved: `scripts/agent-probes/probe-runner.ts` contains a scripted process-backed runner, tests, artifact bundle writing, redaction, and an import-boundary guard. A manual packaged-boundary smoke built the app, drove `node bin/brunch.js agent` through two real-provider turns, and wrote artifacts at `/tmp/brunch-probe-artifacts-9FQyPB`. +- Main open risk: probe / LLM-as-user / fixture-candidate code must stay clearly outside Brunch product runtime and mutation authority. The next card preserves fixture state without making it product runtime state. + +## Layering decision for this queue + +Treat `brunch agent` itself as product/runtime code, but treat the probe runner and fixture generator as **development harness** code. 
+ +- Keep in `src/server/`: + - `agent-jsonl.ts` + - `capabilities.ts` + - capability registry / DB / product mutation handlers +- Move out of `src/server/`: + - `probe-runner.ts` + - `probe-runner.test.ts` + - future LLM-as-user simulator + - future fixture-candidate helpers +- Target location: + +```text +scripts/agent-probes/ + probe-runner.ts + probe-runner.test.ts + llm-user.ts # later card, if useful + fixture-candidate.ts # later card, if useful +``` + +- Tooling must cover `scripts/` so this harness remains linted/formatted/tested. Update `package.json` scripts as needed so `npm run fix`, `npm run check`, and `npm run verify` include `scripts/`. +- Boundary rule: `scripts/agent-probes/**` may spawn `node bin/brunch.js agent`, use Node filesystem/process utilities, and import narrow shared request schemas if necessary, but must not import Brunch DB, capability dispatch/registry, ORM schema, core workflow handlers, route-transition handlers, or turn-response transition handlers. + +## Card 11 — Move probe runner to scripts harness boundary + +**Status:** done + +### Objective + +The probe runner lives under `scripts/agent-probes/` as development harness code while remaining covered by project lint/format/test tooling and protected from product mutation-authority imports. + +### Acceptance Criteria + +✓ `src/server/probe-runner.ts` and `src/server/probe-runner.test.ts` are moved to `scripts/agent-probes/probe-runner.ts` and `scripts/agent-probes/probe-runner.test.ts` or an equivalent `scripts/agent-probes/` mini-library shape. +✓ `package.json` `fmt`, `fmt:check`, `lint`, and `lint:fix` include `scripts/` so the moved harness remains in the normal `npm run fix`, `npm run check`, and `npm run verify` gates. +✓ The moved tests still pass and continue proving scripted JSONL transport, process-backed runner, artifact bundle writing, redaction, and import-boundary behavior. 
+✓ The import-boundary test is updated for `scripts/agent-probes/**` and forbids imports from `src/server/db`, `src/server/capabilities`, `src/server/capability-registry`, `src/server/schema`, `src/server/core`, `src/server/chat-route-transition`, and `src/server/turn-response-transition`. +✓ Any manual smoke snippets or comments refer to importing from `./scripts/agent-probes/probe-runner.ts`, not `./src/server/probe-runner.ts`. + +### Verification Approach + +- Inner: `npm run test -- scripts/agent-probes/probe-runner.test.ts` (or the moved test path) plus the static import-boundary test. +- Gate: `npm run verify` to prove scripts are included in check/test/build and the product runtime still builds without bundling the harness as server code. + +### Promotion checklist + +- [ ] Does this change a requirement? +- [ ] Does this create, retire, or invalidate an assumption? +- [ ] Does this make or reverse a non-trivial design decision? +- [ ] Does this establish a new seam-level invariant? +- [ ] Does it cross more than two major seams? +- [ ] Is this the first touch in an unfamiliar seam from a fresh thread? +- [ ] Can you not name the containing seam or current rationale from the live docs? + +Result: stays light. This aligns file placement with the existing FE-705 decision that probe artifacts and LLM-as-user scenarios belong to an external runner, while `brunch agent` remains the product JSONL adapter. + +## Card 12 — Preserve probe workspace state for fixture candidates + +**Status:** next + +### Objective + +Process-backed probe runs can optionally preserve the temp workspace state that contains the real `.brunch` SQLite database alongside the review artifacts. + +### Acceptance Criteria + +✓ `scripts/agent-probes/probe-runner.test.ts` — `runProcessBackedProbe()` records the temp `workspaceCwd` in the artifact bundle and run result without exposing it as ambient selected product state. 
+✓ `scripts/agent-probes/probe-runner.test.ts` — when fixture preservation is enabled, the runner copies the workspace `.brunch/` directory or database file into the output artifact directory under a stable `workspace-state/` path. +✓ `scripts/agent-probes/probe-runner.test.ts` — when fixture preservation is disabled, existing minimal artifacts still write without copying `.brunch/` state. +✓ The copied fixture state is outside the live temp workspace and can survive temp workspace cleanup. + +### Verification Approach + +- Inner: fake process / filesystem oracle in `scripts/agent-probes/probe-runner.test.ts` for workspace path metadata, fixture copy behavior, and disabled-by-default compatibility. +- Middle: manual packaged-boundary smoke can inspect the copied SQLite fixture candidate after `npm run build` when provider credentials are present. + +### Promotion checklist + +- [ ] Does this change a requirement? +- [ ] Does this create, retire, or invalidate an assumption? +- [ ] Does this make or reverse a non-trivial design decision? +- [ ] Does this establish a new seam-level invariant? +- [ ] Does it cross more than two major seams? +- [ ] Is this the first touch in an unfamiliar seam from a fresh thread? +- [ ] Can you not name the containing seam or current rationale from the live docs? + +Result: stays light. This preserves evidence inside the already-established external probe-runner seam and does not change Brunch product persistence semantics. + +## Card 13 — User-simulator policy interface + +**Status:** queued + +### Objective + +The probe runner can obtain turn responses through an injected user-simulator policy instead of only through positional scripted answers. + +### Acceptance Criteria + +✓ `scripts/agent-probes/probe-runner.test.ts` — `runScriptedProbe()` or its successor accepts an injected policy that receives the scenario brief, current `chat.read` projection, active turn, and prior answered turns. 
+✓ `scripts/agent-probes/probe-runner.test.ts` — the existing scripted behavior is reimplemented as one policy and still handles free-text and option-bearing turns. +✓ `scripts/agent-probes/probe-runner.test.ts` — policy errors become structured probe errors and artifact summaries instead of uncaught exceptions. +✓ No `scripts/agent-probes/**` code imports DB, capability dispatch/registry, schema, core, route-transition, or turn-response authority modules directly. + +### Verification Approach + +- Inner: fake transport / policy oracle proves response-policy inputs, response payload construction, and structured policy failure handling. +- Middle: import-boundary test protects the external-runner authority boundary. + +### Promotion checklist + +- [ ] Does this change a requirement? +- [ ] Does this create, retire, or invalidate an assumption? +- [ ] Does this make or reverse a non-trivial design decision? +- [ ] Does this establish a new seam-level invariant? +- [ ] Does it cross more than two major seams? +- [ ] Is this the first touch in an unfamiliar seam from a fresh thread? +- [ ] Can you not name the containing seam or current rationale from the live docs? + +Result: stays light. This is a local extension point inside the external runner, not a new Brunch product API. + +## Card 14 — Model-backed LLM-as-user policy with prompt artifacts + +**Status:** queued + +### Target Behavior + +A model-backed user-simulator policy can answer one probe turn from the current `chat.read` projection by rendering a strict JSON-response prompt and parsing the model output into a `turn.submitResponse` payload. 
+ +### Boundary Crossings + +```text +→ scripts/agent-probes user-simulator policy +→ rendered simulated-user prompt/context +→ injected model adapter +→ strict JSON parse / response validation +→ turn.submitResponse payload +→ probe artifact event +``` + +### Risks and Assumptions + +- RISK: The simulated user accidentally acts like the interviewer or invents product state → MITIGATION: prompt frames the model as the user only, includes only scenario brief + current question/options + compact prior Q/A, and accepts only strict response JSON. +- RISK: Model output is malformed or semantically invalid for the current turn → MITIGATION: parse through the existing turn-response payload schema shape and record structured parse failures in artifacts. +- ASSUMPTION: A `chat.read` projection contains enough context for a minimal LLM-as-user to answer early grounding turns without `turn.get` → VALIDATE: fake adapter tests plus opt-in real-provider smoke over two turns → `memory/SPEC.md` §Assumptions A89. + +### Acceptance Criteria + +✓ `scripts/agent-probes/probe-runner.test.ts` or `scripts/agent-probes/llm-user.test.ts` — a fake model adapter receives a rendered prompt containing scenario brief, active question, options when present, and compact prior Q/A. +✓ Valid model JSON for free-text and option-selection turns becomes the correct `turn.submitResponse` payload. +✓ Invalid JSON or schema-invalid model output becomes a structured probe error, not a thrown crash. +✓ `artifact-bundle.json` includes simulated-user prompt, raw model output, parsed response, and parse/validation status events. + +### Verification Approach + +- Inner: fake model-adapter oracle proves prompt rendering, parsing, validation, and artifact event capture without provider credentials. +- Middle: opt-in real-provider smoke after Card 15 proves the adapter can drive the packaged CLI through real interviewer questions. 
+ +## Card 15 — Opt-in LLM-as-user packaged-boundary smoke + +**Status:** queued + +### Objective + +A manual/opt-in smoke command can run the model-backed user simulator against `node bin/brunch.js agent`, preserve fixture state, and report whether a two-turn fixture candidate was produced. + +### Acceptance Criteria + +✓ A documented invocation or tiny test helper runs `npm run build` then `runProcessBackedProbe()` with the default packaged command, model-backed user policy, explicit output directory, and fixture preservation enabled. +✓ The smoke prints the artifact directory, final frontier state, turns answered, and errors as JSON only. +✓ On success, the artifact directory contains review artifacts plus preserved workspace state suitable for later golden-fixture curation. +✓ On provider/model failure, the artifact directory contains redacted failure artifacts and no secret-bearing stack dumps. + +### Verification Approach + +- Inner: fake model / fake process test covers smoke helper command construction and JSON summary shape without provider credentials. +- Outer: manual real-provider smoke proves packaged CLI + Brunch interviewer + LLM-as-user + persisted fixture artifacts end to end. + +### Promotion checklist + +- [ ] Does this change a requirement? +- [ ] Does this create, retire, or invalidate an assumption? +- [ ] Does this make or reverse a non-trivial design decision? +- [ ] Does this establish a new seam-level invariant? +- [ ] Does it cross more than two major seams? +- [ ] Is this the first touch in an unfamiliar seam from a fresh thread? +- [ ] Can you not name the containing seam or current rationale from the live docs? + +Result: stays light if implemented as an opt-in/manual proof wrapper over the existing runner seam. Promote if it becomes a committed product CLI surface or changes fixture authority semantics. 
+ +## Card 16 — Fixture-candidate normalization checkpoint + +**Status:** queued + +### Objective + +A completed probe artifact directory can be evaluated as a fixture candidate using deterministic metadata checks before it is promoted into a golden fixture corpus. + +### Acceptance Criteria + +✓ A fixture-candidate helper inspects an artifact directory and reports presence/shape of `artifact-bundle.json`, `summary.json`, `raw-jsonl.ndjson`, `final-chat.json`, and preserved workspace state when expected. +✓ The helper reports non-deterministic fields that would need normalization for goldens, including timestamps, ids, durations, temp paths, and provider-dependent question wording. +✓ Tests cover a complete candidate, a missing workspace-state candidate, and an error-run candidate without requiring a provider. +✓ The helper does not bless or copy artifacts into a permanent corpus yet; it only reports readiness and normalization debt. + +### Verification Approach + +- Inner: filesystem fixture oracle over synthetic artifact directories. +- Middle: run against the manual smoke artifact directory to decide whether the next frontier is golden corpus curation or more normalization. + +### Promotion checklist + +- [ ] Does this change a requirement? +- [ ] Does this create, retire, or invalidate an assumption? +- [ ] Does this make or reverse a non-trivial design decision? +- [ ] Does this establish a new seam-level invariant? +- [ ] Does it cross more than two major seams? +- [ ] Is this the first touch in an unfamiliar seam from a fresh thread? +- [ ] Can you not name the containing seam or current rationale from the live docs? + +Result: stays light. This is a diagnostic checkpoint before creating any durable golden fixture corpus policy. 
diff --git a/package.json b/package.json index 6ef49603..27d0aed0 100644 --- a/package.json +++ b/package.json @@ -37,12 +37,12 @@ "check": "npm run fmt:check && npm run lint", "dev": "agent-tail run 'vite: vite' 'api: npx tsx --watch src/server/index.ts'", "fix": "npm run lint:fix && npm run fmt", - "fmt": "oxfmt src/ config/ .ladle/ vite.config.ts drizzle.config.ts", - "fmt:check": "oxfmt --check src/ config/ .ladle/ vite.config.ts drizzle.config.ts", + "fmt": "oxfmt src/ scripts/ config/ .ladle/ vite.config.ts drizzle.config.ts", + "fmt:check": "oxfmt --check src/ scripts/ config/ .ladle/ vite.config.ts drizzle.config.ts", "ladle": "ladle serve", "ladle:build": "ladle build", - "lint": "oxlint --type-aware --type-check src/ config/ .ladle/ vite.config.ts drizzle.config.ts", - "lint:fix": "oxlint --type-aware --type-check --fix src/ config/ .ladle/ vite.config.ts drizzle.config.ts", + "lint": "oxlint --type-aware --type-check src/ scripts/ config/ .ladle/ vite.config.ts drizzle.config.ts", + "lint:fix": "oxlint --type-aware --type-check --fix src/ scripts/ config/ .ladle/ vite.config.ts drizzle.config.ts", "release": "release-it", "seed": "npx tsx src/server/fixtures/seed.ts", "server": "npx tsx src/server/index.ts", diff --git a/src/server/probe-runner.test.ts b/scripts/agent-probes/probe-runner.test.ts similarity index 95% rename from src/server/probe-runner.test.ts rename to scripts/agent-probes/probe-runner.test.ts index 4f52a89a..25231d11 100644 --- a/src/server/probe-runner.test.ts +++ b/scripts/agent-probes/probe-runner.test.ts @@ -282,18 +282,25 @@ describe('probe runner', () => { it('guards the probe-runner import boundary from server mutation authority modules', () => { const source = readFileSync(new URL('./probe-runner.ts', import.meta.url), 'utf8'); const forbiddenImports = [ - './db.js', - './capabilities.js', - './capability-registry.js', - './schema.js', - './core.js', - './chat-route-transition.js', - './turn-response-transition.js', + 
'@/server/db', + '@/server/capabilities', + '@/server/capability-registry', + '@/server/schema', + '@/server/core', + '@/server/chat-route-transition', + '@/server/turn-response-transition', + '../../src/server/db', + '../../src/server/capabilities', + '../../src/server/capability-registry', + '../../src/server/schema', + '../../src/server/core', + '../../src/server/chat-route-transition', + '../../src/server/turn-response-transition', ]; for (const forbiddenImport of forbiddenImports) { - expect(source).not.toContain(`from '${forbiddenImport}'`); - expect(source).not.toContain(`from "${forbiddenImport}"`); + expect(source).not.toContain(`from '${forbiddenImport}`); + expect(source).not.toContain(`from "${forbiddenImport}`); } }); diff --git a/src/server/probe-runner.ts b/scripts/agent-probes/probe-runner.ts similarity index 100% rename from src/server/probe-runner.ts rename to scripts/agent-probes/probe-runner.ts diff --git a/tsconfig.json b/tsconfig.json index 24423841..942661c0 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -11,5 +11,5 @@ "@/*": ["./src/*"] } }, - "include": ["src", ".ladle", "node_modules/vite/client.d.ts"] + "include": ["src", "scripts", ".ladle", "node_modules/vite/client.d.ts"] } diff --git a/vite.config.ts b/vite.config.ts index 8779c82b..255fb1e8 100644 --- a/vite.config.ts +++ b/vite.config.ts @@ -44,7 +44,7 @@ export default defineConfig(({ command, mode }) => { dedupe: ['react', 'react-dom'], }, test: { - include: ['src/**/*.test.{js,ts,jsx,tsx}'], + include: ['src/**/*.test.{js,ts,jsx,tsx}', 'scripts/**/*.test.{js,ts,jsx,tsx}'], }, }; From d8477bfea5f56db0627e0fe51995c4d054959654 Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Tue, 12 May 2026 11:06:07 +0200 Subject: [PATCH 15/42] Preserve probe workspace state --- memory/CARDS.md | 4 +-- scripts/agent-probes/probe-runner.test.ts | 38 ++++++++++++++++++++++- scripts/agent-probes/probe-runner.ts | 35 ++++++++++++++++++++- 3 files changed, 73 insertions(+), 4 deletions(-) diff --git 
a/memory/CARDS.md b/memory/CARDS.md index 87045632..9efc95b6 100644 --- a/memory/CARDS.md +++ b/memory/CARDS.md @@ -72,7 +72,7 @@ Result: stays light. This aligns file placement with the existing FE-705 decisio ## Card 12 — Preserve probe workspace state for fixture candidates -**Status:** next +**Status:** done ### Objective @@ -104,7 +104,7 @@ Result: stays light. This preserves evidence inside the already-established exte ## Card 13 — User-simulator policy interface -**Status:** queued +**Status:** next ### Objective diff --git a/scripts/agent-probes/probe-runner.test.ts b/scripts/agent-probes/probe-runner.test.ts index 25231d11..7f4f1a35 100644 --- a/scripts/agent-probes/probe-runner.test.ts +++ b/scripts/agent-probes/probe-runner.test.ts @@ -1,4 +1,4 @@ -import { mkdtempSync, readFileSync, rmSync } from 'node:fs'; +import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; @@ -207,6 +207,7 @@ describe('probe runner', () => { }); expect(result.summary).toMatchObject({ turnsAnswered: 2, finalFrontierState: 'answered' }); + expect(result.workspaceCwd).toBe(spawnedCwds[0]); expect(spawnedCwds).toHaveLength(1); expect(spawnedCwds[0]).toContain('brunch-probe-workspace-'); expect(outputDir).not.toContain(`${spawnedCwds[0]}/.brunch`); @@ -232,6 +233,41 @@ describe('probe runner', () => { scenario: { name: 'process-proof', brief: null }, commandSequence: expect.arrayContaining(['spec.create', 'chat.getPrimary', 'chat.ensureReady']), environment: { platform: process.platform, arch: process.arch }, + workspace: { cwd: spawnedCwds[0], preservedStatePath: null }, + }); + expect(existsSync(join(outputDir, 'workspace-state'))).toBe(false); + }); + + it('can preserve the temp workspace .brunch state into the artifact directory', async () => { + const outputDir = makeTempDir('brunch-probe-output-'); + let liveWorkspaceDbPath: string | null = null; + + const result = 
await runProcessBackedProbe({ + scenario: { name: 'preserve-fixture', specName: 'Preserve fixture proof' }, + scriptedAnswers: ['A fixture candidate'], + outputDir, + preserveWorkspaceState: true, + spawnProcess({ cwd }) { + const brunchDir = join(cwd, '.brunch'); + mkdirSync(brunchDir); + liveWorkspaceDbPath = join(brunchDir, 'brunch.db'); + writeFileSync(liveWorkspaceDbPath, 'sqlite fixture bytes'); + return createFakeAgentProcess(); + }, + }); + + const preservedDbPath = join(outputDir, 'workspace-state', '.brunch', 'brunch.db'); + const bundle = JSON.parse(readFileSync(join(outputDir, 'artifact-bundle.json'), 'utf8')) as unknown; + + expect(result.workspaceCwd).not.toBeNull(); + expect(result.workspaceCwd).not.toContain(outputDir); + expect(result.preservedWorkspaceStatePath).toBe(join(outputDir, 'workspace-state')); + expect(preservedDbPath).not.toBe(liveWorkspaceDbPath); + expect(readFileSync(preservedDbPath, 'utf8')).toBe('sqlite fixture bytes'); + rmSync(result.workspaceCwd ?? '', { recursive: true, force: true }); + expect(readFileSync(preservedDbPath, 'utf8')).toBe('sqlite fixture bytes'); + expect(bundle).toMatchObject({ + workspace: { cwd: result.workspaceCwd, preservedStatePath: join(outputDir, 'workspace-state') }, }); }); diff --git a/scripts/agent-probes/probe-runner.ts b/scripts/agent-probes/probe-runner.ts index f314894a..c843c522 100644 --- a/scripts/agent-probes/probe-runner.ts +++ b/scripts/agent-probes/probe-runner.ts @@ -1,5 +1,5 @@ import { spawn } from 'node:child_process'; -import { mkdirSync, mkdtempSync, writeFileSync } from 'node:fs'; +import { cpSync, existsSync, mkdirSync, mkdtempSync, writeFileSync } from 'node:fs'; import { tmpdir } from 'node:os'; import { join, resolve } from 'node:path'; @@ -63,6 +63,7 @@ export interface ProbeRunSummary { export interface ProbeArtifactBundle { schemaVersion: 1; scenario: { name: string; brief: string | null; specName: string }; + workspace: { cwd: string | null; preservedStatePath: string | null 
}; commandSequence: string[]; rawJsonlTranscript: Array<{ direction: 'request' | 'response'; @@ -77,6 +78,8 @@ export interface ProbeArtifactBundle { export interface ProbeRunResult { scenario: ScriptedProbeScenario; + workspaceCwd: string | null; + preservedWorkspaceStatePath: string | null; requests: ProbeJsonlRequest[]; responses: ProbeJsonlResponse[]; finalChat: AgentChatReadProjection | null; @@ -129,6 +132,7 @@ export interface ProcessBackedProbeOptions { command?: string; args?: string[]; env?: NodeJS.ProcessEnv; + preserveWorkspaceState?: boolean; } export async function runProcessBackedProbe({ @@ -139,6 +143,7 @@ export async function runProcessBackedProbe({ command = process.execPath, args = [resolve('bin/brunch.js'), 'agent'], env = process.env, + preserveWorkspaceState = false, }: ProcessBackedProbeOptions): Promise { const workspaceCwd = mkdtempSync(join(tmpdir(), 'brunch-probe-workspace-')); const spawned = spawnProcess({ cwd: workspaceCwd, command, args, env }); @@ -146,6 +151,10 @@ export async function runProcessBackedProbe({ try { const result = await runScriptedProbe({ transport, scenario, scriptedAnswers }); + result.workspaceCwd = workspaceCwd; + if (preserveWorkspaceState) { + result.preservedWorkspaceStatePath = copyWorkspaceState({ workspaceCwd, outputDir }); + } writeProbeArtifacts(outputDir, result); return result; } finally { @@ -192,6 +201,8 @@ export async function runScriptedProbe({ const startedAt = Date.now(); const state: ProbeRunResult = { scenario, + workspaceCwd: null, + preservedWorkspaceStatePath: null, requests: [], responses: [], finalChat: null, @@ -356,6 +367,10 @@ export function buildProbeArtifactBundle(result: ProbeRunResult): ProbeArtifactB brief: result.scenario.brief ?? 
null, specName: result.scenario.specName, }, + workspace: { + cwd: result.workspaceCwd, + preservedStatePath: result.preservedWorkspaceStatePath, + }, commandSequence: result.requests.map((request) => request.capability), rawJsonlTranscript, parsedEvents: result.requests.map((request, index) => ({ @@ -381,6 +396,24 @@ function writeProbeArtifacts(outputDir: string, result: ProbeRunResult): void { writeFileSync(join(outputDir, 'summary.json'), `${JSON.stringify(bundle.summary, null, 2)}\n`); } +function copyWorkspaceState({ + workspaceCwd, + outputDir, +}: { + workspaceCwd: string; + outputDir: string; +}): string { + const source = join(workspaceCwd, '.brunch'); + const destination = join(outputDir, 'workspace-state'); + mkdirSync(destination, { recursive: true }); + + if (existsSync(source)) { + cpSync(source, join(destination, '.brunch'), { recursive: true }); + } + + return destination; +} + function spawnBrunchAgentProcess({ cwd, command, args, env }: ProbeProcessSpawnOptions): SpawnedJsonlProcess { const child = spawn(command, args, { cwd, env, stdio: ['pipe', 'pipe', 'pipe'] }); return { From 6ddc1b12b5d021c43dace55c617351658d9e8c7f Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Tue, 12 May 2026 11:09:10 +0200 Subject: [PATCH 16/42] Add probe response policy seam --- memory/CARDS.md | 2 +- scripts/agent-probes/probe-runner.test.ts | 62 ++++++++++++++++++++- scripts/agent-probes/probe-runner.ts | 67 ++++++++++++++++++++--- 3 files changed, 121 insertions(+), 10 deletions(-) diff --git a/memory/CARDS.md b/memory/CARDS.md index 9efc95b6..92f4d305 100644 --- a/memory/CARDS.md +++ b/memory/CARDS.md @@ -104,7 +104,7 @@ Result: stays light. 
This preserves evidence inside the already-established exte ## Card 13 — User-simulator policy interface -**Status:** next +**Status:** done ### Objective diff --git a/scripts/agent-probes/probe-runner.test.ts b/scripts/agent-probes/probe-runner.test.ts index 7f4f1a35..1607afef 100644 --- a/scripts/agent-probes/probe-runner.test.ts +++ b/scripts/agent-probes/probe-runner.test.ts @@ -10,6 +10,7 @@ import { runScriptedProbe, type JsonlTransport, type ProbeJsonlRequest, + type ProbeJsonlResponse, type SpawnedJsonlProcess, } from './probe-runner.js'; @@ -162,6 +163,57 @@ describe('probe runner', () => { expect(result.errors).toEqual([]); }); + it('can answer turns through an injected response policy', async () => { + const policyInputs: Array<{ activeTurnId: number; priorAnswerCount: number; brief: string | undefined }> = + []; + const transport = createScriptedSuccessTransport(); + + const result = await runScriptedProbe({ + transport, + scenario: { name: 'policy-proof', specName: 'Policy proof', brief: 'answer like a user' }, + scriptedAnswers: [], + responsePolicy(input) { + policyInputs.push({ + activeTurnId: input.activeTurn.id, + priorAnswerCount: input.priorAnsweredTurns.length, + brief: input.scenario.brief, + }); + if (input.activeTurn.options?.[0]) { + return { kind: 'select-options', positions: [input.activeTurn.options[0].position] }; + } + return { kind: 'free-text', freeText: `Policy response to ${input.activeTurn.question}` }; + }, + }); + + expect(policyInputs).toEqual([ + { activeTurnId: 100, priorAnswerCount: 0, brief: 'answer like a user' }, + { activeTurnId: 101, priorAnswerCount: 1, brief: 'answer like a user' }, + ]); + expect(result.summary).toMatchObject({ turnsAnswered: 2, finalFrontierState: 'answered' }); + expect(result.errors).toEqual([]); + }); + + it('returns structured probe errors when the response policy fails', async () => { + const result = await runScriptedProbe({ + transport: createScriptedSuccessTransport(), + scenario: { name: 
'policy-failure', specName: 'Policy failure proof' }, + scriptedAnswers: [], + responsePolicy() { + throw new Error('Simulated user could not answer\nwith stack details'); + }, + }); + + expect(result.summary.turnsAnswered).toBe(0); + expect(result.errors).toEqual([ + { + requestId: 'policy-1', + capability: 'probe.responsePolicy', + code: 'policy_failed', + message: 'Simulated user could not answer', + }, + ]); + }); + it('uses a process JSONL transport to write requests and parse responses', async () => { const written: string[] = []; let onStdoutData: ((chunk: string) => void) | null = null; @@ -372,6 +424,14 @@ describe('probe runner', () => { }); }); +function createScriptedSuccessTransport(): JsonlTransport { + return { + async send(request) { + return getFakeAgentResponse(request); + }, + }; +} + function createFakeAgentProcess(): SpawnedJsonlProcess { let onStdoutData: ((chunk: string) => void) | null = null; @@ -388,7 +448,7 @@ function createFakeAgentProcess(): SpawnedJsonlProcess { }; } -function getFakeAgentResponse(request: ProbeJsonlRequest) { +function getFakeAgentResponse(request: ProbeJsonlRequest): ProbeJsonlResponse { if (request.capability === 'spec.create') { return { id: request.id, ok: true, output: { specId: 1 } }; } diff --git a/scripts/agent-probes/probe-runner.ts b/scripts/agent-probes/probe-runner.ts index c843c522..830dd8c6 100644 --- a/scripts/agent-probes/probe-runner.ts +++ b/scripts/agent-probes/probe-runner.ts @@ -95,33 +95,50 @@ interface ChatGetPrimaryOutput { chatId: number; } -interface AgentChatReadProjection { +export interface AgentChatReadProjection { frontier: { state: string; turnId: number | null }; turns: AgentChatTurn[]; nextCommands?: AgentNextCommand[]; } -interface AgentChatTurn { +export interface AgentChatTurn { id: number; question: string; answer: string | null; options?: AgentTurnOption[]; } -interface AgentTurnOption { +export interface AgentTurnOption { position: number; content: string; } -interface 
AgentNextCommand { +export interface AgentNextCommand { capability: string; input?: unknown; } +export type ProbeTurnResponse = + | { kind: 'free-text'; freeText: string } + | { kind: 'select-options'; positions: number[] }; + +export interface ProbeResponsePolicyInput { + scenario: ScriptedProbeScenario; + chat: AgentChatReadProjection; + activeTurn: AgentChatTurn; + priorAnsweredTurns: AgentChatTurn[]; + turnIndex: number; +} + +export type ProbeResponsePolicy = ( + input: ProbeResponsePolicyInput, +) => ProbeTurnResponse | Promise; + interface RunScriptedProbeOptions { transport: JsonlTransport; scenario: ScriptedProbeScenario; scriptedAnswers: string[]; + responsePolicy?: ProbeResponsePolicy; } export interface ProcessBackedProbeOptions { @@ -197,6 +214,7 @@ export async function runScriptedProbe({ transport, scenario, scriptedAnswers, + responsePolicy = createScriptedResponsePolicy(scriptedAnswers), }: RunScriptedProbeOptions): Promise { const startedAt = Date.now(); const state: ProbeRunResult = { @@ -260,13 +278,24 @@ export async function runScriptedProbe({ return finishRun(state, startedAt); } + const policyResponse = await getPolicyResponse(state, responsePolicy, { + scenario, + chat: readyRead, + activeTurn, + priorAnsweredTurns: readyRead.turns.filter((turn) => turn.answer !== null), + turnIndex, + }); + if (!policyResponse) { + return finishRun(state, startedAt); + } + const submit = await sendExpectingOutput(state, transport, { id: `answer-${turnIndex + 1}`, capability: 'turn.submitResponse', input: { chatId: primary.chatId, turnId: activeTurn.id, - response: buildScriptedResponse(activeTurn, scriptedAnswers[turnIndex]), + response: policyResponse, }, }); if (!submit) { @@ -318,18 +347,40 @@ function getActiveTurn(read: AgentChatReadProjection): AgentChatTurn | null { return read.turns.find((turn) => turn.id === read.frontier.turnId) ?? 
null; } -function buildScriptedResponse(turn: AgentChatTurn, scriptedAnswer: string | undefined) { +function createScriptedResponsePolicy(scriptedAnswers: string[]): ProbeResponsePolicy { + return ({ activeTurn, turnIndex }) => buildScriptedResponse(activeTurn, scriptedAnswers[turnIndex]); +} + +function buildScriptedResponse(turn: AgentChatTurn, scriptedAnswer: string | undefined): ProbeTurnResponse { const firstOption = turn.options?.[0]; if (firstOption) { - return { kind: 'select-options' as const, positions: [firstOption.position] }; + return { kind: 'select-options', positions: [firstOption.position] }; } return { - kind: 'free-text' as const, + kind: 'free-text', freeText: scriptedAnswer?.trim() || `Scripted response to: ${turn.question}`, }; } +async function getPolicyResponse( + state: ProbeRunResult, + responsePolicy: ProbeResponsePolicy, + input: ProbeResponsePolicyInput, +): Promise { + try { + return await responsePolicy(input); + } catch (error) { + state.errors.push({ + requestId: `policy-${input.turnIndex + 1}`, + capability: 'probe.responsePolicy', + code: 'policy_failed', + message: sanitizeProbeErrorMessage(error instanceof Error ? 
error.message : String(error)), + }); + return null; + } +} + function finishRun(state: ProbeRunResult, startedAt: number): ProbeRunResult { state.summary.durationMs = Date.now() - startedAt; state.summary.errors = state.errors; From befc34aa221da40efa9f6259b7846229f44d3d87 Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Tue, 12 May 2026 11:14:18 +0200 Subject: [PATCH 17/42] Add model-backed probe user policy --- memory/CARDS.md | 4 +- scripts/agent-probes/llm-user.test.ts | 202 ++++++++++++++++++++++ scripts/agent-probes/llm-user.ts | 124 +++++++++++++ scripts/agent-probes/probe-runner.test.ts | 24 ++- scripts/agent-probes/probe-runner.ts | 15 ++ 5 files changed, 361 insertions(+), 8 deletions(-) create mode 100644 scripts/agent-probes/llm-user.test.ts create mode 100644 scripts/agent-probes/llm-user.ts diff --git a/memory/CARDS.md b/memory/CARDS.md index 92f4d305..6130800f 100644 --- a/memory/CARDS.md +++ b/memory/CARDS.md @@ -136,7 +136,7 @@ Result: stays light. This is a local extension point inside the external runner, ## Card 14 — Model-backed LLM-as-user policy with prompt artifacts -**Status:** queued +**Status:** done ### Target Behavior @@ -173,7 +173,7 @@ A model-backed user-simulator policy can answer one probe turn from the current ## Card 15 — Opt-in LLM-as-user packaged-boundary smoke -**Status:** queued +**Status:** next ### Objective diff --git a/scripts/agent-probes/llm-user.test.ts b/scripts/agent-probes/llm-user.test.ts new file mode 100644 index 00000000..b5d12426 --- /dev/null +++ b/scripts/agent-probes/llm-user.test.ts @@ -0,0 +1,202 @@ +import { describe, expect, it } from 'vitest'; + +import { createModelBackedUserPolicy, type SimulatedUserModelAdapter } from './llm-user.js'; +import { buildProbeArtifactBundle, runScriptedProbe, type JsonlTransport } from './probe-runner.js'; +import type { ProbeJsonlRequest, ProbeJsonlResponse, SimulatedUserEvent } from './probe-runner.js'; + +describe('model-backed simulated user policy', () => { + 
it('renders a strict JSON prompt with scenario, active question, options, and prior Q/A', async () => { + const prompts: string[] = []; + const events: SimulatedUserEvent[] = []; + const model: SimulatedUserModelAdapter = { + async generateText(prompt) { + prompts.push(prompt); + return JSON.stringify({ kind: 'free-text', freeText: 'I want a spec assistant.' }); + }, + }; + + const result = await runScriptedProbe({ + transport: createOneTurnTransport(), + scenario: { name: 'llm-user', specName: 'LLM user proof', brief: 'A tired founder wants help.' }, + scriptedAnswers: [], + responsePolicy: createModelBackedUserPolicy({ model, events }), + simulatedUserEvents: events, + }); + + expect(prompts[0]).toContain('You are simulating the user, not the interviewer.'); + expect(prompts[0]).toContain('A tired founder wants help.'); + expect(prompts[0]).toContain('What are you building?'); + expect(prompts[0]).toContain('Earlier answered turns'); + expect(prompts[1]).toContain('0. Acceptance criteria'); + expect(prompts[1]).toContain('Q: What are you building?'); + expect(prompts[1]).toContain('A: I want a spec assistant.'); + expect(result.summary.turnsAnswered).toBe(2); + }); + + it('parses valid model JSON into free-text and option-selection response payloads', async () => { + const events: SimulatedUserEvent[] = []; + const outputs = [ + JSON.stringify({ kind: 'free-text', freeText: 'A graph-first spec tool' }), + JSON.stringify({ kind: 'select-options', positions: [0] }), + ]; + const model: SimulatedUserModelAdapter = { + async generateText() { + return outputs.shift() ?? 
'{}'; + }, + }; + const requests: ProbeJsonlRequest[] = []; + + const result = await runScriptedProbe({ + transport: createOneTurnTransport(requests), + scenario: { name: 'parse', specName: 'Parse proof' }, + scriptedAnswers: [], + responsePolicy: createModelBackedUserPolicy({ model, events }), + simulatedUserEvents: events, + }); + + expect(requests[4]).toMatchObject({ + capability: 'turn.submitResponse', + input: { response: { kind: 'free-text', freeText: 'A graph-first spec tool' } }, + }); + expect(requests[8]).toMatchObject({ + capability: 'turn.submitResponse', + input: { response: { kind: 'select-options', positions: [0] } }, + }); + expect(result.errors).toEqual([]); + }); + + it('records simulated-user prompt artifacts and parse status in the artifact bundle', async () => { + const events: SimulatedUserEvent[] = []; + const model: SimulatedUserModelAdapter = { + async generateText() { + return JSON.stringify({ kind: 'free-text', freeText: 'Preserve prompt artifacts' }); + }, + }; + + const result = await runScriptedProbe({ + transport: createOneTurnTransport(), + scenario: { name: 'artifact', specName: 'Artifact proof' }, + scriptedAnswers: [], + responsePolicy: createModelBackedUserPolicy({ model, events }), + simulatedUserEvents: events, + }); + + const bundle = buildProbeArtifactBundle(result); + expect(bundle.simulatedUserEvents[0]).toMatchObject({ + turnId: 100, + status: 'parsed', + parsedResponse: { kind: 'free-text', freeText: 'Preserve prompt artifacts' }, + }); + expect(bundle.simulatedUserEvents[0]?.prompt).toContain('Return exactly one JSON object'); + expect(bundle.simulatedUserEvents[0]?.rawModelOutput).toContain('Preserve prompt artifacts'); + }); + + it('turns invalid model output into a structured probe error', async () => { + const events: SimulatedUserEvent[] = []; + const model: SimulatedUserModelAdapter = { + async generateText() { + return 'not json'; + }, + }; + + const result = await runScriptedProbe({ + transport: 
createOneTurnTransport(), + scenario: { name: 'bad-json', specName: 'Bad JSON proof' }, + scriptedAnswers: [], + responsePolicy: createModelBackedUserPolicy({ model, events }), + simulatedUserEvents: events, + }); + + expect(result.summary.turnsAnswered).toBe(0); + expect(result.errors).toEqual([ + { + requestId: 'policy-1', + capability: 'probe.responsePolicy', + code: 'policy_failed', + message: 'Simulated user returned invalid JSON', + }, + ]); + expect(result.simulatedUserEvents[0]).toMatchObject({ status: 'failed', rawModelOutput: 'not json' }); + }); +}); + +function createOneTurnTransport(requests: ProbeJsonlRequest[] = []): JsonlTransport { + return { + async send(request) { + requests.push(request); + return getFakeAgentResponse(request); + }, + }; +} + +function getFakeAgentResponse(request: ProbeJsonlRequest): ProbeJsonlResponse { + if (request.capability === 'spec.create') { + return { id: request.id, ok: true, output: { specId: 1 } }; + } + if (request.capability === 'chat.getPrimary') { + return { id: request.id, ok: true, output: { chatId: 10 } }; + } + if (request.capability === 'chat.ensureReady') { + const turnId = request.id === 'ready-1' ? 
100 : 101; + return { id: request.id, ok: true, output: { chatId: 10, state: 'awaiting_response', turnId } }; + } + if (request.id === 'read-1') { + return { + id: request.id, + ok: true, + output: { + frontier: { state: 'awaiting_response', turnId: 100 }, + turns: [{ id: 100, question: 'What are you building?', answer: null, options: [] }], + }, + }; + } + if (request.id === 'read-2') { + return { + id: request.id, + ok: true, + output: { + frontier: { state: 'answered', turnId: 100 }, + turns: [ + { id: 100, question: 'What are you building?', answer: 'I want a spec assistant.', options: [] }, + ], + }, + }; + } + if (request.id === 'read-3') { + return { + id: request.id, + ok: true, + output: { + frontier: { state: 'awaiting_response', turnId: 101 }, + turns: [ + { id: 100, question: 'What are you building?', answer: 'I want a spec assistant.', options: [] }, + { + id: 101, + question: 'What should be specified first?', + answer: null, + options: [{ position: 0, content: 'Acceptance criteria' }], + }, + ], + }, + }; + } + if (request.id === 'read-4') { + return { + id: request.id, + ok: true, + output: { + frontier: { state: 'answered', turnId: 101 }, + turns: [ + { id: 100, question: 'What are you building?', answer: 'I want a spec assistant.', options: [] }, + { + id: 101, + question: 'What should be specified first?', + answer: 'Acceptance criteria', + options: [], + }, + ], + }, + }; + } + return { id: request.id, ok: true, output: { ok: true } }; +} diff --git a/scripts/agent-probes/llm-user.ts b/scripts/agent-probes/llm-user.ts new file mode 100644 index 00000000..d4bf4955 --- /dev/null +++ b/scripts/agent-probes/llm-user.ts @@ -0,0 +1,124 @@ +import type { + ProbeResponsePolicy, + ProbeResponsePolicyInput, + ProbeTurnResponse, + SimulatedUserEvent, +} from './probe-runner.js'; + +export interface SimulatedUserModelAdapter { + generateText(prompt: string): Promise; +} + +export function createModelBackedUserPolicy({ + model, + events, +}: { + model: 
SimulatedUserModelAdapter; + events: SimulatedUserEvent[]; +}): ProbeResponsePolicy { + return async (input) => { + const prompt = renderSimulatedUserPrompt(input); + const rawModelOutput = await model.generateText(prompt); + + try { + const parsedResponse = parseSimulatedUserResponse(rawModelOutput, input); + events.push({ + turnId: input.activeTurn.id, + prompt, + rawModelOutput, + parsedResponse, + status: 'parsed', + error: null, + }); + return parsedResponse; + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + events.push({ + turnId: input.activeTurn.id, + prompt, + rawModelOutput, + parsedResponse: null, + status: 'failed', + error: message, + }); + throw error; + } + }; +} + +function renderSimulatedUserPrompt(input: ProbeResponsePolicyInput): string { + const options = input.activeTurn.options?.length + ? input.activeTurn.options.map((option) => `${option.position}. ${option.content}`).join('\n') + : 'No options are available; answer with free text.'; + const priorTurns = input.priorAnsweredTurns.length + ? input.priorAnsweredTurns.map((turn) => `Q: ${turn.question}\nA: ${turn.answer ?? ''}`).join('\n\n') + : 'None yet.'; + + return [ + 'You are simulating the user, not the interviewer.', + 'Answer only as the user described by the scenario. Do not invent product state outside the prompt.', + 'Return exactly one JSON object and no Markdown.', + '', + 'Allowed response JSON:', + '- Free text: {"kind":"free-text","freeText":"your answer"}', + '- Option selection: {"kind":"select-options","positions":[0]}', + '', + `Scenario brief: ${input.scenario.brief ?? 
'No scenario brief provided.'}`, + `Specification name: ${input.scenario.specName}`, + '', + 'Earlier answered turns:', + priorTurns, + '', + 'Active question:', + input.activeTurn.question, + '', + 'Options:', + options, + ].join('\n'); +} + +function parseSimulatedUserResponse( + rawModelOutput: string, + input: ProbeResponsePolicyInput, +): ProbeTurnResponse { + let parsed: unknown; + try { + parsed = JSON.parse(rawModelOutput); + } catch { + throw new Error('Simulated user returned invalid JSON'); + } + + if (!isRecord(parsed) || typeof parsed.kind !== 'string') { + throw new Error('Simulated user response did not match an allowed response shape'); + } + + if (parsed.kind === 'free-text') { + if (typeof parsed.freeText !== 'string' || parsed.freeText.trim() === '') { + throw new Error('Simulated user free-text response was empty or invalid'); + } + return { kind: 'free-text', freeText: parsed.freeText }; + } + + if (parsed.kind === 'select-options') { + if ( + !Array.isArray(parsed.positions) || + parsed.positions.some((position) => typeof position !== 'number') + ) { + throw new Error('Simulated user option response had invalid positions'); + } + const allowedPositions = new Set(input.activeTurn.options?.map((option) => option.position) ?? 
[]); + if ( + parsed.positions.length === 0 || + parsed.positions.some((position) => !allowedPositions.has(position)) + ) { + throw new Error('Simulated user option response selected unavailable positions'); + } + return { kind: 'select-options', positions: parsed.positions }; + } + + throw new Error('Simulated user response did not match an allowed response shape'); +} + +function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} diff --git a/scripts/agent-probes/probe-runner.test.ts b/scripts/agent-probes/probe-runner.test.ts index 1607afef..60e8f64c 100644 --- a/scripts/agent-probes/probe-runner.test.ts +++ b/scripts/agent-probes/probe-runner.test.ts @@ -1,4 +1,12 @@ -import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; +import { + existsSync, + mkdirSync, + mkdtempSync, + readFileSync, + readdirSync, + rmSync, + writeFileSync, +} from 'node:fs'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; @@ -367,8 +375,10 @@ describe('probe runner', () => { }); }); - it('guards the probe-runner import boundary from server mutation authority modules', () => { - const source = readFileSync(new URL('./probe-runner.ts', import.meta.url), 'utf8'); + it('guards the agent-probes import boundary from server mutation authority modules', () => { + const sources = readdirSync(new URL('.', import.meta.url)) + .filter((fileName) => fileName.endsWith('.ts') && !fileName.endsWith('.test.ts')) + .map((fileName) => readFileSync(new URL(`./${fileName}`, import.meta.url), 'utf8')); const forbiddenImports = [ '@/server/db', '@/server/capabilities', @@ -386,9 +396,11 @@ describe('probe runner', () => { '../../src/server/turn-response-transition', ]; - for (const forbiddenImport of forbiddenImports) { - expect(source).not.toContain(`from '${forbiddenImport}`); - expect(source).not.toContain(`from "${forbiddenImport}`); + for (const source of sources) { + 
for (const forbiddenImport of forbiddenImports) { + expect(source).not.toContain(`from '${forbiddenImport}`); + expect(source).not.toContain(`from "${forbiddenImport}`); + } } }); diff --git a/scripts/agent-probes/probe-runner.ts b/scripts/agent-probes/probe-runner.ts index 830dd8c6..e258e6dc 100644 --- a/scripts/agent-probes/probe-runner.ts +++ b/scripts/agent-probes/probe-runner.ts @@ -60,6 +60,15 @@ export interface ProbeRunSummary { errors: ProbeRunError[]; } +export interface SimulatedUserEvent { + turnId: number; + prompt: string; + rawModelOutput: string; + parsedResponse: ProbeTurnResponse | null; + status: 'parsed' | 'failed'; + error: string | null; +} + export interface ProbeArtifactBundle { schemaVersion: 1; scenario: { name: string; brief: string | null; specName: string }; @@ -73,6 +82,7 @@ export interface ProbeArtifactBundle { finalChat: AgentChatReadProjection | null; summary: ProbeRunSummary; errors: ProbeRunError[]; + simulatedUserEvents: SimulatedUserEvent[]; environment: { nodeVersion: string; platform: NodeJS.Platform; arch: string }; } @@ -85,6 +95,7 @@ export interface ProbeRunResult { finalChat: AgentChatReadProjection | null; summary: ProbeRunSummary; errors: ProbeRunError[]; + simulatedUserEvents: SimulatedUserEvent[]; } interface SpecCreateOutput { @@ -139,6 +150,7 @@ interface RunScriptedProbeOptions { scenario: ScriptedProbeScenario; scriptedAnswers: string[]; responsePolicy?: ProbeResponsePolicy; + simulatedUserEvents?: SimulatedUserEvent[]; } export interface ProcessBackedProbeOptions { @@ -215,6 +227,7 @@ export async function runScriptedProbe({ scenario, scriptedAnswers, responsePolicy = createScriptedResponsePolicy(scriptedAnswers), + simulatedUserEvents = [], }: RunScriptedProbeOptions): Promise { const startedAt = Date.now(); const state: ProbeRunResult = { @@ -226,6 +239,7 @@ export async function runScriptedProbe({ finalChat: null, summary: { turnsAnswered: 0, finalFrontierState: null, durationMs: 0, questionAnswers: [], 
errors: [] }, errors: [], + simulatedUserEvents, }; const created = await sendExpectingOutput(state, transport, { @@ -432,6 +446,7 @@ export function buildProbeArtifactBundle(result: ProbeRunResult): ProbeArtifactB finalChat: result.finalChat, summary: result.summary, errors: result.errors, + simulatedUserEvents: result.simulatedUserEvents, environment: { nodeVersion: process.version, platform: process.platform, arch: process.arch }, }; } From 6bedcca40bd2e2133ae6475aa0378ba0f6d38e04 Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Tue, 12 May 2026 11:18:33 +0200 Subject: [PATCH 18/42] Add packaged LLM user smoke helper --- memory/CARDS.md | 4 +- scripts/agent-probes/packaged-smoke.test.ts | 189 ++++++++++++++++++++ scripts/agent-probes/packaged-smoke.ts | 74 ++++++++ scripts/agent-probes/probe-runner.ts | 12 +- 4 files changed, 276 insertions(+), 3 deletions(-) create mode 100644 scripts/agent-probes/packaged-smoke.test.ts create mode 100644 scripts/agent-probes/packaged-smoke.ts diff --git a/memory/CARDS.md b/memory/CARDS.md index 6130800f..664c37c6 100644 --- a/memory/CARDS.md +++ b/memory/CARDS.md @@ -173,7 +173,7 @@ A model-backed user-simulator policy can answer one probe turn from the current ## Card 15 — Opt-in LLM-as-user packaged-boundary smoke -**Status:** next +**Status:** done ### Objective @@ -205,7 +205,7 @@ Result: stays light if implemented as an opt-in/manual proof wrapper over the ex ## Card 16 — Fixture-candidate normalization checkpoint -**Status:** queued +**Status:** next ### Objective diff --git a/scripts/agent-probes/packaged-smoke.test.ts b/scripts/agent-probes/packaged-smoke.test.ts new file mode 100644 index 00000000..f22f1bbe --- /dev/null +++ b/scripts/agent-probes/packaged-smoke.test.ts @@ -0,0 +1,189 @@ +import { mkdtempSync, readFileSync, rmSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join, resolve } from 'node:path'; + +import { afterEach, describe, expect, it } from 'vitest'; + +import { formatSmokeSummary, 
runPackagedLlmUserSmoke } from './packaged-smoke.js'; +import type { ProbeJsonlRequest, ProbeJsonlResponse, SpawnedJsonlProcess } from './probe-runner.js'; + +describe('packaged LLM-as-user smoke helper', () => { + const tempDirs: string[] = []; + + afterEach(() => { + for (const dir of tempDirs.splice(0)) { + rmSync(dir, { recursive: true, force: true }); + } + }); + + function makeTempDir(prefix: string): string { + const dir = mkdtempSync(join(tmpdir(), prefix)); + tempDirs.push(dir); + return dir; + } + + it('builds first, drives the packaged agent command, preserves fixture state, and returns JSON-only summary data', async () => { + const outputDir = makeTempDir('brunch-smoke-output-'); + const buildCommands: Array<{ command: string; args: string[] }> = []; + const spawnedCommands: Array<{ command: string; args: string[]; cwd: string }> = []; + + const summary = await runPackagedLlmUserSmoke({ + outputDir, + model: { + async generateText(prompt) { + if (prompt.includes('Options:') && prompt.includes('0. 
Acceptance criteria')) { + return JSON.stringify({ kind: 'select-options', positions: [0] }); + } + return JSON.stringify({ kind: 'free-text', freeText: 'A smoke-test spec assistant' }); + }, + }, + async runBuildCommand(command, args) { + buildCommands.push({ command, args }); + }, + spawnProcess(options) { + spawnedCommands.push({ command: options.command, args: options.args, cwd: options.cwd }); + return createFakeAgentProcess(); + }, + }); + + expect(buildCommands).toEqual([{ command: 'npm', args: ['run', 'build'] }]); + expect(spawnedCommands).toEqual([ + { + command: process.execPath, + args: [resolve('bin/brunch.js'), 'agent'], + cwd: expect.stringContaining('brunch-probe-workspace-'), + }, + ]); + expect(summary).toEqual({ + outputDir, + turnsAnswered: 2, + finalFrontierState: 'answered', + errors: [], + }); + expect(JSON.parse(formatSmokeSummary(summary))).toEqual(summary); + expect(readFileSync(join(outputDir, 'artifact-bundle.json'), 'utf8')).toContain('simulatedUserEvents'); + expect(readFileSync(join(outputDir, 'summary.json'), 'utf8')).toContain('"turnsAnswered": 2'); + }); + + it('writes redacted failure artifacts and returns JSON-safe errors when the model fails', async () => { + const outputDir = makeTempDir('brunch-smoke-failure-'); + + const summary = await runPackagedLlmUserSmoke({ + outputDir, + model: { + async generateText() { + throw new Error('Provider failed with ANTHROPIC_API_KEY=sk-ant-secret-value\nstack'); + }, + }, + async runBuildCommand() {}, + spawnProcess() { + return createFakeAgentProcess(); + }, + }); + + expect(summary).toEqual({ + outputDir, + turnsAnswered: 0, + finalFrontierState: 'awaiting_response', + errors: [ + { + requestId: 'policy-1', + capability: 'probe.responsePolicy', + code: 'policy_failed', + message: 'Provider failed with ANTHROPIC_API_KEY=[redacted]', + }, + ], + }); + expect(readFileSync(join(outputDir, 'artifact-bundle.json'), 'utf8')).not.toContain( + 'sk-ant-secret-value', + ); + }); +}); + +function 
createFakeAgentProcess(): SpawnedJsonlProcess { + let onStdoutData: ((chunk: string) => void) | null = null; + + return { + writeStdin(line) { + const request = JSON.parse(line) as ProbeJsonlRequest; + const response = getFakeAgentResponse(request); + onStdoutData?.(`${JSON.stringify(response)}\n`); + }, + endStdin() {}, + onStdoutData(listener) { + onStdoutData = listener; + }, + }; +} + +function getFakeAgentResponse(request: ProbeJsonlRequest): ProbeJsonlResponse { + if (request.capability === 'spec.create') { + return { id: request.id, ok: true, output: { specId: 1 } }; + } + if (request.capability === 'chat.getPrimary') { + return { id: request.id, ok: true, output: { chatId: 10 } }; + } + if (request.capability === 'chat.ensureReady') { + const turnId = request.id === 'ready-1' ? 100 : 101; + return { id: request.id, ok: true, output: { chatId: 10, state: 'awaiting_response', turnId } }; + } + if (request.id === 'read-1') { + return { + id: request.id, + ok: true, + output: { + frontier: { state: 'awaiting_response', turnId: 100 }, + turns: [{ id: 100, question: 'What are you building?', answer: null, options: [] }], + }, + }; + } + if (request.id === 'read-2') { + return { + id: request.id, + ok: true, + output: { + frontier: { state: 'answered', turnId: 100 }, + turns: [ + { id: 100, question: 'What are you building?', answer: 'A smoke-test spec assistant', options: [] }, + ], + }, + }; + } + if (request.id === 'read-3') { + return { + id: request.id, + ok: true, + output: { + frontier: { state: 'awaiting_response', turnId: 101 }, + turns: [ + { id: 100, question: 'What are you building?', answer: 'A smoke-test spec assistant', options: [] }, + { + id: 101, + question: 'What should be specified first?', + answer: null, + options: [{ position: 0, content: 'Acceptance criteria' }], + }, + ], + }, + }; + } + if (request.id === 'read-4') { + return { + id: request.id, + ok: true, + output: { + frontier: { state: 'answered', turnId: 101 }, + turns: [ + { id: 
100, question: 'What are you building?', answer: 'A smoke-test spec assistant', options: [] }, + { + id: 101, + question: 'What should be specified first?', + answer: 'Acceptance criteria', + options: [], + }, + ], + }, + }; + } + return { id: request.id, ok: true, output: { ok: true } }; +} diff --git a/scripts/agent-probes/packaged-smoke.ts b/scripts/agent-probes/packaged-smoke.ts new file mode 100644 index 00000000..5072f1eb --- /dev/null +++ b/scripts/agent-probes/packaged-smoke.ts @@ -0,0 +1,74 @@ +import { spawn } from 'node:child_process'; + +import { createModelBackedUserPolicy, type SimulatedUserModelAdapter } from './llm-user.js'; +import { + runProcessBackedProbe, + type ProbeProcessSpawner, + type ProbeRunError, + type SimulatedUserEvent, +} from './probe-runner.js'; + +export interface PackagedLlmUserSmokeSummary { + outputDir: string; + turnsAnswered: number; + finalFrontierState: string | null; + errors: ProbeRunError[]; +} + +export type SmokeBuildCommandRunner = (command: string, args: string[]) => Promise; + +export interface PackagedLlmUserSmokeOptions { + outputDir: string; + model: SimulatedUserModelAdapter; + runBuildCommand?: SmokeBuildCommandRunner; + spawnProcess?: ProbeProcessSpawner; +} + +export async function runPackagedLlmUserSmoke({ + outputDir, + model, + runBuildCommand = runCommand, + spawnProcess, +}: PackagedLlmUserSmokeOptions): Promise { + await runBuildCommand('npm', ['run', 'build']); + + const simulatedUserEvents: SimulatedUserEvent[] = []; + const result = await runProcessBackedProbe({ + scenario: { + name: 'packaged-llm-user-smoke', + specName: 'LLM user smoke fixture candidate', + brief: 'Answer as a concise user who wants Brunch to help clarify a software specification.', + }, + scriptedAnswers: [], + outputDir, + preserveWorkspaceState: true, + responsePolicy: createModelBackedUserPolicy({ model, events: simulatedUserEvents }), + simulatedUserEvents, + spawnProcess, + }); + + return { + outputDir, + turnsAnswered: 
result.summary.turnsAnswered, + finalFrontierState: result.summary.finalFrontierState, + errors: result.errors, + }; +} + +export function formatSmokeSummary(summary: PackagedLlmUserSmokeSummary): string { + return `${JSON.stringify(summary)}\n`; +} + +function runCommand(command: string, args: string[]): Promise { + return new Promise((resolveCommand, rejectCommand) => { + const child = spawn(command, args, { stdio: 'inherit' }); + child.on('error', rejectCommand); + child.on('exit', (code) => { + if (code === 0) { + resolveCommand(); + return; + } + rejectCommand(new Error(`${command} ${args.join(' ')} exited with code ${code ?? 'unknown'}`)); + }); + }); +} diff --git a/scripts/agent-probes/probe-runner.ts b/scripts/agent-probes/probe-runner.ts index e258e6dc..d34454a4 100644 --- a/scripts/agent-probes/probe-runner.ts +++ b/scripts/agent-probes/probe-runner.ts @@ -162,6 +162,8 @@ export interface ProcessBackedProbeOptions { args?: string[]; env?: NodeJS.ProcessEnv; preserveWorkspaceState?: boolean; + responsePolicy?: ProbeResponsePolicy; + simulatedUserEvents?: SimulatedUserEvent[]; } export async function runProcessBackedProbe({ @@ -173,13 +175,21 @@ export async function runProcessBackedProbe({ args = [resolve('bin/brunch.js'), 'agent'], env = process.env, preserveWorkspaceState = false, + responsePolicy, + simulatedUserEvents, }: ProcessBackedProbeOptions): Promise { const workspaceCwd = mkdtempSync(join(tmpdir(), 'brunch-probe-workspace-')); const spawned = spawnProcess({ cwd: workspaceCwd, command, args, env }); const transport = createProcessJsonlTransport(spawned); try { - const result = await runScriptedProbe({ transport, scenario, scriptedAnswers }); + const result = await runScriptedProbe({ + transport, + scenario, + scriptedAnswers, + responsePolicy, + simulatedUserEvents, + }); result.workspaceCwd = workspaceCwd; if (preserveWorkspaceState) { result.preservedWorkspaceStatePath = copyWorkspaceState({ workspaceCwd, outputDir }); From 
2db789d7ab3473842678e50343899bfaee0b5dd8 Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Tue, 12 May 2026 11:48:56 +0200 Subject: [PATCH 19/42] Add fixture candidate checkpoint --- memory/CARDS.md | 236 ------------------ .../agent-probes/fixture-candidate.test.ts | 157 ++++++++++++ scripts/agent-probes/fixture-candidate.ts | 173 +++++++++++++ 3 files changed, 330 insertions(+), 236 deletions(-) delete mode 100644 memory/CARDS.md create mode 100644 scripts/agent-probes/fixture-candidate.test.ts create mode 100644 scripts/agent-probes/fixture-candidate.ts diff --git a/memory/CARDS.md b/memory/CARDS.md deleted file mode 100644 index 664c37c6..00000000 --- a/memory/CARDS.md +++ /dev/null @@ -1,236 +0,0 @@ -# FE-705 scope cards — fixture-capable LLM-as-user probe path - -> Prepared by `ln-scope` on 2026-05-12 and revised after the packaged-boundary proof. These cards stay under the existing FE-705 frontier item and branch (`ln/fe-705-agent-capability-cli`). They are sub-slices, not new Linear issues or branches. The goal is a tracer-bullet path from external JSONL runner → preserved local DB fixture candidate → minimal model-backed user simulation, without adding phase closure/export or changing product UI. - -## Orientation for a new thread - -- Start by reading `memory/SPEC.md`, `memory/PLAN.md`, and this file. There is currently no `HANDOFF.md`. -- Containing seam: FE-705 agent capability CLI / external probe-runner seam governed by `memory/SPEC.md` Requirement 43, A89, D147, and I114. -- Relevant frontier item: `memory/PLAN.md` Next item 2, **Agent capability CLI + LLM-as-user fixture probe**. Keep all cards on branch `ln/fe-705-agent-capability-cli`; do not create a new Linear issue or Graphite branch for these sub-slices. -- Current repo state at scoping time: branch is ahead of origin with four FE-705 probe-runner commits; only known unrelated dirty state is untracked `.agents/skills/d3k/`, which should be left alone. 
-- What has already been proved: `scripts/agent-probes/probe-runner.ts` contains a scripted process-backed runner, tests, artifact bundle writing, redaction, and an import-boundary guard. A manual packaged-boundary smoke built the app, drove `node bin/brunch.js agent` through two real-provider turns, and wrote artifacts at `/tmp/brunch-probe-artifacts-9FQyPB`. -- Main open risk: probe / LLM-as-user / fixture-candidate code must stay clearly outside Brunch product runtime and mutation authority. The next card preserves fixture state without making it product runtime state. - -## Layering decision for this queue - -Treat `brunch agent` itself as product/runtime code, but treat the probe runner and fixture generator as **development harness** code. - -- Keep in `src/server/`: - - `agent-jsonl.ts` - - `capabilities.ts` - - capability registry / DB / product mutation handlers -- Move out of `src/server/`: - - `probe-runner.ts` - - `probe-runner.test.ts` - - future LLM-as-user simulator - - future fixture-candidate helpers -- Target location: - -```text -scripts/agent-probes/ - probe-runner.ts - probe-runner.test.ts - llm-user.ts # later card, if useful - fixture-candidate.ts # later card, if useful -``` - -- Tooling must cover `scripts/` so this harness remains linted/formatted/tested. Update `package.json` scripts as needed so `npm run fix`, `npm run check`, and `npm run verify` include `scripts/`. -- Boundary rule: `scripts/agent-probes/**` may spawn `node bin/brunch.js agent`, use Node filesystem/process utilities, and import narrow shared request schemas if necessary, but must not import Brunch DB, capability dispatch/registry, ORM schema, core workflow handlers, route-transition handlers, or turn-response transition handlers. 
- -## Card 11 — Move probe runner to scripts harness boundary - -**Status:** done - -### Objective - -The probe runner lives under `scripts/agent-probes/` as development harness code while remaining covered by project lint/format/test tooling and protected from product mutation-authority imports. - -### Acceptance Criteria - -✓ `src/server/probe-runner.ts` and `src/server/probe-runner.test.ts` are moved to `scripts/agent-probes/probe-runner.ts` and `scripts/agent-probes/probe-runner.test.ts` or an equivalent `scripts/agent-probes/` mini-library shape. -✓ `package.json` `fmt`, `fmt:check`, `lint`, and `lint:fix` include `scripts/` so the moved harness remains in the normal `npm run fix`, `npm run check`, and `npm run verify` gates. -✓ The moved tests still pass and continue proving scripted JSONL transport, process-backed runner, artifact bundle writing, redaction, and import-boundary behavior. -✓ The import-boundary test is updated for `scripts/agent-probes/**` and forbids imports from `src/server/db`, `src/server/capabilities`, `src/server/capability-registry`, `src/server/schema`, `src/server/core`, `src/server/chat-route-transition`, and `src/server/turn-response-transition`. -✓ Any manual smoke snippets or comments refer to importing from `./scripts/agent-probes/probe-runner.ts`, not `./src/server/probe-runner.ts`. - -### Verification Approach - -- Inner: `npm run test -- scripts/agent-probes/probe-runner.test.ts` (or the moved test path) plus the static import-boundary test. -- Gate: `npm run verify` to prove scripts are included in check/test/build and the product runtime still builds without bundling the harness as server code. - -### Promotion checklist - -- [ ] Does this change a requirement? -- [ ] Does this create, retire, or invalidate an assumption? -- [ ] Does this make or reverse a non-trivial design decision? -- [ ] Does this establish a new seam-level invariant? -- [ ] Does it cross more than two major seams? 
-- [ ] Is this the first touch in an unfamiliar seam from a fresh thread? -- [ ] Can you not name the containing seam or current rationale from the live docs? - -Result: stays light. This aligns file placement with the existing FE-705 decision that probe artifacts and LLM-as-user scenarios belong to an external runner, while `brunch agent` remains the product JSONL adapter. - -## Card 12 — Preserve probe workspace state for fixture candidates - -**Status:** done - -### Objective - -Process-backed probe runs can optionally preserve the temp workspace state that contains the real `.brunch` SQLite database alongside the review artifacts. - -### Acceptance Criteria - -✓ `scripts/agent-probes/probe-runner.test.ts` — `runProcessBackedProbe()` records the temp `workspaceCwd` in the artifact bundle and run result without exposing it as ambient selected product state. -✓ `scripts/agent-probes/probe-runner.test.ts` — when fixture preservation is enabled, the runner copies the workspace `.brunch/` directory or database file into the output artifact directory under a stable `workspace-state/` path. -✓ `scripts/agent-probes/probe-runner.test.ts` — when fixture preservation is disabled, existing minimal artifacts still write without copying `.brunch/` state. -✓ The copied fixture state is outside the live temp workspace and can survive temp workspace cleanup. - -### Verification Approach - -- Inner: fake process / filesystem oracle in `scripts/agent-probes/probe-runner.test.ts` for workspace path metadata, fixture copy behavior, and disabled-by-default compatibility. -- Middle: manual packaged-boundary smoke can inspect the copied SQLite fixture candidate after `npm run build` when provider credentials are present. - -### Promotion checklist - -- [ ] Does this change a requirement? -- [ ] Does this create, retire, or invalidate an assumption? -- [ ] Does this make or reverse a non-trivial design decision? -- [ ] Does this establish a new seam-level invariant? 
-- [ ] Does it cross more than two major seams? -- [ ] Is this the first touch in an unfamiliar seam from a fresh thread? -- [ ] Can you not name the containing seam or current rationale from the live docs? - -Result: stays light. This preserves evidence inside the already-established external probe-runner seam and does not change Brunch product persistence semantics. - -## Card 13 — User-simulator policy interface - -**Status:** done - -### Objective - -The probe runner can obtain turn responses through an injected user-simulator policy instead of only through positional scripted answers. - -### Acceptance Criteria - -✓ `scripts/agent-probes/probe-runner.test.ts` — `runScriptedProbe()` or its successor accepts an injected policy that receives the scenario brief, current `chat.read` projection, active turn, and prior answered turns. -✓ `scripts/agent-probes/probe-runner.test.ts` — the existing scripted behavior is reimplemented as one policy and still handles free-text and option-bearing turns. -✓ `scripts/agent-probes/probe-runner.test.ts` — policy errors become structured probe errors and artifact summaries instead of uncaught exceptions. -✓ No `scripts/agent-probes/**` code imports DB, capability dispatch/registry, schema, core, route-transition, or turn-response authority modules directly. - -### Verification Approach - -- Inner: fake transport / policy oracle proves response-policy inputs, response payload construction, and structured policy failure handling. -- Middle: import-boundary test protects the external-runner authority boundary. - -### Promotion checklist - -- [ ] Does this change a requirement? -- [ ] Does this create, retire, or invalidate an assumption? -- [ ] Does this make or reverse a non-trivial design decision? -- [ ] Does this establish a new seam-level invariant? -- [ ] Does it cross more than two major seams? -- [ ] Is this the first touch in an unfamiliar seam from a fresh thread? 
-- [ ] Can you not name the containing seam or current rationale from the live docs? - -Result: stays light. This is a local extension point inside the external runner, not a new Brunch product API. - -## Card 14 — Model-backed LLM-as-user policy with prompt artifacts - -**Status:** done - -### Target Behavior - -A model-backed user-simulator policy can answer one probe turn from the current `chat.read` projection by rendering a strict JSON-response prompt and parsing the model output into a `turn.submitResponse` payload. - -### Boundary Crossings - -```text -→ scripts/agent-probes user-simulator policy -→ rendered simulated-user prompt/context -→ injected model adapter -→ strict JSON parse / response validation -→ turn.submitResponse payload -→ probe artifact event -``` - -### Risks and Assumptions - -- RISK: The simulated user accidentally acts like the interviewer or invents product state → MITIGATION: prompt frames the model as the user only, includes only scenario brief + current question/options + compact prior Q/A, and accepts only strict response JSON. -- RISK: Model output is malformed or semantically invalid for the current turn → MITIGATION: parse through the existing turn-response payload schema shape and record structured parse failures in artifacts. -- ASSUMPTION: A `chat.read` projection contains enough context for a minimal LLM-as-user to answer early grounding turns without `turn.get` → VALIDATE: fake adapter tests plus opt-in real-provider smoke over two turns → `memory/SPEC.md` §Assumptions A89. - -### Acceptance Criteria - -✓ `scripts/agent-probes/probe-runner.test.ts` or `scripts/agent-probes/llm-user.test.ts` — a fake model adapter receives a rendered prompt containing scenario brief, active question, options when present, and compact prior Q/A. -✓ Valid model JSON for free-text and option-selection turns becomes the correct `turn.submitResponse` payload. 
-✓ Invalid JSON or schema-invalid model output becomes a structured probe error, not a thrown crash. -✓ `artifact-bundle.json` includes simulated-user prompt, raw model output, parsed response, and parse/validation status events. - -### Verification Approach - -- Inner: fake model-adapter oracle proves prompt rendering, parsing, validation, and artifact event capture without provider credentials. -- Middle: opt-in real-provider smoke after Card 15 proves the adapter can drive the packaged CLI through real interviewer questions. - -## Card 15 — Opt-in LLM-as-user packaged-boundary smoke - -**Status:** done - -### Objective - -A manual/opt-in smoke command can run the model-backed user simulator against `node bin/brunch.js agent`, preserve fixture state, and report whether a two-turn fixture candidate was produced. - -### Acceptance Criteria - -✓ A documented invocation or tiny test helper runs `npm run build` then `runProcessBackedProbe()` with the default packaged command, model-backed user policy, explicit output directory, and fixture preservation enabled. -✓ The smoke prints the artifact directory, final frontier state, turns answered, and errors as JSON only. -✓ On success, the artifact directory contains review artifacts plus preserved workspace state suitable for later golden-fixture curation. -✓ On provider/model failure, the artifact directory contains redacted failure artifacts and no secret-bearing stack dumps. - -### Verification Approach - -- Inner: fake model / fake process test covers smoke helper command construction and JSON summary shape without provider credentials. -- Outer: manual real-provider smoke proves packaged CLI + Brunch interviewer + LLM-as-user + persisted fixture artifacts end to end. - -### Promotion checklist - -- [ ] Does this change a requirement? -- [ ] Does this create, retire, or invalidate an assumption? -- [ ] Does this make or reverse a non-trivial design decision? -- [ ] Does this establish a new seam-level invariant? 
-- [ ] Does it cross more than two major seams? -- [ ] Is this the first touch in an unfamiliar seam from a fresh thread? -- [ ] Can you not name the containing seam or current rationale from the live docs? - -Result: stays light if implemented as an opt-in/manual proof wrapper over the existing runner seam. Promote if it becomes a committed product CLI surface or changes fixture authority semantics. - -## Card 16 — Fixture-candidate normalization checkpoint - -**Status:** next - -### Objective - -A completed probe artifact directory can be evaluated as a fixture candidate using deterministic metadata checks before it is promoted into a golden fixture corpus. - -### Acceptance Criteria - -✓ A fixture-candidate helper inspects an artifact directory and reports presence/shape of `artifact-bundle.json`, `summary.json`, `raw-jsonl.ndjson`, `final-chat.json`, and preserved workspace state when expected. -✓ The helper reports non-deterministic fields that would need normalization for goldens, including timestamps, ids, durations, temp paths, and provider-dependent question wording. -✓ Tests cover a complete candidate, a missing workspace-state candidate, and an error-run candidate without requiring a provider. -✓ The helper does not bless or copy artifacts into a permanent corpus yet; it only reports readiness and normalization debt. - -### Verification Approach - -- Inner: filesystem fixture oracle over synthetic artifact directories. -- Middle: run against the manual smoke artifact directory to decide whether the next frontier is golden corpus curation or more normalization. - -### Promotion checklist - -- [ ] Does this change a requirement? -- [ ] Does this create, retire, or invalidate an assumption? -- [ ] Does this make or reverse a non-trivial design decision? -- [ ] Does this establish a new seam-level invariant? -- [ ] Does it cross more than two major seams? -- [ ] Is this the first touch in an unfamiliar seam from a fresh thread? 
-- [ ] Can you not name the containing seam or current rationale from the live docs? - -Result: stays light. This is a diagnostic checkpoint before creating any durable golden fixture corpus policy. diff --git a/scripts/agent-probes/fixture-candidate.test.ts b/scripts/agent-probes/fixture-candidate.test.ts new file mode 100644 index 00000000..48206fd4 --- /dev/null +++ b/scripts/agent-probes/fixture-candidate.test.ts @@ -0,0 +1,157 @@ +import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; + +import { afterEach, describe, expect, it } from 'vitest'; + +import { inspectFixtureCandidate } from './fixture-candidate.js'; + +describe('fixture candidate checkpoint', () => { + const tempDirs: string[] = []; + + afterEach(() => { + for (const dir of tempDirs.splice(0)) { + rmSync(dir, { recursive: true, force: true }); + } + }); + + function makeTempDir(prefix: string): string { + const dir = mkdtempSync(join(tmpdir(), prefix)); + tempDirs.push(dir); + return dir; + } + + it('reports a complete artifact directory as ready with normalization debt', () => { + const dir = makeTempDir('brunch-fixture-complete-'); + writeCandidate(dir, { includeWorkspaceState: true }); + + const report = inspectFixtureCandidate(dir, { expectWorkspaceState: true }); + + expect(report.ready).toBe(true); + expect(report.files).toMatchObject({ + 'artifact-bundle.json': { present: true, validJson: true }, + 'summary.json': { present: true, validJson: true }, + 'raw-jsonl.ndjson': { present: true, validJson: true }, + 'final-chat.json': { present: true, validJson: true }, + }); + expect(report.workspaceState).toEqual({ + expected: true, + present: true, + path: join(dir, 'workspace-state'), + }); + expect(report.normalizationDebt).toEqual( + expect.arrayContaining([ + 'summary.durationMs', + 'artifact-bundle.environment.nodeVersion', + 'artifact-bundle.environment.platform', + 'artifact-bundle.environment.arch', + 
'artifact-bundle.workspace.cwd', + 'artifact-bundle.workspace.preservedStatePath', + 'artifact-bundle.summary.durationMs', + 'raw-jsonl request/response ids and resource ids', + 'final-chat generated question wording', + ]), + ); + expect(report.errors).toEqual([]); + }); + + it('flags a missing expected workspace-state fixture', () => { + const dir = makeTempDir('brunch-fixture-missing-workspace-'); + writeCandidate(dir, { includeWorkspaceState: false }); + + const report = inspectFixtureCandidate(dir, { expectWorkspaceState: true }); + + expect(report.ready).toBe(false); + expect(report.workspaceState).toEqual({ + expected: true, + present: false, + path: join(dir, 'workspace-state'), + }); + expect(report.errors).toContain('workspace-state is missing'); + }); + + it('accepts an error-run candidate while reporting failure status and normalization debt', () => { + const dir = makeTempDir('brunch-fixture-error-run-'); + writeCandidate(dir, { includeWorkspaceState: false, errorRun: true }); + + const report = inspectFixtureCandidate(dir, { expectWorkspaceState: false }); + + expect(report.ready).toBe(true); + expect(report.runStatus).toEqual({ kind: 'error-run', turnsAnswered: 0, errorCount: 1 }); + expect(report.workspaceState).toEqual({ + expected: false, + present: false, + path: join(dir, 'workspace-state'), + }); + expect(report.normalizationDebt).toContain('error messages may need provider-specific redaction review'); + expect(report.errors).toEqual([]); + }); +}); + +function writeCandidate( + dir: string, + { includeWorkspaceState, errorRun = false }: { includeWorkspaceState: boolean; errorRun?: boolean }, +): void { + const summary = { + turnsAnswered: errorRun ? 0 : 2, + finalFrontierState: errorRun ? 'awaiting_response' : 'answered', + durationMs: 23446, + questionAnswers: errorRun + ? [] + : [ + { + question: 'What is this project?', + answer: 'A repeatable fixture candidate.', + }, + ], + errors: errorRun + ? 
[ + { + requestId: 'policy-1', + capability: 'probe.responsePolicy', + code: 'policy_failed', + message: 'redacted', + }, + ] + : [], + }; + const finalChat = { + frontier: { state: summary.finalFrontierState, turnId: 101 }, + turns: summary.questionAnswers.map((pair, index) => ({ id: index + 100, ...pair })), + }; + const bundle = { + schemaVersion: 1, + scenario: { name: 'candidate', brief: 'fixture brief', specName: 'Fixture spec' }, + workspace: { + cwd: '/var/folders/example/brunch-probe-workspace-abc123', + preservedStatePath: includeWorkspaceState ? join(dir, 'workspace-state') : null, + }, + commandSequence: ['spec.create', 'chat.getPrimary', 'chat.ensureReady'], + rawJsonlTranscript: [ + { + direction: 'request', + payload: { id: 'create', capability: 'spec.create', input: { name: 'Fixture spec' } }, + }, + { direction: 'response', payload: { id: 'create', ok: true, output: { specId: 1 } } }, + ], + parsedEvents: [], + finalChat, + summary, + errors: summary.errors, + simulatedUserEvents: [], + environment: { nodeVersion: 'v24.15.0', platform: 'darwin', arch: 'arm64' }, + }; + + writeFileSync(join(dir, 'summary.json'), `${JSON.stringify(summary, null, 2)}\n`); + writeFileSync(join(dir, 'final-chat.json'), `${JSON.stringify(finalChat, null, 2)}\n`); + writeFileSync(join(dir, 'artifact-bundle.json'), `${JSON.stringify(bundle, null, 2)}\n`); + writeFileSync( + join(dir, 'raw-jsonl.ndjson'), + `${bundle.rawJsonlTranscript.map((entry) => JSON.stringify(entry)).join('\n')}\n`, + ); + + if (includeWorkspaceState) { + mkdirSync(join(dir, 'workspace-state', '.brunch'), { recursive: true }); + writeFileSync(join(dir, 'workspace-state', '.brunch', 'brunch.db'), 'sqlite'); + } +} diff --git a/scripts/agent-probes/fixture-candidate.ts b/scripts/agent-probes/fixture-candidate.ts new file mode 100644 index 00000000..9bab9e0f --- /dev/null +++ b/scripts/agent-probes/fixture-candidate.ts @@ -0,0 +1,173 @@ +import { existsSync, readFileSync } from 'node:fs'; +import { join 
} from 'node:path'; + +export interface FixtureCandidateFileReport { + present: boolean; + validJson: boolean | null; +} + +export interface FixtureCandidateReport { + ready: boolean; + files: Record; + workspaceState: { expected: boolean; present: boolean; path: string }; + runStatus: { kind: 'completed' | 'error-run'; turnsAnswered: number; errorCount: number } | null; + normalizationDebt: string[]; + errors: string[]; +} + +export function inspectFixtureCandidate( + artifactDir: string, + { expectWorkspaceState = false }: { expectWorkspaceState?: boolean } = {}, +): FixtureCandidateReport { + const errors: string[] = []; + const files = { + 'artifact-bundle.json': inspectJsonFile(join(artifactDir, 'artifact-bundle.json'), errors), + 'summary.json': inspectJsonFile(join(artifactDir, 'summary.json'), errors), + 'raw-jsonl.ndjson': inspectNdjsonFile(join(artifactDir, 'raw-jsonl.ndjson'), errors), + 'final-chat.json': inspectJsonFile(join(artifactDir, 'final-chat.json'), errors), + }; + const workspaceStatePath = join(artifactDir, 'workspace-state'); + const workspaceState = { + expected: expectWorkspaceState, + present: existsSync(workspaceStatePath), + path: workspaceStatePath, + }; + if (expectWorkspaceState && !workspaceState.present) { + errors.push('workspace-state is missing'); + } + + const summary = readJson(join(artifactDir, 'summary.json')); + const bundle = readJson(join(artifactDir, 'artifact-bundle.json')); + const runStatus = getRunStatus(summary); + const normalizationDebt = collectNormalizationDebt({ + bundle, + summary, + hasErrors: runStatus?.kind === 'error-run', + }); + + return { + ready: + Object.values(files).every((file) => file.present && file.validJson !== false) && errors.length === 0, + files, + workspaceState, + runStatus, + normalizationDebt, + errors, + }; +} + +function inspectJsonFile(path: string, errors: string[]): FixtureCandidateFileReport { + if (!existsSync(path)) { + errors.push(`${fileName(path)} is missing`); + return { 
present: false, validJson: null }; + } + + try { + JSON.parse(readFileSync(path, 'utf8')); + return { present: true, validJson: true }; + } catch { + errors.push(`${fileName(path)} is not valid JSON`); + return { present: true, validJson: false }; + } +} + +function inspectNdjsonFile(path: string, errors: string[]): FixtureCandidateFileReport { + if (!existsSync(path)) { + errors.push(`${fileName(path)} is missing`); + return { present: false, validJson: null }; + } + + const lines = readFileSync(path, 'utf8') + .split('\n') + .filter((line) => line.trim() !== ''); + try { + for (const line of lines) { + JSON.parse(line); + } + return { present: true, validJson: true }; + } catch { + errors.push(`${fileName(path)} contains invalid NDJSON`); + return { present: true, validJson: false }; + } +} + +function readJson(path: string): unknown { + if (!existsSync(path)) { + return null; + } + try { + return JSON.parse(readFileSync(path, 'utf8')); + } catch { + return null; + } +} + +function getRunStatus(summary: unknown): FixtureCandidateReport['runStatus'] { + if (!isRecord(summary)) { + return null; + } + const turnsAnswered = typeof summary.turnsAnswered === 'number' ? summary.turnsAnswered : 0; + const errors = Array.isArray(summary.errors) ? summary.errors : []; + return { + kind: errors.length > 0 ? 'error-run' : 'completed', + turnsAnswered, + errorCount: errors.length, + }; +} + +function collectNormalizationDebt({ + bundle, + summary, + hasErrors, +}: { + bundle: unknown; + summary: unknown; + hasErrors: boolean; +}): string[] { + const debt = new Set(); + + if (isRecord(summary) && typeof summary.durationMs === 'number') { + debt.add('summary.durationMs'); + } + + if (isRecord(bundle)) { + const environment = isRecord(bundle.environment) ? 
bundle.environment : null; + if (environment) { + if (typeof environment.nodeVersion === 'string') debt.add('artifact-bundle.environment.nodeVersion'); + if (typeof environment.platform === 'string') debt.add('artifact-bundle.environment.platform'); + if (typeof environment.arch === 'string') debt.add('artifact-bundle.environment.arch'); + } + + const workspace = isRecord(bundle.workspace) ? bundle.workspace : null; + if (workspace) { + if (typeof workspace.cwd === 'string') debt.add('artifact-bundle.workspace.cwd'); + if (typeof workspace.preservedStatePath === 'string') { + debt.add('artifact-bundle.workspace.preservedStatePath'); + } + } + + const bundleSummary = isRecord(bundle.summary) ? bundle.summary : null; + if (bundleSummary && typeof bundleSummary.durationMs === 'number') { + debt.add('artifact-bundle.summary.durationMs'); + } + + if (Array.isArray(bundle.rawJsonlTranscript) && bundle.rawJsonlTranscript.length > 0) { + debt.add('raw-jsonl request/response ids and resource ids'); + } + } + + debt.add('final-chat generated question wording'); + if (hasErrors) { + debt.add('error messages may need provider-specific redaction review'); + } + + return [...debt]; +} + +function fileName(path: string): string { + return path.split('/').at(-1) ?? 
path; +} + +function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} From 2e086f03214c33926d0943ffa25752d69a7f886b Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Tue, 12 May 2026 13:54:18 +0200 Subject: [PATCH 20/42] Harden probe JSONL transport failures --- scripts/agent-probes/probe-runner.test.ts | 116 ++++++++++++++++++++++ scripts/agent-probes/probe-runner.ts | 80 +++++++++++++-- 2 files changed, 188 insertions(+), 8 deletions(-) diff --git a/scripts/agent-probes/probe-runner.test.ts b/scripts/agent-probes/probe-runner.test.ts index 60e8f64c..cc2884d2 100644 --- a/scripts/agent-probes/probe-runner.test.ts +++ b/scripts/agent-probes/probe-runner.test.ts @@ -252,6 +252,109 @@ describe('probe runner', () => { expect(response).toEqual({ id: 'create', ok: true, output: { echoed: 'spec.create' } }); }); + it('settles a pending process JSONL request when the child emits an id:null protocol error', async () => { + let onStdoutData: ((chunk: string) => void) | null = null; + const process: SpawnedJsonlProcess = { + writeStdin() { + onStdoutData?.( + `${JSON.stringify({ + id: null, + ok: false, + error: { code: 'bad_request', message: 'Malformed request envelope' }, + })}\n`, + ); + }, + endStdin() {}, + onStdoutData(listener) { + onStdoutData = listener; + }, + }; + + const transport = createProcessJsonlTransport(process); + const response = await expectSettledJsonlResponse( + transport.send({ id: 'create', capability: 'spec.create', input: { name: 'Probe' } }), + ); + + expect(response).toEqual({ + id: 'create', + ok: false, + error: { code: 'protocol_error', message: 'Unmatched id:null response: Malformed request envelope' }, + }); + }); + + it('settles a pending process JSONL request when the child emits malformed JSON', async () => { + let onStdoutData: ((chunk: string) => void) | null = null; + const process: SpawnedJsonlProcess = { + writeStdin() { + onStdoutData?.('{not-json}\n'); + }, 
+ endStdin() {}, + onStdoutData(listener) { + onStdoutData = listener; + }, + }; + + const transport = createProcessJsonlTransport(process); + const response = await expectSettledJsonlResponse( + transport.send({ id: 'create', capability: 'spec.create', input: { name: 'Probe' } }), + ); + + expect(response).toEqual({ + id: 'create', + ok: false, + error: { code: 'malformed_json', message: 'Malformed JSONL response from child process' }, + }); + }); + + it('settles pending process JSONL requests when the child process exits', async () => { + let onExit: ((code: number | null) => void) | null = null; + const process: SpawnedJsonlProcess = { + writeStdin() { + onExit?.(17); + }, + endStdin() {}, + onStdoutData() {}, + onExit(listener) { + onExit = listener; + }, + }; + + const transport = createProcessJsonlTransport(process); + const response = await expectSettledJsonlResponse( + transport.send({ id: 'create', capability: 'spec.create', input: { name: 'Probe' } }), + ); + + expect(response).toEqual({ + id: 'create', + ok: false, + error: { code: 'process_exit', message: 'JSONL child process exited with code 17' }, + }); + }); + + it('settles pending process JSONL requests when the child never responds before timeout', async () => { + const process: SpawnedJsonlProcess = { + writeStdin() {}, + endStdin() {}, + onStdoutData() {}, + }; + const transportFactory = createProcessJsonlTransport as ( + process: SpawnedJsonlProcess, + options: { requestTimeoutMs: number }, + ) => JsonlTransport; + + const transport = transportFactory(process, { requestTimeoutMs: 1 }); + const response = await expectSettledJsonlResponse( + transport.send({ id: 'create', capability: 'spec.create', input: { name: 'Probe' } }), + 50, + ); + + expect(response).toEqual({ + id: 'create', + ok: false, + error: { code: 'request_timeout', message: 'JSONL child process did not respond within 1ms' }, + }); + }); + it('creates an isolated workspace and writes minimal probe artifacts outside .brunch', async 
() => { const outputDir = makeTempDir('brunch-probe-output-'); const spawnedCwds: string[] = []; @@ -436,6 +539,19 @@ describe('probe runner', () => { }); }); +async function expectSettledJsonlResponse( + response: Promise, + timeoutMs = 20, +): Promise { + const timeout = new Promise<{ timedOut: true }>((resolve) => { + setTimeout(() => resolve({ timedOut: true }), timeoutMs); + }); + const settled = await Promise.race([response, timeout]); + + expect(settled).not.toEqual({ timedOut: true }); + return settled as ProbeJsonlResponse; +} + function createScriptedSuccessTransport(): JsonlTransport { return { async send(request) { diff --git a/scripts/agent-probes/probe-runner.ts b/scripts/agent-probes/probe-runner.ts index d34454a4..19e92e27 100644 --- a/scripts/agent-probes/probe-runner.ts +++ b/scripts/agent-probes/probe-runner.ts @@ -201,9 +201,31 @@ export async function runProcessBackedProbe({ } } -export function createProcessJsonlTransport(process: SpawnedJsonlProcess): JsonlTransport { +export function createProcessJsonlTransport( + process: SpawnedJsonlProcess, + { requestTimeoutMs = 30_000 }: { requestTimeoutMs?: number } = {}, +): JsonlTransport { let buffer = ''; - const pending = new Map void>(); + const pending = new Map< + string, + { resolveResponse: (response: ProbeJsonlResponse) => void; timeout: ReturnType } + >(); + + function settle(requestId: string, response: ProbeJsonlResponse): void { + const pendingRequest = pending.get(requestId); + if (!pendingRequest) { + return; + } + clearTimeout(pendingRequest.timeout); + pending.delete(requestId); + pendingRequest.resolveResponse(response); + } + + function settleAll(error: { code: string; message: string }): void { + for (const requestId of Array.from(pending.keys())) { + settle(requestId, { id: requestId, ok: false, error }); + } + } process.onStdoutData((chunk) => { buffer += chunk; @@ -212,21 +234,63 @@ export function createProcessJsonlTransport(process: SpawnedJsonlProcess): Jsonl const line = 
buffer.slice(0, newlineIndex).trim(); buffer = buffer.slice(newlineIndex + 1); if (line !== '') { - const response = JSON.parse(line) as ProbeJsonlResponse; - if (response.id) { - pending.get(response.id)?.(response); - pending.delete(response.id); + let response: ProbeJsonlResponse; + try { + response = JSON.parse(line) as ProbeJsonlResponse; + } catch { + settleAll({ code: 'malformed_json', message: 'Malformed JSONL response from child process' }); + newlineIndex = buffer.indexOf('\n'); + continue; + } + + if (response.id === null) { + const message = response.ok + ? 'Unmatched id:null response' + : `Unmatched id:null response: ${response.error.message}`; + settleAll({ code: 'protocol_error', message }); + } else { + settle(response.id, response); } } newlineIndex = buffer.indexOf('\n'); } }); + process.onStderrData?.((chunk) => { + const message = chunk.trim().split('\n')[0] || 'JSONL child process wrote to stderr'; + settleAll({ code: 'process_stderr', message }); + }); + + process.onExit?.((code) => { + settleAll({ code: 'process_exit', message: `JSONL child process exited with code ${code ?? 'null'}` }); + }); + return { send(request) { return new Promise((resolveResponse) => { - pending.set(request.id, resolveResponse); - process.writeStdin(JSON.stringify(request)); + const timeout = setTimeout(() => { + settle(request.id, { + id: request.id, + ok: false, + error: { + code: 'request_timeout', + message: `JSONL child process did not respond within ${requestTimeoutMs}ms`, + }, + }); + }, requestTimeoutMs); + pending.set(request.id, { resolveResponse, timeout }); + try { + process.writeStdin(JSON.stringify(request)); + } catch (error) { + settle(request.id, { + id: request.id, + ok: false, + error: { + code: 'stdin_write_failed', + message: error instanceof Error ? 
error.message : String(error), + }, + }); + } }); }, }; From c6d4bdfdf9d0e2ad7dc55ed84ddf80c0a0ab206e Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Tue, 12 May 2026 13:56:48 +0200 Subject: [PATCH 21/42] Capture process probe failure artifacts --- scripts/agent-probes/probe-runner.test.ts | 66 +++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/scripts/agent-probes/probe-runner.test.ts b/scripts/agent-probes/probe-runner.test.ts index cc2884d2..5373fdfb 100644 --- a/scripts/agent-probes/probe-runner.test.ts +++ b/scripts/agent-probes/probe-runner.test.ts @@ -401,6 +401,72 @@ describe('probe runner', () => { expect(existsSync(join(outputDir, 'workspace-state'))).toBe(false); }); + it('writes sanitized process-backed failure artifacts when JSONL protocol interaction fails', async () => { + const outputDir = makeTempDir('brunch-probe-output-'); + + const result = await runProcessBackedProbe({ + scenario: { name: 'process-protocol-failure', specName: 'Process protocol failure' }, + scriptedAnswers: [], + outputDir, + spawnProcess() { + let onStdoutData: ((chunk: string) => void) | null = null; + return { + writeStdin() { + onStdoutData?.( + `${JSON.stringify({ + id: null, + ok: false, + error: { code: 'bad_request', message: 'ANTHROPIC_API_KEY=sk-secret bad envelope' }, + })}\n`, + ); + }, + endStdin() {}, + onStdoutData(listener) { + onStdoutData = listener; + }, + }; + }, + }); + + const summary = JSON.parse(readFileSync(join(outputDir, 'summary.json'), 'utf8')) as unknown; + const bundle = JSON.parse(readFileSync(join(outputDir, 'artifact-bundle.json'), 'utf8')) as unknown; + const rawJsonl = readFileSync(join(outputDir, 'raw-jsonl.ndjson'), 'utf8'); + + expect(result.summary.turnsAnswered).toBe(0); + expect(result.errors).toEqual([ + { + requestId: 'create', + capability: 'spec.create', + code: 'protocol_error', + message: 'Unmatched id:null response: ANTHROPIC_API_KEY=[redacted] bad envelope', + }, + ]); + expect(summary).toMatchObject({ + 
turnsAnswered: 0, + errors: [ + { + requestId: 'create', + capability: 'spec.create', + code: 'protocol_error', + message: 'Unmatched id:null response: ANTHROPIC_API_KEY=[redacted] bad envelope', + }, + ], + }); + expect(bundle).toMatchObject({ + commandSequence: ['spec.create'], + errors: [ + { + requestId: 'create', + capability: 'spec.create', + code: 'protocol_error', + message: 'Unmatched id:null response: ANTHROPIC_API_KEY=[redacted] bad envelope', + }, + ], + }); + expect(rawJsonl).toContain('"direction":"request"'); + expect(rawJsonl).toContain('"direction":"response"'); + }); + it('can preserve the temp workspace .brunch state into the artifact directory', async () => { const outputDir = makeTempDir('brunch-probe-output-'); let liveWorkspaceDbPath: string | null = null; From be28f27962a1ed9127a4b090e7f74653c25281f7 Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Tue, 12 May 2026 13:58:39 +0200 Subject: [PATCH 22/42] Add probe runner turn budget --- scripts/agent-probes/probe-runner.test.ts | 56 +++++++++++++++++++++++ scripts/agent-probes/probe-runner.ts | 7 ++- 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/scripts/agent-probes/probe-runner.test.ts b/scripts/agent-probes/probe-runner.test.ts index 5373fdfb..806572bf 100644 --- a/scripts/agent-probes/probe-runner.test.ts +++ b/scripts/agent-probes/probe-runner.test.ts @@ -171,6 +171,32 @@ describe('probe runner', () => { expect(result.errors).toEqual([]); }); + it('stops scripted probing after an explicit one-turn budget', async () => { + const requests: ProbeJsonlRequest[] = []; + + const result = await runScriptedProbe({ + transport: { + async send(request) { + requests.push(request); + return getFakeAgentResponse(request); + }, + }, + scenario: { name: 'one-turn', specName: 'One turn proof' }, + scriptedAnswers: ['A one-turn answer'], + turnBudget: 1, + }); + + expect(result.summary).toMatchObject({ turnsAnswered: 1, finalFrontierState: 'answered' }); + expect(requests.map((request) => 
request.id)).toEqual([ + 'create', + 'primary', + 'ready-1', + 'read-1', + 'answer-1', + 'read-2', + ]); + }); + it('can answer turns through an injected response policy', async () => { const policyInputs: Array<{ activeTurnId: number; priorAnswerCount: number; brief: string | undefined }> = []; @@ -355,6 +381,36 @@ describe('probe runner', () => { }); }); + it('passes an explicit one-turn budget through process-backed probes', async () => { + const outputDir = makeTempDir('brunch-probe-output-'); + + const result = await runProcessBackedProbe({ + scenario: { name: 'process-one-turn', specName: 'Process one turn' }, + scriptedAnswers: ['A one-turn process probe'], + outputDir, + turnBudget: 1, + spawnProcess() { + return createFakeAgentProcess(); + }, + }); + + const summary = JSON.parse(readFileSync(join(outputDir, 'summary.json'), 'utf8')) as unknown; + const bundle = JSON.parse(readFileSync(join(outputDir, 'artifact-bundle.json'), 'utf8')) as unknown; + + expect(result.summary).toMatchObject({ turnsAnswered: 1, finalFrontierState: 'answered' }); + expect(summary).toMatchObject({ turnsAnswered: 1, finalFrontierState: 'answered' }); + expect(bundle).toMatchObject({ + commandSequence: [ + 'spec.create', + 'chat.getPrimary', + 'chat.ensureReady', + 'chat.read', + 'turn.submitResponse', + 'chat.read', + ], + }); + }); + it('creates an isolated workspace and writes minimal probe artifacts outside .brunch', async () => { const outputDir = makeTempDir('brunch-probe-output-'); const spawnedCwds: string[] = []; diff --git a/scripts/agent-probes/probe-runner.ts b/scripts/agent-probes/probe-runner.ts index 19e92e27..ae846de2 100644 --- a/scripts/agent-probes/probe-runner.ts +++ b/scripts/agent-probes/probe-runner.ts @@ -151,6 +151,7 @@ interface RunScriptedProbeOptions { scriptedAnswers: string[]; responsePolicy?: ProbeResponsePolicy; simulatedUserEvents?: SimulatedUserEvent[]; + turnBudget?: number; } export interface ProcessBackedProbeOptions { @@ -164,6 +165,7 @@ export 
interface ProcessBackedProbeOptions { preserveWorkspaceState?: boolean; responsePolicy?: ProbeResponsePolicy; simulatedUserEvents?: SimulatedUserEvent[]; + turnBudget?: number; } export async function runProcessBackedProbe({ @@ -177,6 +179,7 @@ export async function runProcessBackedProbe({ preserveWorkspaceState = false, responsePolicy, simulatedUserEvents, + turnBudget, }: ProcessBackedProbeOptions): Promise { const workspaceCwd = mkdtempSync(join(tmpdir(), 'brunch-probe-workspace-')); const spawned = spawnProcess({ cwd: workspaceCwd, command, args, env }); @@ -189,6 +192,7 @@ export async function runProcessBackedProbe({ scriptedAnswers, responsePolicy, simulatedUserEvents, + turnBudget, }); result.workspaceCwd = workspaceCwd; if (preserveWorkspaceState) { @@ -302,6 +306,7 @@ export async function runScriptedProbe({ scriptedAnswers, responsePolicy = createScriptedResponsePolicy(scriptedAnswers), simulatedUserEvents = [], + turnBudget = 2, }: RunScriptedProbeOptions): Promise { const startedAt = Date.now(); const state: ProbeRunResult = { @@ -334,7 +339,7 @@ export async function runScriptedProbe({ return finishRun(state, startedAt); } - for (let turnIndex = 0; turnIndex < 2; turnIndex += 1) { + for (let turnIndex = 0; turnIndex < turnBudget; turnIndex += 1) { const ready = await sendExpectingOutput(state, transport, { id: `ready-${turnIndex + 1}`, capability: 'chat.ensureReady', From eff1b4b3c1a71b1b86888a5c13bba614221b8b99 Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Tue, 12 May 2026 14:00:32 +0200 Subject: [PATCH 23/42] Validate fixture candidate structure --- .../agent-probes/fixture-candidate.test.ts | 28 +++- scripts/agent-probes/fixture-candidate.ts | 122 ++++++++++++++++++ 2 files changed, 149 insertions(+), 1 deletion(-) diff --git a/scripts/agent-probes/fixture-candidate.test.ts b/scripts/agent-probes/fixture-candidate.test.ts index 48206fd4..6d220565 100644 --- a/scripts/agent-probes/fixture-candidate.test.ts +++ 
b/scripts/agent-probes/fixture-candidate.test.ts @@ -1,4 +1,4 @@ -import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs'; +import { mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; @@ -70,6 +70,32 @@ describe('fixture candidate checkpoint', () => { expect(report.errors).toContain('workspace-state is missing'); }); + it('rejects parseable artifacts with invalid structure or inconsistent duplicated fields', () => { + const dir = makeTempDir('brunch-fixture-invalid-'); + writeCandidate(dir, { includeWorkspaceState: false }); + const bundlePath = join(dir, 'artifact-bundle.json'); + const bundle = JSON.parse(readFileSync(bundlePath, 'utf8')) as Record; + bundle.schemaVersion = 2; + bundle.summary = { turnsAnswered: 'two' }; + bundle.finalChat = null; + bundle.rawJsonlTranscript = []; + delete bundle.commandSequence; + writeFileSync(bundlePath, `${JSON.stringify(bundle, null, 2)}\n`); + + const report = inspectFixtureCandidate(dir); + + expect(report.ready).toBe(false); + expect(report.errors).toEqual( + expect.arrayContaining([ + 'artifact-bundle.json schemaVersion must be 1', + 'artifact-bundle.json commandSequence must be an array', + 'artifact-bundle.summary does not match summary.json', + 'artifact-bundle.finalChat does not match final-chat.json', + 'artifact-bundle.rawJsonlTranscript does not match raw-jsonl.ndjson', + ]), + ); + }); + it('accepts an error-run candidate while reporting failure status and normalization debt', () => { const dir = makeTempDir('brunch-fixture-error-run-'); writeCandidate(dir, { includeWorkspaceState: false, errorRun: true }); diff --git a/scripts/agent-probes/fixture-candidate.ts b/scripts/agent-probes/fixture-candidate.ts index 9bab9e0f..27863263 100644 --- a/scripts/agent-probes/fixture-candidate.ts +++ b/scripts/agent-probes/fixture-candidate.ts @@ -38,6 +38,9 @@ export function inspectFixtureCandidate( const summary = 
readJson(join(artifactDir, 'summary.json')); const bundle = readJson(join(artifactDir, 'artifact-bundle.json')); + const finalChat = readJson(join(artifactDir, 'final-chat.json')); + const rawJsonlTranscript = readNdjson(join(artifactDir, 'raw-jsonl.ndjson')); + validateCandidateStructure({ bundle, summary, finalChat, rawJsonlTranscript, errors }); const runStatus = getRunStatus(summary); const normalizationDebt = collectNormalizationDebt({ bundle, @@ -102,6 +105,125 @@ function readJson(path: string): unknown { } } +function readNdjson(path: string): unknown[] | null { + if (!existsSync(path)) { + return null; + } + try { + return readFileSync(path, 'utf8') + .split('\n') + .filter((line) => line.trim() !== '') + .map((line) => JSON.parse(line) as unknown); + } catch { + return null; + } +} + +function validateCandidateStructure({ + bundle, + summary, + finalChat, + rawJsonlTranscript, + errors, +}: { + bundle: unknown; + summary: unknown; + finalChat: unknown; + rawJsonlTranscript: unknown[] | null; + errors: string[]; +}): void { + validateSummaryStructure(summary, errors); + validateBundleStructure(bundle, errors); + + if (isRecord(bundle)) { + if (!deepEqual(bundle.summary, summary)) { + errors.push('artifact-bundle.summary does not match summary.json'); + } + if (!deepEqual(bundle.finalChat, finalChat)) { + errors.push('artifact-bundle.finalChat does not match final-chat.json'); + } + if (!Array.isArray(rawJsonlTranscript) || !deepEqual(bundle.rawJsonlTranscript, rawJsonlTranscript)) { + errors.push('artifact-bundle.rawJsonlTranscript does not match raw-jsonl.ndjson'); + } + } +} + +function validateSummaryStructure(summary: unknown, errors: string[]): void { + if (!isRecord(summary)) { + errors.push('summary.json is not an object'); + return; + } + + requireField(summary, 'turnsAnswered', 'number', 'summary.json', errors); + if (typeof summary.finalFrontierState !== 'string' && summary.finalFrontierState !== null) { + errors.push('summary.json 
finalFrontierState must be a string or null'); + } + requireField(summary, 'durationMs', 'number', 'summary.json', errors); + requireArrayField(summary, 'questionAnswers', 'summary.json', errors); + requireArrayField(summary, 'errors', 'summary.json', errors); +} + +function validateBundleStructure(bundle: unknown, errors: string[]): void { + if (!isRecord(bundle)) { + errors.push('artifact-bundle.json is not an object'); + return; + } + + if (bundle.schemaVersion !== 1) { + errors.push('artifact-bundle.json schemaVersion must be 1'); + } + requireRecordField(bundle, 'scenario', 'artifact-bundle.json', errors); + requireRecordField(bundle, 'workspace', 'artifact-bundle.json', errors); + requireArrayField(bundle, 'commandSequence', 'artifact-bundle.json', errors); + requireArrayField(bundle, 'rawJsonlTranscript', 'artifact-bundle.json', errors); + requireArrayField(bundle, 'parsedEvents', 'artifact-bundle.json', errors); + if (!('finalChat' in bundle)) { + errors.push('artifact-bundle.json finalChat is missing'); + } + requireRecordField(bundle, 'summary', 'artifact-bundle.json', errors); + requireArrayField(bundle, 'errors', 'artifact-bundle.json', errors); + requireArrayField(bundle, 'simulatedUserEvents', 'artifact-bundle.json', errors); + requireRecordField(bundle, 'environment', 'artifact-bundle.json', errors); +} + +function requireField( + record: Record, + field: string, + type: 'number' | 'string', + label: string, + errors: string[], +): void { + if (typeof record[field] !== type) { + errors.push(`${label} ${field} must be a ${type}`); + } +} + +function requireArrayField( + record: Record, + field: string, + label: string, + errors: string[], +): void { + if (!Array.isArray(record[field])) { + errors.push(`${label} ${field} must be an array`); + } +} + +function requireRecordField( + record: Record, + field: string, + label: string, + errors: string[], +): void { + if (!isRecord(record[field])) { + errors.push(`${label} ${field} must be an object`); + } 
+} + +function deepEqual(left: unknown, right: unknown): boolean { + return JSON.stringify(left) === JSON.stringify(right); +} + function getRunStatus(summary: unknown): FixtureCandidateReport['runStatus'] { if (!isRecord(summary)) { return null; From 8309e547095ef4ced861b08988743b230dc6c395 Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Tue, 12 May 2026 14:02:05 +0200 Subject: [PATCH 24/42] Split fixture readiness reporting --- scripts/agent-probes/fixture-candidate.test.ts | 14 +++++++++----- scripts/agent-probes/fixture-candidate.ts | 8 +++++--- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/scripts/agent-probes/fixture-candidate.test.ts b/scripts/agent-probes/fixture-candidate.test.ts index 6d220565..0f8073e9 100644 --- a/scripts/agent-probes/fixture-candidate.test.ts +++ b/scripts/agent-probes/fixture-candidate.test.ts @@ -21,13 +21,14 @@ describe('fixture candidate checkpoint', () => { return dir; } - it('reports a complete artifact directory as ready with normalization debt', () => { + it('reports a complete artifact directory as parse-ready and structure-ready with normalization debt', () => { const dir = makeTempDir('brunch-fixture-complete-'); writeCandidate(dir, { includeWorkspaceState: true }); const report = inspectFixtureCandidate(dir, { expectWorkspaceState: true }); - expect(report.ready).toBe(true); + expect(report.parseReady).toBe(true); + expect(report.structureReady).toBe(true); expect(report.files).toMatchObject({ 'artifact-bundle.json': { present: true, validJson: true }, 'summary.json': { present: true, validJson: true }, @@ -61,7 +62,8 @@ describe('fixture candidate checkpoint', () => { const report = inspectFixtureCandidate(dir, { expectWorkspaceState: true }); - expect(report.ready).toBe(false); + expect(report.parseReady).toBe(true); + expect(report.structureReady).toBe(false); expect(report.workspaceState).toEqual({ expected: true, present: false, @@ -84,7 +86,8 @@ describe('fixture candidate checkpoint', () => { const 
report = inspectFixtureCandidate(dir); - expect(report.ready).toBe(false); + expect(report.parseReady).toBe(true); + expect(report.structureReady).toBe(false); expect(report.errors).toEqual( expect.arrayContaining([ 'artifact-bundle.json schemaVersion must be 1', @@ -102,7 +105,8 @@ describe('fixture candidate checkpoint', () => { const report = inspectFixtureCandidate(dir, { expectWorkspaceState: false }); - expect(report.ready).toBe(true); + expect(report.parseReady).toBe(true); + expect(report.structureReady).toBe(true); expect(report.runStatus).toEqual({ kind: 'error-run', turnsAnswered: 0, errorCount: 1 }); expect(report.workspaceState).toEqual({ expected: false, diff --git a/scripts/agent-probes/fixture-candidate.ts b/scripts/agent-probes/fixture-candidate.ts index 27863263..bec95ec4 100644 --- a/scripts/agent-probes/fixture-candidate.ts +++ b/scripts/agent-probes/fixture-candidate.ts @@ -7,7 +7,8 @@ export interface FixtureCandidateFileReport { } export interface FixtureCandidateReport { - ready: boolean; + parseReady: boolean; + structureReady: boolean; files: Record; workspaceState: { expected: boolean; present: boolean; path: string }; runStatus: { kind: 'completed' | 'error-run'; turnsAnswered: number; errorCount: number } | null; @@ -36,6 +37,7 @@ export function inspectFixtureCandidate( errors.push('workspace-state is missing'); } + const parseReady = Object.values(files).every((file) => file.present && file.validJson !== false); const summary = readJson(join(artifactDir, 'summary.json')); const bundle = readJson(join(artifactDir, 'artifact-bundle.json')); const finalChat = readJson(join(artifactDir, 'final-chat.json')); @@ -49,8 +51,8 @@ export function inspectFixtureCandidate( }); return { - ready: - Object.values(files).every((file) => file.present && file.validJson !== false) && errors.length === 0, + parseReady, + structureReady: parseReady && errors.length === 0, files, workspaceState, runStatus, From 49214bb659e809773af66c4994b34e7e9c388c4c Mon 
Sep 17 00:00:00 2001 From: Lu Nelson Date: Tue, 12 May 2026 17:10:07 +0200 Subject: [PATCH 25/42] first full grill of spec evolution strategies --- docs/design/SPEC_EVOLUTION_STRATEGIES.md | 353 +++++++++++++++++++++++ 1 file changed, 353 insertions(+) create mode 100644 docs/design/SPEC_EVOLUTION_STRATEGIES.md diff --git a/docs/design/SPEC_EVOLUTION_STRATEGIES.md b/docs/design/SPEC_EVOLUTION_STRATEGIES.md new file mode 100644 index 00000000..94fa3ef4 --- /dev/null +++ b/docs/design/SPEC_EVOLUTION_STRATEGIES.md @@ -0,0 +1,353 @@ +# Spec Evolution Strategies + +> Status: **conversation capture / design seed**. +> Date: 2026-05-12. +> Scope: alternative strategies for advancing a Brunch specification's intent graph from vague user intent toward phase-mature, reviewable semantic truth. This note captures the model discovered while discussing the FE-705 `brunch agent` / probe-harness branch. +> +> Related docs: [`AGENT_MUTATION_SURFACE.md`](./AGENT_MUTATION_SURFACE.md), [`BEHAVIORAL_KERNELS.md`](./BEHAVIORAL_KERNELS.md), [`INTENT_GRAPH_SEMANTICS.md`](./INTENT_GRAPH_SEMANTICS.md), [`MULTI_CHAT.md`](./MULTI_CHAT.md), [`PATCH_LEDGER.md`](./PATCH_LEDGER.md). + +## Why this note exists + +The current FE-705 branch adds a local `brunch agent` JSONL capability adapter plus an external probe runner. The immediate implementation looks like CLI / harness infrastructure, but the design pressure behind it is broader: Brunch needs a way to try alternative **spec evolution strategies** outside the browser UI and compare their outputs against realistic completed-spec fixtures. + +The current interviewer strategy is grounded but long. Early users noticed that getting to a useful spec can require many questions. Alternative strategies should reduce user burden without weakening the intent graph into plausible but incoherent generated prose. + +## Core distinction + +A Brunch strategy is not just a prompt.
A strategy is a policy for advancing a specification's semantic state: + +- what context it reads, +- what questions or candidate artifacts it produces, +- what unit of output it treats as coherent, +- what authority it has to commit graph truth, +- what review or validation must happen before commit, +- what evidence it contributes toward semantic maturity / phase advancement. + +This suggests a strategy layer above individual interviewer prompts and below durable graph mutation authority. + +## Chat-local strategy and turn shape + +A strategy is **chat-local process state**, not specification-level semantic truth. In the multi-chat model, a specification workspace can have many chats, each with its own strategy and resumable context. + +A Brunch `turn` is assistant- or system-first: the assistant/system offers, proposes, asks, or reports something, and the user response completes the bundle. Observer assessment reads the whole bundle, because the assistant/system part is the context that gives the user's response meaning. + +A strategy can therefore be established by the first turn in a chat: + +```text +assistant/system offer: + "How would you like to proceed?" + - Walk me through it step by step + - Show me strong options quickly + - Ask me targeted design cases + +user response: + "Show me strong options quickly" +``` + +Some globally-triggered flows may create or reuse a chat that is effectively pre-initialized to a strategy. For example, "start reconciliation" or "review this graph" can create a chat whose first assistant/system turn is already the kickoff for that procedure rather than a generic mode-selection offer. + +A chat's strategy should be technically mutable, with changes explainable through later turns, but explicit strategy-switch UX is deferred. Tactical sub-strategies are allowed inside a chat: a `scenario_options` chat might use targeted kernel cases to harden a selected candidate, and a `graph_review` chat might ask clarifying questions. 
+ +## Changesets as semantic history spine + +The multi-chat move separates conversational provenance from semantic history. Turns should no longer be the specification's only historical spine. The future **changeset ledger** should record intent-graph evolution: + +```text +changeset: + one atomic semantic mutation set + +change: + one atomic add/update/link/unlink/retire/etc. inside the changeset +``` + +A `changeset` mutates a specification from one semantically / structurally valid and coherent state to another, including any `reconciliation_need` rows that are opened or resolved as part of the mutation. The data changes inside the changeset and the recording of the changeset itself must succeed or fail together. + +The changeset boundary should be the smallest atomic unit that preserves semantic coherence. If applying only half of a mutation would leave the graph incoherent, it belongs in one changeset. + +A graph-review finding, candidate proposal, or reconciliation suggestion is not itself a changeset until accepted or otherwise acted on. It is the assistant/system half of an open frontier turn in its chat. The turn becomes complete when the user responds through one of the afforded actions, and only then may the runtime apply a changeset. + +Proposal / finding artifacts can start as turn-owned structured assistant parts rather than standalone rows. A standalone proposal or proposed-changeset model should wait until batch review, expiry, assignment, cross-chat surfacing, or independent proposal lifecycle demands it. + +When a turn is created, it should stamp the latest applied changeset id for the specification — for example `turn.opened_at_changeset_id` or `turn.base_changeset_id`. This is not provenance; it is the semantic graph revision the assistant/system offer was based on. If a turn remains open while `specification.latest_changeset_id` advances, the open offer is considered stale in the first cut. 
The product can simply offer to regenerate or refresh the proposal rather than attempting sophisticated neighborhood-level staleness analysis. + +A chat should have at most one open frontier turn at a time. Otherwise the runtime cannot know which assistant/system offer the user's response completes. In normal operation, every active/resumable chat should have an open frontier turn, even if it is a scripted frontier such as the first offer in a new side-chat. If a chat somehow has no open turn, the UI can offer "continue this chat" or generate a new frontier when the chat is focused. The generated frontier may depend on the specification's current semantic maturity / `phase` value and the chat's strategy. A specification may have many open frontier turns across different chats. + +Proposal turns should share a small normalized completion-action vocabulary, with strategy-specific user-facing labels mapped onto common semantics: + +- `accept` — authorize the proposed action / bundle / fix; may apply a changeset. +- `reject` — decline the proposal without semantic mutation. This should be narrow: rejecting or arbitrarily editing part of a coherent proposal can itself create incoherence. In reconciliation contexts, rejection may leave the original `reconciliation_need` open or create a new one rather than resolving the issue. +- `revise` — request changes to the proposal; completes the current turn and usually opens a successor proposal turn. User-facing labels such as "Request changes" map here. +- `ask_followup` — request explanation or clarification before deciding. +- `defer` — intentionally leave the matter unresolved or parked. +- `regenerate` — ask the system to recreate the offer, especially when stale or low-quality. + +Only `accept` should apply semantic changesets. Other actions may create process metadata or successor turns, but should not mutate intent graph truth directly. 
If a no-edit outcome still resolves process debt, model it as accepting a proposal whose changeset resolves the relevant need. `revise` is proposal-level transformation: it asks the system to produce a new coherent proposal, not to partially mutate canonical graph truth. + +Direct editing is a sibling mutation path, not the same as proposal revision. In explicit edit mode, the user may make direct pending changes to one or more intent items in memory. When the user exits / applies edit mode, Brunch computes affected edges and opens required `reconciliation_need` rows; the direct item changes and reconciliation needs commit together in one changeset. Direct editing is safe because incoherence risk is materialized as process debt, not because arbitrary edits are prevented. + +Review-set direct edits have a special consequence: if the user directly edits proposed review-set items, accepting the review set as-is is no longer valid. The UI should disable `accept`; `request changes` becomes a reconciliation-oriented action such as `request reconciliation`. The edited candidate/review set must be reconciled before it can become canonical truth. + +Implementation can later choose whether these are distinct response shapes or a `kind` inside a discriminated response union. + +Changeset provenance may point to different initiators: + +- a turn in a chat, +- a user direct edit, +- a graph-review acceptance, +- a reconciliation pass, +- a verifier result, +- an import or migration, +- a future procedure run if the runtime needs a durable operation record distinct from any one turn. + +This makes a `procedure_run` concept useful but not automatically schema-worthy. Some procedures may be represented by one or more turns plus resulting changesets. A first-class `procedure_run` table becomes necessary only when operation lifecycle, retry/cancel, multi-turn grouping, or non-chat provenance cannot be represented cleanly by turns and changesets. 
+ +## Strategy taxonomy discovered so far + +### 1. Design-decision-tree drilldown + +The current default interviewer strategy. + +It asks phase-shaped questions that walk down the user's design-decision tree at increasing levels of detail until enough shared understanding exists to project requirements and criteria. + +**Strengths** + +- High provenance: graph claims are supported by user answers. +- Incremental: each turn can be observed, classified, and committed. +- Good for users who have context and patience. + +**Weaknesses** + +- Slow and question-heavy. +- Asks the user to do much of the design judgment work. +- Can feel like the app is demanding effort before providing leverage. + +**Likely authority shape** + +Incremental canonical commits are acceptable when each answer is processed through existing observer / review semantics. + +### 2. Scenarios with tradeoffs + +A proposed low-friction strategy for users who are impatient, underspecified, or unsure how to judge design choices. + +Instead of asking for every detail, Brunch asks enough to identify the user's product / use-case typology, generates two or more complete scenario-shaped candidate specs, summarizes the tradeoffs of each, and lets the user choose or revise a coherent scenario. + +**Strengths** + +- Gives users something concrete to react to quickly. +- Shifts burden from open-ended design authorship to recognition and comparison. +- Can surface tradeoffs, excluded alternatives, and likely implications earlier. + +**Weaknesses / risk** + +- Generating a valid intent graph in one pass is a tall order for an LLM. +- The failure mode is not only bad prose; it is plausible graph structure whose items and edges are generic, internally weak, contradictory, overconfident, or unsupported. +- User item-by-item acceptance can create semantic incoherence because graph items are not independent. 
+ +**Likely authority shape** + +Generated scenarios should enter as **candidate graph bundles**, not loose collections of proposed graph items. The default acceptance unit should be the coherent bundle. User edits should produce a revised coherent candidate, not arbitrary partial mutation of canonical truth. + +Partial acceptance should only be allowed when the accepted subset is semantically closed, or when the system can automatically bring along required supporting items / edges. + +### 3. Kernel-driven contrastive elicitation + +Inferred from [`BEHAVIORAL_KERNELS.md`](./BEHAVIORAL_KERNELS.md). + +The interviewer detects latent behavioral / correctness kernels in the user's feature and asks compact contrastive scenario questions. The user classifies a concrete divergent case, and the answer emits typed intent graph artifacts directly. + +Example: instead of asking "How should permissions work?", ask whether a user who receives folder access should automatically receive access to documents added later. + +**Strengths** + +- Lower friction than full drilldown. +- More grounded than whole-spec generation. +- Produces high-signal artifacts: decisions, invariants, criteria, positive examples, negative examples, and typed edges. +- Helps users judge concrete cases rather than author abstract requirements. + +**Weaknesses / risk** + +- Requires kernel-card machinery: detection signals, question templates, artifact schemas, validators, and cross-kernel deduplication. +- Kernel ordering and composition are unresolved. +- A graph can become locally strong around activated kernels while remaining globally incomplete. + +**Likely authority shape** + +Kernel answers may be safe to commit incrementally when the emitted artifacts are validated against the kernel contract and relation policy. Kernel-generated artifacts should retain the worked scenario as evidence. + +### 4. 
Topology-driven targeting + +Mentioned in the behavioral-kernels design as complementary to kernel-driven questioning. + +This may be less a user-facing strategy than a scheduler / targeting policy: once a graph exists, Brunch reads graph topology and epistemic metadata to choose where the next question, critique, or repair should focus. + +Examples: high-fanout low-confidence assumptions, decisions without rejected alternatives, requirements without verification edges, criteria without targets, or conflicting constraints. + +## Relation directionality + +The current `knowledge_edge` relation names mix directionality in ways that become risky once edges drive reconciliation. For example, `depends_on` and `derived_from` naturally read downstream-to-upstream, while `constrains` and `verifies` often read upstream-to-downstream or evidence-to-claim. + +Because FE-700 is already expected to expand the intent-graph ontology, breaking existing relation names and records remains acceptable. However, trying to force every useful edge verb into one dependency direction may distort the ontology around one operation. The graph must serve display, prompt context, export trace, requirements projection, reconciliation, critique, verification, candidate generation, and explanation. + +The safer rule is: + +> Edge verbs should be semantically clear; operational direction belongs in relation policy. + +Every relation kind should declare: + +- canonical sentence, e.g. `{source} verifies {target}`, +- inverse display sentence, +- whether it participates in visible graph display, export trace, staleness, reconciliation, criteria help, or weak suggestion flows, +- what happens when the source changes, +- what happens when the target changes. + +Code should not infer reconciliation behavior from raw edge direction. Direct edit and hard-impact cascade should enumerate incident accepted edges and ask relation policy which opposite endpoint, if any, receives a `reconciliation_need`. 
+ +The contrastive-kernel strategy may also drive a further expanded ontology. Kernel questions naturally surface artifacts such as `alternative`, `question`, `ambiguity`, `candidate`, and rejected option records. Example: a containment/topology question about deleting a parent has multiple alternatives; the user's answer chooses one, rejects others, and emits an invariant plus positive/negative examples. FE-700 should leave room for these artifacts, even if the first implementation represents some of them as examples, decisions, or proposal-local structures rather than durable top-level item kinds. + +## Graph operations surfaced by the discussion + +### Graph reconciliation + +Repair-oriented. + +Starts from a known disturbance or process obligation, such as an open `reconciliation_need` caused by an edit, semantic conflict, verifier result, or changed upstream item. + +The reconciler's question is: + +> Given this specific change or conflict, what existing graph truth needs to be repaired, confirmed, dismissed, or escalated? + +Likely outputs: + +- auto-confirm target still holds, +- auto-edit a target through the standard mutation path, +- mark need irrelevant / resolved, +- escalate to HITL because a semantic conflict requires judgment, +- open or regenerate downstream reconciliation needs. + +This should stay tied to known coherence obligations. It should not become the umbrella term for all graph intelligence. + +### Graph review / critique + +Quality-oriented. + +Can run on any intent graph, whether produced by drilldown, scenario generation, import, direct editing, or kernel elicitation. + +The reviewer asks: + +> If this graph is supposed to represent a good spec at its current maturity stage, where is it weak, thin, overconfident, under-supported, ambiguous, generic, uncheckable, or missing important structure? 
+ +Likely review dimensions: + +- internal coherence, +- coverage, +- decision usefulness, +- tradeoff honesty, +- checkability, +- granularity, +- scenario fidelity, +- epistemic labeling, +- provenance strength, +- downstream usefulness. + +A graph can have no reconciliation needs and still fail graph review. + +## Candidate graph bundles + +`scenarios-with-tradeoffs` introduces a unit that is not yet represented by today's canonical graph model: a speculative but coherent candidate world. + +A candidate bundle should probably contain: + +- scenario summary, +- intended maturity stage, +- tradeoff profile, +- generated items, +- generated edges, +- required core items, +- optional / swappable items, +- known risks, +- critic findings, +- provenance / epistemic labels, +- commit preconditions. + +The branch's probe harness can help compare candidate bundles against drilldown-produced fixture specs before any product UI commits to this flow. + +## Phase / maturity implication + +`spec.phase` should be understood as a semantic maturity signal, not merely a positional route label. The frontend currently presents separate phase routes, but phase is better interpreted as a cumulative rating of the spec's evolution stage. + +A strategy should therefore not merely "move the user to a phase." It should contribute evidence that the graph has reached a maturity bar. + +Potential maturity signals: + +- no blocking reconciliation needs, +- graph-review findings above an acceptable threshold, +- coverage across required item and edge families, +- enough checkable criteria, +- enough user-confirmed or strongly-supported provenance for high-impact claims, +- no blocking unresolved critique findings. 
+ +This creates useful distinctions: + +- **coherent**: no known contradictions / open process debt, +- **complete enough**: covers the necessary semantic territory, +- **good enough**: specific, tradeoff-aware, checkable, and useful, +- **phase-mature**: meets the bar for the next projection / export stage. + +## Product sequencing + +The most desired product surfaces are likely: + +1. first-turn strategy choice for a new chat / spec start, +2. a mid-interview "speed this up" / "show me strong options" affordance. + +Engineering still needs part of `graph_review` to make scenario generation credible. `scenario_options` can be the first product-facing strategy while `graph_review` remains an internal oracle used to critique, repair, and score generated candidate bundles before they are shown or committed. + +For mid-interview acceleration, the preferred shape is to branch into a new or reused side-chat / strategy chat rather than switching the primary interview chat in place. The side-chat branch can use the current graph and transcript as context, generate reviewed scenario options, and preserve the main interview frontier if the generated path disappoints or needs to be resumed later. This is likely more flexible than the current main-chat UX, which still assumes the original guided interview shape. + +A scenario-options side-chat should receive a context pack rather than an unstructured full transcript dump. Minimum context includes the current spec name / mode, semantic maturity / phase, summarized user goal and context, accepted intent graph items, important edge neighborhoods, the current open frontier question if relevant, unresolved assumptions or low-confidence areas, and recent turns only when they explain user style or intent. + +For the mid-interview "speed this up" use case, generated scenarios should default to **complete the current direction**: treat accepted graph truth as fixed premises and fill in plausible missing structure. 
A more radical "show alternatives that challenge prior assumptions" mode is feasible but deferred. + +Scenario generation should present **2–3 options** with named tradeoff profiles rather than many variants. Each visible option should have a short name, scenario summary, key assumptions, what it optimizes for, what it gives up, confidence / review warnings, and `Use this` / `Revise` style actions. Internally, each option maps to a candidate graph bundle. + +Candidate quality gates should be tiered by latency budget. Synchronous gates before display should be fast: parse validity, schema validity, coarse fixed-premise check, no obvious contradiction, and a present tradeoff summary. Deeper graph review — coverage, checkability gaps, provenance warnings, and repair/refinement — can run asynchronously after the user has something to read. + +The existing observer-style async capture mechanism could generalize into an async semantic worker queue for capture / review / refine / repair. The product can show initial candidates while background graph-review proofing and optional repair improve their readiness. + +Candidate readiness should distinguish clean acceptance from acceptance with represented problems. Useful statuses include `draft`, `reviewing`, `reviewed_clean`, `reviewed_with_issues`, and `blocked`. `reviewed_clean` can be accepted normally. `reviewed_with_issues` may be accepted if Brunch can durably represent the open problems, for example by opening an immediate follow-on graph-review turn or by creating appropriate problem records / `reconciliation_need` rows in the accepting changeset. `blocked` candidates cannot be accepted without repair or regeneration. + +This preserves the reconciliation philosophy: imperfect graph states are allowed if their problems are explicit and durable, not hidden. 
When a candidate is accepted with open issues, Brunch should open or reuse a graph-review chat with a frontier turn that summarizes the remaining issues and asks what to address first. This keeps scenario comparison separate from problem repair, avoids polluting the primary interview, and reuses a long-lived review workbench. + +Broader graph-review issues should start as turn-owned structured artifacts rather than a new table. `reconciliation_need` remains the only first-class problem table for now, scoped to coherence / staleness process debt caused by relation impacts. A generalized `graph_issue` or `problem` table is a future option if review findings need cross-chat querying, filtering, assignment, badges, or lifecycle independent of a turn. + +## Why FE-705 matters to this direction + +The `brunch agent` JSONL seam is a strategy test harness, not just a CLI. + +It lets external probes: + +- drive the current drilldown path headlessly, +- produce realistic completed-spec fixture candidates, +- preserve workspace state for curation, +- compare alternative generation / review strategies against known-good or semi-golden graphs, +- exercise Brunch-owned mutation authority rather than direct DB shortcuts. + +This gives Brunch a way to evaluate strategy outputs before exposing them as product modes. + +## Open questions for grilling + +1. **Strategy selection** — Who chooses the strategy: user, system, or both? Can Brunch switch strategies midstream? +2. **User-facing mode names** — What does the user see: step-by-step interview, scenario options, targeted design cases, review for gaps? Or something else? +3. **Commit authority** — Which strategy outputs become canonical truth immediately, which become proposals, and which become candidate bundles? +4. **Candidate bundle boundary** — What makes a generated scenario bundle coherent enough to present? What makes it coherent enough to commit? +5. 
**Partial acceptance** — Do we ever allow item-level acceptance from a candidate graph? If so, how do we prove or maintain semantic closure? +6. **Graph review authority** — Does graph review only produce findings, or can it propose candidate changesets / revised bundles? +7. **Graph review bar** — What qualities matter beyond structural validity and absence of conflict? +8. **Reconciliation vs review boundary** — What exactly belongs to reconciliation, and what must stay in critique/review? +9. **Maturity model** — What evidence should count toward phase advancement for drilldown, kernel-driven, and scenario-generated specs? +10. **Kernel implementation boundary** — Are kernel cards configuration, prompts, code modules, or all three? What is the smallest useful kernel-card contract? +11. **Kernel ordering and composition** — When multiple kernels are active, who decides ordering and how are overlapping emitted artifacts deduplicated? +12. **Fixture evaluation** — What rubric determines whether a drilldown-produced spec is good enough to become a golden fixture? +13. **Strategy comparison** — What metrics or review process compare drilldown, kernel, and scenario outputs fairly? +14. **Changeset dependency** — Does `scenarios-with-tradeoffs` require a durable changeset / candidate graph model before productization, or can probes run with artifact-only bundles first? +15. **UI sequencing** — Should the first product surface be strategy choice at spec creation, a mid-interview assist, a graph-review button, or something else? 
From 36e72f89631ca89172b32a9f306f411ec9d5ce54 Mon Sep 17 00:00:00 2001
From: Lu Nelson
Date: Tue, 12 May 2026 17:31:20 +0200
Subject: [PATCH 26/42] RFC version of spec evolution, integrated into spec
 and plan

---
 docs/design/SPEC_EVOLUTION_STRATEGIES.md | 373 ++++++++---------------
 1 file changed, 132 insertions(+), 241 deletions(-)

diff --git a/docs/design/SPEC_EVOLUTION_STRATEGIES.md b/docs/design/SPEC_EVOLUTION_STRATEGIES.md
index 94fa3ef4..1045451e 100644
--- a/docs/design/SPEC_EVOLUTION_STRATEGIES.md
+++ b/docs/design/SPEC_EVOLUTION_STRATEGIES.md
@@ -1,194 +1,126 @@
 # Spec Evolution Strategies
 
-> Status: **conversation capture / design seed**.
+> Status: **design RFC — graduated into `memory/SPEC.md` / `memory/PLAN.md`**.
 > Date: 2026-05-12.
-> Scope: alternative strategies for advancing a Brunch specification's intent graph from vague user intent toward phase-mature, reviewable semantic truth. This note captures the model discovered while discussing the FE-705 `brunch agent` / probe-harness branch.
+> Scope: chat-local strategies for advancing a Brunch specification's intent graph from vague user intent toward phase-mature, reviewable semantic truth.
 >
 > Related docs: [`AGENT_MUTATION_SURFACE.md`](./AGENT_MUTATION_SURFACE.md), [`BEHAVIORAL_KERNELS.md`](./BEHAVIORAL_KERNELS.md), [`INTENT_GRAPH_SEMANTICS.md`](./INTENT_GRAPH_SEMANTICS.md), [`MULTI_CHAT.md`](./MULTI_CHAT.md), [`PATCH_LEDGER.md`](./PATCH_LEDGER.md).
 
-## Why this note exists
+## Problem
 
-The current FE-705 branch adds a local `brunch agent` JSONL capability adapter plus an external probe runner. The immediate implementation looks like CLI / harness infrastructure, but the design pressure behind it is broader: Brunch needs a way to try alternative **spec evolution strategies** outside the browser UI and compare their outputs against realistic completed-spec fixtures.
+The current interviewer is grounded but slow.
It uses a design-decision-tree drilldown strategy: ask phase-shaped questions, walk down the user's design tree, and gradually accumulate enough shared understanding for requirements and criteria. That produces high-provenance intent graph truth, but early users notice the question burden quickly. -The current interviewer strategy is grounded but long. Early users noticed that getting to a useful spec can require many questions. Alternative strategies should reduce user burden without weakening the intent graph into plausible but incoherent generated prose. +Brunch needs alternative spec-evolution strategies that reduce user burden without weakening the graph into plausible but incoherent generated prose. The FE-705 `brunch agent` / probe-harness branch is therefore not only a CLI feature; it is the first practical strategy test harness. It lets external probes drive the real Brunch lifecycle, generate drilldown-based completed-spec fixtures, and compare alternative strategy outputs before committing product UI. -## Core distinction +## Core model -A Brunch strategy is not just a prompt. A strategy is a policy for advancing a specification's semantic state: +A **strategy** is a chat-local policy for advancing semantic state. It decides: - what context it reads, -- what questions or candidate artifacts it produces, -- what unit of output it treats as coherent, +- what question / offer / candidate artifact it produces, +- what output unit it treats as coherent, - what authority it has to commit graph truth, - what review or validation must happen before commit, - what evidence it contributes toward semantic maturity / phase advancement. -This suggests a strategy layer above individual interviewer prompts and below durable graph mutation authority. +A strategy is not specification-level semantic truth. In the multi-chat model, one specification can have many chats, each with its own strategy and resumable context. 
-## Chat-local strategy and turn shape +A Brunch `turn` is assistant/system-first: the assistant/system asks, offers, proposes, or reports something; the user response completes the bundle. Observer/runtime assessment reads the whole bundle, because the assistant/system part gives the user's response its meaning. -A strategy is **chat-local process state**, not specification-level semantic truth. In the multi-chat model, a specification workspace can have many chats, each with its own strategy and resumable context. +A chat should have at most one open frontier turn. In normal operation, every active/resumable chat should have one open frontier turn, even if it is a scripted frontier such as the first offer in a side-chat. If a chat has no open turn, focusing it may generate a continuation frontier based on chat strategy, chat kind, latest semantic maturity / `phase`, and staleness. -A Brunch `turn` is assistant- or system-first: the assistant/system offers, proposes, asks, or reports something, and the user response completes the bundle. Observer assessment reads the whole bundle, because the assistant/system part is the context that gives the user's response meaning. +## Strategy taxonomy -A strategy can therefore be established by the first turn in a chat: +### Step-by-step drilldown -```text -assistant/system offer: - "How would you like to proceed?" - - Walk me through it step by step - - Show me strong options quickly - - Ask me targeted design cases - -user response: - "Show me strong options quickly" -``` - -Some globally-triggered flows may create or reuse a chat that is effectively pre-initialized to a strategy. For example, "start reconciliation" or "review this graph" can create a chat whose first assistant/system turn is already the kickoff for that procedure rather than a generic mode-selection offer. - -A chat's strategy should be technically mutable, with changes explainable through later turns, but explicit strategy-switch UX is deferred. 
Tactical sub-strategies are allowed inside a chat: a `scenario_options` chat might use targeted kernel cases to harden a selected candidate, and a `graph_review` chat might ask clarifying questions. - -## Changesets as semantic history spine - -The multi-chat move separates conversational provenance from semantic history. Turns should no longer be the specification's only historical spine. The future **changeset ledger** should record intent-graph evolution: - -```text -changeset: - one atomic semantic mutation set - -change: - one atomic add/update/link/unlink/retire/etc. inside the changeset -``` - -A `changeset` mutates a specification from one semantically / structurally valid and coherent state to another, including any `reconciliation_need` rows that are opened or resolved as part of the mutation. The data changes inside the changeset and the recording of the changeset itself must succeed or fail together. - -The changeset boundary should be the smallest atomic unit that preserves semantic coherence. If applying only half of a mutation would leave the graph incoherent, it belongs in one changeset. - -A graph-review finding, candidate proposal, or reconciliation suggestion is not itself a changeset until accepted or otherwise acted on. It is the assistant/system half of an open frontier turn in its chat. The turn becomes complete when the user responds through one of the afforded actions, and only then may the runtime apply a changeset. - -Proposal / finding artifacts can start as turn-owned structured assistant parts rather than standalone rows. A standalone proposal or proposed-changeset model should wait until batch review, expiry, assignment, cross-chat surfacing, or independent proposal lifecycle demands it. - -When a turn is created, it should stamp the latest applied changeset id for the specification — for example `turn.opened_at_changeset_id` or `turn.base_changeset_id`. 
This is not provenance; it is the semantic graph revision the assistant/system offer was based on. If a turn remains open while `specification.latest_changeset_id` advances, the open offer is considered stale in the first cut. The product can simply offer to regenerate or refresh the proposal rather than attempting sophisticated neighborhood-level staleness analysis. - -A chat should have at most one open frontier turn at a time. Otherwise the runtime cannot know which assistant/system offer the user's response completes. In normal operation, every active/resumable chat should have an open frontier turn, even if it is a scripted frontier such as the first offer in a new side-chat. If a chat somehow has no open turn, the UI can offer "continue this chat" or generate a new frontier when the chat is focused. The generated frontier may depend on the specification's current semantic maturity / `phase` value and the chat's strategy. A specification may have many open frontier turns across different chats. - -Proposal turns should share a small normalized completion-action vocabulary, with strategy-specific user-facing labels mapped onto common semantics: - -- `accept` — authorize the proposed action / bundle / fix; may apply a changeset. -- `reject` — decline the proposal without semantic mutation. This should be narrow: rejecting or arbitrarily editing part of a coherent proposal can itself create incoherence. In reconciliation contexts, rejection may leave the original `reconciliation_need` open or create a new one rather than resolving the issue. -- `revise` — request changes to the proposal; completes the current turn and usually opens a successor proposal turn. User-facing labels such as "Request changes" map here. -- `ask_followup` — request explanation or clarification before deciding. -- `defer` — intentionally leave the matter unresolved or parked. -- `regenerate` — ask the system to recreate the offer, especially when stale or low-quality. 
- -Only `accept` should apply semantic changesets. Other actions may create process metadata or successor turns, but should not mutate intent graph truth directly. If a no-edit outcome still resolves process debt, model it as accepting a proposal whose changeset resolves the relevant need. `revise` is proposal-level transformation: it asks the system to produce a new coherent proposal, not to partially mutate canonical graph truth. - -Direct editing is a sibling mutation path, not the same as proposal revision. In explicit edit mode, the user may make direct pending changes to one or more intent items in memory. When the user exits / applies edit mode, Brunch computes affected edges and opens required `reconciliation_need` rows; the direct item changes and reconciliation needs commit together in one changeset. Direct editing is safe because incoherence risk is materialized as process debt, not because arbitrary edits are prevented. - -Review-set direct edits have a special consequence: if the user directly edits proposed review-set items, accepting the review set as-is is no longer valid. The UI should disable `accept`; `request changes` becomes a reconciliation-oriented action such as `request reconciliation`. The edited candidate/review set must be reconciled before it can become canonical truth. - -Implementation can later choose whether these are distinct response shapes or a `kind` inside a discriminated response union. - -Changeset provenance may point to different initiators: - -- a turn in a chat, -- a user direct edit, -- a graph-review acceptance, -- a reconciliation pass, -- a verifier result, -- an import or migration, -- a future procedure run if the runtime needs a durable operation record distinct from any one turn. - -This makes a `procedure_run` concept useful but not automatically schema-worthy. Some procedures may be represented by one or more turns plus resulting changesets. 
A first-class `procedure_run` table becomes necessary only when operation lifecycle, retry/cancel, multi-turn grouping, or non-chat provenance cannot be represented cleanly by turns and changesets. - -## Strategy taxonomy discovered so far - -### 1. Design-decision-tree drilldown +Current default. The interviewer asks phase-shaped questions at increasing detail until shared understanding is sufficient. -The current default interviewer strategy. +- **Strength:** high provenance; each claim is supported by user answers. +- **Weakness:** long and user-burdensome. +- **Commit shape:** incremental canonical changesets after ordinary turn observation / review. -It asks phase-shaped questions that walk down the user's design-decision tree at increasing levels of detail until enough shared understanding exists to project requirements and criteria. +### Scenario options -**Strengths** +Low-friction strategy for impatient, under-informed, or underspecified users. Brunch asks enough to identify the product/use-case typology, then generates 2–3 coherent candidate graph bundles with named tradeoff profiles. -- High provenance: graph claims are supported by user answers. -- Incremental: each turn can be observed, classified, and committed. -- Good for users who have context and patience. +- **Strength:** users react to concrete options rather than authoring the whole design. +- **Weakness:** one-shot generation can produce plausible but generic, contradictory, or unsupported graph structure. +- **Commit shape:** candidate graph bundles, accepted cleanly or accepted with explicit open issues. -**Weaknesses** +### Targeted cases -- Slow and question-heavy. -- Asks the user to do much of the design judgment work. -- Can feel like the app is demanding effort before providing leverage. +Kernel-driven contrastive elicitation from [`BEHAVIORAL_KERNELS.md`](./BEHAVIORAL_KERNELS.md). 
The interviewer detects active behavioral kernels and asks concrete divergent cases whose classifications emit typed artifacts directly. -**Likely authority shape** +- **Strength:** lower-friction than drilldown, more grounded than whole-spec generation. +- **Weakness:** needs kernel cards, artifact schemas, validators, ordering, and cross-kernel deduplication. +- **Commit shape:** validated kernel artifacts such as decisions, invariants, examples/counterexamples, criteria, and typed edges. -Incremental canonical commits are acceptable when each answer is processed through existing observer / review semantics. +### Graph review -### 2. Scenarios with tradeoffs +Quality-oriented critique that can run over any graph, whether drilldown-created, scenario-generated, imported, or edited. -A proposed low-friction strategy for users who are impatient, underspecified, or unsure how to judge design choices. +- **Question:** where is this graph weak, thin, overconfident, unsupported, ambiguous, generic, uncheckable, or missing structure? +- **Commit shape:** findings start as turn-owned structured artifacts; accepted repairs may later apply changesets. -Instead of asking for every detail, Brunch asks enough to identify the user's product / use-case typology, generates two or more complete scenario-shaped candidate specs, summarizes the tradeoffs of each, and lets the user choose or revise a coherent scenario. - -**Strengths** +### Graph reconciliation -- Gives users something concrete to react to quickly. -- Shifts burden from open-ended design authorship to recognition and comparison. -- Can surface tradeoffs, excluded alternatives, and likely implications earlier. +Repair-oriented process over known disturbance or process debt such as open `reconciliation_need` rows. -**Weaknesses / risk** +- **Question:** given this specific change/conflict, what existing graph truth must be repaired, confirmed, dismissed, or escalated? 
+- **Commit shape:** changesets that edit items/edges and/or resolve/open reconciliation needs. -- Generating a valid intent graph in one pass is a tall order for an LLM. -- The failure mode is not only bad prose; it is plausible graph structure whose items and edges are generic, internally weak, contradictory, overconfident, or unsupported. -- User item-by-item acceptance can create semantic incoherence because graph items are not independent. +### Topology-driven targeting -**Likely authority shape** +Internal targeting machinery, not a user-facing strategy for now. Once a graph exists, Brunch can rank next questions, reviews, or repairs by topology: high-fanout low-confidence assumptions, decisions without rejected alternatives, criteria without targets, conflicting constraints, etc. -Generated scenarios should enter as **candidate graph bundles**, not loose collections of proposed graph items. The default acceptance unit should be the coherent bundle. User edits should produce a revised coherent candidate, not arbitrary partial mutation of canonical truth. +## Semantic history and proposal turns -Partial acceptance should only be allowed when the accepted subset is semantically closed, or when the system can automatically bring along required supporting items / edges. +Turns are conversational provenance and replay. They should not remain the only historical spine once multiple chats, direct edits, review passes, verifier feedback, and candidate bundles can mutate graph truth. -### 3. Kernel-driven contrastive elicitation +The future semantic spine is the **changeset ledger**: -Inferred from [`BEHAVIORAL_KERNELS.md`](./BEHAVIORAL_KERNELS.md). +```text +changeset: + one atomic semantic mutation set -The interviewer detects latent behavioral / correctness kernels in the user's feature and asks compact contrastive scenario questions. The user classifies a concrete divergent case, and the answer emits typed intent graph artifacts directly. 
+change: + one atomic add/update/link/unlink/retire/etc. inside the changeset +``` -Example: instead of asking "How should permissions work?", ask whether a user who receives folder access should automatically receive access to documents added later. +A changeset mutates a specification from one semantically / structurally valid graph state to another, including any `reconciliation_need` rows opened or resolved by that mutation. The data changes and changeset record must succeed or fail together. The changeset boundary is the smallest atomic unit that preserves semantic coherence: if applying only half the mutation would leave the graph incoherent, it belongs in one changeset. -**Strengths** +A graph-review finding, candidate proposal, or reconciliation suggestion is not itself a changeset until accepted or acted on. It is the assistant/system half of an open frontier turn. The turn completes when the user responds, and only then may the runtime apply a changeset. -- Lower friction than full drilldown. -- More grounded than whole-spec generation. -- Produces high-signal artifacts: decisions, invariants, criteria, positive examples, negative examples, and typed edges. -- Helps users judge concrete cases rather than author abstract requirements. +Proposal turns should share a small normalized completion vocabulary: -**Weaknesses / risk** +- `accept` — authorize the proposal; may apply a changeset. +- `reject` — decline without semantic mutation; narrow because rejection can leave or create process debt. +- `revise` — request a new coherent proposal; maps to labels like "Request changes". +- `ask_followup` — ask for explanation before deciding. +- `defer` — intentionally park the matter. +- `regenerate` — recreate the offer, especially when stale or low-quality. -- Requires kernel-card machinery: detection signals, question templates, artifact schemas, validators, and cross-kernel deduplication. -- Kernel ordering and composition are unresolved. 
-- A graph can become locally strong around activated kernels while remaining globally incomplete. +Only `accept` applies a proposal turn's semantic changeset. Other proposal actions may create process metadata or successor turns, but should not directly mutate intent graph truth. If a no-edit outcome resolves process debt, model it as accepting a proposal whose changeset resolves the relevant need. -**Likely authority shape** +Proposal/finding artifacts should start as turn-owned structured assistant parts. A standalone proposal or proposed-changeset model should wait until batch review, assignment, expiry, cross-chat surfacing, or independent proposal lifecycle demands it. -Kernel answers may be safe to commit incrementally when the emitted artifacts are validated against the kernel contract and relation policy. Kernel-generated artifacts should retain the worked scenario as evidence. +When a turn opens, it should stamp the latest applied changeset id for the specification — for example `turn.opened_at_changeset_id` or `turn.base_changeset_id`. This is not provenance; it is the graph revision the assistant/system offer was based on. First-cut staleness is conservative: if a turn remains open while `specification.latest_changeset_id` advances, the open offer is considered stale and the product offers regeneration / refresh rather than neighborhood-level diffing. -### 4. Topology-driven targeting +## Direct editing -Mentioned in the behavioral-kernels design as complementary to kernel-driven questioning. +Direct editing is a sibling mutation path, not proposal revision. -This may be less a user-facing strategy than a scheduler / targeting policy: once a graph exists, Brunch reads graph topology and epistemic metadata to choose where the next question, critique, or repair should focus. +In explicit edit mode, the user may make pending direct changes to one or more intent items. 
When they exit/apply edit mode, Brunch computes affected incident edges and opens required `reconciliation_need` rows under relation policy; direct item changes and reconciliation needs commit together in one changeset. Direct editing is safe because incoherence risk becomes explicit process debt, not because arbitrary edits are forbidden. -Examples: high-fanout low-confidence assumptions, decisions without rejected alternatives, requirements without verification edges, criteria without targets, or conflicting constraints. +Review-set direct edits have a special consequence. If the user directly edits proposed review-set items, accepting the review set as-is is no longer valid. `accept` should be disabled; `request changes` becomes a reconciliation-oriented action such as `request reconciliation`. The edited candidate/review set must be reconciled before it can become canonical truth. ## Relation directionality -The current `knowledge_edge` relation names mix directionality in ways that become risky once edges drive reconciliation. For example, `depends_on` and `derived_from` naturally read downstream-to-upstream, while `constrains` and `verifies` often read upstream-to-downstream or evidence-to-claim. +The current `knowledge_edge` relation names mix directionality. `depends_on` and `derived_from` naturally read downstream-to-upstream; `constrains` and `verifies` often read upstream-to-downstream or evidence-to-claim. That becomes risky once edges drive reconciliation. -Because FE-700 is already expected to expand the intent-graph ontology, breaking existing relation names and records remains acceptable. However, trying to force every useful edge verb into one dependency direction may distort the ontology around one operation. The graph must serve display, prompt context, export trace, requirements projection, reconciliation, critique, verification, candidate generation, and explanation. 
+FE-700 may break existing relation names/records while expanding the ontology, but forcing every useful edge verb into one dependency direction risks distorting the graph around one operation. The graph must serve display, prompt context, export trace, requirements projection, reconciliation, critique, verification, candidate generation, and explanation. -The safer rule is: +Rule: > Edge verbs should be semantically clear; operational direction belongs in relation policy. @@ -196,158 +128,117 @@ Every relation kind should declare: - canonical sentence, e.g. `{source} verifies {target}`, - inverse display sentence, -- whether it participates in visible graph display, export trace, staleness, reconciliation, criteria help, or weak suggestion flows, -- what happens when the source changes, -- what happens when the target changes. +- graph-display / export / staleness / reconciliation / criteria-help / weak-suggestion participation, +- what happens when source changes, +- what happens when target changes. -Code should not infer reconciliation behavior from raw edge direction. Direct edit and hard-impact cascade should enumerate incident accepted edges and ask relation policy which opposite endpoint, if any, receives a `reconciliation_need`. +Code should not infer reconciliation behavior from raw edge direction. Direct edit and hard-impact cascade should enumerate incident accepted edges and ask relation policy which endpoint, if any, receives a `reconciliation_need`. -The contrastive-kernel strategy may also drive a further expanded ontology. Kernel questions naturally surface artifacts such as `alternative`, `question`, `ambiguity`, `candidate`, and rejected option records. Example: a containment/topology question about deleting a parent has multiple alternatives; the user's answer chooses one, rejects others, and emits an invariant plus positive/negative examples. 
FE-700 should leave room for these artifacts, even if the first implementation represents some of them as examples, decisions, or proposal-local structures rather than durable top-level item kinds. +Contrastive kernels may pressure a further ontology expansion. Kernel questions naturally surface artifacts such as `alternative`, `question`, `ambiguity`, `candidate`, and rejected options. FE-700 should leave room for these artifacts, but the first implementation can represent them as examples, decisions, constraints, or proposal-local structures until durable top-level kinds prove necessary. -## Graph operations surfaced by the discussion - -### Graph reconciliation - -Repair-oriented. - -Starts from a known disturbance or process obligation, such as an open `reconciliation_need` caused by an edit, semantic conflict, verifier result, or changed upstream item. +## Candidate graph bundles -The reconciler's question is: +`scenario_options` produces speculative but coherent candidate worlds, not loose item lists. A candidate bundle should contain: -> Given this specific change or conflict, what existing graph truth needs to be repaired, confirmed, dismissed, or escalated? +- short name and scenario summary, +- intended maturity stage, +- tradeoff profile, +- generated items and edges, +- required core items, +- optional/swappable items, +- known risks, +- graph-review findings, +- provenance / epistemic labels, +- commit preconditions. -Likely outputs: +User review should be bundle-level by default: `Use this`, `Revise`, `Regenerate`, or ask follow-up. Arbitrary item-level pick-and-choose risks incoherence. Partial acceptance is only safe when the accepted subset is semantically closed or the system brings along required supporting items/edges. 
-- auto-confirm target still holds, -- auto-edit a target through the standard mutation path, -- mark need irrelevant / resolved, -- escalate to HITL because a semantic conflict requires judgment, -- open or regenerate downstream reconciliation needs. +Candidate readiness should distinguish clean acceptance from acceptance with represented problems: -This should stay tied to known coherence obligations. It should not become the umbrella term for all graph intelligence. +- `draft` — generated but not checked, +- `reviewing` — background review running, +- `reviewed_clean` — acceptable normally, +- `reviewed_with_issues` — acceptable only if open issues become durable, +- `blocked` — cannot be accepted without repair/regeneration. -### Graph review / critique +`reviewed_with_issues` can still be accepted if Brunch durably represents the problems, for example by opening a follow-on graph-review frontier turn or by creating appropriate problem records / `reconciliation_need` rows in the accepting changeset. Imperfect graph states are allowed if their problems are explicit and durable, not hidden. -Quality-oriented. +Broader graph-review issues should start as turn-owned structured artifacts. `reconciliation_need` remains the only first-class problem table for now, scoped to coherence / staleness process debt caused by relation impacts. A generalized `graph_issue` / `problem` table is a future option if review findings need cross-chat querying, filtering, assignment, badges, or lifecycle independent of turns. -Can run on any intent graph, whether produced by drilldown, scenario generation, import, direct editing, or kernel elicitation. +## Product sequencing -The reviewer asks: +The most desired product surfaces are: -> If this graph is supposed to represent a good spec at its current maturity stage, where is it weak, thin, overconfident, under-supported, ambiguous, generic, uncheckable, or missing important structure? +1. 
first-turn strategy choice for a new chat/spec start, +2. a mid-interview "speed this up" / "show me strong options" affordance. -Likely review dimensions: +Engineering still needs part of `graph_review` to make scenario generation credible. `scenario_options` can be the first product-facing strategy while graph review remains an internal oracle used to critique, repair, and score generated bundles before they are committed. -- internal coherence, -- coverage, -- decision usefulness, -- tradeoff honesty, -- checkability, -- granularity, -- scenario fidelity, -- epistemic labeling, -- provenance strength, -- downstream usefulness. +For mid-interview acceleration, branch into a new or reused side-chat / strategy chat rather than switching the primary interview chat in place. The side-chat branch receives a context pack — not a raw transcript dump — containing spec identity, maturity/phase, summarized goal/context, accepted graph truth, important edge neighborhoods, current frontier question if relevant, unresolved assumptions, and recent turns only when they explain user style or intent. -A graph can have no reconciliation needs and still fail graph review. +The first `speed this up` mode should **complete the current direction**: treat accepted graph truth as fixed premises and fill in plausible missing structure. A more radical "show alternatives that challenge prior assumptions" mode is feasible but deferred. -## Candidate graph bundles +Scenario generation should present 2–3 options with named tradeoff profiles. Candidate quality gates should be latency-tiered: -`scenarios-with-tradeoffs` introduces a unit that is not yet represented by today's canonical graph model: a speculative but coherent candidate world. 
+- fast synchronous gates before display: parse validity, schema validity, coarse fixed-premise check, no obvious contradiction, and tradeoff summary present; +- async gates after display: deeper graph review, coverage, checkability gaps, provenance warnings, repair/refinement. -A candidate bundle should probably contain: +The existing observer-style async capture mechanism could generalize into an async semantic worker queue for capture / review / refine / repair. Users can read initial candidates while background review improves readiness. If a candidate is accepted with open issues, Brunch should open or reuse a graph-review chat with a frontier turn summarizing remaining issues and asking what to address first. -- scenario summary, -- intended maturity stage, -- tradeoff profile, -- generated items, -- generated edges, -- required core items, -- optional / swappable items, -- known risks, -- critic findings, -- provenance / epistemic labels, -- commit preconditions. +## Concern map and dependencies -The branch's probe harness can help compare candidate bundles against drilldown-produced fixture specs before any product UI commits to this flow. +### Semantic substrate — highest coordination -## Phase / maturity implication +Owns ontology expansion, relation policy directionality, changeset/change ledger, `turn.opened_at_changeset_id`, `specification.latest_changeset_id`, chat-local strategy metadata, and one-open-frontier-per-chat invariants. -`spec.phase` should be understood as a semantic maturity signal, not merely a positional route label. The frontend currently presents separate phase routes, but phase is better interpreted as a cumulative rating of the spec's evolution stage. +Likely areas: `src/server/schema.ts`, `src/server/db.ts`, `src/server/knowledge-relationship-policy.ts`, future changeset modules, [`INTENT_GRAPH_SEMANTICS.md`](./INTENT_GRAPH_SEMANTICS.md), [`PATCH_LEDGER.md`](./PATCH_LEDGER.md). 
-A strategy should therefore not merely "move the user to a phase." It should contribute evidence that the graph has reached a maturity bar. +Sequential dependencies: relation policy before robust reconciliation/direct-edit cascade; changesets before productized candidate acceptance; turn staleness depends on latest changeset tracking. -Potential maturity signals: +### Strategy / proposal artifacts — parallelizable -- no blocking reconciliation needs, -- graph-review findings above an acceptable threshold, -- coverage across required item and edge families, -- enough checkable criteria, -- enough user-confirmed or strongly-supported provenance for high-impact claims, -- no blocking unresolved critique findings. +Owns candidate bundle shapes, graph-review finding shapes, proposal turn artifacts, candidate statuses, and normalized proposal responses. -This creates useful distinctions: +Likely areas: `src/server/parts.ts`, `src/server/turn-artifacts.ts`, a possible `strategy-artifacts` module, context packs, prompt scenarios. -- **coherent**: no known contradictions / open process debt, -- **complete enough**: covers the necessary semantic territory, -- **good enough**: specific, tradeoff-aware, checkable, and useful, -- **phase-mature**: meets the bar for the next projection / export stage. +Can start before durable changesets if artifacts remain turn-owned and do not commit canonical truth. -## Product sequencing +### Graph-review oracle — supports scenario options -The most desired product surfaces are likely: +Owns review rubric, graph critique prompt, candidate quality gates, accept-with-issues semantics, and follow-on review turns. -1. first-turn strategy choice for a new chat / spec start, -2. a mid-interview "speed this up" / "show me strong options" affordance. +Likely areas: new graph-review prompt/context pack, `src/server/scenario-runner.ts`, `scripts/agent-probes/`. -Engineering still needs part of `graph_review` to make scenario generation credible. 
`scenario_options` can be the first product-facing strategy while `graph_review` remains an internal oracle used to critique, repair, and score generated candidate bundles before they are shown or committed. +Can run probe-only before product UI; needs enough FE-700 ontology/relation policy to be meaningful. -For mid-interview acceleration, the preferred shape is to branch into a new or reused side-chat / strategy chat rather than switching the primary interview chat in place. The side-chat branch can use the current graph and transcript as context, generate reviewed scenario options, and preserve the main interview frontier if the generated path disappoints or needs to be resumed later. This is likely more flexible than the current main-chat UX, which still assumes the original guided interview shape. +### Scenario-options strategy — first product-facing acceleration -A scenario-options side-chat should receive a context pack rather than an unstructured full transcript dump. Minimum context includes the current spec name / mode, semantic maturity / phase, summarized user goal and context, accepted intent graph items, important edge neighborhoods, the current open frontier question if relevant, unresolved assumptions or low-confidence areas, and recent turns only when they explain user style or intent. +Owns 2–3 candidate bundles, tradeoff summaries, fast validation, async review/refine/repair handoff, and clean/with-issues acceptance. -For the mid-interview "speed this up" use case, generated scenarios should default to **complete the current direction**: treat accepted graph truth as fixed premises and fill in plausible missing structure. A more radical "show alternatives that challenge prior assumptions" mode is feasible but deferred. +Likely areas: `src/server/prompts/candidate-spec-system.md`, `src/server/context-pack/candidate-spec.ts`, scenario runner/probe harness, later side-chat UI. 
-Scenario generation should present **2–3 options** with named tradeoff profiles rather than many variants. Each visible option should have a short name, scenario summary, key assumptions, what it optimizes for, what it gives up, confidence / review warnings, and `Use this` / `Revise` style actions. Internally, each option maps to a candidate graph bundle. +Depends on graph-review minimum oracle and, for canonical acceptance, changeset ledger. -Candidate quality gates should be tiered by latency budget. Synchronous gates before display should be fast: parse validity, schema validity, coarse fixed-premise check, no obvious contradiction, and a present tradeoff summary. Deeper graph review — coverage, checkability gaps, provenance warnings, and repair/refinement — can run asynchronously after the user has something to read. +### Async semantic workers — staged infrastructure -The existing observer-style async capture mechanism could generalize into an async semantic worker queue for capture / review / refine / repair. The product can show initial candidates while background graph-review proofing and optional repair improve their readiness. +Own capture / review / refine / repair background work. Can begin as observer-style in-process tasks before durable queue tables exist. -Candidate readiness should distinguish clean acceptance from acceptance with represented problems. Useful statuses include `draft`, `reviewing`, `reviewed_clean`, `reviewed_with_issues`, and `blocked`. `reviewed_clean` can be accepted normally. `reviewed_with_issues` may be accepted if Brunch can durably represent the open problems, for example by opening an immediate follow-on graph-review turn or by creating appropriate problem records / `reconciliation_need` rows in the accepting changeset. `blocked` candidates cannot be accepted without repair or regeneration. 
+### Reconciliation / direct edit — adjacent but distinct -This preserves the reconciliation philosophy: imperfect graph states are allowed if their problems are explicit and durable, not hidden. When a candidate is accepted with open issues, Brunch should open or reuse a graph-review chat with a frontier turn that summarizes the remaining issues and asks what to address first. This keeps scenario comparison separate from problem repair, avoids polluting the primary interview, and reuses a long-lived review workbench. +Owns edit mode, affected-edge enumeration, relation-policy-driven `reconciliation_need` creation, reconciliation chat behavior, and review-set request-reconciliation behavior. -Broader graph-review issues should start as turn-owned structured artifacts rather than a new table. `reconciliation_need` remains the only first-class problem table for now, scoped to coherence / staleness process debt caused by relation impacts. A generalized `graph_issue` or `problem` table is a future option if review findings need cross-chat querying, filtering, assignment, badges, or lifecycle independent of a turn. +Likely areas: `src/server/edit-impact.ts`, `src/server/edit-route.ts`, `src/server/reconciliation-need.test.ts`, side-chat/patch-list UI. -## Why FE-705 matters to this direction +Depends on relation-policy directionality; eventually depends on changesets for atomic direct-edit history. -The `brunch agent` JSONL seam is a strategy test harness, not just a CLI. 
+## FE-705 implication -It lets external probes: +The `brunch agent` JSONL seam is a strategy test harness: -- drive the current drilldown path headlessly, -- produce realistic completed-spec fixture candidates, +- drive current drilldown headlessly, +- produce completed-spec fixture candidates, - preserve workspace state for curation, -- compare alternative generation / review strategies against known-good or semi-golden graphs, +- compare strategy outputs against known-good or semi-golden graphs, - exercise Brunch-owned mutation authority rather than direct DB shortcuts. -This gives Brunch a way to evaluate strategy outputs before exposing them as product modes. - -## Open questions for grilling - -1. **Strategy selection** — Who chooses the strategy: user, system, or both? Can Brunch switch strategies midstream? -2. **User-facing mode names** — What does the user see: step-by-step interview, scenario options, targeted design cases, review for gaps? Or something else? -3. **Commit authority** — Which strategy outputs become canonical truth immediately, which become proposals, and which become candidate bundles? -4. **Candidate bundle boundary** — What makes a generated scenario bundle coherent enough to present? What makes it coherent enough to commit? -5. **Partial acceptance** — Do we ever allow item-level acceptance from a candidate graph? If so, how do we prove or maintain semantic closure? -6. **Graph review authority** — Does graph review only produce findings, or can it propose candidate changesets / revised bundles? -7. **Graph review bar** — What qualities matter beyond structural validity and absence of conflict? -8. **Reconciliation vs review boundary** — What exactly belongs to reconciliation, and what must stay in critique/review? -9. **Maturity model** — What evidence should count toward phase advancement for drilldown, kernel-driven, and scenario-generated specs? -10. 
**Kernel implementation boundary** — Are kernel cards configuration, prompts, code modules, or all three? What is the smallest useful kernel-card contract? -11. **Kernel ordering and composition** — When multiple kernels are active, who decides ordering and how are overlapping emitted artifacts deduplicated? -12. **Fixture evaluation** — What rubric determines whether a drilldown-produced spec is good enough to become a golden fixture? -13. **Strategy comparison** — What metrics or review process compare drilldown, kernel, and scenario outputs fairly? -14. **Changeset dependency** — Does `scenarios-with-tradeoffs` require a durable changeset / candidate graph model before productization, or can probes run with artifact-only bundles first? -15. **UI sequencing** — Should the first product surface be strategy choice at spec creation, a mid-interview assist, a graph-review button, or something else? +This lets Brunch evaluate strategy outputs before exposing them as product modes. From 125c84fd5153bb9954dd00cd171aae166adec812 Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Wed, 13 May 2026 13:50:11 +0200 Subject: [PATCH 27/42] consolidation pass on design docs --- .../design/INTENT_SPEC_EVOLUTION.md | 12 ++-- docs/design/BEHAVIORAL_KERNELS.md | 6 +- .../CONVERSATIONAL_WORKSPACE_RUNTIME.md | 44 ++++++++++----- docs/design/DEFERRED_RECONCILIATIONS.md | 22 ++++---- docs/design/DEV_WORKFLOW_EVOLUTION.md | 10 ++-- docs/design/INTENT_GRAPH_SEMANTICS.md | 4 +- docs/design/MULTI_CHAT.md | 6 +- docs/design/PATCH_LEDGER.md | 6 +- docs/design/README.md | 56 +++++++++++++++---- docs/design/SIDE_CHAT.md | 10 ++-- memory/PLAN.md | 6 +- 11 files changed, 117 insertions(+), 65 deletions(-) rename docs/{ => archive}/design/INTENT_SPEC_EVOLUTION.md (97%) diff --git a/docs/design/INTENT_SPEC_EVOLUTION.md b/docs/archive/design/INTENT_SPEC_EVOLUTION.md similarity index 97% rename from docs/design/INTENT_SPEC_EVOLUTION.md rename to docs/archive/design/INTENT_SPEC_EVOLUTION.md index 
9d62718d..68795ec9 100644 --- a/docs/design/INTENT_SPEC_EVOLUTION.md +++ b/docs/archive/design/INTENT_SPEC_EVOLUTION.md @@ -1,6 +1,8 @@ # Brunch Evolution Notes | @Yesterday -> Status: raw synthesis / ideation. +> Status: **source archive / raw synthesis**. +> Archived from `docs/design/` during FE-705 reconciliation cleanup on 2026-05-13. Active conclusions now live in `memory/SPEC.md`, `memory/PLAN.md`, and focused design docs under `docs/design/`. +> > Canonical conclusions must be promoted into `memory/SPEC.md` through `ln-spec` and into `memory/PLAN.md` through `ln-plan` before they are treated as accepted product direction or roadmap work. > > Synthesis started 2026-05-04 from external agent conversations about intent formalization, formal verification, and Brunch's elicitation methodology. @@ -632,7 +634,7 @@ This aligns with Brunch's existing direction: chat view and graph view should be ### Turn Spine vs Patch Ledger -A missing branch of the current capture concerns early-user feedback about how knowledge items are created and updated. The detailed proposal now lives in [Patch Ledger and Reconciliation](./PATCH_LEDGER.md); this section keeps only the architectural implication for intent-spec evolution. +A missing branch of the current capture concerns early-user feedback about how knowledge items are created and updated. The detailed proposal now lives in [Patch Ledger and Reconciliation](../../design/PATCH_LEDGER.md); this section keeps only the architectural implication for intent-spec evolution. One original Brunch assumption was that a single primary conversation would sit at the center of the product. The current architecture reflects that: durable conversational turns are the branch-bearing lineage spine, and knowledge items are extracted from answered turns or accepted review outputs. 
@@ -675,7 +677,7 @@ reconciliation_need: semantic debt created when graph changes may affect existing truth ``` -This is not a hybrid in the sense of two competing historical authorities. It is a separation of concerns: turns remain conversation history; patches become semantic history; workflow remains explicit process state; reconciliation becomes an agent-managed review flow for stale or contradictory graph truth. See [Multi-Chat Substrate](./MULTI_CHAT.md) for the concrete first substrate slice, and [Patch Ledger and Reconciliation](./PATCH_LEDGER.md) for later semantic mutation history, reconciliation ordering, and open schema questions. +This is not a hybrid in the sense of two competing historical authorities. It is a separation of concerns: turns remain conversation history; patches become semantic history; workflow remains explicit process state; reconciliation becomes an agent-managed review flow for stale or contradictory graph truth. See [Multi-Chat Substrate](../../design/MULTI_CHAT.md) for the concrete first substrate slice, and [Patch Ledger and Reconciliation](../../design/PATCH_LEDGER.md) for later semantic mutation history, reconciliation ordering, and open schema questions. 
The alternate branch makes an important persistence distinction: @@ -784,7 +786,7 @@ Near-term product implications: - detect behavioral kernels and ask pattern-specific questions - add `invariant` and `example` as likely product-ontology candidates - treat `knowledge_edge` as intent semantics, not only graph display -- treat open-ended graph editing as needing chat containers and reconciliation needs first, then semantic history separate from turn history; see [Multi-Chat Substrate](./MULTI_CHAT.md) and [Patch Ledger and Reconciliation](./PATCH_LEDGER.md) +- treat open-ended graph editing as needing chat containers and reconciliation needs first, then semantic history separate from turn history; see [Multi-Chat Substrate](../../design/MULTI_CHAT.md) and [Patch Ledger and Reconciliation](../../design/PATCH_LEDGER.md) - preserve approved / rejected examples as durable evidence - distinguish human-readable claims from checkable artifacts - eventually tie requirements and criteria through shared property-like claims @@ -808,7 +810,7 @@ Near-term development-methodology implications: - Should `invariant` and `example` become durable top-level product kinds? - What relation kinds need to participate in cascade and staleness, and which should remain display-only? - How should weak inferred edges be reviewed without flooding users or agents? -- Which patch-ledger schema choices in [Patch Ledger and Reconciliation](./PATCH_LEDGER.md) should be promoted after the [Multi-Chat Substrate](./MULTI_CHAT.md) slice lands? +- Which patch-ledger schema choices in [Patch Ledger and Reconciliation](../../design/PATCH_LEDGER.md) should be promoted after the [Multi-Chat Substrate](../../design/MULTI_CHAT.md) slice lands? - Which behavioral kernels are common enough to deserve first-class elicitation support? - Are the fifteen kernel families distinct enough in practice, or should some merge after transcript testing? 
- What should a first kernel-card implementation include: detection signals, question templates, artifact schema, validators, or all of these? diff --git a/docs/design/BEHAVIORAL_KERNELS.md b/docs/design/BEHAVIORAL_KERNELS.md index 3e033fe7..d13b21ca 100644 --- a/docs/design/BEHAVIORAL_KERNELS.md +++ b/docs/design/BEHAVIORAL_KERNELS.md @@ -6,7 +6,7 @@ > > This document is the canonical reference for the FE-702 frontier item ("Generative prompt probes before UI") in `memory/PLAN.md` insofar as that item names behavioral kernels as one probe target. It expands the `Recommended shape:` of that item with the full kernel taxonomy that is too long to live inside the plan. > -> Source synthesis: [`INTENT_SPEC_EVOLUTION.md`](./INTENT_SPEC_EVOLUTION.md) §7–8. Where this document overlaps, it supersedes the synthesis as the structured reference. +> Source synthesis: [`INTENT_SPEC_EVOLUTION.md`](../archive/design/INTENT_SPEC_EVOLUTION.md) §7–8. Where this document overlaps, it supersedes the synthesis as the structured reference. > > Companion: [`INTENT_GRAPH_SEMANTICS.md`](./INTENT_GRAPH_SEMANTICS.md). Kernels suggest *what kind* of question to ask; the intent graph defines *what their answers become*. Kernels emit the typed claims and edges that the intent graph stores. > @@ -385,7 +385,7 @@ The three are complementary, not competing. Template-driven keeps the conversati These three cover most of what a first interviewer prototype would need to demonstrate the kernel approach. The remaining twelve can be added incrementally as scenarios warrant. -For each probe, the scenario substrate ([`INTENT_SPEC_EVOLUTION.md`](./INTENT_SPEC_EVOLUTION.md) §Persistence; `memory/SPEC.md` Requirements 40, 41) should capture: rendered prompt, kernel context pack, model/provider settings, raw output, structured parse status, and qualitative review notes — the same artifact shape FE-698 already captures. 
+For each probe, the scenario substrate ([`INTENT_SPEC_EVOLUTION.md`](../archive/design/INTENT_SPEC_EVOLUTION.md) §Persistence; `memory/SPEC.md` Requirements 40, 41) should capture: rendered prompt, kernel context pack, model/provider settings, raw output, structured parse status, and qualitative review notes — the same artifact shape FE-698 already captures. ## Open questions @@ -399,7 +399,7 @@ For each probe, the scenario substrate ([`INTENT_SPEC_EVOLUTION.md`](./INTENT_SP ## References -- [`INTENT_SPEC_EVOLUTION.md`](./INTENT_SPEC_EVOLUTION.md) §7 (Behavioral pattern elicitation) and §8 (Kernel typology) — source synthesis. +- [`INTENT_SPEC_EVOLUTION.md`](../archive/design/INTENT_SPEC_EVOLUTION.md) §7 (Behavioral pattern elicitation) and §8 (Kernel typology) — source synthesis. - [`INTENT_GRAPH_SEMANTICS.md`](./INTENT_GRAPH_SEMANTICS.md) — the typed graph that kernel artifacts populate. - `memory/SPEC.md` Requirement 40 (prompt/context engineering names "behavioral kernels" as a context-pack consumer); Lexicon entries for `behavioral kernel`, `progressive checkability`, `context pack`. - `memory/PLAN.md` item 4 (FE-702) — the active probe item this document expands. diff --git a/docs/design/CONVERSATIONAL_WORKSPACE_RUNTIME.md b/docs/design/CONVERSATIONAL_WORKSPACE_RUNTIME.md index bc148f2e..a70aab2c 100644 --- a/docs/design/CONVERSATIONAL_WORKSPACE_RUNTIME.md +++ b/docs/design/CONVERSATIONAL_WORKSPACE_RUNTIME.md @@ -1,10 +1,10 @@ # Conversational Workspace Runtime — Umbrella Design -> Status: **proposed** — horizon synthesis. Output of brainstorm 2026-05-12, anchored on the two sync calls of 2026-05-11 (UX review of V3.1 side-chat) and 2026-05-12 (architecture review, post-V3.1 direction). +> Status: **active synthesis** — consolidated runtime-cluster concept. 
Output of brainstorm 2026-05-12, anchored on the two sync calls of 2026-05-11 (UX review of V3.1 side-chat) and 2026-05-12 (architecture review, post-V3.1 direction); audited during FE-705 reconciliation cleanup on 2026-05-13. > -> Scope: the next major architectural arc after FE-674's V3.1 closes. Synthesizes [MULTI_CHAT.md](./MULTI_CHAT.md), [SIDE_CHAT.md](./SIDE_CHAT.md), [PATCH_LEDGER.md](./PATCH_LEDGER.md), and [CONTINUOUS_WORKSPACE_HYBRID.md](./CONTINUOUS_WORKSPACE_HYBRID.md) into a single anchor for the next umbrella Linear issue. +> Scope: the next major architectural arc after FE-674's V3.1 closes. Synthesizes [MULTI_CHAT.md](./MULTI_CHAT.md), [SIDE_CHAT.md](./SIDE_CHAT.md), [PATCH_LEDGER.md](./PATCH_LEDGER.md), and [CONTINUOUS_WORKSPACE_HYBRID.md](./CONTINUOUS_WORKSPACE_HYBRID.md) into a single concept for the conversational workspace runtime. > -> Sibling docs remain authoritative on their subsystems. This doc resolves the cross-subsystem tensions and captures the deltas from the sync calls. Build slices fall out of this map via `/ln-plan`; this doc deliberately does **not** sequence implementation. +> Authority: this doc owns the cross-subsystem synthesis. The sibling docs remain subsystem/source references for shipped substrate details, user-surface history, algorithms, and open questions. Build slices fall out of this map via `/ln-plan`; this doc deliberately does **not** sequence implementation. ## 1. Purpose and positioning @@ -19,16 +19,16 @@ This is the **umbrella design** for what follows FE-674. It does three things: - Not an implementation plan. The sub-tracks in §5 each enter `/ln-plan` separately when picked up. - Not a re-derivation of the sibling docs. Where MULTI_CHAT / SIDE_CHAT / PATCH_LEDGER / CONTINUOUS_WORKSPACE_HYBRID already settle a question, this doc points there. - Not a UX spec. The shipped V3.1 surface, the UX review feedback, and any subsequent design pass own that. -- Not the FE-674 polish backlog (raised in UX review). 
Those flow into the existing branch sequence; see §6. +- Not the FE-674 polish backlog (raised in UX review). Those flow into the existing branch sequence; see §7. ### Relationship to the sibling docs | Sibling doc | Role going forward | |---|---| -| [MULTI_CHAT.md](./MULTI_CHAT.md) | Phase 1 substrate spec; the `chat` table and `reconciliation_need` queue it introduces are shipped. This umbrella inherits both as primitives and extends them. | -| [SIDE_CHAT.md](./SIDE_CHAT.md) | V1 / V2 / V3.0 / V3.1 user-surface phasing. V3.1 just closed. The user-surface trajectory continues here; SIDE_CHAT.md §13 already names the substrate alignment seam. | -| [PATCH_LEDGER.md](./PATCH_LEDGER.md) | Deep design pressure for changeset/change semantics and the reconciliation flow (target ordering, topological sort, agent decisions). The flow described in §Reconciliation Flow there is the canonical algorithm. Vocabulary is **changeset/change** going forward, per PLAN.md. | -| [CONTINUOUS_WORKSPACE_HYBRID.md](./CONTINUOUS_WORKSPACE_HYBRID.md) | The workspace shell that hosts the chat runtime. Already proposes three workspace shapes (route-alias, workspace controller, chart-backed supervisor). This umbrella commits to *that doc owning the shell choice* and treats the shell as a peer sub-track. | +| [MULTI_CHAT.md](./MULTI_CHAT.md) | Shipped Phase 1 substrate reference; the `chat` table and `reconciliation_need` queue it introduced are primitives this synthesis inherits. Its schema/migration details remain useful, but future thread and reconciliation product shape is governed here. | +| [SIDE_CHAT.md](./SIDE_CHAT.md) | User-surface history and phasing for V1 / V2 / V3.0 / V3.1, plus V4 notes. Future persistent side-chat history is folded into the unified chat/thread runtime here. | +| [PATCH_LEDGER.md](./PATCH_LEDGER.md) | Historical design pressure for semantic mutation history and reconciliation ordering. 
Its target-ordering algorithm remains useful; target vocabulary is **changeset/change** going forward, per SPEC/PLAN. | +| [CONTINUOUS_WORKSPACE_HYBRID.md](./CONTINUOUS_WORKSPACE_HYBRID.md) | Workspace-shell shape exploration. It still owns the route-alias / workspace-controller / chart-backed-supervisor choice; this doc treats that shell as the host prerequisite for runtime work. | ## 2. The shift, at a glance @@ -66,7 +66,7 @@ flowchart LR **What changes for the substrate** -- The `chat` table stays the durable primitive; the umbrella adds a substrate seam for threads (one of three shapes — see §3.2 / §7). +- The `chat` table stays the durable primitive; the umbrella adds a substrate seam for threads (one of three shapes — see §3.2 / §8). - `reconciliation_need.caused_by_changeset_id` becomes real once changesets land (§3.4). The `caused_by_*` placeholders already in MULTI_CHAT.md §3.4 are the seam. - The `changeset` / `change` records (PATCH_LEDGER.md Phase 2) become first-class. The transient client-side "patch" list in the V3.1 side-chat surface goes away with the popover. - Context-provision becomes a typed thread-scoped concern with TOON notation, # mention as a substrate-level mutation, and turn-zero seeding (§3.5). @@ -88,7 +88,7 @@ One main chat per spec is visible. Threads, sub-runs, and side conversations are **Primitives** - `chat` — already shipped per MULTI_CHAT.md. One interview chat per spec, addressable via `specification.primary_chat_id`. -- **Thread** — a sub-run inside the interview chat. **Substrate shape is an open question** (§7). Three plausible options: +- **Thread** — a sub-run inside the interview chat. **Substrate shape is an open question** (§8). Three plausible options: - **(p) `parent_chat_id` on `chat`** — a thread is just a child `chat` row. Smallest delta from MULTI_CHAT.md; the chat table absorbs hierarchy. - **(q) New `thread` table** — chats own threads; threads own turns. Spec → chat → thread → turn. 
Most expressive, biggest schema delta. - **(r) Pure UI-rendering** — chats stay sibling-of-spec; UI renders one chat's children inline. Substrate unchanged. @@ -267,7 +267,21 @@ Dependencies - The changeset ledger can run in parallel with the chat runtime once the shell exists; it has its own scope independent of in-stream rendering. - Context provision and reconciliation in-stream both ride on the chat runtime substrate; they parallelize once Track 2 has its first cut. -## 6. Out of scope / explicit deferrals +## 6. Cross-document audit + +This synthesis has to respect parallel design work that happened outside the runtime cluster. + +| Parallel design | Implication for the runtime cluster | +|---|---| +| [INTENT_GRAPH_SEMANTICS.md](./INTENT_GRAPH_SEMANTICS.md) | Reconciliation and direct-edit cascade must consult relation-policy directionality and edge support/status. The runtime cannot infer affected endpoints from raw `knowledge_edge` source/target direction. | +| [SPEC_EVOLUTION_STRATEGIES.md](./SPEC_EVOLUTION_STRATEGIES.md) | Strategy is chat-local process state. Scenario options, graph-review findings, and reconciliation suggestions are proposal turns until accepted; accepted candidate bundles become coherent changesets, not loose item-by-item mutations. | +| [AGENT_MUTATION_SURFACE.md](./AGENT_MUTATION_SURFACE.md) | Agent-originated writes must enter through Brunch-owned capability/handler contracts. The runtime may host agent runs, but those runs do not get direct ORM or route-wrapper mutation authority. | +| [BEHAVIORAL_KERNELS.md](./BEHAVIORAL_KERNELS.md) | Kernel-driven questions produce typed artifacts that the intent graph stores; the runtime provides thread/context affordances but should not invent a separate artifact ontology. | +| [DEV_WORKFLOW_EVOLUTION.md](./DEV_WORKFLOW_EVOLUTION.md) | Dev-layer file-backed registry ideas are separate from product runtime persistence. 
Do not mix product `changeset` tables with the future `memory/` registry experiment. | + +Audit result: the runtime concept stays coherent if it treats `chat`/thread as conversational process, `changeset`/`change` as semantic mutation history, `reconciliation_need` as process debt from a known disturbance, and graph review as a separate quality oracle. That matches the current SPEC/PLAN reconciliation. + +## 7. Out of scope / explicit deferrals - **FE-674 polish** (raised in UX review) — tactical V3.1 surface improvements that flow into the existing FE-674 branch sequence; not absorbed into this umbrella. They make the V3.1 surface more demo-legible but are tactical, not architectural. - **Designer consultation** (UX review) — visual UX directions for the new in-stream surfaces are out of scope until the design discussion lands. This doc commits to architecture, not pixel-level UI patterns. @@ -278,7 +292,7 @@ Dependencies - **Persistent side-chat history (SIDE_CHAT V4)** — superseded by Track 2. The user-visible "history" of side-chats is the main chat stream itself, where threads stay collapsed. - **Two-axis interview framing, progressive detail, candidate-spec completion assist, first-run provider setup, workspace hygiene gitignore assist, productized web research** — all PLAN.md Horizon items unrelated to the umbrella. Unaffected. -## 7. Open questions +## 8. Open questions - **Thread substrate** — (p) `parent_chat_id`, (q) new `thread` table, (r) UI-only rendering. To be decided by a Track 2 sub-RFC. - **Direct-edit thread-opening UX** — when a direct edit on the structured-list view triggers hard-impact cascade, does the system open (a) a fresh side thread anchored to the edited item, (b) append to the active reconciliation thread, or (c) both, contextually? Deferred to Track 3 / Track 4 design. @@ -291,7 +305,7 @@ Dependencies - **Continuous-workspace shape choice** — Design A / B / C in CONTINUOUS_WORKSPACE_HYBRID.md. Settled by Track 1, not this doc. 
- **Migration of existing client `patch` state** — the V3.1 transient staged-patches surface still uses "patch" terminology in code. Track 4 includes renaming the client state to `changeset` / `change` and folding it into durable storage, but the transition needs a stepwise plan. -## 8. Traceability +## 9. Traceability SPEC.md anchors that this umbrella inherits or extends. Identifiers are listed pending the next `/ln-sync` pass. @@ -313,7 +327,7 @@ SPEC.md anchors that this umbrella inherits or extends. Identifiers are listed p - [MULTI_CHAT.md](./MULTI_CHAT.md) §3 substrate, §4 context model, §5 reconciliation primitive - [SIDE_CHAT.md](./SIDE_CHAT.md) §5 edit-patch routing, §13 substrate alignment -- [PATCH_LEDGER.md](./PATCH_LEDGER.md) §Proposed Concepts (Chat, Patch, Patch Change, Reconciliation Need), §Reconciliation Flow, §Target Ordering, §Phase 2 Patch Ledger +- [PATCH_LEDGER.md](./PATCH_LEDGER.md) §Proposed Concepts (historical patch/patch_change vocabulary), §Reconciliation Flow, §Target Ordering, §Phase 2 Patch Ledger - [CONTINUOUS_WORKSPACE_HYBRID.md](./CONTINUOUS_WORKSPACE_HYBRID.md) §Design A/B/C, §Recommended direction - [memory/PLAN.md](../../memory/PLAN.md) §Active (continuous workspace), §Horizon (semantic changeset ledger, architect loop) -- [memory/CARDS.md](../../memory/CARDS.md) — FE-674 V3.1 closing cards; provides the V3.1 surface that this umbrella will absorb +- [memory/PLAN.md](../../memory/PLAN.md) Recently Completed — FE-674 V3.1 closing note; provides the shipped V3.1 surface that this umbrella will absorb diff --git a/docs/design/DEFERRED_RECONCILIATIONS.md b/docs/design/DEFERRED_RECONCILIATIONS.md index 78d06f17..8b80c5e5 100644 --- a/docs/design/DEFERRED_RECONCILIATIONS.md +++ b/docs/design/DEFERRED_RECONCILIATIONS.md @@ -1,18 +1,18 @@ # Deferred Reconciliations — Pending Promotions to SPEC / PLAN -> Status: **interim backlog**. +> Status: **interim backlog, audited 2026-05-13**. > Date: 2026-05-07. 
-> Scope: shaped product-direction items derived from the intent-spec synthesis ([`INTENT_SPEC_EVOLUTION.md`](./INTENT_SPEC_EVOLUTION.md)) that are *ready* for promotion but deliberately *deferred* until prerequisite work lands. +> Scope: shaped product-direction items derived from the archived intent-spec synthesis ([`INTENT_SPEC_EVOLUTION.md`](../archive/design/INTENT_SPEC_EVOLUTION.md)) that are *worthy but gated*: either partially captured in `memory/SPEC.md` / `memory/PLAN.md`, or deliberately deferred until prerequisite work lands. > > Each entry below has a clear destination (a `memory/SPEC.md` requirement / assumption / decision, a `memory/PLAN.md` item, or a new design doc) and a clear **trigger condition**. When the trigger fires, promote the entry through the appropriate `ln-*` skill and remove the entry from this file. When the file is empty it can be deleted. > -> This file exists because the items below would otherwise be lost or buried in the synthesis source. They are not on the active plan and they should not appear in agent task slices yet — but they should not have to be re-discovered when their triggers fire either. +> Audit result: none of the product impulses below should be promoted immediately. Edge metadata and topology-driven ranking are now represented in the FE-700/FE-702 frontier direction, but implementation evidence has not landed. Spec drift has a lexicon entry and remains a plausible product surface, but still lacks the typed-claim substrate that would make it actionable. ## How to use this doc 1. Before opening a new frontier item, check whether any deferred entries below have triggers that have now fired. 2. When promoting an entry, route through the canonical skill: `ln-spec` for SPEC.md changes, `ln-plan` for PLAN.md changes. Do not hand-edit canonical memory. -3. Delete promoted entries from this file. 
The synthesis source remains in [`INTENT_SPEC_EVOLUTION.md`](./INTENT_SPEC_EVOLUTION.md) for context, but this backlog is the single live tracking place. +3. Delete promoted entries from this file. The synthesis source remains in [`INTENT_SPEC_EVOLUTION.md`](../archive/design/INTENT_SPEC_EVOLUTION.md) for context, but this backlog is the single live tracking place. 4. If a trigger never fires, decide explicitly whether the entry is still relevant or should be retired with a note in the synthesis source. --- @@ -25,7 +25,7 @@ When a generated artifact (criterion, requirement, candidate-spec direction, export bundle, or downstream implementation behavior) diverges from its source claim, Brunch surfaces the divergence in human terms — "original intent vs generated behavior vs potential mismatch" — so the user can validate meaning at the point where it could have changed, rather than after the divergence has been laundered into a final document. - **Trigger:** FE-700 lands the `checkability` field and `claimMetadata` so drift can actually be detected at the typed-claim level. - **Promotes through:** `ln-spec` patch. -- **Cross-refs once promoted:** new design doc `docs/design/SPEC_DRIFT.md` (entry C3 below); links to existing Requirement 38 (invariant + example as kinds) and the `spec drift` Lexicon entry that already exists. +- **Cross-refs once promoted:** proposed design doc `docs/design/SPEC_DRIFT.md` (entry C3 below; not yet created); links to existing Requirement 38 (invariant + example as kinds) and the `spec drift` Lexicon entry that already exists. **REQ-D2. Disambiguation probes from graph topology.** The interviewer can issue contrastive A/B/C disambiguation questions when the typed graph contains a high-fanout assumption, an unwitnessed requirement, an unverified invariant, a decision without rejected alternatives, a goal without derived requirements, or a conflicting constraint. 
The TiCoder-style move is generalized beyond test cases: the interviewer generates cases where plausible interpretations diverge, then asks the user to classify them; the classifications emit typed claims and edges per [`INTENT_GRAPH_SEMANTICS.md`](./INTENT_GRAPH_SEMANTICS.md). @@ -67,7 +67,7 @@ After the typed claim metadata lands (FE-700) and the scenario substrate has pro - **Trigger:** REQ-D1 promotion + scenario-substrate drift probe complete. - **Depends on:** intent graph semantics + progressive checkability (FE-700 → next-3); scenario substrate (FE-698 → next-2); generative prompt probes (FE-702 → next-4). - **Promotes through:** `ln-plan` patch. -- **Once promoted:** point at the new design doc `docs/design/SPEC_DRIFT.md` (entry C3 below). +- **Once promoted:** point at the proposed design doc `docs/design/SPEC_DRIFT.md` (entry C3 below; not yet created). **PLAN-D2. Topology-driven next-question ranking interviewer behavior.** Refactor the interviewer's next-question selection to consult typed-graph topology (high-fanout low-confidence assumptions, requirements without `verifies` incoming, criteria without targets, decisions without rejected alternatives, conflicting `constrains` edges, goals without derived requirements). Distinct from kernel-driven questions: kernels suggest *what kind* of question; topology heuristics suggest *which item* to ask about. @@ -80,8 +80,8 @@ Refactor the interviewer's next-question selection to consult typed-graph topolo ## Pending design docs (1) -**C3. `docs/design/SPEC_DRIFT.md`.** -Canonical reference for spec-drift detection as a product surface. Layer 4 of the source synthesis's four-layer architecture (intent capture / ambiguity discovery / spec artifact generation / spec drift detection). Should specify: +**C3. Proposed `docs/design/SPEC_DRIFT.md`.** +Canonical reference for spec-drift detection as a product surface. This file does not exist yet; create it only if REQ-D1 is promoted. 
Layer 4 of the source synthesis's four-layer architecture (intent capture / ambiguity discovery / spec artifact generation / spec drift detection). Should specify: - What counts as drift (intent ↔ artifact ↔ implementation divergence cases) - How drift is detected per artifact type (criterion divergence, candidate-spec divergence, export divergence, implementation behavior divergence) - How drift is surfaced in the workspace stream (UI shape, when it interrupts, when it stays passive) @@ -96,13 +96,13 @@ Canonical reference for spec-drift detection as a product surface. Layer 4 of th ## When everything has promoted -When this file's three sections (SPEC, PLAN, design docs) are all empty, delete the file. The synthesis source remains in [`INTENT_SPEC_EVOLUTION.md`](./INTENT_SPEC_EVOLUTION.md) and the canonical references stand on their own. +When this file's three sections (SPEC, PLAN, design docs) are all empty, delete the file. The synthesis source remains in [`INTENT_SPEC_EVOLUTION.md`](../archive/design/INTENT_SPEC_EVOLUTION.md) and the canonical references stand on their own. If items remain unpromoted past their triggers (e.g., FE-700 ships but REQ-D1 still hasn't promoted three months later), reopen this file's relevant entry with a note explaining why — either retire it with reasoning, or escalate it to active triage through `ln-consult`. ## References -- [`INTENT_SPEC_EVOLUTION.md`](./INTENT_SPEC_EVOLUTION.md) — synthesis source for every entry above. +- [`INTENT_SPEC_EVOLUTION.md`](../archive/design/INTENT_SPEC_EVOLUTION.md) — synthesis source for every entry above. - [`INTENT_GRAPH_SEMANTICS.md`](./INTENT_GRAPH_SEMANTICS.md) — typed-graph reference; entries above all assume this lands first. - [`BEHAVIORAL_KERNELS.md`](./BEHAVIORAL_KERNELS.md) — kernel-driven question reference; complementary to topology-driven ranking. -- `memory/PLAN.md` items 3 (FE-700) and 4 (FE-702) — the active items whose completion will fire most triggers above. 
+- `memory/PLAN.md` Next items for FE-700 intent graph semantics and FE-702 graph-review / scenario probes — the frontier items whose completion will fire most triggers above. diff --git a/docs/design/DEV_WORKFLOW_EVOLUTION.md b/docs/design/DEV_WORKFLOW_EVOLUTION.md index d4c382c3..2f0b64fb 100644 --- a/docs/design/DEV_WORKFLOW_EVOLUTION.md +++ b/docs/design/DEV_WORKFLOW_EVOLUTION.md @@ -6,13 +6,13 @@ > > This document is **not** part of `memory/SPEC.md` because it does not describe Brunch the product. It is the canonical design home for the **dev layer**: how Brunch is built. Conclusions that affect product behavior should still be promoted into `memory/SPEC.md` through `ln-spec`, but most of the material here describes self-tooling rather than user-facing capability. > -> Source synthesis: external agent conversations captured in [`docs/design/INTENT_SPEC_EVOLUTION.md`](./INTENT_SPEC_EVOLUTION.md). That synthesis treats both the product layer and the dev layer in the same document; this note splits the dev-layer trajectory out so the layers stop colliding. +> Source synthesis: external agent conversations captured in [`docs/archive/design/INTENT_SPEC_EVOLUTION.md`](../archive/design/INTENT_SPEC_EVOLUTION.md). That synthesis treats both the product layer and the dev layer in the same document; this note splits the dev-layer trajectory out so the layers stop colliding. ## Why this note exists The intent-spec branching conversation produced two parallel trajectories: -1. A **product-layer** direction — Brunch should evolve from eliciting planning specs toward eliciting intent specs, with progressive checkability, behavioral kernels, semantic edges, and graph-first context. 
Most of that material has now landed in `memory/SPEC.md` (Requirements 38–41, A77–A87, D125, D134–D142, I109–I112, and the Lexicon entries for `intent graph` / `progressive checkability` / `behavioral kernel` / `context pack` / `scenario runner`) or in sibling design docs (`MULTI_CHAT.md`, `PATCH_LEDGER.md`, `INTENT_SPEC_EVOLUTION.md`). +1. A **product-layer** direction — Brunch should evolve from eliciting planning specs toward eliciting intent specs, with progressive checkability, behavioral kernels, semantic edges, and graph-first context. Most of that material has now landed in `memory/SPEC.md` (Requirements 38–41, A77–A87, D125, D134–D142, I109–I112, and the Lexicon entries for `intent graph` / `progressive checkability` / `behavioral kernel` / `context pack` / `scenario runner`), focused design docs (`MULTI_CHAT.md`, `PATCH_LEDGER.md`), or the archived source synthesis (`../archive/design/INTENT_SPEC_EVOLUTION.md`). 2. A **dev-layer** direction — the same critique, applied recursively to Brunch's *own* spec workflow. The current `memory/SPEC.md` is doing many jobs at once and the markdown-mediated nature of the document creates real cognitive cost on contributing LLMs. The conversation proposed a file-backed canonical spec registry with deterministic checkers and generated views. None of this has landed anywhere except as a one-line horizon item in `memory/PLAN.md` ("Structured development spec registry"). @@ -155,7 +155,7 @@ The point is not that the current system is broken — it works, and `ln-sync` e ## Proposed dev-layer trajectory -The trajectory is the one the source synthesis captures in §10–11 of [`INTENT_SPEC_EVOLUTION.md`](./INTENT_SPEC_EVOLUTION.md), but framed here as a self-tooling experiment for *this* repo, not as a product proposal. 
+The trajectory is the one the source synthesis captures in §10–11 of [`INTENT_SPEC_EVOLUTION.md`](../archive/design/INTENT_SPEC_EVOLUTION.md), but framed here as a self-tooling experiment for *this* repo, not as a product proposal. ### Target shape @@ -257,7 +257,7 @@ The structural argument for convergence is strong: The structural argument against immediate convergence is also strong: -- They have different persistence needs. The dev layer is diffable, branchable, reviewable in PRs — files. The product layer is interactive, multi-user, resume-precise — SQLite. (Source: [`INTENT_SPEC_EVOLUTION.md`](./INTENT_SPEC_EVOLUTION.md) §11.) +- They have different persistence needs. The dev layer is diffable, branchable, reviewable in PRs — files. The product layer is interactive, multi-user, resume-precise — SQLite. (Source: [`INTENT_SPEC_EVOLUTION.md`](../archive/design/INTENT_SPEC_EVOLUTION.md) §11.) - They have different mutation interfaces. The dev layer mutates through editor + CLI. The product layer mutates through interview turns, observer captures, and graph edits. - They have different operational metadata. The dev layer cares about test coverage and CI gates; the product layer cares about workflow phase, frontier ownership, review acceptance, and chat ownership. @@ -307,7 +307,7 @@ The decision rule: ## References -- [`INTENT_SPEC_EVOLUTION.md`](./INTENT_SPEC_EVOLUTION.md) §10–11 — source synthesis for the registry trajectory and the persistence adapter split. +- [`INTENT_SPEC_EVOLUTION.md`](../archive/design/INTENT_SPEC_EVOLUTION.md) §10–11 — source synthesis for the registry trajectory and the persistence adapter split. - [`AGENTS.md`](../../AGENTS.md) — current operational protocols, verification harness, naming conventions. - `.agents/skills/ln-*/SKILL.md` — current implementations of the dev-workflow skills. - `memory/PLAN.md` horizon item "Structured development spec registry" — the one-line pointer this document expands. 
diff --git a/docs/design/INTENT_GRAPH_SEMANTICS.md b/docs/design/INTENT_GRAPH_SEMANTICS.md index aab21e93..b0d8727c 100644 --- a/docs/design/INTENT_GRAPH_SEMANTICS.md +++ b/docs/design/INTENT_GRAPH_SEMANTICS.md @@ -6,7 +6,7 @@ > > This document is the canonical reference for the FE-700 frontier item ("Intent graph semantics + progressive checkability foundation") in `memory/PLAN.md`. It expands the `Recommended shape:` of that item with the full ontology and policy detail that is too long to live inside the plan. > -> Source synthesis: [`INTENT_SPEC_EVOLUTION.md`](./INTENT_SPEC_EVOLUTION.md) §3, §4, §6, §11. Where this document overlaps, it supersedes the synthesis as the structured reference; the synthesis remains the broader narrative. +> Source synthesis: [`INTENT_SPEC_EVOLUTION.md`](../archive/design/INTENT_SPEC_EVOLUTION.md) §3, §4, §6, §11. Where this document overlaps, it supersedes the synthesis as the structured reference; the synthesis remains the broader narrative. > > Layer note: this is the **product layer**. It describes what Brunch users build. The dev-layer ontology is a parallel-but-not-yet-converged register described in [`DEV_WORKFLOW_EVOLUTION.md`](./DEV_WORKFLOW_EVOLUTION.md). @@ -402,7 +402,7 @@ This ontology is the substrate for several near-term capabilities: ## References -- [`INTENT_SPEC_EVOLUTION.md`](./INTENT_SPEC_EVOLUTION.md) §3 (shared claims), §4 (knowledge edges), §6 (ambiguity-targeted disambiguation), §11 (persistence model). +- [`INTENT_SPEC_EVOLUTION.md`](../archive/design/INTENT_SPEC_EVOLUTION.md) §3 (shared claims), §4 (knowledge edges), §6 (ambiguity-targeted disambiguation), §11 (persistence model). - [`BEHAVIORAL_KERNELS.md`](./BEHAVIORAL_KERNELS.md) — kernels generate the questions; this document defines what their answers become. 
- `memory/SPEC.md` Requirement 38 (invariant + example as kinds), Requirement 30 (relation-first observer), I109 (compact existing-knowledge anchors), Lexicon entries for `intent graph`, `progressive checkability`, `behavioral kernel`, `edge-local neighborhood`, `property *(candidate)*`, `invariant *(planned)*`, `example *(planned)*`. - `memory/PLAN.md` item 3 (FE-700) — the active frontier item this document expands. diff --git a/docs/design/MULTI_CHAT.md b/docs/design/MULTI_CHAT.md index 5c11c3d3..9afb284d 100644 --- a/docs/design/MULTI_CHAT.md +++ b/docs/design/MULTI_CHAT.md @@ -1,10 +1,10 @@ # Multi-Chat Substrate — Design Spec -> Output of brainstorm session 2026-05-05 with Lu. First phase of the larger intent-graph evolution now captured in `memory/SPEC.md` as the split between conversational turn history, current intent-graph truth, reconciliation needs, and future semantic changesets / patches. Substrate-only: data model, relationships, migrations. Reconciliation-agent loop, side-chat UI changes, and the full patch ledger are deliberately out of scope. +> Output of brainstorm session 2026-05-05 with Lu. First phase of the larger intent-graph evolution now captured in `memory/SPEC.md` as the split between conversational turn history, current intent-graph truth, reconciliation needs, and future semantic changesets / changes. Substrate-only: data model, relationships, migrations. Reconciliation-agent loop, side-chat UI changes, and the full changeset ledger are deliberately out of scope. > -> Status: **proposed** — pending review before transitioning to an implementation plan. +> Status: **shipped substrate reference** — the Phase 1 `chat` / `turn.chat_id` / `specification.primary_chat_id` / `reconciliation_need` substrate has landed. Use this document for schema rationale and migration invariants; use [CONVERSATIONAL_WORKSPACE_RUNTIME.md](./CONVERSATIONAL_WORKSPACE_RUNTIME.md) for the consolidated future runtime concept. 
> -> Relationship to side-chat design: this document supersedes older side-chat substrate assumptions. The side-chat UI may still stage proposed changes in an in-memory patch list, but durable multi-chat and reconciliation storage should follow this RFC rather than earlier patch/event-stream assumptions. +> Relationship to side-chat/runtime design: this document superseded older side-chat substrate assumptions for Phase 1. Future thread hierarchy, persistent side-chat history, and reconciliation-in-stream decisions are folded into the conversational workspace runtime synthesis. ## 1. Concept & problem diff --git a/docs/design/PATCH_LEDGER.md b/docs/design/PATCH_LEDGER.md index ea6ff254..7ea97ed1 100644 --- a/docs/design/PATCH_LEDGER.md +++ b/docs/design/PATCH_LEDGER.md @@ -1,6 +1,6 @@ # Patch Ledger and Reconciliation -> Status: working design proposal. +> Status: **historical design pressure** — retained for semantic mutation history, reconciliation bases, target ordering, and phase-two ledger rationale. Future-facing schema and operation vocabulary is **changeset/change**, not patch/patch_change; the consolidated runtime concept lives in [CONVERSATIONAL_WORKSPACE_RUNTIME.md](./CONVERSATIONAL_WORKSPACE_RUNTIME.md). > Date: 2026-05-05. > Scope: Brunch runtime product persistence, not the file-backed development registry explored elsewhere. @@ -18,13 +18,13 @@ Brunch is moving from a single interview transcript toward an intent-graph works The current persistence model still treats `turn` as the main historical spine: turns belong directly to a `specification`, and knowledge items are linked back to turns through `turn_knowledge_item`. -That works for an interview-led product, but it becomes strained once semantic changes can originate outside the primary conversation. The proposal here is to separate three authorities: +That works for an interview-led product, but it becomes strained once semantic changes can originate outside the primary conversation. 
The proposal here is to separate three authorities. The original wording used `patch`; current canonical vocabulary uses `changeset` / `change` for that middle authority: ```text chat / turn: conversational provenance and replay -patch: +changeset / change: semantic mutation history for the intent graph reconciliation_need: diff --git a/docs/design/README.md b/docs/design/README.md index 30ecdef7..b0c22bb1 100644 --- a/docs/design/README.md +++ b/docs/design/README.md @@ -10,21 +10,57 @@ Canonical project memory lives in: Use design documents for deeper argumentation, raw synthesis, alternatives, and qualifying principles that are too large for `memory/SPEC.md` or `memory/PLAN.md`. Promote conclusions into canonical memory through the `ln-spec` and `ln-plan` workflows before treating them as roadmap commitments. -Status language: +## Status language -- `raw synthesis / ideation` — broad source material; requires grilling before promotion. +- `source archive / raw synthesis` — broad source material preserved for provenance; active docs may cite it, but it is not live guidance. - `working design proposal` — a shaped proposal that may guide planning, but still needs canonical SPEC / PLAN links. +- `active synthesis` — the current cross-document concept map for a cluster; subsystem/source docs remain useful for details, but this doc owns the combined direction. +- `shipped substrate reference` — an RFC whose first implementation has landed; use it for invariants, migrations, and historical rationale, but check `memory/SPEC.md` / `memory/PLAN.md` for current status. +- `historical design pressure` — still valuable for unresolved questions or algorithms, but terminology or product shape has been superseded. +- `interim backlog` — shaped impulses that are deliberately not in the plan until their triggers fire. +- `future-facing draft` — intentionally deferred architecture map. - `archived` — historical context only; no longer live design guidance. 
-Current live design proposals: +## Live index -- `MULTI_CHAT.md` — concrete phase-one substrate for chat containers and reconciliation needs. -- `PATCH_LEDGER.md` — deeper semantic mutation history and reconciliation design pressure after the multi-chat substrate. -- `INTENT_SPEC_EVOLUTION.md` — broader intent-spec ontology and progressive checkability synthesis (raw, the source for the more focused docs below). -- `INTENT_GRAPH_SEMANTICS.md` — product-layer ontology, edge taxonomy, relation policy, and progressive-checkability binding. Canonical reference for FE-700. -- `BEHAVIORAL_KERNELS.md` — product-layer behavioral-kernel typology, kernel cards, signal-phrase routing, and the contrastive-question interviewer workflow. Canonical reference for FE-702 kernel probes. -- `DEV_WORKFLOW_EVOLUTION.md` — **dev-layer** trajectory for the `ln-*` skill family, the proposed file-backed spec registry, and the long-horizon convergence between dev and product ontologies. Distinct from the product-layer docs above; not part of `memory/SPEC.md`. -- `DEFERRED_RECONCILIATIONS.md` — interim backlog of shaped product-direction items (SPEC requirements, assumptions, PLAN horizon items, future design docs) that are ready for promotion but deliberately deferred until prerequisite work fires their triggers. Delete the file when all entries have promoted. +### Product ontology and strategy + +| Document | Role | +| --- | --- | +| `INTENT_GRAPH_SEMANTICS.md` | Product-layer ontology, edge taxonomy, relation policy, and progressive-checkability binding. Canonical design reference for FE-700. | +| `BEHAVIORAL_KERNELS.md` | Behavioral-kernel typology, kernel cards, signal-phrase routing, and contrastive-question workflow. Canonical design reference for kernel probes. | +| `SPEC_EVOLUTION_STRATEGIES.md` | FE-705-era synthesis for chat-local strategies, scenario options, graph review, proposal turns, relation directionality, and candidate bundles. 
Graduated into `memory/SPEC.md` / `memory/PLAN.md`; keep as rationale. | +| `AGENT_MUTATION_SURFACE.md` | Audit of agent-originated/adjoining mutation paths and the capability/changeset boundary needed before agents write durable truth. | + +### Conversational workspace runtime cluster + +| Document | Role | +| --- | --- | +| `CONVERSATIONAL_WORKSPACE_RUNTIME.md` | **Active synthesis** for the continuous workspace + unified chat + reconciliation + changeset-ledger concept. Use this as the first stop for the cluster. | +| `MULTI_CHAT.md` | Shipped substrate reference for `chat`, `turn.chat_id`, `specification.primary_chat_id`, and `reconciliation_need`. | +| `SIDE_CHAT.md` | User-surface history and phasing for side-chat V1–V3.1, with V4 notes. Future runtime direction is folded into `CONVERSATIONAL_WORKSPACE_RUNTIME.md`. | +| `PATCH_LEDGER.md` | Historical design pressure for semantic mutation history and reconciliation ordering. Future-facing vocabulary is `changeset` / `change`; see `CONVERSATIONAL_WORKSPACE_RUNTIME.md` and `memory/PLAN.md`. | +| `CONTINUOUS_WORKSPACE_HYBRID.md` | Workspace-shell shape exploration; owns the route-alias / workspace-controller / chart-backed-supervisor choice. | + +### Dev process and deferred impulses + +| Document | Role | +| --- | --- | +| `DEV_WORKFLOW_EVOLUTION.md` | Dev-layer trajectory for the `ln-*` skill family, `memory/` ontology, proposed file-backed spec registry, and possible dev/product ontology convergence. Not product SPEC. | +| `DEFERRED_RECONCILIATIONS.md` | Interim backlog for product impulses that are worthy but intentionally gated. Audit before promoting or retiring entries. | + +### Isolated / future-facing notes + +| Document | Role | +| --- | --- | +| `PORTABILITY_BOUNDARIES.md` | Future adapter/hosting/remote-workspace boundary map. | +| `GRAPH_KIND_CHIP_TOGGLE.md` | Standalone graph-view split-button chip proposal; audit against current horizon before implementation. 
| + +### Archived source + +| Document | Role | +| --- | --- | +| `../archive/design/INTENT_SPEC_EVOLUTION.md` | Raw synthesis / ideation source for the May 2026 intent-spec evolution work. Active docs above supersede its conclusions. | Schema reference artifacts are intentionally kept outside this design directory. The canonical generated DBML lives at `docs/schema.dbml` and is derived from `src/server/schema.ts`; do not add parallel `schema.dbml` or `schema.dbdiagram` copies under `docs/design/`. diff --git a/docs/design/SIDE_CHAT.md b/docs/design/SIDE_CHAT.md index 06d55fbb..026fb309 100644 --- a/docs/design/SIDE_CHAT.md +++ b/docs/design/SIDE_CHAT.md @@ -1,8 +1,8 @@ # Side-Chat — Design Spec -> Output of brainstorm session 2026-04-30. Subsumes three previously-separate horizon items in `memory/PLAN.md`: graph-launched refinement (D128), trigger-popover composer, and revisit/edit mode (`docs/design/REVISIT_MODULE.md`). +> Output of brainstorm session 2026-04-30. Subsumes three previously-separate horizon items in `memory/PLAN.md`: graph-launched refinement (D128), trigger-popover composer, and revisit/edit mode (the archived revisit-module concept). > -> Status: **proposed** — pending review before transitioning to implementation plan. +> Status: **shipped through V3.1; V4 horizon reference** — V1/V2/V3.0/V3.1 user-surface phasing has landed through FE-674. Keep this doc for shipped side-chat history, V4 notes, and UI rationale; use [CONVERSATIONAL_WORKSPACE_RUNTIME.md](./CONVERSATIONAL_WORKSPACE_RUNTIME.md) for the consolidated future runtime direction. ## 1. Concept & Problem @@ -14,7 +14,7 @@ The side-chat adds a second interaction surface: a popover-to-panel chat that op - **D128 graph-launched refinement** — the disabled `chat-with` placeholder on each row in `-structured-list-view.tsx` is the seam this design activates. 
- **Trigger-popover composer** (`/` commands, `@` knowledge mentions, `#` phase refs) — folded into the side-chat surface as in-chat affordances. -- **Revisit/edit mode + cascade preview** (`docs/design/REVISIT_MODULE.md`) — the side-chat panel hosts the cascade preview and the secondary-thread walk, replacing the modal in the current REVISIT design. +- **Revisit/edit mode + cascade preview** — the side-chat panel hosts the cascade preview and the secondary-thread walk, replacing the older revisit-module/modal design. ### At a glance — user flow @@ -210,7 +210,7 @@ When a patch with kind `edit` is applied, the system routes by **two questions i |---|---|---| | **None** | `affectedCount === 0` (item is a graph leaf with no downstream edges) | Apply directly. Single-item content update; brief inline confirmation card in the panel: "Updated `[X]`." | | **Soft** | `1 ≤ affectedCount ≤ 2` AND no anchor or affected item is in an active review set *(active = generated and not yet accepted)* | Apply directly with affected-item context. Patch lands directly; brief inline confirmation lists the affected items: "Updated `[X]`; `[Y]`, `[Z]` may need a refresh." No cascade preview or durable `reconciliation_need` rows. | -| **Hard** | High downstream count, OR any anchor or affected item is in an active review set | **Cascade preview** backed by `reconciliation_need` rows → batch-resolution mode in the side-chat panel (§5.3). The archived REVISIT_MODULE walk is superseded. | +| **Hard** | High downstream count, OR any anchor or affected item is in an active review set | **Cascade preview** backed by `reconciliation_need` rows → batch-resolution mode in the side-chat panel (§5.3). The archived [REVISIT_MODULE](../archive/design/REVISIT_MODULE.md) walk is superseded. 
| ### 5.2 Confidence model — V1 @@ -431,7 +431,7 @@ V-versions in §9 describe the *user surface*; substrate phases in `docs/design/ ## Traceability -- **Replaces** PLAN.md horizon items: graph-launched refinement (under D128), trigger-popover composer, revisit / edit mode + cascade preview (`docs/design/REVISIT_MODULE.md` becomes a sub-document of this design). +- **Replaces** PLAN.md horizon items: graph-launched refinement (under D128), trigger-popover composer, revisit / edit mode + cascade preview (the older revisit-module/modal concept is subsumed by this design). - **Reuses** D125 (typed relation policy), D127 (progressive-detail seam), D128 (graph view actionable workspace mode), Requirement 25 (revision card pattern). - **Adds** future assumptions A71 (patch/event-stream model), A72 (item versioning), A73 (architect loop). - **Bounded by** D80 (no turn-tree branching), D89 (card-owned input), D113 (no second durable workflow model), D66 (user authorizes). diff --git a/memory/PLAN.md b/memory/PLAN.md index 1c3380d7..3095a5d7 100644 --- a/memory/PLAN.md +++ b/memory/PLAN.md @@ -8,7 +8,7 @@ The interaction model is mature: four-phase interview, interviewer-autonomous qu The next product arc is a **continuous conversational workspace** plus a stronger semantic/generative substrate. Continuous workspace is already active in parallel: it gives the chat runtime a stable phase-addressable host. The FE-705 branch contributes an integration substrate — a local agent capability CLI and external LLM-as-user probe harness — that should be reconciled into main before graph-review and scenario-options work depends on generated completed-spec fixtures. After that, the highest-coordination work is the intent-graph semantic model and semantic changeset ledger; lower-coordination provider, gitignore, and web-research work can proceed in parallel. 
-The May 2026 intent-spec, multi-chat, changeset-ledger, prompt/context, and agent-mutation design notes are reconciled into one direction. `docs/design/MULTI_CHAT.md` is the substrate document. `docs/design/SIDE_CHAT.md` describes side-chat V1 / V2 / V3.0 / V3.1 / V4 phasing on top of that substrate. `docs/design/PATCH_LEDGER.md` remains historical deeper design pressure for semantic mutation history, but canonical future-facing vocabulary is `changeset` / `change`; `docs/design/INTENT_SPEC_EVOLUTION.md` carries the broader synthesis. The product-layer ontology trajectory is split out as `docs/design/INTENT_GRAPH_SEMANTICS.md` (canonical reference for FE-700) and `docs/design/BEHAVIORAL_KERNELS.md` (kernel probes). FE-705's branch-local strategy/proposal notes add scenario options, graph-review oracle, chat-local strategies, and concern/dependency mapping; those notes should become a canonical design doc when the branch is integrated. The dev-layer self-tooling trajectory lives in `docs/design/DEV_WORKFLOW_EVOLUTION.md`. +The May 2026 intent-spec, multi-chat, changeset-ledger, prompt/context, and agent-mutation design notes are reconciled into one direction. `docs/design/MULTI_CHAT.md` is the substrate document. `docs/design/SIDE_CHAT.md` describes side-chat V1 / V2 / V3.0 / V3.1 / V4 phasing on top of that substrate. `docs/design/PATCH_LEDGER.md` remains historical deeper design pressure for semantic mutation history, but canonical future-facing vocabulary is `changeset` / `change`; `docs/archive/design/INTENT_SPEC_EVOLUTION.md` carries the broader synthesis. The product-layer ontology trajectory is split out as `docs/design/INTENT_GRAPH_SEMANTICS.md` (canonical reference for FE-700) and `docs/design/BEHAVIORAL_KERNELS.md` (kernel probes). FE-705's branch-local strategy/proposal notes add scenario options, graph-review oracle, chat-local strategies, and concern/dependency mapping; those notes should become a canonical design doc when the branch is integrated. 
The dev-layer self-tooling trajectory lives in `docs/design/DEV_WORKFLOW_EVOLUTION.md`. ## Active @@ -26,7 +26,7 @@ The May 2026 intent-spec, multi-chat, changeset-ledger, prompt/context, and agen - Recommended shape: preserve the branch's split between server-owned capability contracts and script-side probe harness. The adapter exposes explicit resource-id calls (`spec.create`, `chat.getPrimary`, `chat.ensureReady`, `chat.read`, `turn.submitResponse`, and follow-on lifecycle/export operations as scoped); the probe runner owns scenario briefs, model-backed simulated-user policy, artifact bundles, fixture-candidate inspection, and workspace-state preservation. Keep browser automation, product UI, provider credential UX, shared production provider routing, and durable runtime-operation ledgers out of the integration slice. - Verification approach: contract/dispatcher tests, JSONL protocol/session tests, import-boundary tests proving the probe runner uses only the JSONL client/process boundary, fake process tests, opt-in real-provider smoke, and fixture-candidate structure/readiness checks. - Traceability: Requirement 43; A89; D143, D147; I114. Also protects Requirements 40, 41, 42 by making prompt/context and mutation-surface probes executable through a real adapter. - - Design docs: `docs/design/AGENT_MUTATION_SURFACE.md`; `docs/design/INTENT_SPEC_EVOLUTION.md`; FE-705 branch artifacts until rebased. + - Design docs: `docs/design/AGENT_MUTATION_SURFACE.md`; `docs/archive/design/INTENT_SPEC_EVOLUTION.md`; FE-705 branch artifacts until rebased. 3. **Intent graph semantics + relation-policy directionality foundation** — refine the ontology and relation policy so the graph can represent invariants, examples/counterexamples, constraint subtypes, narrowed decisions, witness strength, checkability gaps, and operational edge behavior as source/destination material for future generative features. - Linear: FE-700. 
@@ -34,7 +34,7 @@ The May 2026 intent-spec, multi-chat, changeset-ledger, prompt/context, and agen - Recommended shape: add `invariant` and `example` as first-class durable kinds; subtype examples; narrow `decision`; enrich `constraint`, `criterion`, and `invariant` subtypes; add `checkability` and witness strength; introduce the five-family relation taxonomy and negative relations; add edge epistemic metadata; and make relation-policy directionality explicit (`canonicalSentence`, `inverseSentence`, source-change behavior, target-change behavior) rather than inferring cascade from raw edge direction. Leave room for contrastive-kernel artifacts such as `alternative`, `question`, `ambiguity`, and `candidate`, but keep them proposal-local unless probes prove they need durable top-level kinds. - Verification approach: corpus/fixture observer probes comparing old vs refined ontology; relation-policy unit tests for mixed-direction relations; graph-review manual assessment for precision/noise; context-pack probe outputs must show authority, witness, relation support, and directionality labels. - Traceability: Requirement 38; A77, A78, A80, A81, A84; D134, D136, D137, D139, D140. - - Design docs: `docs/design/INTENT_GRAPH_SEMANTICS.md`; `docs/design/INTENT_SPEC_EVOLUTION.md`; FE-705 strategy/proposal notes for relation directionality. + - Design docs: `docs/design/INTENT_GRAPH_SEMANTICS.md`; `docs/archive/design/INTENT_SPEC_EVOLUTION.md`; FE-705 strategy/proposal notes for relation directionality. 4. **Semantic changeset ledger + proposal-turn staleness** — introduce the semantic history spine that separates graph mutation history from conversational turn ancestry. - Linear: FE-701. 
From fb68170bad216f20a5a5fe633f71f42307cc5731 Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Wed, 13 May 2026 13:56:07 +0200 Subject: [PATCH 28/42] Map runtime design doc supersession --- .../CONVERSATIONAL_WORKSPACE_RUNTIME.md | 14 +++++ memory/REFACTOR.md | 52 +++++++++++++++++++ 2 files changed, 66 insertions(+) create mode 100644 memory/REFACTOR.md diff --git a/docs/design/CONVERSATIONAL_WORKSPACE_RUNTIME.md b/docs/design/CONVERSATIONAL_WORKSPACE_RUNTIME.md index a70aab2c..450ca942 100644 --- a/docs/design/CONVERSATIONAL_WORKSPACE_RUNTIME.md +++ b/docs/design/CONVERSATIONAL_WORKSPACE_RUNTIME.md @@ -30,6 +30,20 @@ This is the **umbrella design** for what follows FE-674. It does three things: | [PATCH_LEDGER.md](./PATCH_LEDGER.md) | Historical design pressure for semantic mutation history and reconciliation ordering. Its target-ordering algorithm remains useful; target vocabulary is **changeset/change** going forward, per SPEC/PLAN. | | [CONTINUOUS_WORKSPACE_HYBRID.md](./CONTINUOUS_WORKSPACE_HYBRID.md) | Workspace-shell shape exploration. It still owns the route-alias / workspace-controller / chart-backed-supervisor choice; this doc treats that shell as the host prerequisite for runtime work. 
| +### Runtime-cluster supersession map + +| Claim type | Current authority | Retained source detail | Superseded / historical | +|---|---|---|---| +| Runtime concept and cross-track direction | This document | Sync-call deltas captured here plus PLAN sequencing constraints | Reading MULTI_CHAT / SIDE_CHAT / PATCH_LEDGER as independent future roadmaps | +| Phase 1 chat substrate | [MULTI_CHAT.md](./MULTI_CHAT.md) | Schema, migration, compatibility invariants, and the `reconciliation_need` primitive | Any implication that MULTI_CHAT owns future thread hierarchy or unified-chat UX | +| Side-chat user-surface history | [SIDE_CHAT.md](./SIDE_CHAT.md) | V1–V3.1 shipped behavior, UI language, V4 persistence notes | Treating the popover, top-bar patch list, or standalone Pending review section as the long-term surface | +| Semantic mutation history | This document + SPEC/PLAN vocabulary; [PATCH_LEDGER.md](./PATCH_LEDGER.md) for algorithmic pressure | Reconciliation bases, target grouping, topological ordering, phase-two ledger rationale | New schema/operation names using `patch` / `patch_change` instead of `changeset` / `change` | +| Workspace shell shape | [CONTINUOUS_WORKSPACE_HYBRID.md](./CONTINUOUS_WORKSPACE_HYBRID.md) | Route-alias / workspace-controller / chart-backed-supervisor alternatives | Re-deciding shell architecture inside runtime/thread work | +| Reconciliation vs graph review | SPEC/PLAN + this document's cross-document audit | PATCH_LEDGER reconciliation-flow mechanics; SPEC_EVOLUTION_STRATEGIES graph-review distinctions | Using `reconciliation_need` as the table for all graph quality findings | +| Agent mutation authority | [AGENT_MUTATION_SURFACE.md](./AGENT_MUTATION_SURFACE.md) | Capability/handler boundary and changeset-centered mutation vocabulary | Agents writing directly through ORM helpers or harness-specific route wrappers | + +Open questions that remain live: thread substrate shape, reconciliation thread lifecycle, direct-edit thread-opening UX, 
`thread_context_item` ownership, `#` mention disambiguation, TOON implementation choice, async classifier scheduling, and migration of existing client patch terminology. + ## 2. The shift, at a glance ```mermaid diff --git a/memory/REFACTOR.md b/memory/REFACTOR.md new file mode 100644 index 00000000..da48b5e8 --- /dev/null +++ b/memory/REFACTOR.md @@ -0,0 +1,52 @@ +# Runtime Design-Doc Cluster Refactor + +## Problem Statement + +The `docs/design/` runtime cluster contains several documents that were written at different moments in the multi-chat, side-chat, reconciliation, and changeset-ledger arc. The first consolidation pass fixed status headers and links, but the cluster still requires a content-level audit: older subsystem docs may contain future-facing claims that are now superseded by `CONVERSATIONAL_WORKSPACE_RUNTIME.md`, while the active synthesis may still rely on algorithmic or substrate details that only exist in older docs. + +This makes the cluster harder for a builder or agent to navigate. A reader must infer which claims are shipped fact, which are current future direction, which are historical design pressure, and which are open questions. + +## Solution + +Refactor the runtime-cluster documentation so authority and supersession are explicit without deleting useful subsystem rationale. + +Target state: + +- `CONVERSATIONAL_WORKSPACE_RUNTIME.md` is the first-stop synthesis for the runtime cluster. +- `MULTI_CHAT.md` remains the shipped substrate reference for Phase 1 schema/migration invariants. +- `SIDE_CHAT.md` remains the user-surface history and V4 notes reference. +- `PATCH_LEDGER.md` remains historical design pressure for changeset/change semantics and reconciliation ordering. +- Each old doc clearly distinguishes shipped facts, retained rationale, superseded vocabulary/surfaces, and remaining open questions. +- Any canonical drift discovered during the audit is reported before touching `memory/SPEC.md` or `memory/PLAN.md`. + +## Commits + +1. 
Add a compact runtime-cluster supersession map to the active synthesis, naming current authority, retained subsystem details, superseded surfaces/vocabulary, and open questions. +2. Audit the side-chat design for shipped-versus-horizon claims, marking stale popover/patch-list/pending-review assumptions as historical where the unified runtime now supersedes them. +3. Audit the patch-ledger design for patch-to-changeset vocabulary boundaries, preserving target ordering and reconciliation-flow algorithms while marking old schema names as historical. +4. Audit the multi-chat substrate design for shipped Phase 1 facts versus later substrate possibilities, making clear what future thread/runtime questions are no longer owned there. +5. Run a final navigation pass over `docs/design/README.md` and local links so the cluster can be entered from the index without reading stale docs first. + +Each commit is documentation-only and should leave the repository working. + +## Decisions + +- Treat this as a documentation refactor, not a product plan rewrite. +- Do not delete `MULTI_CHAT.md`, `SIDE_CHAT.md`, or `PATCH_LEDGER.md`; they still contain useful subsystem rationale and implementation history. +- Do not promote deferred product impulses during this pass. If the audit reveals a real SPEC/PLAN gap, record it for separate `ln-spec` / `ln-plan` handling. +- Keep future-facing vocabulary aligned with current canonical language: `changeset` / `change`, proposal turns, chat-local strategies, relation-policy directionality, and graph review distinct from reconciliation. + +## Testing Decisions + +- Use markdown link checks for local references after each meaningful pass. +- Use `npm run fix` after edits as the inner-loop repository check. +- Full `npm run verify` is optional before commit if the changes remain docs-only, but should be run before final submission if this branch is going to PR. 
+- The main review oracle is conceptual: a reader should be able to answer, from headers and first sections alone, which runtime doc owns which kind of claim. + +## Out of Scope + +- Changing `memory/SPEC.md` or `memory/PLAN.md` except for trivial link/path fixes explicitly approved during the audit. +- Migrating `memory/PLAN.md` to a sequencing-plus-definitions format. +- Refactoring `ln-*` skills or `AGENTS.md` planning rules. +- Implementing changeset, thread, reconciliation, or workspace runtime code. +- Creating `docs/design/SPEC_DRIFT.md` or promoting `DEFERRED_RECONCILIATIONS.md` entries. From 9bb18a72f4d0ffbecfed47b504e08e24985b7e2b Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Wed, 13 May 2026 13:57:33 +0200 Subject: [PATCH 29/42] Clarify side-chat shipped and horizon claims --- docs/design/SIDE_CHAT.md | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/docs/design/SIDE_CHAT.md b/docs/design/SIDE_CHAT.md index 026fb309..70298594 100644 --- a/docs/design/SIDE_CHAT.md +++ b/docs/design/SIDE_CHAT.md @@ -4,11 +4,23 @@ > > Status: **shipped through V3.1; V4 horizon reference** — V1/V2/V3.0/V3.1 user-surface phasing has landed through FE-674. Keep this doc for shipped side-chat history, V4 notes, and UI rationale; use [CONVERSATIONAL_WORKSPACE_RUNTIME.md](./CONVERSATIONAL_WORKSPACE_RUNTIME.md) for the consolidated future runtime direction. +## How to read this after V3.1 + +This document is now a shipped-surface and horizon-reference record, not the active runtime synthesis. + +| Claim area | Current reading | +|---|---| +| Popover-to-panel side-chat, pinned context, and brand-halo UI | Shipped/near-shipped V1–V3 surface history and UI rationale. Useful when maintaining the current side-chat panel. | +| Patch list / top-bar staging surface | Historical V1/V2 design language. 
The durable future is `changeset` / `change`; the long-term user surface moves into in-stream threads per `CONVERSATIONAL_WORKSPACE_RUNTIME.md`. | +| Pending review section | Shipped V3.0/V3.1 bridge surface. Long-term reconciliation absorbs into a target-grouped reconciliation thread. | +| V4a side-chat persistence | Still a plausible substrate step, but now understood as part of the unified chat/thread runtime rather than a standalone tab-strip roadmap. | +| V4b patch ledger / item versioning / architect loop | Horizon. Use current vocabulary: changeset ledger, proposal turns, graph review, and architect proposals through HITL acceptance. | + ## 1. Concept & Problem -Today, all interaction with Brunch's spec runs through one long interview thread: a linear back-and-forth in a single phase chat. When the user opens the structured spec view (graph view) and notices something they want to discuss, edit, annotate, or refine, they have no way to act on that item *in place* — they have to navigate back to the chat and try to reintroduce the topic, often without the system understanding which item they're talking about. +At the time this design was written, all interaction with Brunch's spec ran through one long interview thread: a linear back-and-forth in a single phase chat. When the user opened the structured spec view (graph view) and noticed something they wanted to discuss, edit, annotate, or refine, they had no way to act on that item *in place* — they had to navigate back to the chat and try to reintroduce the topic, often without the system understanding which item they meant. -The side-chat adds a second interaction surface: a popover-to-panel chat that opens *from* an item in the structured spec view, with selection-aware context, and that can produce durable changes to the spec through a unified review surface called the **patch list**. 
+The side-chat added a second interaction surface: a popover-to-panel chat that opens *from* an item in the structured spec view, with selection-aware context, and that can produce durable changes to the spec through the then-current review surface called the **patch list**. The long-term runtime direction now folds this surface into a unified chat/thread stream. **The side-chat subsumes three horizon items:** @@ -178,7 +190,7 @@ The top-bar `Apply` button performs **bulk-apply** across all staged patches in ### 4.4 Why this matters -The patch list is **the unifying review surface for all spec mutations**. The same surface the architect loop (§7) will later use to deposit system-generated proposals for HITL review. Designing the side-chat around the patch list now means the architect loop has somewhere to deposit when it ships, with no second review UI to invent. +Historical V1/V2 reading: the patch list was designed as **the unifying review surface for all spec mutations** so later architect-loop proposals would have somewhere to deposit. Current target reading: the review unit is still HITL and batchable, but future durable semantics should be expressed as proposal turns and accepted changesets inside the unified runtime rather than a separate long-lived patch-list surface. ## 5. Edit Patch Routing @@ -288,13 +300,13 @@ Surfacing rules: The side-chat's substrate dependencies have shifted as the multi-chat work landed. Two assumptions are unchanged; one is partly satisfied. -### A71 *(partly satisfied)*: patch / event-stream data model +### A71 *(partly satisfied)*: chat substrate plus semantic mutation ledger -The original framing — `spec → chat → turns` with diff patches as the persistence primitive — is split. 
In this stack, the `spec → chat → turns` half is supplied by downstack FE-697: a `chat` table, nullable `turn.chat_id`, `specification.primary_chat_id`, mirrored `chat.active_turn_id`, and a `reconciliation_need` queue with placeholder `caused_by_patch_id`. The patch ledger half remains horizon work tracked in `docs/design/PATCH_LEDGER.md`. +The original framing — `spec → chat → turns` with diff patches as the persistence primitive — is split. In this stack, the `spec → chat → turns` half is supplied by downstack FE-697: a `chat` table, nullable `turn.chat_id`, `specification.primary_chat_id`, mirrored `chat.active_turn_id`, and a `reconciliation_need` queue with a future semantic-mutation cause placeholder. The changeset ledger half remains horizon work tracked historically in `docs/design/PATCH_LEDGER.md` and currently in `memory/PLAN.md` as the semantic changeset ledger. **Implication for V3.** The cascade preview reads `reconciliation_need` rows directly (see §5.3, §13). Side-chat threads themselves stay in-memory through V3 — durable side-chat persistence is MULTI_CHAT.md Phase 2 / V4 and is **not** a V3 prerequisite. -**Implication if the patch ledger lands later:** `reconciliation_need.caused_by_patch_id` becomes populated; resolutions write patches; the in-memory patch list translates to `appendPatch(spec, patch[])`. No user-facing change to V3 surfaces. +**Implication if the changeset ledger lands later:** reconciliation needs gain changeset-backed cause/resolution provenance; resolutions write changesets; the in-memory patch list either retires into proposal-turn state or translates through a compatibility layer. No user-facing change to shipped V3 surfaces is required. ### A72: knowledge-item versioning @@ -304,7 +316,7 @@ History per knowledge item, preserved through edits. Anchors annotations to spec ### A73: architect / generator loop -Captured in §7. The side-chat is *user-driven*; the architect is *system-driven*. Both deposit into the patch list. 
Designing the side-chat's patch-list surface now means the architect has a review surface ready when it ships. +Captured in §7. The side-chat is *user-driven*; the architect is *system-driven*. Historical design routed both into the patch list; current design routes architect proposals through proposal turns and accepted changesets, with graph review as the safety oracle. ## 9. Phasing @@ -314,8 +326,8 @@ Captured in §7. The side-chat is *user-driven*; the architect is *system-driven | **V2** | Edit (router) · Drill-down · Propose-edge in the patch list. **None** and **Soft** edit tiers apply directly. **Hard** edit defers to a placeholder "feature coming" message. Refine routes through normal turn machinery. | | **V3.0** | Hard-edit apply opens `reconciliation_need` rows from existing graph edges (Path 1, deterministic). Cascade preview surfaces as a `Pending review` section inside the canonical patch-list overlay; **single per-row Resolve action** that idempotently transitions `open → resolved`. The V2 `deferred: true` server response and the "Hard impact — coming in V3 cascade preview" banner are removed. Acceptance Criterion #7 satisfied mechanically. No reconciliation agent. REVISIT modal stays archived. (Note: the original three-action design — `accept-on-target / edit-target / dismiss` — is collapsed to a single Resolve in V3.0 because the open→resolved transition is the same regardless of intent label; V3.1 reintroduces richer kinds via the agent.) | | **V3.1** *(shipped, FE-674 PRs #119–#124)* | Reconciliation classifier writes `agent_status` / `agent_classification` / `agent_proposal` per row. Pending review surface renders chips, Run-agent + polling (`POST /api/specifications/:id/reconciliation-needs/run-agent`), per-row Re-run (`POST /api/specifications/:id/reconciliation-needs/:needId/reset-agent`), per-class actions, and bulk Confirm-all / Apply-all-suggested. Substantive walk lands inside the side-chat panel using pinned-context conversation. 
Path 2 observer expansion still horizon. | -| **V4a** *(next, FE-675 V4a half)* | Side-chat client persists turns into `chat` / `turn` with `chat.kind='side_chat'`; "Old chats" tab strip activates. | -| **V4b** *(horizon, FE-675 V4b half + FE-701)* | Patch ledger lands. `reconciliation_need.caused_by_patch_id` populates; resolutions write typed patches; item versioning anchors annotations and soft-edit audit. Architect loop deposits into the same patch list. | +| **V4a** *(horizon / runtime-track input)* | Side-chat client persists turns into `chat` / `turn` with `chat.kind='side_chat'`; "Old chats" tab strip activates. Current runtime synthesis may instead render side conversations as in-stream threads. | +| **V4b** *(horizon, FE-675 V4b half + FE-701)* | Changeset ledger lands. Reconciliation needs gain semantic-mutation cause/resolution provenance; item versioning anchors annotations and soft-edit audit. Architect-loop proposals use the same HITL proposal/changeset pathway rather than committing graph truth directly. | ## 10. Verification Stance @@ -414,8 +426,8 @@ V-versions in §9 describe the *user surface*; substrate phases in `docs/design/ | V2 (Edit / Drill-down / Propose-edge, None+Soft tiers) | Phase 1 not required | Shipped against in-memory patch list; hard branch returns `deferred: true`. | | V3.0 *(shipped, FE-674 PRs #115-#118)* | Phase 1 read side | Hard apply writes `reconciliation_need` rows; UI reads the queue. Per-row Resolve / Edit-target / View-source-diff. No agent. | | V3.1 *(shipped, FE-674 PRs #119-#124)* | Phase 3 | Reconciliation classifier writes `agent_status` / `agent_classification` / `agent_proposal` per row. 
Pending review surface renders `` (six variants), Run-agent button with conditional 1s polling, per-row Re-run on classified/failed rows, per-class actions (`auto-confirm` → Confirm, `auto-edit` → View / Apply / Skip, `substantive` → Open side-chat via `useSideChat().openFor`), bulk Confirm-all (N) and Apply-all-suggested (N) iterating serially. **HTTP:** `POST /api/specifications/:id/reconciliation-needs/run-agent` and `POST /api/specifications/:id/reconciliation-needs/:needId/reset-agent` (§5.3). | -| V4a *(next, FE-675 V4a half)* | Phase 2 | Side-chat client persists turns into `chat` / `turn` with `chat.kind='side_chat'`; "Old chats" tab strip activates. §349 anchor decision still open. | -| V4b *(horizon, FE-675 V4b half)* | Phase 4 | Patch ledger (FE-701); item versioning; branched exploration; architect loop. | +| V4a *(horizon / runtime-track input)* | Phase 2 | Side-chat client persists turns into `chat` / `turn` with `chat.kind='side_chat'`; "Old chats" tab strip activates in this document's original model. Current runtime synthesis may fold this into in-stream threads. | +| V4b *(horizon, FE-675 V4b half)* | Phase 4 | Changeset ledger (FE-701); item versioning; branched exploration; architect loop. | **Decisions and assumptions that govern V3.0:** From 34b23c38dfd9f02555b017ad36bc69e36b3c9d6b Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Wed, 13 May 2026 14:02:44 +0200 Subject: [PATCH 30/42] Translate patch ledger doc to changeset vocabulary --- docs/design/PATCH_LEDGER.md | 180 +++++++++++++++++++----------------- 1 file changed, 97 insertions(+), 83 deletions(-) diff --git a/docs/design/PATCH_LEDGER.md b/docs/design/PATCH_LEDGER.md index 7ea97ed1..357a643b 100644 --- a/docs/design/PATCH_LEDGER.md +++ b/docs/design/PATCH_LEDGER.md @@ -4,6 +4,20 @@ > Date: 2026-05-05. > Scope: Brunch runtime product persistence, not the file-backed development registry explored elsewhere. 
+## How to read this after the changeset vocabulary shift + +This document predates the final vocabulary choice. Treat it as an algorithm and rationale source, not as a naming authority. + +| Historical wording here | Current wording / authority | +|---|---| +| `patch` | `changeset` — one atomic semantic mutation bundle. | +| `patch_change` | `change` — one atomic operation inside a changeset. | +| `caused_by_patch_id`, `resolved_by_patch_id` | Future changeset-backed cause/resolution fields; final column names should be chosen by the FE-701 changeset-ledger design. | +| Patch list / reconciliation review set | Historical review-surface framing. Current runtime synthesis routes proposals through proposal turns and accepted changesets. | +| Target ordering and reconciliation bases | Still useful algorithmic pressure. Preserve these concepts when implementing reconciliation threads or graph-review repairs. | + +Do not introduce new schema, capability contracts, or operation ids with `patch` / `patch_change` unless deliberately referring to this historical design. + ## Why this note exists Brunch is moving from a single interview transcript toward an intent-graph workspace. A specification can now plausibly include: @@ -31,7 +45,7 @@ reconciliation_need: semantic debt created when a change may affect existing graph truth ``` -The intent graph remains the current semantic truth. The patch ledger records how that truth changed. Reconciliation records what may now need renewed judgment. +The intent graph remains the current semantic truth. The changeset ledger records how that truth changed. Reconciliation records what may now need renewed judgment. ## Current Shape @@ -80,7 +94,7 @@ This means: ## Proposed Concepts -`docs/design/MULTI_CHAT.md` is now the concrete phase-one substrate proposal for chat containers and reconciliation needs. 
This document remains the deeper design pressure for future semantic mutation history, richer reconciliation targeting, ordering, and patch-backed provenance. +`docs/design/MULTI_CHAT.md` is now the concrete phase-one substrate reference for chat containers and reconciliation needs. This document remains the deeper design pressure for future semantic mutation history, richer reconciliation targeting, ordering, and changeset-backed provenance. ### Chat @@ -132,11 +146,11 @@ The schema should support a primary chat, but should not require the product mod Focus fields should be deferred. A chat may eventually focus on one item, one relation, several reconciliation needs, or a graph neighborhood. That likely wants a later `chat_focus` table rather than early nullable columns on `chat`. -### Turn Patch Anchor +### Turn semantic-state anchor -A turn should know the semantic state that preceded it. +A turn should know the semantic state that preceded it. Historical examples below say `patch`; current implementations should read this as a changeset or semantic-revision anchor. -Proposed addition: +Proposed addition, in this document's historical vocabulary: ```text turn @@ -144,25 +158,25 @@ turn preceding_patch_id ``` -`preceding_patch_id` points to the latest applied patch known to the chat at the moment the turn was created. This gives Brunch a durable historical anchor for reviving old chat threads. +Read `preceding_patch_id` as `preceding_changeset_id` if the FE-701 schema adopts changeset naming. The field points to the latest applied semantic mutation bundle known to the chat at the moment the turn was created. This gives Brunch a durable historical anchor for reviving old chat threads. Example: ```text -Chat C7 last had a turn after Patch P12. -Elsewhere, P13-P18 changed the intent graph. +Chat C7 last had a turn after Changeset C12. +Elsewhere, C13-C18 changed the intent graph. The user returns to C7. 
The new turn can inject context: "Since the last turn in this chat, these semantic changes happened elsewhere..." ``` -This is especially important once multiple chats can mutate one specification. Without a patch anchor, a dormant side chat can accidentally continue from an obsolete semantic worldview. +This is especially important once multiple chats can mutate one specification. Without a semantic-state anchor, a dormant side chat can accidentally continue from an obsolete semantic worldview. -If the patch ledger is deferred, this field should also be deferred unless Brunch introduces a lightweight semantic revision/checkpoint first. Avoid adding a dangling nullable patch pointer before there is a real patch or revision concept to point at. +If the changeset ledger is deferred, this field should also be deferred unless Brunch introduces a lightweight semantic revision/checkpoint first. Avoid adding a dangling nullable semantic-history pointer before there is a real changeset or revision concept to point at. -### Patch +### Patch *(historical name; now changeset)* -A `patch` is a semantic mutation set against the intent graph. +A `patch` in this document means what current docs call a `changeset`: a semantic mutation set against the intent graph. It is not a workflow event and should not answer questions like "what phase is the user in?" It answers questions like: @@ -172,10 +186,10 @@ It is not a workflow event and should not answer questions like "what phase is t - what previous semantic state did it replace? - what downstream graph truth may now be stale? 
-Proposed table: +Proposed table, in historical naming: ```text -patch +patch # current name: changeset id specification_id provenance_json @@ -201,7 +215,7 @@ status: Provenance may want to be a discriminated JSON value rather than only an enum plus nullable foreign keys: ```typescript -type PatchProvenance = +type ChangesetProvenance = // historical draft name: PatchProvenance | { kind: 'turn'; turn_id: number; chat_id: number; capture_kind?: 'observer_capture' | 'review_acceptance' } | { kind: 'user_direct_edit'; chat_id?: number; actor_id?: string } | { kind: 'reconciliation_acceptance'; chat_id?: number; review_set_id?: number } @@ -212,9 +226,9 @@ type PatchProvenance = This keeps provenance extensible without adding nullable columns for every initiator shape. The relational columns `initiator_chat_id` and `initiator_turn_id` may still be useful as indexed convenience fields, but they should mirror `provenance_json`, not become a second provenance truth. -`observer_capture` is usually initiated by a chat turn, but patch provenance should not collapse to "chat turn." A turn can initiate a patch; it is not the patch. +`observer_capture` is usually initiated by a chat turn, but changeset provenance should not collapse to "chat turn." A turn can initiate a changeset; it is not the changeset. -### Patch vs Change Naming +### Patch vs Change Naming *(resolved)* The proposed model has two levels: @@ -226,7 +240,7 @@ atomic mutation: one add/update/link/unlink/retire operation inside that unit ``` -Those can be named either way: +The naming choice was still open when this document was written: ```text Option A: @@ -238,7 +252,7 @@ Option B: change ``` -`changeset` / `change` may be the clearer database naming because it avoids overloading "patch" with source-control connotations and because "change" naturally names the atomic unit. 
Under that naming: +That choice is now resolved in favor of `changeset` / `change` because it avoids overloading "patch" with source-control connotations and because "change" naturally names the atomic unit. Under that naming: ```text changeset: @@ -248,18 +262,18 @@ change: id, changeset_id, operation, target_kind, target_id, before_json, after_json ``` -The design question is not the word. The invariant is that Brunch needs an atomic semantic mutation set containing one or more atomic changes. +The design question is not the word. The invariant is that Brunch needs an atomic semantic mutation set containing one or more atomic changes. The current canonical naming is `changeset` / `change`. -### Patch Change +### Patch Change *(historical name; now change)* -A `patch_change` is one operation inside a patch. +A `patch_change` in this document means what current docs call a `change`: one operation inside a changeset. -Proposed table: +Proposed table, in historical naming: ```text -patch_change +patch_change # current name: change id - patch_id + patch_id # current name: changeset_id operation target_kind target_id @@ -310,7 +324,7 @@ decision D constrains requirement R ```text item B changed, so item A may need review -patch P changed an older premise, so later descendants may need coherence review +changeset C changed an older premise, so later descendants may need coherence review verifier V invalidated criterion C, so requirement R may need review ``` @@ -326,7 +340,7 @@ reconciliation_need status reason caused_by_turn_id - caused_by_patch_id + caused_by_patch_id # historical placeholder; current concept: caused_by_changeset_id created_at resolved_at ``` @@ -343,23 +357,23 @@ status: resolved ``` -This deliberately keeps phase one smaller than the fully expressive model. 
The first table should represent one directed process obligation from a changed source item to an affected target item, dedupe simultaneously open needs by `(source_item_id, target_item_id, kind)`, and carry enough nullable provenance to be patch-compatible later. +This deliberately keeps phase one smaller than the fully expressive model. The first table should represent one directed process obligation from a changed source item to an affected target item, dedupe simultaneously open needs by `(source_item_id, target_item_id, kind)`, and carry enough nullable provenance to be changeset-compatible later. Future extensions can add: ```text basis / strength -source_patch_id +source_patch_id # current concept: source_changeset_id affected_relation_from_item_id affected_relation_to_item_id affected_relation -resolved_by_patch_id +resolved_by_patch_id # current concept: resolved_by_changeset_id structured reason payload ``` The `affected_relation_*` fields avoid requiring a separate `knowledge_edge.id` migration before this work can start. If `knowledge_edge` later receives a surrogate `id`, `reconciliation_need` can switch to `affected_edge_id`. -`resolved_at` exists in phase one because no-op dismissal and non-patch resolution are useful before the patch ledger exists. Once `resolved_by_patch_id` is available, the timestamp may remain denormalized convenience rather than the only resolution source of truth. +`resolved_at` exists in phase one because no-op dismissal and non-changeset resolution are useful before the changeset ledger exists. Once changeset-backed resolution is available, the timestamp may remain denormalized convenience rather than the only resolution source of truth. ## Reconciliation Bases @@ -391,8 +405,8 @@ Example: ```text The user directly edits Knowledge Item K4. -K4 was last updated by Patch P12. -Later patches P13-P31 created or updated nearby items from a context that may no longer hold. +K4 was last updated by Changeset C12. 
+Later changesets C13-C31 created or updated nearby items from a context that may no longer hold. Those later descendants receive soft reconciliation needs. ``` @@ -433,7 +447,7 @@ agent attempts reconciliation -> present a reviewable set of reconciliation changes -> user accepts or comments / requests changes -> agent revises and presents the set again - -> accepted changes are applied as a patch + -> accepted changes are applied as a changeset ``` The important difference from ordinary review sets is the agent's first move. Reconciliation should not immediately push every stale item to the user. The agent should attempt to repair, dismiss, or consolidate needs itself when the graph context is sufficient. @@ -466,8 +480,8 @@ Proposed flow: - add clarifying edge or example - ask the user a disambiguating question 9. The user accepts or requests changes. -10. Accepted reconciliation emits a new patch. -11. The accepted patch resolves, dismisses, or supersedes the needs. +10. Accepted reconciliation emits a new changeset. +11. The accepted changeset resolves, dismisses, or supersedes the needs. ``` This mirrors review-set ergonomics without pretending reconciliation is the same as requirements or criteria review. @@ -479,7 +493,7 @@ reconciliation review set v1 -> user requests changes with comments -> agent creates revised review set v2 -> user accepts - -> accepted reconciliation patch is applied + -> accepted reconciliation changeset is applied ``` Rejected or superseded reconciliation proposals should remain explainable provenance, but only accepted reconciliation should mutate the intent graph. @@ -496,7 +510,7 @@ group by affected target sort needs within target by: 1. strength 2. basis - 3. source item / source patch + 3. source item / source changeset 4. creation time build an affected-target graph from semantic relations collapse cycles into strongly connected components @@ -508,26 +522,26 @@ Direction matters. 
If `Requirement R` depends on `Assumption A`, and `A` changes Cycles should not block reconciliation. They should be collapsed into a single unit and presented as a coupled coherence problem. -If an accepted reconciliation patch changes an upstream target, downstream needs may become superseded or may need to be regenerated from the new patch. The reconciliation loop should therefore treat topological ordering as a work plan, not as a guarantee that one pass resolves every downstream target. +If an accepted reconciliation changeset changes an upstream target, downstream needs may become superseded or may need to be regenerated from the new changeset. The reconciliation loop should therefore treat topological ordering as a work plan, not as a guarantee that one pass resolves every downstream target. ## Can This Be Split Into Two Phases? -Yes, with one caveat: phase one should make `reconciliation_need` future-compatible with patches even if the `patch` table does not exist yet. +Yes, with one caveat: phase one should make `reconciliation_need` future-compatible with changesets even if the `changeset` table does not exist yet. The split is plausible because `chat` and `reconciliation_need` each relieve a current architectural pressure independently: - `chat` creates the missing conversation container below `specification` - `reconciliation_need` creates a product-visible place for staleness and coherence work -- `patch` later upgrades provenance from turn-centered or event-centered records into a true semantic mutation ledger +- `changeset` later upgrades provenance from turn-centered or event-centered records into a true semantic mutation ledger -The caveat is that historical descendance is only approximate before patches exist. Brunch can detect graph-based semantic dependency in phase one. It cannot precisely answer "which later semantic mutations descend from this older state?" until patch history exists. 
+The caveat is that historical descendance is only approximate before changesets exist. Brunch can detect graph-based semantic dependency in phase one. It cannot precisely answer "which later semantic mutations descend from this older state?" until changeset history exists. ## Phase 1: Multi-Chat Substrate and Reconciliation Need Goal: ```text -Allow multiple chats per specification and introduce durable reconciliation needs without requiring the full patch ledger. +Allow multiple chats per specification and introduce durable reconciliation needs without requiring the full changeset ledger. ``` Schema work: @@ -555,15 +569,15 @@ Phase-one reconciliation causes: ```text caused_by_turn_id = the turn whose observer capture or review action caused the need -caused_by_patch_id = null +caused_by_patch_id = null # historical placeholder for future changeset-backed provenance ``` -`caused_by_kind` is intentionally omitted in the concrete phase-one schema while patches do not exist: `caused_by_turn_id` names turn-caused needs, and `caused_by_patch_id` remains null as a placeholder. +`caused_by_kind` is intentionally omitted in the concrete phase-one schema while changesets do not exist: `caused_by_turn_id` names turn-caused needs, and the historical `caused_by_patch_id` placeholder should be read as future changeset-backed provenance. Phase-one limitations: - no exact before / after semantic diff -- no exact patch chronology +- no exact changeset chronology - no reliable historical descendance beyond turn-linked provenance heuristics - reconciliation can identify affected items, but cannot yet provide a full mutation audit @@ -578,50 +592,50 @@ Phase-one implementation slices: 5. Add deterministic helper to create needs from changed item plus `knowledge_edge` traversal. 6. Surface a minimal reconciliation queue in data loaders or development fixtures. 
-## Phase 2: Patch Ledger +## Phase 2: Changeset Ledger *(formerly Patch Ledger)* Goal: ```text -Make semantic mutations first-class and use patches as the source of reconciliation cause, audit, and historical descendance. +Make semantic mutations first-class and use changesets as the source of reconciliation cause, audit, and historical descendance. ``` -Schema work: +Schema work, translated to current vocabulary: -- add `patch` -- add `patch_change` -- add `caused_by_patch_id` and `resolved_by_patch_id` foreign keys if they were not enforced in phase one -- optionally add `knowledge_item.last_patch_id` -- optionally add `knowledge_edge.last_patch_id` or give edges surrogate ids +- add `changeset` +- add `change` +- add changeset-backed cause/resolution foreign keys if they were not enforced in phase one +- optionally add `knowledge_item.last_changeset_id` +- optionally add `knowledge_edge.last_changeset_id` or give edges surrogate ids Application work: -- route observer capture through patch creation -- route accepted review outputs through patch creation -- route direct user edits through patch creation -- route reconciliation acceptance through patch creation +- route observer capture through changeset creation +- route accepted review outputs through changeset creation +- route direct user edits through changeset creation +- route reconciliation acceptance through changeset creation - derive `turn_knowledge_item` as provenance compatibility or keep it as a secondary projection -- use patch chronology for historical descendance +- use changeset chronology for historical descendance -Patch application invariant: +Changeset application invariant: ```text -Every semantic change to knowledge graph truth is represented by exactly one applied patch_change inside one applied patch. +Every semantic change to knowledge graph truth is represented by exactly one applied change inside one applied changeset. 
``` That invariant should eventually replace "every knowledge item traces to a turn" as the semantic-history rule. -Patch history should make revision counts and previous values straightforward: +Changeset history should make revision counts and previous values straightforward: ```text revision count for item K: - count applied patch_change rows where target_kind = knowledge_item and target_id = K + count applied change rows where target_kind = knowledge_item and target_id = K change history for item K: - applied patch_change rows for K ordered by patch.applied_at, including before_json and after_json + applied change rows for K ordered by changeset.applied_at, including before_json and after_json ``` -The same should hold for knowledge relations. That creates an important schema pressure: `knowledge_edge` needs stable identity if edge revision history is first-class. A composite key can identify the current relation, but it is awkward for history when a relation's source, target, or type changes. Before patch history becomes authoritative for edges, Brunch should either: +The same should hold for knowledge relations. That creates an important schema pressure: `knowledge_edge` needs stable identity if edge revision history is first-class. A composite key can identify the current relation, but it is awkward for history when a relation's source, target, or type changes. Before changeset history becomes authoritative for edges, Brunch should either: - add a surrogate `knowledge_edge.id` - or replace `knowledge_edge` with a stable relation record table @@ -646,15 +660,15 @@ The existing `turn.parent_turn_id` chain remains valid if all current turns in a In phase one, keep `turn_knowledge_item` unchanged. -In phase two, create migration patches only if the audit value is worth the complexity. A low-risk path is: +In phase two, create migration changesets only if the audit value is worth the complexity. 
A low-risk path is: ```text -one migration patch per specification: - provenance_json = { kind: "migration", migration_id: "patch-ledger-backfill" } - summary = "Backfilled existing knowledge graph before patch ledger introduction" +one migration changeset per specification: + provenance_json = { kind: "migration", migration_id: "changeset-ledger-backfill" } + summary = "Backfilled existing knowledge graph before changeset ledger introduction" ``` -This avoids inventing fake historical patches for every old observer capture. +This avoids inventing fake historical changesets for every old observer capture. ### Existing Knowledge Edges @@ -692,30 +706,30 @@ Phase one invariants: - every reconciliation need belongs to one specification - a reconciliation need's affected item or affected relation belongs to the same specification - `caused_by_turn_id`, when present, points to a turn in the same specification -- `caused_by_patch_id` remains null until patch tables exist +- the changeset-backed cause field remains null until changeset tables exist Phase two invariants: -- every semantic graph mutation is represented by an applied patch change -- every patch belongs to one specification -- every patch change belongs to one patch -- every patch target belongs to the same specification as the patch -- every patch has exactly one provenance kind -- a patch may have chat or turn provenance, but does not require it +- every semantic graph mutation is represented by an applied change +- every changeset belongs to one specification +- every change belongs to one changeset +- every changeset target belongs to the same specification as the changeset +- every changeset has exactly one provenance kind +- a changeset may have chat or turn provenance, but does not require it - hard reconciliation needs must name a concrete affected item or relation -- resolved reconciliation needs should name the patch that resolved or dismissed them when resolution changes graph state +- resolved 
reconciliation needs should name the changeset that resolved or dismissed them when resolution changes graph state ## Practical Recommendation Do phase one first. -The split is worthwhile because `chat` is a clear foundation for multi-conversation workspaces, and `reconciliation_need` is a useful product concept even before full semantic patch history exists. +The split is worthwhile because `chat` is a clear foundation for multi-conversation workspaces, and `reconciliation_need` is a useful product concept even before full semantic changeset history exists. But phase one should be honest about its limits: - it can support graph-based reconciliation well - it can support soft, heuristic coherence review -- it cannot fully support historical descendance until patches exist +- it cannot fully support historical descendance until changesets exist - it should not imply a complete audit trail The safest phase-one framing is: @@ -723,7 +737,7 @@ The safest phase-one framing is: ```text Introduce chat containers and reconciliation queues. Keep turn-centered provenance for now. -Design reconciliation causes so patch-backed provenance can replace turn-backed provenance later. +Design reconciliation causes so changeset-backed provenance can replace turn-backed provenance later. ``` Then phase two becomes an upgrade of semantic provenance, not a rewrite of the reconciliation product model. @@ -733,7 +747,7 @@ Then phase two becomes an upgrade of semantic provenance, not a rewrite of the r - Should `turn.specification_id` be removed eventually, or kept as a denormalized convenience? - Should `specification.active_turn_id` be removed as soon as `chat.active_turn_id` is stable, or kept as a temporary compatibility mirror? - Should `chat.kind = reconciliation` own one reconciliation review set, or can one reconciliation chat cover multiple sets? -- Should direct user edits create proposed patches first, or applied patches with later reconciliation? 
+- Should direct user edits create proposed changesets first, or applied changesets with later reconciliation? - Should `knowledge_edge` receive a surrogate `id` before reconciliation targets relations heavily? - What is the first deterministic relation policy for creating reconciliation needs from `knowledge_edge` traversal? -- How noisy is historical descendance in realistic workspaces, and should it be grouped by patch rather than item? +- How noisy is historical descendance in realistic workspaces, and should it be grouped by changeset rather than item? From 8a946baea1a06cca6d74602a90b5f6d353f5b387 Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Wed, 13 May 2026 14:04:47 +0200 Subject: [PATCH 31/42] Clarify multi-chat substrate authority --- docs/design/MULTI_CHAT.md | 61 ++++++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 24 deletions(-) diff --git a/docs/design/MULTI_CHAT.md b/docs/design/MULTI_CHAT.md index 9afb284d..874eb45a 100644 --- a/docs/design/MULTI_CHAT.md +++ b/docs/design/MULTI_CHAT.md @@ -6,6 +6,19 @@ > > Relationship to side-chat/runtime design: this document superseded older side-chat substrate assumptions for Phase 1. Future thread hierarchy, persistent side-chat history, and reconciliation-in-stream decisions are folded into the conversational workspace runtime synthesis. +## How to read this after Phase 1 shipped + +This document is now a substrate reference. It owns Phase 1 schema rationale and compatibility invariants, not the future runtime UX. + +| Claim area | Current reading | +|---|---| +| `chat`, nullable `turn.chat_id`, `specification.primary_chat_id`, mirrored `chat.active_turn_id` | Shipped Phase 1 substrate and migration rationale. | +| `reconciliation_need` | Shipped process-debt primitive. Future cause/resolution provenance should use changeset vocabulary, not patch vocabulary. | +| Side-chat persistence / Phase 2 | Historical substrate option. 
Current runtime synthesis may persist side conversations as child chats, a new thread table, or UI-rendered threads. | +| Reconciliation agent / Phase 3 | Historical staging description. Current target is a reconciliation runtime/thread that uses V3.1 classifier outputs and future changeset attribution. | +| Patch ledger / Phase 4 | Historical name. Current plan calls this the semantic changeset ledger. | +| Context model for new chats | Still useful principle: new chats consume current graph state, not transcript snapshots. Thread-scoped context details now belong to `CONVERSATIONAL_WORKSPACE_RUNTIME.md`. | + ## 1. Concept & problem Today every turn anchors directly to a `specification`, and a single linear turn chain *is* the spec's history spine: @@ -16,13 +29,13 @@ Today every turn anchors directly to a `specification`, and a single linear turn This was correct when there was one interview thread per spec. It is no longer correct: -- **Side-chat** needs a parallel conversation surface anchored to graph items, not to the interview frontier. Early UI slices can ship this as an in-memory patch-list surface because the current durable substrate does not accommodate a second thread. +- **Side-chat** needs a parallel conversation surface anchored to graph items, not to the interview frontier. Early UI slices shipped this through an in-memory patch-list surface; the substrate now supports separate chats while the future runtime decides thread shape. - **Direct user edits** from graph view (and, later, the architect loop) produce mutations that don't originate from any turn at all — they need a place to live and a way to advertise their downstream impact. - **Reconciliation** of those mutations needs a typed signal: "this item changed, that item now needs confirmation". `knowledge_edge` carries semantic relations between items; it is the wrong place to record an open question between them. 
-This RFC introduces the smallest substrate change that unblocks both: a `chat` table that turns can relate to, and a `reconciliation_need` table that records directed open issues between graph targets. +This RFC introduced the smallest substrate change that unblocked both: a `chat` table that turns can relate to, and a `reconciliation_need` table that records directed open issues between graph targets. -It is **Phase 1** of the substrate evolution leading toward the patch ledger and ontology sharpening discussed in `memory/SPEC.md` decisions D134-D138. Subsequent substrate phases are listed in §10. Adjacent moves not part of this evolution — phase-route de-emphasis, typed patches with `prev_value` provenance, ontology additions (`invariant`, `example`) — are tracked separately. +It is **Phase 1** of the substrate evolution leading toward the changeset ledger and ontology sharpening discussed in `memory/SPEC.md` decisions D134-D138. Subsequent substrate phases in §10 are historical staging, not current sequencing authority. Adjacent moves not part of this evolution — phase-route de-emphasis, changesets with `before_json` / `after_json` provenance, ontology additions (`invariant`, `example`) — are tracked separately. ### At a glance — the relational shift @@ -203,7 +216,7 @@ export const reconciliationNeed = sqliteTable( status: text({ enum: ['open', 'resolved'] }).notNull().default('open'), reason: text(), caused_by_turn_id: integer().references(() => turn.id), - caused_by_patch_id: integer(), // nullable placeholder until patch ledger exists + caused_by_patch_id: integer(), // historical placeholder; current concept is caused_by_changeset_id created_at: text().notNull().default(sql`(datetime('now'))`), resolved_at: text(), }, @@ -225,7 +238,7 @@ export const reconciliationNeed = sqliteTable( - The enum is intentionally narrow. New kinds are added when we have a concrete reconciliation move that doesn't fit either; we don't pre-invent them. 
- **Status lifecycle.** `open` on creation; `resolved` on agent / user action. Resolved needs are kept for audit but do not participate in the reconciliation queue. - **Multiple needs per pair.** The unique index gates only `open` needs. Two successive edits to the same source can fire two `needs_confirmation` needs, the first being closed before the second is opened; what we forbid is *two simultaneously-open issues of the same kind for the same pair*. -- **Provenance.** Phase 1 carries `reason`, `caused_by_turn_id`, and nullable `caused_by_patch_id`. The turn pointer is useful immediately for observer / review-created needs; the patch pointer is a deliberate placeholder that stays null until the patch ledger gives every semantic mutation a stable id. +- **Provenance.** Phase 1 carries `reason`, `caused_by_turn_id`, and nullable historical `caused_by_patch_id`. The turn pointer is useful immediately for observer / review-created needs; the semantic-mutation pointer is a deliberate placeholder that stays null until the changeset ledger gives every semantic mutation a stable id. ### 3.5 Everything else @@ -249,17 +262,17 @@ A side-chat (or any non-interview chat) is created with: This is the second meeting's explicit decision: *new chats take in the current knowledge graph rather than previous conversation turns*. The interview transcript is provenance, not context. -How that gets formatted into a prompt and which agent owns the assembly is a follow-up RFC. +How that gets formatted into a prompt and which agent owns the assembly is now part of the runtime/context-provision track in [CONVERSATIONAL_WORKSPACE_RUNTIME.md](./CONVERSATIONAL_WORKSPACE_RUNTIME.md). ## 5. Reconciliation primitive -Substrate-only note: this RFC describes the **edge model and lifecycle**. The reconciliation agent (which reads the queue, decides severity, presents review sets) is a follow-up RFC. +Substrate-only note: this RFC describes the **edge model and lifecycle**. 
The reconciliation runtime that reads the queue, classifies severity, presents threads/review affordances, and applies changeset-backed resolutions is owned by [CONVERSATIONAL_WORKSPACE_RUNTIME.md](./CONVERSATIONAL_WORKSPACE_RUNTIME.md). ### 5.1 Two production paths ```mermaid flowchart TD - M[Knowledge item changes
(direct edit, patch apply,
review acceptance)] --> P1[Path 1: deterministic] + M[Knowledge item changes
(direct edit, changeset apply,
review acceptance)] --> P1[Path 1: deterministic] M --> P2[Path 2: observer pass] P1 --> KE[Look up existing
knowledge_edges
(depends_on, derived_from,
constrains, refines, verifies)] KE --> RE1[Insert reconciliation_need
per affected pair
kind = 'supersedes' / 'needs_confirmation'] @@ -280,13 +293,13 @@ flowchart TD ### 5.2 Resolution -When the queue is resolved (by user, agent, or both), the matching `reconciliation_need` rows transition `open → resolved` and pick up a `resolved_at` timestamp. The actual resolution moves — accept a proposed change set, edit the target item, mark the issue irrelevant — produce knowledge-item mutations and (in time) patches. Those are not modelled here; they go through the same paths everything else does. +When the queue is resolved (by user, agent, or both), the matching `reconciliation_need` rows transition `open → resolved` and pick up a `resolved_at` timestamp. The actual resolution moves — accept a proposed changeset, edit the target item, mark the issue irrelevant — produce knowledge-item mutations and, once FE-701 lands, changesets. Those are not modelled here; they go through the same paths everything else does. ### 5.3 What this is *not* - Not a workflow state. Reconciliation is a graph signal, not a phase. `phase_outcome` is the workflow state primitive and is unchanged. -- Not a patch. `reconciliation_need` records *that* an issue exists; it does not describe *what* should change. The proposed change is a separate artefact: today in-memory in the patch-list UI, durable in the patch ledger when it lands. -- Not an audit log of edits. `turn_knowledge_item` and (later) the patch ledger own that. +- Not a changeset. `reconciliation_need` records *that* an issue exists; it does not describe *what* should change. The proposed change is a separate artifact: historically in-memory in the patch-list UI, durably in the changeset ledger when it lands. +- Not an audit log of edits. `turn_knowledge_item` and (later) the changeset ledger own that. ## 6. Migration @@ -304,7 +317,7 @@ Drizzle / SQLite. One ordered migration, columns added before the dependent colu - Backfill: for each spec, set `primary_chat_id` to the interview chat created in step 1. 4. 
**0017_reconciliation_need.sql** - Create `reconciliation_need` table with the partial unique index from §3.4. - - Include `caused_by_turn_id` now and nullable `caused_by_patch_id` as a patch-ledger placeholder. + - Include `caused_by_turn_id` now and nullable historical `caused_by_patch_id` as a future changeset-ledger placeholder. Code changes paired with migrations: @@ -321,13 +334,13 @@ No data loss. Every existing turn lands inside the interview chat of its spec; e ## 7. Out of scope (acknowledged adjacents) -- **Patch ledger.** Typed semantic patches with `prev_value` / `value` and explicit provenance, replacing the in-memory patch-list model. This RFC creates room for the ledger by separating chat from spec, but does not introduce the ledger itself. +- **Changeset ledger.** Typed semantic changesets with before/after values and explicit provenance, replacing the in-memory patch-list model. This RFC creates room for the ledger by separating chat from spec, but does not introduce the ledger itself. - **Phase routes / phase as primary UI concept.** The second meeting agreed phase should de-emphasise as UI but stay as a background signal for prompting. UI work is its own RFC; the data model here keeps `turn.phase` exactly as-is. - **Ontology sharpening (`invariant`, `example` as `knowledge_item.kind`).** Discussed in `memory/SPEC.md` D134 and D136. Pure ontology change; no impact on the chat / reconciliation substrate. -- **Decision shape rework.** The meeting concluded a decision should capture both *chosen* and *not chosen*, and that the `option` table can probably go away in favour of in-turn data. Both moves belong with the patch-ledger work; today's `option` table stays. +- **Decision shape rework.** The meeting concluded a decision should capture both *chosen* and *not chosen*, and that the `option` table can probably go away in favour of in-turn data. Both moves belong with changeset-ledger / decision-shape work; today's `option` table stays. 
- **Phase outcome enum redesign.** The meeting flagged the `proposed | confirmed | superseded` enum as "find a better idea". Out of scope; `phase_outcome` is unchanged. -- **Reconciliation agent loop.** Who reads `reconciliation_need` rows, in what order, how it presents review sets. Substrate is ready; the agent design is a separate RFC. -- **Side-chat UI changes for multi-thread.** Today may ship a single side-chat-per-spec through an in-memory patch-list surface; the `chat` table accommodates many but the UI can continue to render one until persistent side-chat UX catches up. User-surface version labels from older UI design docs are independent of substrate Phase 1 / 2 / 3 / 4 — see §10 for the mapping. +- **Reconciliation runtime.** Who reads `reconciliation_need` rows, in what order, how it presents review affordances, and how accepted resolutions become changesets. Substrate is ready; the runtime design is in `CONVERSATIONAL_WORKSPACE_RUNTIME.md`. +- **Side-chat UI changes for multi-thread.** Historical UI could ship a single side-chat-per-spec through an in-memory patch-list surface; the `chat` table accommodates many but the future user surface may be child chats, a thread table, or UI-rendered in-stream threads. User-surface version labels from older UI design docs are independent of substrate Phase 1 / 2 / 3 / 4 — see §10 for the historical mapping. ## 8. Verification stance @@ -346,10 +359,10 @@ Manual: spin up an existing spec database (a current `.brunch/` fixture), run mi ## 9. Open questions - **`turn.specification_id` retention.** Phase 1 intentionally keeps it as a softer migration: existing spec-scoped reads keep working while new writes populate `chat_id` and assertions prove both pointers agree. The end-state cleanup should drop it once hot paths and tests read ownership through `chat_id`, unless profiling proves the denormalized field pays for itself. 
-- **Side-chat `chat.parent_turn_id` or anchor item.** A side-chat is started *from* a graph item. Should the `chat` row record the anchor item id? Default proposal: don't model it on `chat`; use a later `chat_focus` table when durable focus is wanted. +- **Side-chat/thread focus.** A side conversation is started *from* a graph item. Should focus live on `chat`, a later `chat_focus` table, a new thread table, or thread-context rows? Historical default: don't model it on `chat`; current runtime synthesis leaves this to the thread/context substrate decision. - **Reconciliation `reason` shape.** Free text in V1. Once the reconciliation agent ships, `reason` may want to be structured (template id + slots). Default proposal: stay free-text until the agent design forces a shape. - **Reconciliation cascade-on-resolve.** When a `supersedes` need resolves, does that ever fan out into new reconciliation needs (because the resolution itself is a mutation)? Yes — and that is exactly the reentrancy point Lu flagged in the second meeting. Substrate already handles it: any mutation re-runs path 1 + path 2. The agent decides whether to bundle resolution into one review set or accept a follow-up cycle. No substrate change needed. -- **`option` table fate.** Meeting tentatively concluded the table can go away in favour of in-turn data. Out of scope here; tracked alongside the patch-ledger / decision-shape work. +- **`option` table fate.** Meeting tentatively concluded the table can go away in favour of in-turn data. Out of scope here; tracked alongside changeset-ledger / decision-shape work. - **`phase_outcome` enum redesign.** Tracked alongside the de-emphasise-phase-as-UI RFC. - **Multiple `reconciliation_need.kind`s for one pair.** The partial unique index gates only same-kind same-direction. A single source change could legitimately produce both `supersedes` *and* `needs_confirmation` against the same target; allowed by design. Confirm this is intended. 
@@ -359,15 +372,15 @@ Manual: spin up an existing spec database (a current `.brunch/` fixture), run mi | Phase | Substrate state | Enables (user-surface mapping) | |---|---|---| -| **Phase 1** *(this RFC)* | `chat` table; nullable `turn.chat_id`; `specification.primary_chat_id`; mirrored `chat.active_turn_id`; `reconciliation_need` table with lightweight provenance placeholders. Backfill migrations. New writes populate both legacy and chat pointers. No user-visible change: still one chat per spec, still one rope per chat, side-chat can continue to use an in-memory patch-list surface. | Foundation. Existing side-chat / graph-edit surfaces can ship against today's mutation paths regardless of order. Hard-edit cascade gets a clean reshape once it reads from `reconciliation_need` rather than ad-hoc REVISIT state. Persistent multi-thread side-chat and the architect loop become shippable without waiting on the full patch ledger. | -| **Phase 2** | Side-chat persistence: side-chat threads write `chat` rows with `kind = 'side_chat'` and persist their turns. Multiple side-chats per spec become possible at the data layer. | Persistent side-chat history and old-thread UI can activate. | -| **Phase 3** | Reconciliation agent loop reads `reconciliation_need` queue, presents review sets through the same patch list as the side-chat. | Side-chat V3 hard-edit cascade ships against the reconciliation agent (replaces the REVISIT modal). Architect loop's review surface inherits the same machinery. | -| **Phase 4** *(later)* | Patch ledger lands. `reconciliation_need.caused_by_patch_id` becomes populated for patch-caused needs. Decision-shape rework, option-table removal, and phase-outcome enum redesign happen here. | Architect loop's typed-patch version. Item versioning. Cross-surface undo / time-travel. 
| +| **Phase 1** *(this RFC; shipped)* | `chat` table; nullable `turn.chat_id`; `specification.primary_chat_id`; mirrored `chat.active_turn_id`; `reconciliation_need` table with lightweight provenance placeholders. Backfill migrations. New writes populate both legacy and chat pointers. No user-visible change: still one chat per spec, still one rope per chat, side-chat can continue to use an in-memory patch-list surface. | Foundation. Existing side-chat / graph-edit surfaces can ship against today's mutation paths regardless of order. Hard-edit cascade gets a clean reshape once it reads from `reconciliation_need` rather than ad-hoc REVISIT state. Persistent multi-thread side-chat and the architect loop become shippable without waiting on the full changeset ledger. | +| **Phase 2** *(historical substrate option)* | Side-chat persistence: side-chat threads write `chat` rows with `kind = 'side_chat'` and persist their turns. Multiple side-chats per spec become possible at the data layer. | Persistent side-chat history and old-thread UI could activate, unless the runtime track chooses child chats, a separate thread table, or UI-rendered threads. | +| **Phase 3** *(historical staging)* | Reconciliation agent loop reads `reconciliation_need` queue, presents review sets through the same patch-list-style surface as the side-chat. | V3.1 has shipped classifier output in the Pending review bridge surface; the future target is a reconciliation thread in the unified runtime. | +| **Phase 4** *(later; current name FE-701 changeset ledger)* | Changeset ledger lands. Reconciliation needs gain changeset-backed cause/resolution provenance. Decision-shape rework, option-table removal, and phase-outcome enum redesign may happen here or in adjacent slices. | Architect-loop proposals, item versioning, and cross-surface undo / time-travel become possible through changeset history. | ## 11. 
Traceability - **Replaces** the implicit "one rope per spec" assumption baked into `turn.specification_id` and `specification.active_turn_id`. -- **Unblocks** the patch ledger, the architect / generator loop horizon item, and persistent multi-chat side-chat history. +- **Unblocks** the changeset ledger, the architect / generator loop horizon item, and persistent multi-chat / thread history. - **Bounded by** D113 (no second durable workflow model — `chat` is *not* workflow state, it is a conversation-thread substrate; workflow state stays on `phase_outcome`). - **Reuses** existing `knowledge_item`, `knowledge_edge`, `turn_knowledge_item`, `option`, `phase_outcome`, `annotation` schemas as-is. -- **References** `memory/SPEC.md` decisions D135, D137, and D138 plus `docs/design/PATCH_LEDGER.md` for the deeper semantic mutation ledger. Supersedes older side-chat substrate assumptions while remaining compatible with the user-facing side-chat surface. +- **References** `memory/SPEC.md` decisions D135, D137, and D138 plus `docs/design/PATCH_LEDGER.md` for deeper semantic mutation history pressure. Supersedes older side-chat substrate assumptions while remaining compatible with the user-facing side-chat surface. From 146c34737eaf6ed2b88cb72c8634abe5b9c4db28 Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Wed, 13 May 2026 14:06:43 +0200 Subject: [PATCH 32/42] Refresh design doc navigation index --- docs/design/README.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/docs/design/README.md b/docs/design/README.md index b0c22bb1..da9eb179 100644 --- a/docs/design/README.md +++ b/docs/design/README.md @@ -34,12 +34,14 @@ Use design documents for deeper argumentation, raw synthesis, alternatives, and ### Conversational workspace runtime cluster +Start with `CONVERSATIONAL_WORKSPACE_RUNTIME.md`. The other files in this cluster are retained source/subsystem references; do not read them as independent future roadmaps. 
+ | Document | Role | | --- | --- | -| `CONVERSATIONAL_WORKSPACE_RUNTIME.md` | **Active synthesis** for the continuous workspace + unified chat + reconciliation + changeset-ledger concept. Use this as the first stop for the cluster. | -| `MULTI_CHAT.md` | Shipped substrate reference for `chat`, `turn.chat_id`, `specification.primary_chat_id`, and `reconciliation_need`. | -| `SIDE_CHAT.md` | User-surface history and phasing for side-chat V1–V3.1, with V4 notes. Future runtime direction is folded into `CONVERSATIONAL_WORKSPACE_RUNTIME.md`. | -| `PATCH_LEDGER.md` | Historical design pressure for semantic mutation history and reconciliation ordering. Future-facing vocabulary is `changeset` / `change`; see `CONVERSATIONAL_WORKSPACE_RUNTIME.md` and `memory/PLAN.md`. | +| `CONVERSATIONAL_WORKSPACE_RUNTIME.md` | **Active synthesis** for the continuous workspace + unified chat + reconciliation + changeset-ledger concept. Owns the cluster supersession map and current open questions. | +| `MULTI_CHAT.md` | Shipped substrate reference for `chat`, `turn.chat_id`, `specification.primary_chat_id`, and `reconciliation_need`. Phase 2/3/4 rows are historical staging, not current sequence authority. | +| `SIDE_CHAT.md` | User-surface history and phasing for side-chat V1–V3.1, with V4 notes. Patch-list/top-bar and Pending review claims are bridge/history unless reaffirmed by the runtime synthesis. | +| `PATCH_LEDGER.md` | Historical design pressure for semantic mutation history, reconciliation bases, and target ordering. Future-facing vocabulary is `changeset` / `change`; use it for algorithms, not names. | | `CONTINUOUS_WORKSPACE_HYBRID.md` | Workspace-shell shape exploration; owns the route-alias / workspace-controller / chart-backed-supervisor choice. 
| ### Dev process and deferred impulses @@ -55,6 +57,7 @@ Use design documents for deeper argumentation, raw synthesis, alternatives, and | --- | --- | | `PORTABILITY_BOUNDARIES.md` | Future adapter/hosting/remote-workspace boundary map. | | `GRAPH_KIND_CHIP_TOGGLE.md` | Standalone graph-view split-button chip proposal; audit against current horizon before implementation. | +| `README.md` | This index and local design-doc policy. | ### Archived source From d7d8f971994abefd7281cd579937803ef28e1c3d Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Wed, 13 May 2026 14:06:53 +0200 Subject: [PATCH 33/42] Retire runtime docs refactor plan --- memory/REFACTOR.md | 52 ---------------------------------------------- 1 file changed, 52 deletions(-) delete mode 100644 memory/REFACTOR.md diff --git a/memory/REFACTOR.md b/memory/REFACTOR.md deleted file mode 100644 index da48b5e8..00000000 --- a/memory/REFACTOR.md +++ /dev/null @@ -1,52 +0,0 @@ -# Runtime Design-Doc Cluster Refactor - -## Problem Statement - -The `docs/design/` runtime cluster contains several documents that were written at different moments in the multi-chat, side-chat, reconciliation, and changeset-ledger arc. The first consolidation pass fixed status headers and links, but the cluster still requires a content-level audit: older subsystem docs may contain future-facing claims that are now superseded by `CONVERSATIONAL_WORKSPACE_RUNTIME.md`, while the active synthesis may still rely on algorithmic or substrate details that only exist in older docs. - -This makes the cluster harder for a builder or agent to navigate. A reader must infer which claims are shipped fact, which are current future direction, which are historical design pressure, and which are open questions. - -## Solution - -Refactor the runtime-cluster documentation so authority and supersession are explicit without deleting useful subsystem rationale. 
- -Target state: - -- `CONVERSATIONAL_WORKSPACE_RUNTIME.md` is the first-stop synthesis for the runtime cluster. -- `MULTI_CHAT.md` remains the shipped substrate reference for Phase 1 schema/migration invariants. -- `SIDE_CHAT.md` remains the user-surface history and V4 notes reference. -- `PATCH_LEDGER.md` remains historical design pressure for changeset/change semantics and reconciliation ordering. -- Each old doc clearly distinguishes shipped facts, retained rationale, superseded vocabulary/surfaces, and remaining open questions. -- Any canonical drift discovered during the audit is reported before touching `memory/SPEC.md` or `memory/PLAN.md`. - -## Commits - -1. Add a compact runtime-cluster supersession map to the active synthesis, naming current authority, retained subsystem details, superseded surfaces/vocabulary, and open questions. -2. Audit the side-chat design for shipped-versus-horizon claims, marking stale popover/patch-list/pending-review assumptions as historical where the unified runtime now supersedes them. -3. Audit the patch-ledger design for patch-to-changeset vocabulary boundaries, preserving target ordering and reconciliation-flow algorithms while marking old schema names as historical. -4. Audit the multi-chat substrate design for shipped Phase 1 facts versus later substrate possibilities, making clear what future thread/runtime questions are no longer owned there. -5. Run a final navigation pass over `docs/design/README.md` and local links so the cluster can be entered from the index without reading stale docs first. - -Each commit is documentation-only and should leave the repository working. - -## Decisions - -- Treat this as a documentation refactor, not a product plan rewrite. -- Do not delete `MULTI_CHAT.md`, `SIDE_CHAT.md`, or `PATCH_LEDGER.md`; they still contain useful subsystem rationale and implementation history. -- Do not promote deferred product impulses during this pass. 
If the audit reveals a real SPEC/PLAN gap, record it for separate `ln-spec` / `ln-plan` handling. -- Keep future-facing vocabulary aligned with current canonical language: `changeset` / `change`, proposal turns, chat-local strategies, relation-policy directionality, and graph review distinct from reconciliation. - -## Testing Decisions - -- Use markdown link checks for local references after each meaningful pass. -- Use `npm run fix` after edits as the inner-loop repository check. -- Full `npm run verify` is optional before commit if the changes remain docs-only, but should be run before final submission if this branch is going to PR. -- The main review oracle is conceptual: a reader should be able to answer, from headers and first sections alone, which runtime doc owns which kind of claim. - -## Out of Scope - -- Changing `memory/SPEC.md` or `memory/PLAN.md` except for trivial link/path fixes explicitly approved during the audit. -- Migrating `memory/PLAN.md` to a sequencing-plus-definitions format. -- Refactoring `ln-*` skills or `AGENTS.md` planning rules. -- Implementing changeset, thread, reconciliation, or workspace runtime code. -- Creating `docs/design/SPEC_DRIFT.md` or promoting `DEFERRED_RECONCILIATIONS.md` entries. From a80332283487751bff82b033e253d7dc862a3e92 Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Wed, 13 May 2026 14:18:22 +0200 Subject: [PATCH 34/42] update deferred reconciliations --- docs/design/DEFERRED_RECONCILIATIONS.md | 127 ++++++++---------------- 1 file changed, 41 insertions(+), 86 deletions(-) diff --git a/docs/design/DEFERRED_RECONCILIATIONS.md b/docs/design/DEFERRED_RECONCILIATIONS.md index 8b80c5e5..52878265 100644 --- a/docs/design/DEFERRED_RECONCILIATIONS.md +++ b/docs/design/DEFERRED_RECONCILIATIONS.md @@ -1,108 +1,63 @@ -# Deferred Reconciliations — Pending Promotions to SPEC / PLAN +# Deferred Reconciliations — Audit Verdicts -> Status: **interim backlog, audited 2026-05-13**. -> Date: 2026-05-07. 
-> Scope: shaped product-direction items derived from the archived intent-spec synthesis ([`INTENT_SPEC_EVOLUTION.md`](../archive/design/INTENT_SPEC_EVOLUTION.md)) that are *worthy but gated*: either partially captured in `memory/SPEC.md` / `memory/PLAN.md`, or deliberately deferred until prerequisite work lands. -> -> Each entry below has a clear destination (a `memory/SPEC.md` requirement / assumption / decision, a `memory/PLAN.md` item, or a new design doc) and a clear **trigger condition**. When the trigger fires, promote the entry through the appropriate `ln-*` skill and remove the entry from this file. When the file is empty it can be deleted. -> -> Audit result: none of the product impulses below should be promoted immediately. Edge metadata and topology-driven ranking are now represented in the FE-700/FE-702 frontier direction, but implementation evidence has not landed. Spec drift has a lexicon entry and remains a plausible product surface, but still lacks the typed-claim substrate that would make it actionable. +> Status: **audited 2026-05-13**. +> Original date: 2026-05-07. +> Scope: product-direction items derived from the archived intent-spec synthesis ([`INTENT_SPEC_EVOLUTION.md`](../archive/design/INTENT_SPEC_EVOLUTION.md)) that needed a decision: promote into `memory/SPEC.md` / `memory/PLAN.md`, keep gated, or retire as duplicate/deprecated. -## How to use this doc - -1. Before opening a new frontier item, check whether any deferred entries below have triggers that have now fired. -2. When promoting an entry, route through the canonical skill: `ln-spec` for SPEC.md changes, `ln-plan` for PLAN.md changes. Do not hand-edit canonical memory. -3. Delete promoted entries from this file. The synthesis source remains in [`INTENT_SPEC_EVOLUTION.md`](../archive/design/INTENT_SPEC_EVOLUTION.md) for context, but this backlog is the single live tracking place. -4. 
If a trigger never fires, decide explicitly whether the entry is still relevant or should be retired with a note in the synthesis source. - ---- - -## Pending SPEC.md additions +## Audit summary -### Requirement candidates (3) +No item should be promoted into `memory/SPEC.md` or `memory/PLAN.md` immediately. -**REQ-D1. Spec drift surfacing.** -When a generated artifact (criterion, requirement, candidate-spec direction, export bundle, or downstream implementation behavior) diverges from its source claim, Brunch surfaces the divergence in human terms — "original intent vs generated behavior vs potential mismatch" — so the user can validate meaning at the point where it could have changed, rather than after the divergence has been laundered into a final document. -- **Trigger:** FE-700 lands the `checkability` field and `claimMetadata` so drift can actually be detected at the typed-claim level. -- **Promotes through:** `ln-spec` patch. -- **Cross-refs once promoted:** proposed design doc `docs/design/SPEC_DRIFT.md` (entry C3 below; not yet created); links to existing Requirement 38 (invariant + example as kinds) and the `spec drift` Lexicon entry that already exists. +| Theme | Verdict | Reason | +| --- | --- | --- | +| Spec drift surfacing | **Keep deferred** | The concept is still worth preserving, but it is not yet actionable as a requirement or horizon item until FE-700 lands typed checkability / witness metadata and FE-702-style probes can show drift detection is real rather than aspirational. `memory/SPEC.md` already has a lexicon entry for `spec drift`, which is enough for now. | +| Topology-driven disambiguation / next-question ranking | **Covered; do not promote standalone** | The useful part is already represented by the FE-700 semantic model and FE-702 behavioral-kernel / graph-review probe direction. 
It may later emerge as interviewer behavior, but promoting a separate horizon item now would duplicate those frontier items and over-specify mechanism before probes. | +| Edge epistemic metadata / relation participation | **Duplicate of current FE-700 direction** | `memory/SPEC.md` already records relation policy, support/status gating, operational directionality, edge-local neighborhoods, and relation-family vocabulary through Requirements 30/38, assumptions A81/A93, decisions D137/D150, invariants I109/I118, and lexicon rows. `memory/PLAN.md` FE-700 explicitly calls out edge epistemic metadata and relation-policy directionality. | -**REQ-D2. Disambiguation probes from graph topology.** -The interviewer can issue contrastive A/B/C disambiguation questions when the typed graph contains a high-fanout assumption, an unwitnessed requirement, an unverified invariant, a decision without rejected alternatives, a goal without derived requirements, or a conflicting constraint. The TiCoder-style move is generalized beyond test cases: the interviewer generates cases where plausible interpretations diverge, then asks the user to classify them; the classifications emit typed claims and edges per [`INTENT_GRAPH_SEMANTICS.md`](./INTENT_GRAPH_SEMANTICS.md). -- **Trigger:** FE-700 lands the typed graph (kinds + subtypes + relation families + edge metadata); FE-702 first kernel probes complete and the contrastive-question pattern is validated. -- **Promotes through:** `ln-spec` patch. -- **Cross-refs once promoted:** the topology-driven heuristics table in [`INTENT_GRAPH_SEMANTICS.md`](./INTENT_GRAPH_SEMANTICS.md); new horizon plan item B3 (below); behavioral-kernel composition in [`BEHAVIORAL_KERNELS.md`](./BEHAVIORAL_KERNELS.md). +## Retained deferred item -**REQ-D3. 
Edge epistemic metadata participation rules.** -Knowledge edges carry `support` (`explicit` / `strong_inference` / `weak_candidate`), `status` (`proposed` / `accepted` / `rejected` / `stale`), `provenanceTurnId`, and `rationale`. Only edges of certain support / status combinations participate in cascade, staleness, export-trace, reconciliation, and weak-suggestion capabilities, per the relation-policy registry. Inferred edges do not silently become false dependencies. -- **Trigger:** FE-700 lands the edge schema and the relation-policy registry. -- **Promotes through:** `ln-spec` patch. -- **Cross-refs once promoted:** the edge schema and relation-policy table in [`INTENT_GRAPH_SEMANTICS.md`](./INTENT_GRAPH_SEMANTICS.md); the existing I109 (compact existing-knowledge anchors); the existing `relation family` and `relation policy` Lexicon entries. +### Spec drift surfacing -### Assumption candidates (3) +**Deferred requirement candidate.** When a generated artifact (criterion, requirement, candidate-spec direction, export bundle, or downstream implementation behavior) diverges from its source intent, Brunch should surface the divergence in human terms — "original intent vs generated behavior vs potential mismatch" — so the user can validate meaning at the point where it could have changed, rather than after the divergence has been laundered into a final document. -**A-D1. Spec drift can be made user-legible without exposing formal-methods terminology.** -Drift surfaced as "original intent → generated behavior → potential mismatch" with a chosen-direction question is sufficient for users to validate meaning, without requiring users to read predicates, contracts, or proof obligations. -- **Trigger:** REQ-D1 promotion (paired assumption). -- **Validation approach:** prototype drift surfacing on one corpus of generated criteria; compare user comprehension against direct exposure to the underlying typed-claim diff. 
+- **Current canonical coverage:** `memory/SPEC.md` lexicon entry `spec drift`; broader progressive-checkability direction in Requirement 38, A77/A78, D134, and FE-700. +- **Why not promote now:** drift detection needs typed checkability / witness metadata and generated-artifact comparison evidence. Without that substrate, a requirement or plan item would be vague product aspiration rather than an actionable frontier. +- **Trigger to revisit:** FE-700 lands typed checkability / witness metadata and FE-702 or a follow-on probe demonstrates at least one credible drift-detection workflow. +- **Likely promotion path after trigger:** run `ln-spec` to add a requirement and paired assumption; run `ln-plan` only if the probe supports a distinct product surface beyond FE-700/FE-702 follow-through. +- **Possible future design doc:** `docs/design/SPEC_DRIFT.md`, created only if the requirement is promoted. -**A-D2. Topology-driven question ranking outperforms template-driven next-question generation as graph density grows.** -Once the typed graph carries kinds, subtypes, and edge metadata, an interviewer that ranks next questions by topology (gap-finding heuristics on the graph) produces more useful questions than one that ranks by phase template — especially for incremental-feature elicitation where the graph is dense from the start. -- **Trigger:** REQ-D2 promotion (paired assumption). -- **Validation approach:** scenario-substrate probes comparing template-driven vs topology-driven next-question generation on the same seeded graphs. +## Retired as standalone promotions -**A-D3. The five-family relation taxonomy is right-sized.** -Five families (justification / dependency / boundary / refinement / verification) is small enough to teach the observer reliably and large enough to drive cascade / export / staleness / reconciliation policy without flat equality of edges. 
Adding a sixth family creates more confusion than precision; collapsing to four loses too much policy distinction. -- **Trigger:** REQ-D3 promotion (paired assumption). -- **Validation approach:** observer corpus probes labelling edges across the five families; check whether classifier confusion concentrates on family boundaries that suggest splits or merges. +### Topology-driven disambiguation / next-question ranking ---- +**Original impulse.** The interviewer could issue contrastive A/B/C disambiguation questions when graph topology reveals high-fanout assumptions, unwitnessed requirements, unverified invariants, decisions without rejected alternatives, goals without derived requirements, or conflicting constraints. -## Pending PLAN.md additions +**Audit verdict:** do not promote as a separate SPEC requirement or PLAN horizon item now. -### Horizon items (2) +- **Captured by:** FE-700 intent graph semantics + relation-policy directionality; FE-702 graph-review / scenario-options probes; `docs/design/INTENT_GRAPH_SEMANTICS.md`; `docs/design/BEHAVIORAL_KERNELS.md`; A80/A85/A91; D134/D137/D151/D152. +- **Reason:** topology is one ranking signal inside the graph-review / behavioral-kernel direction, not a separate product capability yet. It should remain a probe hypothesis until the semantic substrate exists and kernel probes show that topology-driven ranking beats simpler prompt/context heuristics. +- **Future revisit condition:** if FE-702 probes demonstrate a specific topology-ranking algorithm that should become user-visible interviewer behavior, promote it through `ln-spec` / `ln-plan` then. -**PLAN-D1. 
Spec drift detection product surface.** -After the typed claim metadata lands (FE-700) and the scenario substrate has probed drift detection (FE-702 follow-on), promote drift detection from a discipline to a user-facing product surface: how divergences are surfaced in the workspace stream, how the user validates or corrects, what they produce durably, and whether drift items become first-class typed claims (likely a new `drift_finding` subtype on `example` or a new top-level kind). -- **Trigger:** REQ-D1 promotion + scenario-substrate drift probe complete. -- **Depends on:** intent graph semantics + progressive checkability (FE-700 → next-3); scenario substrate (FE-698 → next-2); generative prompt probes (FE-702 → next-4). -- **Promotes through:** `ln-plan` patch. -- **Once promoted:** point at the proposed design doc `docs/design/SPEC_DRIFT.md` (entry C3 below; not yet created). +### Edge epistemic metadata / relation participation rules -**PLAN-D2. Topology-driven next-question ranking interviewer behavior.** -Refactor the interviewer's next-question selection to consult typed-graph topology (high-fanout low-confidence assumptions, requirements without `verifies` incoming, criteria without targets, decisions without rejected alternatives, conflicting `constrains` edges, goals without derived requirements). Distinct from kernel-driven questions: kernels suggest *what kind* of question; topology heuristics suggest *which item* to ask about. -- **Trigger:** REQ-D2 promotion + first behavioral-kernel probes complete. -- **Depends on:** intent graph semantics + progressive checkability (FE-700); generative prompt probes for behavioral kernels (FE-702 partial). -- **Promotes through:** `ln-plan` patch. -- **Once promoted:** complement to behavioral-kernel work, not replacement. 
+**Original impulse.** Knowledge edges would carry support/status/provenance/rationale, and only certain support/status combinations would participate in cascade, staleness, export trace, reconciliation, and weak-suggestion behavior. ---- +**Audit verdict:** already adopted into canonical direction; no standalone promotion remains. -## Pending design docs (1) +- **Captured by:** Requirement 30, Requirement 38, A81, A93, D137, D150, I109, I118, and the lexicon rows for `edge-local neighborhood`, `relation family`, and `relation policy`; `memory/PLAN.md` FE-700 explicitly includes edge epistemic metadata and relation-policy directionality. +- **Reason:** keeping this as a pending promotion would create duplicate planning state. FE-700 is the right owning frontier. +- **Future revisit condition:** FE-700 implementation may refine field names or policy axes, but that should happen inside FE-700 scope rather than via this deferred ledger. -**C3. Proposed `docs/design/SPEC_DRIFT.md`.** -Canonical reference for spec-drift detection as a product surface. This file does not exist yet; create it only if REQ-D1 is promoted. Layer 4 of the source synthesis's four-layer architecture (intent capture / ambiguity discovery / spec artifact generation / spec drift detection). Should specify: -- What counts as drift (intent ↔ artifact ↔ implementation divergence cases) -- How drift is detected per artifact type (criterion divergence, candidate-spec divergence, export divergence, implementation behavior divergence) -- How drift is surfaced in the workspace stream (UI shape, when it interrupts, when it stays passive) -- How the user validates or rejects (chosen-direction question shape) -- What drift findings become durably (typed-claim subtype vs. process state vs. activity card) -- Relation to reconciliation needs (drift can produce reconciliation needs but is distinct from them) -- Drift detection vs. 
drift recovery — the second is a different problem -- **Trigger:** REQ-D1 promotion (the SPEC requirement should exist before the design doc commits to a shape). -- **Author through:** ordinary `docs/design/` workflow, not a skill. - ---- - -## When everything has promoted - -When this file's three sections (SPEC, PLAN, design docs) are all empty, delete the file. The synthesis source remains in [`INTENT_SPEC_EVOLUTION.md`](../archive/design/INTENT_SPEC_EVOLUTION.md) and the canonical references stand on their own. +## How to use this doc -If items remain unpromoted past their triggers (e.g., FE-700 ships but REQ-D1 still hasn't promoted three months later), reopen this file's relevant entry with a note explaining why — either retire it with reasoning, or escalate it to active triage through `ln-consult`. +1. Keep this file only while **spec drift surfacing** remains a deferred, not-yet-actionable product impulse. +2. Before opening post-FE-700 semantic/generative work, check whether the spec-drift trigger has fired. +3. If the trigger fires, promote through the canonical skills: `ln-spec` for SPEC.md changes and `ln-plan` for PLAN.md changes. +4. If spec drift is promoted or explicitly retired, delete this file. The synthesis source remains in [`INTENT_SPEC_EVOLUTION.md`](../archive/design/INTENT_SPEC_EVOLUTION.md). ## References -- [`INTENT_SPEC_EVOLUTION.md`](../archive/design/INTENT_SPEC_EVOLUTION.md) — synthesis source for every entry above. -- [`INTENT_GRAPH_SEMANTICS.md`](./INTENT_GRAPH_SEMANTICS.md) — typed-graph reference; entries above all assume this lands first. -- [`BEHAVIORAL_KERNELS.md`](./BEHAVIORAL_KERNELS.md) — kernel-driven question reference; complementary to topology-driven ranking. -- `memory/PLAN.md` Next items for FE-700 intent graph semantics and FE-702 graph-review / scenario probes — the frontier items whose completion will fire most triggers above. 
+- [`INTENT_SPEC_EVOLUTION.md`](../archive/design/INTENT_SPEC_EVOLUTION.md) — synthesis source for the original deferred impulses. +- [`INTENT_GRAPH_SEMANTICS.md`](./INTENT_GRAPH_SEMANTICS.md) — typed-graph reference and FE-700 semantic direction. +- [`BEHAVIORAL_KERNELS.md`](./BEHAVIORAL_KERNELS.md) — kernel-driven question reference, including topology-adjacent probe ideas. +- `memory/PLAN.md` Next items for FE-700 and FE-702 — the owning frontier items for the retired standalone topology and edge-metadata impulses. From 2463617d7295d13d8f938e9be9f0041a95d34972 Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Wed, 13 May 2026 14:22:42 +0200 Subject: [PATCH 35/42] first pass adoption of new pocock-derived skills --- .agents/skills/ln-build/SKILL.md | 8 +- .agents/skills/ln-diagnose/SKILL.md | 149 +++++++++++++++++++++++++++ .agents/skills/ln-prototype/SKILL.md | 121 ++++++++++++++++++++++ .agents/skills/ln-review/SKILL.md | 8 +- 4 files changed, 284 insertions(+), 2 deletions(-) create mode 100644 .agents/skills/ln-diagnose/SKILL.md create mode 100644 .agents/skills/ln-prototype/SKILL.md diff --git a/.agents/skills/ln-build/SKILL.md b/.agents/skills/ln-build/SKILL.md index fefbc4dc..e1422cde 100644 --- a/.agents/skills/ln-build/SKILL.md +++ b/.agents/skills/ln-build/SKILL.md @@ -62,18 +62,24 @@ Stop the serial loop immediately when any of these becomes true: Translate acceptance criteria into failing tests when the change benefits from them. For bugfixes or subtle seam changes, prefer one high-leverage regression test. For trivial maintenance or doc-only work, tests may be unnecessary. +Test behavior through public interfaces, not implementation details. A good test describes what capability exists and would survive internal refactoring. Avoid tests that mock internal collaborators, assert private call order, or inspect storage directly when the public interface can prove the behavior. + +Do not horizontal-slice TDD. 
Never write a batch of imagined tests first and then a batch of implementation. Use tracer bullets: one failing behavioral test → minimum code to pass → next failing behavioral test. Each new test should respond to what the previous cycle taught you. + Run the relevant checks. Confirm failures are meaningful. If the card is already green before any code change, treat that as evidence the queue item is already satisfied or stale — not as permission to create a ceremonial red/green cycle. ## Green Write the minimum code to pass. Build inside-out: functional core first, thin I/O shell second, then end-to-end wiring. -No speculative abstractions. Only extract when two concrete cases force it. +No speculative abstractions. Only extract when two concrete cases force it. Do not anticipate later tests or build shape-only scaffolding; let the current behavioral test pull the interface into existence. ## Refactor With tests green, improve names, boundaries, and obvious local structure. Do not widen scope. +Refactor only while green. Keep the tests pinned to the public behavior so they protect the slice while allowing internals to move. If refactoring reveals that the test is coupled to implementation, fix the test seam before trusting it. + ## Verify and commit Run the project's verification harness. All checks must pass. If the card proved already satisfied and no code or canonical-state change was needed, do not create an empty commit. diff --git a/.agents/skills/ln-diagnose/SKILL.md b/.agents/skills/ln-diagnose/SKILL.md new file mode 100644 index 00000000..bf7e068b --- /dev/null +++ b/.agents/skills/ln-diagnose/SKILL.md @@ -0,0 +1,149 @@ +--- +name: ln-diagnose +description: "Disciplined debugging for hard bugs and regressions. Use when something is broken, failing, throwing, flaky, slow, or when the user says diagnose/debug this. Builds a feedback loop, reproduces, hypothesizes, instruments, fixes, regression-tests, then routes back into ln-* canonical planning." 
+argument-hint: "[bug report, failing command, error, or regression description]" +--- + +# Ln Diagnose + +Diagnose one bug or regression before implementing the fix. The core deliverable is a trusted feedback loop plus a falsified/confirmed causal explanation. Do not jump straight to code changes unless the cause is already proven. + +## Input + +Bug, failure, flake, or regression to diagnose: $ARGUMENTS + +Orient first: + +1. Read `memory/SPEC.md` if present and use its lexicon / live invariants. +2. Read `memory/PLAN.md` if present and identify the containing frontier item if one exists. +3. Read `HANDOFF.md` if present for volatile context. +4. For runtime/UI failures, read the relevant project praxis doc before inspecting logs or driving browsers. + +Write a 2-4 bullet orientation note naming the observed symptom, suspected seam, current feedback loop (if any), and what would count as proof. + +## Phase 1 — Build a feedback loop + +This is the skill. A fast deterministic loop turns debugging into hypothesis testing. If no loop exists, build one before reasoning deeply. + +Try, in rough order: + +1. failing unit/integration/e2e test at the seam that reaches the bug +2. CLI or script with fixture input and asserted output +3. HTTP/curl script against a running server +4. headless browser or browser-automation script asserting DOM/console/network +5. replayed captured artifact: request payload, trace, event log, fixture, HAR +6. throwaway harness around the smallest subsystem that exercises the path +7. property/fuzz loop for intermittent wrong output +8. bisection/differential loop across commits, versions, datasets, or configs +9. 
structured HITL loop only when a human must observe/click + +Improve the loop before moving on: + +- make it faster +- make the assertion sharper than "did not crash" +- remove flake by pinning time, randomness, network, filesystem, or concurrency +- for nondeterministic bugs, raise reproduction rate with repetition/stress until it is debuggable + +If no loop can be built, stop and report exactly what was tried. Ask for access, logs, traces, fixtures, screen recordings with timestamps, or permission to add temporary instrumentation. Do not continue with vibe-based diagnosis. + +## Phase 2 — Reproduce + +Run the loop and confirm it demonstrates the user's bug, not a nearby failure. + +Capture: + +- exact command/script/test used +- exact symptom: error, diff, timing, screenshot, console/network evidence +- reproduction rate for flakes +- any fixture or artifact saved for replay + +Do not proceed until the bug reproduces, or until lack of reproduction is the explicit diagnosis result. + +## Phase 3 — Hypothesize + +Generate 3-5 ranked hypotheses before testing any one of them. Each must be falsifiable: + +```md +If [cause] is true, then [probe/change] will make [specific observation] happen. +``` + +Prefer hypotheses that distinguish seams or invariants from `memory/SPEC.md`. Show the ranked list to the user if they are present; proceed with the best available ranking if they are AFK. + +## Phase 4 — Instrument + +Probe one hypothesis at a time. Every probe must map to a prediction. + +Tool preference: + +1. debugger/REPL inspection when available +2. targeted boundary logs +3. minimal temporary assertions or counters + +Tag every temporary log or probe with a unique prefix like `[DEBUG-a4f2]` so cleanup is grep-able. Avoid "log everything and grep". + +For performance regressions: measure first. Establish baseline timing/profiler/query-plan evidence, then bisect or compare. Do not optimize before the measurement identifies the seam. 
+ +## Phase 5 — Fix path and regression test + +Before coding the fix, decide the correct route: + +- If the fix is trivial and already inside a settled seam, continue directly into `ln-build` style red-green-refactor in this session. +- If the fix changes a seam, invariant, requirement, assumption, or frontier shape, route to `ln-scope` or `ln-spec` first. +- If the diagnosis answered a hard question but the fix is non-obvious, route to `ln-spike` or `ln-design`. + +Write the regression test before the fix when there is a correct seam. A correct seam exercises the real bug pattern as it occurs at the call site; shallow tests that cannot fail for the original bug create false confidence. + +If no correct seam exists, that is an architectural finding. Record it and route to `ln-review` or `ln-refactor` after the immediate fix decision. + +## Phase 6 — Cleanup and postmortem + +Before declaring done: + +- [ ] original repro loop no longer reproduces the bug, or the non-repro diagnosis is explicit +- [ ] regression test exists and passes, or absence of a correct seam is documented +- [ ] all `[DEBUG-...]` instrumentation is removed +- [ ] throwaway harnesses are deleted or clearly marked and still needed +- [ ] causal hypothesis is stated in the final report / commit message + +Ask: what would have prevented this bug? If the answer is a missing invariant, unclear seam, weak oracle, or bad module shape, route it into the appropriate `ln-*` skill rather than burying it in the diagnosis. + +## Canonical reconciliation + +After diagnosis, reconcile only durable truth: + +- New/retired assumption → update `memory/SPEC.md` §Assumptions. +- New seam-level invariant or oracle gap → update `memory/SPEC.md` and/or route to `ln-oracles`. +- Frontier status changed because the bug blocks/unblocks work → update `memory/PLAN.md`. +- Pure local bug with no durable design implication → no canonical update required beyond any tracked PLAN status. 
+ +Do not create `CONTEXT.md`, ADRs, or alternate planning documents. This project's canonical docs are `memory/SPEC.md` and `memory/PLAN.md`. + +## Output + +```md +## Diagnosis: [symptom] + +**Feedback loop:** [command/script/test and reproduction rate] +**Confirmed cause:** [one sentence] +**Evidence:** [key observations] +**Fix route:** [direct fix | ln-scope | ln-build | ln-spike | ln-review | ln-refactor] +**Regression oracle:** [test/harness or why unavailable] +**Canonical updates:** [none | specific SPEC/PLAN changes needed] +``` + +## Routing + +After diagnosis, present these options to the user (use `tool-ask-question`): + +| # | Label | Target | Why | +| --- | ---------------- | ------------- | --- | +| 1 | Scope the fix | `ln-scope` | The fix needs a buildable card or durable seam update | +| 2 | Build the fix | `ln-build` | The fix is settled and ready for red-green-refactor | +| 3 | Spike deeper | `ln-spike` | A hard question remains after reproduction | +| 4 | Review structure | `ln-review` | No good seam/regression oracle exists or architecture contributed | +| 5 | Back to triage | `ln-consult` | Diagnosis changed priority or scope | + +Recommended: **2** only when the cause and seam are proven; otherwise **1**. + +--- +*Adapted from [mattpocock/skills/engineering/diagnose](https://github.com/mattpocock/skills/tree/main/skills/engineering/diagnose).* diff --git a/.agents/skills/ln-prototype/SKILL.md b/.agents/skills/ln-prototype/SKILL.md new file mode 100644 index 00000000..0b1e561c --- /dev/null +++ b/.agents/skills/ln-prototype/SKILL.md @@ -0,0 +1,121 @@ +--- +name: ln-prototype +description: "Build a clearly throwaway prototype to answer a design question before committing to production work. Use when the user wants to prototype, sanity-check a state model, try a few UI designs, make something playable, or explore logic/UI affordances before ln-spec/ln-plan/ln-scope." 
+argument-hint: "[prototype question or design uncertainty]" +--- + +# Ln Prototype + +A prototype is throwaway code that answers one question. The question determines the artifact. Do not let prototype code silently become production code. + +## Input + +Prototype question or design uncertainty: $ARGUMENTS + +Orient first: + +1. Read `memory/SPEC.md` if present and use its lexicon / live invariants. +2. Read `memory/PLAN.md` if present and identify whether the prototype serves an existing frontier item. +3. Read `HANDOFF.md` if present for volatile design context. +4. Inspect nearby code only enough to place the prototype where it is understandable and runnable. + +Write a 2-4 bullet orientation note naming the question, prototype branch, nearest seam/page/module, and how the answer will be captured. + +## Choose the branch + +Pick exactly one branch. Ask the user if ambiguous and they are present; otherwise state the assumption. + +### Logic prototype + +Use when the question is: + +- Does this state model feel right? +- Which transitions/actions are legal? +- Does this reducer, parser, planner, or workflow rule behave coherently across examples? +- Can a human play through edge cases faster than reading a spec? + +Build a tiny interactive terminal app or CLI harness around a portable logic module. + +Prefer one of these shapes: + +- pure reducer: `(state, action) => state` +- explicit state machine with named states and transitions +- small set of pure functions over plain data +- state-owning module/class only when ongoing internal state is the question + +Keep the shell thin. The logic should not know about prompts, terminal escape codes, stdout, or UI widgets. + +### UI prototype + +Use when the question is: + +- What should this look or feel like? +- Which layout/interaction pattern communicates the concept? +- How should a user navigate, compare, approve, recover, or inspect? 
+ +Generate several meaningfully different variants in one local route/page/screen, switchable by URL search param or a small floating switcher. Prefer adapting an existing page/route over inventing a new top-level playground. + +Variants should differ in concept, not just color. Name each variant by its design bet. + +## Rules for both branches + +1. **Throwaway from day one.** Name files/routes with `prototype`, `scratch`, or equivalent. Add a short comment at the entry point: `PROTOTYPE — delete or absorb after verdict`. +2. **Place it near the real seam.** Keep context obvious, but do not pollute public exports unless needed to run it. +3. **One command to run.** Use the repo's task runner and document the exact command in the final report or `HANDOFF.md`. +4. **No persistence by default.** Use memory. If persistence is the question, use scratch storage clearly marked as wipeable. +5. **Skip production polish.** No comprehensive tests, error handling, abstractions, analytics, or accessibility hardening beyond what is needed to evaluate the question safely. +6. **Surface state.** After every logic action or UI variant switch, show the relevant state/inputs/outputs so the design can be judged. +7. **Do not widen scope.** A prototype answers one question; new questions become follow-up prototypes, spikes, or scope cards. + +## Capture the answer + +The answer is the only durable artifact. When the prototype has served its purpose, either delete it or explicitly keep it only as live volatile support. + +Capture: + +```md +## Prototype Verdict: [question] + +**Branch:** logic | UI +**Command:** [how to run] +**What we tried:** [variants/actions/cases] +**Verdict:** [decision or remaining uncertainty] +**Absorb:** [what production code/spec/plan should inherit] +**Delete:** [prototype files/routes/storage to remove] +``` + +Durability rule: + +- Decision changes requirements, assumptions, invariants, or lexicon → route to `ln-spec`. 
+- Decision changes sequencing/frontier → route to `ln-plan`. +- Decision makes one implementation slice obvious → route to `ln-scope`. +- Prototype still needs human judgment later → record volatile state in `HANDOFF.md`. + +Do not create `CONTEXT.md`, ADRs, or alternate planning documents. This project's canonical docs are `memory/SPEC.md` and `memory/PLAN.md`. + +## Cleanup + +Before finishing, state one of: + +- deleted prototype files +- kept prototype temporarily, with exact reason and deletion trigger +- absorbed prototype into production code through a scoped build + +If prototype files remain, they must be visibly non-production and easy to find. + +## Routing + +After the verdict, present these options to the user (use `tool-ask-question`): + +| # | Label | Target | Why | +| --- | -------------- | ------------ | --- | +| 1 | Revise spec | `ln-spec` | Prototype changed durable understanding | +| 2 | Revise plan | `ln-plan` | Prototype changed sequencing or frontier shape | +| 3 | Scope a slice | `ln-scope` | Prototype answered enough to build | +| 4 | Spike instead | `ln-spike` | The remaining question is technical feasibility | +| 5 | Back to triage | `ln-consult` | Prototype did not settle direction | + +Recommended: **3** when the prototype produced a concrete build direction; **1** when it changed the model. + +--- +*Adapted from [mattpocock/skills/engineering/prototype](https://github.com/mattpocock/skills/tree/main/skills/engineering/prototype).* diff --git a/.agents/skills/ln-review/SKILL.md b/.agents/skills/ln-review/SKILL.md index 67d57842..f43b57c1 100644 --- a/.agents/skills/ln-review/SKILL.md +++ b/.agents/skills/ln-review/SKILL.md @@ -18,6 +18,12 @@ If "recent" or unspecified, focus on recently modified files. Apply Ousterhout's depth test: modules should have small interfaces hiding significant complexity. 
Modules that move together should live together — clusters of small files always used in concert are a single deep module waiting to be extracted. +Use the deletion test for suspected shallow modules: if deleting the module makes complexity vanish, it was pass-through structure; if the same complexity reappears across multiple callers, the module was earning its keep. Prefer depth as leverage/locality, not line-count ratio. + +Treat the interface as the test surface. If callers or tests must reach past the interface to verify important behavior, the module shape is probably wrong. A good seam lets tests and callers cross the same public boundary. + +Apply seam discipline: one adapter usually means a hypothetical seam; two adapters make a real seam. Flag indirection introduced only for imagined future variation, especially when it spreads configuration, mocks, or ordering knowledge into callers. + Check the functional core / imperative shell boundary (Gary Bernhardt, "Boundaries"). Pure functions should stay pure. Flag when a pure function has acquired side effects or a growing parameter list — it has drifted into shell territory. Make invalid states unrepresentable (Yaron Minsky). Split optional fields into distinct types. Use branded types for domain-distinct values. @@ -51,7 +57,7 @@ Present findings as numbered candidates: ```md ## Review: [area] -1. **[Description]** — [category: depth|naming|model|coupling] — [impact: low|medium|high] +1. **[Description]** — [category: depth|naming|model|coupling|seam|oracle-coverage] — [impact: low|medium|high] [1-2 sentence explanation and suggested action] 2. ... 
From 521e68dc8bd5343ed0393f98f2211e02d1c7cee1 Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Wed, 13 May 2026 14:28:11 +0200 Subject: [PATCH 36/42] activation density for new skills --- .agents/skills/ln-diagnose/SKILL.md | 120 ++++++++++++--------------- .agents/skills/ln-prototype/SKILL.md | 89 +++++++++----------- 2 files changed, 91 insertions(+), 118 deletions(-) diff --git a/.agents/skills/ln-diagnose/SKILL.md b/.agents/skills/ln-diagnose/SKILL.md index bf7e068b..f945c38e 100644 --- a/.agents/skills/ln-diagnose/SKILL.md +++ b/.agents/skills/ln-diagnose/SKILL.md @@ -1,12 +1,12 @@ --- name: ln-diagnose -description: "Disciplined debugging for hard bugs and regressions. Use when something is broken, failing, throwing, flaky, slow, or when the user says diagnose/debug this. Builds a feedback loop, reproduces, hypothesizes, instruments, fixes, regression-tests, then routes back into ln-* canonical planning." +description: "Scientific debugging for bugs, flakes, failures, and performance regressions. Use when something is broken, throwing, failing, slow, nondeterministic, or when the user says diagnose/debug this. Builds a trusted repro loop, tests falsifiable hypotheses, installs a regression oracle, and routes durable findings back into ln-* planning." argument-hint: "[bug report, failing command, error, or regression description]" --- # Ln Diagnose -Diagnose one bug or regression before implementing the fix. The core deliverable is a trusted feedback loop plus a falsified/confirmed causal explanation. Do not jump straight to code changes unless the cause is already proven. +Debug by scientific method: trusted repro loop, falsifiable hypotheses, one-variable probes, regression oracle. Do not fix by inspection unless the cause is already proven. ## Input @@ -14,116 +14,104 @@ Bug, failure, flake, or regression to diagnose: $ARGUMENTS Orient first: -1. Read `memory/SPEC.md` if present and use its lexicon / live invariants. -2. 
Read `memory/PLAN.md` if present and identify the containing frontier item if one exists. -3. Read `HANDOFF.md` if present for volatile context. +1. Read `memory/SPEC.md` if present; use its lexicon and live invariants. +2. Read `memory/PLAN.md` if present; identify the containing frontier item if one exists. +3. Read `HANDOFF.md` if present. 4. For runtime/UI failures, read the relevant project praxis doc before inspecting logs or driving browsers. -Write a 2-4 bullet orientation note naming the observed symptom, suspected seam, current feedback loop (if any), and what would count as proof. +Write a 2-4 bullet orientation note: symptom, suspected seam, current feedback loop, proof standard. -## Phase 1 — Build a feedback loop +## 1. Build the repro loop -This is the skill. A fast deterministic loop turns debugging into hypothesis testing. If no loop exists, build one before reasoning deeply. +This is the skill. A fast deterministic pass/fail loop makes the rest mechanical. No loop, no diagnosis. Try, in rough order: 1. failing unit/integration/e2e test at the seam that reaches the bug -2. CLI or script with fixture input and asserted output +2. CLI/script with fixture input and asserted output 3. HTTP/curl script against a running server -4. headless browser or browser-automation script asserting DOM/console/network -5. replayed captured artifact: request payload, trace, event log, fixture, HAR +4. browser automation asserting DOM, console, or network +5. replayed artifact: request, trace, event log, fixture, HAR 6. throwaway harness around the smallest subsystem that exercises the path 7. property/fuzz loop for intermittent wrong output 8. bisection/differential loop across commits, versions, datasets, or configs -9. structured HITL loop only when a human must observe/click +9. structured HITL loop only when a human must observe or click -Improve the loop before moving on: +Improve the loop before moving on: faster, sharper assertion, less flake. 
Pin time, randomness, network, filesystem, and concurrency. For nondeterministic bugs, raise reproduction rate with repetition/stress until it is debuggable. -- make it faster -- make the assertion sharper than "did not crash" -- remove flake by pinning time, randomness, network, filesystem, or concurrency -- for nondeterministic bugs, raise reproduction rate with repetition/stress until it is debuggable +If no loop can be built, stop. Report what you tried and ask for access, logs, traces, fixtures, timestamped recordings, or permission for temporary instrumentation. -If no loop can be built, stop and report exactly what was tried. Ask for access, logs, traces, fixtures, screen recordings with timestamps, or permission to add temporary instrumentation. Do not continue with vibe-based diagnosis. +## 2. Reproduce the user's bug -## Phase 2 — Reproduce - -Run the loop and confirm it demonstrates the user's bug, not a nearby failure. +Run the loop. Confirm it demonstrates the reported bug, not a nearby failure. Capture: -- exact command/script/test used +- command/script/test used - exact symptom: error, diff, timing, screenshot, console/network evidence - reproduction rate for flakes -- any fixture or artifact saved for replay +- saved replay artifact, if any -Do not proceed until the bug reproduces, or until lack of reproduction is the explicit diagnosis result. +Lack of reproduction is allowed only as an explicit diagnosis result. -## Phase 3 — Hypothesize +## 3. Rank falsifiable hypotheses -Generate 3-5 ranked hypotheses before testing any one of them. Each must be falsifiable: +Generate 3-5 hypotheses before testing any one of them. Each hypothesis must predict an observation: ```md If [cause] is true, then [probe/change] will make [specific observation] happen. ``` -Prefer hypotheses that distinguish seams or invariants from `memory/SPEC.md`. Show the ranked list to the user if they are present; proceed with the best available ranking if they are AFK. 
- -## Phase 4 — Instrument - -Probe one hypothesis at a time. Every probe must map to a prediction. - -Tool preference: +Prefer hypotheses that distinguish seams or invariants from `memory/SPEC.md`. Show the ranking to the user when they are present; proceed if they are AFK. -1. debugger/REPL inspection when available -2. targeted boundary logs -3. minimal temporary assertions or counters +## 4. Probe one variable at a time -Tag every temporary log or probe with a unique prefix like `[DEBUG-a4f2]` so cleanup is grep-able. Avoid "log everything and grep". +Every probe maps to one prediction. Prefer debugger/REPL inspection, then targeted boundary logs, then temporary assertions/counters. -For performance regressions: measure first. Establish baseline timing/profiler/query-plan evidence, then bisect or compare. Do not optimize before the measurement identifies the seam. +Tag temporary instrumentation with a unique prefix like `[DEBUG-a4f2]`. Cleanup must be grep-able. Never "log everything and grep". -## Phase 5 — Fix path and regression test +Performance branch: measure first. Establish a baseline timing/profiler/query-plan signal, then bisect or compare. Do not optimize before the measurement identifies the seam. -Before coding the fix, decide the correct route: +## 5. Choose the fix route -- If the fix is trivial and already inside a settled seam, continue directly into `ln-build` style red-green-refactor in this session. -- If the fix changes a seam, invariant, requirement, assumption, or frontier shape, route to `ln-scope` or `ln-spec` first. -- If the diagnosis answered a hard question but the fix is non-obvious, route to `ln-spike` or `ln-design`. +Before coding, choose the route: -Write the regression test before the fix when there is a correct seam. A correct seam exercises the real bug pattern as it occurs at the call site; shallow tests that cannot fail for the original bug create false confidence. 
+- **Direct fix / `ln-build`** — cause is proven and the change stays inside a settled seam. +- **`ln-scope` or `ln-spec`** — the fix changes a seam, invariant, requirement, assumption, or frontier shape. +- **`ln-spike` or `ln-design`** — diagnosis answered one question but the fix shape remains uncertain. +- **`ln-review` / `ln-refactor`** — no correct regression seam exists, or architecture contributed to the bug. -If no correct seam exists, that is an architectural finding. Record it and route to `ln-review` or `ln-refactor` after the immediate fix decision. +Install the regression oracle before the fix when a correct seam exists. A correct seam reproduces the real bug pattern as it occurs at the call site. Shallow tests that cannot fail for the original bug are false confidence. -## Phase 6 — Cleanup and postmortem +## 6. Cleanup and postmortem Before declaring done: -- [ ] original repro loop no longer reproduces the bug, or the non-repro diagnosis is explicit -- [ ] regression test exists and passes, or absence of a correct seam is documented +- [ ] original repro loop no longer reproduces the bug, or non-repro is the diagnosis +- [ ] regression oracle exists and passes, or absence of a correct seam is documented - [ ] all `[DEBUG-...]` instrumentation is removed -- [ ] throwaway harnesses are deleted or clearly marked and still needed -- [ ] causal hypothesis is stated in the final report / commit message +- [ ] throwaway harnesses are deleted or visibly temporary +- [ ] confirmed causal hypothesis is stated in the report / commit message -Ask: what would have prevented this bug? If the answer is a missing invariant, unclear seam, weak oracle, or bad module shape, route it into the appropriate `ln-*` skill rather than burying it in the diagnosis. +Ask: what would have prevented this bug? Route missing invariants, unclear seams, weak oracles, and bad module shapes into the appropriate `ln-*` skill. 
## Canonical reconciliation -After diagnosis, reconcile only durable truth: +Reconcile only durable truth: - New/retired assumption → update `memory/SPEC.md` §Assumptions. -- New seam-level invariant or oracle gap → update `memory/SPEC.md` and/or route to `ln-oracles`. -- Frontier status changed because the bug blocks/unblocks work → update `memory/PLAN.md`. -- Pure local bug with no durable design implication → no canonical update required beyond any tracked PLAN status. +- New seam-level invariant or oracle gap → update `memory/SPEC.md` or route to `ln-oracles`. +- Frontier status changed → update `memory/PLAN.md`. +- Local bug with no durable implication → no canonical update beyond tracked PLAN status. -Do not create `CONTEXT.md`, ADRs, or alternate planning documents. This project's canonical docs are `memory/SPEC.md` and `memory/PLAN.md`. +Do not create `CONTEXT.md`, ADRs, or alternate planning docs. Canonical docs are `memory/SPEC.md` and `memory/PLAN.md`. ## Output ```md ## Diagnosis: [symptom] -**Feedback loop:** [command/script/test and reproduction rate] +**Repro loop:** [command/script/test and reproduction rate] **Confirmed cause:** [one sentence] **Evidence:** [key observations] **Fix route:** [direct fix | ln-scope | ln-build | ln-spike | ln-review | ln-refactor] @@ -135,15 +123,15 @@ Do not create `CONTEXT.md`, ADRs, or alternate planning documents. 
This project' After diagnosis, present these options to the user (use `tool-ask-question`): -| # | Label | Target | Why | -| --- | ---------------- | ------------- | --- | -| 1 | Scope the fix | `ln-scope` | The fix needs a buildable card or durable seam update | -| 2 | Build the fix | `ln-build` | The fix is settled and ready for red-green-refactor | -| 3 | Spike deeper | `ln-spike` | A hard question remains after reproduction | -| 4 | Review structure | `ln-review` | No good seam/regression oracle exists or architecture contributed | -| 5 | Back to triage | `ln-consult` | Diagnosis changed priority or scope | +| # | Label | Target | Why | +| --- | ---------------- | ------------ | --- | +| 1 | Scope the fix | `ln-scope` | The fix needs a buildable card or durable seam update | +| 2 | Build the fix | `ln-build` | The fix is settled and ready for red-green-refactor | +| 3 | Spike deeper | `ln-spike` | A hard question remains after reproduction | +| 4 | Review structure | `ln-review` | No good seam/regression oracle exists or architecture contributed | +| 5 | Back to triage | `ln-consult` | Diagnosis changed priority or scope | -Recommended: **2** only when the cause and seam are proven; otherwise **1**. +Recommended: **2** only when cause and seam are proven; otherwise **1**. --- -*Adapted from [mattpocock/skills/engineering/diagnose](https://github.com/mattpocock/skills/tree/main/skills/engineering/diagnose).* +*Adapted from [mattpocock/skills/engineering/diagnose](https://github.com/mattpocock/skills/tree/main/skills/engineering/diagnose).* diff --git a/.agents/skills/ln-prototype/SKILL.md b/.agents/skills/ln-prototype/SKILL.md index 0b1e561c..a6ce915d 100644 --- a/.agents/skills/ln-prototype/SKILL.md +++ b/.agents/skills/ln-prototype/SKILL.md @@ -1,12 +1,14 @@ --- name: ln-prototype -description: "Build a clearly throwaway prototype to answer a design question before committing to production work. 
Use when the user wants to prototype, sanity-check a state model, try a few UI designs, make something playable, or explore logic/UI affordances before ln-spec/ln-plan/ln-scope." +description: "Throwaway design probe for logic, state models, UI variations, and affordances before production work. Use when the user wants to prototype, sanity-check a model, make something playable, compare UI directions, or explore a design before ln-spec/ln-plan/ln-scope." argument-hint: "[prototype question or design uncertainty]" --- # Ln Prototype -A prototype is throwaway code that answers one question. The question determines the artifact. Do not let prototype code silently become production code. +A prototype is a disposable answer to one design question. Keep the verdict, not the artifact. + +Use `ln-prototype` when the question needs feel, play, or comparison. Use `ln-spike` when the question is technical feasibility or unknown API behavior. ## Input @@ -14,64 +16,47 @@ Prototype question or design uncertainty: $ARGUMENTS Orient first: -1. Read `memory/SPEC.md` if present and use its lexicon / live invariants. -2. Read `memory/PLAN.md` if present and identify whether the prototype serves an existing frontier item. -3. Read `HANDOFF.md` if present for volatile design context. +1. Read `memory/SPEC.md` if present; use its lexicon and live invariants. +2. Read `memory/PLAN.md` if present; identify whether the prototype serves an existing frontier item. +3. Read `HANDOFF.md` if present. 4. Inspect nearby code only enough to place the prototype where it is understandable and runnable. -Write a 2-4 bullet orientation note naming the question, prototype branch, nearest seam/page/module, and how the answer will be captured. +Write a 2-4 bullet orientation note: question, prototype branch, nearest seam/page/module, answer-capture path. -## Choose the branch +## Choose one branch -Pick exactly one branch. Ask the user if ambiguous and they are present; otherwise state the assumption. 
+Ask if ambiguous and the user is present; otherwise state the assumption. ### Logic prototype -Use when the question is: - -- Does this state model feel right? -- Which transitions/actions are legal? -- Does this reducer, parser, planner, or workflow rule behave coherently across examples? -- Can a human play through edge cases faster than reading a spec? +Use for state, transition, reducer, parser, planner, or workflow questions. Build a tiny interactive terminal app or CLI harness around a portable logic module. -Build a tiny interactive terminal app or CLI harness around a portable logic module. - -Prefer one of these shapes: +Good shapes: - pure reducer: `(state, action) => state` -- explicit state machine with named states and transitions -- small set of pure functions over plain data -- state-owning module/class only when ongoing internal state is the question +- explicit state machine with named states and legal transitions +- small pure functions over plain data +- state-owning module/class only when internal ongoing state is the question -Keep the shell thin. The logic should not know about prompts, terminal escape codes, stdout, or UI widgets. +Keep the shell thin. The logic must not know about prompts, terminal escape codes, stdout, or UI widgets. ### UI prototype -Use when the question is: - -- What should this look or feel like? -- Which layout/interaction pattern communicates the concept? -- How should a user navigate, compare, approve, recover, or inspect? - -Generate several meaningfully different variants in one local route/page/screen, switchable by URL search param or a small floating switcher. Prefer adapting an existing page/route over inventing a new top-level playground. - -Variants should differ in concept, not just color. Name each variant by its design bet. - -## Rules for both branches +Use for layout, interaction, navigation, approval/recovery, inspection, or comparison questions. -1. 
**Throwaway from day one.** Name files/routes with `prototype`, `scratch`, or equivalent. Add a short comment at the entry point: `PROTOTYPE — delete or absorb after verdict`. -2. **Place it near the real seam.** Keep context obvious, but do not pollute public exports unless needed to run it. -3. **One command to run.** Use the repo's task runner and document the exact command in the final report or `HANDOFF.md`. -4. **No persistence by default.** Use memory. If persistence is the question, use scratch storage clearly marked as wipeable. -5. **Skip production polish.** No comprehensive tests, error handling, abstractions, analytics, or accessibility hardening beyond what is needed to evaluate the question safely. -6. **Surface state.** After every logic action or UI variant switch, show the relevant state/inputs/outputs so the design can be judged. -7. **Do not widen scope.** A prototype answers one question; new questions become follow-up prototypes, spikes, or scope cards. +Generate several meaningfully different variants in one local route/page/screen, switchable by URL search param or floating switcher. Prefer adapting an existing page over inventing a playground. Variants should differ by design bet, not skin: name the bet each variant tests. -## Capture the answer +## Prototype discipline -The answer is the only durable artifact. When the prototype has served its purpose, either delete it or explicitly keep it only as live volatile support. +1. **Throwaway from day one.** Name files/routes with `prototype`, `scratch`, or equivalent. Add: `PROTOTYPE — delete or absorb after verdict`. +2. **Near the real seam.** Keep context obvious; avoid public exports unless needed to run it. +3. **One command to run.** Use the repo's task runner and record the exact command. +4. **No persistence by default.** Use memory. If persistence is the question, use clearly wipeable scratch storage. +5. 
**No production polish.** Skip comprehensive tests, abstractions, analytics, and hardening beyond safe evaluation. +6. **Surface state.** After each logic action or UI variant switch, show relevant inputs, outputs, and state. +7. **One question only.** New questions become follow-up prototypes, spikes, or scope cards. -Capture: +## Capture the verdict ```md ## Prototype Verdict: [question] @@ -84,22 +69,22 @@ Capture: **Delete:** [prototype files/routes/storage to remove] ``` -Durability rule: +Durability routing: -- Decision changes requirements, assumptions, invariants, or lexicon → route to `ln-spec`. -- Decision changes sequencing/frontier → route to `ln-plan`. -- Decision makes one implementation slice obvious → route to `ln-scope`. -- Prototype still needs human judgment later → record volatile state in `HANDOFF.md`. +- Requirements, assumptions, invariants, or lexicon changed → `ln-spec`. +- Sequencing or frontier changed → `ln-plan`. +- One implementation slice is now obvious → `ln-scope`. +- Human judgment remains pending → record volatile state in `HANDOFF.md`. -Do not create `CONTEXT.md`, ADRs, or alternate planning documents. This project's canonical docs are `memory/SPEC.md` and `memory/PLAN.md`. +Do not create `CONTEXT.md`, ADRs, or alternate planning docs. Canonical docs are `memory/SPEC.md` and `memory/PLAN.md`. ## Cleanup -Before finishing, state one of: +Finish by stating one of: - deleted prototype files -- kept prototype temporarily, with exact reason and deletion trigger -- absorbed prototype into production code through a scoped build +- kept prototype temporarily, with reason and deletion trigger +- absorbed prototype into production through a scoped build If prototype files remain, they must be visibly non-production and easy to find. @@ -118,4 +103,4 @@ After the verdict, present these options to the user (use `tool-ask-question`): Recommended: **3** when the prototype produced a concrete build direction; **1** when it changed the model. 
--- -*Adapted from [mattpocock/skills/engineering/prototype](https://github.com/mattpocock/skills/tree/main/skills/engineering/prototype).* +*Adapted from [mattpocock/skills/engineering/prototype](https://github.com/mattpocock/skills/tree/main/skills/engineering/prototype).* From c9a5aa426e16b00565a3a5b369060f3273bb40f2 Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Wed, 13 May 2026 14:35:47 +0200 Subject: [PATCH 37/42] refactor of the ln-plan skill and template + all skills that reference planning, for low-conflict --- .agents/skills/ln-build/SKILL.md | 15 +-- .agents/skills/ln-grill/SKILL.md | 2 +- .agents/skills/ln-handoff/SKILL.md | 2 +- .agents/skills/ln-oracles/SKILL.md | 21 ++-- .agents/skills/ln-plan/SKILL.md | 79 ++++++++---- .../skills/ln-plan/assets/plan-template.md | 57 ++++++--- .agents/skills/ln-review/SKILL.md | 2 +- .agents/skills/ln-scope/SKILL.md | 8 +- .agents/skills/ln-spec/SKILL.md | 10 +- .../skills/ln-spec/assets/spec-template.md | 20 ++-- .agents/skills/ln-spike/SKILL.md | 14 +-- .agents/skills/ln-sync/SKILL.md | 24 ++-- .agents/skills/planning-pr/SKILL.md | 113 ++++++++++-------- AGENTS.md | 6 +- docs/praxis/graphite-workflow.md | 10 +- docs/praxis/worktree-agents.md | 6 +- 16 files changed, 235 insertions(+), 154 deletions(-) diff --git a/.agents/skills/ln-build/SKILL.md b/.agents/skills/ln-build/SKILL.md index e1422cde..510cc8bf 100644 --- a/.agents/skills/ln-build/SKILL.md +++ b/.agents/skills/ln-build/SKILL.md @@ -14,7 +14,7 @@ A full or light scope card from `ln-scope`, the next ready card in `memory/CARDS Extract: target behavior / objective, acceptance criteria, and verification approach. -Treat the scope card as the next implementation step inside its containing `memory/PLAN.md` frontier item. The frontier item is the plan-level work item; the scope card is just the current execution step inside it. 
Unless `ln-plan` has already split the frontier into separate items, do **not** infer a new Linear issue or Graphite branch from scope-card granularity; multiple consecutive scope cards may land on the same branch. +Treat the scope card as the next implementation slice inside its containing `memory/PLAN.md` frontier item. The frontier item is the plan-level work item and Linear/branch unit; the scope-card slice is just the current execution step inside it. Unless `ln-plan` has already split the frontier into separate items, do **not** infer a new Linear issue or Graphite branch from scope-card granularity; multiple consecutive slices may land on the same branch. If `memory/CARDS.md` exists, treat it as a derivative execution queue, not canonical planning state. Start with the next card marked `next` or the first unfinished card in that file. If that card is already satisfied on the current branch, do **not** manufacture a no-op build commit; verify the acceptance criteria, mark the card `done` or `dropped` as appropriate, reconcile the queue, and either continue to the next honest build target or route back to `ln-scope` if no build remains. @@ -35,7 +35,7 @@ Do not invent new planning docs, scratch histories, or alternate memory location ## Serial execution mode -When several prepared cards already exist for one settled frontier item, `ln-build` may execute them in sequence instead of routing back through the user after every commit. +When several prepared slice cards already exist for one settled frontier item, `ln-build` may execute them in sequence instead of routing back through the user after every commit. 
Loop shape: @@ -99,10 +99,10 @@ After the build lands and verification passes, ask: ### If all answers are no -- Mark the work done in `memory/PLAN.md` **if it was tracked there** +- Mark the containing frontier done in `memory/PLAN.md` **if the build completed the frontier item**, usually by updating `Sequencing` / frontier status rather than moving definition blocks - Update `Recently Completed` if the plan uses it -- Do **not** add new SPEC/PLAN bookkeeping just because work happened -- If the work was non-trivial, required manual verification, or leaves residual risk, record `Done / Verified / Watch` in `memory/PLAN.md` `Recently Completed` when that watch matters beyond the current session +- Do **not** add new SPEC/PLAN bookkeeping just because a slice happened +- If the slice was non-trivial, required manual verification, or leaves residual risk that matters beyond the current session, record it in the containing frontier definition or a terse `Recently Completed` entry only when it affects frontier-level re-entry ### If any answer is yes @@ -117,8 +117,9 @@ Update only the touched traceability items. #### Update rules 1. **PLAN** - - Mark the item done if it was tracked - - If the change closes or unblocks a frontier item, reflect that in `Active`, `Next`, or `Recently Completed` + - Mark the frontier item done if this slice completed it + - If the change closes, blocks, or unblocks a frontier item, reflect that in `Sequencing`, the affected `Frontier Definitions` entry, or `Recently Completed` + - Do not mirror detailed slice/card history into `memory/PLAN.md`; keep active execution queues in `memory/CARDS.md` 2. 
**Assumptions** - evidence answered it → update to `validated` or `invalidated` diff --git a/.agents/skills/ln-grill/SKILL.md b/.agents/skills/ln-grill/SKILL.md index 0c17ce77..593a5b35 100644 --- a/.agents/skills/ln-grill/SKILL.md +++ b/.agents/skills/ln-grill/SKILL.md @@ -26,7 +26,7 @@ When understanding is reached, present these options to the user (use `tool-ask- | # | Label | Target | Why | | --- | --------------- | ---------- | --------------------------------------- | | 1 | Write a spec | `ln-spec` | Understanding is sufficient for a spec | -| 2 | Plan slices | `ln-plan` | Problem is clear, needs slice breakdown | +| 2 | Plan frontier | `ln-plan` | Problem is clear, needs frontier breakdown | | 3 | Scope one slice | `ln-scope` | One slice is already obvious | Recommended: choose based on how much structure the understanding needs. diff --git a/.agents/skills/ln-handoff/SKILL.md b/.agents/skills/ln-handoff/SKILL.md index 0c5c3db1..f808b8a5 100644 --- a/.agents/skills/ln-handoff/SKILL.md +++ b/.agents/skills/ln-handoff/SKILL.md @@ -38,7 +38,7 @@ This is the critical step. 
Scan the conversation for volatile artifacts — info - **Queued scope cards** already persisted in `memory/CARDS.md` — capture only what is still volatile about them: which card is next, whether the queue is still valid, and any card-level corrections that have not been written back yet - **Plan drafts** from `ln-plan` — slice lists, ordering decisions, dependency reasoning not yet in `memory/PLAN.md` - **Design outputs** from `ln-design` — alternative module shapes considered, the chosen shape, and rejected tradeoffs -- **Oracle design outputs** from `ln-oracles` — O/R/C assessment, selected oracle families, per-slice verification approaches, acknowledged blind spots, and whether slice verification design is complete / pending / stale relative to the code +- **Oracle design outputs** from `ln-oracles` — O/R/C assessment, selected oracle families, per-frontier or per-slice verification approaches, acknowledged blind spots, and whether verification design is complete / pending / stale relative to the code - **Spike state** from `ln-spike` — the question, what was tried, partial findings, verdict if reached - **Review findings** from `ln-review` — **ALL findings, not just the one being acted on.** Review debt is critical context. Name every finding, its status (addressed / in-progress / deferred), and any remaining implications. A fresh thread that only knows about the active finding will lose track of deferred review debt. - **Refactor state** from `ln-refactor` — commit sequence, target structure, and any constraints on safe ordering diff --git a/.agents/skills/ln-oracles/SKILL.md b/.agents/skills/ln-oracles/SKILL.md index c636a97e..121e7faf 100644 --- a/.agents/skills/ln-oracles/SKILL.md +++ b/.agents/skills/ln-oracles/SKILL.md @@ -1,7 +1,7 @@ --- name: ln-oracles -description: "Design verification strategy: diagnose observability, select oracle families, map to loop tiers, surface blind spots. 
Use after ln-plan when slices need oracle design — especially for LLM, visual, or compositional work — or when verification coverage has drifted." -argument-hint: "[slices to design oracles for, or 'all' for full reassessment]" +description: "Design verification strategy: diagnose observability, select oracle families, map to loop tiers, surface blind spots. Use after ln-plan when frontier items or scoped slices need oracle design — especially for LLM, visual, or compositional work — or when verification coverage has drifted." +argument-hint: "[frontier items or scoped slices to design oracles for, or 'all' for full reassessment]" --- # Ln Oracles @@ -18,9 +18,9 @@ Read the [diagnostic framework](assets/diagnostic-framework.md) and [oracle taxo ## Input -The slices to design oracles for: $ARGUMENTS +The frontier items or scoped slices to design oracles for: $ARGUMENTS -Read `memory/SPEC.md` (invariants, assumptions, decisions, verification design) and `memory/PLAN.md` (slices, acceptance criteria). If `memory/SPEC.md` already has a §Verification Design section, this is an update -- read it as prior state to evolve, not preserve uncritically. +Read `memory/SPEC.md` (invariants, assumptions, decisions, verification design) and `memory/PLAN.md` (frontier definitions, sequencing, acceptance criteria). If `memory/SPEC.md` already has a §Verification Design section, this is an update -- read it as prior state to evolve, not preserve uncritically. ## Procedure @@ -34,7 +34,7 @@ Score **Observability**, **Reproducibility**, and **Controllability** (see the [ ### 2. Extract verification claims -From `memory/SPEC.md` invariant bundles, acceptance criteria, and `memory/PLAN.md` slice definitions -- list what must be proved. Distinguish: +From `memory/SPEC.md` invariant bundles, acceptance criteria, `memory/PLAN.md` frontier definitions, and any in-hand scope-card slices -- list what must be proved. 
Distinguish: - **Structural claims** (schema conformance, DB round-trips, type safety) -- oracle-able programmatically - **Behavioral claims** (LLM output quality, UX judgment) -- require human assessment or statistical thresholds @@ -56,9 +56,9 @@ Assign each selected oracle to inner (ms, agent-autonomous), middle (seconds-min **Grill**: For middle-loop oracles that require external resources (API calls, fixtures), ask: how will fixtures be created? What bootstraps ground truth? Is single-shot measurement sufficient or do we need multi-run variance? -### 5. Design per-slice verification approach +### 5. Design per-frontier / per-slice verification approach -For each in-scope slice in `memory/PLAN.md`, specify: which oracles apply, what they prove, and which loop tier they belong to. This becomes the `**Verification approach**` annotation on each slice. +For each in-scope frontier item in `memory/PLAN.md`, specify: which oracles apply, what they prove, and which loop tier they belong to. This becomes the `Verification` annotation in the frontier definition. If a scope-card slice is already available, add slice-level oracle notes there without promoting detailed card history into `memory/PLAN.md`. **Grill**: For each slice, ask: does this oracle strategy cover the slice's acceptance criteria? What's the gap between "oracle says pass" and "slice is actually correct"? @@ -78,15 +78,16 @@ Update `memory/SPEC.md` §Verification Design: - **Design notes** -- project-specific oracle design decisions (e.g. 
observer history projection, fixture bootstrapping strategy) - **Acknowledged Blind Spots** -- table with blind spot, reason, mitigation, and revisit trigger -Update `memory/PLAN.md` per-slice annotations: +Update `memory/PLAN.md` frontier annotations: -- Add `**Verification approach**` line to each in-scope slice with oracle family, loop tier, and cross-reference to `memory/SPEC.md` sections +- Add or refresh the `Verification` line in each in-scope frontier definition with oracle family, loop tier, and cross-reference to `memory/SPEC.md` sections +- Keep slice-level oracle detail in the current `ln-scope` card or `memory/CARDS.md` queue unless it changes the frontier definition ### Cross-reference integrity After writing, verify: - Every `memory/SPEC.md` invariant has at least one oracle assigned (inner, middle, or outer) -- Every in-scope `memory/PLAN.md` slice has a verification approach annotation +- Every in-scope `memory/PLAN.md` frontier definition has a verification approach annotation - The blind spots section is non-empty - Middle/outer loop oracles cross-reference the invariants or assumptions they prove diff --git a/.agents/skills/ln-plan/SKILL.md b/.agents/skills/ln-plan/SKILL.md index 6f645d66..93dd23a9 100644 --- a/.agents/skills/ln-plan/SKILL.md +++ b/.agents/skills/ln-plan/SKILL.md @@ -1,6 +1,6 @@ --- name: ln-plan -description: "Break a feature or project area into frontier-ordered slices and update `memory/PLAN.md`. Re-run to retire completed work, reorder priorities, or add new items." +description: "Break a feature or project area into frontier items and update `memory/PLAN.md`. Re-run to retire completed work, reorder priorities, or add new items." argument-hint: "[feature or project area to plan]" --- @@ -10,17 +10,34 @@ Plan the **rolling frontier**, not the whole historical timeline. `memory/PLAN.md` is the canonical record of what's next. `docs/archive/PLAN_HISTORY.md` is the only sanctioned archive for retired plan history. 
`memory/CARDS.md` is the sanctioned derivative queue for multiple prepared scope cards inside one frontier item; it is not canonical planning state. Do not invent other sidecar plan docs, milestone ledgers, or alternate memory locations without explicit permission. -The mature-mode shape is: +## Frontier vs slice vocabulary -- `Active` — ordered work that is open now -- `Next` — near-horizon items, loosely ordered +Use **frontier item** for a named canonical work item in `memory/PLAN.md`. Frontier items are the unit of Linear issue / Graphite branch work and should be vertical enough to establish or unlock a meaningful product or architecture step. + +Use **slice** for the buildable scope card produced by `ln-scope` and implemented by `ln-build`. A slice is often a sub-unit of one frontier item. Several slices may land on the same frontier branch. Do not turn slices into separate PLAN entries unless the frontier itself changes shape, ownership, or dependency ordering. + +The vertical-slicing instinct still applies at planning time: frontier items should cut through the relevant concerns of `memory/SPEC.md` instead of becoming layer-by-layer chores. The term "frontier" names their canonical/branch role; the term "slice" remains reserved for scoped execution. 
+ +## Plan document shape + +Prefer the conflict-resistant mature shape: + +- `Context` — short rolling narrative for re-entry +- `Sequencing` — small, frequently edited ordering/status references by stable frontier id +- `Frontier Definitions` — relatively stable per-frontier definitions keyed by stable id +- `Recently Completed` — last 2-3 completed frontier items only +- `Dependencies` — active / next blocking relationships by stable id only + +Within `Sequencing`, use: + +- `Active` — ordered frontier items open now +- `Next` — near-horizon frontier items, loosely ordered +- `Parallel / Low-conflict` — useful work that can proceed without disturbing the main stack - `Horizon` — future work, lightly shaped -- `Recently Completed` — last 2-3 completed items only -- `Dependencies` — active / next blocking relationships only Archive deeper history to `docs/archive/PLAN_HISTORY.md` instead of keeping it live in `memory/PLAN.md`. -Treat frontier items as branch-sized work, not commit-sized work. If one frontier item will unfold as several consecutive verified commits, keep that execution queue in `memory/CARDS.md` or in session context instead of fragmenting `memory/PLAN.md` into a commit ledger. +Treat frontier items as branch-sized work, not commit-sized work. If one frontier item will unfold as several consecutive verified slices, keep that execution queue in `memory/CARDS.md` or in session context instead of fragmenting `memory/PLAN.md` into a commit ledger. `memory/PLAN.md` may carry at most a lightweight pointer such as `current card queue: memory/CARDS.md`; detailed discretionary sub-slicing belongs in `memory/CARDS.md`. ## Input @@ -32,52 +49,74 @@ If this is a fresh thread or the frontier rationale is unclear, read `HANDOFF.md ## Planning rules +### Stable frontier ids + +Every frontier definition should have a stable lowercase id / slug. Good ids are short and semantic, e.g. `agent-fixture-substrate`, `intent-graph-semantics`, `changeset-ledger`. 
+ +Rules: + +- `Sequencing` references frontier ids; it does not duplicate definition blocks. +- `Frontier Definitions` are keyed by frontier id and should not move just because ordering changes. +- Rename a frontier id only when the identity of the work changed, not because the title improved. +- Linear issue ids belong in the definition metadata when known; they are not the only stable id. + ### Work-type awareness -Classify each item before deciding how much planning weight it needs. +Classify each frontier item before deciding how much planning weight it needs. | Work type | Planning weight | | --- | --- | -| Structural | full slice with `memory/SPEC.md` traceability | -| Bounded feature | objective + acceptance + verification for a slice; add `memory/SPEC.md` links only if durable boundaries change | +| Structural | full frontier definition with `memory/SPEC.md` traceability | +| Bounded feature | objective + acceptance + verification; add `memory/SPEC.md` links only if durable boundaries change | | Hardening | task-level objective + acceptance | | Bugfix | usually do not add to `memory/PLAN.md` unless it changes frontier priority | | Refactor | route through `ln-refactor` unless it is itself frontier work | ### Anti-fragmentation -Create a new item only when it introduces at least one of: +Create a new frontier item only when it introduces at least one of: 1. a new lifecycle seam 2. a new transport or persistence seam 3. a new workflow entry / exit behavior 4. a meaningful unblocker for forward progress +5. a distinct dependency / branch boundary that should be tracked independently Do not fragment the plan for minor action/status variants or ordinary follow-through inside a settled seam. Do not split one frontier item into several new PLAN entries just because execution will require several scope cards or commits. Only split when the frontier itself changes shape, ownership, or dependency ordering. 
+### Sequencing vs definition edits + +When priorities change, edit `Sequencing` first. Do not move or rewrite frontier definitions merely to reorder work. + +When the meaning, acceptance, verification, traceability, or design-doc references of a frontier change, edit its `Frontier Definitions` entry. + +When a frontier completes, remove it from `Sequencing`, add a terse `Recently Completed` entry, and archive older completion history if needed. Keep the definition only if it still carries live rationale for nearby work; otherwise archive/retire it. + ### Epistemic horizon -If live low-confidence assumptions block downstream work, stop the plan at that boundary. Plan spikes or thinner proving steps, not fantasy certainty. +If live low-confidence assumptions block downstream work, stop the plan at that boundary. Plan spikes or thinner proving frontier items, not fantasy certainty. ## Procedure -1. Read `memory/PLAN.md` if it exists. Retire or archive stale completed material into `docs/archive/PLAN_HISTORY.md`. +1. Read `memory/PLAN.md` if it exists. Identify existing frontier ids and retire/archive stale completed material into `docs/archive/PLAN_HISTORY.md`. 2. Read `memory/SPEC.md` if it exists. Pull only the live requirements, assumptions, decisions, and invariants that still constrain forward work. 3. Explore the codebase enough to understand real boundaries. -4. Draft or revise `Active`, `Next`, and `Horizon`. -5. Add `Why now / unlocks` for `Active` or `Next` items when ordering would otherwise be opaque to a fresh thread. -6. Keep `Recently Completed` to 2-3 terse items max. Move older history to `docs/archive/PLAN_HISTORY.md`, not to handoff files or ad hoc notes. -7. Update `Dependencies` to reflect only active / next items. -8. If several commit-sized execution steps are already obvious inside one frontier item, keep them out of `memory/PLAN.md`; they belong in `memory/CARDS.md` or in the active thread as derivative execution detail. +4.
Draft or revise `Sequencing` (`Active`, `Next`, `Parallel / Low-conflict`, `Horizon`) by stable frontier id. +5. Draft or revise `Frontier Definitions` only for new or substantively changed frontier items. +6. Add `Why now / unlocks` in a frontier definition when ordering would otherwise be opaque to a fresh thread. +7. Keep `Recently Completed` to 2-3 terse items max. Move older history to `docs/archive/PLAN_HISTORY.md`, not to handoff files or ad hoc notes. +8. Update `Dependencies` to reflect only active / next items, by frontier id. +9. If several commit-sized execution steps are already obvious inside one frontier item, keep them out of `memory/PLAN.md`; they belong in `memory/CARDS.md` or in the active thread as derivative execution detail. ## Traceability Traceability is conditional on structural significance. -- Structural items should name relevant requirements, assumptions, decisions, or invariants from `memory/SPEC.md`. +- Structural frontier items should name relevant requirements, assumptions, decisions, or invariants from `memory/SPEC.md`. - Bounded features and hardening tasks only need SPEC links if they change durable boundaries or depend on a live assumption. +- Scope-card slices inherit traceability from their containing frontier unless `ln-scope` discovers a durable change that must be promoted back into SPEC/PLAN.
## Output @@ -89,7 +128,7 @@ After writing the plan, present these options to the user (use `tool-ask-questio | # | Label | Target | Why | | --- | ----------------- | ------------ | --- | -| 1 | Scope next item | `ln-scope` | The frontier is clear and ready to scope | +| 1 | Scope next slice | `ln-scope` | The frontier is clear and ready to scope | | 2 | Design oracles | `ln-oracles` | Verification design needs explicit work | | 3 | Grill it more | `ln-grill` | Planning surfaced unresolved product questions | | 4 | Back to triage | `ln-consult` | Direction needs reassessment | diff --git a/.agents/skills/ln-plan/assets/plan-template.md b/.agents/skills/ln-plan/assets/plan-template.md index b97405a3..e5083836 100644 --- a/.agents/skills/ln-plan/assets/plan-template.md +++ b/.agents/skills/ln-plan/assets/plan-template.md @@ -2,39 +2,62 @@ Created by ln-plan · Read by all skills · Updated by ln-build, ln-sync, and ln-spike. Authority: active frontier, near-horizon ordering, and dependencies that still matter. + Frontier item = canonical plan/Linear/branch unit. + Slice = scoped execution unit from ln-scope/ln-build, often inside one frontier. + Keep this file light. Archive older completed work to docs/archive/PLAN_HISTORY.md. - Only Active / Next items should usually carry detailed traceability. + Edit Sequencing for ordering/status churn; keep Frontier Definitions relatively stable. Do not spread retired work history across handoff files, refactor plans, or ad hoc status notes. --> # Plan -## Active +## Context + +[Short rolling narrative for fresh-thread re-entry: where the product/initiative stands, which arc is active, and what the next coordination bottleneck is.] + +## Sequencing + +### Active + +1. `[frontier-id]` — [status: not-started|in-progress|branch-complete|blocked] — [one-line current state] -1. 
**[Item name]** — [structural | bounded feature | hardening | bugfix] `[status: not-started|in-progress]` - - Objective: [what this work changes] - - Why now / unlocks: [why this is on the frontier now] - - Acceptance: [observable outcome] - - Verification: [inner / middle / outer summary] - - Traceability: [→ SPEC.md requirement / assumption / decision / invariant if needed] +### Next -## Next +1. `[frontier-id]` — [why it follows the active work] -1. **[Item name]** — [why it follows the active work] - - Why now / unlocks: [what this prepares or depends on] +### Parallel / Low-conflict -## Horizon +- `[frontier-id]` — [why it can proceed independently] -- [Future item, intentionally loose] +### Horizon + +- `[frontier-id]` — [future item, intentionally loose] + +## Frontier Definitions + +### frontier-id + +- **Name:** [Human-readable frontier name] +- **Linear:** [FE-XXX if known, or `unassigned`] +- **Kind:** [structural | bounded feature | hardening | bugfix | refactor] +- **Status:** [not-started | in-progress | branch-complete | blocked | done] +- **Objective:** [what this frontier changes] +- **Why now / unlocks:** [why this belongs on the frontier and what it unlocks] +- **Acceptance:** [observable frontier-level outcome] +- **Verification:** [inner / middle / outer summary] +- **Traceability:** [→ SPEC.md requirement / assumption / decision / invariant if needed] +- **Design docs:** [links if relevant] +- **Current execution pointer:** [optional: `memory/CARDS.md` or next intended scope card; omit when not needed] ## Recently Completed -- [YYYY-MM-DD] [item] — Done: [shipped outcome]. Verified: [command / manual step]. Watch: [residual risk or none]. -- [YYYY-MM-DD] [item] — Done: [shipped outcome]. Verified: [command / manual step]. Watch: [residual risk or none]. +- [YYYY-MM-DD] `[frontier-id]` — Done: [shipped outcome]. Verified: [command / manual step]. Watch: [residual risk or none]. +- [YYYY-MM-DD] `[frontier-id]` — Done: [shipped outcome]. 
Verified: [command / manual step]. Watch: [residual risk or none]. Older history: `docs/archive/PLAN_HISTORY.md` ## Dependencies -``` -[ASCII diagram of blocking relationships among Active / Next items] +```text +[ASCII diagram of blocking relationships among Active / Next frontier ids] ``` diff --git a/.agents/skills/ln-review/SKILL.md b/.agents/skills/ln-review/SKILL.md index f43b57c1..d27e45b8 100644 --- a/.agents/skills/ln-review/SKILL.md +++ b/.agents/skills/ln-review/SKILL.md @@ -30,7 +30,7 @@ Make invalid states unrepresentable (Yaron Minsky). Split optional fields into d ### Oracle coverage -If `memory/SPEC.md` §Oracle Strategy by Loop Tier exists, check whether recent slices implemented the oracles their persisted `memory/PLAN.md` verification approaches declare. If a full or light scope card is available in session context, use it as a higher-resolution supplement, not the primary source of truth. Look for: +If `memory/SPEC.md` §Oracle Strategy by Loop Tier exists, check whether recent work implemented the oracles declared by the relevant `memory/PLAN.md` frontier definition. If a full or light scope card is available in session context, use it as a higher-resolution slice supplement, not the primary source of truth. Look for: - Scope card promised schema validation → is there a Zod parse in the test? - Scope card promised differential oracle → are there golden master fixtures? diff --git a/.agents/skills/ln-scope/SKILL.md b/.agents/skills/ln-scope/SKILL.md index 753b342d..e09d351c 100644 --- a/.agents/skills/ln-scope/SKILL.md +++ b/.agents/skills/ln-scope/SKILL.md @@ -21,9 +21,9 @@ Orient before weighting. If `memory/SPEC.md` exists, use its lexicon and respect its live invariants. -If `memory/PLAN.md` exists, check whether the named work is already in `Active`, `Next`, or `Horizon`. 
+If `memory/PLAN.md` exists, check whether the named work is already represented as a frontier item in `Sequencing` (`Active`, `Next`, `Parallel / Low-conflict`, or `Horizon`) and `Frontier Definitions`. -Treat the containing `memory/PLAN.md` frontier item as the Linear-issue / branch boundary. Here, a frontier item means the plan-level work item itself, not the scope card you are about to write. Your scope card may narrow that frontier item into the next buildable sub-slice, but scope-card granularity alone does **not** imply a new issue or branch. Only route to `ln-plan` for new plan items when the frontier itself must be split or reordered. +Treat the containing `memory/PLAN.md` frontier item as the Linear-issue / branch boundary. Here, a frontier item means the canonical plan item, preferably keyed by a stable frontier id in `Frontier Definitions`, not the scope card you are about to write. Your scope card may narrow that frontier item into the next buildable slice, but scope-card granularity alone does **not** imply a new issue or branch. Only route to `ln-plan` for new frontier items when the frontier itself must be split or reordered. If this is a fresh thread or an unfamiliar area, also read `HANDOFF.md` if present. Read `docs/archive/PLAN_HISTORY.md` only if the frontier rationale or touched area is still unclear. @@ -31,7 +31,7 @@ Write a 2-4 bullet orientation note naming the containing seam, the relevant fro Do not create new planning documents or scratch scope files without explicit permission. The canonical planning state remains `memory/SPEC.md` and `memory/PLAN.md`. The sanctioned derivative exception is `memory/CARDS.md`, which may hold several prepared scope cards for one frontier item while that execution queue is still live. -If scoping reveals that one frontier item needs multiple sequential sub-slices, keep them nested under that same frontier item unless the plan-level frontier must change. 
Do not silently turn sub-slices into separate tracker / branch work items. +If scoping reveals that one frontier item needs multiple sequential slices, keep them nested under that same frontier item unless the plan-level frontier must change. Do not silently turn slices into separate tracker / branch work items. ## Prepared card queue @@ -159,7 +159,7 @@ Canonical reconciliation is **mandatory**; durable updates are **conditional**. - Full scope card: update `memory/SPEC.md` / `memory/PLAN.md` as needed during or after scoping. - Light scope card: run the promotion checklist explicitly. If it stays light, canonical reconciliation may be a no-op; if it promotes, reconcile the durable change before build. -- Multi-card queue: keep the queue itself in `memory/CARDS.md`, but do not mirror those queued cards into `memory/PLAN.md` unless the frontier item itself changes. +- Multi-card queue: keep the queue itself in `memory/CARDS.md`, but do not mirror those queued slice cards into `memory/PLAN.md` unless the frontier item itself changes. At most, add a lightweight `Current execution pointer` in the frontier definition. When adding or updating an assumption, apply the same-item test first: diff --git a/.agents/skills/ln-spec/SKILL.md b/.agents/skills/ln-spec/SKILL.md index 103912f9..3ef76204 100644 --- a/.agents/skills/ln-spec/SKILL.md +++ b/.agents/skills/ln-spec/SKILL.md @@ -32,11 +32,11 @@ Write or update `memory/SPEC.md` following the [spec template](assets/spec-templ ### Verification Design boundary -ln-spec owns the **inner loop** of verification design: verification commands, verification policy, and inner-loop oracle items (type checks, fast unit tests, linting). Middle and outer loop oracle strategy, diagnostic assessment, and blind spots are owned by `ln-oracles`. Not every slice requires a full oracle-design pass, but slices involving LLM behavior, visual rendering, or compositional/system-level claims should route through `ln-oracles` before implementation. 
When writing or updating §Verification Design, preserve any content written by ln-oracles (§Verification Stance, §Diagnostic Assessment, §Oracle Strategy middle/outer tiers, §Design notes, §Acknowledged Blind Spots). +ln-spec owns the **inner loop** of verification design: verification commands, verification policy, and inner-loop oracle items (type checks, fast unit tests, linting). Middle and outer loop oracle strategy, diagnostic assessment, and blind spots are owned by `ln-oracles`. Not every scoped slice requires a full oracle-design pass, but frontier items or slices involving LLM behavior, visual rendering, or compositional/system-level claims should route through `ln-oracles` before implementation. When writing or updating §Verification Design, preserve any content written by ln-oracles (§Verification Stance, §Diagnostic Assessment, §Oracle Strategy middle/outer tiers, §Design notes, §Acknowledged Blind Spots). ### Traceability -If `memory/PLAN.md` exists, verify that changed assumptions and decisions still align with affected slices. If it does not exist yet, close the reference chain as far as current artifacts allow: assumptions should still name dependent decisions and validation approaches, and slice links can be added later by `ln-plan`. +If `memory/PLAN.md` exists, verify that changed assumptions and decisions still align with affected frontier items. If it does not exist yet, close the reference chain as far as current artifacts allow: assumptions should still name dependent decisions and validation approaches, and frontier links can be added later by `ln-plan`. ### Weight management @@ -66,9 +66,9 @@ Large cleanup is `ln-sync` work. When writing or patching, keep the touched area Every amendment must close its reference chain as far as the current lifecycle stage allows. 
After editing, verify: -- **New assumption** → has: dependent decision(s), validation approach, and implicated slice(s) in `memory/PLAN.md` **if `memory/PLAN.md` already exists** +- **New assumption** → has: dependent decision(s), validation approach, and implicated frontier item(s) in `memory/PLAN.md` **if `memory/PLAN.md` already exists** - **New decision** → has: dependent assumption(s), supersession note -- **New invariant** → has: establishing slice in `memory/PLAN.md` **if known**, protecting test (or `manual (outer loop)`), proved decision +- **New invariant** → has: establishing frontier item in `memory/PLAN.md` **if known** (or scoped slice if already defined), protecting test (or `manual (outer loop)`), proved decision - **New constraint** → has: rationale for exclusion - **New inner-loop oracle item** → names the invariant(s) it protects @@ -78,7 +78,7 @@ After filing the spec, present these options to the user (use `tool-ask-question | # | Label | Target | Why | | --- | ---------------- | ------------- | ------------------------------------------------- | -| 1 | Plan slices | `ln-plan` | Spec is complete, break it into slices | +| 1 | Plan frontier | `ln-plan` | Spec is complete, break it into frontier items | | 2 | Design oracles | `ln-oracles` | Spec needs middle/outer loop verification design | | 3 | Grill it more | `ln-grill` | Spec has gaps that need deeper understanding | | 4 | Back to triage | `ln-consult` | Direction needs reassessment | diff --git a/.agents/skills/ln-spec/assets/spec-template.md b/.agents/skills/ln-spec/assets/spec-template.md index bd362240..cf142dd4 100644 --- a/.agents/skills/ln-spec/assets/spec-template.md +++ b/.agents/skills/ln-spec/assets/spec-template.md @@ -3,7 +3,7 @@ Authority: requirements, constraints, assumptions, decisions, invariants, domain language, verification strategy. When re-running ln-spec: read this file first, preserve existing content, evolve sections that need change. 
- Cross-referenced by PLAN.md slices and spikes via §-prefixed section links. + Cross-referenced by PLAN.md frontier items and scoped slices via §-prefixed section links. Together with PLAN.md, this is the only canonical planning state; do not create sidecar spec ledgers without explicit permission. --> # [Project Name] @@ -19,7 +19,7 @@ ## Requirements + Each numbered for cross-reference from PLAN.md frontier items / scoped slices. --> 1. [Requirement] 2. ... @@ -28,15 +28,15 @@ + When invalidated: record in §Decisions, flag implicated frontier items in PLAN.md. --> -| # | Assumption | Confidence | Status | Dependent decisions | Implicated slices | Validation approach | +| # | Assumption | Confidence | Status | Dependent decisions | Implicated frontier items | Validation approach | | --- | ------------ | --------------- | -------------------------- | ------------------- | ----------------- | ------------------- | -| A1 | [hypothesis] | low/medium/high | open/validated/invalidated | [→ §Decisions #N] | [→ PLAN.md slice] | [how to falsify] | +| A1 | [hypothesis] | low/medium/high | open/validated/invalidated | [→ §Decisions #N] | [→ PLAN.md frontier id] | [how to falsify] | ## Decisions @@ -52,7 +52,7 @@ Once established, must not regress. Each links to the decision it proves and the tests that protect it. Established by ln-build/ln-spike traceability. - Referenced by PLAN.md slices (to establish / to respect). --> + Referenced by PLAN.md frontier items or scoped slices (to establish / to respect). 
--> | # | Invariant | Established by | Protected by | Proves | | --- | -------------- | -------------- | ------------ | ----------------- | @@ -66,13 +66,13 @@ | Term | Definition | | --------------- | --------------------------------------------------------------------------------------------- | -| **assumption** | A falsifiable belief accepted as true; tracked with confidence and status, linked to decisions and slices | +| **assumption** | A falsifiable belief accepted as true; tracked with confidence and status, linked to decisions and frontier items / scoped slices | | **decision** | A recorded choice that resolves a question; ordered, with supersession chain | | **invariant** | A structural property proven by implementation and protected by tests; must not regress | | **requirement** | A capability the system must provide | | **slice** | A thin end-to-end tracer-bullet path through all integration layers | | **spike** | A time-boxed throwaway investigation to answer one hard question | -| **phase** | A temporal grouping of slices and spikes in PLAN.md | +| **phase** | A temporal grouping of frontier items / scoped slices and spikes in PLAN.md | | **[Term]** | [Definition] | ## Verification Design @@ -94,7 +94,7 @@ ### Verification Policy - + + -## Horizon - -### Semantic and generative follow-through - -- **Relation-first observer capture enrichment** — the first cut is shipped; enrichment waits for FE-700 relation policy so observer output can broaden across the refined ontology without flooding the graph. - - Depends on: intent graph semantics + relation-policy directionality; prompt/context substrate. - - Traceability: Requirements 30, 38, 40; A66, A81, A84; D125, D136, D137, D139, D140; I109. - -- **Architect / generator loop** — autonomous agent that iterates over the intent graph and proposes semantic changes for HITL review through the same future changeset / reconciliation pathway as user-driven edits. 
- - Status: related to scenario-options but broader. Keep productized architect proposals behind multi-chat + reconciliation + semantic changesets; use the scenario substrate for shadow/proposal-only probes first. - - Traceability: A73, A85, A87; D139, D141. - -- **Server mini-library compartmentalization** — refactor growing server seams into plural public roots with same-named private subtrees where FE-698 / FE-705 pressure has made boundaries too implicit. - - Status: near-term refactor candidate after FE-705 integration, not product roadmap work. - - Candidate shape: `fixtures.ts` + `fixtures/`, `context-packs.ts` + `context-packs/`, `prompts.ts` + `prompts/`, `scenario-runner.ts` + `scenario-runner/`, `entity-apis.ts` + route submodules, and `agent-apis.ts` + capability/protocol subtrees. - -### Side-chat follow-on - -- **Side-chat persistence — V4a (multi-chat Phase 2 substrate)** — side-chat client persists its turns into the existing `chat` / `turn` tables with `chat.kind='side_chat'`, loads prior side-chat sessions on remount, and surfaces an "Old chats" affordance per pinned item / spec. - - Status: deprioritized below continuous workspace and semantic/generative substrate. Phase 1 substrate already ships schema support; the remaining decision is the anchor model (`chat` row anchor fields vs deferred `chat_focus` table). - - Linear: FE-675 (umbrella; V4a half). - - Traceability: Requirement 39; A82, A83; D138. - - Design docs: `docs/design/MULTI_CHAT.md` §10 Phase 2; `docs/design/SIDE_CHAT.md` §9 V4 row. - -- **Side-chat V4b — item versioning + branched exploration** — once the changeset ledger lands, item versioning unblocks dangling-annotation repair and soft-edit audit; branched exploration lets drill-downs / past-turn edits / revisits coexist with the original chain. - - Depends on: semantic changeset ledger; V4a side-chat persistence. - - Traceability: A72, A73, A85; D139, D141. 
- -### Lower-priority / unclear product surface +# Plan -- **Dashboard result summaries and completeness metrics** — progress visibility across specifications. -- **Spatial canvas layout for graph view** — add the spatial DAG layout as a second layout choice inside graph mode, alongside the structured-list route. -- **Graph view active-path render filter + scope toggle** — render only active-path items by default in graph view, with a `Show all` toggle. -- **MCP server adapter for core operations** — future adapter over capability contracts, not direct ORM / route wrappers. -- **Git-friendly file-based persistence representation for diffable exported specs**. -- **Typed fixture-builder convergence for happy-path tests**. - -### Meta / deferred boundaries - -- **Structured development spec registry** — prototype file-backed canonical spec records, deterministic checks, generated markdown views, and task-local slices for Brunch's own development workflow. - - Meaning: self-tooling experiment for Brunch's development process, not product functionality. It would make `memory/SPEC.md` / `memory/PLAN.md` generated views over structured records to reduce drift and merge conflicts. - - Status: design horizon, not a migration commitment. - - Design doc: `docs/design/DEV_WORKFLOW_EVOLUTION.md`. - -- **Portability boundaries** — split durable store/read-model, interview session runtime, and workspace capability provider if Brunch targets hosted, remote, embedded, or sandbox-backed operation. - - Meaning: future architecture boundary map for non-local deployments or adapter-backed execution. Deferred until hosted/remote/sandbox operation becomes a product goal. - - Deep design source: `docs/design/PORTABILITY_BOUNDARIES.md`. 
+## Context + +The interaction model is mature: four-phase interview, interviewer-autonomous question format, phase-agnostic preface cards with workspace exploration, structured review with per-item commenting, observer knowledge extraction, workflow ownership extraction, distribution hardening, graph view's structured-list peer route, the first relation-first observer capture seam, the multi-chat substrate, side-chat V3.0 hard-impact cascade, and side-chat V3.1 agent-grouped reconciliation resolution all ship as working product. + +The next product arc is a **continuous conversational workspace** plus a stronger semantic/generative substrate. Continuous workspace is active in a parallel lane and gives the chat runtime a stable phase-addressable host. The FE-705 branch contributes an integration substrate — a local agent capability CLI and external LLM-as-user probe harness — that should be reconciled into main before graph-review and scenario-options work depends on generated completed-spec fixtures. After that, the highest-coordination work is intent-graph semantics and the semantic changeset ledger; lower-coordination provider, gitignore, and web-research work can proceed in parallel. + +The May 2026 intent-spec, multi-chat, changeset-ledger, prompt/context, and agent-mutation design notes are reconciled into one direction. `docs/design/MULTI_CHAT.md` is the substrate document. `docs/design/SIDE_CHAT.md` describes side-chat V1 / V2 / V3.0 / V3.1 / V4 phasing on top of that substrate. `docs/design/PATCH_LEDGER.md` remains historical deeper design pressure for semantic mutation history, but canonical future-facing vocabulary is `changeset` / `change`. The product-layer ontology trajectory is split out as `docs/design/INTENT_GRAPH_SEMANTICS.md` and `docs/design/BEHAVIORAL_KERNELS.md`; broader synthesis lives in `docs/archive/design/INTENT_SPEC_EVOLUTION.md`. 
FE-705's branch-local strategy/proposal notes add scenario options, graph-review oracle, chat-local strategies, and concern/dependency mapping; those notes should become a canonical design doc when the branch is integrated. The dev-layer self-tooling trajectory lives in `docs/design/DEV_WORKFLOW_EVOLUTION.md`. + +## Sequencing + +### Active + +1. `continuous-workspace` — in progress in parallel lane — stable phase-addressable host for the chat runtime. +2. `agent-fixture-substrate` — branch-complete off main, reconciling — FE-705 integration substrate for JSONL agent capability CLI and LLM-as-user probes. + +### Next + +1. `intent-graph-semantics` — highest-coordination semantic substrate after FE-705 reconciliation. +2. `changeset-ledger` — semantic history spine needed before canonical proposal acceptance and productized scenario options. +3. `graph-review-scenario-options` — artifact-only critique/probe lane; can advance in parallel with FE-700 if it does not commit canonical graph truth. +4. `productized-scenario-options` — user-facing acceleration surface after FE-700 semantics, FE-701 changesets, and graph-review probes. + +### Parallel / Low-conflict + +- `first-run-provider-setup` — provider/key UX and runtime seam can progress independently of semantic-stack work. +- `workspace-gitignore-assist` — small workspace hygiene surface with low overlap. +- `productized-web-research` — waits on prompt/context scenario substrate for probe quality, but can remain separate from semantic schema work. 
+ +### Horizon + +- `relation-first-observer-enrichment` +- `architect-generator-loop` +- `server-mini-library-compartmentalization` +- `side-chat-persistence-v4a` +- `side-chat-v4b-item-versioning` +- `dashboard-summaries` +- `spatial-graph-layout` +- `graph-view-active-path-filter` +- `mcp-adapter` +- `file-based-persistence` +- `typed-fixture-builder-convergence` +- `structured-development-spec-registry` +- `portability-boundaries` + +## Frontier Definitions + +### continuous-workspace + +- **Name:** Continuous workspace / phase-addressable interview surface +- **Linear:** unassigned in this plan snapshot +- **Kind:** structural +- **Status:** in-progress +- **Objective:** Replace per-phase rendering boundaries with a cumulative center pane, realized phase sections, one chat runtime per specification, sidebar section navigation, scroll/focus behavior, and preservation of the single actionable frontier at the current reachable phase. +- **Why now / unlocks:** Workflow read/write ownership is extracted, the multi-chat substrate ships chat containers below the specification, and side-chat V3.0/V3.1 closed the cascade surface. This gives future side-chat persistence, strategy chats, and graph/workspace routes a stable host without introducing a second durable workflow model. +- **Acceptance:** Realized phase sections remain legible, future sections stay unreachable until valid, navigation is focus/scroll state only, and the current phase retains exactly one actionable frontier/recovery/handoff/completion affordance. +- **Verification:** Manual workspace walkthroughs across kickoff-ready, active, review-active, recovery, close-to-next-phase, resume/reload, and future-phase deep-link states; regression tests around route/workflow state where available. +- **Traceability:** A58; D86, D87, D110, D113, D114; I24, I102. +- **Design docs:** `docs/design/CONTINUOUS_WORKSPACE_HYBRID.md`; umbrella synthesis in `docs/design/CONVERSATIONAL_WORKSPACE_RUNTIME.md`. 
+ +### agent-fixture-substrate + +- **Name:** FE-705 integration — agent capability CLI + LLM-as-user fixture probe +- **Linear:** FE-705 +- **Kind:** structural +- **Status:** branch-complete / reconciling +- **Objective:** Integrate the branch-complete local `brunch agent` JSONL capability adapter and external probe runner so agents can drive the real Brunch interview flow through Brunch-owned contracts rather than privileged ORM access. +- **Why now / unlocks:** Prompt/context and graph-review probes need realistic graph/transcript fixtures, but hand-authoring those fixtures is chicken-and-egg. A JSONL capability adapter lets an external LLM-as-user drive the real lifecycle through the same mutation authority future agents must use, pressure-testing tool-call vocabulary, chat readiness, resource identity, fixture curation, and import-boundary discipline. Pi comparison remains FE-635 after this seam has a real Brunch use case to compare against. +- **Acceptance:** Server-owned capability contracts and JSONL protocol/session code are integrated; the probe runner uses only the JSONL client/process boundary; fixture-candidate artifacts preserve scenario briefs, model policy, generated transcripts, and workspace-state inspection without becoming Brunch authority. +- **Verification:** Contract/dispatcher tests, JSONL protocol/session tests, import-boundary tests, fake process tests, opt-in real-provider smoke, and fixture-candidate structure/readiness checks. +- **Traceability:** Requirement 43; A89; D143, D147; I114. Also protects Requirements 40, 41, 42 by making prompt/context and mutation-surface probes executable through a real adapter. +- **Design docs:** `docs/design/AGENT_MUTATION_SURFACE.md`; `docs/archive/design/INTENT_SPEC_EVOLUTION.md`; FE-705 branch artifacts until rebased. 
+ +### intent-graph-semantics + +- **Name:** Intent graph semantics + relation-policy directionality foundation +- **Linear:** FE-700 +- **Kind:** structural +- **Status:** not-started +- **Objective:** Refine the ontology and relation policy so the graph can represent invariants, examples/counterexamples, constraint subtypes, narrowed decisions, witness strength, checkability gaps, and operational edge behavior as source/destination material for future generative features. +- **Why now / unlocks:** Candidate generation, behavioral kernels, graph review, scenario-options acceleration, architect proposals, direct-edit cascade, and downstream verification-aware decomposition all need a sharper semantic target than the current exploration/review ontology. This semantic-layer lane is most likely to collide with parallel work, so it should land before broad observer enrichment or canonical candidate-bundle acceptance. +- **Acceptance:** `invariant` and `example` are first-class durable kinds; examples are subtyped; `decision` is narrowed; `constraint`, `criterion`, and `invariant` semantics are enriched; `checkability` and witness strength are represented; relation families, negative relations, edge epistemic metadata, and relation-policy directionality are explicit. +- **Verification:** Corpus/fixture observer probes comparing old vs refined ontology; relation-policy unit tests for mixed-direction relations; graph-review manual assessment for precision/noise; context-pack probe outputs show authority, witness, relation support, and directionality labels. +- **Traceability:** Requirement 38; A77, A78, A80, A81, A84; D134, D136, D137, D139, D140. +- **Design docs:** `docs/design/INTENT_GRAPH_SEMANTICS.md`; `docs/archive/design/INTENT_SPEC_EVOLUTION.md`; FE-705 strategy/proposal notes for relation directionality. 
+ +### changeset-ledger + +- **Name:** Semantic changeset ledger + proposal-turn staleness +- **Linear:** FE-701 +- **Kind:** structural +- **Status:** not-started +- **Objective:** Introduce the semantic history spine that separates graph mutation history from conversational turn ancestry. +- **Why now / unlocks:** Scenario bundle acceptance, direct-edit atomicity, accepted-with-issues flows, stale proposal detection, graph-review repairs, side-chat V4b item versioning, and future architect/reconciliation agents all need a durable semantic mutation boundary. Without it, productized scenario-options can stay probe-only but cannot safely commit candidate bundles. +- **Acceptance:** Schema and operation vocabulary use `changeset` / `change`; specifications track latest semantic changeset; proposal turns carry base/opened changeset identity; `reconciliation_need.caused_by_changeset_id` is connected; non-accept proposal actions cannot mutate graph truth; a changeset is the smallest atomic unit preserving semantic coherence. +- **Verification:** DB atomicity tests for changeset + changes + reconciliation_need writes, staleness tests for open proposal turns across multi-chat changes, and capability/transition tests proving non-accept actions cannot mutate graph truth. +- **Traceability:** Requirements 39, 42, 44; A71, A79; D135, D138, D143. +- **Design docs:** `docs/design/PATCH_LEDGER.md` (historical filename; future vocabulary is changeset/change); FE-705 strategy/proposal notes for semantic history and proposal turns. + +### graph-review-scenario-options + +- **Name:** Graph-review oracle + scenario-options probes +- **Linear:** FE-702 for graph-review / scenario probes; FE-649 and FE-640 remain productization children under FE-698 where relevant +- **Kind:** structural +- **Status:** not-started +- **Objective:** Build the internal critique path and artifact-only candidate bundle probes before product UI. 
+- **Why now / unlocks:** Product wants first-turn strategy choice and mid-interview acceleration, but engineering needs graph-review critique to make generated candidate bundles credible. This lane can advance in parallel with FE-700 if it stays artifact-only and does not commit canonical graph truth. +- **Acceptance:** Candidate graph bundle and graph-review finding artifacts exist; graph-review prompt/context pack and rubric cover coherence, fixed-premise respect, coverage, tradeoff honesty, checkability, granularity, scenario fidelity, epistemic labels, provenance, and downstream usefulness; candidate readiness is classified as `draft` / `reviewing` / `reviewed_clean` / `reviewed_with_issues` / `blocked`; broader graph-review issues remain turn-owned unless querying/filtering needs prove otherwise. +- **Verification:** Scenario-runner fixtures, FE-705 JSONL-generated completed-spec fixtures, raw output review, structured parse validation, qualitative scorecards, and comparison against drilldown-produced graphs. Middle/outer-loop oracle design should decide when fixture candidates become golden. +- **Traceability:** Requirements 20, 21, 31, 32, 40, 41, 43, 44; A67, A68, A80, A85, A87, A89; D126, D127, D139, D141, D147. +- **Design docs:** `docs/design/BEHAVIORAL_KERNELS.md`; `docs/design/INTENT_GRAPH_SEMANTICS.md`; `docs/design/AGENT_MUTATION_SURFACE.md`; FE-705 strategy/proposal notes. + +### productized-scenario-options + +- **Name:** Productized scenario-options / candidate-spec completion assist +- **Linear:** unassigned in this plan snapshot +- **Kind:** structural +- **Status:** blocked +- **Objective:** Replace skip-only remainder handling with first-turn strategy choice and a mid-interview `speed this up` path that generates reviewed candidate graph bundles with tradeoffs, completing the current direction by default. 
+- **Why now / unlocks:** This is the likely first user-visible alternative to long drilldown, but product UI waits on graph-review probes, FE-700 semantics, and FE-701 changesets. Until then, scenario-options remain artifact/proposal-only. +- **Acceptance:** Users can choose or request acceleration via scenario options; generated bundles preserve accepted graph truth as fixed premise, present tradeoff profiles, and become canonical only through coherent accepted changesets with known issues represented as follow-on review/process debt. +- **Verification:** Probe comparison against direct drilldown, graph-review scorecards, accepted-with-issues flow tests once changesets exist, and manual user-flow review for trust/comprehension. +- **Traceability:** Requirements 31, 40, 44; A67, A77, A78, A85, A90, A91; D126, D134, D136, D139, D151, D152. +- **Design docs:** FE-705 strategy/proposal notes until canonicalized; `docs/design/BEHAVIORAL_KERNELS.md`; `docs/design/INTENT_GRAPH_SEMANTICS.md`. + +### first-run-provider-setup + +- **Name:** First-run provider setup +- **Linear:** FE-633 covers the OpenRouter/default-provider part; dashboard credential UX + XDG key storage may need a sibling issue if split from provider proving +- **Kind:** bounded feature +- **Status:** not-started +- **Objective:** Make missing LLM credentials visible on the dashboard, add a shared AI runtime provider seam for interviewer/observer model construction, support UI-entered keys through XDG-compliant user auth state, and evaluate whether OpenRouter should become the preferred onboarding provider while preserving Anthropic-specific capabilities or explicit degradation. +- **Why now / unlocks:** Can proceed independently and reduces first-run friction for real users and probe workflows. 
+- **Acceptance:** Dashboard surfaces provider credential status before specification creation; setup flow stores UI-entered keys outside the project workspace; interviewer/observer construction routes through a shared provider seam. +- **Verification:** Unit tests for provider precedence/storage paths, manual first-run walkthroughs, and provider capability spike for model naming, structured output, tool use, and reasoning/thinking support. +- **Traceability:** Requirements 34, 35, 36; A74, A75; D130, D131, D132; I106. +- **Design docs:** none yet beyond SPEC/PLAN entries. + +### workspace-gitignore-assist + +- **Name:** Workspace hygiene / `.brunch/` gitignore assist +- **Linear:** FE-648 +- **Kind:** bounded feature +- **Status:** not-started +- **Objective:** Detect whether generated local state is already ignored and, with explicit confirmation, add an idempotent `.gitignore` entry or create `.gitignore` when absent. +- **Why now / unlocks:** Low-conflict guardrail that reduces accidental commits of local Brunch state. +- **Acceptance:** The app detects absent, present, and already-covering ignore states; previews repository mutation; mutates `.gitignore` only after explicit confirmation; append/create behavior is idempotent and content-preserving. +- **Verification:** Unit tests for ignore detection/append behavior and manual dashboard walkthrough with absent, present, and already-covering `.gitignore` states. +- **Traceability:** Requirement 37; A76; D133; I107. +- **Design docs:** none yet beyond SPEC/PLAN entries. + +### productized-web-research + +- **Name:** Productized web research capability +- **Linear:** FE-649 +- **Kind:** structural +- **Status:** not-started +- **Objective:** Add web search and page-fetch tools as interviewer-invoked context gathering, surfaced as preface cards after the scenario substrate proves query framing, tool ergonomics, and provisional-context handling. 
+- **Why now / unlocks:** Extends the same phase-agnostic preface-card model to external research, but should wait for prompt/context scenario substrate proof so web research does not become an ad hoc tool surface. +- **Acceptance:** Research tools are invoked through interviewer context gathering, outputs render as provisional preface cards paired with questions, and observer capture treats the validated full turn as atomic. +- **Verification:** Prompt/context scenario probes for query framing and tool-output summarization, plus manual review of provisional-context handling. +- **Traceability:** Requirements 20, 21, 40, 41; D99, D112, D139, D142. +- **Design docs:** FE-698 prompt/context scenario substrate references; future productized research notes if needed. + +### relation-first-observer-enrichment + +- **Name:** Relation-first observer capture enrichment +- **Linear:** unassigned in this plan snapshot +- **Kind:** structural +- **Status:** horizon +- **Objective:** Broaden observer output across the refined ontology without flooding the graph. +- **Why now / unlocks:** First cut is shipped; enrichment waits for FE-700 relation policy so observer output can become semantically richer while preserving prompt-budgeted compact anchors and user trust. +- **Acceptance:** Observer extraction captures richer relation families and operational metadata with abstention under weak support. +- **Verification:** Observer corpus probes, graph/export review for precision/noise, and context-pack output review. +- **Traceability:** Requirements 30, 38, 40; A66, A81, A84; D125, D136, D137, D139, D140; I109. +- **Design docs:** `docs/design/INTENT_GRAPH_SEMANTICS.md`. 
+ +### architect-generator-loop + +- **Name:** Architect / generator loop +- **Linear:** unassigned in this plan snapshot +- **Kind:** structural +- **Status:** horizon +- **Objective:** Explore an autonomous agent that iterates over the intent graph and proposes semantic changes for HITL review through the same future changeset/reconciliation pathway as user-driven edits. +- **Why now / unlocks:** Related to scenario-options but broader; keep productized architect proposals behind multi-chat, reconciliation, and semantic changesets. Use the scenario substrate for shadow/proposal-only probes first. +- **Acceptance:** Shadow/proposal-only architect outputs can be compared against user-driven edits without mutating canonical graph truth. +- **Verification:** Scenario substrate probes and human comparison against accepted user edits. +- **Traceability:** A73, A85, A87; D139, D141. +- **Design docs:** `docs/design/BEHAVIORAL_KERNELS.md`; future design doc if promoted. + +### server-mini-library-compartmentalization + +- **Name:** Server mini-library compartmentalization +- **Linear:** unassigned in this plan snapshot +- **Kind:** refactor +- **Status:** horizon +- **Objective:** Refactor growing server seams into plural public roots with same-named private subtrees where FE-698 / FE-705 pressure has made boundaries too implicit. +- **Why now / unlocks:** Near-term refactor candidate after FE-705 integration, not product roadmap work. +- **Acceptance:** Candidate seams such as `fixtures.ts`, `context-packs.ts`, `prompts.ts`, `scenario-runner.ts`, `entity-apis.ts`, and `agent-apis.ts` hide private implementation subtrees behind stable public roots where real pressure exists. +- **Verification:** Existing test suite plus import-boundary review. +- **Traceability:** code organization convention in `AGENTS.md`. +- **Design docs:** none. 
+ +### side-chat-persistence-v4a + +- **Name:** Side-chat persistence — V4a (multi-chat Phase 2 substrate) +- **Linear:** FE-675 umbrella, V4a half +- **Kind:** structural +- **Status:** horizon +- **Objective:** Persist side-chat client turns into the existing `chat` / `turn` tables with `chat.kind='side_chat'`, load prior side-chat sessions on remount, and surface an "Old chats" affordance per pinned item/spec. +- **Why now / unlocks:** Deprioritized below continuous workspace and semantic/generative substrate. Phase 1 substrate already ships schema support; the remaining decision is the anchor model (`chat` row anchor fields vs deferred `chat_focus` table). +- **Acceptance:** Side-chat sessions survive remount/reload and remain coherent with graph truth without introducing a second workflow model. +- **Verification:** Persistence/reload tests and manual side-chat walkthroughs. +- **Traceability:** Requirement 39; A82, A83; D138. +- **Design docs:** `docs/design/MULTI_CHAT.md` §10 Phase 2; `docs/design/SIDE_CHAT.md` §9 V4 row. + +### side-chat-v4b-item-versioning + +- **Name:** Side-chat V4b — item versioning + branched exploration +- **Linear:** FE-675 umbrella, V4b half +- **Kind:** structural +- **Status:** horizon +- **Objective:** Add item versioning and branched exploration once the changeset ledger lands. +- **Why now / unlocks:** Item versioning unblocks dangling-annotation repair and soft-edit audit; branched exploration lets drill-downs, past-turn edits, and revisits coexist with the original chain. +- **Acceptance:** Prior item versions are queryable for diff/comparison/audit while active-path projection always reflects latest semantic truth. +- **Verification:** Changeset-backed versioning tests, revisit cascade tests, and annotation repair walkthroughs. +- **Traceability:** A72, A73, A85; D139, D141. +- **Design docs:** `docs/design/MULTI_CHAT.md`; `docs/design/PATCH_LEDGER.md`. 
+ +### dashboard-summaries + +- **Name:** Dashboard result summaries and completeness metrics +- **Linear:** unassigned in this plan snapshot +- **Kind:** bounded feature +- **Status:** horizon +- **Objective:** Improve progress visibility across specifications. +- **Why now / unlocks:** Lower-priority product surface after core workspace and semantic substrate stabilize. +- **Acceptance:** Dashboard communicates spec progress/completeness without implying false closure. +- **Verification:** Manual dashboard walkthroughs. +- **Traceability:** Requirements 8, 13, 15. +- **Design docs:** none. + +### spatial-graph-layout + +- **Name:** Spatial canvas layout for graph view +- **Linear:** unassigned in this plan snapshot +- **Kind:** bounded feature +- **Status:** horizon +- **Objective:** Add the spatial DAG layout as a second layout choice inside graph mode, alongside the structured-list route. +- **Why now / unlocks:** Graph view already ships as a structured-list peer route; spatial layout follows once relation density and graph interaction needs justify it. +- **Acceptance:** Users can switch between structured-list and spatial canvas layouts without changing projection semantics or action contracts. +- **Verification:** Manual graph-view walkthroughs at low/high edge density plus visual regression if available. +- **Traceability:** Requirement 33; A69, A70; D128. +- **Design docs:** graph-view sections in SPEC; future graph-view design notes if promoted. + +### graph-view-active-path-filter + +- **Name:** Graph view active-path render filter + scope toggle +- **Linear:** unassigned in this plan snapshot +- **Kind:** bounded feature +- **Status:** horizon +- **Objective:** Render only active-path items by default in graph view, with a `Show all` toggle. +- **Why now / unlocks:** Lower-priority graph legibility improvement after core graph semantics and projection surfaces stabilize. 
+- **Acceptance:** Active-path filtering is default, user can inspect all items, and edge rendering remains honest under both scopes.
+- **Verification:** Graph-view fixtures for active-path/all toggles.
+- **Traceability:** D128 and graph-view requirements.
+- **Design docs:** none.
+
+### mcp-adapter
+
+- **Name:** MCP server adapter for core operations
+- **Linear:** unassigned in this plan snapshot
+- **Kind:** structural
+- **Status:** horizon
+- **Objective:** Expose a future adapter over capability contracts, not direct ORM/route wrappers.
+- **Why now / unlocks:** Deferred until capability contracts stabilize through FE-705 and real agent/probe use.
+- **Acceptance:** MCP tools wrap Brunch-owned capability contracts and preserve resource identity, authority metadata, and mutation semantics.
+- **Verification:** Contract adapter tests and import-boundary tests.
+- **Traceability:** Requirements 42, 43; D143, D147.
+- **Design docs:** `docs/design/AGENT_MUTATION_SURFACE.md`.
+
+### file-based-persistence
+
+- **Name:** Git-friendly file-based persistence representation for diffable exported specs
+- **Linear:** unassigned in this plan snapshot
+- **Kind:** structural
+- **Status:** horizon
+- **Objective:** Explore a diffable file representation for exported/durable spec truth.
+- **Why now / unlocks:** Deferred until product ontology and changeset semantics are clearer.
+- **Acceptance:** File representation preserves intent graph meaning and review/export boundaries without becoming a second source of truth.
+- **Verification:** Round-trip and diff-fixture tests if promoted.
+- **Traceability:** Product direction from planning specs toward intent specs; D134, D135.
+- **Design docs:** future design needed if promoted.
+ +### typed-fixture-builder-convergence + +- **Name:** Typed fixture-builder convergence for happy-path tests +- **Linear:** unassigned in this plan snapshot +- **Kind:** hardening +- **Status:** horizon +- **Objective:** Converge test fixtures around typed builders that represent current product semantics. +- **Why now / unlocks:** Useful after semantic schema work stabilizes so tests do not fossilize obsolete ontology names. +- **Acceptance:** Happy-path tests can create coherent specs/chats/turns/intent graph state through typed builders with minimal duplication. +- **Verification:** Existing test suite, fixture API review, and migration of representative tests. +- **Traceability:** I48, I109, I111, I112. +- **Design docs:** none. + +### structured-development-spec-registry + +- **Name:** Structured development spec registry +- **Linear:** unassigned in this plan snapshot +- **Kind:** structural / process +- **Status:** horizon +- **Objective:** Prototype file-backed canonical spec records, deterministic checks, generated markdown views, and task-local slices for Brunch's own development workflow. +- **Why now / unlocks:** Self-tooling experiment, not product functionality. It would make `memory/SPEC.md` / `memory/PLAN.md` generated views over structured records to reduce drift and merge conflicts. +- **Acceptance:** Generated views preserve current planning ergonomics while reducing merge churn and cross-reference drift. +- **Verification:** Deterministic generation checks and branch-conflict dry runs. +- **Traceability:** dev-layer trajectory only; not product-layer ontology. +- **Design docs:** `docs/design/DEV_WORKFLOW_EVOLUTION.md`. 
+ +### portability-boundaries + +- **Name:** Portability boundaries +- **Linear:** unassigned in this plan snapshot +- **Kind:** structural +- **Status:** horizon +- **Objective:** Split durable store/read-model, interview session runtime, and workspace capability provider if Brunch targets hosted, remote, embedded, or sandbox-backed operation. +- **Why now / unlocks:** Future architecture boundary map for non-local deployments or adapter-backed execution. Deferred until hosted/remote/sandbox operation becomes a product goal. +- **Acceptance:** Boundary map supports hosted/remote/sandbox decisions without prematurely abstracting the local-first product. +- **Verification:** Architecture review and spike if product direction changes. +- **Traceability:** portability assumptions in design docs; current local-first constraint in SPEC. +- **Design docs:** `docs/design/PORTABILITY_BOUNDARIES.md`. ## Recently Completed -- [2026-05-11] **Side-chat V3.1 — agent-grouped reconciliation resolution** (FE-674, PR #124 + downstack) — closes the V3.x arc end-to-end. Server: `POST /api/specifications/:id/reconciliation-needs/run-agent` (spec-level classifier loop) and `POST /api/specifications/:id/reconciliation-needs/:needId/reset-agent` (per-row Re-run) walk every awaiting open need through I114's `null → queued → classifying → classified | failed` lifecycle; agent_classification persists one of `auto-confirm` / `auto-edit` / `substantive`; agent_proposal carries an optional text suggestion. Client: `` renders six visual variants per row; `` in the Pending review header with conditional 1s polling while any need is in flight; per-row Re-run on classified/failed rows; per-class action buttons (`auto-confirm` → Confirm, `auto-edit` → View proposal + Apply + Skip, `substantive` → Open side-chat via `useSideChat().openFor`); bulk Confirm-all (N) and Apply-all-suggested (N) iterate serially over existing per-row endpoints. 
Listing endpoint extended with `target_item_kind` + `target_reference_code` to feed the Open-side-chat handoff. Verified: `npm run verify` 1178 / 1179 pass (one unrelated `side-chat-route` flake). **Watch**: A88 outer-loop walkthrough has not yet happened — empirical signal on whether agent grouping helps legibility vs V3.0's flat list remains open; capture qualitative notes during the next manual walkthrough on a dense spec. -- [2026-05-11] FE-698 reconciliation context-pack slice — Added a proposal-only reconciliation prompt/context scenario that renders open reconciliation needs with source/target anchors, reason/status, prompt/context fingerprints, and read-only capability metadata. This is substrate-only: no FE-674 need lifecycle endpoint, overlay action, side-chat reducer, or durable mutation behavior. Verified: `npm run verify`. Watch: next FE-698 work can move to broader read-only/proposal-only probes and the Pi adapter spike without treating this pack as a resolution agent. -- [2026-05-08] **Side-chat V3.0 — hard-impact cascade through `reconciliation_need`** (FE-674, PR #115 + #116 + #117) — three-card stack closes V3.0. Card 1 (PR #115): server `cascade-producer` + `getDownstreamEdges` + `openReconciliationNeedIfAbsent`; hard-impact apply mutates the source and opens one need per typed dependency edge; response shape adds `openedNeedIds`; partial-unique-index dedupe. Card 2 (PR #116): drop deferred banner; new `GET /api/specifications/:id/reconciliation-needs` endpoint and `useSpecificationOpenReconciliationNeeds` query; patch-list overlay renders a Pending review section listing open needs with kind chip and source/target references. Card 3 (PR #117): idempotent `POST /api/specifications/:id/reconciliation-needs/:needId/resolve` endpoint and per-row Resolve button; mutation pending state disables the button mid-flight. Verified: `npm run verify` (1063 tests, 0 lint warnings). 
Watch: A88 (Path 1 sufficiency without agent) is partially validated mechanically — full validation depends on outer-loop walkthrough on dense graphs. V3.1 (agent-grouped resolution) shipped 2026-05-11; richer per-row kinds beyond single Resolve are V3.1. SIDE_CHAT.md §9 updated to reflect the V3.0 single-action shape. +- [2026-05-11] `side-chat-v3-1-agent-grouped-reconciliation` — Done: FE-674 / PR #124 + downstack closed the V3.x arc end-to-end with spec-level classifier route, per-row reset route, agent classification lifecycle, chips, per-class actions, and bulk Confirm-all / Apply-all-suggested. Verified: `npm run verify` 1178 / 1179 pass with one unrelated `side-chat-route` flake. Watch: A88 outer-loop walkthrough on a dense spec remains open to assess legibility vs V3.0's flat list. +- [2026-05-11] `fe-698-reconciliation-context-pack` — Done: added proposal-only reconciliation prompt/context scenario rendering open reconciliation needs with source/target anchors, reason/status, prompt/context fingerprints, and read-only capability metadata. Verified: `npm run verify`. Watch: next FE-698 work can broaden read-only/proposal-only probes and Pi adapter spike without treating this pack as a resolution agent. +- [2026-05-08] `side-chat-v3-0-hard-impact-cascade` — Done: FE-674 / PR #115 + #116 + #117 shipped hard-impact cascade through `reconciliation_need`, Pending review listing, and idempotent resolve. Verified: `npm run verify` (1063 tests, 0 lint warnings). Watch: A88 mechanical grouping remains only partially validated until outer-loop walkthrough on dense graphs. 
Older history: `docs/archive/PLAN_HISTORY.md` @@ -134,46 +355,46 @@ Older history: `docs/archive/PLAN_HISTORY.md` ```text TRACK A — Workspace shell (parallel colleague lane) -continuous-workspace / phase-addressable interview surface (active) - ├──→ stable host for side-chat persistence and strategy chats +continuous-workspace + ├──→ stable host for side-chat-persistence-v4a └──→ workspace-aware graph / structured-list peer routes -TRACK B — Agent fixture substrate (FE-705 integration lane) -prompt/context scenario substrate foundation (completed) - └──→ agent capability CLI + LLM-as-user fixture probe (next, branch-complete off main) +TRACK B — Agent fixture substrate +prompt/context scenario substrate foundation (completed) + └──→ agent-fixture-substrate ├──→ generated completed-spec fixture candidates - ├──→ graph-review oracle + scenario-options probes - └──→ Pi harness comparison (future, FE-635) + ├──→ graph-review-scenario-options + └──→ Pi harness comparison (future, FE-635) TRACK C — Semantic substrate (highest coordination) -multi-chat-substrate + reconciliation-needs (completed) - ├──→ intent graph semantics + relation-policy directionality (next, FE-700) - │ ├──→ relation-first observer enrichment (horizon, first cut already shipped) +multi-chat-substrate + reconciliation-needs (completed) + ├──→ intent-graph-semantics + │ ├──→ relation-first-observer-enrichment │ ├──→ robust direct-edit / reconciliation cascade policy - │ └──→ graph-review oracle can become semantically meaningful - └──→ semantic changeset ledger + proposal-turn staleness (next, FE-701) + │ └──→ graph-review-scenario-options becomes semantically meaningful + └──→ changeset-ledger ├──→ canonical scenario bundle acceptance ├──→ direct-edit atomicity with caused_by_changeset_id ├──→ stale open proposal detection - └──→ architect-loop / verifier/import mutation provenance + └──→ architect-generator-loop / verifier/import mutation provenance TRACK D — Strategy probes and product 
acceleration -FE-705 fixtures + FE-700 semantics - └──→ graph-review oracle + scenario-options probes (next, artifact-only) - └──→ productized scenario-options / candidate-spec completion assist (after changesets) +agent-fixture-substrate + intent-graph-semantics + └──→ graph-review-scenario-options + └──→ productized-scenario-options ├──→ absorbs / reshapes two-axis interview framing └──→ absorbs / reshapes progressive detail / recursive deflation TRACK E — Low-conflict parallel work -first-run provider setup -workspace hygiene gitignore assist -productized web research capability +first-run-provider-setup +workspace-gitignore-assist +productized-web-research LOWER-PRIORITY / DEFERRED -side-chat persistence V4a / V4b -spatial graph layout + active-path filter -dashboard metrics -MCP adapter / file-based persistence / typed fixture builders -structured development spec registry -portability boundaries +side-chat-persistence-v4a / side-chat-v4b-item-versioning +spatial-graph-layout + graph-view-active-path-filter +dashboard-summaries +mcp-adapter / file-based-persistence / typed-fixture-builder-convergence +structured-development-spec-registry +portability-boundaries ``` From d80f37a64eecd619cf0b7008af244852f17f832f Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Wed, 13 May 2026 14:44:29 +0200 Subject: [PATCH 39/42] separate documentation of ln- skills vs product workflows --- docs/design/CONVERSATIONAL_WORKSPACE_RUNTIME.md | 2 +- docs/design/INTENT_GRAPH_SEMANTICS.md | 2 +- docs/design/README.md | 2 +- .../EVOLUTION.md} | 12 ++++++------ docs/design/ln-skills/README.md | 9 +++++++++ memory/PLAN.md | 4 ++-- memory/SPEC.md | 2 +- 7 files changed, 21 insertions(+), 12 deletions(-) rename docs/design/{DEV_WORKFLOW_EVOLUTION.md => ln-skills/EVOLUTION.md} (96%) create mode 100644 docs/design/ln-skills/README.md diff --git a/docs/design/CONVERSATIONAL_WORKSPACE_RUNTIME.md b/docs/design/CONVERSATIONAL_WORKSPACE_RUNTIME.md index 450ca942..56d6f08d 100644 --- 
a/docs/design/CONVERSATIONAL_WORKSPACE_RUNTIME.md +++ b/docs/design/CONVERSATIONAL_WORKSPACE_RUNTIME.md @@ -291,7 +291,7 @@ This synthesis has to respect parallel design work that happened outside the run | [SPEC_EVOLUTION_STRATEGIES.md](./SPEC_EVOLUTION_STRATEGIES.md) | Strategy is chat-local process state. Scenario options, graph-review findings, and reconciliation suggestions are proposal turns until accepted; accepted candidate bundles become coherent changesets, not loose item-by-item mutations. | | [AGENT_MUTATION_SURFACE.md](./AGENT_MUTATION_SURFACE.md) | Agent-originated writes must enter through Brunch-owned capability/handler contracts. The runtime may host agent runs, but those runs do not get direct ORM or route-wrapper mutation authority. | | [BEHAVIORAL_KERNELS.md](./BEHAVIORAL_KERNELS.md) | Kernel-driven questions produce typed artifacts that the intent graph stores; the runtime provides thread/context affordances but should not invent a separate artifact ontology. | -| [DEV_WORKFLOW_EVOLUTION.md](./DEV_WORKFLOW_EVOLUTION.md) | Dev-layer file-backed registry ideas are separate from product runtime persistence. Do not mix product `changeset` tables with the future `memory/` registry experiment. | +| [ln-skills/EVOLUTION.md](./ln-skills/EVOLUTION.md) | Dev-layer file-backed registry ideas are separate from product runtime persistence. Do not mix product `changeset` tables with the future `memory/` registry experiment. | Audit result: the runtime concept stays coherent if it treats `chat`/thread as conversational process, `changeset`/`change` as semantic mutation history, `reconciliation_need` as process debt from a known disturbance, and graph review as a separate quality oracle. That matches the current SPEC/PLAN reconciliation. 
diff --git a/docs/design/INTENT_GRAPH_SEMANTICS.md b/docs/design/INTENT_GRAPH_SEMANTICS.md index b0d8727c..78042e62 100644 --- a/docs/design/INTENT_GRAPH_SEMANTICS.md +++ b/docs/design/INTENT_GRAPH_SEMANTICS.md @@ -8,7 +8,7 @@ > > Source synthesis: [`INTENT_SPEC_EVOLUTION.md`](../archive/design/INTENT_SPEC_EVOLUTION.md) §3, §4, §6, §11. Where this document overlaps, it supersedes the synthesis as the structured reference; the synthesis remains the broader narrative. > -> Layer note: this is the **product layer**. It describes what Brunch users build. The dev-layer ontology is a parallel-but-not-yet-converged register described in [`DEV_WORKFLOW_EVOLUTION.md`](./DEV_WORKFLOW_EVOLUTION.md). +> Layer note: this is the **product layer**. It describes what Brunch users build. The dev-layer ontology is a parallel-but-not-yet-converged register described in [`ln-skills/EVOLUTION.md`](./ln-skills/EVOLUTION.md). ## Why this note exists diff --git a/docs/design/README.md b/docs/design/README.md index da9eb179..ea29b883 100644 --- a/docs/design/README.md +++ b/docs/design/README.md @@ -48,7 +48,7 @@ Start with `CONVERSATIONAL_WORKSPACE_RUNTIME.md`. The other files in this cluste | Document | Role | | --- | --- | -| `DEV_WORKFLOW_EVOLUTION.md` | Dev-layer trajectory for the `ln-*` skill family, `memory/` ontology, proposed file-backed spec registry, and possible dev/product ontology convergence. Not product SPEC. | +| `ln-skills/EVOLUTION.md` | Dev-layer trajectory for the `ln-*` skill family, `memory/` ontology, proposed file-backed spec registry, and possible dev/product ontology convergence. Not product SPEC. | | `DEFERRED_RECONCILIATIONS.md` | Interim backlog for product impulses that are worthy but intentionally gated. Audit before promoting or retiring entries. 
| ### Isolated / future-facing notes diff --git a/docs/design/DEV_WORKFLOW_EVOLUTION.md b/docs/design/ln-skills/EVOLUTION.md similarity index 96% rename from docs/design/DEV_WORKFLOW_EVOLUTION.md rename to docs/design/ln-skills/EVOLUTION.md index 2f0b64fb..3731f404 100644 --- a/docs/design/DEV_WORKFLOW_EVOLUTION.md +++ b/docs/design/ln-skills/EVOLUTION.md @@ -6,13 +6,13 @@ > > This document is **not** part of `memory/SPEC.md` because it does not describe Brunch the product. It is the canonical design home for the **dev layer**: how Brunch is built. Conclusions that affect product behavior should still be promoted into `memory/SPEC.md` through `ln-spec`, but most of the material here describes self-tooling rather than user-facing capability. > -> Source synthesis: external agent conversations captured in [`docs/archive/design/INTENT_SPEC_EVOLUTION.md`](../archive/design/INTENT_SPEC_EVOLUTION.md). That synthesis treats both the product layer and the dev layer in the same document; this note splits the dev-layer trajectory out so the layers stop colliding. +> Source synthesis: external agent conversations captured in [`docs/archive/design/INTENT_SPEC_EVOLUTION.md`](../../archive/design/INTENT_SPEC_EVOLUTION.md). That synthesis treats both the product layer and the dev layer in the same document; this note splits the dev-layer trajectory out so the layers stop colliding. ## Why this note exists The intent-spec branching conversation produced two parallel trajectories: -1. A **product-layer** direction — Brunch should evolve from eliciting planning specs toward eliciting intent specs, with progressive checkability, behavioral kernels, semantic edges, and graph-first context. 
Most of that material has now landed in `memory/SPEC.md` (Requirements 38–41, A77–A87, D125, D134–D142, I109–I112, and the Lexicon entries for `intent graph` / `progressive checkability` / `behavioral kernel` / `context pack` / `scenario runner`), focused design docs (`MULTI_CHAT.md`, `PATCH_LEDGER.md`), or the archived source synthesis (`../archive/design/INTENT_SPEC_EVOLUTION.md`). +1. A **product-layer** direction — Brunch should evolve from eliciting planning specs toward eliciting intent specs, with progressive checkability, behavioral kernels, semantic edges, and graph-first context. Most of that material has now landed in `memory/SPEC.md` (Requirements 38–41, A77–A87, D125, D134–D142, I109–I112, and the Lexicon entries for `intent graph` / `progressive checkability` / `behavioral kernel` / `context pack` / `scenario runner`), focused design docs (`MULTI_CHAT.md`, `PATCH_LEDGER.md`), or the archived source synthesis (`../../archive/design/INTENT_SPEC_EVOLUTION.md`). 2. A **dev-layer** direction — the same critique, applied recursively to Brunch's *own* spec workflow. The current `memory/SPEC.md` is doing many jobs at once and the markdown-mediated nature of the document creates real cognitive cost on contributing LLMs. The conversation proposed a file-backed canonical spec registry with deterministic checkers and generated views. None of this has landed anywhere except as a one-line horizon item in `memory/PLAN.md` ("Structured development spec registry"). @@ -155,7 +155,7 @@ The point is not that the current system is broken — it works, and `ln-sync` e ## Proposed dev-layer trajectory -The trajectory is the one the source synthesis captures in §10–11 of [`INTENT_SPEC_EVOLUTION.md`](../archive/design/INTENT_SPEC_EVOLUTION.md), but framed here as a self-tooling experiment for *this* repo, not as a product proposal. 
+The trajectory is the one the source synthesis captures in §10–11 of [`INTENT_SPEC_EVOLUTION.md`](../../archive/design/INTENT_SPEC_EVOLUTION.md), but framed here as a self-tooling experiment for *this* repo, not as a product proposal. ### Target shape @@ -257,7 +257,7 @@ The structural argument for convergence is strong: The structural argument against immediate convergence is also strong: -- They have different persistence needs. The dev layer is diffable, branchable, reviewable in PRs — files. The product layer is interactive, multi-user, resume-precise — SQLite. (Source: [`INTENT_SPEC_EVOLUTION.md`](../archive/design/INTENT_SPEC_EVOLUTION.md) §11.) +- They have different persistence needs. The dev layer is diffable, branchable, reviewable in PRs — files. The product layer is interactive, multi-user, resume-precise — SQLite. (Source: [`INTENT_SPEC_EVOLUTION.md`](../../archive/design/INTENT_SPEC_EVOLUTION.md) §11.) - They have different mutation interfaces. The dev layer mutates through editor + CLI. The product layer mutates through interview turns, observer captures, and graph edits. - They have different operational metadata. The dev layer cares about test coverage and CI gates; the product layer cares about workflow phase, frontier ownership, review acceptance, and chat ownership. @@ -307,7 +307,7 @@ The decision rule: ## References -- [`INTENT_SPEC_EVOLUTION.md`](../archive/design/INTENT_SPEC_EVOLUTION.md) §10–11 — source synthesis for the registry trajectory and the persistence adapter split. -- [`AGENTS.md`](../../AGENTS.md) — current operational protocols, verification harness, naming conventions. +- [`INTENT_SPEC_EVOLUTION.md`](../../archive/design/INTENT_SPEC_EVOLUTION.md) §10–11 — source synthesis for the registry trajectory and the persistence adapter split. +- [`AGENTS.md`](../../../AGENTS.md) — current operational protocols, verification harness, naming conventions. 
- `.agents/skills/ln-*/SKILL.md` — current implementations of the dev-workflow skills. - `memory/PLAN.md` horizon item "Structured development spec registry" — the one-line pointer this document expands. diff --git a/docs/design/ln-skills/README.md b/docs/design/ln-skills/README.md new file mode 100644 index 00000000..1432cbb3 --- /dev/null +++ b/docs/design/ln-skills/README.md @@ -0,0 +1,9 @@ +# ln-skills Design Notes + +This directory holds design rationale for Brunch's `ln-*` agent-skill workflow and related dev-layer self-tooling. + +These documents are not executable skills. Runtime skill instructions live under `.agents/skills/ln-*/`; accepted operational protocols belong in `AGENTS.md` or `docs/praxis/`; canonical product truth remains in `memory/SPEC.md` and `memory/PLAN.md`. + +| Document | Role | +| --- | --- | +| `EVOLUTION.md` | Dev-layer trajectory for the `ln-*` skill family, `memory/` ontology, proposed file-backed spec registry, and possible dev/product ontology convergence. Not product SPEC. | diff --git a/memory/PLAN.md b/memory/PLAN.md index 7568534f..2a8f3d41 100644 --- a/memory/PLAN.md +++ b/memory/PLAN.md @@ -17,7 +17,7 @@ The interaction model is mature: four-phase interview, interviewer-autonomous qu The next product arc is a **continuous conversational workspace** plus a stronger semantic/generative substrate. Continuous workspace is active in a parallel lane and gives the chat runtime a stable phase-addressable host. The FE-705 branch contributes an integration substrate — a local agent capability CLI and external LLM-as-user probe harness — that should be reconciled into main before graph-review and scenario-options work depends on generated completed-spec fixtures. After that, the highest-coordination work is intent-graph semantics and the semantic changeset ledger; lower-coordination provider, gitignore, and web-research work can proceed in parallel. 
-The May 2026 intent-spec, multi-chat, changeset-ledger, prompt/context, and agent-mutation design notes are reconciled into one direction. `docs/design/MULTI_CHAT.md` is the substrate document. `docs/design/SIDE_CHAT.md` describes side-chat V1 / V2 / V3.0 / V3.1 / V4 phasing on top of that substrate. `docs/design/PATCH_LEDGER.md` remains historical deeper design pressure for semantic mutation history, but canonical future-facing vocabulary is `changeset` / `change`. The product-layer ontology trajectory is split out as `docs/design/INTENT_GRAPH_SEMANTICS.md` and `docs/design/BEHAVIORAL_KERNELS.md`; broader synthesis lives in `docs/archive/design/INTENT_SPEC_EVOLUTION.md`. FE-705's branch-local strategy/proposal notes add scenario options, graph-review oracle, chat-local strategies, and concern/dependency mapping; those notes should become a canonical design doc when the branch is integrated. The dev-layer self-tooling trajectory lives in `docs/design/DEV_WORKFLOW_EVOLUTION.md`. +The May 2026 intent-spec, multi-chat, changeset-ledger, prompt/context, and agent-mutation design notes are reconciled into one direction. `docs/design/MULTI_CHAT.md` is the substrate document. `docs/design/SIDE_CHAT.md` describes side-chat V1 / V2 / V3.0 / V3.1 / V4 phasing on top of that substrate. `docs/design/PATCH_LEDGER.md` remains historical deeper design pressure for semantic mutation history, but canonical future-facing vocabulary is `changeset` / `change`. The product-layer ontology trajectory is split out as `docs/design/INTENT_GRAPH_SEMANTICS.md` and `docs/design/BEHAVIORAL_KERNELS.md`; broader synthesis lives in `docs/archive/design/INTENT_SPEC_EVOLUTION.md`. FE-705's branch-local strategy/proposal notes add scenario options, graph-review oracle, chat-local strategies, and concern/dependency mapping; those notes should become a canonical design doc when the branch is integrated. The dev-layer self-tooling trajectory lives in `docs/design/ln-skills/EVOLUTION.md`. 
## Sequencing @@ -328,7 +328,7 @@ The May 2026 intent-spec, multi-chat, changeset-ledger, prompt/context, and agen - **Acceptance:** Generated views preserve current planning ergonomics while reducing merge churn and cross-reference drift. - **Verification:** Deterministic generation checks and branch-conflict dry runs. - **Traceability:** dev-layer trajectory only; not product-layer ontology. -- **Design docs:** `docs/design/DEV_WORKFLOW_EVOLUTION.md`. +- **Design docs:** `docs/design/ln-skills/EVOLUTION.md`. ### portability-boundaries diff --git a/memory/SPEC.md b/memory/SPEC.md index 8bc60464..34a38bce 100644 --- a/memory/SPEC.md +++ b/memory/SPEC.md @@ -11,7 +11,7 @@ surfaces in `src/` schema and at runtime. The dev-workflow trajectory (the `ln-*` skill family, the proposed file-backed spec registry, and the long-horizon convergence between dev and - product ontologies) lives in `docs/design/DEV_WORKFLOW_EVOLUTION.md`. --> + product ontologies) lives in `docs/design/ln-skills/EVOLUTION.md`. --> # Brunch v2 — Spec Elicitation Tool From de8221e0c057ad3d8ae90b7c813dc197401113c2 Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Wed, 13 May 2026 14:50:35 +0200 Subject: [PATCH 40/42] document and synchronize policy WRT pre-release posture --- .agents/skills/ln-build/SKILL.md | 4 +++- .agents/skills/ln-review/SKILL.md | 2 ++ .agents/skills/ln-scope/SKILL.md | 2 ++ .agents/skills/ln-sync/SKILL.md | 2 ++ AGENTS.md | 10 ++++++++++ 5 files changed, 19 insertions(+), 1 deletion(-) diff --git a/.agents/skills/ln-build/SKILL.md b/.agents/skills/ln-build/SKILL.md index 510cc8bf..10ae39b4 100644 --- a/.agents/skills/ln-build/SKILL.md +++ b/.agents/skills/ln-build/SKILL.md @@ -70,7 +70,9 @@ Run the relevant checks. Confirm failures are meaningful. If the card is already ## Green -Write the minimum code to pass. Build inside-out: functional core first, thin I/O shell second, then end-to-end wiring. +Write the minimum coherent code to pass. 
Build inside-out: functional core first, thin I/O shell second, then end-to-end wiring. + +Honor the repo's pre-release posture: if the current schema, fixture shape, dummy data, or terminology is wrong for the model, change it and regenerate dependent artifacts rather than preserving accidental compatibility. Delete obsolete paths in the same slice when they are inside the active seam. No speculative abstractions. Only extract when two concrete cases force it. Do not anticipate later tests or build shape-only scaffolding; let the current behavioral test pull the interface into existence. diff --git a/.agents/skills/ln-review/SKILL.md b/.agents/skills/ln-review/SKILL.md index d27e45b8..6fefc8fb 100644 --- a/.agents/skills/ln-review/SKILL.md +++ b/.agents/skills/ln-review/SKILL.md @@ -8,6 +8,8 @@ argument-hint: "[area of codebase to review, or 'recent' for recently changed fi Explore the codebase. Surface structural improvement opportunities. Be opinionated. +Use the repo's pre-release posture: reward conceptual clarity over compatibility scaffolding, and treat unnecessary preservation as review debt. Look for stale code, obsolete fixtures, legacy terms, and compatibility paths that should be deleted rather than protected. + ## Input What to review: $ARGUMENTS diff --git a/.agents/skills/ln-scope/SKILL.md b/.agents/skills/ln-scope/SKILL.md index e09d351c..1befe791 100644 --- a/.agents/skills/ln-scope/SKILL.md +++ b/.agents/skills/ln-scope/SKILL.md @@ -13,6 +13,8 @@ Define **one** buildable scope card. The card always describes one slice, but it If the target behavior needs "and", split it. +Apply the repo's pre-release posture while scoping: prefer correcting the model and regenerating fixtures over preserving accidental compatibility, unless live docs or the user require migration support. Include deletion/retirement work in the slice when obsolete code, data, or terminology would otherwise linger. 
+ ## Input The behavior to deliver: $ARGUMENTS diff --git a/.agents/skills/ln-sync/SKILL.md b/.agents/skills/ln-sync/SKILL.md index 8d96b8f0..9fd8c993 100644 --- a/.agents/skills/ln-sync/SKILL.md +++ b/.agents/skills/ln-sync/SKILL.md @@ -9,6 +9,8 @@ Audit and refresh the canonical documents so they stay lightweight enough for fa `ln-sync` is the family-wide ontology repair and garbage-collection pass. Merge equivalent facts, repair stale references, and delete exhausted derivative artifacts. Only `docs/archive/PLAN_HISTORY.md` acts as archive history. +Apply the repo's pre-release posture: optimize canonical memory for the model we now believe in, not compatibility with stale docs. Retire superseded claims, delete obsolete derivative artifacts, and tighten lexicon drift instead of preserving historical aliases in active truth. + ## When to run Prefer `ln-sync` at these moments: diff --git a/AGENTS.md b/AGENTS.md index 405a78c7..f0ec17a8 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -31,6 +31,16 @@ PR descriptions are written only when tying off a branch — not during active d Use `git` for commits and reads (status, log, diff, add, commit). Use `gt` for stack-aware operations (create, submit, restack, move, track, checkout). Details and rationale in `docs/praxis/graphite-workflow.md`. +## development phase posture + +Brunch is pre-release. Optimize for conceptual correctness, domain clarity, and future leverage over backward compatibility with existing local/dev data. + +Do not preserve old data models, fixtures, dummy data, or compatibility shims merely because they exist. If a schema or domain model is wrong, change it and regenerate fixtures/seeds/tests as needed. Migration support is required only when SPEC.md, PLAN.md, or the user explicitly says existing data must be preserved. + +Be rigorous about deletion. Retire stale concepts, obsolete code paths, superseded docs, unused fixtures, and compatibility scaffolding once they no longer serve the current model. 
Keep the lexicon tight: prefer one canonical domain/conceptual term, update callers/docs/tests to match it, and remove aliases or legacy names when they stop carrying useful history. + +This is not permission for unrelated rewrites: keep changes scoped to the active seam, preserve accepted invariants, and verify behavior through the normal harness. + ## code organization Use a lightweight fractal sub-tree pattern when a file outgrows its current mini-library boundary. Keep the original file as the public entry point (for example, `context-pack.ts`) and place private implementation modules in a same-named folder (for example, `context-pack/observer-capture.ts`). External consumers should continue importing from the public root file; only that root file should import from its private sub-tree. Split along semantic purpose, not file shape, and avoid speculative folder scaffolding until the file has real pressure. From 1bcd5a7bb060248f5ec7495ac24cfa804a0a5e07 Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Wed, 13 May 2026 14:55:39 +0200 Subject: [PATCH 41/42] plan a restructuring of SPEC doc and template --- memory/SPEC_RESTRUCTURE.md | 175 +++++++++++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 memory/SPEC_RESTRUCTURE.md diff --git a/memory/SPEC_RESTRUCTURE.md b/memory/SPEC_RESTRUCTURE.md new file mode 100644 index 00000000..8c159187 --- /dev/null +++ b/memory/SPEC_RESTRUCTURE.md @@ -0,0 +1,175 @@ +# SPEC Restructure Plan + +> Status: proposed one-off workflow doc. +> Created: 2026-05-13. +> Purpose: capture the intended cleanup for `memory/SPEC.md` before splitting this work into a separate branch / PR. Delete this file after the restructure is completed or explicitly abandoned. + +## Goal + +Make `memory/SPEC.md` lighter, more structurally resistant to branch conflicts, and clearer about what belongs in the live architecture register versus historical/product-embedded truth. 
+ +The cleanup should preserve durable product/architecture authority while retiring rows that are already fully embedded in code, tests, or design docs. + +## Diagnosis + +`memory/SPEC.md` now mixes several kinds of truth in one long mutable document: + +1. **Stable product contract** — concept, non-goals, durable product requirements. +2. **Live uncertainty** — assumptions still awaiting validation or still shaping frontier work. +3. **Current architectural guardrails** — decisions and invariants that actively constrain near-term work. +4. **Historical embedded decisions** — shipped seams whose rationale is now code/test/design-doc truth. +5. **Future direction** — semantic/generative/agent/provider trajectories not yet productized. +6. **Verification policy and coverage** — useful, but partly over-detailed as implementation/test history. + +This creates churn because ordinary feature work edits the same numbered tables/sections, and because sequential IDs (`Requirement N`, `A##`, `D###`, `I###`) are collision-prone across branches. 
+ +## Desired document shape + +Target structure, to be refined during the cleanup: + +```md +# Brunch v2 — Spec Elicitation Tool + +## Product Contract +### Concept +### Constraints & Non-goals +### Capability Requirements +#### Runtime & persistence +#### Interview workflow +#### Knowledge / intent graph +#### Review & export +#### Workspace / graph UI +#### Provider / agent substrate + +## Live Architecture Register +### Open Assumptions +### Active Decisions +### Critical Invariants + +## Future Direction Register +### Semantic / generative substrate +### Agent capability substrate +### Provider / workspace hardening + +## Interaction Stream Model +[keep if still actively useful, but compress or move details to design docs] + +## Layout Architecture +[compress; move design-level detail out if it is no longer needed as SPEC authority] + +## Lexicon + +## Verification Design +``` + +Principles: + +- Separate **stable product contract** from **live architecture register** from **future direction**. +- Keep `SPEC.md` as the authority for active constraints, not as the full archive of how each seam was built. +- Prefer short guardrails plus links to design docs over long design-doc-scale paragraphs. +- Do not renumber surviving tracked IDs unless the cleanup explicitly adopts a new ID scheme. +- Leave concise retirement comments for removed ID ranges when useful. 
+ +## Assessment pass + +Classify each tracked row before editing: + +| Classification | Meaning | Action | +| --- | --- | --- | +| keep live | Still unresolved or actively constrains near-term work | Keep, possibly tighten wording | +| compress / merge | Overlaps another row or carries too much rationale | Merge into one active guardrail | +| retire embedded | Fully shipped and now protected by code/tests/design docs | Remove from live table; optionally note retired IDs in an HTML comment | +| move rationale | Valuable context but too detailed for SPEC | Keep a short SPEC guardrail and point to design doc | +| future direction | Not current product contract but shapes frontier work | Move under Future Direction Register or ensure PLAN owns it | + +### Assumptions to inspect first + +Strong candidates: + +- `A82`, `A83` — already validated; likely retire from live assumptions unless still needed as FE-701 constraints. + +Possible embedded/product-fact candidates: + +- `A51`, `A53`, `A54`, `A55` — workspace turn-card / activity / frontier projection assumptions may now be product facts or invariants. +- `A59`, `A60`, `A63` — prompt/question/header assumptions may be embedded or lower-priority watch items. +- `A64` — query invalidation may have become a concrete architectural decision/invariant if already built. +- `A66`–`A70` — graph/relation assumptions should be checked against shipped graph view and FE-700 direction. +- `A71`–`A73`, `A77`–`A81`, `A84`–`A91`, `A93` — likely still live future/semantic/generative assumptions; may move to Future Direction Register. + +### Decisions to inspect first + +Potential merge/compression clusters: + +- Runtime / stream / workflow cluster: + - `D22`, `D89`, `D93`, `D94`, `D95`, `D96`, `D110`, `D112`, `D113`, `D116`, `D121`, `D123`, `D114` + - Goal: compress overlapping turn-centered stream, projected controls, lifecycle, observer backlog, route/query ownership, and continuous workspace guardrails. 
+ +- Graph / side-chat / semantic mutation cluster: + - `D80`, `D125`, `D134`, `D135`, `D136`, `D137`, `D138`, `D144`, `D145`, `D146`, `D149`, `D150`, `D152` + - Goal: keep current semantic direction and active changeset/reconciliation guardrails; retire or compress older side-chat/revisit wording superseded by multi-chat + reconciliation docs. + +- Prompt/context / agent capability cluster: + - `D139`, `D140`, `D141`, `D142`, `D143`, `D147` + - Goal: keep concise active guardrails for prompt/context substrate and Brunch-owned mutation surface; move implementation boundary detail to design docs where possible. + +- Candidate/scenario strategy cluster: + - `D126`, `D127`, `D148`, `D151` + - Goal: separate current product contract from future strategy/proposal direction. + +- Provider/workspace hardening cluster: + - `D130`, `D131`, `D132`, `D133` + - Goal: likely keep as active near-term frontier constraints; wording can be shorter. + +### Invariants to inspect first + +Keep only critical seam-level invariants live. + +Candidates to compress or retire: + +- Rows that primarily enumerate test filenames or implementation history rather than a reusable invariant. +- Older invariants whose protected behavior is fully covered by a broader newer invariant. +- Planned invariants for not-yet-built future work should be checked against `memory/PLAN.md`; if they only describe future acceptance criteria, PLAN may be the better home until implemented. + +Likely keep live: + +- Distribution/runtime startup invariants (`I4`, `I100`). +- Boundary/schema invariants (`I17`, `I48`, `I54`). +- Workflow/turn/lifecycle invariants (`I24`, `I72`, `I87`, `I104`, `I105`, `I108`, `I110`). +- Current frontier invariants for provider/gitignore/agent/changing semantic substrate (`I106` onward), if they still correspond to active PLAN frontier items. + +## Rewrite pass + +1. Create a branch specifically for SPEC restructuring. +2. 
Read `memory/SPEC.md`, `memory/PLAN.md`, and current design docs named by SPEC rows. +3. Classify rows using the assessment table above. +4. Rewrite `SPEC.md` into the target structure. +5. Preserve cross-reference integrity: + - `PLAN.md` frontier definitions still point at surviving SPEC requirements/assumptions/decisions/invariants. + - Retired IDs are not referenced by live PLAN frontier definitions unless intentionally historical. + - Design docs carry detailed rationale that SPEC no longer repeats. +6. Run link/reference checks if available, then `npm run fix` and `npm run verify` before PR. + +## Output expectations + +The completed PR should include: + +- `memory/SPEC.md` rewritten / pruned. +- Any necessary small updates to `memory/PLAN.md` traceability references caused by retired/merged SPEC rows. +- Optional updates to `ln-spec` / `ln-sync` instructions **only if** the restructure changes the intended SPEC shape. +- Deletion of this `memory/SPEC_RESTRUCTURE.md` file once its plan has been executed or superseded. + +## Non-goals + +- Do not change product behavior. +- Do not add new requirements just because there is a new section for them. +- Do not migrate to a structured generated spec registry in this pass; that remains `structured-development-spec-registry` horizon work. +- Do not rewrite design docs unless a SPEC row is moved there and the target doc needs a small anchor. +- Do not renumber surviving IDs casually. + +## Open design questions for the restructure branch + +1. Should requirements remain a single numbered sequence, or should they become grouped stable IDs by capability area? +2. Should assumptions/decisions/invariants stay as global tables/lists, or be grouped by subsystem to reduce edit conflicts? +3. Should validated assumptions be removed immediately, or retained for one release window with a retirement note? +4. 
How much of Interaction Stream Model and Layout Architecture still belongs in SPEC versus `docs/design/CONVERSATIONAL_WORKSPACE_RUNTIME.md` and related design docs? +5. Should future direction rows live in SPEC at all, or should SPEC only link to PLAN frontier definitions and design docs for unbuilt future work? From b7a6f9edaa3324a6671091f1c0c1153fe052b08f Mon Sep 17 00:00:00 2001 From: Lu Nelson Date: Fri, 15 May 2026 09:51:35 +0200 Subject: [PATCH 42/42] FE-705: Address agent capability review comments --- scripts/agent-probes/probe-runner.test.ts | 36 ++++++++- scripts/agent-probes/probe-runner.ts | 34 ++++++-- src/server/agent-jsonl.test.ts | 45 +++++++++-- src/server/agent-jsonl.ts | 3 + src/server/capabilities.test.ts | 95 +++++++++++++++++++++++ src/server/capabilities.ts | 33 ++++++-- src/server/capability-registry.test.ts | 2 +- src/server/capability-registry.ts | 2 +- src/server/cli.ts | 2 +- 9 files changed, 230 insertions(+), 22 deletions(-) diff --git a/scripts/agent-probes/probe-runner.test.ts b/scripts/agent-probes/probe-runner.test.ts index 806572bf..249f61b0 100644 --- a/scripts/agent-probes/probe-runner.test.ts +++ b/scripts/agent-probes/probe-runner.test.ts @@ -455,6 +455,27 @@ describe('probe runner', () => { workspace: { cwd: spawnedCwds[0], preservedStatePath: null }, }); expect(existsSync(join(outputDir, 'workspace-state'))).toBe(false); + expect(existsSync(result.workspaceCwd ?? 
'')).toBe(false); + }); + + it('cleans up the temp workspace when process startup fails', async () => { + const outputDir = makeTempDir('brunch-probe-output-'); + let workspaceCwd: string | null = null; + + await expect( + runProcessBackedProbe({ + scenario: { name: 'startup-failure', specName: 'Startup failure' }, + scriptedAnswers: [], + outputDir, + spawnProcess({ cwd }) { + workspaceCwd = cwd; + throw new Error('spawn failed'); + }, + }), + ).rejects.toThrow('spawn failed'); + + expect(workspaceCwd).toContain('brunch-probe-workspace-'); + expect(existsSync(workspaceCwd ?? '')).toBe(false); }); it('writes sanitized process-backed failure artifacts when JSONL protocol interaction fails', async () => { @@ -485,10 +506,21 @@ describe('probe runner', () => { }); const summary = JSON.parse(readFileSync(join(outputDir, 'summary.json'), 'utf8')) as unknown; - const bundle = JSON.parse(readFileSync(join(outputDir, 'artifact-bundle.json'), 'utf8')) as unknown; + const artifactBundle = readFileSync(join(outputDir, 'artifact-bundle.json'), 'utf8'); + const bundle = JSON.parse(artifactBundle) as unknown; const rawJsonl = readFileSync(join(outputDir, 'raw-jsonl.ndjson'), 'utf8'); expect(result.summary.turnsAnswered).toBe(0); + expect(result.responses).toEqual([ + { + id: 'create', + ok: false, + error: { + code: 'protocol_error', + message: 'Unmatched id:null response: ANTHROPIC_API_KEY=[redacted] bad envelope', + }, + }, + ]); expect(result.errors).toEqual([ { requestId: 'create', @@ -521,6 +553,8 @@ describe('probe runner', () => { }); expect(rawJsonl).toContain('"direction":"request"'); expect(rawJsonl).toContain('"direction":"response"'); + expect(rawJsonl).not.toContain('sk-secret'); + expect(artifactBundle).not.toContain('sk-secret'); }); it('can preserve the temp workspace .brunch state into the artifact directory', async () => { diff --git a/scripts/agent-probes/probe-runner.ts b/scripts/agent-probes/probe-runner.ts index ae846de2..35f45393 100644 --- 
a/scripts/agent-probes/probe-runner.ts +++ b/scripts/agent-probes/probe-runner.ts @@ -1,5 +1,5 @@ import { spawn } from 'node:child_process'; -import { cpSync, existsSync, mkdirSync, mkdtempSync, writeFileSync } from 'node:fs'; +import { cpSync, existsSync, mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs'; import { tmpdir } from 'node:os'; import { join, resolve } from 'node:path'; @@ -182,10 +182,11 @@ export async function runProcessBackedProbe({ turnBudget, }: ProcessBackedProbeOptions): Promise { const workspaceCwd = mkdtempSync(join(tmpdir(), 'brunch-probe-workspace-')); - const spawned = spawnProcess({ cwd: workspaceCwd, command, args, env }); - const transport = createProcessJsonlTransport(spawned); + let spawned: SpawnedJsonlProcess | null = null; try { + spawned = spawnProcess({ cwd: workspaceCwd, command, args, env }); + const transport = createProcessJsonlTransport(spawned); const result = await runScriptedProbe({ transport, scenario, @@ -201,7 +202,8 @@ export async function runProcessBackedProbe({ writeProbeArtifacts(outputDir, result); return result; } finally { - spawned.endStdin(); + spawned?.endStdin(); + rmSync(workspaceCwd, { recursive: true, force: true }); } } @@ -417,7 +419,7 @@ async function sendExpectingOutput( request: ProbeJsonlRequest, ): Promise { state.requests.push(request); - const response = await transport.send(request); + const response = sanitizeProbeJsonlResponse(await transport.send(request)); state.responses.push(response); if (!response.ok) { @@ -501,7 +503,7 @@ function sanitizeProbeErrorMessage(message: string): string { export function buildProbeArtifactBundle(result: ProbeRunResult): ProbeArtifactBundle { const rawJsonlTranscript = result.requests.flatMap((request, index) => [ { direction: 'request' as const, payload: request }, - { direction: 'response' as const, payload: result.responses[index] ?? null }, + { direction: 'response' as const, payload: sanitizeJsonlResponse(result.responses[index] ?? 
null) }, ]); return { @@ -520,7 +522,7 @@ export function buildProbeArtifactBundle(result: ProbeRunResult): ProbeArtifactB parsedEvents: result.requests.map((request, index) => ({ index, request, - response: result.responses[index] ?? null, + response: sanitizeJsonlResponse(result.responses[index] ?? null), })), finalChat: result.finalChat, summary: result.summary, @@ -530,6 +532,24 @@ export function buildProbeArtifactBundle(result: ProbeRunResult): ProbeArtifactB }; } +function sanitizeProbeJsonlResponse(response: ProbeJsonlResponse): ProbeJsonlResponse { + if (response.ok) { + return response; + } + + return { + ...response, + error: { + ...response.error, + message: sanitizeProbeErrorMessage(response.error.message), + }, + }; +} + +function sanitizeJsonlResponse(response: ProbeJsonlResponse | null): ProbeJsonlResponse | null { + return response ? sanitizeProbeJsonlResponse(response) : null; +} + function writeProbeArtifacts(outputDir: string, result: ProbeRunResult): void { mkdirSync(outputDir, { recursive: true }); const bundle = buildProbeArtifactBundle(result); diff --git a/src/server/agent-jsonl.test.ts b/src/server/agent-jsonl.test.ts index db78cfcb..4b658575 100644 --- a/src/server/agent-jsonl.test.ts +++ b/src/server/agent-jsonl.test.ts @@ -27,7 +27,12 @@ describe('agent JSONL session', () => { return db; } - async function runSession(lines: string[]) { + async function runSession( + lines: string[], + options: Partial< + Pick[0], 'generateAnswerableFrontier' | 'projectCwd'> + > = {}, + ) { const input = new PassThrough(); const output = new PassThrough(); const chunks: string[] = []; @@ -37,10 +42,13 @@ describe('agent JSONL session', () => { db: createTempDb(), input, output, - generateAnswerableFrontier: async () => ({ - question: 'What are you trying to build?', - assistantParts: [{ type: 'text', text: 'What are you trying to build?' }], - }), + generateAnswerableFrontier: + options.generateAnswerableFrontier ?? 
+ (async () => ({ + question: 'What are you trying to build?', + assistantParts: [{ type: 'text', text: 'What are you trying to build?' }], + })), + projectCwd: options.projectCwd, }); for (const line of lines) { input.write(`${line}\n`); @@ -146,6 +154,33 @@ describe('agent JSONL session', () => { ]); }); + it('passes project cwd into brownfield chat readiness generation', async () => { + const generationInputs: Array<{ modeOptions?: unknown }> = []; + + await runSession( + [ + JSON.stringify({ + id: 'create-1', + capability: 'spec.create', + input: { name: 'JSONL brownfield spec', mode: 'brownfield' }, + }), + JSON.stringify({ id: 'ready-1', capability: 'chat.ensureReady', input: { chatId: 1 } }), + ], + { + projectCwd: '/workspace/brunch', + generateAnswerableFrontier: async (input) => { + generationInputs.push({ modeOptions: input.modeOptions }); + return { + question: 'What are you trying to understand?', + assistantParts: [{ type: 'text', text: 'What are you trying to understand?' }], + }; + }, + }, + ); + + expect(generationInputs).toEqual([{ modeOptions: { mode: 'brownfield', cwd: '/workspace/brunch' } }]); + }); + it('submits a turn response and reads the answered turn over JSONL', async () => { const responses = await runSession([ JSON.stringify({ id: 'create-1', capability: 'spec.create', input: { name: 'JSONL response spec' } }), diff --git a/src/server/agent-jsonl.ts b/src/server/agent-jsonl.ts index 9fbd0fdd..5398fa6a 100644 --- a/src/server/agent-jsonl.ts +++ b/src/server/agent-jsonl.ts @@ -21,6 +21,7 @@ export interface AgentJsonlSessionOptions { input: Readable; output: Writable; generateAnswerableFrontier?: GenerateAnswerableFrontier; + projectCwd?: string; } type AgentJsonlResponse = @@ -51,6 +52,7 @@ export async function runAgentJsonlSession({ input, output, generateAnswerableFrontier, + projectCwd, }: AgentJsonlSessionOptions): Promise { const lines = createInterface({ input, crlfDelay: Infinity }); @@ -84,6 +86,7 @@ export async function 
runAgentJsonlSession({ capability: parsedRequest.data.capability, input: parsedRequest.data.input, generateAnswerableFrontier, + projectCwd, }); writeResponse(output, { id: parsedRequest.data.id, ok: true, output: result }); } catch (error) { diff --git a/src/server/capabilities.test.ts b/src/server/capabilities.test.ts index bc0e5bff..b0e84ec3 100644 --- a/src/server/capabilities.test.ts +++ b/src/server/capabilities.test.ts @@ -2,11 +2,13 @@ import { mkdtempSync, rmSync } from 'node:fs'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; +import { eq } from 'drizzle-orm'; import { afterEach, describe, expect, it, vi } from 'vitest'; import { dispatchCapability } from './capabilities.js'; import { advanceHead, + createConfirmedPhaseOutcome, createDb, createTurn, getActivePath, @@ -15,6 +17,7 @@ import { listSpecifications, type DB, } from './db.js'; +import * as schema from './schema.js'; describe('agent capabilities', () => { const tempDirs: string[] = []; @@ -148,6 +151,63 @@ describe('agent capabilities', () => { }); }); + it('reports the first open workflow phase when a chat has no active frontier', async () => { + const activeDb = createTempDb(); + const created = await dispatchCapability({ + db: activeDb, + capability: 'spec.create', + input: { name: 'Idle requirements spec' }, + }); + const groundingTurn = createTurn(activeDb, created.specId, { + parent_turn_id: null, + phase: 'grounding', + question: 'What are you trying to build?', + answer: 'A product planning tool', + }); + advanceHead(activeDb, created.specId, groundingTurn.id); + createConfirmedPhaseOutcome(activeDb, { + specificationId: created.specId, + phase: 'grounding', + proposal_turn_id: groundingTurn.id, + confirmation_turn_id: groundingTurn.id, + summary: 'Grounding closed.', + }); + const designTurn = createTurn(activeDb, created.specId, { + parent_turn_id: groundingTurn.id, + phase: 'design', + question: 'What should the design emphasize?', + answer: 'Agent-facing workflow 
affordances', + }); + advanceHead(activeDb, created.specId, designTurn.id); + createConfirmedPhaseOutcome(activeDb, { + specificationId: created.specId, + phase: 'design', + proposal_turn_id: designTurn.id, + confirmation_turn_id: designTurn.id, + summary: 'Design closed.', + }); + const primary = await dispatchCapability({ + db: activeDb, + capability: 'chat.getPrimary', + input: { specId: created.specId }, + }); + activeDb + .update(schema.chat) + .set({ active_turn_id: null }) + .where(eq(schema.chat.id, primary.chatId)) + .run(); + + await expect( + dispatchCapability({ + db: activeDb, + capability: 'chat.read', + input: { chatId: primary.chatId }, + }), + ).resolves.toMatchObject({ + frontier: { state: 'idle_no_frontier', phase: 'requirements', turnId: null }, + }); + }); + it('dispatches chat.ensureReady by generating an answerable frontier', async () => { const activeDb = createTempDb(); const generateAnswerableFrontier = vi.fn(async () => ({ @@ -311,6 +371,41 @@ describe('agent capabilities', () => { }); }); + it('accepts legacy spec-owned turns that predate chat id backfill', async () => { + const activeDb = createTempDb(); + const created = await dispatchCapability({ + db: activeDb, + capability: 'spec.create', + input: { name: 'Legacy turn owner' }, + }); + const primary = await dispatchCapability({ + db: activeDb, + capability: 'chat.getPrimary', + input: { specId: created.specId }, + }); + const turn = createTurn(activeDb, created.specId, { + parent_turn_id: null, + phase: 'grounding', + question: 'What are you trying to build?', + answer: null, + }); + advanceHead(activeDb, created.specId, turn.id); + activeDb.update(schema.turn).set({ chat_id: null }).where(eq(schema.turn.id, turn.id)).run(); + + await expect( + dispatchCapability({ + db: activeDb, + capability: 'turn.submitResponse', + input: { + chatId: primary.chatId, + turnId: turn.id, + response: { kind: 'free-text', freeText: 'A migrated pre-chat turn' }, + }, + }), + ).resolves.toMatchObject({ 
response: { ok: true } }); + expect(getTurn(activeDb, turn.id)?.answer).toBe('A migrated pre-chat turn'); + }); + it('rejects turn.submitResponse for turns outside the explicit chat', async () => { const activeDb = createTempDb(); const first = await dispatchCapability({ diff --git a/src/server/capabilities.ts b/src/server/capabilities.ts index 42d0e65a..9d674b77 100644 --- a/src/server/capabilities.ts +++ b/src/server/capabilities.ts @@ -4,13 +4,14 @@ import { z } from 'zod'; import { submitTurnResponseRequestSchema } from '@/shared/api-types.js'; import { extractTextFromMessage, structuredQuestionSchema, type BrunchUIMessage } from '@/shared/chat.js'; +import { getCurrentWorkflowPhase } from '@/shared/phase-close.js'; import { getCapabilityContract, type CapabilityId } from './capability-registry.js'; import { applyChatRouteTransition } from './chat-route-transition.js'; import { createNewSpecification, finalizeTurn, getSpecificationState, type TurnWithOptions } from './core.js'; import type { DB, Turn } from './db.js'; import { getTurn, updateTurn } from './db.js'; -import { persistFallbackQuestionText, streamInterviewer } from './interview.js'; +import { persistFallbackQuestionText, streamInterviewer, type InterviewerModeOptions } from './interview.js'; import { serializeParts, type AssistantPart } from './parts.js'; import * as schema from './schema.js'; import { materializeTurnArtifacts } from './turn-artifacts.js'; @@ -72,6 +73,7 @@ export interface GenerateAnswerableFrontierInput { turn: Turn; activePath: TurnWithOptions[]; userMessage: string; + modeOptions?: InterviewerModeOptions; } export type GenerateAnswerableFrontier = ( @@ -81,6 +83,7 @@ export type GenerateAnswerableFrontier = ( export interface CapabilityDispatchContext { db: DB; generateAnswerableFrontier?: GenerateAnswerableFrontier; + projectCwd?: string; } export interface DispatchCapabilityInput extends CapabilityDispatchContext { @@ -256,9 +259,10 @@ async function 
generateAnswerableFrontierWithInterviewer({ turn, activePath, userMessage, + modeOptions, }: GenerateAnswerableFrontierInput): Promise { const startedAt = Date.now(); - const interviewer = await streamInterviewer(db, turn, activePath, userMessage, turn.phase); + const interviewer = await streamInterviewer(db, turn, activePath, userMessage, turn.phase, modeOptions); const stream = interviewer.toUIMessageStream({ sendReasoning: true, sendFinish: false, @@ -326,7 +330,7 @@ function readChatFromCapability(db: DB, input: ChatReadInput) { throw new CapabilityDispatchError(`Specification ${chat.specification_id} not found`, 'handler_failed'); } - const currentPhase = state.workflow.phases.grounding.status === 'closed' ? 'design' : 'grounding'; + const currentPhase = getCurrentWorkflowPhase(state.workflow); const activeTurn = state.turns.find((turn) => turn.id === chat.active_turn_id) ?? null; const frontier = activeTurn ? { state: getReadyStateForTurn(activeTurn), phase: activeTurn.phase, turnId: activeTurn.id } @@ -373,7 +377,9 @@ function submitTurnResponseFromCapability(db: DB, input: TurnSubmitResponseInput if (!turn) { throw new CapabilityDispatchError(`Turn ${input.turnId} not found`, 'handler_failed'); } - if (turn.chat_id !== chat.id || turn.specification_id !== chat.specification_id) { + const belongsToChat = turn.chat_id === chat.id; + const belongsToLegacySpecChat = turn.chat_id === null && turn.specification_id === chat.specification_id; + if ((!belongsToChat && !belongsToLegacySpecChat) || turn.specification_id !== chat.specification_id) { throw new CapabilityDispatchError( `Turn ${input.turnId} does not belong to chat ${input.chatId}`, 'handler_failed', @@ -403,7 +409,10 @@ function submitTurnResponseFromCapability(db: DB, input: TurnSubmitResponseInput async function ensureChatReadyFromCapability( db: DB, input: ChatEnsureReadyInput, - generateAnswerableFrontier: GenerateAnswerableFrontier = generateAnswerableFrontierWithInterviewer, + { + 
generateAnswerableFrontier = generateAnswerableFrontierWithInterviewer, + projectCwd, + }: Pick = {}, ) { const chat = getChatById(db, input.chatId); if (!chat) { @@ -415,6 +424,10 @@ async function ensureChatReadyFromCapability( throw new CapabilityDispatchError(`Specification ${chat.specification_id} not found`, 'handler_failed'); } + const modeOptions = + state.specification.mode === 'brownfield' && projectCwd + ? { mode: 'brownfield' as const, cwd: projectCwd } + : undefined; const activeTurn = state.turns.find((turn) => turn.id === chat.active_turn_id) ?? null; if (activeTurn) { const activeState = getReadyStateForTurn(activeTurn); @@ -438,6 +451,7 @@ async function ensureChatReadyFromCapability( turn: persistedActiveTurn, activePath: state.turns, userMessage: INITIAL_INTERVIEWER_PROMPT, + modeOptions, }); await persistGeneratedAnswerableFrontier(db, persistedActiveTurn, generated); @@ -473,6 +487,7 @@ async function ensureChatReadyFromCapability( turn: transition.prepared.turn, activePath: transition.prepared.activePath, userMessage: answeredText, + modeOptions, }); await persistGeneratedAnswerableFrontier(db, transition.prepared.turn, generated); @@ -521,6 +536,7 @@ async function ensureChatReadyFromCapability( turn: transition.prepared.turn, activePath: transition.prepared.activePath, userMessage: INITIAL_INTERVIEWER_PROMPT, + modeOptions, }); await persistGeneratedAnswerableFrontier(db, transition.prepared.turn, generated); @@ -558,6 +574,7 @@ export function dispatchCapability(input: { capability: 'chat.ensureReady'; input: unknown; generateAnswerableFrontier?: GenerateAnswerableFrontier; + projectCwd?: string; }): Promise; export function dispatchCapability(input: { db: DB; @@ -570,6 +587,7 @@ export async function dispatchCapability({ capability, input, generateAnswerableFrontier, + projectCwd, }: DispatchCapabilityInput): Promise { assertExecutableCapability(capability); @@ -590,7 +608,10 @@ export async function dispatchCapability({ } if (capability 
=== 'chat.ensureReady') { - return ensureChatReadyFromCapability(db, parseChatEnsureReadyInput(input), generateAnswerableFrontier); + return ensureChatReadyFromCapability(db, parseChatEnsureReadyInput(input), { + generateAnswerableFrontier, + projectCwd, + }); } if (capability === 'turn.submitResponse') { diff --git a/src/server/capability-registry.test.ts b/src/server/capability-registry.test.ts index 35de001a..21629a7e 100644 --- a/src/server/capability-registry.test.ts +++ b/src/server/capability-registry.test.ts @@ -67,7 +67,7 @@ describe('capability registry', () => { }), expect.objectContaining({ id: 'chat.ensureReady', - authority: 'runtime_replay', + authority: 'commit_truth', inputSchema: 'chat.ensureReady.input.v1', outputSchema: 'chat.ensureReady.output.v1', }), diff --git a/src/server/capability-registry.ts b/src/server/capability-registry.ts index ac50b20f..eff042bc 100644 --- a/src/server/capability-registry.ts +++ b/src/server/capability-registry.ts @@ -134,7 +134,7 @@ const capabilityContracts = [ }, { id: 'chat.ensureReady', - authority: 'runtime_replay', + authority: 'commit_truth', summary: 'Ensure an explicit chat has an answerable generated frontier.', inputSchema: 'chat.ensureReady.input.v1', outputSchema: 'chat.ensureReady.output.v1', diff --git a/src/server/cli.ts b/src/server/cli.ts index 2b6dbf86..f879eb00 100644 --- a/src/server/cli.ts +++ b/src/server/cli.ts @@ -25,7 +25,7 @@ if (args.has('--help') || args.has('-h') || args.has('help')) { if (rawArgs[0] === 'agent') { const project = resolveBrunchProject(launchCwd); const db = createDb(project.dbPath); - runAgentJsonlSession({ db, input: process.stdin, output: process.stdout }) + runAgentJsonlSession({ db, input: process.stdin, output: process.stdout, projectCwd: project.cwd }) .then(() => { db.$client.close(); })