diff --git a/plugins/codex/commands/adversarial-review.md b/plugins/codex/commands/adversarial-review.md index da440ab4..bbe9b945 100644 --- a/plugins/codex/commands/adversarial-review.md +++ b/plugins/codex/commands/adversarial-review.md @@ -1,6 +1,6 @@ --- description: Run a Codex review that challenges the implementation approach and design choices -argument-hint: '[--wait|--background] [--base ] [--scope auto|working-tree|branch] [focus ...]' +argument-hint: '[--wait|--background] [--base ] [--scope auto|working-tree|branch] [--max-investigation-turns N] [focus ...]' disable-model-invocation: true allowed-tools: Read, Glob, Grep, Bash(node:*), Bash(git:*), AskUserQuestion --- @@ -43,6 +43,7 @@ Argument handling: - It supports working-tree review, branch review, and `--base `. - It does not support `--scope staged` or `--scope unstaged`. - Unlike `/codex:review`, it can still take extra focus text after the flags. +- For very large diffs that exceed the inline threshold, Codex investigates the diff with read-only commands across multiple turns. Use `--max-investigation-turns N` (default 10) to raise or lower the cap. Foreground flow: - Run: diff --git a/plugins/codex/prompts/adversarial-review-finalize.md b/plugins/codex/prompts/adversarial-review-finalize.md new file mode 100644 index 00000000..c4d0097b --- /dev/null +++ b/plugins/codex/prompts/adversarial-review-finalize.md @@ -0,0 +1,54 @@ + +You have just completed an investigation of a code change. Now produce the structured adversarial review. + + + +Based on your investigation in the prior turns of this thread, write up your findings as a structured review. +Target: {{TARGET_LABEL}} +User focus: {{USER_FOCUS}} + + + +Report only material findings. +Do not include style feedback, naming feedback, low-value cleanup, or speculative concerns without evidence. +A finding should answer: +1. What can go wrong? +2. Why is this code path vulnerable? +3. What is the likely impact? +4. What concrete change would reduce the risk? + + + +This is the finalization turn. Do NOT run any shell commands or tool calls in this turn — your investigation is already complete and you have all the context you need from the prior turns of this thread. +Return only valid JSON matching the provided schema. Your entire output must be that JSON — no prose before or after, no shell commands, no tool-call payloads. +Keep the output compact and specific. +Use `needs-attention` if there is any material risk worth blocking on. +Use `approve` only if you cannot support any substantive adversarial finding from your investigation. +Every finding must include: +- the affected file +- `line_start` and `line_end` +- a confidence score from 0 to 1 +- a concrete recommendation +Write the summary like a terse ship/no-ship assessment, not a neutral recap. + + + +Be aggressive, but stay grounded. +Every finding must be defensible from what you read during the investigation. +Do not invent files, lines, code paths, incidents, attack chains, or runtime behavior you cannot support. +If a conclusion depends on an inference, state that explicitly in the finding body and keep the confidence honest. + + + +Prefer one strong finding over several weak ones. +Do not dilute serious issues with filler. +If the change looks safe, say so directly and return no findings. + + + +Before finalizing, check that each finding is: +- adversarial rather than stylistic +- tied to a concrete code location +- plausible under a real failure scenario +- actionable for an engineer fixing the issue + diff --git a/plugins/codex/prompts/adversarial-review-investigate.md b/plugins/codex/prompts/adversarial-review-investigate.md new file mode 100644 index 00000000..54d6dcc9 --- /dev/null +++ b/plugins/codex/prompts/adversarial-review-investigate.md @@ -0,0 +1,48 @@ + +You are Codex performing an adversarial software review. +Your job is to break confidence in the change, not to validate it. +This is the investigation phase: gather evidence with read-only commands before producing any structured output. + + + +Investigate the change so you can later produce a confident adversarial assessment. +Target: {{TARGET_LABEL}} +User focus: {{USER_FOCUS}} + + + +Default to skepticism. +Assume the change can fail in subtle, high-cost, or user-visible ways until the evidence says otherwise. +Do not give credit for good intent, partial fixes, or likely follow-up work. +If something only works on the happy path, treat that as a real weakness. + + + +Prioritize the kinds of failures that are expensive, dangerous, or hard to detect: +- auth, permissions, tenant isolation, and trust boundaries +- data loss, corruption, duplication, and irreversible state changes +- rollback safety, retries, partial failure, and idempotency gaps +- race conditions, ordering assumptions, stale state, and re-entrancy +- empty-state, null, timeout, and degraded dependency behavior +- version skew, schema drift, migration hazards, and compatibility regressions +- observability gaps that would hide failure or make recovery harder + + + +Use read-only shell commands to inspect the diff and the surrounding code. +Useful starting points: `git diff`, `git log`, `git show`, `git blame`, `cat`, `rg`/`grep`. +Read the changed files, follow references, and confirm or refute hypotheses with evidence from the code. +Do not modify any files. Your sandbox is read-only. +{{REVIEW_COLLECTION_GUIDANCE}} + + + +Continue investigating until you can defend a confident adversarial assessment. +When you have seen enough, emit a brief summary message describing what you found and stop running commands. +A summary message with no further command calls signals that you are ready for the finalization phase. +Do not produce a structured review yet — that comes in the next phase. + + + +{{REVIEW_INPUT}} + diff --git a/plugins/codex/prompts/adversarial-review.md b/plugins/codex/prompts/adversarial-review.md index 78668af6..036e66f7 100644 --- a/plugins/codex/prompts/adversarial-review.md +++ b/plugins/codex/prompts/adversarial-review.md @@ -46,7 +46,8 @@ A finding should answer: -Return only valid JSON matching the provided schema. +This is a single-turn review. Do NOT run any shell commands or tool calls — the file contents you need are already embedded in the repository_context block below. Do not emit a tool-use stub like `{"cmd": "..."}` instead of the review; that is not the schema. +Return only valid JSON matching the provided schema. Your entire output must be that JSON — no prose before or after, no shell commands, no tool-call payloads. Keep the output compact and specific. Use `needs-attention` if there is any material risk worth blocking on. Use `approve` only if you cannot support any substantive adversarial finding from the provided context. diff --git a/plugins/codex/scripts/codex-companion.mjs b/plugins/codex/scripts/codex-companion.mjs index 35222fd5..78e65c3d 100644 --- a/plugins/codex/scripts/codex-companion.mjs +++ b/plugins/codex/scripts/codex-companion.mjs @@ -17,6 +17,7 @@ import { interruptAppServerTurn, parseStructuredOutput, readOutputSchema, + runAppServerInvestigation, runAppServerReview, runAppServerTurn } from "./lib/codex.mjs"; @@ -246,6 +247,24 @@ function buildAdversarialReviewPrompt(context, focusText) { }); } +function buildAdversarialInvestigatePrompt(context, focusText) { + const template = loadPromptTemplate(ROOT_DIR, "adversarial-review-investigate"); + return interpolateTemplate(template, { + TARGET_LABEL: context.target.label, + USER_FOCUS: focusText || "No extra focus provided.", + REVIEW_COLLECTION_GUIDANCE: context.collectionGuidance, + REVIEW_INPUT: context.content + }); +} + +function buildAdversarialFinalizePrompt(context, focusText) { + const template = loadPromptTemplate(ROOT_DIR, "adversarial-review-finalize"); + return interpolateTemplate(template, { + TARGET_LABEL: context.target.label, + USER_FOCUS: focusText || "No extra focus provided." + }); +} + function ensureCodexAvailable(cwd) { const availability = getCodexAvailability(cwd); if (!availability.available) { @@ -404,14 +423,29 @@ async function executeReviewRun(request) { } const context = collectReviewContext(request.cwd, target); - const prompt = buildAdversarialReviewPrompt(context, focusText); - const result = await runAppServerTurn(context.repoRoot, { - prompt, - model: request.model, - sandbox: "read-only", - outputSchema: readOutputSchema(REVIEW_SCHEMA), - onProgress: request.onProgress - }); + let result; + if (context.inputMode === "self-collect") { + const investigatePrompt = buildAdversarialInvestigatePrompt(context, focusText); + const finalizePrompt = buildAdversarialFinalizePrompt(context, focusText); + result = await runAppServerInvestigation(context.repoRoot, { + investigatePrompt, + finalizePrompt, + outputSchema: readOutputSchema(REVIEW_SCHEMA), + model: request.model, + sandbox: "read-only", + maxInvestigationTurns: request.maxInvestigationTurns, + onProgress: request.onProgress + }); + } else { + const prompt = buildAdversarialReviewPrompt(context, focusText); + result = await runAppServerTurn(context.repoRoot, { + prompt, + model: request.model, + sandbox: "read-only", + outputSchema: readOutputSchema(REVIEW_SCHEMA), + onProgress: request.onProgress + }); + } const parsed = parseStructuredOutput(result.finalMessage, { status: result.status, failureMessage: result.error?.message ?? result.stderr @@ -436,6 +470,9 @@ async function executeReviewRun(request) { parseError: parsed.parseError, reasoningSummary: result.reasoningSummary }; + if (result.investigation) { + payload.investigation = result.investigation; + } return { exitStatus: result.status, @@ -445,7 +482,8 @@ async function executeReviewRun(request) { rendered: renderReviewResult(parsed, { reviewLabel: reviewName, targetLabel: context.target.label, - reasoningSummary: result.reasoningSummary + reasoningSummary: result.reasoningSummary, + investigation: result.investigation ?? null }), summary: parsed.parsed?.summary ?? parsed.parseError ?? firstMeaningfulLine(result.finalMessage, `${reviewName} finished.`), jobTitle: `Codex ${reviewName}`, @@ -681,13 +719,22 @@ function enqueueBackgroundTask(cwd, job, request) { async function handleReviewCommand(argv, config) { const { options, positionals } = parseCommandInput(argv, { - valueOptions: ["base", "scope", "model", "cwd"], + valueOptions: ["base", "scope", "model", "cwd", "max-investigation-turns"], booleanOptions: ["json", "background", "wait"], aliasMap: { m: "model" } }); + const rawMaxTurns = options["max-investigation-turns"]; + let maxInvestigationTurns; + if (rawMaxTurns !== undefined) { + if (!/^[1-9][0-9]*$/.test(String(rawMaxTurns))) { + throw new Error(`--max-investigation-turns must be a positive integer (got: ${rawMaxTurns})`); + } + maxInvestigationTurns = Number(rawMaxTurns); + } + const cwd = resolveCommandCwd(options); const workspaceRoot = resolveCommandWorkspace(options); const focusText = positionals.join(" ").trim(); @@ -716,6 +763,7 @@ async function handleReviewCommand(argv, config) { model: options.model, focusText, reviewName: config.reviewName, + maxInvestigationTurns, onProgress: progress }), { json: options.json } diff --git a/plugins/codex/scripts/lib/codex.mjs b/plugins/codex/scripts/lib/codex.mjs index f2fe88bd..9a64f530 100644 --- a/plugins/codex/scripts/lib/codex.mjs +++ b/plugins/codex/scripts/lib/codex.mjs @@ -658,6 +658,9 @@ async function resumeThread(client, threadId, cwd, options = {}) { } function buildResultStatus(turnState) { + if (turnState.error) { + return 1; + } return turnState.finalTurn?.status === "completed" ? 0 : 1; } @@ -1028,6 +1031,224 @@ export async function runAppServerTurn(cwd, options = {}) { }); } +const DEFAULT_MAX_INVESTIGATION_TURNS = 10; +const INVESTIGATION_CONTINUATION_CUE = "Continue your investigation."; + +export async function runAppServerInvestigation(cwd, options = {}) { + const availability = getCodexAvailability(cwd); + if (!availability.available) { + throw new Error("Codex CLI is not installed or is missing required runtime support. Install it with `npm install -g @openai/codex`, then rerun `/codex:setup`."); + } + + const investigatePrompt = options.investigatePrompt?.trim(); + const finalizePrompt = options.finalizePrompt?.trim(); + if (!investigatePrompt) { + throw new Error("runAppServerInvestigation requires investigatePrompt."); + } + if (!finalizePrompt) { + throw new Error("runAppServerInvestigation requires finalizePrompt."); + } + const maxInvestigationTurns = Number.isFinite(options.maxInvestigationTurns) && options.maxInvestigationTurns > 0 + ? Math.floor(options.maxInvestigationTurns) + : DEFAULT_MAX_INVESTIGATION_TURNS; + const sandbox = options.sandbox ?? "read-only"; + + return withAppServer(cwd, async (client) => { + emitProgress(options.onProgress, "Starting Codex investigation thread.", "starting"); + const startResponse = await startThread(client, cwd, { + model: options.model, + sandbox, + ephemeral: true, + threadName: null + }); + const threadId = startResponse.thread.id; + emitProgress(options.onProgress, `Thread ready (${threadId}).`, "starting", { threadId }); + + let turnCount = 0; + let truncated = false; + let totalCommandsRun = 0; + const aggregatedCommandExecutions = []; + const aggregatedFileChanges = []; + + for (let i = 1; i <= maxInvestigationTurns; i += 1) { + const promptText = i === 1 ? investigatePrompt : INVESTIGATION_CONTINUATION_CUE; + emitProgress(options.onProgress, `Investigation turn ${i}.`, "investigating"); + + let turnState; + try { + turnState = await captureTurn( + client, + threadId, + () => + client.request("turn/start", { + threadId, + input: buildTurnInput(promptText), + model: options.model ?? null, + effort: options.effort ?? null, + outputSchema: null + }), + { onProgress: options.onProgress } + ); + } catch (transportError) { + return { + status: 1, + threadId, + turnId: null, + finalMessage: "", + reasoningSummary: [], + turn: null, + error: { message: transportError?.message ?? String(transportError) }, + stderr: cleanCodexStderr(client.stderr), + fileChanges: aggregatedFileChanges, + touchedFiles: collectTouchedFiles(aggregatedFileChanges), + commandExecutions: aggregatedCommandExecutions, + investigation: { turnCount, truncated: false } + }; + } + + turnCount = i; + const turnCommandCount = turnState.commandExecutions.length; + totalCommandsRun += turnCommandCount; + for (const cmd of turnState.commandExecutions) { + aggregatedCommandExecutions.push(cmd); + } + for (const change of turnState.fileChanges) { + aggregatedFileChanges.push(change); + } + + if (turnState.error) { + return { + status: buildResultStatus(turnState), + threadId, + turnId: turnState.turnId, + finalMessage: turnState.lastAgentMessage, + reasoningSummary: turnState.reasoningSummary, + turn: turnState.finalTurn, + error: turnState.error, + stderr: cleanCodexStderr(client.stderr), + fileChanges: aggregatedFileChanges, + touchedFiles: collectTouchedFiles(aggregatedFileChanges), + commandExecutions: aggregatedCommandExecutions, + investigation: { turnCount, truncated: false } + }; + } + + // Convergence: a turn that produces no commands and emits an agent + // message is the contract the investigate prompt teaches the model + // ("a summary message with no further command calls signals readiness"). + // The legacy check required `phase: "final_answer"`, but recon turns + // run with outputSchema=null so the app-server does not always tag + // messages with that phase — leading to runaway turns where the model + // keeps insisting it has converged but the loop refuses to listen. + const turnHadAgentMessage = Boolean(turnState.lastAgentMessage); + const converged = turnCommandCount === 0 && turnHadAgentMessage; + if (converged) { + break; + } + + if (i === maxInvestigationTurns) { + truncated = true; + } + } + + if (totalCommandsRun === 0) { + truncated = true; + } + + emitProgress(options.onProgress, "Investigation complete; finalizing structured output.", "finalizing"); + + // The finalize turn is supposed to emit only the structured JSON. In + // practice the model sometimes emits a tool-call stub instead (e.g. + // {"cmd": "wc -l ..."}) — if any commands ran during finalize, treat + // that as a contract violation and retry once with a sharper prompt. + const STRICT_FINALIZE_REMINDER = + "STRICT FINALIZE: do not run any shell commands. Output ONLY the JSON " + + "matching the schema, with no prose, no tool calls, and nothing else.\n\n"; + let finalizeState; + let finalizeAttempts = 0; + const MAX_FINALIZE_ATTEMPTS = 2; + while (finalizeAttempts < MAX_FINALIZE_ATTEMPTS) { + finalizeAttempts += 1; + const promptText = finalizeAttempts === 1 + ? finalizePrompt + : STRICT_FINALIZE_REMINDER + finalizePrompt; + try { + finalizeState = await captureTurn( + client, + threadId, + () => + client.request("turn/start", { + threadId, + input: buildTurnInput(promptText), + model: options.model ?? null, + effort: options.effort ?? null, + outputSchema: options.outputSchema ?? null + }), + { onProgress: options.onProgress } + ); + } catch (transportError) { + return { + status: 1, + threadId, + turnId: null, + finalMessage: "", + reasoningSummary: [], + turn: null, + error: { message: transportError?.message ?? String(transportError) }, + stderr: cleanCodexStderr(client.stderr), + fileChanges: aggregatedFileChanges, + touchedFiles: collectTouchedFiles(aggregatedFileChanges), + commandExecutions: aggregatedCommandExecutions, + investigation: { turnCount, truncated } + }; + } + + // If the finalize turn ran commands, the model violated the contract. + // Retry once with a stricter prompt; if it still fails, accept the + // (likely-malformed) output and let the parser surface the error. + if (finalizeState.commandExecutions.length === 0) { + break; + } + if (finalizeAttempts < MAX_FINALIZE_ATTEMPTS) { + emitProgress( + options.onProgress, + "Finalize turn ran commands; retrying with stricter prompt.", + "finalizing" + ); + } + // Aggregate the wasted commands so the caller still sees what happened. + for (const cmd of finalizeState.commandExecutions) { + aggregatedCommandExecutions.push(cmd); + } + for (const change of finalizeState.fileChanges) { + aggregatedFileChanges.push(change); + } + } + + for (const cmd of finalizeState.commandExecutions) { + aggregatedCommandExecutions.push(cmd); + } + for (const change of finalizeState.fileChanges) { + aggregatedFileChanges.push(change); + } + + return { + status: buildResultStatus(finalizeState), + threadId, + turnId: finalizeState.turnId, + finalMessage: finalizeState.lastAgentMessage, + reasoningSummary: finalizeState.reasoningSummary, + turn: finalizeState.finalTurn, + error: finalizeState.error, + stderr: cleanCodexStderr(client.stderr), + fileChanges: aggregatedFileChanges, + touchedFiles: collectTouchedFiles(aggregatedFileChanges), + commandExecutions: aggregatedCommandExecutions, + investigation: { turnCount, truncated } + }; + }); +} + export async function findLatestTaskThread(cwd) { const availability = getCodexAvailability(cwd); if (!availability.available) { diff --git a/plugins/codex/scripts/lib/git.mjs b/plugins/codex/scripts/lib/git.mjs index 1749cfc8..c06c599b 100644 --- a/plugins/codex/scripts/lib/git.mjs +++ b/plugins/codex/scripts/lib/git.mjs @@ -5,7 +5,12 @@ import { isProbablyText } from "./fs.mjs"; import { formatCommandFailure, runCommand, runCommandChecked } from "./process.mjs"; const MAX_UNTRACKED_BYTES = 24 * 1024; -const DEFAULT_INLINE_DIFF_MAX_FILES = 2; +// Inline-diff embeds full file contents into the prompt and pins outputSchema +// on a single turn — there is no recovery if the model wants to investigate +// before producing the verdict. Keep this path narrow: only single-file +// reviews of small diffs use it. Anything larger falls through to the +// two-phase self-collect path which can tolerate exploratory turns. +const DEFAULT_INLINE_DIFF_MAX_FILES = 1; const DEFAULT_INLINE_DIFF_MAX_BYTES = 256 * 1024; function git(cwd, args, options = {}) { diff --git a/plugins/codex/scripts/lib/render.mjs b/plugins/codex/scripts/lib/render.mjs index 2ec18523..f4591c65 100644 --- a/plugins/codex/scripts/lib/render.mjs +++ b/plugins/codex/scripts/lib/render.mjs @@ -208,6 +208,16 @@ export function renderSetupReport(report) { return `${lines.join("\n").trimEnd()}\n`; } +function appendInvestigationBanner(lines, meta) { + if (meta.investigation?.truncated === true) { + const turns = meta.investigation.turnCount ?? "?"; + lines.push( + `Investigation truncated at ${turns} turns; findings may be shallow. Use --max-investigation-turns to raise the cap.`, + "" + ); + } +} + export function renderReviewResult(parsedResult, meta) { if (!parsedResult.parsed) { const lines = [ @@ -217,6 +227,7 @@ export function renderReviewResult(parsedResult, meta) { "", `- Parse error: ${parsedResult.parseError}` ]; + appendInvestigationBanner(lines, meta); if (parsedResult.rawOutput) { lines.push("", "Raw final message:", "", "```text", parsedResult.rawOutput, "```"); @@ -237,6 +248,7 @@ export function renderReviewResult(parsedResult, meta) { "", `- Validation error: ${validationError}` ]; + appendInvestigationBanner(lines, meta); if (parsedResult.rawOutput) { lines.push("", "Raw final message:", "", "```text", parsedResult.rawOutput, "```"); @@ -258,6 +270,7 @@ export function renderReviewResult(parsedResult, meta) { data.summary, "" ]; + appendInvestigationBanner(lines, meta); if (findings.length === 0) { lines.push("No material findings."); diff --git a/tests/commands.test.mjs b/tests/commands.test.mjs index 3724ffa4..496a740e 100644 --- a/tests/commands.test.mjs +++ b/tests/commands.test.mjs @@ -49,7 +49,9 @@ test("adversarial review command uses AskUserQuestion and background Bash while assert.match(source, /```bash/); assert.match(source, /```typescript/); assert.match(source, /adversarial-review "\$ARGUMENTS"/); - assert.match(source, /\[--scope auto\|working-tree\|branch\] \[focus \.\.\.\]/); + assert.match(source, /\[--scope auto\|working-tree\|branch\]/); + assert.match(source, /\[--max-investigation-turns N\]/); + assert.match(source, /\[focus \.\.\.\]/); assert.match(source, /run_in_background:\s*true/); assert.match(source, /command:\s*`node "\$\{CLAUDE_PLUGIN_ROOT\}\/scripts\/codex-companion\.mjs" adversarial-review "\$ARGUMENTS"`/); assert.match(source, /description:\s*"Codex adversarial review"/); diff --git a/tests/fake-codex-fixture.mjs b/tests/fake-codex-fixture.mjs index debcadce..e1ac99cd 100644 --- a/tests/fake-codex-fixture.mjs +++ b/tests/fake-codex-fixture.mjs @@ -2,7 +2,7 @@ import fs from "node:fs"; import path from "node:path"; import process from "node:process"; -import { writeExecutable } from "./helpers.mjs"; +import { makeTempDir, writeExecutable } from "./helpers.mjs"; export function installFakeCodex(binDir, behavior = "review-ok") { const statePath = path.join(binDir, "fake-codex-state.json"); @@ -369,6 +369,50 @@ rl.on("line", (line) => { case "turn/start": { const thread = ensureThread(state, message.params.threadId); + + if (BEHAVIOR === "queue-driven") { + if (!state.requests) { state.requests = []; } + state.requests.push({ method: "turn/start", params: message.params }); + + const turnId = nextTurnId(state); + thread.updatedAt = now(); + + const entry = (state.queue && state.queue.length > 0) ? state.queue.shift() : null; + saveState(state); + + if (entry && entry.rpcError) { + send({ id: message.id, error: { code: -32000, message: entry.rpcError.message } }); + break; + } + + send({ id: message.id, result: { turn: buildTurn(turnId) } }); + send({ method: "turn/started", params: { threadId: thread.id, turn: buildTurn(turnId) } }); + + const commands = (entry && entry.commands) || []; + let cmdCounter = 0; + for (const cmd of commands) { + const itemId = "cmd_" + turnId + "_" + (cmdCounter++); + send({ method: "item/started", params: { threadId: thread.id, turnId, item: { type: "commandExecution", id: itemId, command: cmd.command, status: "in_progress" } } }); + send({ method: "item/completed", params: { threadId: thread.id, turnId, item: { type: "commandExecution", id: itemId, command: cmd.command, exitCode: cmd.exitCode ?? 0, status: "completed" } } }); + } + + if (entry && entry.finalAnswer) { + const phase = entry.finalAnswer.phase ?? "final_answer"; + send({ method: "item/completed", params: { threadId: thread.id, turnId, item: { type: "agentMessage", id: "msg_" + turnId, text: entry.finalAnswer.text, phase } } }); + } + + if (entry && entry.turnError) { + send({ method: "error", params: { threadId: thread.id, turnId, error: { message: entry.turnError.message } } }); + } + + if (!entry) { + send({ method: "item/completed", params: { threadId: thread.id, turnId, item: { type: "agentMessage", id: "msg_" + turnId, text: "", phase: "agent_message" } } }); + } + + send({ method: "turn/completed", params: { threadId: thread.id, turn: buildTurn(turnId, "completed") } }); + break; + } + const prompt = (message.params.input || []) .filter((item) => item.type === "text") .map((item) => item.text) @@ -587,3 +631,71 @@ export function buildEnv(binDir) { PATH: `${binDir}${sep}${process.env.PATH}` }; } + +/** + * Sets up a queue-driven fake Codex harness for multi-turn tests. + * Returns a handle with helpers for scripting turn responses and + * inspecting captured requests. + */ +export function setupFakeCodex({ cwd } = {}) { + const binDir = makeTempDir("codex-queue-driven-"); + installFakeCodex(binDir, "queue-driven"); + + const statePath = path.join(binDir, "fake-codex-state.json"); + const initialState = { + nextThreadId: 1, + nextTurnId: 1, + appServerStarts: 0, + threads: [], + capabilities: null, + lastInterrupt: null, + queue: [], + requests: [] + }; + fs.writeFileSync(statePath, JSON.stringify(initialState, null, 2)); + + const sep = process.platform === "win32" ? ";" : ":"; + process.env.PATH = `${binDir}${sep}${process.env.PATH}`; + + const env = buildEnv(binDir); + const resolvedCwd = cwd || process.cwd(); + + function readState() { + return JSON.parse(fs.readFileSync(statePath, "utf8")); + } + + function writeState(state) { + fs.writeFileSync(statePath, JSON.stringify(state, null, 2)); + } + + return { + cwd: resolvedCwd, + env, + binDir, + queueTurnResponse(entry) { + const state = readState(); + if (!state.queue) { state.queue = []; } + state.queue.push(entry); + writeState(state); + }, + queueTurnRpcError({ message }) { + const state = readState(); + if (!state.queue) { state.queue = []; } + state.queue.push({ rpcError: { message } }); + writeState(state); + }, + // `requests` re-reads the state file each access; assign to a local variable for repeated use. + get requests() { + const state = readState(); + return state.requests ?? []; + }, + close() { + const sep = process.platform === "win32" ? ";" : ":"; + process.env.PATH = (process.env.PATH ?? "") + .split(sep) + .filter((entry) => entry !== binDir) + .join(sep); + fs.rmSync(binDir, { recursive: true, force: true }); + } + }; +} diff --git a/tests/fake-codex-fixture.test.mjs b/tests/fake-codex-fixture.test.mjs new file mode 100644 index 00000000..5876460d --- /dev/null +++ b/tests/fake-codex-fixture.test.mjs @@ -0,0 +1,107 @@ +import test from "node:test"; +import assert from "node:assert/strict"; + +import { setupFakeCodex } from "./fake-codex-fixture.mjs"; +import { runAppServerTurn } from "../plugins/codex/scripts/lib/codex.mjs"; +import { makeTempDir } from "./helpers.mjs"; + +test("queue-driven fake: final answer is returned via runAppServerTurn", async () => { + const cwd = makeTempDir("codex-queue-test-"); + const handle = setupFakeCodex({ cwd }); + try { + handle.queueTurnResponse({ finalAnswer: { text: "hi" } }); + + const result = await runAppServerTurn(cwd, { + prompt: "say hi" + }); + + assert.equal(result.finalMessage, "hi"); + assert.equal(result.status, 0); + } finally { + handle.close(); + } +}); + +test("queue-driven fake: requests are captured with params", async () => { + const cwd = makeTempDir("codex-queue-test-"); + const handle = setupFakeCodex({ cwd }); + try { + handle.queueTurnResponse({ finalAnswer: { text: "captured" } }); + + await runAppServerTurn(cwd, { + prompt: "check capture" + }); + + const turnStarts = handle.requests.filter((r) => r.method === "turn/start"); + assert.equal(turnStarts.length, 1); + // Verify the captured params include the input text + const inputTexts = turnStarts[0].params.input + .filter((item) => item.type === "text") + .map((item) => item.text); + assert.ok(inputTexts.some((text) => text.includes("check capture"))); + } finally { + handle.close(); + } +}); + +test("queue-driven fake: commandExecution items are emitted and captured", async () => { + const cwd = makeTempDir("codex-queue-test-"); + const handle = setupFakeCodex({ cwd }); + try { + handle.queueTurnResponse({ + commands: [{ command: "git diff", exitCode: 0 }], + finalAnswer: { text: "done" } + }); + + const result = await runAppServerTurn(cwd, { + prompt: "run commands" + }); + + assert.equal(result.finalMessage, "done"); + assert.equal(result.commandExecutions.length, 1); + assert.equal(result.commandExecutions[0].command, "git diff"); + assert.equal(result.commandExecutions[0].exitCode, 0); + } finally { + handle.close(); + } +}); + +test("queue-driven fake: RPC error causes runAppServerTurn to reject", async () => { + const cwd = makeTempDir("codex-queue-test-"); + const handle = setupFakeCodex({ cwd }); + try { + handle.queueTurnRpcError({ message: "boom" }); + + await assert.rejects( + runAppServerTurn(cwd, { prompt: "should fail" }), + (error) => { + assert.ok(error.message.includes("boom")); + return true; + } + ); + } finally { + handle.close(); + } +}); + +test("queue-driven fake: soft error (turnError) is captured", async () => { + const cwd = makeTempDir("codex-queue-test-"); + const handle = setupFakeCodex({ cwd }); + try { + handle.queueTurnResponse({ + finalAnswer: { text: "partial" }, + turnError: { message: "soft failure" } + }); + + const result = await runAppServerTurn(cwd, { + prompt: "trigger soft error" + }); + + // The turn still completes, but state.error is set + assert.equal(result.finalMessage, "partial"); + assert.ok(result.error); + assert.equal(result.error.message, "soft failure"); + } finally { + handle.close(); + } +}); diff --git a/tests/git.test.mjs b/tests/git.test.mjs index 14ff2576..9b567e4d 100644 --- a/tests/git.test.mjs +++ b/tests/git.test.mjs @@ -86,6 +86,28 @@ test("collectReviewContext keeps inline diffs for tiny adversarial reviews", () assert.match(context.content, /INLINE_MARKER/); }); +test("collectReviewContext routes 2-file changes to self-collect (inline cap is 1)", () => { + // Regression guard: a 2-file change used to slip into inline-diff because + // the cap was 2, embedding both files into a single-turn schema-pinned + // prompt — and the model often responded with a tool-call stub instead + // of the review JSON. Two files now go through the two-phase self-collect + // path which tolerates exploratory turns. + const cwd = makeTempDir(); + initGitRepo(cwd); + fs.writeFileSync(path.join(cwd, "seed.js"), "export const value = 'seed';\n"); + run("git", ["add", "seed.js"], { cwd }); + run("git", ["commit", "-m", "init"], { cwd }); + fs.writeFileSync(path.join(cwd, "doc-one.md"), "# planning doc\n".repeat(50)); + fs.writeFileSync(path.join(cwd, "doc-two.md"), "# spec doc\n".repeat(50)); + + const target = resolveReviewTarget(cwd, {}); + const context = collectReviewContext(cwd, target); + + assert.equal(context.fileCount, 2); + assert.equal(context.inputMode, "self-collect", + "2-file changes must NOT be inlined; they hit the schema-pinned single-turn bug otherwise"); +}); + test("collectReviewContext skips untracked directories in working tree review", () => { const cwd = makeTempDir(); initGitRepo(cwd); diff --git a/tests/investigation.test.mjs b/tests/investigation.test.mjs new file mode 100644 index 00000000..4ee571ec --- /dev/null +++ b/tests/investigation.test.mjs @@ -0,0 +1,883 @@ +import test from "node:test"; +import assert from "node:assert/strict"; + +import { setupFakeCodex } from "./fake-codex-fixture.mjs"; +import { runAppServerInvestigation } from "../plugins/codex/scripts/lib/codex.mjs"; +import { makeTempDir } from "./helpers.mjs"; + +// Structured JSON payloads used by multiple tests. +const STRUCTURED_REVIEW = JSON.stringify({ + verdict: "needs-attention", + summary: "Concern X.", + findings: [{ + severity: "high", + title: "Race", + file: "a.js", + line_start: 10, + line_end: 12, + confidence: 0.8, + body: "Potential race condition.", + recommendation: "Add a mutex." + }], + next_steps: [] +}); + +const APPROVE_REVIEW = JSON.stringify({ + verdict: "approve", + summary: "No material issues found.", + findings: [], + next_steps: [] +}); + +test("converges when Codex emits a final-answer turn with no commands", async () => { + const cwd = makeTempDir("codex-inv-test-"); + const fake = setupFakeCodex({ cwd }); + try { + // Recon turn 1: commands, no final answer + fake.queueTurnResponse({ + commands: [{ command: "git diff HEAD~1", exitCode: 0 }], + finalAnswer: null + }); + // Recon turn 2: no commands, final answer => converges + fake.queueTurnResponse({ + commands: [], + finalAnswer: { text: "Investigation done." } + }); + // Finalize turn 3 + fake.queueTurnResponse({ + finalAnswer: { text: STRUCTURED_REVIEW } + }); + + const result = await runAppServerInvestigation(fake.cwd, { + investigatePrompt: "Investigate the changes.", + finalizePrompt: "Produce your structured verdict.", + outputSchema: { type: "object", required: ["verdict"] } + }); + + assert.equal(result.investigation.turnCount, 2); + assert.equal(result.investigation.truncated, false); + + const requests = fake.requests; + const starts = requests.filter((r) => r.method === "turn/start"); + assert.equal(starts.length, 3, "should have 3 turn/start requests (2 recon + 1 finalize)"); + } finally { + fake.close(); + } +}); + +test("converges when agentMessage has no `final_answer` phase tag (real-world case)", async () => { + // In production, recon turns run with outputSchema=null, and the + // app-server does NOT always tag agent messages with phase="final_answer". + // The convergence detector must treat any 0-command turn that emits an + // agent message as convergence — otherwise the model keeps insisting it + // has converged but the loop refuses to stop. + const cwd = makeTempDir("codex-inv-test-"); + const fake = setupFakeCodex({ cwd }); + try { + fake.queueTurnResponse({ + commands: [{ command: "git diff HEAD~1", exitCode: 0 }], + finalAnswer: null + }); + // Recon turn 2: no commands, agent message WITHOUT final_answer phase + // => must still converge (this is what real codex sends in recon mode). + fake.queueTurnResponse({ + commands: [], + finalAnswer: { text: "My investigation is complete.", phase: "agent_message" } + }); + fake.queueTurnResponse({ + finalAnswer: { text: STRUCTURED_REVIEW } + }); + + const result = await runAppServerInvestigation(fake.cwd, { + investigatePrompt: "Investigate.", + finalizePrompt: "Finalize.", + outputSchema: { type: "object", required: ["verdict"] } + }); + + assert.equal(result.investigation.turnCount, 2, + "convergence on the no-command + agentMessage turn must not be missed"); + assert.equal(result.investigation.truncated, false); + + const requests = fake.requests; + const starts = requests.filter((r) => r.method === "turn/start"); + assert.equal(starts.length, 3, "2 recon + 1 finalize"); + } finally { + fake.close(); + } +}); + +test("respects maxInvestigationTurns and marks truncated", async () => { + const cwd = makeTempDir("codex-inv-test-"); + const fake = setupFakeCodex({ cwd }); + try { + // Queue 3 recon turns: all have commands, no final answer. + // (With maxInvestigationTurns=3 the loop will exhaust here.) + for (let i = 0; i < 3; i++) { + fake.queueTurnResponse({ + commands: [{ command: `check-${i}`, exitCode: 0 }] + }); + } + // Finalize turn: pure JSON, zero commands. + fake.queueTurnResponse({ + finalAnswer: { text: APPROVE_REVIEW } + }); + + const result = await runAppServerInvestigation(fake.cwd, { + investigatePrompt: "Investigate.", + finalizePrompt: "Finalize.", + maxInvestigationTurns: 3 + }); + + assert.equal(result.investigation.turnCount, 3); + assert.equal(result.investigation.truncated, true); + + const requests = fake.requests; + const starts = requests.filter((r) => r.method === "turn/start"); + assert.equal(starts.length, 4, "3 recon + 1 finalize"); + } finally { + fake.close(); + } +}); + +test("turn with both finalAnswer and commands does not converge", async () => { + const cwd = makeTempDir("codex-inv-test-"); + const fake = setupFakeCodex({ cwd }); + try { + // Recon turn 1: commands AND final answer => does NOT converge + fake.queueTurnResponse({ + commands: [{ command: "grep -r TODO", exitCode: 0 }], + finalAnswer: { text: "Partial finding." } + }); + // Recon turn 2: only final answer, no commands => converges + fake.queueTurnResponse({ + commands: [], + finalAnswer: { text: "Investigation done." } + }); + // Finalize turn + fake.queueTurnResponse({ + finalAnswer: { text: APPROVE_REVIEW } + }); + + const result = await runAppServerInvestigation(fake.cwd, { + investigatePrompt: "Investigate.", + finalizePrompt: "Finalize." + }); + + assert.equal(result.investigation.turnCount, 2); + assert.equal(result.investigation.truncated, false); + } finally { + fake.close(); + } +}); + +test("outputSchema is null on recon turns and set on finalize turn", async () => { + const cwd = makeTempDir("codex-inv-test-"); + const fake = setupFakeCodex({ cwd }); + try { + // Recon turn 1: commands + fake.queueTurnResponse({ + commands: [{ command: "cat file.js", exitCode: 0 }] + }); + // Recon turn 2: final answer, no commands => converges + fake.queueTurnResponse({ + commands: [], + finalAnswer: { text: "Done investigating." } + }); + // Finalize turn + fake.queueTurnResponse({ + finalAnswer: { text: STRUCTURED_REVIEW } + }); + + const schema = { type: "object", required: ["verdict"] }; + const result = await runAppServerInvestigation(fake.cwd, { + investigatePrompt: "Investigate.", + finalizePrompt: "Finalize.", + outputSchema: schema + }); + + const requests = fake.requests; + const starts = requests.filter((r) => r.method === "turn/start"); + assert.equal(starts.length, 3); + + // Recon turns must have outputSchema === null + assert.equal(starts[0].params.outputSchema, null, "recon turn 1 outputSchema should be null"); + assert.equal(starts[1].params.outputSchema, null, "recon turn 2 outputSchema should be null"); + + // Finalize turn must have the schema + assert.deepEqual(starts[2].params.outputSchema, schema, "finalize turn should have the outputSchema"); + } finally { + fake.close(); + } +}); + +test("phase-1 soft error (turn/failed) aborts before phase-2 finalize", async () => { + const cwd = makeTempDir("codex-inv-test-"); + const fake = setupFakeCodex({ cwd }); + try { + // Recon turn 1: commands, normal + fake.queueTurnResponse({ + commands: [{ command: "git log --oneline", exitCode: 0 }] + }); + // Recon turn 2: soft error + fake.queueTurnResponse({ + turnError: { message: "model produced unrenderable response" } + }); + + const result = await runAppServerInvestigation(fake.cwd, { + investigatePrompt: "Investigate.", + finalizePrompt: "Finalize." + }); + + assert.ok(result.error, "result should have an error"); + assert.match(result.error.message, /model produced unrenderable response/); + assert.equal(result.investigation.turnCount, 2, "soft-error turn IS counted"); + // A turn/completed with status="completed" can still arrive after an + // error notification, so result.status must be derived from the error, + // not from finalTurn.status. CI/automation relies on this. + assert.equal(result.status, 1, "soft-error path returns numeric status 1"); + + const requests = fake.requests; + const starts = requests.filter((r) => r.method === "turn/start"); + assert.equal(starts.length, 2, "NO finalize turn should be attempted"); + } finally { + fake.close(); + } +}); + +test("phase-1 hard error (transport throw) aborts before phase-2 finalize", async () => { + const cwd = makeTempDir("codex-inv-test-"); + const fake = setupFakeCodex({ cwd }); + try { + // Recon turn 1: commands, normal + fake.queueTurnResponse({ + commands: [{ command: "git status", exitCode: 0 }] + }); + // Recon turn 2: RPC error (transport throw) + fake.queueTurnRpcError({ message: "ECONNRESET" }); + + const result = await runAppServerInvestigation(fake.cwd, { + investigatePrompt: "Investigate.", + finalizePrompt: "Finalize." + }); + + assert.ok(result.error, "result should have an error"); + assert.match(result.error.message, /ECONNRESET/); + assert.equal(result.investigation.turnCount, 1, "hard error returns BEFORE incrementing turnCount"); + assert.equal(result.status, 1, "transport-error path returns numeric status"); + + const requests = fake.requests; + const starts = requests.filter((r) => r.method === "turn/start"); + assert.equal(starts.length, 2, "the failing turn was still attempted"); + } finally { + fake.close(); + } +}); + +test("finalize turn that runs commands triggers one strict-prompt retry", async () => { + // Production observation: the model occasionally emits a tool-call stub + // during finalize (e.g. {"cmd": "wc -l ..."}) instead of the structured + // JSON. When that happens the finalize turn has commandExecutions.length > 0. + // The orchestrator should detect this contract violation and retry once + // with a stricter prompt. + const cwd = makeTempDir("codex-inv-test-"); + const fake = setupFakeCodex({ cwd }); + try { + // Recon: converge. + fake.queueTurnResponse({ + commands: [], + finalAnswer: { text: "Investigation done." } + }); + // First finalize attempt: model misbehaves and runs a command. + fake.queueTurnResponse({ + commands: [{ command: "wc -l README.md", exitCode: 0 }], + finalAnswer: { text: "{\"cmd\":\"wc -l README.md\"}" } + }); + // Second finalize attempt: model behaves and emits proper JSON. + fake.queueTurnResponse({ + commands: [], + finalAnswer: { text: APPROVE_REVIEW } + }); + + const result = await runAppServerInvestigation(fake.cwd, { + investigatePrompt: "Investigate.", + finalizePrompt: "Finalize.", + outputSchema: { type: "object", required: ["verdict"] } + }); + + assert.equal(result.finalMessage, APPROVE_REVIEW, + "the second (well-behaved) finalize attempt should be the final message"); + const requests = fake.requests; + const starts = requests.filter((r) => r.method === "turn/start"); + assert.equal(starts.length, 3, "1 recon + 2 finalize attempts"); + + // The retry must use a stricter prompt distinct from the first try. + const finalizeStarts = starts.slice(1); // first start is recon + assert.match( + finalizeStarts[1].params.input?.[0]?.text ?? "", + /STRICT FINALIZE/, + "retry prompt must include the stricter directive" + ); + assert.doesNotMatch( + finalizeStarts[0].params.input?.[0]?.text ?? "", + /STRICT FINALIZE/, + "first finalize attempt uses the normal prompt" + ); + } finally { + fake.close(); + } +}); + +test("finalize retry gives up after the second attempt and surfaces the output", async () => { + // If the model misbehaves twice in a row, the orchestrator must not loop + // forever — it should accept the second attempt's output and let the + // upstream parser produce a useful validation error. + const cwd = makeTempDir("codex-inv-test-"); + const fake = setupFakeCodex({ cwd }); + try { + fake.queueTurnResponse({ + commands: [], + finalAnswer: { text: "Investigation done." } + }); + // Both finalize attempts misbehave. + fake.queueTurnResponse({ + commands: [{ command: "wc -l a", exitCode: 0 }], + finalAnswer: { text: "{\"cmd\":\"wc -l a\"}" } + }); + fake.queueTurnResponse({ + commands: [{ command: "wc -l b", exitCode: 0 }], + finalAnswer: { text: "{\"cmd\":\"wc -l b\"}" } + }); + + const result = await runAppServerInvestigation(fake.cwd, { + investigatePrompt: "Investigate.", + finalizePrompt: "Finalize.", + outputSchema: { type: "object", required: ["verdict"] } + }); + + const requests = fake.requests; + const starts = requests.filter((r) => r.method === "turn/start"); + assert.equal(starts.length, 3, "1 recon + 2 finalize attempts only — no infinite loop"); + assert.equal(result.finalMessage, "{\"cmd\":\"wc -l b\"}", + "the second-attempt output is surfaced so the parser can flag it"); + } finally { + fake.close(); + } +}); + +test("phase-2 finalize transport error preserves investigation metadata", async () => { + const cwd = makeTempDir("codex-inv-test-"); + const fake = setupFakeCodex({ cwd }); + try { + // Recon turn 1: commands + fake.queueTurnResponse({ + commands: [{ command: "git diff", exitCode: 0 }] + }); + // Recon turn 2: converge with final answer + no commands + fake.queueTurnResponse({ + commands: [], + finalAnswer: { text: "Investigation done." } + }); + // Finalize turn: RPC error + fake.queueTurnRpcError({ message: "ETIMEDOUT during finalize" }); + + const result = await runAppServerInvestigation(fake.cwd, { + investigatePrompt: "Investigate.", + finalizePrompt: "Finalize." + }); + + assert.ok(result.error, "result should carry the finalize error"); + assert.match(result.error.message, /ETIMEDOUT during finalize/); + assert.equal(result.investigation.turnCount, 2, "investigation completed both recon turns before finalize failed"); + assert.equal(result.investigation.truncated, false, "investigation converged; not truncated"); + assert.equal(result.status, 1, "finalize-error path returns numeric status"); + + const requests = fake.requests; + const starts = requests.filter((r) => r.method === "turn/start"); + assert.equal(starts.length, 3, "2 recon + 1 finalize attempted"); + } finally { + fake.close(); + } +}); + +test("converges with zero commands flags truncated=true", async () => { + const cwd = makeTempDir("codex-inv-test-"); + const fake = setupFakeCodex({ cwd }); + try { + // Recon turn 1: only final answer, no commands => converges immediately + fake.queueTurnResponse({ + commands: [], + finalAnswer: { text: "Nothing to investigate." } + }); + // Finalize turn + fake.queueTurnResponse({ + finalAnswer: { text: APPROVE_REVIEW } + }); + + const result = await runAppServerInvestigation(fake.cwd, { + investigatePrompt: "Investigate.", + finalizePrompt: "Finalize." + }); + + assert.equal(result.investigation.turnCount, 1); + assert.equal(result.investigation.truncated, true, "zero commands across investigation => truncated"); + + const requests = fake.requests; + const starts = requests.filter((r) => r.method === "turn/start"); + assert.equal(starts.length, 2, "1 recon + 1 finalize"); + } finally { + fake.close(); + } +}); + +test("recon turn 1 sends the investigate prompt; turn 2+ sends the continuation cue", async () => { + const cwd = makeTempDir("codex-inv-test-"); + const fake = setupFakeCodex({ cwd }); + try { + // Recon turn 1: commands + fake.queueTurnResponse({ + commands: [{ command: "ls", exitCode: 0 }] + }); + // Recon turn 2: commands + fake.queueTurnResponse({ + commands: [{ command: "cat a.js", exitCode: 0 }] + }); + // Recon turn 3: final answer, no commands => converges + fake.queueTurnResponse({ + commands: [], + finalAnswer: { text: "All done." } + }); + // Finalize turn + fake.queueTurnResponse({ + finalAnswer: { text: APPROVE_REVIEW } + }); + + const result = await runAppServerInvestigation(fake.cwd, { + investigatePrompt: "FULL INVESTIGATE PROMPT: look at the code", + finalizePrompt: "FINALIZE PROMPT: produce verdict" + }); + + const requests = fake.requests; + const starts = requests.filter((r) => r.method === "turn/start"); + assert.equal(starts.length, 4, "3 recon + 1 finalize"); + + // Extract input text from each turn/start + const inputTexts = starts.map((s) => + (s.params.input || []) + .filter((p) => p.type === "text") + .map((p) => p.text) + .join("") + ); + + // Turn 1: investigate prompt + assert.match(inputTexts[0], /FULL INVESTIGATE PROMPT/, "turn 1 should use investigate prompt"); + // Turn 2 and 3: continuation cue + assert.equal(inputTexts[1], "Continue your investigation.", "turn 2 should use continuation cue"); + assert.equal(inputTexts[2], "Continue your investigation.", "turn 3 should use continuation cue"); + // Turn 4: finalize prompt + assert.match(inputTexts[3], /FINALIZE PROMPT/, "turn 4 should use finalize prompt"); + } finally { + fake.close(); + } +}); + +// ------------------------------------------------------------------- +// Integration tests: subprocess-based end-to-end companion tests +// ------------------------------------------------------------------- + +import { mkdtempSync, writeFileSync, mkdirSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import path from "node:path"; +import { spawnSync } from "node:child_process"; +import { fileURLToPath } from "node:url"; + +const COMPANION_PATH = fileURLToPath( + new URL("../plugins/codex/scripts/codex-companion.mjs", import.meta.url) +); + +function makeSelfCollectGitFixture() { + // 3+ changed files triggers self-collect (DEFAULT_INLINE_DIFF_MAX_FILES = 2) + const root = mkdtempSync(path.join(tmpdir(), "codex-self-collect-test-")); + spawnSync("git", ["init", "-q", "-b", "main"], { cwd: root }); + spawnSync("git", ["config", "user.email", "test@example.com"], { cwd: root }); + spawnSync("git", ["config", "user.name", "Test"], { cwd: root }); + spawnSync("git", ["config", "commit.gpgsign", "false"], { cwd: root }); + writeFileSync(path.join(root, "README.md"), "# repo\n"); + spawnSync("git", ["add", "."], { cwd: root }); + spawnSync("git", ["commit", "-q", "-m", "init"], { cwd: root }); + spawnSync("git", ["checkout", "-q", "-b", "feature"], { cwd: root }); + mkdirSync(path.join(root, "src"), { recursive: true }); + for (let i = 0; i < 5; i += 1) { + writeFileSync(path.join(root, "src", `f${i}.js`), `export const v${i} = ${i};\n`); + } + spawnSync("git", ["add", "."], { cwd: root }); + spawnSync("git", ["commit", "-q", "-m", "feature"], { cwd: root }); + return root; +} + +function makeInlineGitFixture() { + // 1 changed file stays on inline-diff path + const root = mkdtempSync(path.join(tmpdir(), "codex-inline-test-")); + spawnSync("git", ["init", "-q", "-b", "main"], { cwd: root }); + spawnSync("git", ["config", "user.email", "test@example.com"], { cwd: root }); + spawnSync("git", ["config", "user.name", "Test"], { cwd: root }); + spawnSync("git", ["config", "commit.gpgsign", "false"], { cwd: root }); + writeFileSync(path.join(root, "README.md"), "# repo\n"); + spawnSync("git", ["add", "."], { cwd: root }); + spawnSync("git", ["commit", "-q", "-m", "init"], { cwd: root }); + spawnSync("git", ["checkout", "-q", "-b", "feature"], { cwd: root }); + writeFileSync(path.join(root, "one.js"), "export const v = 1;\n"); + spawnSync("git", ["add", "."], { cwd: root }); + spawnSync("git", ["commit", "-q", "-m", "tiny"], { cwd: root }); + return root; +} + +function runCompanion(args, env) { + return spawnSync("node", [COMPANION_PATH, ...args], { + env: { ...process.env, ...env }, + encoding: "utf8", + timeout: 30000 + }); +} + +test("self-collect path uses runAppServerInvestigation end-to-end", async () => { + const cwd = makeSelfCollectGitFixture(); + const fake = setupFakeCodex({ cwd }); + try { + // Recon turn 1: runs a command, no convergence + fake.queueTurnResponse({ + commands: [{ command: "git diff main...HEAD", exitCode: 0 }], + finalAnswer: null + }); + // Recon turn 2: no commands, final answer => converges + fake.queueTurnResponse({ + commands: [], + finalAnswer: { text: "Investigation done." } + }); + // Finalize turn: structured output + fake.queueTurnResponse({ + finalAnswer: { + text: JSON.stringify({ + verdict: "needs-attention", + summary: "Found risk in src/f1.js.", + findings: [{ + severity: "high", + title: "Unguarded export", + file: "src/f1.js", + line_start: 1, + line_end: 1, + confidence: 0.7, + body: "Module exports v1 with no validation.", + recommendation: "Add validation." + }], + next_steps: [] + }) + } + }); + + const result = runCompanion( + ["adversarial-review", "--base", "main", "--scope", "branch", "--cwd", cwd, "--json"], + fake.env + ); + + assert.equal(result.status, 0, `expected exit 0, stderr: ${result.stderr}`); + + // stdout may contain progress lines followed by JSON; parse the last JSON object + const stdout = result.stdout.trim(); + const payload = JSON.parse(stdout); + assert.ok(payload.investigation, "self-collect payload must have investigation field"); + assert.equal(payload.investigation.turnCount, 2); + assert.equal(payload.investigation.truncated, false); + assert.equal(payload.result?.verdict, "needs-attention"); + } finally { + fake.close(); + rmSync(cwd, { recursive: true, force: true }); + } +}); + +test("inline-diff path does not call runAppServerInvestigation", async () => { + const cwd = makeInlineGitFixture(); + const fake = setupFakeCodex({ cwd }); + try { + // Single turn: inline path uses runAppServerTurn + fake.queueTurnResponse({ + finalAnswer: { + text: JSON.stringify({ + verdict: "approve", + summary: "No material issues found.", + findings: [], + next_steps: [] + }) + } + }); + + const result = runCompanion( + ["adversarial-review", "--base", "main", "--scope", "branch", "--cwd", cwd, "--json"], + fake.env + ); + + assert.equal(result.status, 0, `expected exit 0, stderr: ${result.stderr}`); + const payload = JSON.parse(result.stdout.trim()); + assert.equal(payload.investigation, undefined, + "inline-path payload must not carry the investigation field"); + + const requests = fake.requests; + const starts = requests.filter((r) => r.method === "turn/start"); + assert.equal(starts.length, 1, "inline path is single-turn"); + } finally { + fake.close(); + rmSync(cwd, { recursive: true, force: true }); + } +}); + +// ------------------------------------------------------------------- +// Unit tests for runAppServerInvestigation (continued from above) +// ------------------------------------------------------------------- + +test("outputSchema-set finalize turn produces schema-conformant final message", async () => { + const cwd = makeTempDir("codex-inv-test-"); + const fake = setupFakeCodex({ cwd }); + try { + // Recon turn 1: commands, no final answer + fake.queueTurnResponse({ + commands: [{ command: "git diff HEAD~1", exitCode: 0 }], + finalAnswer: null + }); + // Recon turn 2: no commands, final answer => converges + fake.queueTurnResponse({ + commands: [], + finalAnswer: { text: "Investigation done." } + }); + // Finalize turn with structured output + fake.queueTurnResponse({ + finalAnswer: { text: STRUCTURED_REVIEW } + }); + + const schema = { + type: "object", + required: ["verdict"], + properties: { + verdict: { type: "string" }, + summary: { type: "string" }, + findings: { type: "array" }, + next_steps: { type: "array" } + } + }; + + const result = await runAppServerInvestigation(fake.cwd, { + investigatePrompt: "Investigate the changes.", + finalizePrompt: "Produce your structured verdict.", + outputSchema: schema + }); + + // The finalMessage should be parseable JSON with a verdict field + const parsed = JSON.parse(result.finalMessage); + assert.equal(parsed.verdict, "needs-attention"); + assert.equal(parsed.findings.length, 1); + assert.equal(parsed.findings[0].severity, "high"); + + // The finalize turn should have received the schema + const requests = fake.requests; + const starts = requests.filter((r) => r.method === "turn/start"); + assert.deepEqual(starts[starts.length - 1].params.outputSchema, schema); + + // Only the finalize turn's reasoningSummary is returned + assert.ok(Array.isArray(result.reasoningSummary)); + } finally { + fake.close(); + } +}); + +// ------------------------------------------------------------------- +// Task 5: renderer truncation banner +// ------------------------------------------------------------------- + +import { renderReviewResult } from "../plugins/codex/scripts/lib/render.mjs"; + +test("renderer prepends truncation banner when investigation.truncated is true", () => { + const parsed = { + parsed: { + verdict: "needs-attention", + summary: "Risk identified.", + findings: [], + next_steps: [] + } + }; + const out = renderReviewResult(parsed, { + reviewLabel: "Adversarial Review", + targetLabel: "branch:feature", + reasoningSummary: [], + investigation: { turnCount: 10, truncated: true } + }); + assert.match(out, /Investigation truncated at 10 turns; findings may be shallow\./); +}); + +test("renderer omits truncation banner when investigation is null or not truncated", () => { + const parsed = { + parsed: { + verdict: "approve", + summary: "Looks fine.", + findings: [], + next_steps: [] + } + }; + const outNull = renderReviewResult(parsed, { + reviewLabel: "Adversarial Review", + targetLabel: "branch:feature", + reasoningSummary: [], + investigation: null + }); + assert.doesNotMatch(outNull, /Investigation truncated/); + + const outOk = renderReviewResult(parsed, { + reviewLabel: "Adversarial Review", + targetLabel: "branch:feature", + reasoningSummary: [], + investigation: { turnCount: 4, truncated: false } + }); + assert.doesNotMatch(outOk, /Investigation truncated/); +}); + +test("renderer shows truncation banner on parse-error and validation-error paths", () => { + const parseErrorOut = renderReviewResult( + { parsed: null, parseError: "Unexpected token", rawOutput: "{not json" }, + { + reviewLabel: "Adversarial Review", + targetLabel: "branch:feature", + reasoningSummary: [], + investigation: { turnCount: 10, truncated: true } + } + ); + assert.match(parseErrorOut, /Investigation truncated at 10 turns/, + "banner must appear when output is unparseable AND investigation was truncated"); + + const validationErrorOut = renderReviewResult( + { parsed: { not: "review-shaped" } }, + { + reviewLabel: "Adversarial Review", + targetLabel: "branch:feature", + reasoningSummary: [], + investigation: { turnCount: 10, truncated: true } + } + ); + assert.match(validationErrorOut, /Investigation truncated at 10 turns/, + "banner must appear when output has wrong shape AND investigation was truncated"); +}); + +// ------------------------------------------------------------------- +// Task 6: --max-investigation-turns CLI flag +// ------------------------------------------------------------------- + +import { parseArgs } from "../plugins/codex/scripts/lib/args.mjs"; + +test("parseArgs accepts --max-investigation-turns as a value option", () => { + const { options } = parseArgs( + ["--base", "main", "--max-investigation-turns", "15", "auth"], + { + valueOptions: ["base", "scope", "model", "cwd", "max-investigation-turns"], + booleanOptions: ["json", "background", "wait"] + } + ); + assert.equal(options["max-investigation-turns"], "15"); +}); + +test("--max-investigation-turns propagates from CLI to runAppServerInvestigation", async () => { + const cwd = makeSelfCollectGitFixture(); + const fake = setupFakeCodex({ cwd }); + try { + for (let i = 0; i < 6; i += 1) { + fake.queueTurnResponse({ + commands: [{ command: "git diff", exitCode: 0 }], + finalAnswer: null + }); + } + fake.queueTurnResponse({ + finalAnswer: { text: JSON.stringify({ verdict: "approve", summary: "ok", findings: [], next_steps: [] }) } + }); + + const result = runCompanion( + ["adversarial-review", + "--base", "main", "--scope", "branch", "--cwd", cwd, + "--max-investigation-turns", "5", + "--json"], + fake.env + ); + + assert.equal(result.status, 0, `expected exit 0, stderr: ${result.stderr}`); + const payload = JSON.parse(result.stdout.trim()); + assert.equal(payload.investigation.turnCount, 5); + assert.equal(payload.investigation.truncated, true); + } finally { + fake.close(); + rmSync(cwd, { recursive: true, force: true }); + } +}); + +test("invalid --max-investigation-turns raises a clear error", () => { + const r = spawnSync("node", + [COMPANION_PATH, "adversarial-review", "--max-investigation-turns", "abc"], + { encoding: "utf8", timeout: 30000 } + ); + assert.notEqual(r.status, 0, "should exit non-zero on invalid flag value"); + assert.match( + r.stderr, + /must be a positive integer/, + "error message should explain the validation failure" + ); +}); + +test("--max-investigation-turns rejects malformed numeric tokens", () => { + // parseInt-style salvage must NOT accept these — they are typos, not valid input. + const cases = [ + "1.5", // parseInt would yield 1 + "10abc", // parseInt would yield 10 + "1e2", // exponential notation: Number(...) yields 100, but the contract is "integer literal" + " 5", // leading whitespace from accidental quoting + "0", // zero is not positive + "-3", // negative integer + "" // empty string + ]; + for (const value of cases) { + const r = spawnSync("node", + [COMPANION_PATH, "adversarial-review", "--max-investigation-turns", value], + { encoding: "utf8", timeout: 30000 } + ); + assert.notEqual(r.status, 0, `value ${JSON.stringify(value)} should exit non-zero`); + assert.match( + r.stderr, + /must be a positive integer/, + `value ${JSON.stringify(value)} should trigger the validation error` + ); + } +}); + +// ------------------------------------------------------------------- +// Prompt contract: inline + finalize prompts must forbid tool-call stubs +// ------------------------------------------------------------------- + +import { readFileSync } from "node:fs"; +import { fileURLToPath as fileURLToPathPromptCheck } from "node:url"; + +const PROMPT_DIR = fileURLToPathPromptCheck( + new URL("../plugins/codex/prompts/", import.meta.url) +); + +test("inline adversarial-review prompt forbids tool-call stub output", () => { + const text = readFileSync(`${PROMPT_DIR}adversarial-review.md`, "utf8"); + // The model used to respond with payloads like {"cmd": "wc -l ..."} + // instead of the review JSON — the prompt must explicitly disallow it + // since this path is single-turn and has no recovery. + assert.match(text, /tool[- ]?call|tool[- ]?use/i, + "inline prompt must mention tool-call/tool-use"); + assert.match(text, /\{"cmd"|stub/i, + "inline prompt must show or name the tool-call stub anti-pattern"); + assert.match(text, /Do NOT run any shell commands/, + "inline prompt must explicitly forbid running shell commands"); +}); + +test("finalize prompt forbids tool-call stub output", () => { + const text = readFileSync(`${PROMPT_DIR}adversarial-review-finalize.md`, "utf8"); + assert.match(text, /Do NOT run any shell commands/, + "finalize prompt must explicitly forbid running shell commands"); + assert.match(text, /no tool-call payloads|no shell commands/i, + "finalize prompt must spell out the no-tool-call rule"); +}); diff --git a/tests/runtime.test.mjs b/tests/runtime.test.mjs index 90408372..1961a592 100644 --- a/tests/runtime.test.mjs +++ b/tests/runtime.test.mjs @@ -295,8 +295,13 @@ test("adversarial review asks Codex to inspect larger diffs itself", () => { assert.equal(result.status, 0, result.stderr); const state = JSON.parse(fs.readFileSync(path.join(binDir, "fake-codex-state.json"), "utf8")); - assert.match(state.lastTurnStart.prompt, /lightweight summary/i); - assert.match(state.lastTurnStart.prompt, /read-only git commands/i); + // With the two-phase investigation flow, the default fake converges on turn 1 + // (no commands, finalAnswer), then the finalize turn fires. lastTurnStart + // captures the finalize turn, which uses buildAdversarialFinalizePrompt. + // The investigate prompt (turn 1) contained the self-collect guidance; the + // finalize prompt (turn 2) references the investigation completed in prior turns. + assert.match(state.lastTurnStart.prompt, /investigation|structured review/i); + // File contents must not be inlined in either prompt (self-collect mode) assert.doesNotMatch(state.lastTurnStart.prompt, /PROMPT_SELF_COLLECT_[ABC]/); });