From 867e943f1e44fd73c6fd2937dd70ba9769f0b8de Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 28 Jun 2026 13:18:00 +0000
Subject: [PATCH 1/3] Add experimental BinEval evaluation support

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com>
---
 actions/setup/js/eval_harness.cjs             | 371 ++++++++++++++++++
 actions/setup/js/eval_harness.test.cjs        | 267 +++++++++++++
 pkg/constants/constants_test.go               |   2 +
 pkg/constants/job_constants.go                |   8 +
 pkg/parser/schemas/main_workflow_schema.json  |  29 ++
 pkg/workflow/compiler_evals.go                | 250 ++++++++++++
 pkg/workflow/compiler_evals_test.go           | 234 +++++++++++
 pkg/workflow/compiler_jobs.go                 |   5 +
 .../compiler_orchestrator_workflow.go         |   3 +
 pkg/workflow/compiler_types.go                |   1 +
 pkg/workflow/compiler_validators.go           |   1 +
 .../evals_experimental_warning_test.go        | 138 +++++++
 pkg/workflow/frontmatter_types.go             |  52 +++
 13 files changed, 1361 insertions(+)
 create mode 100644 actions/setup/js/eval_harness.cjs
 create mode 100644 actions/setup/js/eval_harness.test.cjs
 create mode 100644 pkg/workflow/compiler_evals.go
 create mode 100644 pkg/workflow/compiler_evals_test.go
 create mode 100644 pkg/workflow/evals_experimental_warning_test.go
diff --git a/actions/setup/js/eval_harness.cjs b/actions/setup/js/eval_harness.cjs
new file mode 100644
index 00000000000..7c5ea4d3f04
--- /dev/null
+++ b/actions/setup/js/eval_harness.cjs
@@ -0,0 +1,371 @@
+// @ts-check
+/// <reference types="@actions/github-script" />
+
+/**
+ * eval_harness.cjs
+ *
+ * BinEval Evaluation Harness (experimental)
+ *
+ * Evaluates a set of binary questions about a completed agent workflow run.
+ * Each question is evaluated independently by an LLM, producing a binary
+ * pass/fail result with an optional rationale.
+ *
+ * Environment variables (set by the compiled workflow step):
+ *   GH_AW_EVAL_SPEC       - JSON array of {id, question} evaluation definitions
+ *   GH_AW_EVAL_WORK_DIR   - Working directory for evals (default: /tmp/gh-aw/evals)
+ *   GH_AW_EVAL_MODEL      - LLM model to use (default: gpt-4o-mini)
+ *
+ * Input files (downloaded from agent artifact into GH_AW_EVAL_WORK_DIR):
+ *   agent_output.json     - Structured agent output for context
+ *   aw-prompts/prompt.txt - Original workflow prompt for context
+ *
+ * Output files (written to GH_AW_EVAL_WORK_DIR):
+ *   eval_results.json     - Aggregated evaluation summary and per-question results
+ *
+ * Design principles:
+ *   - Each question is evaluated independently (BinEval)
+ *   - Partial failures are tolerated (a failed LLM call for one question does not
+ *     abort evaluation of the remaining questions)
+ *   - The evaluator is deterministic in aggregation: pass_rate = passed / total
+ *   - No MCPs, no checkout: the harness only reads downloaded artifact files
+ */
+
+"use strict";
+
+const fs = require("fs");
+const path = require("path");
+const https = require("https");
+
+// ---------------------------------------------------------------------------
+// Constants
+// ---------------------------------------------------------------------------
+
+const DEFAULT_WORK_DIR = "/tmp/gh-aw/evals";
+const DEFAULT_MODEL = "gpt-4o-mini";
+
+// GitHub Models API endpoint for chat completions (OpenAI-compatible).
+// Uses GITHUB_TOKEN for authentication — no additional credentials required.
+const GITHUB_MODELS_ENDPOINT = "models.github.com";
+const GITHUB_MODELS_PATH = "/inference/chat/completions";
+const EVAL_SYSTEM_PROMPT =
+  "You are an objective evaluator. Answer binary (yes/no) questions about agentic workflow outputs. Always respond with a JSON object containing 'passed' (boolean), 'rationale' (string), and 'confidence' (number 0-1).";
+
+// These caps keep the prompt comfortably within the context window of the
+// default small eval model while still leaving room for the JSON answer.
+const MAX_AGENT_OUTPUT_CHARS = 8000;
+const MAX_PROMPT_CHARS = 4000;
+const MAX_RATIONALE_CHARS = 500;
+
+// ---------------------------------------------------------------------------
+// Types (JSDoc)
+// ---------------------------------------------------------------------------
+
+/**
+ * @typedef {Object} EvalDefinition
+ * @property {string} id
+ * @property {string} question
+ */
+
+/**
+ * @typedef {Object} EvalResult
+ * @property {string} id
+ * @property {boolean} passed
+ * @property {string} [rationale]
+ * @property {number} [confidence]
+ */
+
+/**
+ * @typedef {Object} EvalSummary
+ * @property {number} total
+ * @property {number} passed
+ * @property {number} failed
+ * @property {number} pass_rate
+ * @property {EvalResult[]} results
+ */
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+/**
+ * Loads the evaluation definitions from GH_AW_EVAL_SPEC (treated as "[]" when unset or empty).
+ * @returns {EvalDefinition[]} Only the entries whose id and question are non-empty strings
+ */
+function readEvalSpec() {
+  const specJson = process.env.GH_AW_EVAL_SPEC || "[]";
+  try {
+    const candidates = JSON.parse(specJson);
+    if (Array.isArray(candidates)) {
+      return candidates.filter(item => item && typeof item.id === "string" && item.id && typeof item.question === "string" && item.question);
+    }
+    throw new Error("GH_AW_EVAL_SPEC must be a JSON array");
+  } catch (err) {
+    throw new Error(`Failed to parse GH_AW_EVAL_SPEC: ${err.message}`);
+  }
+}
+
+/**
+ * Loads a context file as UTF-8 text, cutting it to maxChars plus a truncation marker.
+ * A missing or unreadable file yields an empty string.
+ * @param {string} filePath - File to read
+ * @param {number} maxChars - Maximum number of characters kept before the marker
+ * @returns {string} File content, possibly truncated, or "" when unavailable
+ */
+function readContextFile(filePath, maxChars) {
+  let text = "";
+  try {
+    // A missing file is an expected case, not an error: report it as empty context.
+    if (!fs.existsSync(filePath)) return text;
+    text = fs.readFileSync(filePath, "utf-8");
+  } catch {
+    // Unreadable files are treated the same as missing ones (best-effort context).
+    return "";
+  }
+  return text.length > maxChars ? `${text.slice(0, maxChars)}\n... (truncated)` : text;
+}
+
+/**
+ * Sanitizes an error message before including it in eval artifacts or logs.
+ * Redacts tokens and URLs, collapses CR/LF/tab runs to a space, caps output at 200 chars.
+ * @param {unknown} err - Error instance or any other thrown value
+ * @returns {string} Sanitized message, or "unknown error" when nothing remains
+ */
+function sanitizeEvalError(err) {
+  const raw = err instanceof Error ? err.message : String(err ?? "unknown error");
+  // All token shapes are matched in ONE pass: with chained replaces the case-insensitive
+  // "token" word rule re-matched earlier placeholders, producing "[[REDACTED_TOKEN]]".
+  const tokenPattern = /Bearer\s+[A-Za-z0-9._-]+|\*{4,}|\b[A-Za-z0-9._-]*token[A-Za-z0-9._-]*\b|\bgh[pousr]_[A-Za-z0-9_]+\b/gi;
+  const sanitized = raw
+    .replace(tokenPattern, "[REDACTED_TOKEN]")
+    .replace(/https?:\/\/\S+/gi, "[REDACTED_URL]")
+    .replace(/[\r\n\t]+/g, " ")
+    .trim();
+  return sanitized.slice(0, 200) || "unknown error";
+}
+
+/**
+ * Assembles the user message for one binary question: optional context, question, instructions.
+ * @param {string} question - The yes/no question to evaluate
+ * @param {string} agentContext - Context text; the context section is omitted when empty
+ * @returns {string} The complete prompt text
+ */
+function buildEvalPrompt(question, agentContext) {
+  const sections = [];
+  if (agentContext) {
+    sections.push(`## Agent Output Context\n\n${agentContext}\n\n`);
+  }
+  sections.push(`## Evaluation Question\n\n${question}\n\n`, "## Instructions\n\n");
+  sections.push("Answer the evaluation question above based solely on the agent output context provided.\n");
+  sections.push("Respond with a JSON object containing exactly these fields:\n");
+  sections.push('- "passed": true if the answer is yes, false if the answer is no\n');
+  sections.push('- "rationale": a brief one-sentence explanation (max 100 words)\n');
+  sections.push('- "confidence": a number between 0 and 1 indicating your confidence\n\n');
+  sections.push("Respond only with the JSON object, no other text.");
+  return sections.join("");
+}
+
+/**
+ * POSTs body over HTTPS; rejects on stream errors, 60s socket inactivity, or HTTP status >= 400.
+ * @param {object} options - https.request options
+ * @param {string} body - Request body
+ * @returns {Promise<string>} Resolves with the UTF-8 decoded response body
+ */
+function httpsPost(options, body) {
+  return new Promise((resolve, reject) => {
+    const req = https.request(options, res => {
+      const chunks = [];
+      res.on("data", chunk => chunks.push(chunk));
+      res.on("error", reject); // a response reset mid-stream must still settle the promise
+      res.on("end", () => {
+        const responseBody = Buffer.concat(chunks).toString("utf-8");
+        if (res.statusCode && res.statusCode >= 400) {
+          return reject(new Error(`HTTP ${res.statusCode}: ${responseBody.slice(0, 200)}`));
+        }
+        resolve(responseBody);
+      });
+    });
+    req.setTimeout(60000, () => req.destroy(new Error("Request timed out after 60s")));
+    req.on("error", reject);
+    req.end(body);
+  });
+}
+
+/**
+ * Calls the GitHub Models API to evaluate a single binary question.
+ * Throws on transport/HTTP errors, an empty completion, or a non-JSON-object answer;
+ * main() catches these and records a failed result (partial failure tolerance).
+ * @param {string} token
+ * @param {string} model
+ * @param {string} question
+ * @param {string} agentContext
+ * @returns {Promise<{passed: boolean, rationale: string, confidence?: number}>}
+ */
+async function callLLMForQuestion(token, model, question, agentContext) {
+  const prompt = buildEvalPrompt(question, agentContext);
+  const requestBody = JSON.stringify({
+    model,
+    messages: [
+      { role: "system", content: EVAL_SYSTEM_PROMPT },
+      { role: "user", content: prompt },
+    ],
+    response_format: { type: "json_object" },
+    temperature: 0,
+    // The response is a tiny JSON object with three fields, so 256 tokens leaves
+    // ample headroom for rationale text without inflating per-question cost.
+    max_tokens: 256,
+  });
+
+  const options = {
+    hostname: GITHUB_MODELS_ENDPOINT,
+    path: GITHUB_MODELS_PATH,
+    method: "POST",
+    headers: {
+      "Content-Type": "application/json",
+      Authorization: "Bearer " + token,
+      "Content-Length": Buffer.byteLength(requestBody),
+    },
+  };
+
+  const responseBody = await httpsPost(options, requestBody);
+  const response = JSON.parse(responseBody);
+
+  const content = response?.choices?.[0]?.message?.content;
+  if (!content) {
+    throw new Error("Empty response from LLM");
+  }
+
+  const parsed = JSON.parse(content);
+  if (parsed === null || typeof parsed !== "object") throw new Error("LLM response is not a JSON object");
+  const rationale = typeof parsed.rationale === "string" ? parsed.rationale : "";
+  if (rationale.length > MAX_RATIONALE_CHARS) {
+    console.warn(`Truncating eval rationale from ${rationale.length} to ${MAX_RATIONALE_CHARS} characters`);
+  }
+  const result = {
+    // Only a literal boolean true counts as a pass: Boolean("false") and Boolean("no")
+    // are both true, so coercion would turn a stringified "no" answer into a pass.
+    passed: parsed.passed === true,
+    rationale: rationale.slice(0, MAX_RATIONALE_CHARS),
+  };
+
+  if (typeof parsed.confidence === "number") {
+    result.confidence = Math.max(0, Math.min(1, parsed.confidence));
+  }
+
+  return result;
+}
+
+/**
+ * Summarises per-question outcomes into totals and a pass rate.
+ * The rate is passed / total, and 0 for an empty list (no division by zero).
+ * @param {EvalResult[]} results
+ * @returns {EvalSummary}
+ */
+function aggregateResults(results) {
+  let passed = 0;
+  for (const entry of results) {
+    if (entry.passed) passed += 1;
+  }
+  const total = results.length;
+  return { total, passed, failed: total - passed, pass_rate: total === 0 ? 0 : passed / total, results };
+}
+
+/**
+ * Renders the summary as a markdown table; rationale pipes and line breaks are neutralised.
+ * @param {EvalSummary} summary
+ * @returns {string}
+ */
+function renderMarkdownSummary(summary) {
+  const passRatePercent = (summary.pass_rate * 100).toFixed(1);
+  const lines = ["## 🧪 BinEval Results (experimental)\n", `**${summary.passed}/${summary.total} passed** (${passRatePercent}%)\n\n`, "| Question ID | Result | Rationale |\n", "| --- | --- | --- |\n"];
+  for (const r of summary.results) {
+    const icon = r.passed ? "✅ pass" : "❌ fail";
+    const rationale = (r.rationale || "").replace(/\s*[\r\n]+\s*/g, " ").replace(/\|/g, "\\|"); // a raw line break would end the table row
+    lines.push(`| \`${r.id}\` | ${icon} | ${rationale} |\n`);
+  }
+  return lines.join("");
+}
+
+// ---------------------------------------------------------------------------
+// Main
+// ---------------------------------------------------------------------------
+
+/**
+ * Main entry point: evaluates each question, then writes results, step summary and outputs.
+ * @returns {Promise<void>}
+ */
+async function main() {
+  const workDir = process.env.GH_AW_EVAL_WORK_DIR || DEFAULT_WORK_DIR;
+  const model = process.env.GH_AW_EVAL_MODEL || DEFAULT_MODEL;
+  const token = process.env.GITHUB_TOKEN || "";
+
+  core.info(`BinEval harness starting (model: ${model}, workDir: ${workDir})`);
+
+  // Parse eval definitions
+  const evals = readEvalSpec();
+  if (evals.length === 0) {
+    core.warning("No eval definitions found in GH_AW_EVAL_SPEC — nothing to evaluate");
+    return;
+  }
+  core.info(`Evaluating ${evals.length} question(s)`);
+
+  // Load agent context files
+  const agentOutputPath = path.join(workDir, "agent_output.json");
+  const promptPath = path.join(workDir, "aw-prompts", "prompt.txt");
+
+  const agentOutputRaw = readContextFile(agentOutputPath, MAX_AGENT_OUTPUT_CHARS);
+  const promptRaw = readContextFile(promptPath, MAX_PROMPT_CHARS);
+
+  // Build context string from available files
+  const contextParts = [];
+  if (promptRaw) contextParts.push(`### Workflow Prompt\n${promptRaw}`);
+  if (agentOutputRaw) contextParts.push(`### Agent Output\n${agentOutputRaw}`);
+  const agentContext = contextParts.join("\n\n");
+
+  if (!agentContext) {
+    core.warning("No agent context files found — evaluations will have limited context");
+  }
+
+  // Evaluate each question independently (partial failure tolerance)
+  const results = [];
+  for (const evalDef of evals) {
+    core.info(`Evaluating: ${evalDef.id} — "${evalDef.question}"`);
+    try {
+      if (!token) {
+        throw new Error("GITHUB_TOKEN is not set — cannot call GitHub Models API");
+      }
+      const result = await callLLMForQuestion(token, model, evalDef.question, agentContext);
+      results.push({ id: evalDef.id, ...result });
+      const icon = result.passed ? "✅" : "❌";
+      core.info(`  ${icon} ${result.passed ? "pass" : "fail"} — ${result.rationale}`);
+    } catch (err) {
+      const sanitizedError = sanitizeEvalError(err);
+      core.warning(`  ⚠️ Evaluation failed for "${evalDef.id}": ${sanitizedError}`);
+      results.push({
+        id: evalDef.id,
+        passed: false,
+        rationale: `Evaluation error: ${sanitizedError}`,
+      });
+    }
+  }
+
+  // Aggregate results
+  const summary = aggregateResults(results);
+
+  // Write JSON results
+  fs.mkdirSync(workDir, { recursive: true });
+  const resultsPath = path.join(workDir, "eval_results.json");
+  fs.writeFileSync(resultsPath, JSON.stringify(summary, null, 2));
+  core.info(`Results written to ${resultsPath}`);
+
+  // Write step summary (awaited: write() is async, so a floating promise could lose the summary or its error)
+  const markdownSummary = renderMarkdownSummary(summary);
+  await core.summary.addRaw(markdownSummary).write();
+
+  // Set outputs
+  core.setOutput("eval_passed", String(summary.passed));
+  core.setOutput("eval_total", String(summary.total));
+  core.setOutput("eval_pass_rate", summary.pass_rate.toFixed(4));
+
+  core.info(`BinEval complete: ${summary.passed}/${summary.total} passed (${(summary.pass_rate * 100).toFixed(1)}%)`);
+}
+
+module.exports = { main, readEvalSpec, buildEvalPrompt, aggregateResults, renderMarkdownSummary, sanitizeEvalError };
diff --git a/actions/setup/js/eval_harness.test.cjs b/actions/setup/js/eval_harness.test.cjs
new file mode 100644
index 00000000000..a19b674880c
--- /dev/null
+++ b/actions/setup/js/eval_harness.test.cjs
@@ -0,0 +1,267 @@
+// @ts-check
+
+/**
+ * Tests for eval_harness.cjs
+ */
+
+import { describe, it, expect } from "vitest";
+import { readEvalSpec, buildEvalPrompt, aggregateResults, renderMarkdownSummary, sanitizeEvalError } from "./eval_harness.cjs";
+
+// ---------------------------------------------------------------------------
+// readEvalSpec
+// ---------------------------------------------------------------------------
+
+describe("readEvalSpec", () => {
+  it("returns empty array when GH_AW_EVAL_SPEC is absent", () => {
+    delete process.env.GH_AW_EVAL_SPEC;
+    const result = readEvalSpec();
+    expect(result).toEqual([]);
+  });
+
+  it("returns empty array when GH_AW_EVAL_SPEC is empty array JSON", () => {
+    process.env.GH_AW_EVAL_SPEC = "[]";
+    expect(readEvalSpec()).toEqual([]);
+  });
+
+  it("parses a valid eval spec", () => {
+    process.env.GH_AW_EVAL_SPEC = JSON.stringify([
+      { id: "builds", question: "Does the code compile?" },
+      { id: "tests", question: "Are all tests passing?" },
+    ]);
+    const result = readEvalSpec();
+    expect(result).toHaveLength(2);
+    expect(result[0]).toEqual({ id: "builds", question: "Does the code compile?" });
+    expect(result[1]).toEqual({ id: "tests", question: "Are all tests passing?" });
+  });
+
+  it("filters out entries with missing id or question", () => {
+    const spec = [{ id: "valid", question: "Is it good?" }, { id: "", question: "Missing id" }, { id: "no-question", question: "" }, { question: "No id field" }];
+    process.env.GH_AW_EVAL_SPEC = JSON.stringify(spec);
+    const result = readEvalSpec();
+    expect(result).toHaveLength(1);
+    expect(result[0].id).toBe("valid");
+  });
+
+  it("throws when GH_AW_EVAL_SPEC is invalid JSON", () => {
+    process.env.GH_AW_EVAL_SPEC = "not-json";
+    expect(() => readEvalSpec()).toThrow(/Failed to parse GH_AW_EVAL_SPEC/);
+  });
+
+  it("throws when GH_AW_EVAL_SPEC is not an array", () => {
+    process.env.GH_AW_EVAL_SPEC = JSON.stringify({ id: "x" });
+    expect(() => readEvalSpec()).toThrow(/must be a JSON array/);
+  });
+});
+
+// ---------------------------------------------------------------------------
+// buildEvalPrompt
+// ---------------------------------------------------------------------------
+
+describe("buildEvalPrompt", () => {
+  it("includes the question in the prompt", () => {
+    const prompt = buildEvalPrompt("Does the code compile?", "");
+    expect(prompt).toContain("Does the code compile?");
+  });
+
+  it("includes agent context when provided", () => {
+    const prompt = buildEvalPrompt("Is it good?", "Agent output: success");
+    expect(prompt).toContain("Agent output: success");
+    expect(prompt).toContain("Agent Output Context");
+  });
+
+  it("omits context section when agent context is empty", () => {
+    const prompt = buildEvalPrompt("Is it good?", "");
+    expect(prompt).not.toContain("Agent Output Context");
+  });
+
+  it("requires binary yes/no answer in JSON format", () => {
+    const prompt = buildEvalPrompt("Question?", "Context");
+    expect(prompt).toContain('"passed"');
+    expect(prompt).toContain('"rationale"');
+    expect(prompt).toContain('"confidence"');
+    expect(prompt).toContain("JSON object");
+  });
+
+  it("instructs to respond only with JSON", () => {
+    const prompt = buildEvalPrompt("Question?", "Context");
+    expect(prompt).toContain("Respond only with the JSON object");
+  });
+});
+
+// ---------------------------------------------------------------------------
+// aggregateResults
+// ---------------------------------------------------------------------------
+
+describe("aggregateResults", () => {
+  it("returns zero counts for empty results", () => {
+    const summary = aggregateResults([]);
+    expect(summary.total).toBe(0);
+    expect(summary.passed).toBe(0);
+    expect(summary.failed).toBe(0);
+    expect(summary.pass_rate).toBe(0);
+    expect(summary.results).toEqual([]);
+  });
+
+  it("counts all passed when all pass", () => {
+    const results = [
+      { id: "a", passed: true, rationale: "yes" },
+      { id: "b", passed: true, rationale: "yes" },
+    ];
+    const summary = aggregateResults(results);
+    expect(summary.total).toBe(2);
+    expect(summary.passed).toBe(2);
+    expect(summary.failed).toBe(0);
+    expect(summary.pass_rate).toBe(1);
+  });
+
+  it("counts all failed when all fail", () => {
+    const results = [
+      { id: "a", passed: false, rationale: "no" },
+      { id: "b", passed: false, rationale: "no" },
+    ];
+    const summary = aggregateResults(results);
+    expect(summary.total).toBe(2);
+    expect(summary.passed).toBe(0);
+    expect(summary.failed).toBe(2);
+    expect(summary.pass_rate).toBe(0);
+  });
+
+  it("computes correct pass rate for mixed results", () => {
+    const results = [
+      { id: "a", passed: true },
+      { id: "b", passed: false },
+      { id: "c", passed: true },
+      { id: "d", passed: false },
+    ];
+    const summary = aggregateResults(results);
+    expect(summary.total).toBe(4);
+    expect(summary.passed).toBe(2);
+    expect(summary.failed).toBe(2);
+    expect(summary.pass_rate).toBe(0.5);
+  });
+
+  it("preserves result order", () => {
+    const results = [
+      { id: "c", passed: true },
+      { id: "a", passed: false },
+      { id: "b", passed: true },
+    ];
+    const summary = aggregateResults(results);
+    expect(summary.results.map(r => r.id)).toEqual(["c", "a", "b"]);
+  });
+
+  it("aggregation is deterministic (pass_rate = passed / total)", () => {
+    const results = Array.from({ length: 10 }, (_, i) => ({
+      id: `q${i}`,
+      passed: i < 7,
+    }));
+    const summary = aggregateResults(results);
+    expect(summary.passed).toBe(7);
+    expect(summary.failed).toBe(3);
+    expect(summary.pass_rate).toBeCloseTo(0.7, 10);
+  });
+});
+
+// ---------------------------------------------------------------------------
+// renderMarkdownSummary
+// ---------------------------------------------------------------------------
+
+describe("renderMarkdownSummary", () => {
+  it("includes BinEval heading", () => {
+    const summary = aggregateResults([]);
+    const md = renderMarkdownSummary(summary);
+    expect(md).toContain("BinEval Results");
+  });
+
+  it("shows pass count and total", () => {
+    const summary = aggregateResults([
+      { id: "a", passed: true, rationale: "ok" },
+      { id: "b", passed: false, rationale: "nope" },
+    ]);
+    const md = renderMarkdownSummary(summary);
+    expect(md).toContain("1/2 passed");
+  });
+
+  it("shows pass rate as percentage", () => {
+    const summary = aggregateResults([
+      { id: "a", passed: true },
+      { id: "b", passed: true },
+      { id: "c", passed: false },
+      { id: "d", passed: false },
+    ]);
+    const md = renderMarkdownSummary(summary);
+    expect(md).toContain("50.0%");
+  });
+
+  it("includes pass/fail icons for each result", () => {
+    const summary = aggregateResults([
+      { id: "pass-q", passed: true, rationale: "good" },
+      { id: "fail-q", passed: false, rationale: "bad" },
+    ]);
+    const md = renderMarkdownSummary(summary);
+    expect(md).toContain("✅");
+    expect(md).toContain("❌");
+  });
+
+  it("includes question IDs in the table", () => {
+    const summary = aggregateResults([{ id: "my-eval-question", passed: true }]);
+    const md = renderMarkdownSummary(summary);
+    expect(md).toContain("my-eval-question");
+  });
+
+  it("escapes pipe characters in rationale to avoid breaking markdown table", () => {
+    const summary = aggregateResults([{ id: "a", passed: true, rationale: "a | b | c" }]);
+    const md = renderMarkdownSummary(summary);
+    // Pipes within the rationale cell should be escaped
+    expect(md).toContain("a \\| b \\| c");
+  });
+
+  it("renders a markdown table with correct columns", () => {
+    const summary = aggregateResults([{ id: "q1", passed: true, rationale: "looks good" }]);
+    const md = renderMarkdownSummary(summary);
+    expect(md).toContain("| Question ID |");
+    expect(md).toContain("| Result |");
+    expect(md).toContain("| Rationale |");
+  });
+});
+
+// ---------------------------------------------------------------------------
+// sanitizeEvalError
+// ---------------------------------------------------------------------------
+
+describe("sanitizeEvalError", () => {
+  it("returns the error message from an Error object", () => {
+    const result = sanitizeEvalError(new Error("something went wrong"));
+    expect(result).toContain("something went wrong");
+  });
+
+  it("redacts bearer tokens", () => {
+    const result = sanitizeEvalError(new Error("Bearer secret-token was rejected"));
+    expect(result).not.toContain("secret-token");
+    expect(result).toContain("[REDACTED_TOKEN]");
+  });
+
+  it("redacts GitHub personal access tokens", () => {
+    const result = sanitizeEvalError(new Error("token ghp_ABCDEFGH1234567890 expired"));
+    expect(result).not.toContain("ghp_ABCDEFGH1234567890");
+    expect(result).toContain("[REDACTED_TOKEN]");
+  });
+
+  it("redacts URLs", () => {
+    const result = sanitizeEvalError(new Error("failed to fetch https://api.example.com/secret"));
+    expect(result).not.toContain("https://api.example.com/secret");
+    expect(result).toContain("[REDACTED_URL]");
+  });
+
+  it("handles non-Error values", () => {
+    expect(sanitizeEvalError("plain string error")).toContain("plain string error");
+    expect(sanitizeEvalError(null)).toBe("unknown error");
+    expect(sanitizeEvalError(undefined)).toBe("unknown error");
+  });
+
+  it("truncates very long error messages to 200 chars", () => {
+    const longMessage = "x".repeat(500);
+    const result = sanitizeEvalError(new Error(longMessage));
+    expect(result.length).toBeLessThanOrEqual(200);
+  });
+});
diff --git a/pkg/constants/constants_test.go b/pkg/constants/constants_test.go
index bccde15c2b4..f38d4906e9a 100644
--- a/pkg/constants/constants_test.go
+++ b/pkg/constants/constants_test.go
@@ -260,6 +260,7 @@ func TestConstantValues(t *testing.T) {
 		{"UploadCodeScanningJobName", string(UploadCodeScanningJobName), "upload_code_scanning_sarif"},
 		{"ConclusionJobName", string(ConclusionJobName), "conclusion"},
 		{"UnlockJobName", string(UnlockJobName), "unlock"},
+		{"EvalJobName", string(EvalJobName), "eval"},
 		{"SafeOutputArtifactName", SafeOutputArtifactName, "safe-output"},
 		{"AgentOutputArtifactName", AgentOutputArtifactName, "agent-output"},
 		{"SafeOutputItemsArtifactName", SafeOutputItemsArtifactName, "safe-outputs-items"},
@@ -301,6 +302,7 @@ func TestKnownBuiltInJobNamesContainsAllKnownJobs(t *testing.T) {
 		string(UploadCodeScanningJobName),
 		string(ConclusionJobName),
 		string(UnlockJobName),
+		string(EvalJobName),
 	}
 
 	for _, jobName := range knownJobs {
diff --git a/pkg/constants/job_constants.go b/pkg/constants/job_constants.go
index 87cc0e51ee9..9cc0a5ee6a2 100644
--- a/pkg/constants/job_constants.go
+++ b/pkg/constants/job_constants.go
@@ -67,6 +67,7 @@ const UploadAssetsJobName JobName = "upload_assets"
 const UploadCodeScanningJobName JobName = "upload_code_scanning_sarif"
 const ConclusionJobName JobName = "conclusion"
 const UnlockJobName JobName = "unlock"
+const EvalJobName JobName = "eval"
 
 // KnownBuiltInJobNames contains all known built-in workflow job names (including aliases).
 // It is used for O(1) membership checks when validating or filtering user-defined job
@@ -84,6 +85,7 @@ var KnownBuiltInJobNames = map[string]struct{}{
 	string(UploadCodeScanningJobName):  {},
 	string(ConclusionJobName):          {},
 	string(UnlockJobName):              {},
+	string(EvalJobName):                {},
 }
 
 // Artifact name constants
@@ -97,6 +99,12 @@ const AgentArtifactName = "agent"
 // DetectionArtifactName is the artifact name for the threat detection log.
 const DetectionArtifactName = "detection"
 
+// EvalArtifactName is the artifact name for BinEval evaluation results.
+const EvalArtifactName = "eval"
+
+// EvalResultsFilename is the filename of the evaluation results JSON written to /tmp/gh-aw/evals/.
+const EvalResultsFilename = "eval_results.json"
+
 // LegacyDetectionArtifactName is the old artifact name used before the rename.
 // Kept for backward compatibility when downloading artifacts from older workflow runs.
 const LegacyDetectionArtifactName = "threat-detection.log"
diff --git a/pkg/parser/schemas/main_workflow_schema.json b/pkg/parser/schemas/main_workflow_schema.json
index d4ff6273c59..3046c822d0d 100644
--- a/pkg/parser/schemas/main_workflow_schema.json
+++ b/pkg/parser/schemas/main_workflow_schema.json
@@ -2947,6 +2947,35 @@
         }
       }
     },
+    "evals": {
+      "description": "(Experimental) BinEval evaluation questions. Each entry defines a binary question to be evaluated independently by an LLM after the agent job completes. Questions must have unique IDs and non-empty text. Results are aggregated into a pass rate summary and uploaded as a workflow artifact.",
+      "type": "array",
+      "items": {
+        "type": "object",
+        "required": ["id", "question"],
+        "properties": {
+          "id": {
+            "type": "string",
+            "minLength": 1,
+            "pattern": "^[a-zA-Z_][a-zA-Z0-9_-]*$",
+            "description": "Unique identifier for this evaluation question. Used in results to correlate questions with outcomes."
+          },
+          "question": {
+            "type": "string",
+            "minLength": 1,
+            "description": "Binary evaluation question phrased to elicit a yes/no answer from an LLM (e.g. 'Does the generated code compile?')."
+          }
+        },
+        "additionalProperties": false
+      },
+      "examples": [
+        [
+          { "id": "builds", "question": "Does the generated code compile?" },
+          { "id": "tests", "question": "Are all tests passing?" },
+          { "id": "focused", "question": "Is the implementation limited to the requested change?" }
+        ]
+      ]
+    },
     "secrets": {
       "description": "Secret values passed to workflow execution. Secrets can be defined as simple strings (GitHub Actions expressions) or objects with 'value' and 'description' properties. Typically used to provide secrets to MCP servers or custom engines. Note: For passing secrets to reusable workflows, use the jobs.<job_id>.secrets field instead.",
       "type": "object",
diff --git a/pkg/workflow/compiler_evals.go b/pkg/workflow/compiler_evals.go
new file mode 100644
index 00000000000..bf1a95fc29f
--- /dev/null
+++ b/pkg/workflow/compiler_evals.go
@@ -0,0 +1,250 @@
+// Package workflow - BinEval evaluation job assembler and frontmatter extraction.
+package workflow
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+	"strings"
+
+	"github.com/github/gh-aw/pkg/constants"
+	"github.com/github/gh-aw/pkg/logger"
+)
+
+var evalsLog = logger.New("workflow:compiler_evals")
+
+// evalsWorkDir is the runtime directory where eval inputs and outputs are stored.
+const evalsWorkDir = "/tmp/gh-aw/evals"
+
+// evalDefaultModel is the model exported to the harness step as GH_AW_EVAL_MODEL. A small,
+// low-latency model is used because each question is a lightweight yes/no classification.
+const evalDefaultModel = "gpt-4o-mini"
+
+// evalJobConditionTemplate is the eval job `if:` expression; %s is the job whose result is checked.
+const evalJobConditionTemplate = "always() && !cancelled() && needs.%s.result != 'skipped'"
+
+// extractEvalsFromFrontmatter converts the frontmatter "evals" value into typed
+// EvalDefinition entries, dropping (and logging) any entry that is not a map or
+// lacks a non-empty id or question. It yields nil when nothing usable is found.
+func extractEvalsFromFrontmatter(frontmatter map[string]any) []EvalDefinition {
+	value, present := frontmatter["evals"]
+	if !present || value == nil {
+		return nil
+	}
+	entries, isSlice := value.([]any)
+	if !isSlice {
+		evalsLog.Printf("evals: unexpected type %T, expected []any", value)
+		return nil
+	}
+	// Collect only well-formed entries; malformed ones are logged and skipped.
+	var parsed []EvalDefinition
+	for idx, entry := range entries {
+		fields, isMap := entry.(map[string]any)
+		if !isMap {
+			evalsLog.Printf("evals[%d]: unexpected type %T, expected map", idx, entry)
+			continue
+		}
+		// Failed string assertions leave the zero value, which is rejected below.
+		evalID, _ := fields["id"].(string)
+		text, _ := fields["question"].(string)
+		if evalID != "" && text != "" {
+			parsed = append(parsed, EvalDefinition{ID: evalID, Question: text})
+			continue
+		}
+		evalsLog.Printf("evals[%d]: skipping entry with missing id or question", idx)
+	}
+	// parsed is still nil when no entry was accepted, matching the absent-key case.
+	return parsed
+}
+
+// validateEvals enforces the eval definition constraints at compile time:
+//   - every entry carries a non-empty ID
+//   - every entry carries a non-empty question
+//   - no ID appears more than once in the list
+//
+// Validation stops at, and returns, the first violation; nil means all entries are valid.
+func validateEvals(evals []EvalDefinition) error {
+	if len(evals) == 0 {
+		return nil
+	}
+	// usedIDs records every ID accepted so far so that repeats can be detected.
+	usedIDs := map[string]bool{}
+	for _, def := range evals {
+		switch {
+		case def.ID == "":
+			return errors.New("evals: each evaluation must have a non-empty id")
+		case def.Question == "":
+			return fmt.Errorf("evals: evaluation %q must have a non-empty question", def.ID)
+		case usedIDs[def.ID]:
+			return fmt.Errorf("evals: duplicate evaluation id %q — all ids must be unique", def.ID)
+		}
+		usedIDs[def.ID] = true
+	}
+	return nil
+}
+
+// buildEvalSpecJSON serializes the eval definitions to a compact JSON array for embedding in a
+// YAML step env var (encoding/json handles escaping). Empty input or a marshal error yields "[]".
+func buildEvalSpecJSON(evals []EvalDefinition) string {
+	b, err := json.Marshal(evals)
+	if err != nil {
+		evalsLog.Printf("buildEvalSpecJSON: marshal error: %v", err)
+	}
+	if err != nil || len(evals) == 0 {
+		return "[]" // a nil slice marshals to "null"; always hand the harness an array
+	}
+	return string(b)
+}
+
+// buildEvalJob assembles the eval job: setup, agent artifact download, harness run, results upload.
+// It needs the agent and activation jobs (plus detection when enabled) and runs on ubuntu-latest.
+// Returns (nil, nil) when no evals are declared; the error result is currently always nil.
+func (c *Compiler) buildEvalJob(data *WorkflowData) (*Job, error) {
+	if len(data.Evals) == 0 {
+		evalsLog.Print("No evals declared, skipping eval job")
+		return nil, nil
+	}
+
+	evalsLog.Printf("Building eval job for %d evaluation(s)", len(data.Evals))
+
+	var steps []string
+
+	// Setup action (same as detection job — sets up runtime tools)
+	setupActionRef := c.resolveActionReference("./actions/setup", data)
+	if setupActionRef != "" || c.actionMode.IsScript() {
+		steps = append(steps, c.generateCheckoutActionsFolder(data)...)
+		// Reuse the trace ID exported by the activation job so eval spans join the same OTLP trace.
+		evalTraceID := fmt.Sprintf("${{ needs.%s.outputs.setup-trace-id }}", constants.ActivationJobName)
+		evalParentSpanID := setupParentSpanNeedsExpr(constants.ActivationJobName)
+		steps = append(steps, c.generateSetupStep(data, setupActionRef, SetupActionDestination, false, evalTraceID, evalParentSpanID)...)
+	}
+
+	// Download agent output artifact for context (prompt, agent_output.json, patches).
+	agentArtifactPrefix := artifactPrefixExprForAgentDownstreamJob(data)
+	steps = append(steps, buildAgentOutputDownloadStepsForEval(agentArtifactPrefix, c.getActionPin)...)
+
+	// Run the eval harness via github-script.
+	steps = append(steps, c.buildEvalHarnessStep(data)...)
+
+	// Upload eval results artifact.
+	steps = append(steps, c.buildEvalArtifactUploadStep(data)...)
+
+	// The eval job always depends on the agent and activation jobs.
+	needs := []string{string(constants.AgentJobName), string(constants.ActivationJobName)}
+
+	// When threat detection is enabled, eval also depends on detection so that
+	// detection conclusions are visible in the artifact download path.
+	if data.SafeOutputs != nil && IsDetectionJobEnabled(data.SafeOutputs) {
+		needs = append(needs, string(constants.DetectionJobName))
+	}
+
+	// Eval job condition: run unless the run was cancelled or the agent job was skipped.
+	condition := fmt.Sprintf(evalJobConditionTemplate, constants.AgentJobName)
+
+	return &Job{
+		Name:    string(constants.EvalJobName),
+		Needs:   needs,
+		If:      condition,
+		RunsOn:  "runs-on: ubuntu-latest",
+		Outputs: nil,
+		Steps:   steps,
+	}, nil
+}
+
+// buildAgentOutputDownloadStepsForEval emits the YAML lines for a continue-on-error
+// download-artifact step that places the agent output artifact, named
+// artifactPrefix + AgentArtifactName, into evalsWorkDir.
+func buildAgentOutputDownloadStepsForEval(artifactPrefix string, pinAction func(string) string) []string {
+	uses := pinAction("actions/download-artifact")
+	step := []string{"      - name: Download agent output artifact\n"}
+	step = append(step, fmt.Sprintf("        uses: %s\n", uses))
+	step = append(step, "        with:\n")
+	step = append(step, fmt.Sprintf("          name: %s%s\n", artifactPrefix, constants.AgentArtifactName))
+	step = append(step, fmt.Sprintf("          path: %s\n", evalsWorkDir))
+	step = append(step, "          merge-multiple: true\n")
+	// continue-on-error keeps a failed download from failing the job at this step.
+	step = append(step, "        continue-on-error: true\n")
+	return step
+}
+
+// buildEvalHarnessStep renders the github-script step that requires eval_harness.cjs and awaits main().
+func (c *Compiler) buildEvalHarnessStep(data *WorkflowData) []string {
+	// Embedded as a YAML single-quoted scalar: the only escape is doubling ' (YAML §7.3.3).
+	// NOTE(review): a question containing `${{ ... }}` would presumably be expanded by Actions
+	// inside this env value — confirm that schema/validation rejects such input.
+	quotedSpec := "'" + strings.ReplaceAll(buildEvalSpecJSON(data.Evals), "'", "''") + "'"
+	return []string{
+		"      - name: Run BinEval evaluations\n",
+		"        id: run-evals\n",
+		fmt.Sprintf("        uses: %s\n", getCachedActionPin("actions/github-script", data)),
+		"        env:\n",
+		"          GH_AW_EVAL_SPEC: " + quotedSpec + "\n",
+		"          GH_AW_EVAL_WORK_DIR: " + evalsWorkDir + "\n",
+		"          GH_AW_EVAL_MODEL: " + evalDefaultModel + "\n",
+		"        with:\n",
+		"          script: |\n",
+		fmt.Sprintf("            const { setupGlobals } = require('%s/setup_globals.cjs');\n", SetupActionDestination),
+		"            setupGlobals(core, github, context, exec, io, getOctokit);\n",
+		fmt.Sprintf("            const { main } = require('%s/eval_harness.cjs');\n", SetupActionDestination),
+		"            await main();\n",
+	}
+}
+
+// buildEvalArtifactUploadStep renders an always()-guarded step uploading evalsWorkDir as an artifact.
+func (c *Compiler) buildEvalArtifactUploadStep(data *WorkflowData) []string {
+	usesLine := fmt.Sprintf("        uses: %s\n", c.getActionPin("actions/upload-artifact"))
+	nameLine := fmt.Sprintf("          name: %s\n", evalArtifactUploadName(data))
+	return []string{
+		"      - name: Upload eval results artifact\n",
+		"        if: always()\n",
+		usesLine,
+		"        with:\n",
+		nameLine,
+		fmt.Sprintf("          path: %s\n", evalsWorkDir),
+		"          if-no-files-found: ignore\n",
+		"          retention-days: 30\n",
+	}
+}
+
+// evalArtifactUploadName returns the artifact name for the eval results:
+// "<sanitized-workflow-id>-<EvalArtifactName>", or just EvalArtifactName when the ID is empty.
+// For workflow_call, the activation job's artifact-prefix expression is prepended at runtime.
+func evalArtifactUploadName(data *WorkflowData) string {
+	if data == nil {
+		return constants.EvalArtifactName
+	}
+	name := constants.EvalArtifactName
+	if sanitizedID := SanitizeWorkflowIDForCacheKey(data.WorkflowID); sanitizedID != "" {
+		name = fmt.Sprintf("%s-%s", sanitizedID, constants.EvalArtifactName)
+	}
+	if strings.Contains(data.On, "workflow_call") {
+		return fmt.Sprintf("${{ needs.%s.outputs.%s }}%s",
+			constants.ActivationJobName,
+			constants.ArtifactPrefixOutputName,
+			name,
+		)
+	}
+	return name
+}
+
+// buildAndAddEvalJob is the compiler entry point for evals: it validates the definitions, builds
+// the eval job, and registers it with the job manager. It returns nil when no evals are declared.
+func (c *Compiler) buildAndAddEvalJob(data *WorkflowData) error {
+	if len(data.Evals) == 0 {
+		return nil
+	}
+	if validationErr := validateEvals(data.Evals); validationErr != nil {
+		return validationErr
+	}
+	job, buildErr := c.buildEvalJob(data)
+	switch {
+	case buildErr != nil:
+		return buildErr
+	case job == nil:
+		return nil
+	}
+	if addErr := c.jobManager.AddJob(job); addErr != nil {
+		return fmt.Errorf("failed to add eval job: %w", addErr)
+	}
+	evalsLog.Print("Added eval job")
+	return nil
+}
diff --git a/pkg/workflow/compiler_evals_test.go b/pkg/workflow/compiler_evals_test.go
new file mode 100644
index 00000000000..f5f1ea11b21
--- /dev/null
+++ b/pkg/workflow/compiler_evals_test.go
@@ -0,0 +1,234 @@
+package workflow
+
+import (
+	"strings"
+	"testing"
+)
+
+// TestExtractEvalsFromFrontmatter validates parsing of the "evals" frontmatter field.
+func TestExtractEvalsFromFrontmatter(t *testing.T) {
+	tests := []struct {
+		name        string
+		frontmatter map[string]any
+		wantLen     int
+		wantIDs     []string
+	}{
+		{
+			name:        "no evals key returns nil",
+			frontmatter: map[string]any{},
+			wantLen:     0,
+		},
+		{
+			name:        "nil evals value returns nil",
+			frontmatter: map[string]any{"evals": nil},
+			wantLen:     0,
+		},
+		{
+			name: "valid evals parsed correctly",
+			frontmatter: map[string]any{
+				"evals": []any{
+					map[string]any{"id": "builds", "question": "Does the code compile?"},
+					map[string]any{"id": "tests", "question": "Are all tests passing?"},
+				},
+			},
+			wantLen: 2,
+			wantIDs: []string{"builds", "tests"},
+		},
+		{
+			name: "entries with empty id are skipped",
+			frontmatter: map[string]any{
+				"evals": []any{
+					map[string]any{"id": "", "question": "Question?"},
+					map[string]any{"id": "valid", "question": "Valid question?"},
+				},
+			},
+			wantLen: 1,
+			wantIDs: []string{"valid"},
+		},
+		{
+			name: "entries with empty question are skipped",
+			frontmatter: map[string]any{
+				"evals": []any{
+					map[string]any{"id": "noquestion", "question": ""},
+					map[string]any{"id": "good", "question": "Good question?"},
+				},
+			},
+			wantLen: 1,
+			wantIDs: []string{"good"},
+		},
+		{
+			name: "wrong type for evals returns nil",
+			frontmatter: map[string]any{
+				"evals": "not-a-slice",
+			},
+			wantLen: 0,
+		},
+		{
+			name: "non-map items in slice are skipped",
+			frontmatter: map[string]any{
+				"evals": []any{
+					"not-a-map",
+					map[string]any{"id": "valid", "question": "Valid?"},
+				},
+			},
+			wantLen: 1,
+			wantIDs: []string{"valid"},
+		},
+		{
+			name: "empty slice returns nil",
+			frontmatter: map[string]any{
+				"evals": []any{},
+			},
+			wantLen: 0,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := extractEvalsFromFrontmatter(tt.frontmatter)
+			// Cases that expect no definitions are documented to return nil, not an
+			// empty slice, so the nil-ness is asserted alongside the length.
+			if len(result) != tt.wantLen || (tt.wantLen == 0 && result != nil) {
+				t.Errorf("extractEvalsFromFrontmatter() = %#v, want len %d (nil when 0)", result, tt.wantLen)
+			}
+			for i, wantID := range tt.wantIDs {
+				if i >= len(result) {
+					t.Errorf("missing result[%d], expected id=%q", i, wantID)
+				} else if result[i].ID != wantID {
+					t.Errorf("result[%d].ID = %q, want %q", i, result[i].ID, wantID)
+				}
+			}
+		})
+	}
+}
+
+// TestValidateEvals covers the non-empty and uniqueness rules enforced by validateEvals.
+func TestValidateEvals(t *testing.T) {
+	tests := []struct {
+		name    string
+		evals   []EvalDefinition
+		wantErr bool
+		errMsg  string
+	}{
+		{
+			name:    "nil evals passes",
+			evals:   nil,
+			wantErr: false,
+		},
+		{
+			name:    "empty evals passes",
+			evals:   []EvalDefinition{},
+			wantErr: false,
+		},
+		{
+			name: "valid evals pass",
+			evals: []EvalDefinition{
+				{ID: "builds", Question: "Does the code compile?"},
+				{ID: "tests", Question: "Are all tests passing?"},
+			},
+			wantErr: false,
+		},
+		{
+			name: "duplicate id fails",
+			evals: []EvalDefinition{
+				{ID: "builds", Question: "Does the code compile?"},
+				{ID: "builds", Question: "Another question?"},
+			},
+			wantErr: true,
+			errMsg:  "duplicate evaluation id",
+		},
+		{
+			name: "empty id fails",
+			evals: []EvalDefinition{
+				{ID: "", Question: "Some question?"},
+			},
+			wantErr: true,
+			errMsg:  "non-empty id",
+		},
+		{
+			name: "empty question fails",
+			evals: []EvalDefinition{
+				{ID: "myeval", Question: ""},
+			},
+			wantErr: true,
+			errMsg:  "non-empty question",
+		},
+		{
+			name: "single valid eval passes",
+			evals: []EvalDefinition{
+				{ID: "focused", Question: "Is the implementation limited to the requested change?"},
+			},
+			wantErr: false,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			err := validateEvals(tc.evals)
+			// The three failure modes are mutually exclusive, so a switch
+			// reports at most one of them per case.
+			switch {
+			case tc.wantErr && err == nil:
+				t.Error("validateEvals() expected error, got nil")
+			case !tc.wantErr && err != nil:
+				t.Errorf("validateEvals() unexpected error: %v", err)
+			case err != nil && tc.errMsg != "" && !containsString(err.Error(), tc.errMsg):
+				// Only reachable for an expected error whose text lacks the fragment.
+				t.Errorf("validateEvals() error %q does not contain %q", err.Error(), tc.errMsg)
+			}
+		})
+	}
+}
+
+// TestBuildEvalSpecJSON validates JSON serialization of eval definitions.
+func TestBuildEvalSpecJSON(t *testing.T) {
+	tests := []struct {
+		name  string
+		evals []EvalDefinition
+	}{
+		{
+			name:  "empty evals produces empty array",
+			evals: nil,
+		},
+		{
+			name: "single eval is serialized",
+			evals: []EvalDefinition{
+				{ID: "builds", Question: "Does the code compile?"},
+			},
+		},
+		{
+			name: "multiple evals are serialized in order",
+			evals: []EvalDefinition{
+				{ID: "a", Question: "Question A?"},
+				{ID: "b", Question: "Question B?"},
+				{ID: "c", Question: "Question C?"},
+			},
+		},
+		{
+			name: "questions with special characters are escaped",
+			evals: []EvalDefinition{
+				{ID: "special", Question: `Does it handle "quotes" and 'apostrophes'?`},
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := buildEvalSpecJSON(tt.evals)
+			bracketed := strings.HasPrefix(result, "[") && strings.HasSuffix(result, "]")
+			if result != "null" && !bracketed {
+				t.Errorf("buildEvalSpecJSON() = %q, expected JSON array", result)
+			}
+			// Bracket checks alone prove little: every declared id must be serialized.
+			for _, e := range tt.evals {
+				if !strings.Contains(result, `"id":"`+e.ID+`"`) {
+					t.Errorf("buildEvalSpecJSON() = %q, missing id %q", result, e.ID)
+				}
+			}
+		})
+	}
+}
+
+func containsString(s, substr string) bool {
+	return strings.Contains(s, substr)
+}
diff --git a/pkg/workflow/compiler_jobs.go b/pkg/workflow/compiler_jobs.go
index fd0242b5ecc..1072559d2aa 100644
--- a/pkg/workflow/compiler_jobs.go
+++ b/pkg/workflow/compiler_jobs.go
@@ -212,6 +212,11 @@ func (c *Compiler) buildJobs(data *WorkflowData, markdownPath string) error {
 		return fmt.Errorf("failed to build safe outputs jobs: %w", err)
 	}
 
+	// Build eval job if BinEval questions are declared
+	if err := c.buildAndAddEvalJob(data); err != nil {
+		return fmt.Errorf("failed to build eval job: %w", err)
+	}
+
 	// Apply jobs.<builtin-job>.pre-steps customizations to already-created built-in jobs
 	// before processing non-built-in custom jobs.
 	if err := c.applyBuiltinJobPreSteps(data); err != nil {
diff --git a/pkg/workflow/compiler_orchestrator_workflow.go b/pkg/workflow/compiler_orchestrator_workflow.go
index e4f3c21d3ef..d1bc9da84da 100644
--- a/pkg/workflow/compiler_orchestrator_workflow.go
+++ b/pkg/workflow/compiler_orchestrator_workflow.go
@@ -544,6 +544,9 @@ func (c *Compiler) extractAdditionalConfigurations(
 	workflowData.Experiments = experimentVariantsFromConfigs(workflowData.ExperimentConfigs)
 	workflowData.ExperimentsStorage = extractExperimentsStorageFromFrontmatter(frontmatter)
 
+	// Extract BinEval evaluation definitions.
+	workflowData.Evals = extractEvalsFromFrontmatter(frontmatter)
+
 	return nil
 }
 
diff --git a/pkg/workflow/compiler_types.go b/pkg/workflow/compiler_types.go
index 7e085984236..03d87916de9 100644
--- a/pkg/workflow/compiler_types.go
+++ b/pkg/workflow/compiler_types.go
@@ -603,6 +603,7 @@ type WorkflowData struct {
 	Experiments                    map[string][]string             // A/B testing experiments: maps experiment name to variant list (from frontmatter)
 	ExperimentConfigs              map[string]*ExperimentConfig    // Full A/B experiment metadata (populated alongside Experiments)
 	ExperimentsStorage             string                          // "cache" or "repo" (default "repo"); controls how experiment state is persisted across runs
+	Evals                          []EvalDefinition                // BinEval evaluation questions declared in the workflow (experimental)
 	CachedConcurrencyGroupExprSet  bool                            // true once CachedConcurrencyGroupExprErr has been populated; distinguishes "valid (nil)" from "not yet computed"
 	CachedParsedToolsets           []string                        // cached result of ParseGitHubToolsets for the GitHub tool (for performance optimization); populated by applyDefaults
 	CachedAllowedDomainsStr        string                          // cached allowed-domains string for sanitization (for performance optimization); computed once and reused across multiple compilation steps
diff --git a/pkg/workflow/compiler_validators.go b/pkg/workflow/compiler_validators.go
index 29fa8f21f84..1adb178778d 100644
--- a/pkg/workflow/compiler_validators.go
+++ b/pkg/workflow/compiler_validators.go
@@ -305,6 +305,7 @@ func (c *Compiler) emitExperimentalFeatureWarnings(workflowData *WorkflowData) {
 		{enabled: workflowData.EngineConfig != nil && workflowData.EngineConfig.CopilotSDK, message: "Using experimental feature: engine.copilot-sdk"},
 		{enabled: isFeatureEnabled(constants.GHAWDetectionFeatureFlag, workflowData), message: "Using experimental feature: gh-aw-detection"},
 		{enabled: len(workflowData.LSP) > 0, message: "Using experimental feature: lsp"},
+		{enabled: len(workflowData.Evals) > 0, message: "Using experimental feature: evals"},
 	}
 	for _, warning := range warnings {
 		if warning.enabled {
diff --git a/pkg/workflow/evals_experimental_warning_test.go b/pkg/workflow/evals_experimental_warning_test.go
new file mode 100644
index 00000000000..f25bbdb8d2e
--- /dev/null
+++ b/pkg/workflow/evals_experimental_warning_test.go
@@ -0,0 +1,138 @@
+//go:build integration
+
+package workflow
+
+import (
+	"bytes"
+	"io"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/github/gh-aw/pkg/testutil"
+)
+
+// TestEvalsExperimentalWarning tests that declaring an "evals" section in the
+// workflow frontmatter emits an experimental feature warning at compile time.
+func TestEvalsExperimentalWarning(t *testing.T) {
+	tests := []struct {
+		name                 string
+		content              string
+		expectWarning        bool
+		expectedOtherWarning string
+	}{
+		{
+			name: "evals field produces experimental warning",
+			content: `---
+on: workflow_dispatch
+engine: copilot
+permissions:
+  contents: read
+evals:
+  - id: builds
+    question: Does the generated code compile?
+  - id: tests
+    question: Are all tests passing?
+---
+
+# Test Workflow
+`,
+			expectWarning: true,
+		},
+		{
+			name: "no evals field does not produce experimental warning",
+			content: `---
+on: workflow_dispatch
+engine: copilot
+permissions:
+  contents: read
+---
+
+# Test Workflow
+`,
+			expectWarning: false,
+		},
+		{
+			name: "other experimental warning does not trigger evals warning",
+			content: `---
+on: workflow_dispatch
+engine:
+  id: copilot
+  copilot-sdk: true
+permissions:
+  contents: read
+---
+
+# Test Workflow
+`,
+			expectWarning:        false,
+			expectedOtherWarning: "Using experimental feature: engine.copilot-sdk",
+		},
+		{
+			name: "single eval produces experimental warning",
+			content: `---
+on: workflow_dispatch
+engine: copilot
+permissions:
+  contents: read
+evals:
+  - id: focused
+    question: Is the implementation limited to the requested change?
+---
+
+# Test Workflow
+`,
+			expectWarning: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			tmpDir := testutil.TempDir(t, "evals-experimental-warning-test")
+
+			testFile := filepath.Join(tmpDir, "test-workflow.md")
+			if err := os.WriteFile(testFile, []byte(tt.content), 0644); err != nil {
+				t.Fatal(err)
+			}
+
+			r, w, pipeErr := os.Pipe()
+			if pipeErr != nil {
+				t.Fatal(pipeErr)
+			}
+			oldStderr := os.Stderr
+			os.Stderr = w
+			compiler := NewCompiler()
+			compiler.SetStrictMode(false)
+			err := compiler.CompileWorkflow(testFile)
+
+			w.Close()
+			os.Stderr = oldStderr
+			var buf bytes.Buffer
+			io.Copy(&buf, r)
+			stderrOutput := buf.String()
+			if err != nil {
+				t.Fatalf("Expected compilation to succeed but it failed: %v", err)
+			}
+
+			expectedMessage := "Using experimental feature: evals"
+
+			if tt.expectWarning {
+				if !strings.Contains(stderrOutput, expectedMessage) {
+					t.Errorf("Expected warning containing %q, got stderr:\n%s", expectedMessage, stderrOutput)
+				}
+				if compiler.GetWarningCount() == 0 {
+					t.Error("Expected warning count > 0 but got 0")
+				}
+				return
+			}
+
+			if strings.Contains(stderrOutput, expectedMessage) {
+				t.Errorf("Did not expect warning '%s', but got stderr:\n%s", expectedMessage, stderrOutput)
+			}
+			if tt.expectedOtherWarning != "" && !strings.Contains(stderrOutput, tt.expectedOtherWarning) {
+				t.Errorf("Expected non-evals warning containing %q, got stderr:\n%s", tt.expectedOtherWarning, stderrOutput)
+			}
+		})
+	}
+}
diff --git a/pkg/workflow/frontmatter_types.go b/pkg/workflow/frontmatter_types.go
index 975fa3d7fff..b80597b1307 100644
--- a/pkg/workflow/frontmatter_types.go
+++ b/pkg/workflow/frontmatter_types.go
@@ -120,6 +120,54 @@ type GuardrailMetric struct {
 	Threshold string `json:"threshold"`
 }
 
+// EvalDefinition represents a single BinEval evaluation question declared in the workflow.
+// Each question is evaluated independently after agent execution, producing a binary
+// pass/fail result with an optional rationale.
+type EvalDefinition struct {
+	// ID is the unique identifier for this question; the workflow schema restricts it to
+	// ^[a-zA-Z_][a-zA-Z0-9_-]*$ and validateEvals rejects empty or duplicate values.
+	ID string `json:"id"`
+
+	// Question is the binary evaluation question to be answered by the LLM.
+	// Must be non-empty and should be phrased to elicit a yes/no response.
+	Question string `json:"question"`
+}
+
+// EvalResult represents the outcome of evaluating a single BinEval question.
+// NOTE(review): presumably mirrors the per-question JSON written by the eval harness — confirm.
+type EvalResult struct {
+	// ID matches the EvalDefinition.ID that produced this result.
+	ID string `json:"id"`
+
+	// Passed indicates whether the evaluation question was answered affirmatively.
+	Passed bool `json:"passed"`
+
+	// Rationale is an optional brief explanation for the result from the LLM.
+	Rationale string `json:"rationale,omitempty"`
+
+	// Confidence is an optional confidence score in the range [0, 1].
+	Confidence *float64 `json:"confidence,omitempty"`
+}
+
+// EvalSummary aggregates EvalResult entries (presumably the eval_results.json shape — TODO confirm).
+type EvalSummary struct {
+	// Total is the number of evaluation questions evaluated.
+	Total int `json:"total"`
+
+	// Passed is the number of questions that received a passing result.
+	Passed int `json:"passed"`
+
+	// Failed is the number of questions that received a failing result.
+	Failed int `json:"failed"`
+
+	// PassRate is the fraction of questions that passed (Passed / Total).
+	// Zero when Total is zero.
+	PassRate float64 `json:"pass_rate"`
+
+	// Results holds the individual evaluation results in declaration order.
+	Results []EvalResult `json:"results"`
+}
+
 // ExperimentNotify specifies where to post significance alerts when an experiment reaches
 // statistical significance.
 type ExperimentNotify struct {
@@ -371,6 +419,10 @@ type FrontmatterConfig struct {
 	// Experiments during frontmatter parsing.  Keys match those of Experiments.
 	ExperimentConfigs map[string]*ExperimentConfig `json:"-"`
 
+	// Evals holds the BinEval evaluation questions declared in the workflow.
+	// Each entry defines a binary question that is evaluated after the agent job completes.
+	Evals []EvalDefinition `json:"evals,omitempty"`
+
 	// ModelCosts holds model pricing data in the same structure as models.json.
 	// Declared in frontmatter as the `models` field (json:"models,omitempty") using a top-level
 	// `providers` key. At runtime the activation job merges this with the built-in models.json

From 5d43e82f2a7faa66fcfa136061cfeb8a35e34366 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 28 Jun 2026 17:29:23 +0000
Subject: [PATCH 2/3] refactor: replace GitHub Models API with AWF agentic
 engine for BinEval
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove direct GitHub Models API calls from eval_harness.cjs; keep only shared utility functions (readEvalSpec, buildEvalPrompt, aggregateResults, renderMarkdownSummary, sanitizeEvalError)
- Add actions/setup/md/eval.md: eval prompt template instructing the engine to output EVAL_RESULT:{...json...}
- Add actions/setup/js/setup_eval.cjs: prompt setup script (mirrors setup_threat_detection.cjs)
- Add actions/setup/js/parse_eval_results.cjs: result parser extracting EVAL_RESULT from engine log
- Update compiler_evals.go: eval job now follows the detection job pattern — pulls AWF containers, clears MCP config, installs the agentic engine, runs it inside AWF, parses results
- Add EvalLogPath, EvalDir, DefaultEvalMaxAICredits constants to pkg/constants

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com>
---
 actions/setup/js/eval_harness.cjs       | 249 +-----------------------
 actions/setup/js/parse_eval_results.cjs | 207 ++++++++++++++++++++
 actions/setup/js/setup_eval.cjs         | 132 +++++++++++++
 actions/setup/md/eval.md                |  40 ++++
 pkg/constants/constants.go              |  10 +
 pkg/workflow/compiler_evals.go          | 237 +++++++++++++++++++---
 6 files changed, 611 insertions(+), 264 deletions(-)
 create mode 100644 actions/setup/js/parse_eval_results.cjs
 create mode 100644 actions/setup/js/setup_eval.cjs
 create mode 100644 actions/setup/md/eval.md

diff --git a/actions/setup/js/eval_harness.cjs b/actions/setup/js/eval_harness.cjs
index 7c5ea4d3f04..ce5e70b5bc3 100644
--- a/actions/setup/js/eval_harness.cjs
+++ b/actions/setup/js/eval_harness.cjs
@@ -4,58 +4,18 @@
 /**
  * eval_harness.cjs
  *
- * BinEval Evaluation Harness (experimental)
+ * BinEval Evaluation Harness utilities (experimental)
  *
- * Evaluates a set of binary questions about a completed agent workflow run.
- * Each question is evaluated independently by an LLM, producing a binary
- * pass/fail result with an optional rationale.
+ * Shared utility functions used by the BinEval evaluation pipeline:
+ *   - setup_eval.cjs      (prompt setup — writes /tmp/gh-aw/aw-prompts/prompt.txt)
+ *   - parse_eval_results.cjs (result parsing — reads engine log, writes eval_results.json)
  *
- * Environment variables (set by the compiled workflow step):
- *   GH_AW_EVAL_SPEC       - JSON array of {id, question} evaluation definitions
- *   GH_AW_EVAL_WORK_DIR   - Working directory for evals (default: /tmp/gh-aw/evals)
- *   GH_AW_EVAL_MODEL      - LLM model to use (default: gpt-4o-mini)
- *
- * Input files (downloaded from agent artifact into GH_AW_EVAL_WORK_DIR):
- *   agent_output.json     - Structured agent output for context
- *   aw-prompts/prompt.txt - Original workflow prompt for context
- *
- * Output files (written to GH_AW_EVAL_WORK_DIR):
- *   eval_results.json     - Aggregated evaluation summary and per-question results
- *
- * Design principles:
- *   - Each question is evaluated independently (BinEval)
- *   - Partial failures are tolerated (a failed LLM call for one question does not
- *     abort evaluation of the remaining questions)
- *   - The evaluator is deterministic in aggregation: pass_rate = passed / total
- *   - No MCPs, no checkout: the harness only reads downloaded artifact files
+ * Inference is performed by the configured agentic engine running inside AWF,
+ * not by direct API calls from this module.
  */
 
 "use strict";
 
-const fs = require("fs");
-const path = require("path");
-const https = require("https");
-
-// ---------------------------------------------------------------------------
-// Constants
-// ---------------------------------------------------------------------------
-
-const DEFAULT_WORK_DIR = "/tmp/gh-aw/evals";
-const DEFAULT_MODEL = "gpt-4o-mini";
-
-// GitHub Models API endpoint for chat completions (OpenAI-compatible).
-// Uses GITHUB_TOKEN for authentication — no additional credentials required.
-const GITHUB_MODELS_ENDPOINT = "models.github.com";
-const GITHUB_MODELS_PATH = "/inference/chat/completions";
-const EVAL_SYSTEM_PROMPT =
-  "You are an objective evaluator. Answer binary (yes/no) questions about agentic workflow outputs. Always respond with a JSON object containing 'passed' (boolean), 'rationale' (string), and 'confidence' (number 0-1).";
-
-// These caps keep the prompt comfortably within the context window of the
-// default small eval model while still leaving room for the JSON answer.
-const MAX_AGENT_OUTPUT_CHARS = 8000;
-const MAX_PROMPT_CHARS = 4000;
-const MAX_RATIONALE_CHARS = 500;
-
 // ---------------------------------------------------------------------------
 // Types (JSDoc)
 // ---------------------------------------------------------------------------
@@ -104,26 +64,6 @@ function readEvalSpec() {
   }
 }
 
-/**
- * Reads and truncates a file for inclusion as LLM context.
- * Returns an empty string if the file does not exist.
- * @param {string} filePath
- * @param {number} maxChars
- * @returns {string}
- */
-function readContextFile(filePath, maxChars) {
-  if (!fs.existsSync(filePath)) {
-    return "";
-  }
-  try {
-    const content = fs.readFileSync(filePath, "utf-8");
-    if (content.length <= maxChars) return content;
-    return content.slice(0, maxChars) + "\n... (truncated)";
-  } catch {
-    return "";
-  }
-}
-
 /**
  * Sanitizes an error message before including it in eval artifacts or logs.
  * Redacts tokens, URLs, and control characters to prevent credential leaks.
@@ -145,6 +85,7 @@ function sanitizeEvalError(err) {
 
 /**
  * Builds an evaluation prompt for a single binary question given the agent context.
+ * Used by tests and by setup_eval.cjs when constructing the engine prompt.
  * @param {string} question
  * @param {string} agentContext
  * @returns {string}
@@ -164,96 +105,6 @@ function buildEvalPrompt(question, agentContext) {
   );
 }
 
-/**
- * Makes an HTTPS POST request and returns the response body as a string.
- * @param {object} options - https.request options
- * @param {string} body - Request body
- * @returns {Promise<string>}
- */
-function httpsPost(options, body) {
-  return new Promise((resolve, reject) => {
-    const req = https.request(options, res => {
-      const chunks = [];
-      res.on("data", chunk => chunks.push(chunk));
-      res.on("end", () => {
-        const responseBody = Buffer.concat(chunks).toString("utf-8");
-        if (res.statusCode && res.statusCode >= 400) {
-          reject(new Error(`HTTP ${res.statusCode}: ${responseBody.slice(0, 200)}`));
-        } else {
-          resolve(responseBody);
-        }
-      });
-    });
-    req.on("error", reject);
-    req.write(body);
-    req.end();
-  });
-}
-
-/**
- * Calls the GitHub Models API to evaluate a single binary question.
- * Returns an EvalResult. On failure, returns a failed result with an error rationale
- * so the harness can continue evaluating remaining questions (partial failure tolerance).
- * @param {string} token
- * @param {string} model
- * @param {string} question
- * @param {string} agentContext
- * @returns {Promise<{passed: boolean, rationale: string, confidence?: number}>}
- */
-async function callLLMForQuestion(token, model, question, agentContext) {
-  const prompt = buildEvalPrompt(question, agentContext);
-  const requestBody = JSON.stringify({
-    model,
-    messages: [
-      {
-        role: "system",
-        content: EVAL_SYSTEM_PROMPT,
-      },
-      { role: "user", content: prompt },
-    ],
-    response_format: { type: "json_object" },
-    temperature: 0,
-    // The response is a tiny JSON object with three fields, so 256 tokens leaves
-    // ample headroom for rationale text without inflating per-question cost.
-    max_tokens: 256,
-  });
-
-  const options = {
-    hostname: GITHUB_MODELS_ENDPOINT,
-    path: GITHUB_MODELS_PATH,
-    method: "POST",
-    headers: {
-      "Content-Type": "application/json",
-      Authorization: "Bearer " + token,
-      "Content-Length": Buffer.byteLength(requestBody),
-    },
-  };
-
-  const responseBody = await httpsPost(options, requestBody);
-  const response = JSON.parse(responseBody);
-
-  const content = response?.choices?.[0]?.message?.content;
-  if (!content) {
-    throw new Error("Empty response from LLM");
-  }
-
-  const parsed = JSON.parse(content);
-  const rationale = typeof parsed.rationale === "string" ? parsed.rationale : "";
-  if (rationale.length > MAX_RATIONALE_CHARS) {
-    console.warn(`Truncating eval rationale from ${rationale.length} to ${MAX_RATIONALE_CHARS} characters`);
-  }
-  const result = {
-    passed: Boolean(parsed.passed),
-    rationale: rationale.slice(0, MAX_RATIONALE_CHARS),
-  };
-
-  if (typeof parsed.confidence === "number") {
-    result.confidence = Math.max(0, Math.min(1, parsed.confidence));
-  }
-
-  return result;
-}
-
 /**
  * Aggregates an array of EvalResult into an EvalSummary.
  * Aggregation is deterministic: pass_rate = passed / total.
@@ -284,88 +135,4 @@ function renderMarkdownSummary(summary) {
   return lines.join("");
 }
 
-// ---------------------------------------------------------------------------
-// Main
-// ---------------------------------------------------------------------------
-
-/**
- * Main entry point for the eval harness.
- * @returns {Promise<void>}
- */
-async function main() {
-  const workDir = process.env.GH_AW_EVAL_WORK_DIR || DEFAULT_WORK_DIR;
-  const model = process.env.GH_AW_EVAL_MODEL || DEFAULT_MODEL;
-  const token = process.env.GITHUB_TOKEN || "";
-
-  core.info(`BinEval harness starting (model: ${model}, workDir: ${workDir})`);
-
-  // Parse eval definitions
-  const evals = readEvalSpec();
-  if (evals.length === 0) {
-    core.warning("No eval definitions found in GH_AW_EVAL_SPEC — nothing to evaluate");
-    return;
-  }
-  core.info(`Evaluating ${evals.length} question(s)`);
-
-  // Load agent context files
-  const agentOutputPath = path.join(workDir, "agent_output.json");
-  const promptPath = path.join(workDir, "aw-prompts", "prompt.txt");
-
-  const agentOutputRaw = readContextFile(agentOutputPath, MAX_AGENT_OUTPUT_CHARS);
-  const promptRaw = readContextFile(promptPath, MAX_PROMPT_CHARS);
-
-  // Build context string from available files
-  const contextParts = [];
-  if (promptRaw) contextParts.push(`### Workflow Prompt\n${promptRaw}`);
-  if (agentOutputRaw) contextParts.push(`### Agent Output\n${agentOutputRaw}`);
-  const agentContext = contextParts.join("\n\n");
-
-  if (!agentContext) {
-    core.warning("No agent context files found — evaluations will have limited context");
-  }
-
-  // Evaluate each question independently (partial failure tolerance)
-  const results = [];
-  for (const evalDef of evals) {
-    core.info(`Evaluating: ${evalDef.id} — "${evalDef.question}"`);
-    try {
-      if (!token) {
-        throw new Error("GITHUB_TOKEN is not set — cannot call GitHub Models API");
-      }
-      const result = await callLLMForQuestion(token, model, evalDef.question, agentContext);
-      results.push({ id: evalDef.id, ...result });
-      const icon = result.passed ? "✅" : "❌";
-      core.info(`  ${icon} ${result.passed ? "pass" : "fail"} — ${result.rationale}`);
-    } catch (err) {
-      const sanitizedError = sanitizeEvalError(err);
-      core.warning(`  ⚠️ Evaluation failed for "${evalDef.id}": ${sanitizedError}`);
-      results.push({
-        id: evalDef.id,
-        passed: false,
-        rationale: `Evaluation error: ${sanitizedError}`,
-      });
-    }
-  }
-
-  // Aggregate results
-  const summary = aggregateResults(results);
-
-  // Write JSON results
-  fs.mkdirSync(workDir, { recursive: true });
-  const resultsPath = path.join(workDir, "eval_results.json");
-  fs.writeFileSync(resultsPath, JSON.stringify(summary, null, 2));
-  core.info(`Results written to ${resultsPath}`);
-
-  // Write step summary
-  const markdownSummary = renderMarkdownSummary(summary);
-  core.summary.addRaw(markdownSummary).write();
-
-  // Set outputs
-  core.setOutput("eval_passed", String(summary.passed));
-  core.setOutput("eval_total", String(summary.total));
-  core.setOutput("eval_pass_rate", summary.pass_rate.toFixed(4));
-
-  core.info(`BinEval complete: ${summary.passed}/${summary.total} passed (${(summary.pass_rate * 100).toFixed(1)}%)`);
-}
-
-module.exports = { main, readEvalSpec, buildEvalPrompt, aggregateResults, renderMarkdownSummary, sanitizeEvalError };
+module.exports = { readEvalSpec, buildEvalPrompt, aggregateResults, renderMarkdownSummary, sanitizeEvalError };
diff --git a/actions/setup/js/parse_eval_results.cjs b/actions/setup/js/parse_eval_results.cjs
new file mode 100644
index 00000000000..6f5eb18d1f5
--- /dev/null
+++ b/actions/setup/js/parse_eval_results.cjs
@@ -0,0 +1,207 @@
+// @ts-check
+/// <reference types="@actions/github-script" />
+
+/**
+ * parse_eval_results.cjs
+ *
+ * BinEval Evaluation Result Parser (experimental)
+ *
+ * Parses the agentic engine's log file for the structured EVAL_RESULT marker
+ * written by the engine during a BinEval evaluation run, then aggregates the
+ * per-question results into a JSON summary and a markdown step summary.
+ *
+ * The engine writes its verdict to stdout which is piped through `tee -a` to
+ * eval.log. This parser reads that file to extract EVAL_RESULT:{...json...}.
+ *
+ * Output files (written to GH_AW_EVAL_WORK_DIR):
+ *   eval_results.json - Aggregated evaluation summary and per-question results
+ */
+
+"use strict";
+
+const fs = require("fs");
+const path = require("path");
+const { aggregateResults, renderMarkdownSummary } = require("./eval_harness.cjs");
+const { ERR_PARSE, ERR_SYSTEM } = require("./error_codes.cjs");
+
+const DEFAULT_WORK_DIR = "/tmp/gh-aw/eval";
+const EVAL_LOG_FILENAME = "eval.log";
+const RESULT_PREFIX = "EVAL_RESULT:";
+
+// ---------------------------------------------------------------------------
+// Log parsing
+// ---------------------------------------------------------------------------
+
+/**
+ * Returns the leading part of `text` that ends at the brace closing the first JSON object
+ * found at or after index RESULT_PREFIX.length. Braces inside double-quoted strings are
+ * ignored, and a backslash inside a string hides the character that follows it.
+ *
+ * @param {string} text - Text starting with RESULT_PREFIX
+ * @returns {string|null} `text` from index 0 through the matching "}", or null if no "{" is found or it is never closed
+ */
+function extractResultFromText(text) {
+  const jsonStartPos = text.indexOf("{", RESULT_PREFIX.length); // skip past the prefix before looking for "{"
+  if (jsonStartPos === -1) return null;
+
+  let depth = 0; // count of currently open braces outside strings
+  let inString = false; // true while between an opening and a closing double quote
+  let escaped = false; // true when the previous character was a backslash inside a string
+  let jsonEndPos = -1; // index of the matching closing brace, -1 until found
+
+  for (let i = jsonStartPos; i < text.length; i++) {
+    const ch = text[i];
+    if (escaped) {
+      escaped = false; // this character is escaped: consume it without interpreting it
+      continue;
+    }
+    if (ch === "\\" && inString) {
+      escaped = true; // backslashes only count as escapes inside a string
+      continue;
+    }
+    if (ch === '"') {
+      inString = !inString; // an unescaped quote opens or closes a string
+      continue;
+    }
+    if (!inString) {
+      if (ch === "{") depth++;
+      else if (ch === "}") {
+        depth--;
+        if (depth === 0) {
+          jsonEndPos = i; // the object opened at jsonStartPos is now closed
+          break;
+        }
+      }
+    }
+  }
+
+  if (jsonEndPos === -1) return null; // ran out of text before the object was closed
+  return text.slice(0, jsonEndPos + 1); // slice starts at 0, so the prefix and anything before "{" are kept
+}
+
+/**
+ * Unwrap a stream-json encoded line: if the text is a JSON object with a
+ * "result" string field that contains RESULT_PREFIX, extract that inner string.
+ *
+ * @param {string} line
+ * @returns {string} The unwrapped line, or the original if not stream-json
+ */
+function extractFromStreamJson(line) {
+  if (!line.includes(RESULT_PREFIX)) return line;
+  try {
+    const outer = JSON.parse(line);
+    if (outer && typeof outer.result === "string" && outer.result.includes(RESULT_PREFIX)) {
+      return outer.result;
+    }
+  } catch {
+    // Not valid JSON — use the line as-is
+  }
+  return line;
+}
+
+/**
+ * Parse the eval log file for the EVAL_RESULT marker.
+ * Returns the parsed results array or an error string.
+ *
+ * @param {string} logContent - Contents of eval.log
+ * @returns {{ results: Array<{id: string, passed: boolean, rationale?: string}> | null, error: string | null }}
+ */
+function parseEvalLog(logContent) {
+  const lines = logContent.split("\n");
+  for (const rawLine of lines) {
+    const line = extractFromStreamJson(rawLine.trim());
+    const idx = line.indexOf(RESULT_PREFIX);
+    if (idx === -1) continue;
+    const candidate = line.slice(idx);
+    const extracted = extractResultFromText(candidate);
+    if (!extracted) continue;
+    const jsonStr = extracted.slice(RESULT_PREFIX.length);
+    try {
+      const escapedJson = jsonStr.replace(/\n/g, "\\n");
+      const parsed = JSON.parse(escapedJson);
+      if (parsed && Array.isArray(parsed.results)) {
+        return { results: parsed.results, error: null };
+      }
+      return { results: null, error: "EVAL_RESULT JSON did not contain a 'results' array" };
+    } catch (e) {
+      return { results: null, error: `Failed to parse EVAL_RESULT JSON: ${e instanceof Error ? e.message : String(e)}` };
+    }
+  }
+  return { results: null, error: "No EVAL_RESULT found in eval log" };
+}
+
+// ---------------------------------------------------------------------------
+// Main
+// ---------------------------------------------------------------------------
+
+/**
+ * Reads eval.log from the work dir, parses the EVAL_RESULT marker, then writes eval_results.json, the step summary and the step outputs.
+ * @returns {Promise<void>} Resolves after reporting; failures are signalled through core.setFailed rather than thrown
+ */
+async function main() {
+  const workDir = process.env.GH_AW_EVAL_WORK_DIR || DEFAULT_WORK_DIR;
+  const logPath = path.join(workDir, EVAL_LOG_FILENAME);
+
+  core.info(`Parsing BinEval results from: ${logPath}`);
+
+  // A missing log fails the step: without it there is nothing to parse
+  if (!fs.existsSync(logPath)) {
+    const msg = `${ERR_SYSTEM}: Eval log not found at ${logPath}`;
+    core.error(msg);
+    core.setFailed(msg);
+    return;
+  }
+
+  let logContent;
+  try {
+    logContent = fs.readFileSync(logPath, "utf-8");
+  } catch (/** @type {any} */ err) {
+    const msg = `${ERR_SYSTEM}: Failed to read eval log: ${err.message}`;
+    core.error(msg);
+    core.setFailed(msg);
+    return;
+  }
+
+  core.info(`Eval log: ${logContent.split("\n").length} lines, ${logContent.length} bytes`); // NOTE: .length counts UTF-16 code units, logged here as "bytes"
+
+  // Extract the per-question results; a parse error or missing results fails the step
+  const { results: rawResults, error } = parseEvalLog(logContent);
+  if (error || !rawResults) {
+    const msg = `${ERR_PARSE}: ${error || "No eval results found"}`;
+    core.error(msg);
+    core.info('Expected format: EVAL_RESULT:{"results":[{"id":"<id>","passed":true|false,"rationale":"..."},...]}');
+    core.setFailed(msg);
+    return;
+  }
+
+  // Normalise: drop entries without a non-empty string id, coerce passed to boolean, cap rationale at 500 chars
+  const normalised = rawResults
+    .filter(r => r && typeof r.id === "string" && r.id)
+    .map(r => ({
+      id: r.id,
+      passed: Boolean(r.passed),
+      rationale: typeof r.rationale === "string" ? r.rationale.slice(0, 500) : undefined,
+    }));
+
+  // Aggregate into the summary object (uses passed, total and pass_rate below)
+  const summary = aggregateResults(normalised);
+
+  // Write the summary as pretty-printed JSON, creating the work dir if needed
+  fs.mkdirSync(workDir, { recursive: true });
+  const resultsPath = path.join(workDir, "eval_results.json");
+  fs.writeFileSync(resultsPath, JSON.stringify(summary, null, 2));
+  core.info(`Eval results written to ${resultsPath}`);
+
+  // Append the rendered markdown to the step summary
+  const markdownSummary = renderMarkdownSummary(summary);
+  await core.summary.addRaw(markdownSummary).write();
+
+  // Step outputs are strings; pass_rate is formatted with four decimals
+  core.setOutput("eval_passed", String(summary.passed));
+  core.setOutput("eval_total", String(summary.total));
+  core.setOutput("eval_pass_rate", summary.pass_rate.toFixed(4));
+
+  core.info(`BinEval complete: ${summary.passed}/${summary.total} passed (${(summary.pass_rate * 100).toFixed(1)}%)`);
+}
+
+module.exports = { main, parseEvalLog, extractResultFromText, extractFromStreamJson };
diff --git a/actions/setup/js/setup_eval.cjs b/actions/setup/js/setup_eval.cjs
new file mode 100644
index 00000000000..a6f203d2247
--- /dev/null
+++ b/actions/setup/js/setup_eval.cjs
@@ -0,0 +1,132 @@
+// @ts-check
+/// <reference types="@actions/github-script" />
+
+/**
+ * setup_eval.cjs
+ *
+ * BinEval Evaluation Setup (experimental)
+ *
+ * Prepares the evaluation prompt file that the agentic engine reads to answer
+ * binary (yes/no) questions about a completed workflow run. This script is the
+ * counterpart to setup_threat_detection.cjs — it creates /tmp/gh-aw/aw-prompts/prompt.txt
+ * with the rendered eval prompt so the engine execution step can pick it up via
+ * GH_AW_PROMPT (set as an Actions environment variable by this script).
+ *
+ * Environment variables (set by the compiled workflow step):
+ *   GH_AW_EVAL_SPEC     - JSON array of {id, question} evaluation definitions
+ *   GH_AW_EVAL_WORK_DIR - Working directory where artifact was downloaded (default: /tmp/gh-aw/eval)
+ *
+ * Input files (downloaded from agent artifact into GH_AW_EVAL_WORK_DIR):
+ *   agent_output.json     - Structured agent output for context
+ *   aw-prompts/prompt.txt - Original workflow prompt for context
+ *
+ * Output:
+ *   /tmp/gh-aw/aw-prompts/prompt.txt - Rendered eval prompt for the engine
+ */
+
+"use strict";
+
+const fs = require("fs");
+const path = require("path");
+const { getPromptPath } = require("./messages_core.cjs");
+const { ERR_VALIDATION } = require("./error_codes.cjs");
+
+const DEFAULT_WORK_DIR = "/tmp/gh-aw/eval";
+
+/**
+ * Reads the eval specification from the GH_AW_EVAL_SPEC environment variable.
+ * @returns {{ id: string, question: string }[]}
+ */
+function readEvalSpec() {
+  const raw = process.env.GH_AW_EVAL_SPEC || "[]";
+  try {
+    const parsed = JSON.parse(raw);
+    if (!Array.isArray(parsed)) {
+      throw new Error("GH_AW_EVAL_SPEC must be a JSON array");
+    }
+    return parsed.filter(e => e && typeof e.id === "string" && e.id && typeof e.question === "string" && e.question);
+  } catch (err) {
+    throw new Error(`Failed to parse GH_AW_EVAL_SPEC: ${err.message}`);
+  }
+}
+
+/**
+ * Formats the eval questions for embedding in the prompt template.
+ * Each question is rendered as a numbered list item with its ID label.
+ * @param {{ id: string, question: string }[]} evals
+ * @returns {string}
+ */
+function formatEvalQuestions(evals) {
+  return evals.map((e, i) => `${i + 1}. **${e.id}**: ${e.question}`).join("\n");
+}
+
+/**
+ * Main entry point for eval setup.
+ * @returns {Promise<void>}
+ */
+async function main() {
+  const workDir = process.env.GH_AW_EVAL_WORK_DIR || DEFAULT_WORK_DIR;
+
+  // Parse eval definitions
+  const evals = readEvalSpec();
+  if (evals.length === 0) {
+    core.warning(`⚠️ ${ERR_VALIDATION}: No eval definitions found in GH_AW_EVAL_SPEC — skipping eval setup`);
+    return;
+  }
+  core.info(`Setting up ${evals.length} BinEval question(s)`);
+
+  // Read the eval prompt template
+  const templatePath = getPromptPath("eval.md");
+  if (!fs.existsSync(templatePath)) {
+    core.setFailed(`${ERR_VALIDATION}: Eval prompt template not found at: ${templatePath}`);
+    return;
+  }
+  const templateContent = fs.readFileSync(templatePath, "utf-8");
+
+  // Locate agent context files (downloaded from the agent artifact)
+  const promptPath = path.join(workDir, "aw-prompts", "prompt.txt");
+  let promptFileInfo;
+  if (!fs.existsSync(promptPath)) {
+    promptFileInfo = `${promptPath} (unavailable)`;
+    core.warning(`⚠️ ${ERR_VALIDATION}: Missing workflow prompt at ${promptPath}. Eval will proceed with reduced context.`);
+  } else {
+    const stats = fs.statSync(promptPath);
+    promptFileInfo = stats.size > 0 ? `${promptPath} (${stats.size} bytes)` : `${promptPath} (unavailable)`;
+    if (stats.size === 0) {
+      core.warning(`⚠️ ${ERR_VALIDATION}: Workflow prompt is empty at ${promptPath}. Eval will proceed with reduced context.`);
+    } else {
+      core.info(`Prompt file found: ${promptPath} (${stats.size} bytes)`);
+    }
+  }
+
+  const agentOutputPath = path.join(workDir, "agent_output.json");
+  let agentOutputFileInfo;
+  if (!fs.existsSync(agentOutputPath)) {
+    agentOutputFileInfo = `${agentOutputPath} (unavailable)`;
+    core.warning(`⚠️ ${ERR_VALIDATION}: Missing agent output at ${agentOutputPath}. Eval will proceed with reduced context.`);
+  } else {
+    const stats = fs.statSync(agentOutputPath);
+    agentOutputFileInfo = `${agentOutputPath} (${stats.size} bytes)`;
+    core.info(`Agent output found: ${agentOutputPath} (${stats.size} bytes)`);
+  }
+
+  // Render the prompt template
+  const evalQuestions = formatEvalQuestions(evals);
+  const promptContent = templateContent
+    .replace(/{WORKFLOW_PROMPT_FILE}/g, promptFileInfo)
+    .replace(/{AGENT_OUTPUT_FILE}/g, agentOutputFileInfo)
+    .replace(/{EVAL_QUESTIONS}/g, evalQuestions);
+
+  // Write prompt file
+  fs.mkdirSync("/tmp/gh-aw/aw-prompts", { recursive: true });
+  fs.writeFileSync("/tmp/gh-aw/aw-prompts/prompt.txt", promptContent);
+  core.exportVariable("GH_AW_PROMPT", "/tmp/gh-aw/aw-prompts/prompt.txt");
+  core.info(`Eval prompt written to /tmp/gh-aw/aw-prompts/prompt.txt`);
+
+  // Write rendered prompt to step summary
+  await core.summary.addRaw("<details>\n<summary>BinEval Prompt</summary>\n\n" + "``````markdown\n" + promptContent + "\n" + "``````\n\n</details>\n").write();
+
+  core.info("BinEval setup completed");
+}
+
+module.exports = { main, readEvalSpec, formatEvalQuestions };
diff --git a/actions/setup/md/eval.md b/actions/setup/md/eval.md
new file mode 100644
index 00000000000..8dddfb753b6
--- /dev/null
+++ b/actions/setup/md/eval.md
@@ -0,0 +1,40 @@
+# BinEval Evaluation (experimental)
+
+You are an objective evaluator. Your task is to evaluate a completed agent workflow run by answering binary (yes/no) questions about the agent's work.
+
+## Workflow Context
+
+The original workflow prompt file is available at: {WORKFLOW_PROMPT_FILE}
+
+Read this file to understand the intent and scope of the task the agent was asked to perform.
+
+## Agent Output
+
+The agent output file is available at: {AGENT_OUTPUT_FILE}
+
+Read this file to understand what the agent produced.
+
+## Evaluation Questions
+
+{EVAL_QUESTIONS}
+
+## Response Format
+
+**IMPORTANT**: You must output exactly one line consisting of the `EVAL_RESULT:` prefix immediately followed by the JSON response. Do not include any other text, explanations, or formatting around the result line.
+
+Output format:
+
+    EVAL_RESULT:{"results":[{"id":"<id>","passed":true,"rationale":"<brief explanation>"},{"id":"<id2>","passed":false,"rationale":"<brief explanation>"}]}
+
+Instructions:
+- For each question above, set `"passed": true` if the answer is yes, `false` if the answer is no.
+- The `"passed"` field **must** be a JSON boolean (`true` or `false`), not a string.
+- Include a brief one-sentence `"rationale"` for each answer (max 100 words).
+- Preserve the `"id"` values exactly as listed above.
+- Include all questions in the `"results"` array, in the same order as listed.
+
+## Guidelines
+
+- Base your evaluation solely on the agent output and workflow context provided.
+- Be objective and evidence-based; avoid speculation when evidence is absent.
+- For yes/no questions, err toward `false` when there is insufficient evidence to confirm the positive.
diff --git a/pkg/constants/constants.go b/pkg/constants/constants.go
index 8e28fbf27ba..bf6cac0b83b 100644
--- a/pkg/constants/constants.go
+++ b/pkg/constants/constants.go
@@ -285,6 +285,10 @@ const DefaultMaxAICredits int64 = 1000
 // AWF API proxy for threat-detection runs.
 const DefaultDetectionMaxAICredits int64 = 400
 
+// DefaultEvalMaxAICredits is the default AI Credits budget enforced by the
+// AWF API proxy for BinEval runs.
+const DefaultEvalMaxAICredits int64 = 200
+
 // DefaultMaxDailyAICredits is the default per-workflow daily AI Credits guardrail.
 const DefaultMaxDailyAICredits = "5000"
 
@@ -474,6 +478,12 @@ const TmpPiAgentDir = TmpGhAwDir + "/pi-agent-dir"
 // ThreatDetectionLogPath is the threat detection engine log file path.
 const ThreatDetectionLogPath = TmpGhAwDir + "/threat-detection/detection.log"
 
+// EvalLogPath is the BinEval engine log file path.
+const EvalLogPath = TmpGhAwDir + "/eval/eval.log"
+
+// EvalDir is the BinEval working directory.
+const EvalDir = TmpGhAwDir + "/eval"
+
 // ThreatDetectionDir is the threat detection working directory.
 const ThreatDetectionDir = TmpGhAwDir + "/threat-detection"
 
diff --git a/pkg/workflow/compiler_evals.go b/pkg/workflow/compiler_evals.go
index bf1a95fc29f..a8342a66481 100644
--- a/pkg/workflow/compiler_evals.go
+++ b/pkg/workflow/compiler_evals.go
@@ -9,20 +9,20 @@ import (
 
 	"github.com/github/gh-aw/pkg/constants"
 	"github.com/github/gh-aw/pkg/logger"
+	"github.com/github/gh-aw/pkg/workflow/compilerenv"
 )
 
 var evalsLog = logger.New("workflow:compiler_evals")
 
 // evalsWorkDir is the runtime directory where eval inputs and outputs are stored.
-const evalsWorkDir = "/tmp/gh-aw/evals"
-
-// evalDefaultModel is the default GitHub Models choice for evals when no model is configured.
-// Evals deliberately use a small, low-latency, cost-efficient model because each
-// question is a lightweight yes/no classification task rather than open-ended generation.
-const evalDefaultModel = "gpt-4o-mini"
+const evalsWorkDir = constants.EvalDir
 
 const evalJobConditionTemplate = "always() && !cancelled() && needs.%s.result != 'skipped'"
 
+// evalStepCondition is the condition used on each engine execution step in the eval job.
+// It mirrors detectionStepCondition: always run so that the parse step can report failures.
+const evalStepCondition = "always()"
+
 // extractEvalsFromFrontmatter reads the "evals" array from the frontmatter map
 // and returns a slice of typed EvalDefinition values.
 // Returns nil when the key is absent or empty.
@@ -95,8 +95,26 @@ func buildEvalSpecJSON(evals []EvalDefinition) string {
 	return string(b)
 }
 
+// getEvalEngineID resolves the engine for the eval job: data.AI first, then data.EngineConfig.ID,
+// then "claude" when neither is set. Pi is not supported in eval, so "pi" is normalised to "copilot".
+func (c *Compiler) getEvalEngineID(data *WorkflowData) string {
+	engineID := data.AI
+	if engineID == "" && data.EngineConfig != nil {
+		engineID = data.EngineConfig.ID
+	}
+	switch engineID {
+	case "":
+		return "claude"
+	case "pi":
+		return "copilot"
+	default:
+		return engineID
+	}
+}
+
 // buildEvalJob creates the eval job that runs after the agent (and detection, if present)
-// to execute all declared BinEval questions and upload the results artifact.
+// to execute all declared BinEval questions via the configured agentic engine inside AWF,
+// and uploads the results artifact.
 // Returns nil when no evals are declared.
 func (c *Compiler) buildEvalJob(data *WorkflowData) (*Job, error) {
 	if len(data.Evals) == 0 {
@@ -108,22 +126,41 @@ func (c *Compiler) buildEvalJob(data *WorkflowData) (*Job, error) {
 
 	var steps []string
 
-	// Setup action (same as detection job — sets up runtime tools)
+	// Setup action (same as detection job — sets up runtime tools).
 	setupActionRef := c.resolveActionReference("./actions/setup", data)
 	if setupActionRef != "" || c.actionMode.IsScript() {
 		steps = append(steps, c.generateCheckoutActionsFolder(data)...)
-		// Eval job shares the agent trace ID for cohesive OTLP traces.
 		evalTraceID := fmt.Sprintf("${{ needs.%s.outputs.setup-trace-id }}", constants.ActivationJobName)
 		evalParentSpanID := setupParentSpanNeedsExpr(constants.ActivationJobName)
 		steps = append(steps, c.generateSetupStep(data, setupActionRef, SetupActionDestination, false, evalTraceID, evalParentSpanID)...)
 	}
 
-	// Download agent output artifact for context (prompt, agent_output.json, patches).
+	// Download agent output artifact for context (prompt, agent_output.json).
 	agentArtifactPrefix := artifactPrefixExprForAgentDownstreamJob(data)
 	steps = append(steps, buildAgentOutputDownloadStepsForEval(agentArtifactPrefix, c.getActionPin)...)
 
-	// Run the eval harness via github-script.
-	steps = append(steps, c.buildEvalHarnessStep(data)...)
+	// Clean stale firewall files from the agent artifact download (same as detection job).
+	steps = append(steps, c.buildCleanFirewallDirsStep()...)
+
+	// Pull AWF container images so the eval engine runs inside AWF.
+	steps = append(steps, c.buildPullAWFContainersStep(data)...)
+
+	// Clear MCP config so the eval engine runs without MCP servers.
+	steps = append(steps, buildClearMCPConfigForEvalStep()...)
+
+	// Setup eval: create the prompt file for the engine.
+	steps = append(steps, c.buildSetupEvalStep(data)...)
+
+	// Engine installation and execution inside AWF.
+	engineSteps, err := c.buildEvalEngineExecutionSteps(data)
+	if err != nil {
+		evalsLog.Printf("Warning: failed to build eval engine steps: %v", err)
+	} else {
+		steps = append(steps, engineSteps...)
+	}
+
+	// Parse eval results from the engine log and write eval_results.json.
+	steps = append(steps, c.buildParseEvalResultsStep(data)...)
 
 	// Upload eval results artifact.
 	steps = append(steps, c.buildEvalArtifactUploadStep(data)...)
@@ -131,8 +168,7 @@ func (c *Compiler) buildEvalJob(data *WorkflowData) (*Job, error) {
 	// The eval job always depends on the agent and activation jobs.
 	needs := []string{string(constants.AgentJobName), string(constants.ActivationJobName)}
 
-	// When threat detection is enabled, eval also depends on detection so that
-	// detection conclusions are visible in the artifact download path.
+	// When threat detection is enabled, eval also depends on detection.
 	if data.SafeOutputs != nil && IsDetectionJobEnabled(data.SafeOutputs) {
 		needs = append(needs, string(constants.DetectionJobName))
 	}
@@ -151,8 +187,7 @@ func (c *Compiler) buildEvalJob(data *WorkflowData) (*Job, error) {
 }
 
 // buildAgentOutputDownloadStepsForEval creates steps to download the agent output
-// artifact into the evals working directory.  It mirrors the detection job download
-// but writes to evalsWorkDir instead of the threat-detection directory.
+// artifact into the evals working directory.
 func buildAgentOutputDownloadStepsForEval(artifactPrefix string, pinAction func(string) string) []string {
 	downloadAction := pinAction("actions/download-artifact")
 	return []string{
@@ -166,25 +201,181 @@ func buildAgentOutputDownloadStepsForEval(artifactPrefix string, pinAction func(
 	}
 }
 
-// buildEvalHarnessStep generates the github-script step that invokes eval_harness.cjs.
-func (c *Compiler) buildEvalHarnessStep(data *WorkflowData) []string {
+// buildClearMCPConfigForEvalStep returns a run step that deletes the gh-aw mcp-servers.json and the
+// Copilot mcp-config.json (rm -f, so absent files are not an error) so the eval engine runs without any MCP servers.
+func buildClearMCPConfigForEvalStep() []string {
+	return []string{
+		"      - name: Clear MCP config for eval\n",
+		"        run: |\n",
+		"          rm -f \"${RUNNER_TEMP}/gh-aw/mcp-config/mcp-servers.json\"\n",
+		"          rm -f \"$HOME/.copilot/mcp-config.json\"\n",
+	}
+}
+
+// buildSetupEvalStep generates the github-script step that calls setup_eval.cjs, which writes the eval prompt
+// to /tmp/gh-aw/aw-prompts/prompt.txt; the spec JSON is embedded as a YAML single-quoted scalar, so ' is doubled.
+func (c *Compiler) buildSetupEvalStep(data *WorkflowData) []string {
 	specJSON := buildEvalSpecJSON(data.Evals)
-	// Escape single quotes for YAML single-quoted scalar embedding (YAML §7.3.3).
 	escapedSpec := strings.ReplaceAll(specJSON, "'", "''")
 
 	return []string{
-		"      - name: Run BinEval evaluations\n",
-		"        id: run-evals\n",
+		"      - name: Setup BinEval prompt\n",
+		"        id: setup-eval\n",
 		fmt.Sprintf("        uses: %s\n", getCachedActionPin("actions/github-script", data)),
 		"        env:\n",
 		fmt.Sprintf("          GH_AW_EVAL_SPEC: '%s'\n", escapedSpec),
 		fmt.Sprintf("          GH_AW_EVAL_WORK_DIR: %s\n", evalsWorkDir),
-		fmt.Sprintf("          GH_AW_EVAL_MODEL: %s\n", evalDefaultModel),
 		"        with:\n",
 		"          script: |\n",
 		"            const { setupGlobals } = require('" + SetupActionDestination + "/setup_globals.cjs');\n",
 		"            setupGlobals(core, github, context, exec, io, getOctokit);\n",
-		"            const { main } = require('" + SetupActionDestination + "/eval_harness.cjs');\n",
+		"            const { main } = require('" + SetupActionDestination + "/setup_eval.cjs');\n",
+		"            await main();\n",
+	}
+}
+
+// buildEvalEngineExecutionSteps generates the engine installation and execution steps for
+// the eval job, which runs in the AWF sandbox with bash as its only configured tool. It
+// returns an error only when the eval engine ID cannot be resolved by getAgenticEngine.
+func (c *Compiler) buildEvalEngineExecutionSteps(data *WorkflowData) ([]string, error) {
+	engineSetting := c.getEvalEngineID(data)
+
+	engine, err := c.getAgenticEngine(engineSetting)
+	if err != nil {
+		return nil, fmt.Errorf("eval engine %q not found: %w", engineSetting, err)
+	}
+
+	// Build eval engine config: copy selected fields from the main engine config (MaxTurns and
+	// Concurrency are deliberately not carried over); MaxAICredits is overridden and Model defaulted below.
+	evalEngineConfig := data.EngineConfig
+	if evalEngineConfig == nil {
+		evalEngineConfig = &EngineConfig{ID: engineSetting}
+	} else {
+		evalEngineConfig = &EngineConfig{
+			ID:            evalEngineConfig.ID,
+			Model:         evalEngineConfig.Model,
+			Version:       evalEngineConfig.Version,
+			Env:           evalEngineConfig.Env,
+			Config:        evalEngineConfig.Config,
+			Args:          evalEngineConfig.Args,
+			APITarget:     evalEngineConfig.APITarget,
+			HarnessScript: evalEngineConfig.HarnessScript,
+			Driver:        evalEngineConfig.Driver,
+		}
+	}
+	if evalEngineConfig.ID == "" {
+		evalEngineConfig.ID = engineSetting
+	}
+
+	// Apply eval AI credits budget (smaller than detection: binary questions only).
+	evalEngineConfig.MaxAICredits = constants.DefaultEvalMaxAICredits
+
+	// Apply detection default model when no model is explicitly configured; eval
+	// questions are lightweight yes/no tasks so the detection model is appropriate.
+	if evalEngineConfig.Model == "" {
+		if envModel := compilerenv.ResolveDefaultDetectionModel(""); envModel != "" {
+			evalEngineConfig.Model = envModel
+		} else if engineModel := engine.GetDefaultDetectionModel(); engineModel != "" {
+			evalEngineConfig.Model = engineModel
+		}
+	}
+
+	// Normalise Pi model to bare model ID for Copilot CLI compatibility.
+	if engineSetting == "copilot" && data.AI == "pi" {
+		evalEngineConfig.Model = extractPiModelID(evalEngineConfig.Model)
+	}
+
+	// Build minimal WorkflowData for the eval engine run. Mirrors the pattern used
+	// by buildDetectionEngineExecutionStep: AWF sandbox, no MCP, no safe outputs,
+	// minimal network (only the inference API).
+	evalData := &WorkflowData{
+		Tools: map[string]any{
+			"bash": []any{"*"},
+		},
+		SafeOutputs:  nil,
+		EngineConfig: evalEngineConfig,
+		AI:           engineSetting,
+		Features:     data.Features,
+		Permissions:  data.Permissions,
+		// IsDetectionRun: reuse detection semantics for domain allow-listing and credits.
+		IsDetectionRun: true,
+		NetworkPermissions: &NetworkPermissions{
+			Allowed: getThreatDetectionAdditionalAllowedDomains(data),
+		},
+		SandboxConfig: &SandboxConfig{
+			Agent: &AgentSandboxConfig{
+				Type: SandboxTypeAWF,
+			},
+		},
+	}
+
+	var steps []string
+
+	// Install the engine (eval job runs on a fresh runner).
+	installSteps := engine.GetInstallationSteps(evalData)
+
+	// Ensure node is on PATH when the engine needs a JS harness.
+	if engineRequiresNodeHarness(engine) && !installStepsContainNodeSetup(installSteps) {
+		for _, line := range GenerateNodeJsSetupStep() {
+			steps = append(steps, line+"\n")
+		}
+	}
+
+	for _, step := range installSteps {
+		for _, line := range step {
+			steps = append(steps, line+"\n")
+		}
+	}
+
+	// Codex needs MCP gateway bootstrap for OpenAI proxy provider configuration.
+	if engine.GetID() == "codex" {
+		var mcpSetup strings.Builder
+		if err := c.generateMCPSetup(&mcpSetup, evalData.Tools, engine, evalData); err == nil {
+			for line := range strings.SplitSeq(mcpSetup.String(), "\n") {
+				if line != "" {
+					steps = append(steps, line+"\n")
+				}
+			}
+		} else {
+			evalsLog.Printf("Failed to generate MCP setup for Codex eval; OpenAI proxy config may be incomplete: %v", err)
+		}
+	}
+
+	executionSteps := engine.GetExecutionSteps(evalData, constants.EvalLogPath)
+	for _, step := range executionSteps {
+		for i, line := range step {
+			// Rename the "agentic_execution" step ID to "eval_agentic_execution" (first match on each line only).
+			prefixed := strings.Replace(line, "id: agentic_execution", "id: eval_agentic_execution", 1)
+			steps = append(steps, prefixed+"\n")
+			// Inject the if condition and continue-on-error after the first line (- name:).
+			// continue-on-error: true ensures infrastructure failures don't fail the eval job;
+			// the parse step uses if: always() and handles missing logs gracefully.
+			if i == 0 {
+				steps = append(steps, fmt.Sprintf("        if: %s\n", evalStepCondition))
+				steps = append(steps, "        continue-on-error: true\n")
+			}
+		}
+	}
+
+	return steps, nil
+}
+
+// buildParseEvalResultsStep generates the github-script step that calls parse_eval_results.cjs
+// to extract EVAL_RESULT:{...} from the engine log, write eval_results.json, and set outputs.
+// The step uses "if: always()" so it still runs when an earlier step in the job has failed.
+func (c *Compiler) buildParseEvalResultsStep(data *WorkflowData) []string {
+	return []string{
+		"      - name: Parse BinEval results\n",
+		"        id: parse-eval-results\n",
+		"        if: always()\n",
+		fmt.Sprintf("        uses: %s\n", getCachedActionPin("actions/github-script", data)),
+		"        env:\n",
+		fmt.Sprintf("          GH_AW_EVAL_WORK_DIR: %s\n", evalsWorkDir),
+		"        with:\n",
+		"          script: |\n",
+		"            const { setupGlobals } = require('" + SetupActionDestination + "/setup_globals.cjs');\n",
+		"            setupGlobals(core, github, context, exec, io, getOctokit);\n",
+		"            const { main } = require('" + SetupActionDestination + "/parse_eval_results.cjs');\n",
 		"            await main();\n",
 	}
 }

From 9ea3ed5fa68a96ca8d07abe75ba8dd5c7c9ffb15 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 28 Jun 2026 18:22:13 +0000
Subject: [PATCH 3/3] test: add setup_eval and parse_eval_results test coverage

Add missing test files for the two BinEval JS modules introduced in the
AWF-engine refactor:

- setup_eval.test.cjs: 14 tests covering prompt template rendering,
  missing/empty context files, eval spec parsing, and step summary output
- parse_eval_results.test.cjs: 25 tests covering EVAL_RESULT extraction
  from plain and stream-json logs, main() error/success paths, and result
  normalisation

Mirrors the test coverage pattern of setup_threat_detection.test.cjs and
parse_threat_detection_results.test.cjs.

Co-authored-by: pelikhan <4175913+pelikhan@users.noreply.github.com>
---
 actions/setup/js/parse_eval_results.test.cjs | 253 +++++++++++++++++++
 actions/setup/js/setup_eval.test.cjs         | 220 ++++++++++++++++
 2 files changed, 473 insertions(+)
 create mode 100644 actions/setup/js/parse_eval_results.test.cjs
 create mode 100644 actions/setup/js/setup_eval.test.cjs

diff --git a/actions/setup/js/parse_eval_results.test.cjs b/actions/setup/js/parse_eval_results.test.cjs
new file mode 100644
index 00000000000..a34c7b82c11
--- /dev/null
+++ b/actions/setup/js/parse_eval_results.test.cjs
@@ -0,0 +1,253 @@
+import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
+import fs from "fs";
+import path from "path";
+
+// ---------------------------------------------------------------------------
+// Unit tests for exported parse helpers — no fs needed
+// ---------------------------------------------------------------------------
+
+// Use require (not import) for CJS modules that export named functions directly;
+// this avoids ESM-interop wrapping and keeps the references stable across tests.
+const { parseEvalLog, extractResultFromText, extractFromStreamJson } = require("./parse_eval_results.cjs");
+
+describe("extractResultFromText", () => {
+  it("extracts a simple JSON object", () => {
+    const text = 'EVAL_RESULT:{"results":[{"id":"builds","passed":true}]}';
+    const result = extractResultFromText(text);
+    expect(result).toBe('EVAL_RESULT:{"results":[{"id":"builds","passed":true}]}');
+  });
+
+  it("stops at the matching closing brace and ignores trailing content", () => {
+    const text = 'EVAL_RESULT:{"results":[{"id":"builds","passed":true}]}\nSome trailing text';
+    const result = extractResultFromText(text);
+    expect(result).toBe('EVAL_RESULT:{"results":[{"id":"builds","passed":true}]}');
+    expect(result).not.toContain("trailing");
+  });
+
+  it("handles nested objects correctly", () => {
+    const text = 'EVAL_RESULT:{"results":[{"id":"q","passed":true,"meta":{"k":1}}]}';
+    const result = extractResultFromText(text);
+    expect(result).toBe('EVAL_RESULT:{"results":[{"id":"q","passed":true,"meta":{"k":1}}]}');
+  });
+
+  it("does not count braces inside JSON string values", () => {
+    const text = 'EVAL_RESULT:{"results":[{"id":"q","rationale":"found {injection} here","passed":false}]}';
+    const result = extractResultFromText(text);
+    expect(result).toBe('EVAL_RESULT:{"results":[{"id":"q","rationale":"found {injection} here","passed":false}]}');
+  });
+
+  it("handles escaped quotes inside strings", () => {
+    const text = 'EVAL_RESULT:{"results":[{"id":"q","rationale":"he said \\"yes\\"","passed":true}]}';
+    const result = extractResultFromText(text);
+    expect(result).toBe('EVAL_RESULT:{"results":[{"id":"q","rationale":"he said \\"yes\\"","passed":true}]}');
+  });
+
+  it("returns null when no opening brace is found", () => {
+    expect(extractResultFromText("EVAL_RESULT:null")).toBeNull();
+    expect(extractResultFromText("EVAL_RESULT:[]")).toBeNull();
+    expect(extractResultFromText("EVAL_RESULT:")).toBeNull();
+  });
+
+  it("returns null when closing brace is missing (truncated JSON)", () => {
+    expect(extractResultFromText('EVAL_RESULT:{"results":[')).toBeNull();
+    expect(extractResultFromText('EVAL_RESULT:{"results":[{"id":"builds","passed":true')).toBeNull();
+  });
+});
+
+describe("extractFromStreamJson", () => {
+  it("extracts result from a stream-json envelope", () => {
+    const inner = 'EVAL_RESULT:{"results":[{"id":"builds","passed":true,"rationale":"yes"}]}';
+    const line = JSON.stringify({ type: "result", subtype: "success", result: inner });
+    const result = extractFromStreamJson(line);
+    expect(result).toContain("EVAL_RESULT:");
+  });
+
+  it("returns the original line when it does not contain EVAL_RESULT", () => {
+    const line = '{"type":"text","text":"some output"}';
+    expect(extractFromStreamJson(line)).toBe(line);
+  });
+
+  it("returns the original line when it is not valid JSON", () => {
+    const line = "EVAL_RESULT:plaintext";
+    expect(extractFromStreamJson(line)).toBe(line);
+  });
+
+  it("returns the original line when the outer JSON has no result field with EVAL_RESULT", () => {
+    const line = JSON.stringify({ type: "text", text: "EVAL_RESULT: partial" });
+    // outer.result is undefined → falls through to returning the original line
+    const result = extractFromStreamJson(line);
+    expect(result).toBe(line);
+  });
+});
+
+describe("parseEvalLog", () => {
+  it("extracts results from a plain log line", () => {
+    const log = 'EVAL_RESULT:{"results":[{"id":"builds","passed":true,"rationale":"ok"},{"id":"tests","passed":false,"rationale":"no"}]}';
+    const { results, error } = parseEvalLog(log);
+    expect(error).toBeNull();
+    expect(results).toHaveLength(2);
+    expect(results[0]).toMatchObject({ id: "builds", passed: true });
+    expect(results[1]).toMatchObject({ id: "tests", passed: false });
+  });
+
+  it("returns error when no EVAL_RESULT is found", () => {
+    const log = "Engine output with no result marker\nSome other output";
+    const { results, error } = parseEvalLog(log);
+    expect(results).toBeNull();
+    expect(error).toContain("No EVAL_RESULT found");
+  });
+
+  it("returns error when EVAL_RESULT JSON has no results array", () => {
+    const log = 'EVAL_RESULT:{"passed":true}';
+    const { results, error } = parseEvalLog(log);
+    expect(results).toBeNull();
+    expect(error).toContain("'results' array");
+  });
+
+  it("returns error when EVAL_RESULT JSON is malformed", () => {
+    const log = "EVAL_RESULT:{malformed json}";
+    const { results, error } = parseEvalLog(log);
+    expect(results).toBeNull();
+    expect(error).toContain("Failed to parse EVAL_RESULT JSON");
+  });
+
+  it("parses the first EVAL_RESULT in a multi-line log", () => {
+    const log = [
+      "Engine starting up...",
+      "Reading prompt file...",
+      'EVAL_RESULT:{"results":[{"id":"q1","passed":true,"rationale":"good"}]}',
+      "Engine finishing.",
+    ].join("\n");
+    const { results, error } = parseEvalLog(log);
+    expect(error).toBeNull();
+    expect(results).toHaveLength(1);
+    expect(results[0].id).toBe("q1");
+  });
+
+  it("handles EVAL_RESULT embedded in a stream-json wrapper", () => {
+    const inner = 'EVAL_RESULT:{"results":[{"id":"focused","passed":false,"rationale":"not focused"}]}';
+    const streamLine = JSON.stringify({ type: "result", subtype: "success", result: inner });
+    const log = `Engine output\n${streamLine}\n`;
+    const { results, error } = parseEvalLog(log);
+    expect(error).toBeNull();
+    expect(results).toHaveLength(1);
+    expect(results[0]).toMatchObject({ id: "focused", passed: false });
+  });
+
+  it("returns empty results array when results JSON field is empty", () => {
+    const log = 'EVAL_RESULT:{"results":[]}';
+    const { results, error } = parseEvalLog(log);
+    expect(error).toBeNull();
+    expect(results).toEqual([]);
+  });
+});
+
+// ---------------------------------------------------------------------------
+// Integration tests for main() — uses real fs with a temp directory
+// ---------------------------------------------------------------------------
+
+// Use a directory that does not overlap with setup_eval.test.cjs (/tmp/gh-aw),
+// preventing test-parallelism conflicts when both suites run simultaneously.
+const PARSE_TEST_ROOT = "/tmp/gh-aw-parse-eval-test";
+const EVAL_DIR = path.join(PARSE_TEST_ROOT, "eval");
+
+// Use require (not dynamic import) so the module is cached once, avoiding
+// vi.resetModules() interference with the real fs reference inside the module.
+const { main: evalMain } = require("./parse_eval_results.cjs");
+
+// Builds a fresh set of vi.fn() stand-ins for the `core` global that main() reports through.
+function makeCoreMocks() {
+  return {
+    info: vi.fn(),
+    warning: vi.fn(),
+    error: vi.fn(),
+    setFailed: vi.fn(),
+    setOutput: vi.fn(),
+    exportVariable: vi.fn(),
+    summary: {
+      addRaw: vi.fn().mockReturnThis(),
+      write: vi.fn().mockResolvedValue(undefined),
+    },
+  };
+}
+
+describe("main (parse_eval_results)", () => {
+  beforeEach(() => {
+    fs.rmSync(PARSE_TEST_ROOT, { recursive: true, force: true });
+    fs.mkdirSync(EVAL_DIR, { recursive: true });
+    process.env.GH_AW_EVAL_WORK_DIR = EVAL_DIR;
+    global.core = makeCoreMocks();
+  });
+
+  afterEach(() => {
+    fs.rmSync(PARSE_TEST_ROOT, { recursive: true, force: true });
+    delete process.env.GH_AW_EVAL_WORK_DIR;
+  });
+
+  it("fails when eval log file does not exist", async () => {
+    await evalMain();
+    expect(global.core.setFailed).toHaveBeenCalledWith(expect.stringContaining("Eval log not found"));
+  });
+
+  it("fails when eval log cannot be parsed (no EVAL_RESULT marker)", async () => {
+    fs.writeFileSync(path.join(EVAL_DIR, "eval.log"), "Engine ran but produced no result marker\n");
+    await evalMain();
+    expect(global.core.setFailed).toHaveBeenCalledWith(expect.stringContaining("No EVAL_RESULT found"));
+  });
+
+  it("writes eval_results.json and sets outputs on success", async () => {
+    fs.writeFileSync(path.join(EVAL_DIR, "eval.log"), 'EVAL_RESULT:{"results":[{"id":"builds","passed":true,"rationale":"compiles"},{"id":"tests","passed":false,"rationale":"failing"}]}');
+    await evalMain();
+
+    expect(global.core.setFailed).not.toHaveBeenCalled();
+
+    const resultsPath = path.join(EVAL_DIR, "eval_results.json");
+    expect(fs.existsSync(resultsPath)).toBe(true);
+    const summary = JSON.parse(fs.readFileSync(resultsPath, "utf-8"));
+    expect(summary.total).toBe(2);
+    expect(summary.passed).toBe(1);
+    expect(summary.failed).toBe(1);
+    expect(summary.pass_rate).toBeCloseTo(0.5);
+
+    expect(global.core.setOutput).toHaveBeenCalledWith("eval_passed", "1");
+    expect(global.core.setOutput).toHaveBeenCalledWith("eval_total", "2");
+    expect(global.core.setOutput).toHaveBeenCalledWith("eval_pass_rate", "0.5000");
+  });
+
+  it("writes step summary with BinEval Results heading", async () => {
+    fs.writeFileSync(path.join(EVAL_DIR, "eval.log"), 'EVAL_RESULT:{"results":[{"id":"q1","passed":true,"rationale":"ok"}]}');
+    await evalMain();
+
+    expect(global.core.summary.addRaw).toHaveBeenCalledWith(expect.stringContaining("BinEval Results"));
+    expect(global.core.summary.write).toHaveBeenCalled();
+  });
+
+  it("handles all-passed results correctly", async () => {
+    fs.writeFileSync(path.join(EVAL_DIR, "eval.log"), 'EVAL_RESULT:{"results":[{"id":"q1","passed":true},{"id":"q2","passed":true}]}');
+    await evalMain();
+
+    expect(global.core.setFailed).not.toHaveBeenCalled();
+    expect(global.core.setOutput).toHaveBeenCalledWith("eval_passed", "2");
+    expect(global.core.setOutput).toHaveBeenCalledWith("eval_total", "2");
+    expect(global.core.setOutput).toHaveBeenCalledWith("eval_pass_rate", "1.0000");
+  });
+
+  it("normalizes results: drops entries without id, coerces passed to boolean", async () => {
+    // entry 2 has no id (dropped), entry 1 has passed:1 (coerced to true)
+    fs.writeFileSync(path.join(EVAL_DIR, "eval.log"), 'EVAL_RESULT:{"results":[{"id":"q1","passed":1},{"passed":true},{"id":"q3","passed":false}]}');
+    await evalMain();
+
+    expect(global.core.setFailed).not.toHaveBeenCalled();
+    expect(global.core.setOutput).toHaveBeenCalledWith("eval_total", "2");
+    expect(global.core.setOutput).toHaveBeenCalledWith("eval_passed", "1");
+  });
+
+  it("truncates long rationale to 500 chars in the results JSON", async () => {
+    const longRationale = "x".repeat(600);
+    fs.writeFileSync(path.join(EVAL_DIR, "eval.log"), `EVAL_RESULT:{"results":[{"id":"q1","passed":true,"rationale":"${longRationale}"}]}`);
+    await evalMain();
+
+    const summary = JSON.parse(fs.readFileSync(path.join(EVAL_DIR, "eval_results.json"), "utf-8"));
+    expect(summary.results[0].rationale.length).toBeLessThanOrEqual(500);
+  });
+});
diff --git a/actions/setup/js/setup_eval.test.cjs b/actions/setup/js/setup_eval.test.cjs
new file mode 100644
index 00000000000..4f08eb499d7
--- /dev/null
+++ b/actions/setup/js/setup_eval.test.cjs
@@ -0,0 +1,220 @@
+import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
+import fs from "fs";
+import path from "path";
+
+const TMP_ROOT = "/tmp/gh-aw";
+const EVAL_DIR = path.join(TMP_ROOT, "eval");
+const TEMPLATE_DIR = "/tmp/gh-aw-test-prompts";
+
+describe("setup_eval", () => {
+  beforeEach(() => {
+    vi.resetModules();
+    fs.rmSync(TMP_ROOT, { recursive: true, force: true });
+    fs.rmSync(TEMPLATE_DIR, { recursive: true, force: true });
+    fs.mkdirSync(TEMPLATE_DIR, { recursive: true });
+
+    fs.writeFileSync(
+      path.join(TEMPLATE_DIR, "eval.md"),
+      `prompt={WORKFLOW_PROMPT_FILE}\noutput={AGENT_OUTPUT_FILE}\nquestions={EVAL_QUESTIONS}\n`
+    );
+
+    fs.mkdirSync(EVAL_DIR, { recursive: true });
+    fs.writeFileSync(path.join(EVAL_DIR, "agent_output.json"), '{"ok":true}');
+
+    process.env.GH_AW_PROMPTS_DIR = TEMPLATE_DIR;
+    process.env.GH_AW_EVAL_SPEC = JSON.stringify([{ id: "builds", question: "Does the code compile?" }]);
+    process.env.GH_AW_EVAL_WORK_DIR = EVAL_DIR;
+  });
+
+  afterEach(() => {
+    fs.rmSync(TMP_ROOT, { recursive: true, force: true });
+    fs.rmSync(TEMPLATE_DIR, { recursive: true, force: true });
+    delete process.env.GH_AW_PROMPTS_DIR;
+    delete process.env.GH_AW_EVAL_SPEC;
+    delete process.env.GH_AW_EVAL_WORK_DIR;
+  });
+
+  function setupCoreMocks() {
+    const summary = {
+      addRaw: vi.fn().mockReturnThis(),
+      write: vi.fn().mockResolvedValue(undefined),
+    };
+    global.core = {
+      info: vi.fn(),
+      warning: vi.fn(),
+      error: vi.fn(),
+      setFailed: vi.fn(),
+      exportVariable: vi.fn(),
+      summary,
+    };
+  }
+
+  it("warns and returns early when no eval definitions are found", async () => {
+    setupCoreMocks();
+    process.env.GH_AW_EVAL_SPEC = "[]";
+
+    const module = await import("./setup_eval.cjs");
+    await module.main();
+
+    expect(global.core.setFailed).not.toHaveBeenCalled();
+    expect(global.core.warning).toHaveBeenCalledWith(expect.stringContaining("No eval definitions found"));
+    expect(global.core.exportVariable).not.toHaveBeenCalled();
+  });
+
+  it("fails when eval prompt template is missing", async () => {
+    setupCoreMocks();
+    fs.rmSync(path.join(TEMPLATE_DIR, "eval.md"));
+
+    const module = await import("./setup_eval.cjs");
+    await module.main();
+
+    expect(global.core.setFailed).toHaveBeenCalledWith(expect.stringContaining("Eval prompt template not found"));
+  });
+
+  it("writes prompt with all placeholders replaced when all files are present", async () => {
+    setupCoreMocks();
+    const promptDir = path.join(EVAL_DIR, "aw-prompts");
+    fs.mkdirSync(promptDir, { recursive: true });
+    fs.writeFileSync(path.join(promptDir, "prompt.txt"), "original workflow prompt");
+
+    const module = await import("./setup_eval.cjs");
+    await module.main();
+
+    expect(global.core.setFailed).not.toHaveBeenCalled();
+    expect(global.core.exportVariable).toHaveBeenCalledWith("GH_AW_PROMPT", "/tmp/gh-aw/aw-prompts/prompt.txt");
+
+    const generatedPromptPath = "/tmp/gh-aw/aw-prompts/prompt.txt";
+    expect(fs.existsSync(generatedPromptPath)).toBe(true);
+    const content = fs.readFileSync(generatedPromptPath, "utf-8");
+    expect(content).toContain("prompt=");
+    expect(content).toContain("output=");
+    expect(content).toContain("questions=");
+    expect(content).toContain("builds");
+    expect(content).toContain("Does the code compile?");
+  });
+
+  it("continues with reduced context when workflow prompt is missing", async () => {
+    setupCoreMocks();
+
+    const module = await import("./setup_eval.cjs");
+    await module.main();
+
+    expect(global.core.setFailed).not.toHaveBeenCalled();
+    expect(global.core.warning).toHaveBeenCalledWith(expect.stringContaining("Missing workflow prompt"));
+    expect(global.core.exportVariable).toHaveBeenCalledWith("GH_AW_PROMPT", "/tmp/gh-aw/aw-prompts/prompt.txt");
+
+    const content = fs.readFileSync("/tmp/gh-aw/aw-prompts/prompt.txt", "utf-8");
+    expect(content).toContain("unavailable");
+  });
+
+  it("continues with reduced context when workflow prompt is empty", async () => {
+    setupCoreMocks();
+    const promptDir = path.join(EVAL_DIR, "aw-prompts");
+    fs.mkdirSync(promptDir, { recursive: true });
+    fs.writeFileSync(path.join(promptDir, "prompt.txt"), "");
+
+    const module = await import("./setup_eval.cjs");
+    await module.main();
+
+    expect(global.core.setFailed).not.toHaveBeenCalled();
+    expect(global.core.warning).toHaveBeenCalledWith(expect.stringContaining("is empty"));
+    expect(global.core.exportVariable).toHaveBeenCalledWith("GH_AW_PROMPT", "/tmp/gh-aw/aw-prompts/prompt.txt");
+  });
+
+  it("continues with reduced context when agent output is missing", async () => {
+    setupCoreMocks();
+    fs.rmSync(path.join(EVAL_DIR, "agent_output.json"), { force: true });
+
+    const module = await import("./setup_eval.cjs");
+    await module.main();
+
+    expect(global.core.setFailed).not.toHaveBeenCalled();
+    expect(global.core.warning).toHaveBeenCalledWith(expect.stringContaining("Missing agent output"));
+    expect(global.core.exportVariable).toHaveBeenCalledWith("GH_AW_PROMPT", "/tmp/gh-aw/aw-prompts/prompt.txt");
+  });
+
+  it("embeds multiple eval questions in order", async () => {
+    setupCoreMocks();
+    process.env.GH_AW_EVAL_SPEC = JSON.stringify([
+      { id: "builds", question: "Does the code compile?" },
+      { id: "tests", question: "Are all tests passing?" },
+    ]);
+
+    const module = await import("./setup_eval.cjs");
+    await module.main();
+
+    const content = fs.readFileSync("/tmp/gh-aw/aw-prompts/prompt.txt", "utf-8");
+    // Locate the full rendered entries; bare "builds"/"tests" substrings could match unrelated prompt text.
+    const buildsPos = content.indexOf("**builds**: Does the code compile?");
+    const testsPos = content.indexOf("**tests**: Are all tests passing?");
+    expect(buildsPos).toBeGreaterThanOrEqual(0);
+    expect(testsPos).toBeGreaterThan(buildsPos);
+  });
+
+  it("writes step summary with details block", async () => {
+    setupCoreMocks();
+
+    const module = await import("./setup_eval.cjs");
+    await module.main();
+
+    expect(global.core.summary.addRaw).toHaveBeenCalledWith(expect.stringContaining("BinEval Prompt"));
+    expect(global.core.summary.write).toHaveBeenCalled();
+  });
+});
+
+describe("readEvalSpec (setup_eval)", () => {
+  beforeEach(() => {
+    vi.resetModules();
+    delete process.env.GH_AW_EVAL_SPEC;
+  });
+
+  afterEach(() => {
+    delete process.env.GH_AW_EVAL_SPEC;
+  });
+
+  it("returns empty array when env var is absent", async () => {
+    const module = await import("./setup_eval.cjs");
+    const result = module.readEvalSpec();
+    expect(result).toEqual([]);
+  });
+
+  it("parses a valid spec", async () => {
+    process.env.GH_AW_EVAL_SPEC = JSON.stringify([{ id: "q1", question: "Question one?" }]);
+    const module = await import("./setup_eval.cjs");
+    const result = module.readEvalSpec();
+    expect(result).toHaveLength(1);
+    expect(result[0]).toEqual({ id: "q1", question: "Question one?" });
+  });
+
+  it("throws on invalid JSON", async () => {
+    process.env.GH_AW_EVAL_SPEC = "not-json";
+    const module = await import("./setup_eval.cjs");
+    expect(() => module.readEvalSpec()).toThrow(/Failed to parse GH_AW_EVAL_SPEC/);
+  });
+});
+
+describe("formatEvalQuestions", () => {
+  beforeEach(() => {
+    vi.resetModules();
+  });
+
+  it("formats a single question", async () => {
+    const module = await import("./setup_eval.cjs");
+    const result = module.formatEvalQuestions([{ id: "builds", question: "Does it compile?" }]);
+    expect(result).toBe("1. **builds**: Does it compile?");
+  });
+
+  it("formats multiple questions as a numbered list", async () => {
+    const module = await import("./setup_eval.cjs");
+    const result = module.formatEvalQuestions([
+      { id: "builds", question: "Does it compile?" },
+      { id: "tests", question: "Do tests pass?" },
+    ]);
+    expect(result).toBe("1. **builds**: Does it compile?\n2. **tests**: Do tests pass?");
+  });
+
+  it("returns empty string for empty array", async () => {
+    const module = await import("./setup_eval.cjs");
+    expect(module.formatEvalQuestions([])).toBe("");
+  });
+});