diff --git a/AGENTS.md b/AGENTS.md index 51f73868f..99f13a779 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -37,7 +37,7 @@ Fix root cause. Unsure: read more code; if stuck, ask w/ short options. Unrecogn | tool-contract | Public LLM Tool trait contract | | git-support | Sandboxed git operations on VFS | | python-builtin | Embedded Python via Monty, security, resource limits | -| eval | LLM evaluation harness, dataset format, scoring | +| eval | LLM eval study on the mira framework, dataset format, scoring | | maintenance | Pre-release maintenance requirements | | python-package | Python package, PyPI wheels, platform matrix | | scripted-tool-orchestration | Compose ToolDef+callback pairs into OrchestratorTool via bash scripts | diff --git a/Cargo.lock b/Cargo.lock index 38b7428f3..cddb62457 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -404,7 +404,7 @@ dependencies = [ "async-trait", "bashkit", "chrono", - "clap", + "mira-eval", "regex", "reqwest", "rustls", @@ -2130,6 +2130,15 @@ dependencies = [ "memoffset", ] +[[package]] +name = "inventory" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4f0c30c76f2f4ccee3fe55a2435f691ca00c0e4bd87abe4f4a851b1d4dac39b" +dependencies = [ + "rustversion", +] + [[package]] name = "ipnet" version = "2.12.0" @@ -2516,6 +2525,33 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "mira-eval" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fee2dcd277811402b7ddf08c1c59d49965e864998341e44cda300c91c10b0f5" +dependencies = [ + "async-trait", + "inventory", + "mira-macros", + "regex", + "serde", + "serde_json", + "tempfile", + "tokio", +] + +[[package]] +name = "mira-macros" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a95d6edf1d47285aada4966c9e87d7a372020ce2e7d1786f9459ad4d01fc05d8" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "ml-kem" version = 
"0.3.2" diff --git a/README.md b/README.md index ce6e90253..466fc98b4 100644 --- a/README.md +++ b/README.md @@ -467,21 +467,27 @@ just pre-pr # Pre-PR checks ## LLM Eval Results -Bashkit includes an [eval harness](crates/bashkit-eval/) that measures how well LLMs use bashkit as a bash tool in agentic workloads — 58 tasks across 15 categories. +Bashkit includes a [mira eval study](crates/bashkit-eval/) that measures how well LLMs use bashkit as a bash tool in agentic workloads — 58 tasks across 15 categories. + +_Latest run: 2026-06-27, on the mira eval framework (58 tasks)._ | Model | Score | Tasks Passed | Tool Call Success | Duration | |-------|-------|-------------|-------------------|----------| -| Claude Opus 4.7 | **98%** | **56/58** | 90% | 22.6 min | -| Claude Haiku 4.5 | **98%** | 54/58 | 92% | **8.0 min** | -| Claude Sonnet 4.6 | 94% | 49/58 | 91% | 19.7 min | -| GPT-5.5 | 93% | 50/58 | 92% | 11.2 min | -| GPT-5.3-Codex | 93% | 54/58 | 87% | 13.7 min | +| Claude Opus 4.8 | **95%** | **55/58** | **96%** | 12.8 min | +| Claude Haiku 4.5 | **95%** | **55/58** | 94% | **7.4 min** | +| GPT-5.3-Codex | 93% | 54/58 | 85% | 12.1 min | +| GPT-5.5 | 88% | 51/58 | 90% | 8.2 min | +| Claude Sonnet 4.6 | 84% | 49/58 | 93% | 19.9 min | -**Delta from 2026-02-28** (same 58 tasks): Opus 4.6→4.7 jumped +6 tasks (50→56), GPT-5.2→5.5 jumped +9 tasks (41→50). Haiku 4.5 ties with Opus 4.7 at 98% in ⅓ the wall-clock time. See the [detailed analysis](crates/bashkit-eval/README.md#results). +Opus 4.8 and Haiku 4.5 lead at 55/58 — Haiku matches Opus in ~⅗ the wall-clock +time. Two tasks trip every model (`file_path_organizer`, `script_getopts_parser`). +See the [detailed analysis](crates/bashkit-eval/README.md#results). 
```bash -just eval # Run eval with default model -just eval-save # Run and save results +cargo install mira-cli # one-time: the `mira` host CLI +just eval-list # list evals, samples, scorers, targets +just eval # run the bash eval across the model matrix +just eval-scripting # run the scripting-tool eval ``` ## Benchmarks diff --git a/crates/bashkit-eval/Cargo.toml b/crates/bashkit-eval/Cargo.toml index dd7f79350..d554fc7d9 100644 --- a/crates/bashkit-eval/Cargo.toml +++ b/crates/bashkit-eval/Cargo.toml @@ -1,6 +1,10 @@ -# bashkit-eval: LLM evaluation harness for bashkit tool usage -# Measures how well models use a sandboxed bash tool in agentic workloads -# See specs/eval.md for design decisions +# bashkit-eval: mira eval study for bashkit tool usage +# Measures how well models use a sandboxed bash tool in agentic workloads. +# Reimplemented on the mira eval framework (github.com/everruns/mira): +# bashkit drives the agent loop as a mira `Subject`; tasks are mira `Sample`s; +# the bashkit expectation checks run as a mira `Scorer`. The `mira` CLI hosts +# this binary over stdio and owns the model matrix, scheduling, and reporting. +# See specs/eval.md for design decisions. [package] name = "bashkit-eval" @@ -8,18 +12,22 @@ version.workspace = true edition.workspace = true license.workspace = true authors.workspace = true -description = "LLM evaluation harness for bashkit tool usage" +description = "Mira eval study for bashkit tool usage" [[bin]] name = "bashkit-eval" path = "src/main.rs" [dependencies] +# Mira eval framework (crates.io). Imported as `mira`; the `macros` feature +# provides the #[eval] discovery attribute. We use the in-process Subject path +# (our own agent loop + provider stack) so we do NOT need mira-everruns. 
+mira-eval = { version = "0.2", features = ["macros"] } + bashkit = { path = "../bashkit", features = ["scripted_tool", "jq"] } -tokio = { workspace = true, features = ["rt-multi-thread"] } +tokio = { workspace = true, features = ["rt-multi-thread", "io-std", "io-util", "fs"] } serde.workspace = true serde_json.workspace = true -clap.workspace = true anyhow.workspace = true reqwest.workspace = true # Direct rustls dep so we can install the `ring` crypto provider before diff --git a/crates/bashkit-eval/README.md b/crates/bashkit-eval/README.md index 97ec75670..a80860f27 100644 --- a/crates/bashkit-eval/README.md +++ b/crates/bashkit-eval/README.md @@ -1,47 +1,56 @@ # Bashkit Eval -LLM evaluation harness for bashkit tool usage. Measures how well models use bashkit's bash tool in agentic workloads. +A [mira](https://github.com/everruns/mira) eval study for bashkit tool usage. +Measures how well models use bashkit's bash tool in agentic workloads. + +bashkit-eval is a **study binary** the `mira` host CLI spawns over stdio; mira +owns the model matrix, scheduling, retries, resume, and reporting. bashkit +supplies the subject (its agent loop over a persistent VFS) and the scorer (the +deterministic expectation checks). See [`specs/eval.md`](../../specs/eval.md). ## Usage +Install the host CLI once, then drive the study through it: + ```bash -# Run eval (terminal output only) -ANTHROPIC_API_KEY=... cargo run -p bashkit-eval -- run \ - --dataset crates/bashkit-eval/data/eval-tasks.jsonl \ - --provider anthropic --model claude-sonnet-4-20250514 - -# Run and save results (Chat Completions API) -OPENAI_API_KEY=... cargo run -p bashkit-eval -- run \ - --dataset crates/bashkit-eval/data/eval-tasks.jsonl \ - --provider openai --model gpt-5.2 --save - -# Run against OpenAI Responses API (required for codex models) -OPENAI_API_KEY=... 
cargo run -p bashkit-eval -- run \ - --dataset crates/bashkit-eval/data/eval-tasks.jsonl \ - --provider openresponses --model gpt-5.3-codex --save - -# Custom moniker -cargo run -p bashkit-eval -- run \ - --dataset crates/bashkit-eval/data/eval-tasks.jsonl \ - --provider anthropic --model claude-sonnet-4-20250514 \ - --save --moniker my-test-run +cargo install mira-cli # provides the `mira` binary + +# List advertised evals, samples, scorers, and targets +mira --bin bashkit-eval list + +# Run the bash agent eval (set keys for the models you want; unkeyed targets skip) +ANTHROPIC_API_KEY=... OPENAI_API_KEY=... \ + mira --bin bashkit-eval run bashkit_bash + +# A specific model + one category (--targets takes exact labels, comma-separated) +ANTHROPIC_API_KEY=... \ + mira --bin bashkit-eval run bashkit_bash --targets anthropic/claude-opus-4-8 --tag json_processing + +# Scripting-tool eval, scripted mode only, self-contained HTML report +OPENAI_API_KEY=... \ + mira --bin bashkit-eval run bashkit_scripting --axis mode=scripted --format html --out report.html # Via just -just eval -just eval-save +just eval-list +just eval --targets anthropic/claude-opus-4-8 +just eval-smoke +just eval-scripting ``` -## Options +Results are written by mira under `./results/<run-id>/`. 
+ +## Evals & selection -| Option | Description | -|--------|-------------| -| `--dataset <path>` | Path to JSONL dataset file | -| `--provider <provider>` | `anthropic`, `openai`, or `openresponses` | -| `--model <model>` | Model name (e.g., `claude-sonnet-4-20250514`, `gpt-5.2`, `gpt-5.3-codex`) | -| `--max-turns <n>` | Max agent turns per task (default: 10) | -| `--save` | Save JSON + Markdown results to disk | -| `--output <dir>` | Output directory (default: `crates/bashkit-eval/results`) | -| `--moniker <id>` | Custom run identifier (default: `{provider}-{model}`) | +| Eval | Samples | Selection | +|------|---------|-----------| +| `bashkit_bash` | 58 tasks, 15 categories | `--tag <category>`, `--samples <ids>` | +| `bashkit_smoke` | 3 tasks | quick verification | +| `bashkit_scripting` | scripting-tool tasks | `--axis mode=scripted\|baseline` | + +Targets (model matrix) are defined in `src/mira_study.rs` and gated on +`ANTHROPIC_API_KEY` / `OPENAI_API_KEY`; offline runs skip them all. Select a +subset with `--targets