everruns · chaliy · Jun 27, 2026 · Jun 26, 2026 · Jun 26, 2026 · Jun 26, 2026
diff --git a/AGENTS.md b/AGENTS.md
@@ -37,7 +37,7 @@ Fix root cause. Unsure: read more code; if stuck, ask w/ short options. Unrecogn
 | tool-contract | Public LLM Tool trait contract |
 | git-support | Sandboxed git operations on VFS |
 | python-builtin | Embedded Python via Monty, security, resource limits |
-| eval | LLM evaluation harness, dataset format, scoring |
+| eval | LLM eval study on the mira framework, dataset format, scoring |
 | maintenance | Pre-release maintenance requirements |
 | python-package | Python package, PyPI wheels, platform matrix |
 | scripted-tool-orchestration | Compose ToolDef+callback pairs into OrchestratorTool via bash scripts |

diff --git a/Cargo.lock b/Cargo.lock
diff --git a/README.md b/README.md
@@ -467,21 +467,27 @@ just pre-pr       # Pre-PR checks
 
 ## LLM Eval Results
 
-Bashkit includes an [eval harness](crates/bashkit-eval/) that measures how well LLMs use bashkit as a bash tool in agentic workloads — 58 tasks across 15 categories.
+Bashkit includes a [mira eval study](crates/bashkit-eval/) that measures how well LLMs use bashkit as a bash tool in agentic workloads — 58 tasks across 15 categories.
+
+_Latest run: 2026-06-27, on the mira eval framework (58 tasks)._
 
 | Model | Score | Tasks Passed | Tool Call Success | Duration |
 |-------|-------|-------------|-------------------|----------|
-| Claude Opus 4.7 | **98%** | **56/58** | 90% | 22.6 min |
-| Claude Haiku 4.5 | **98%** | 54/58 | 92% | **8.0 min** |
-| Claude Sonnet 4.6 | 94% | 49/58 | 91% | 19.7 min |
-| GPT-5.5 | 93% | 50/58 | 92% | 11.2 min |
-| GPT-5.3-Codex | 93% | 54/58 | 87% | 13.7 min |
+| Claude Opus 4.8 | **95%** | **55/58** | **96%** | 12.8 min |
+| Claude Haiku 4.5 | **95%** | **55/58** | 94% | **7.4 min** |
+| GPT-5.3-Codex | 93% | 54/58 | 85% | 12.1 min |
+| GPT-5.5 | 88% | 51/58 | 90% | 8.2 min |
+| Claude Sonnet 4.6 | 84% | 49/58 | 93% | 19.9 min |
 
-**Delta from 2026-02-28** (same 58 tasks): Opus 4.6→4.7 jumped +6 tasks (50→56), GPT-5.2→5.5 jumped +9 tasks (41→50). Haiku 4.5 ties with Opus 4.7 at 98% in ⅓ the wall-clock time. See the [detailed analysis](crates/bashkit-eval/README.md#results).
+Opus 4.8 and Haiku 4.5 lead at 55/58 — Haiku matches Opus in ~⅗ the wall-clock
+time. Two tasks trip every model (`file_path_organizer`, `script_getopts_parser`).
+See the [detailed analysis](crates/bashkit-eval/README.md#results).
 
 ```bash
-just eval                    # Run eval with default model
-just eval-save               # Run and save results
+cargo install mira-cli       # one-time: the `mira` host CLI
+just eval-list               # list evals, samples, scorers, targets
+just eval                    # run the bash eval across the model matrix
+just eval-scripting          # run the scripting-tool eval
 ```
 
 ## Benchmarks

diff --git a/crates/bashkit-eval/Cargo.toml b/crates/bashkit-eval/Cargo.toml
@@ -1,25 +1,33 @@
-# bashkit-eval: LLM evaluation harness for bashkit tool usage
-# Measures how well models use a sandboxed bash tool in agentic workloads
-# See specs/eval.md for design decisions
+# bashkit-eval: mira eval study for bashkit tool usage
+# Measures how well models use a sandboxed bash tool in agentic workloads.
+# Reimplemented on the mira eval framework (github.com/everruns/mira):
+# bashkit drives the agent loop as a mira `Subject`; tasks are mira `Sample`s;
+# the bashkit expectation checks run as a mira `Scorer`. The `mira` CLI hosts
+# this binary over stdio and owns the model matrix, scheduling, and reporting.
+# See specs/eval.md for design decisions.
 
 [package]
 name = "bashkit-eval"
 version.workspace = true
 edition.workspace = true
 license.workspace = true
 authors.workspace = true
-description = "LLM evaluation harness for bashkit tool usage"
+description = "Mira eval study for bashkit tool usage"
 
 [[bin]]
 name = "bashkit-eval"
 path = "src/main.rs"
 
 [dependencies]
+# Mira eval framework (crates.io). Imported as `mira`; the `macros` feature
+# provides the #[eval] discovery attribute. We use the in-process Subject path
+# (our own agent loop + provider stack) so we do NOT need mira-everruns.
+mira-eval = { version = "0.2", features = ["macros"] }
+
 bashkit = { path = "../bashkit", features = ["scripted_tool", "jq"] }
-tokio = { workspace = true, features = ["rt-multi-thread"] }
+tokio = { workspace = true, features = ["rt-multi-thread", "io-std", "io-util", "fs"] }
 serde.workspace = true
 serde_json.workspace = true
-clap.workspace = true
 anyhow.workspace = true
 reqwest.workspace = true
 # Direct rustls dep so we can install the `ring` crypto provider before

diff --git a/crates/bashkit-eval/README.md b/crates/bashkit-eval/README.md
@@ -1,47 +1,56 @@
 # Bashkit Eval
 
-LLM evaluation harness for bashkit tool usage. Measures how well models use bashkit's bash tool in agentic workloads.
+A [mira](https://github.com/everruns/mira) eval study for bashkit tool usage.
+Measures how well models use bashkit's bash tool in agentic workloads.
+
+bashkit-eval is a **study binary** the `mira` host CLI spawns over stdio; mira
+owns the model matrix, scheduling, retries, resume, and reporting. bashkit
+supplies the subject (its agent loop over a persistent VFS) and the scorer (the
+deterministic expectation checks). See [`specs/eval.md`](../../specs/eval.md).
 
 ## Usage
 
+Install the host CLI once, then drive the study through it:
+
 ```bash
-# Run eval (terminal output only)
-ANTHROPIC_API_KEY=... cargo run -p bashkit-eval -- run \
-  --dataset crates/bashkit-eval/data/eval-tasks.jsonl \
-  --provider anthropic --model claude-sonnet-4-20250514
-
-# Run and save results (Chat Completions API)
-OPENAI_API_KEY=... cargo run -p bashkit-eval -- run \
-  --dataset crates/bashkit-eval/data/eval-tasks.jsonl \
-  --provider openai --model gpt-5.2 --save
-
-# Run against OpenAI Responses API (required for codex models)
-OPENAI_API_KEY=... cargo run -p bashkit-eval -- run \
-  --dataset crates/bashkit-eval/data/eval-tasks.jsonl \
-  --provider openresponses --model gpt-5.3-codex --save
-
-# Custom moniker
-cargo run -p bashkit-eval -- run \
-  --dataset crates/bashkit-eval/data/eval-tasks.jsonl \
-  --provider anthropic --model claude-sonnet-4-20250514 \
-  --save --moniker my-test-run
+cargo install mira-cli            # provides the `mira` binary
+
+# List advertised evals, samples, scorers, and targets
+mira --bin bashkit-eval list
+
+# Run the bash agent eval (set API keys for the models you want; targets without a key are skipped)
+ANTHROPIC_API_KEY=... OPENAI_API_KEY=... \
+  mira --bin bashkit-eval run bashkit_bash
+
+# A specific model + one category (--targets takes exact labels, comma-separated)
+ANTHROPIC_API_KEY=... \
+  mira --bin bashkit-eval run bashkit_bash --targets anthropic/claude-opus-4-8 --tag json_processing
+
+# Scripting-tool eval, scripted mode only, self-contained HTML report
+OPENAI_API_KEY=... \
+  mira --bin bashkit-eval run bashkit_scripting --axis mode=scripted --format html --out report.html
 
 # Via just
-just eval
-just eval-save
+just eval-list
+just eval --targets anthropic/claude-opus-4-8
+just eval-smoke
+just eval-scripting
 ```
 
-## Options
+Results are written by mira under `./results/<run_id>/`.
+
+## Evals & selection
 
-| Option | Description |
-|--------|-------------|
-| `--dataset <path>` | Path to JSONL dataset file |
-| `--provider <name>` | `anthropic`, `openai`, or `openresponses` |
-| `--model <name>` | Model name (e.g., `claude-sonnet-4-20250514`, `gpt-5.2`, `gpt-5.3-codex`) |
-| `--max-turns <n>` | Max agent turns per task (default: 10) |
-| `--save` | Save JSON + Markdown results to disk |
-| `--output <dir>` | Output directory (default: `crates/bashkit-eval/results`) |
-| `--moniker <id>` | Custom run identifier (default: `{provider}-{model}`) |
+| Eval | Samples | Selection |
+|------|---------|-----------|
+| `bashkit_bash` | 58 tasks, 15 categories | `--tag <category>`, `--samples <id>` |
+| `bashkit_smoke` | 3 tasks | quick verification |
+| `bashkit_scripting` | scripting-tool tasks | `--axis mode=scripted\|baseline` |
+
+Targets (model matrix) are defined in `src/mira_study.rs` and gated on
+`ANTHROPIC_API_KEY` / `OPENAI_API_KEY`: a target whose provider key is unset is
+skipped, so a run with neither key set skips them all. Select a subset with
+`--targets <label>` — **exact** labels, comma-separated (globs are not
+supported), e.g. `--targets anthropic/claude-opus-4-8,openai/gpt-5.5`.
 
 ## Dataset
 
@@ -51,7 +60,39 @@ Smoke test dataset (`data/smoke-test.jsonl`) has 3 tasks for quick verification.
 
 ## Results
 
-### 2026-05-26 — Opus 4.7 + GPT-5.5 Lineup (58 tasks, latest)
+> Runs from 2026-06-27 onward use the mira eval framework (saved under
+> `results/mira/<run_id>/`). Earlier entries were produced by the original
+> (pre-mira) harness and are retained as a record.
+
+### 2026-06-27 — mira harness, 5-model lineup (58 tasks, latest)
+
+First full run on the [mira](https://github.com/everruns/mira) framework. The
+flagship was upgraded to `claude-opus-4-8` (from 4.7); Haiku 4.5, Sonnet 4.6,
+GPT-5.5, and GPT-5.3-Codex carry over from the previous lineup. Anthropic and
+OpenAI targets were run separately (two run folders under `results/mira/`).
+
+| Metric | Opus 4.8 | Haiku 4.5 | GPT-5.3-Codex | GPT-5.5 | Sonnet 4.6 |
+|--------|----------|-----------|---------------|---------|------------|
+| Tasks passed | **55/58** | **55/58** | 54/58 | 51/58 | 49/58 |
+| Score | **95%** | **95%** | 93% | 88% | 84% |
+| Tool-call success | **96%** | 94% | 85% | 90% | 93% |
+| Tokens (in/out) | 282K / 55K | 388K / 51K | 116K / 57K | 116K / 31K | 412K / 68K |
+| Duration | 12.8 min | **7.4 min** | 12.1 min | 8.2 min | 19.9 min |
+
+#### Highlights
+
+1. **Opus 4.8 and Haiku 4.5 tie at 55/58 (95%)** — Haiku matches Opus in ~⅗ the
+   wall-clock time (7.4 vs 12.8 min) with slightly fewer output tokens (51K vs
+   55K), though it consumes more input tokens (388K vs 282K); the value pick.
+2. **GPT-5.3-Codex at 54/58 (93%)** — strongest OpenAI model, but the lowest
+   tool-call success (85%); GPT-5.5 trails at 51/58 (88%).
+3. **Sonnet 4.6 at 49/58 (84%)** — consistent with prior runs, trips on a wider
+   spread of categories than the flagships.
+4. **Two tasks fail across every model** — `file_path_organizer` and
+   `script_getopts_parser` — plus `script_array_stats` for both non-Anthropic
+   runs. These also exposed real bashkit gaps (see the filed issues): `ls -d`
+   unimplemented, the awk parser rejecting bracket/pipe constructs, and writes to
+   `$HOME` failing because the user's home directory isn't created.
+
+### 2026-05-26 — Opus 4.7 + GPT-5.5 Lineup (58 tasks, pre-mira)
 
 Refreshed model lineup: upgraded flagships to `claude-opus-4-7` (from 4.6) and
 `gpt-5.5` (from 5.2). Haiku 4.5, Sonnet 4.6, and GPT-5.3-Codex kept as

diff --git a/...4f5/cases/bashkit_5fsmoke_2fsmoke_5fecho_40anthropic_2fclaude_2dhaiku_2d4_2d5/result.json b/...4f5/cases/bashkit_5fsmoke_2fsmoke_5fecho_40anthropic_2fclaude_2dhaiku_2d4_2d5/result.json
@@ -0,0 +1,59 @@
+{
+  "eval": "bashkit_smoke",
+  "sample": "smoke_echo",
+  "target": "anthropic/claude-haiku-4-5",
+  "passed": true,
+  "aggregate": 1.0,
+  "scores": [
+    {
+      "scorer": "bashkit_expectations",
+      "value": 1.0,
+      "pass": true,
+      "reason": "2/2 checks passed (weighted rate 100%)"
+    }
+  ],
+  "transcript": {
+    "final_response": "Done! The command `echo 'hello world'` printed \"hello world\" to stdout.",
+    "iterations": 2,
+    "tool_calls_count": 1,
+    "tool_calls": [
+      "echo 'hello world'"
+    ],
+    "usage": {
+      "input_tokens": 1510,
+      "output_tokens": 80,
+      "cost_usd": 0.0
+    },
+    "timing": {
+      "duration_ms": 1301
+    },
+    "metrics": {
+      "natural_stop": 1.0,
+      "tool_calls": 1.0,
+      "tool_calls_err": 0.0,
+      "tool_calls_ok": 1.0,
+      "turns": 2.0
+    },
+    "metadata": {
+      "bashkit": {
+        "dirs": [
+          "/dev",
+          "/dev/fd",
+          "/home",
+          "/home/user",
+          "/tmp"
+        ],
+        "last_exit_code": 0,
+        "tool_outputs": [
+          {
+            "commands": "echo 'hello world'",
+            "exit_code": 0,
+            "stderr": "",
+            "stdout": "hello world\n"
+          }
+        ]
+      }
+    }
+  },
+  "skipped": false
+}
diff --git a/...34f5/cases/bashkit_5fsmoke_2fsmoke_5fecho_40anthropic_2fclaude_2dopus_2d4_2d8/result.json b/...34f5/cases/bashkit_5fsmoke_2fsmoke_5fecho_40anthropic_2fclaude_2dopus_2d4_2d8/result.json
@@ -0,0 +1,59 @@
+{
+  "eval": "bashkit_smoke",
+  "sample": "smoke_echo",
+  "target": "anthropic/claude-opus-4-8",
+  "passed": true,
+  "aggregate": 1.0,
+  "scores": [
+    {
+      "scorer": "bashkit_expectations",
+      "value": 1.0,
+      "pass": true,
+      "reason": "2/2 checks passed (weighted rate 100%)"
+    }
+  ],
+  "transcript": {
+    "final_response": "Done. The command `echo 'hello world'` printed **hello world** to stdout.",
+    "iterations": 2,
+    "tool_calls_count": 1,
+    "tool_calls": [
+      "echo 'hello world'"
+    ],
+    "usage": {
+      "input_tokens": 1313,
+      "output_tokens": 101,
+      "cost_usd": 0.0
+    },
+    "timing": {
+      "duration_ms": 2723
+    },
+    "metrics": {
+      "natural_stop": 1.0,
+      "tool_calls": 1.0,
+      "tool_calls_err": 0.0,
+      "tool_calls_ok": 1.0,
+      "turns": 2.0
+    },
+    "metadata": {
+      "bashkit": {
+        "dirs": [
+          "/dev",
+          "/dev/fd",
+          "/home",
+          "/home/user",
+          "/tmp"
+        ],
+        "last_exit_code": 0,
+        "tool_outputs": [
+          {
+            "commands": "echo 'hello world'",
+            "exit_code": 0,
+            "stderr": "",
+            "stdout": "hello world\n"
+          }
+        ]
+      }
+    }
+  },
+  "skipped": false
+}