From e53611493a5e957991f65991cccf5cce7f0a269c Mon Sep 17 00:00:00 2001 From: ahd-weekly-eval Date: Mon, 15 Jun 2026 13:03:21 +0000 Subject: [PATCH] docs(evals): weekly run 2026-06-15 Automated weekly eval against Cloudflare Workers AI OSS roster. See the report for per-cell numbers. --- docs/evals/weekly/2026-06-15.md | 94 ++++++ docs/evals/weekly/2026-06-15.replay.json | 376 +++++++++++++++++++++++ 2 files changed, 470 insertions(+) create mode 100644 docs/evals/weekly/2026-06-15.md create mode 100644 docs/evals/weekly/2026-06-15.replay.json diff --git a/docs/evals/weekly/2026-06-15.md b/docs/evals/weekly/2026-06-15.md new file mode 100644 index 0000000..9afba93 --- /dev/null +++ b/docs/evals/weekly/2026-06-15.md @@ -0,0 +1,94 @@ +# ahd eval · swiss-editorial · 2026-06-15T13:03:21.834Z + +```yaml ahd-replay +schema_version: 1 +kind: eval-live +ahd_version: 0.11.0 +ahd_commit: a772e367f2a1d021433d2ab181d1cd53845a8485 +git_dirty: true +node_version: v20.20.2 +platform: linux-x64 +invoked_at: 2026-06-15T12:32:05.978Z +token: + path: /home/runner/work/ahd/ahd/tokens/swiss-editorial.yml + hash: sha256:380a3d833d94 +brief: + path: briefs/landing.yml + hash: sha256:8b7d42759643 +sampling: + n: 30 + temperature: null + seed: null +models: + - id: @cf/google/gemma-4-26b-a4b-it + provider: cloudflare-workers-ai + provider_request_ids: 53 captured + - id: @cf/meta/llama-4-scout-17b-16e-instruct + provider: cloudflare-workers-ai + provider_request_ids: 60 captured + - id: @cf/mistralai/mistral-small-3.1-24b-instruct + provider: cloudflare-workers-ai + provider_request_ids: 60 captured + - id: @cf/openai/gpt-oss-120b + provider: cloudflare-workers-ai + provider_request_ids: 60 captured + - id: @cf/qwen/qwen3-30b-a3b-fp8 + provider: cloudflare-workers-ai + provider_request_ids: 60 captured +conditions: + requested: [raw, compiled] + effective: [raw, compiled] +``` + +Replay this run: + +```sh +git checkout a772e367f2a1 +npm ci && npm run build +/opt/hostedtoolcache/node/20.20.2/x64/bin/node /home/runner/work/ahd/ahd/bin/ahd.js eval-live swiss-editorial --brief briefs/landing.yml --models cf:@cf/google/gemma-4-26b-a4b-it,cf:@cf/meta/llama-4-scout-17b-16e-instruct,cf:@cf/mistralai/mistral-small-3.1-24b-instruct,cf:@cf/openai/gpt-oss-120b,cf:@cf/qwen/qwen3-30b-a3b-fp8 --n 30 --sample-concurrency 6 --out evals --report docs/evals/weekly/2026-06-15.md +``` + +## Run + +- Brief: `briefs/landing.yml` +- Samples per cell: **30** +- Max tokens: 12000 +- Models: + - `@cf/google/gemma-4-26b-a4b-it` (cloudflare-workers-ai) · spec `cf:@cf/google/gemma-4-26b-a4b-it` + - `@cf/meta/llama-4-scout-17b-16e-instruct` (cloudflare-workers-ai) · spec `cf:@cf/meta/llama-4-scout-17b-16e-instruct` + - `@cf/mistralai/mistral-small-3.1-24b-instruct` (cloudflare-workers-ai) · spec `cf:@cf/mistralai/mistral-small-3.1-24b-instruct` + - `@cf/openai/gpt-oss-120b` (cloudflare-workers-ai) · spec `cf:@cf/openai/gpt-oss-120b` + - `@cf/qwen/qwen3-30b-a3b-fp8` (cloudflare-workers-ai) · spec `cf:@cf/qwen/qwen3-30b-a3b-fp8` + +## Per-model slop reduction + +| model | raw attempted → scored | compiled attempted → scored | raw mean tells | compiled mean tells | Δ | reduction | +|---|---:|---:|---:|---:|---:|---:| +| `@cf/google/gemma-4-26b-a4b-it` | 30 → 28 | 30 → 25 | 2.57 | 1.20 | 1.37 | 53.3% | +| `@cf/meta/llama-4-scout-17b-16e-instruct` | 30 → 30 | 30 → 30 | 2.00 | 2.00 | 0.00 | 0.0% | +| `@cf/mistralai/mistral-small-3.1-24b-instruct` | 30 → 30 | 30 → 30 | 3.33 | 1.13 | 2.20 | 66.0% | +| `@cf/openai/gpt-oss-120b` | 30 → 30 | 30 → 30 | 3.33 | 0.90 | 2.43 | 73.0% | +| `@cf/qwen/qwen3-30b-a3b-fp8` | 30 → 30 | 30 → 30 | 1.87 | 1.73 | 0.13 | 7.1% | + +## Per-tell frequency (scored samples only) + +| tell | @cf/google/gemma-4-26b-a4b-it/raw | @cf/google/gemma-4-26b-a4b-it/compiled | @cf/meta/llama-4-scout-17b-16e-instruct/raw | @cf/meta/llama-4-scout-17b-16e-instruct/compiled | @cf/mistralai/mistral-small-3.1-24b-instruct/raw | @cf/mistralai/mistral-small-3.1-24b-instruct/compiled | @cf/openai/gpt-oss-120b/raw | @cf/openai/gpt-oss-120b/compiled | @cf/qwen/qwen3-30b-a3b-fp8/raw | @cf/qwen/qwen3-30b-a3b-fp8/compiled | +|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| ahd/a11y/heading-skip | 0% | 8% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | +| ahd/body-measure | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 3% | +| ahd/line-height-per-size | 82% | 0% | 0% | 100% | 53% | 20% | 100% | 0% | 60% | 37% | +| ahd/no-em-dashes-in-prose | 0% | 0% | 0% | 0% | 0% | 3% | 7% | 0% | 0% | 0% | +| ahd/no-flat-dark-mode | 4% | 0% | 0% | 0% | 0% | 0% | 0% | 0% | 3% | 0% | +| ahd/radius-hierarchy | 50% | 4% | 0% | 100% | 100% | 0% | 90% | 3% | 27% | 47% | +| ahd/require-named-grid | 0% | 0% | 100% | 0% | 100% | 50% | 33% | 0% | 13% | 3% | +| ahd/require-type-pairing | 21% | 0% | 100% | 0% | 80% | 0% | 53% | 0% | 70% | 0% | +| ahd/tracking-per-size | 0% | 32% | 0% | 0% | 0% | 30% | 0% | 3% | 0% | 0% | +| ahd/weight-variety | 100% | 76% | 0% | 0% | 0% | 10% | 50% | 83% | 13% | 83% | + +## Caveats +- Scoring runs the deterministic AHD linter (38 source-level rules) over every sample that passes a basic HTML sanity check. +- Counts reported per cell: attempted (runs initiated) / errored (API / runtime errors) / extractionFailed (response contained no usable HTML) / scored (linted). A large gap between attempted and scored is a signal that the model is struggling with the instruction, not that it passed the taxonomy. +- Raw condition: the brief is expanded as plain prose (intent + audience + surfaces + mustInclude + mustAvoid) with no AHD system prompt, no style token, no forbidden list. Compiled condition: same brief plus the AHD-compiled system prompt. The only thing that differs between conditions is the AHD intervention. +- Vision-only tells (14 rules in the critic) are not scored in this pipeline; run the critic on rendered screenshots for full taxonomy coverage. +- Tells-per-page is a proxy metric: a thin page has little surface for rules to fire against. Read the Δ alongside the actual rendered HTML, not in isolation. +- Model versions change. See the run manifest for exact canonical model ids. \ No newline at end of file diff --git a/docs/evals/weekly/2026-06-15.replay.json b/docs/evals/weekly/2026-06-15.replay.json new file mode 100644 index 0000000..800d748 --- /dev/null +++ b/docs/evals/weekly/2026-06-15.replay.json @@ -0,0 +1,376 @@ +{ + "schema_version": 1, + "kind": "eval-live", + "ahd_version": "0.11.0", + "ahd_commit": "a772e367f2a1d021433d2ab181d1cd53845a8485", + "git_dirty": true, + "node_version": "v20.20.2", + "platform": "linux-x64", + "invoked_at": "2026-06-15T12:32:05.978Z", + "argv": [ + "/opt/hostedtoolcache/node/20.20.2/x64/bin/node", + "/home/runner/work/ahd/ahd/bin/ahd.js", + "eval-live", + "swiss-editorial", + "--brief", + "briefs/landing.yml", + "--models", + "cf:@cf/google/gemma-4-26b-a4b-it,cf:@cf/meta/llama-4-scout-17b-16e-instruct,cf:@cf/mistralai/mistral-small-3.1-24b-instruct,cf:@cf/openai/gpt-oss-120b,cf:@cf/qwen/qwen3-30b-a3b-fp8", + "--n", + "30", + "--sample-concurrency", + "6", + "--out", + "evals", + "--report", + "docs/evals/weekly/2026-06-15.md" + ], + "token": { + "path": "/home/runner/work/ahd/ahd/tokens/swiss-editorial.yml", + "hash": "sha256:380a3d833d9463dbc681df7465993ab6413d8e77188f55f97359f60dc4b746b1" + }, + "brief": { + "path": "briefs/landing.yml", + "hash": "sha256:8b7d4275964399a91e6ddec525151ba672ed9c4721279a37ec16ab3450493a4c" + }, + "sampling": { + "n": 30, + "temperature": null, + "seed": null + }, + "models": [ + { + "id": "@cf/google/gemma-4-26b-a4b-it", + "provider": "cloudflare-workers-ai", + "provider_request_ids": [ + "a0c198761daa2157-SJC", + "a0c198761d03c4f3-SJC", + "a0c198760a4c6142-SJC", + "a0c198761a167b83-SJC", + "a0c198760d3aebe2-SJC", + "a0c198761b03fc53-SJC", + "a0c19a0cec0eebe2-SJC", + "a0c199e79c1ec4f3-SJC", + "a0c19a073ad27b83-SJC", + "a0c199e37ed32157-SJC", + "a0c199ff2c516142-SJC", + "a0c19b84bd71c4f3-SJC", + "a0c19aee2f14fc53-SJC", + "a0c19b82cb32ebe2-SJC", + "a0c19bec7ecc2157-SJC", + "a0c19bfcfa776142-SJC", + "a0c19bab39b67b83-SJC", + "a0c19cf07d90ebe2-SJC", + "a0c19c8ccb72c4f3-SJC", + "a0c19d89fa887b83-SJC", + "a0c19d796bde6142-SJC", + "a0c19dd229c8ebe2-SJC", + "a0c19d7159ae2157-SJC", + "a0c19e661bb3c4f3-SJC", + "a0c19ebaaa4f7b83-SJC", + "a0c19ed99a22ebe2-SJC", + "a0c19eefee232157-SJC", + "a0c19f83b99ec4f3-SJC", + "a0c1a20d69c022ae-SJC", + "a0c1a20d68afb5f2-SJC", + "a0c1a20d6d6ffc53-SJC", + "a0c1a20d6f9e2714-SJC", + "a0c1a4464a542714-SJC", + "a0c1a4fb6cdd2a0c-SJC", + "a0c1a4fb68bb33dc-SJC", + "a0c1a629ce262714-SJC", + "a0c1a70ddb122af7-SJC", + "a0c1a6d99b402734-SJC", + "a0c1a7338f0a2157-SJC", + "a0c1a67f5aab6142-SJC", + "a0c1a792bcee2a0c-SJC", + "a0c1a8307caa2714-SJC", + "a0c1a8664bf92af7-SJC", + "a0c1a90e5a9f6142-SJC", + "a0c1a8f748052157-SJC", + "a0c1a8ddbb552734-SJC", + "a0c1aa7b9ee32714-SJC", + "a0c1aa582bb22a0c-SJC", + "a0c1aa8d0f266142-SJC", + "a0c1aa827cc12af7-SJC", + "a0c1ab07fb232157-SJC", + "a0c1ab59d80b2734-SJC", + "a0c1aba4ab1d2714-SJC" + ] + }, + { + "id": "@cf/meta/llama-4-scout-17b-16e-instruct", + "provider": "cloudflare-workers-ai", + "provider_request_ids": [ + "a0c1ad9f5a9bfc53-SJC", + "a0c1ad9f5ad433dc-SJC", + "a0c1ad9f5fc12a0c-SJC", + "a0c1ad9f4aca2714-SJC", + "a0c1ad9f59bcebe2-SJC", + "a0c1ad9f5d34c4f3-SJC", + "a0c1ae0cbaddfc53-SJC", + "a0c1ae192d10c4f3-SJC", + "a0c1ae0cb98933dc-SJC", + "a0c1ae192a5e2a0c-SJC", + "a0c1ae191f672714-SJC", + "a0c1ae19281bebe2-SJC", + "a0c1ae774e37fc53-SJC", + "a0c1ae81d8cbc4f3-SJC", + "a0c1ae972f9233dc-SJC", + "a0c1aea2db542a0c-SJC", + "a0c1aea32f8debe2-SJC", + "a0c1aea30d962714-SJC", + "a0c1aee15b16fc53-SJC", + "a0c1aee6b8b6c4f3-SJC", + "a0c1af211d0bebe2-SJC", + "a0c1af212b042714-SJC", + "a0c1af1e5de533dc-SJC", + "a0c1af20ce5c2a0c-SJC", + "a0c1af56d969fc53-SJC", + "a0c1af5a7bebc4f3-SJC", + "a0c1af8d7ff52714-SJC", + "a0c1af9baa802a0c-SJC", + "a0c1af8d1c15ebe2-SJC", + "a0c1af95cf8533dc-SJC", + "a0c1b019bd62b5f2-SJC", + "a0c1b019a9d22714-SJC", + "a0c1b019acdaebe2-SJC", + "a0c1b019abdd2a0c-SJC", + "a0c1b019cf872734-SJC", + "a0c1b019acb133dc-SJC", + "a0c1b0adaf85b5f2-SJC", + "a0c1b0d36e142a0c-SJC", + "a0c1b0db6df033dc-SJC", + "a0c1b0bffce22714-SJC", + "a0c1b0d34db6ebe2-SJC", + "a0c1b0d89a3e2734-SJC", + "a0c1b13fba41b5f2-SJC", + "a0c1b172c9a72a0c-SJC", + "a0c1b18c29152714-SJC", + "a0c1b17c5e6433dc-SJC", + "a0c1b1adbcb12734-SJC", + "a0c1b1a29b82ebe2-SJC", + "a0c1b1d9ef5ab5f2-SJC", + "a0c1b1feb96a2a0c-SJC", + "a0c1b205c93a2714-SJC", + "a0c1b240ab702734-SJC", + "a0c1b2244f9f33dc-SJC", + "a0c1b26348fab5f2-SJC", + "a0c1b27fac022a0c-SJC", + "a0c1b25eefa3ebe2-SJC", + "a0c1b287dacb2714-SJC", + "a0c1b2cf6aa62734-SJC", + "a0c1b2e0a82a33dc-SJC", + "a0c1b3063f91b5f2-SJC" + ] + }, + { + "id": "@cf/mistralai/mistral-small-3.1-24b-instruct", + "provider": "cloudflare-workers-ai", + "provider_request_ids": [ + "a0c1b3d20eacb5f2-SJC", + "a0c1b3d20e926142-SJC", + "a0c1b3d20cab26d3-SJC", + "a0c1b3d21f862a0c-SJC", + "a0c1b3d208cd2734-SJC", + "a0c1b3d21992de51-SJC", + "a0c1b439ef9226d3-SJC", + "a0c1b42defe2b5f2-SJC", + "a0c1b44389712734-SJC", + "a0c1b436ea4d6142-SJC", + "a0c1b44d0d56de51-SJC", + "a0c1b4416c302a0c-SJC", + "a0c1b49cd9e0b5f2-SJC", + "a0c1b4a6eebc2734-SJC", + "a0c1b4a76c696142-SJC", + "a0c1b498881426d3-SJC", + "a0c1b4cb4a4c2a0c-SJC", + "a0c1b4bfec36de51-SJC", + "a0c1b508db93b5f2-SJC", + "a0c1b5154f966142-SJC", + "a0c1b525c96b26d3-SJC", + "a0c1b5118aae2734-SJC", + "a0c1b52b18a62a0c-SJC", + "a0c1b53c1ecbde51-SJC", + "a0c1b57ce8fbb5f2-SJC", + "a0c1b57e990b6142-SJC", + "a0c1b591bd592734-SJC", + "a0c1b589bbbc26d3-SJC", + "a0c1b5aaabf32a0c-SJC", + "a0c1b5b74fd9de51-SJC", + "a0c1b62c19f02343-SJC", + "a0c1b62c0821de51-SJC", + "a0c1b62c0bb22a0c-SJC", + "a0c1b62c1837cf2f-SJC", + "a0c1b62c0a2e26d3-SJC", + "a0c1b62c1f15fc53-SJC", + "a0c1b6b59ca72343-SJC", + "a0c1b6c7c852de51-SJC", + "a0c1b6eddad32a0c-SJC", + "a0c1b6f00af7cf2f-SJC", + "a0c1b6f8f90d26d3-SJC", + "a0c1b7345b95fc53-SJC", + "a0c1b79d0f8b2343-SJC", + "a0c1b7c72e742a0c-SJC", + "a0c1b7aea9fcde51-SJC", + "a0c1b7d3fb5ecf2f-SJC", + "a0c1b7de8f5526d3-SJC", + "a0c1b85d6ccd2343-SJC", + "a0c1b88709e3de51-SJC", + "a0c1b83bf88bfc53-SJC", + "a0c1b87c49e72a0c-SJC", + "a0c1b8b4cdafcf2f-SJC", + "a0c1b8da8bfa26d3-SJC", + "a0c1b9220da52343-SJC", + "a0c1b9348a4fde51-SJC", + "a0c1b9413dc6fc53-SJC", + "a0c1b98cedb4cf2f-SJC", + "a0c1b978fe2b2a0c-SJC", + "a0c1b9b27b0826d3-SJC", + "a0c1ba0598c72343-SJC" + ] + }, + { + "id": "@cf/openai/gpt-oss-120b", + "provider": "cloudflare-workers-ai", + "provider_request_ids": [ + "a0c1bb102a0eb5f2-SJC", + "a0c1bb102afb2af7-SJC", + "a0c1bb101a0cfc53-SJC", + "a0c1bb101b3d33dc-SJC", + "a0c1bb100dde2343-SJC", + "a0c1bb101ab2ebe2-SJC", + "a0c1bb44eeb2b5f2-SJC", + "a0c1bb45c8542af7-SJC", + "a0c1bb475881fc53-SJC", + "a0c1bb678e2febe2-SJC", + "a0c1bb610ac22343-SJC", + "a0c1bb513cbc33dc-SJC", + "a0c1bb72c8b7b5f2-SJC", + "a0c1bb818c4d2af7-SJC", + "a0c1bb995de3ebe2-SJC", + "a0c1bb9cbd8a2343-SJC", + "a0c1bb923cc0fc53-SJC", + "a0c1bbb2a92f2af7-SJC", + "a0c1bbad2e5b33dc-SJC", + "a0c1bbb28841b5f2-SJC", + "a0c1bbe1adedfc53-SJC", + "a0c1bbd5fa03ebe2-SJC", + "a0c1bbdc39282343-SJC", + "a0c1bbe71f7533dc-SJC", + "a0c1bbe3c9dc2af7-SJC", + "a0c1bc1e3be6fc53-SJC", + "a0c1bc012d81b5f2-SJC", + "a0c1bc2bab122343-SJC", + "a0c1bc29ee23ebe2-SJC", + "a0c1bc362cea33dc-SJC", + "a0c1bc87be4933dc-SJC", + "a0c1bc87bfd3ebe2-SJC", + "a0c1bc87cdfbc4f3-SJC", + "a0c1bc87cae42af7-SJC", + "a0c1bc87cf4526d3-SJC", + "a0c1bc87cafa2734-SJC", + "a0c1bcde3a79ebe2-SJC", + "a0c1bcd7f82e33dc-SJC", + "a0c1bce95f772af7-SJC", + "a0c1bcea0f5826d3-SJC", + "a0c1bd278a7a2734-SJC", + "a0c1bce63c1fc4f3-SJC", + "a0c1bd377a2633dc-SJC", + "a0c1bd302cacebe2-SJC", + "a0c1bd4bcd562af7-SJC", + "a0c1bd768cfd2734-SJC", + "a0c1bd84ad0a33dc-SJC", + "a0c1bd610cd526d3-SJC", + "a0c1bd7c8f03c4f3-SJC", + "a0c1bdb96b082af7-SJC", + "a0c1bd994803ebe2-SJC", + "a0c1bdf9ebbe2734-SJC", + "a0c1be0749b226d3-SJC", + "a0c1be3039b62af7-SJC", + "a0c1bdff9c3a33dc-SJC", + "a0c1be3b09c8ebe2-SJC", + "a0c1be294fa4c4f3-SJC", + "a0c1be5aaddc2734-SJC", + "a0c1be6ed8d126d3-SJC", + "a0c1be79cdb42af7-SJC" + ] + }, + { + "id": "@cf/qwen/qwen3-30b-a3b-fp8", + "provider": "cloudflare-workers-ai", + "provider_request_ids": [ + "a0c1bf0888fe2af7-SJC", + "a0c1bf0899e326d3-SJC", + "a0c1bf089bfeebe2-SJC", + "a0c1bf08980933dc-SJC", + "a0c1bf089a5dfc53-SJC", + "a0c1bfb908da26d3-SJC", + "a0c1bf089d8ec4f3-SJC", + "a0c1bfa458a22af7-SJC", + "a0c1c01b2d52c4f3-SJC", + "a0c1bff70ceafc53-SJC", + "a0c1c00ee86626d3-SJC", + "a0c1bfc77a42ebe2-SJC", + "a0c1c0989a59c4f3-SJC", + "a0c1bfcd1e4e33dc-SJC", + "a0c1c0baaf47fc53-SJC", + "a0c1c0731c9c2af7-SJC", + "a0c1c0e51b8bebe2-SJC", + "a0c1c13ac9bffc53-SJC", + "a0c1c0d38ed526d3-SJC", + "a0c1c1071cc9c4f3-SJC", + "a0c1c16199f92af7-SJC", + "a0c1c192ed8bc4f3-SJC", + "a0c1c1907b5ffc53-SJC", + "a0c1c11c3dcc33dc-SJC", + "a0c1c182d8c2ebe2-SJC", + "a0c1c1c2df7ac4f3-SJC", + "a0c1c1c24e082af7-SJC", + "a0c1c190cecc26d3-SJC", + "a0c1c1d7bb4efc53-SJC", + "a0c1c1ef4f8d33dc-SJC", + "a0c1c2cbe8c42157-SJC", + "a0c1c2cbed7a2a0c-SJC", + "a0c1c2cbdba76142-SJC", + "a0c1c2cbda05148e-SJC", + "a0c1c2cbdf6debe2-SJC", + "a0c1c2cbcf2433dc-SJC", + "a0c1c3681ce62a0c-SJC", + "a0c1c370f997ebe2-SJC", + "a0c1c36e8ff26142-SJC", + "a0c1c38148cf33dc-SJC", + "a0c1c36f7a2d148e-SJC", + "a0c1c362edaa2157-SJC", + "a0c1c3fdbd5b2a0c-SJC", + "a0c1c4015dd7ebe2-SJC", + "a0c1c441db9a148e-SJC", + "a0c1c40a1a4933dc-SJC", + "a0c1c4092a5f6142-SJC", + "a0c1c4700edd2157-SJC", + "a0c1c48a7c83ebe2-SJC", + "a0c1c48718212a0c-SJC", + "a0c1c4e7fbef148e-SJC", + "a0c1c4f168802157-SJC", + "a0c1c4ed6f7633dc-SJC", + "a0c1c4edae796142-SJC", + "a0c1c5106c48ebe2-SJC", + "a0c1c5426a6b2a0c-SJC", + "a0c1c56c996e148e-SJC", + "a0c1c58b59522157-SJC", + "a0c1c5929be76142-SJC", + "a0c1c58f4a4333dc-SJC" + ] + } + ], + "conditions": { + "requested": [ + "raw", + "compiled" + ], + "effective": [ + "raw", + "compiled" + ] + } +}