ianm199 · ianm199 · Jun 2, 2026 · Jun 2, 2026
diff --git a/harness/bench/README.md b/harness/bench/README.md
@@ -16,6 +16,7 @@ workload is the only fair comparison.
 harness/bench/
 ├── README.md            <- this file
 ├── compare.sh           <- main ledgered bench: run all workloads vs reference
+├── compare_bins.sh      <- direct A/B bench for two arbitrary Lua binaries
 ├── gc-profile.sh        <- end-of-run collector counters
 ├── opcode-profile.sh    <- feature-gated opcode execution counters
 ├── profile-inventory.sh <- repo + host profiler/tool availability
@@ -91,6 +92,27 @@ python3 harness/bench/history.py        # writes harness/bench/history/index.htm
 python3 harness/bench/history.py --open # also opens it in your browser
 ```
 
+For direct Rust-vs-Rust packet validation, compare two built binaries without
+touching the evidence ledger:
+
+```bash
+bash harness/bench/compare_bins.sh \
+  --a /tmp/lua-rs-base \
+  --b target/release/lua-rs \
+  --label-a base \
+  --label-b candidate \
+  --runs 20 \
+  --workloads gc_pressure,binarytrees
+```
+
+Output:
+- `harness/bench/results/<UTC>-<sha>-bin-ab.tsv`
+- `harness/bench/results/<UTC>-<sha>-bin-ab.json`
+
+This runner records whether the two binaries' outputs match (`match` column)
+and reports `<label-b>_over_<label-a>` wall/RSS ratios. Use it for local packet
+evidence; use `compare.sh` for reference-C ratios and dashboard history.
+
 ## How to read the numbers
 
 `wall_ratio` is the headline. It is best-of-N wall-clock for lua-rs divided
@@ -234,13 +256,15 @@ unsafe representation ceilings.
 4. `gc-profile.sh` covers collector counters and start/end cadence deltas. It
    does not provide allocation stack attribution or cumulative per-phase
    timing.
-5. `compare.sh` appends ledger rows directly. Typed bench runner entries in
+5. `compare_bins.sh` covers direct Rust-vs-Rust A/B checks for small packets
+   without appending ledger rows.
+6. `compare.sh` appends ledger rows directly. Typed bench runner entries in
    `harness/runners.toml` are still useful future cleanup, but not required
    for evidence-backed perf work.
-6. `profile-inventory.sh` and `value-layout.sh` are telemetry probes. They do
+7. `profile-inventory.sh` and `value-layout.sh` are telemetry probes. They do
    not write ledger rows and should be cited as design evidence, not speed
    claims.
-7. Backfill remains future work for answering "when did this regress?" across
+8. Backfill remains future work for answering "when did this regress?" across
    older commits.
-8. Keep `results/` and `profiles/` generated artifacts ignored unless a run is
+9. Keep `results/` and `profiles/` generated artifacts ignored unless a run is
    deliberately promoted into committed evidence.
diff --git a/harness/bench/compare_bins.sh b/harness/bench/compare_bins.sh
@@ -0,0 +1,187 @@
+#!/usr/bin/env bash
+# compare_bins.sh — direct A/B timing for two Lua interpreter binaries.
+#
+# This is for packet validation when the question is "did this Rust change move
+# the workload?" rather than "how far are we from reference C?". It runs the
+# same harness workloads through both binaries, records whether their outputs
+# match, and reports best-of-N wall time plus max RSS. It intentionally does
+# not append ledger rows; use compare.sh for dashboard/history evidence.
+#
+# Usage:
+#   bash harness/bench/compare_bins.sh --a /tmp/lua-rs-base --b target/release/lua-rs \
+#     --label-a base --label-b candidate --runs 20 --workloads gc_pressure,binarytrees
+set -euo pipefail
+# The repo root is two directories above this script; everything runs from there.
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+cd "$ROOT"
+# Workloads are read from WORKLOAD_DIR; result files are written to OUT_DIR.
+WORKLOAD_DIR="$ROOT/harness/bench/workloads"
+OUT_DIR="$ROOT/harness/bench/results"
+mkdir -p "$OUT_DIR"
+# Option defaults; the flag loop below overwrites them.
+A_BIN=""            # --a: first interpreter binary (required)
+B_BIN=""            # --b: second interpreter binary (required)
+LABEL_A="a"         # --label-a: name for A in column headers and JSON keys
+LABEL_B="b"         # --label-b: name for B in column headers and JSON keys
+RUNS=10             # --runs: measurements taken per binary per workload
+WORKLOAD_FILTER=""  # --workloads: comma-separated names; empty selects every workload
+
+# Parse flags; ${2?...} names the flag that lacks a value instead of dying on an unbound $2.
+while [ $# -gt 0 ]; do
+    case "$1" in
+        --a)         A_BIN="${2?--a requires a value}";                   shift 2 ;;
+        --b)         B_BIN="${2?--b requires a value}";                   shift 2 ;;
+        --label-a)   LABEL_A="${2?--label-a requires a value}";           shift 2 ;;
+        --label-b)   LABEL_B="${2?--label-b requires a value}";           shift 2 ;;
+        --runs)      RUNS="${2?--runs requires a value}";                 shift 2 ;;
+        --workloads) WORKLOAD_FILTER="${2?--workloads requires a value}"; shift 2 ;;
+        -h|--help)   # print the header comment; '$d' drops the `set -euo` line that ends the range
+            sed -n '2,/^set -euo/p' "${BASH_SOURCE[0]}" | sed '$d; s/^# //; s/^#//'; exit 0 ;;
+        *) echo "unknown flag: $1" >&2; exit 1 ;;
+    esac
+done
+
+[ -n "$A_BIN" ] || { echo "[err] missing --a binary" >&2; exit 2; }
+[ -n "$B_BIN" ] || { echo "[err] missing --b binary" >&2; exit 2; }
+[ -x "$A_BIN" ] || { echo "[err] --a binary not executable: $A_BIN" >&2; exit 2; }
+[ -x "$B_BIN" ] || { echo "[err] --b binary not executable: $B_BIN" >&2; exit 2; }
+case "$RUNS" in ''|*[!0-9]*) echo "[err] --runs must be a positive integer" >&2; exit 2 ;; esac
+# Force base 10: "08" would break printf %d (read as octal) and "00" would dodge the >= 1 check.
+RUNS=$((10#$RUNS))
+[ "$RUNS" -ge 1 ] || { echo "[err] --runs must be >= 1" >&2; exit 2; }
+
+TS=$(date -u +%Y%m%dT%H%M%SZ)  # UTC timestamp, shaped like 20260602T101500Z
+COMMIT=$(git rev-parse --short HEAD 2>/dev/null || echo "unknown")  # "unknown" when git fails
+TSV="$OUT_DIR/${TS}-${COMMIT}-bin-ab.tsv"    # both result files share the <TS>-<COMMIT> stem
+JSON="$OUT_DIR/${TS}-${COMMIT}-bin-ab.json"
+# Host details for the result headers. CPU tries sysctl, then /proc/cpuinfo, then "unknown".
+OS_NAME="$(uname -sr)"
+ARCH="$(uname -m)"
+CPU="$(sysctl -n machdep.cpu.brand_string 2>/dev/null || grep -m1 'model name' /proc/cpuinfo 2>/dev/null | cut -d: -f2- | sed 's/^ *//' || echo 'unknown')"
+
+# Execute workload $2 under interpreter $1, folding its stderr into stdout.
+run_out() {
+    local -r interp="$1" script="$2"
+    "$interp" "$script" 2>&1
+}
+
+# Time one run of workload $2 under interpreter $1 with /usr/bin/time and print
+# "<wall seconds> <max RSS bytes>"; returns 1 if the run or the parse fails.
+measure_one() {
+    local bin="$1" workload="$2" tmp real="" rss="" parsed rss_kb status=0
+    tmp=$(mktemp) || return 1
+    case "$(uname -s)" in
+        Darwin)  # parses the "real <s>" and "<n> maximum resident set size" lines
+            /usr/bin/time -lp "$bin" "$workload" >/dev/null 2>"$tmp" || status=$?
+            real=$(awk '$1=="real" {print $2; exit}' "$tmp")
+            rss=$(awk '/maximum resident set size/ {print $1; exit}' "$tmp")
+            ;;
+        *)  # '%e %M' = wall seconds and max RSS in KB; the last such line in $tmp wins
+            /usr/bin/time -f '%e %M' "$bin" "$workload" >/dev/null 2>"$tmp" || status=$?
+            parsed=$(awk '/^[0-9.]+ [0-9]+$/ {r=$1; k=$2} END {if (r != "") print r, k}' "$tmp")
+            read -r real rss_kb <<<"$parsed"
+            [ -n "$rss_kb" ] && rss=$((rss_kb * 1024))
+            ;;
+    esac
+    rm -f "$tmp"
+    [ "$status" -eq 0 ] || { echo "[err] timed run of $bin $workload failed with status $status" >&2; return 1; }
+    if [ -z "$real" ] || [ -z "$rss" ]; then
+        echo "[err] failed to parse /usr/bin/time output for $bin $workload" >&2
+        return 1
+    fi
+    printf "%s %s\n" "$real" "$rss"
+}
+
+# Call measure_one $RUNS times for interpreter $1 on workload $2 and print
+# "<lowest wall time> <highest RSS>" across those runs. Returns 1 as soon as
+# any single measurement fails.
+best_of_n() {
+    local bin="$1" workload="$2"
+    local min_wall="" max_rss="" sample wall rss
+    for _ in $(seq 1 "$RUNS"); do
+        sample=$(measure_one "$bin" "$workload") || return 1
+        read -r wall rss <<<"$sample"
+        # The first sample seeds both extremes. awk does the comparisons
+        # because the measured values are not necessarily integers.
+        if [ -z "$min_wall" ] || awk -v a="$wall" -v b="$min_wall" 'BEGIN{exit !(a < b)}'; then min_wall="$wall"; fi
+        if [ -z "$max_rss" ] || awk -v a="$rss" -v b="$max_rss" 'BEGIN{exit !(a > b)}'; then max_rss="$rss"; fi
+    done
+    printf "%s %s\n" "$min_wall" "$max_rss"
+}
+
+{
+    printf '# lua-rs binary A/B compare\n'
+    printf '# timestamp_utc: %s\n' "$TS"
+    printf '# commit:        %s\n' "$COMMIT"
+    printf '# os:            %s\n' "$OS_NAME"
+    printf '# arch:          %s\n' "$ARCH"
+    printf '# cpu:           %s\n' "$CPU"
+    printf '# runs:          %d (reporting best wall-clock, max RSS)\n' "$RUNS"
+    printf '# %s: %s\n' "$LABEL_A" "$A_BIN"
+    printf '# %s: %s\n' "$LABEL_B" "$B_BIN"
+    printf '#\n'
+    printf 'workload\t%s_wall_s\t%s_wall_s\t%s_over_%s_wall_ratio\t%s_rss_kb\t%s_rss_kb\t%s_over_%s_rss_ratio\tmatch\n' \
+        "$LABEL_A" "$LABEL_B" "$LABEL_B" "$LABEL_A" "$LABEL_A" "$LABEL_B" "$LABEL_B" "$LABEL_A"
+} > "$TSV"
+
+JSON_ROWS=""
+TOTAL_A=0
+TOTAL_B=0
+
+for wpath in "$WORKLOAD_DIR"/*.lua; do
+    wname=$(basename "$wpath" .lua)
+    if [ -n "$WORKLOAD_FILTER" ]; then
+        echo ",$WORKLOAD_FILTER," | grep -q ",$wname," || continue
+    fi
+
+    echo "==> $wname" >&2
+    out_a=$(run_out "$A_BIN" "$wpath")
+    out_b=$(run_out "$B_BIN" "$wpath")
+    match="ok"
+    [ "$out_a" = "$out_b" ] || match="diff"
+
+    pair_a=$(best_of_n "$A_BIN" "$wpath")
+    pair_b=$(best_of_n "$B_BIN" "$wpath")
+    a_wall=$(echo "$pair_a" | awk '{print $1}')
+    a_rss=$(echo "$pair_a" | awk '{print $2}')
+    b_wall=$(echo "$pair_b" | awk '{print $1}')
+    b_rss=$(echo "$pair_b" | awk '{print $2}')
+
+    wall_ratio=$(awk -v a="$b_wall" -v b="$a_wall" 'BEGIN{if (b>0) printf "%.3f", a/b; else print "NaN"}')
+    rss_ratio=$(awk -v a="$b_rss" -v b="$a_rss" 'BEGIN{if (b>0) printf "%.3f", a/b; else print "NaN"}')
+    a_rss_kb=$(awk -v b="$a_rss" 'BEGIN{printf "%.0f", b/1024}')
+    b_rss_kb=$(awk -v b="$b_rss" 'BEGIN{printf "%.0f", b/1024}')
+
+    printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
+        "$wname" "$a_wall" "$b_wall" "$wall_ratio" "$a_rss_kb" "$b_rss_kb" "$rss_ratio" "$match" >> "$TSV"
+
+    if [ -n "$JSON_ROWS" ]; then JSON_ROWS="$JSON_ROWS,"; fi
+    JSON_ROWS="$JSON_ROWS{\"workload\":\"$wname\",\"${LABEL_A}_wall_s\":$a_wall,\"${LABEL_B}_wall_s\":$b_wall,\"${LABEL_B}_over_${LABEL_A}_wall_ratio\":$wall_ratio,\"${LABEL_A}_rss_kb\":$a_rss_kb,\"${LABEL_B}_rss_kb\":$b_rss_kb,\"${LABEL_B}_over_${LABEL_A}_rss_ratio\":$rss_ratio,\"match\":\"$match\"}"
+
+    TOTAL_A=$(awk -v t="$TOTAL_A" -v a="$a_wall" 'BEGIN{printf "%.4f", t+a}')
+    TOTAL_B=$(awk -v t="$TOTAL_B" -v a="$b_wall" 'BEGIN{printf "%.4f", t+a}')
+done
+
+OVERALL_RATIO=$(awk -v a="$TOTAL_B" -v b="$TOTAL_A" 'BEGIN{if (b>0) printf "%.3f", a/b; else print "NaN"}')
+
+# Write the JSON twin of the TSV. The ratio awk scripts print NaN when their
+# denominator is not positive; JSON has no NaN literal, so those become null.
+{
+    printf '{\n  "timestamp_utc": "%s",\n' "$TS"
+    printf '  "commit": "%s",\n' "$COMMIT"
+    printf '  "os": "%s", "arch": "%s", "cpu": "%s",\n' "$OS_NAME" "$ARCH" "$CPU"
+    printf '  "runs_per_workload": %d,\n' "$RUNS"
+    printf '  "labels": {"a": "%s", "b": "%s"},\n' "$LABEL_A" "$LABEL_B"
+    printf '  "binaries": {"%s": "%s", "%s": "%s"},\n' "$LABEL_A" "$A_BIN" "$LABEL_B" "$B_BIN"
+    printf '  "totals": {"%s_wall_s": %s, "%s_wall_s": %s, "%s_over_%s_wall_ratio": %s},\n' \
+        "$LABEL_A" "$TOTAL_A" "$LABEL_B" "$TOTAL_B" "$LABEL_B" "$LABEL_A" "${OVERALL_RATIO/#NaN/null}"
+    printf '  "rows": [%s]\n}\n' "${JSON_ROWS//:NaN,/:null,}"
+} > "$JSON"
+
+# Name the result files on stderr, then put the TSV itself on stdout.
+{
+    printf '\n==> results:\n'
+    printf '    tsv:  %s\n    json: %s\n\n' "$TSV" "$JSON"
+} >&2
+cat "$TSV"