diff --git a/benchmark/miner/CALIBRATION.md b/benchmark/miner/CALIBRATION.md new file mode 100644 index 00000000..ded9f521 --- /dev/null +++ b/benchmark/miner/CALIBRATION.md @@ -0,0 +1,127 @@ +# Calibrating the `insufficient_evidence` threshold + +The release decision raises `insufficient_evidence` (IE) when, with no +blockers, static extraction is too weak to gate confidently. The threshold is +two constants in +[`src/agents_shipgate/ci/release_decision.py`](../../src/agents_shipgate/ci/release_decision.py): + +```python +_LOW_CONFIDENCE_TOOL_RATIO = 0.5 # IE if low_confidence_tools >= ceil(0.5 * total_tools) +_MAX_TOLERATED_SOURCE_WARNINGS = 3 # IE if source_warning_count > 3 +``` + +They have shipped since v0.14 and were never tuned against data. This note +records the attempt to calibrate them, the honest finding, and the precise +conditions under which they should be revisited. The numbers below are +reproduced by the tests named at the end — they come from data on disk, not +prose. + +## The question + +Are `0.5` and `3` the right constants — i.e. does the gate raise IE on (and +only on) scans whose evidence is genuinely too weak to gate? Moving them moves +the **IE rate**, a headline metric. + +## What calibration needs + +For each scan: `low_confidence_tool_count`, `source_warning_count`, +`total_tools` (the threshold's inputs) **and** a ground-truth label — was IE +the correct call, or could the scan have gated as passed / review / blocked? +You cannot tell whether a threshold is too strict or too loose without the +label. + +## What the corpora actually contain + +**Mined real history** (`results/2026-W24-mined.*`, `results/2026-W25-mined.*`) +— 241 merged PRs, 9 decided, IE-dominated, and **unlabeled** (the human pass in +[`LABELING.md`](LABELING.md) is unstarted). Two further gaps made the rows +unusable for calibration even descriptively: + +- `tools_scanned` was captured from the wrong place (`summary`, which carries + no tool count) and came back `null` on every row — the ratio denominator was + missing entirely. **Fixed** in `evaluate._tool_count` (now reads + `tool_surface.total_tools`); future mines record it. The committed corpus + predates the fix and still shows `null` — re-mine to populate. +- The row schema records `evidence_gaps` (low-confidence tools **+** source + warnings, combined) but not the split, so the two threshold terms can't be + separated. Splitting them is a `MinedRow` schema change, which forces a full + re-mine (the corpus-integrity guard pins the CSV columns) — deferred to the + next mine, not done speculatively. + +**Constructed labeled fixtures** (`results/constructed.*`, +[`constructed.py`](constructed.py)) — 7 fixtures with definitional labels, run +through the live engine. Their measured evidence coverage: + +| fixture | label | decision | low-conf | warns | tools | exercises IE threshold? | +|---|---|---|---:|---:|---:|---| +| `openai_agents_sdk_agent` | needs_human | `insufficient_evidence` | 2 | 0 | 2 | **yes** (2/2 = 1.0 ≥ 0.5) | +| `support_refund_agent` | must_block | `blocked` | 1 | 1 | 8 | no — blockers outrank IE | +| `ai_generated_refund_pr` | must_block | `blocked` | 0 | 0 | 2 | no | +| `agent_weakens_gate` | must_block | `blocked` | 0 | 0 | 1 | no | +| `clean_read_only_agent` | safe_to_merge | `passed` | 0 | 0 | 1 | no | +| `hitl_evidence_covered_agent` | safe_to_merge | `passed` | 0 | 0 | 1 | no | +| `hitl_evidence_agent` | needs_human | `review_required` | 0 | 0 | 1 | no | + +## Finding + +Exactly **one** labeled case (`openai_agents_sdk_agent`) exercises the IE +threshold, and it sits at the robust extreme — *every* tool is low-confidence +(ratio 1.0), so it is classified IE for any ratio in `(0, 1]`. A single point +at the extreme **cannot distinguish** `0.3` from `0.5` from `0.7`. The source +-warning constant (`3`) is exercised by no labeled case at all. + +So neither corpus can justify *moving* the constants: the real corpus is +unlabeled and was missing the denominator; the constructed corpus has labels +but only one threshold-exercising point, and it is uninformative about where in +`(0, 1]` the boundary belongs. + +## Decision + +**Hold `0.5` / `3`.** No available data supports a change, and an unjustified +change would move the IE rate blindly. The constants are now *examined and +guarded* rather than unexamined — two guards with distinct jobs: + +- **Threshold edits** → `test_ie_threshold_constants_are_frozen` + (`tests/test_release_decision.py`) asserts the constants equal `0.5` / `3`. + Changing either fails CI, so a recalibration is a deliberate edit that must + update this note alongside it. +- **Extraction regressions** → `test_ie_threshold_is_exercised_and_robust_on_the_labeled_coverage_fixture` + (`tests/test_miner_constructed.py`) re-runs the one labeled IE fixture; if + extraction later resolves its dynamic surface the verdict flips and this + fails. It does *not* catch threshold edits — the point sits at ratio 1.0, so + it stays above any threshold in `(0, 1]` (which is the whole reason it can't + calibrate the constant). +- **The denominator** → `test_record_head_report_*` (`tests/test_miner.py`) + lock the `tools_scanned` capture fix so the next mine records it. + +## When to revisit + +Recalibrate when **both** prerequisites are met: + +1. The human labeling pass ([`LABELING.md`](LABELING.md)) produces a labeled + decided set (target ≥ 50, including near-threshold cases — not only the + ratio-1.0 framework cores). +2. A re-mine populates `tools_scanned` (fix shipped) and, ideally, a + low-confidence / source-warning split (next `MinedRow` schema bump). + +Then sweep `(ratio, max_warnings)` against the labeled set, pick the point that +maximizes IE precision/recall, and update this note + the constants together. + +## Considered and declined: an `extraction_coverage` ratio report field + +A precomputed `extraction_coverage` ratio (`low_confidence_tool_count / +total_tools`) on the report was considered as a sibling to this work and +**declined** under the [surface-discipline gate](../../CONTRIBUTING.md#surface-discipline): + +- It moves **no** headline metric. The IE rate is moved by the *threshold* + (above), not by exposing the ratio. +- It is fully derivable by any consumer from fields the report already carries + (`release_decision.evidence_coverage.low_confidence_tool_count` and the tool + count in `tool_surface.total_tools`), and the structured + `evidence_coverage.evidence_gaps[]` already enumerates each gap with a + remediation `next_action`. +- "Legibility / completeness" is an explicitly rejected justification for new + surface, and a new report field is a schema bump. + +Per the gate, the default is not to add it. Revisit only if a concrete +consumer names a headline metric the raw ratio (not the existing counts) moves. diff --git a/benchmark/miner/README.md b/benchmark/miner/README.md index 2d83e508..7db66de2 100644 --- a/benchmark/miner/README.md +++ b/benchmark/miner/README.md @@ -72,6 +72,9 @@ python -m benchmark.miner evaluate \ - A row with `status=evaluated` and `head_decision=insufficient_evidence` counts toward the IE-rate KPI. `trigger_skip` rows are the negative-control pool (the 0-noise-on-irrelevant-diffs property, on real history). +- The IE-threshold constants (`_LOW_CONFIDENCE_TOOL_RATIO`, + `_MAX_TOLERATED_SOURCE_WARNINGS`) are examined and held — the calibration + attempt, finding, and revisit conditions are in [`CALIBRATION.md`](CALIBRATION.md). - Labeling for the accuracy corpus happens in a separate adjudicated file next to the CSV (`.labels.csv`: `pr_url,label,rationale`) — the mined row is evidence, the label is the ground truth. The rubric, two-labeler process, and diff --git a/benchmark/miner/evaluate.py b/benchmark/miner/evaluate.py index 106d8aed..a7eb0d0d 100644 --- a/benchmark/miner/evaluate.py +++ b/benchmark/miner/evaluate.py @@ -528,16 +528,43 @@ def _head_scan(row: MinedRow, scan_root: Path, tmp_path: Path) -> bool: except (OSError, json.JSONDecodeError): row.notes = _append_note(row.notes, "head_report_unparseable") return False + _record_head_report(row, report) + return True + + +def _tool_count(report: dict) -> int | None: + """Total tools the scan enumerated. + + Lives in ``tool_surface.total_tools`` (``tool_inventory`` length is the + fallback) — NOT in ``summary``. The previous read looked in ``summary`` + for ``tools_scanned``/``tool_count``, keys ``ReportSummary`` does not + carry, so ``tools_scanned`` came back null on every run — blanking the + denominator the IE-threshold ratio (low_confidence_tools / total_tools) + is calibrated against. + """ + surface = report.get("tool_surface") or {} + total = surface.get("total_tools") + if isinstance(total, int): + return total + inventory = report.get("tool_inventory") + if isinstance(inventory, list): + return len(inventory) + return None + + +def _record_head_report(row: MinedRow, report: dict) -> None: + """Project a parsed head ``report.json`` onto the row. + + Pure (no I/O) so the head-decision/evidence capture is unit-testable + without running a scan. + """ decision = report.get("release_decision") or {} row.head_decision = str(decision.get("decision") or "") row.head_blockers = _safe_len(decision.get("blockers")) row.head_review_items = _safe_len(decision.get("review_items")) coverage = decision.get("evidence_coverage") or {} row.evidence_gaps = _safe_len(coverage.get("evidence_gaps")) - summary = report.get("summary") or {} - tools = summary.get("tools_scanned", summary.get("tool_count")) - row.tools_scanned = int(tools) if isinstance(tools, int) else None - return True + row.tools_scanned = _tool_count(report) def _capability_delta( diff --git a/src/agents_shipgate/ci/release_decision.py b/src/agents_shipgate/ci/release_decision.py index 920cc520..22dbd962 100644 --- a/src/agents_shipgate/ci/release_decision.py +++ b/src/agents_shipgate/ci/release_decision.py @@ -25,7 +25,14 @@ # Thresholds for the `insufficient_evidence` decision state. Private # module-level constants so they're tunable in code without expanding -# the manifest or CLI surface. +# the manifest or CLI surface. Examined and deliberately HELD at these +# values: see benchmark/miner/CALIBRATION.md — the available corpora cannot +# justify a change (the real corpus is unlabeled; the labeled constructed set +# has a single threshold-exercising point at the robust extreme). Editing +# either constant fails test_ie_threshold_constants_are_frozen +# (tests/test_release_decision.py), so a change is a deliberate recalibration +# that must update CALIBRATION.md too. Recalibrate only after the human +# labeling pass + a re-mine (prerequisites in that doc). _LOW_CONFIDENCE_TOOL_RATIO = 0.5 _MAX_TOLERATED_SOURCE_WARNINGS = 3 diff --git a/tests/test_miner.py b/tests/test_miner.py index 9822dcce..76bd5b61 100644 --- a/tests/test_miner.py +++ b/tests/test_miner.py @@ -216,6 +216,57 @@ def test_summarize_reports_ie_rate() -> None: assert summary["ie_rate_on_decided"] == 0.5 +def _blank_row() -> MinedRow: + return MinedRow( + repo="r", pr_number=1, pr_url="", title="", merged_at="", + base_sha="", head_sha="", + ) + + +def test_record_head_report_reads_tool_count_from_tool_surface() -> None: + # Regression: the total tool count lives in tool_surface.total_tools, NOT + # summary. Reading it from summary left tools_scanned null on every run, + # blanking the IE-threshold ratio denominator. + from benchmark.miner.evaluate import _record_head_report + + report = { + "release_decision": { + "decision": "insufficient_evidence", + "blockers": [], + "review_items": [{"id": "x"}], + "evidence_coverage": {"evidence_gaps": [{"kind": "low_confidence_tool"}, {"kind": "low_confidence_tool"}]}, + }, + "tool_surface": {"total_tools": 5, "high_risk_tools": 0}, + "summary": {"status": "clean"}, # deliberately carries no tool count + } + row = _blank_row() + _record_head_report(row, report) + assert row.tools_scanned == 5 + assert row.head_decision == "insufficient_evidence" + assert row.evidence_gaps == 2 + assert row.head_review_items == 1 + + +def test_record_head_report_falls_back_to_tool_inventory_length() -> None: + from benchmark.miner.evaluate import _record_head_report + + report = { + "release_decision": {"decision": "passed"}, + "tool_inventory": [{"name": "a"}, {"name": "b"}, {"name": "c"}], + } + row = _blank_row() + _record_head_report(row, report) + assert row.tools_scanned == 3 + + +def test_record_head_report_tools_scanned_null_when_absent() -> None: + from benchmark.miner.evaluate import _record_head_report + + row = _blank_row() + _record_head_report(row, {"release_decision": {"decision": "passed"}}) + assert row.tools_scanned is None + + # --- Source-hermeticity: parent trigger eval must use THIS checkout ---------- diff --git a/tests/test_miner_constructed.py b/tests/test_miner_constructed.py index 81210c46..ce7f474b 100644 --- a/tests/test_miner_constructed.py +++ b/tests/test_miner_constructed.py @@ -14,6 +14,7 @@ from __future__ import annotations import csv +import json import os import subprocess import sys @@ -72,6 +73,49 @@ def test_live_engine_still_produces_the_constructed_verdicts() -> None: assert live == committed, "committed constructed corpus is stale vs the live engine" +def test_ie_threshold_is_exercised_and_robust_on_the_labeled_coverage_fixture( + tmp_path: Path, +) -> None: + """Calibration data point for the IE threshold (see CALIBRATION.md). + + ``openai_agents_sdk_agent`` is the one labeled constructed case that lands + on the IE threshold (a dynamic SDK toolset static extraction can't + resolve). It is ``insufficient_evidence`` because every tool is + low-confidence (ratio 1.0), well above the current threshold. + + What this guards: an **extraction change** for this fixture — if extraction + later resolves the dynamic surface, ``low`` drops, the verdict flips, and + this fails. It does NOT guard threshold edits: the point sits at the robust + extreme (low == total), so it stays ``low >= threshold`` for any ratio in + ``(0, 1]`` — which is exactly why one labeled point cannot *calibrate* the + constant. Freezing the constant itself is + ``test_ie_threshold_constants_are_frozen`` in ``test_release_decision.py``. + """ + from agents_shipgate.ci.release_decision import _low_confidence_tool_threshold + + result = subprocess.run( + [sys.executable, "-m", "agents_shipgate", "fixture", "run", + "openai_agents_sdk_agent", "--out", str(tmp_path / "out")], + capture_output=True, text=True, timeout=180, env=cli_env(), check=False, + ) + report_path = tmp_path / "out" / "report.json" + assert report_path.is_file(), f"exit {result.returncode}: {result.stderr[:400]}" + report = json.loads(report_path.read_text(encoding="utf-8")) + decision = report["release_decision"] + coverage = decision["evidence_coverage"] + low = coverage["low_confidence_tool_count"] + total = (report.get("tool_surface") or {}).get("total_tools") or len( + report.get("tool_inventory") or [] + ) + assert decision["decision"] == "insufficient_evidence" + # The current constant correctly classifies it as below-threshold. + assert low >= _low_confidence_tool_threshold(total) + # It sits at the robust extreme (every tool is low-confidence), so this + # single labeled point cannot distinguish ratio values in (0, 1]. Moving + # the constant needs the human labeling pass + a re-mine, not this fixture. + assert low == total and total > 0 + + def test_cli_env_prepends_this_checkouts_src() -> None: """The child `python -m agents_shipgate` must import THIS checkout. diff --git a/tests/test_release_decision.py b/tests/test_release_decision.py index 7bac5a2e..b4af19ec 100644 --- a/tests/test_release_decision.py +++ b/tests/test_release_decision.py @@ -6,7 +6,11 @@ GATE_FAILURE_EXIT_CODE, exit_code_for_report, ) -from agents_shipgate.ci.release_decision import build_release_decision +from agents_shipgate.ci.release_decision import ( + _LOW_CONFIDENCE_TOOL_RATIO, + _MAX_TOLERATED_SOURCE_WARNINGS, + build_release_decision, +) from agents_shipgate.core.domain import ( AuthInfo, Tool, @@ -342,6 +346,27 @@ def test_accepted_high_finding_does_not_outrank_insufficient_evidence(): assert decision.decision == "insufficient_evidence" +def test_ie_threshold_constants_are_frozen(): + """Freeze the IE-threshold constants at their examined-and-held values. + + benchmark/miner/CALIBRATION.md decided to HOLD these because no available + data justifies a change. The labeled constructed fixture + (test_miner_constructed) only guards extraction for that case — it sits at + ratio 1.0 and so cannot detect a threshold edit. This is the guard that + makes a threshold edit surface in CI: changing 0.5 / 3 here is a deliberate + recalibration that must update CALIBRATION.md (and the revisit + prerequisites: the human labeling pass + a re-mine) alongside this test. + """ + assert _LOW_CONFIDENCE_TOOL_RATIO == 0.5, ( + "IE low-confidence ratio changed — recalibrate per " + "benchmark/miner/CALIBRATION.md and update this guard." + ) + assert _MAX_TOLERATED_SOURCE_WARNINGS == 3, ( + "IE source-warning tolerance changed — recalibrate per " + "benchmark/miner/CALIBRATION.md and update this guard." + ) + + def test_insufficient_evidence_reason_lists_both_counts(): """When both gates trip, the reason names both counts.""" tools = [_tool(name="a", confidence="low"), _tool(name="b", confidence="low")]