From d65dcb758cc51f16bfa0e5f8b78eedfba0a0c548 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 16 Jun 2026 07:31:02 +0000 Subject: [PATCH] =?UTF-8?q?fix(plans)+test:=20#500=20review=20=E2=80=94=20?= =?UTF-8?q?OCR=20rides=20Full=20not=20Compressed;=20OCR-RT=20gate=20is=20e?= =?UTF-8?q?xact?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two review threads on the merged #500: - codex P2: "post-POC OCR rides Compressed" was wrong — Compressed lacks Energy+Plasticity, so the schema-gated transcode would silently drop confidence (→Energy) and repair-provenance (→Plasticity). Corrected: OCR rides Full (the only preset with the codec residues AND the hot lifecycle columns). The OCR-SCHEMA contract test now asserts Compressed lacks Energy/Plasticity (regression guard). - CodeRabbit Major: OCR-RT reversibility gate tightened 99% → 100% exact (a lossy residue→rank map is NOT "reversible"; tolerance moved to a separate quality probe). https://claude.ai/code/session_01D2WSmezQBNC3bUdHuGfGmo --- .../plans/ocr-canonical-soa-integration-v1.md | 12 +++++----- .claude/plans/ocr-probes-v1.md | 6 +++-- crates/lance-graph-contract/src/ocr.rs | 22 ++++++++++++++----- 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/.claude/plans/ocr-canonical-soa-integration-v1.md b/.claude/plans/ocr-canonical-soa-integration-v1.md index 2bf5c670..3b76539c 100644 --- a/.claude/plans/ocr-canonical-soa-integration-v1.md +++ b/.claude/plans/ocr-canonical-soa-integration-v1.md @@ -93,11 +93,13 @@ keyed by `identity`. Not bundled into the node. **ValueSchema:** do **NOT** add a 5th `ValueSchema::Ocr` enum variant — that is a contract-surface addition against the #496 §0 anti-invention guardrail. Shipped `ocr.rs` already transcodes by riding the POC-`Full` default (`classid_read_mode → -Full`) and writing only the tenants it populates. Post-POC, OCR rides the existing -**`Compressed`** preset (already = Fingerprint + HelixResidue + TurbovecResidue + -EntityType) — or, if a distinct tenant set is truly needed, **mint an OCR class** in -OGAR whose `ClassView` selects existing tenants (the §0-sanctioned opt-in route). -New capability = new column/class, never a new enum variant. +Full`) and writing only the tenants it populates. Post-POC, OCR rides **`Full`** — +the only existing preset carrying the codec residues (HelixResidue + TurbovecResidue) +AND the hot columns the §4 writeback needs (Energy for confidence, Plasticity for the +repair stamp). `Compressed` lacks Energy/Plasticity and `Cognitive` lacks the +residues, so neither fits OCR (codex P2 on #500). A leaner OCR row would need an +operator-minted preset — that is an operator decision, not a plan default; the rule +that holds is **no new enum variant from a plan**. ## 4. Repair: DeepNSM + CAM/PQ nearest-valid-token (D-OCR-52) diff --git a/.claude/plans/ocr-probes-v1.md b/.claude/plans/ocr-probes-v1.md index ee3e3203..6194a5f7 100644 --- a/.claude/plans/ocr-probes-v1.md +++ b/.claude/plans/ocr-probes-v1.md @@ -27,8 +27,10 @@ turbovec PQ residue), attempt to recover the **rank** from the residue bytes ALONE (no stored-rank lookup). Needs deepnsm `Codebook` + helix `Signed360` wired in one crate (they are not today — that wiring is itself part of the gate). -- **Pass:** ≥ 99 % of the 4096-word vocab round-trips residue→rank→word exactly. -- **Fail:** < 99 %, OR recovery requires the original rank as input ⇒ "reversible +- **Pass:** **100 %** of the 4096-word vocab round-trips residue→rank→word exactly — + a reversibility gate must be exact; a single miss fails it (a lossy map is NOT + "reversible"). Any tolerance belongs in a separate *quality* probe, never this gate. +- **Fail:** any miss, OR recovery requires the original rank as input ⇒ "reversible without a hash" is FALSE; the corrected plans already say text = identity → content-store lookup, codebook = repair signal (this probe confirms or lifts that). - **Cost:** ~80 LOC once deepnsm+helix are co-located; the wiring is the real work. diff --git a/crates/lance-graph-contract/src/ocr.rs b/crates/lance-graph-contract/src/ocr.rs index 6724a942..034c9c2e 100644 --- a/crates/lance-graph-contract/src/ocr.rs +++ b/crates/lance-graph-contract/src/ocr.rs @@ -237,9 +237,10 @@ mod tests { fn ocr_schema_fit_rides_existing_preset_no_new_variant() { // Probe OCR-SCHEMA (.claude/plans/ocr-probes-v1.md): the OCR value tenants // fit an EXISTING ValueSchema preset, so a 5th `ValueSchema::Ocr` enum variant - // is NOT needed (#496 §0 anti-invention). The codec-residue set OCR rides — - // HelixResidue + TurbovecResidue + EntityType (+ Fingerprint) — is exactly - // `Compressed`; everything else OCR could want is in the POC `Full` default. + // is NOT needed (#496 §0 anti-invention). `Compressed` carries the codec + // residues — but OCR also writes confidence→Energy + repair→Plasticity, which + // `Compressed` LACKS, so OCR rides `Full` (the only preset with residues AND + // the hot lifecycle columns), not `Compressed` (codex P2 on #500). let compressed = ValueSchema::Compressed; for t in [ ValueTenant::HelixResidue, @@ -249,11 +250,20 @@ mod tests { ] { assert!( compressed.has(t), - "Compressed already carries {t:?} — OCR rides it" + "Compressed carries the codec residue {t:?}" ); } - // The shipped transcode rides POC `Full`, which carries every tenant OCR touches - // (incl. Meta anchor / Energy confidence / Plasticity provenance). + // ...but NOT the hot columns OCR's writeback needs — Compressed alone drops them. + assert!( + !compressed.has(ValueTenant::Energy), + "Compressed lacks Energy" + ); + assert!( + !compressed.has(ValueTenant::Plasticity), + "Compressed lacks Plasticity" + ); + // OCR rides `Full`, which carries every tenant OCR touches (residues + Meta + // anchor + Energy confidence + Plasticity provenance + EntityType). let full = ValueSchema::Full; for t in [ ValueTenant::HelixResidue,