From f7e34267ef22a14436e61d12f3490d5eb52eb9e9 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Jun 2026 11:06:49 +0000 Subject: [PATCH 1/3] =?UTF-8?q?draft(contract):=20ContentStore=20=E2=80=94?= =?UTF-8?q?=20content-addressed=20cold=20text=20store?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Draft reference for the AriGraph/OSINT episodic-arc wiring (D-CC-ARI-3), parked on its own branch off merged main (content_store does not yet exist on main). Zero-dep typed surface in lance-graph-contract: - ContentId(u64) = fnv1a of the bytes (canon hash, stable across versions — the correct content address; DefaultHasher must never key one). - SourceSpan{ContentId,u32,u32} = the typed (source_id,start,end) form of template-equivalence's provenance; is_cited() = "no source span -> no claim". - ContentStore (cold read, resolve -> Option<&[u8]> zero-copy slice) + ContentSink (idempotent put -> dedup by content-address). Hot path touches only ContentId/SourceSpan; bytes hydrate cold at the membrane (ADR-022). Logic-complete + self-reviewed; cargo verification deferred (worktree was disk/sibling-constrained). Run `cargo test -p lance-graph-contract content_store` in a full checkout before merge. Author canonically or supersede as the other session's content_store work lands. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01VGXeWN4XfVjteBVcVeuLo4 --- .../lance-graph-contract/src/content_store.rs | 240 ++++++++++++++++++ crates/lance-graph-contract/src/lib.rs | 1 + 2 files changed, 241 insertions(+) create mode 100644 crates/lance-graph-contract/src/content_store.rs diff --git a/crates/lance-graph-contract/src/content_store.rs b/crates/lance-graph-contract/src/content_store.rs new file mode 100644 index 00000000..24f9e808 --- /dev/null +++ b/crates/lance-graph-contract/src/content_store.rs @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! `content_store` — content-addressed cold text/blob store contract (zero-dep). +//! +//! The episodic/OSINT **text table**: `ContentId` (the `fnv1a` hash of the bytes) +//! → bytes, resolved **cold, at the membrane** — never in the hot path. This is +//! the typed surface for the rule the OGAR canon + `I-VSA-IDENTITIES` Test 0 +//! (register laziness) demand: *the reference is the identity, never a serialized +//! pointer/offset inlined in the SoA*. +//! +//! ## Three invariants this encodes +//! +//! 1. **The join key IS the identity.** Nothing variable-length enters the 512 B +//! node. The node carries only a fixed-size [`ContentId`] (a value tenant); +//! the text lives in a columnar table next to it and joins by id. No pointer +//! field, no budget break. +//! 2. **Content-address, not raw GUID.** OSINT sources are shared (one document +//! backs many observations). [`ContentId::of`] hashes the bytes, so identical +//! sources dedup (many episodic edges → one source row). Uses [`crate::hash::fnv1a`] +//! — **stable across versions/platforms** (unlike `DefaultHasher`, which must +//! never key a content address; see `TECH_DEBT` re `WitnessEntry::tie_break_hash`). +//! 3. **Hot/cold firewall (ADR-022).** [`ContentStore::resolve`] is the COLD / +//! 
membrane surface: bytes are materialized only when genuinely needed (LLM +//! hydration, rendering, citing). The hot path (SIMD sweep, resonance, +//! AriGraph edge traversal, family-basin routing) touches only the fixed-size +//! [`ContentId`] + [`SourceSpan`] — the fingerprint is the hot-path stand-in +//! for the text; this trait is never called during computation. +//! +//! ## Provenance: `SourceSpan` is the typed `(source_id, start, end)` +//! +//! The merged `template-equivalence` provenance model uses +//! `source_spans: Vec<(String, usize, usize)>` = `(source_id, start, end)`. +//! [`SourceSpan`] is its fixed-size typed form: `source_id` IS a [`ContentId`] +//! (the content-table key), `start`/`end` index into the resolved bytes. The +//! gate "no source span → no claim" is literally [`SourceSpan::is_cited`]. + +use crate::hash::fnv1a; + +/// A content address: the `fnv1a`-64 hash of the stored bytes. +/// +/// Identical bytes ⇒ identical id ⇒ natural dedup. `ContentId(0)` is the +/// reserved **empty/sentinel** (no content), mirroring the canon's zero-fallback +/// ladder (a zero tier = "not consulted", never a valid address). +/// +/// Note: 64-bit fnv1a is the workspace-canonical hash and is sufficient for +/// OSINT-corpus scale; if a corpus ever approaches birthday-collision range +/// (~2^32 distinct sources), widen to a 128-bit content address — the upgrade +/// is local to this type. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Default)] +pub struct ContentId(pub u64); + +impl ContentId { + /// Content-address arbitrary bytes. + #[must_use] + pub fn of(bytes: &[u8]) -> Self { + Self(fnv1a(bytes)) + } + + /// Content-address a string slice. + #[must_use] + pub fn of_str(s: &str) -> Self { + Self(fnv1a(s.as_bytes())) + } + + /// The reserved empty/sentinel address (no content). 
+ #[must_use] + pub fn is_sentinel(self) -> bool { + self.0 == 0 + } +} + +/// A provenance reference: which content, and the `[start, end)` byte span within +/// it. Fixed-size and `Copy` — it lives on the episodic node (a value tenant); +/// the bytes resolve cold via [`ContentStore`]. The typed form of +/// `template-equivalence`'s `(source_id, start, end)`. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Default)] +pub struct SourceSpan { + /// The content-table key (the source the span cites). + pub content: ContentId, + /// Inclusive start byte offset into the resolved content. + pub start: u32, + /// Exclusive end byte offset. + pub end: u32, +} + +impl SourceSpan { + /// New span; `end` is clamped to be `>= start`. + #[must_use] + pub fn new(content: ContentId, start: u32, end: u32) -> Self { + Self { content, start, end: end.max(start) } + } + + /// Span length in bytes. + #[must_use] + pub fn len(self) -> u32 { + self.end - self.start + } + + /// Whether the span covers zero bytes. + #[must_use] + pub fn is_empty(self) -> bool { + self.end <= self.start + } + + /// "No source span → no claim": a claim is cited iff it carries a non-empty + /// span into real (non-sentinel) content. The provenance gate's predicate. + #[must_use] + pub fn is_cited(self) -> bool { + !self.content.is_sentinel() && !self.is_empty() + } +} + +/// Failure resolving content from the cold store. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum ContentError { + /// No content stored under this id. + NotFound, + /// The span's `[start, end)` exceeds the resolved content's length. + SpanOutOfBounds, +} + +impl core::fmt::Display for ContentError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self { + ContentError::NotFound => write!(f, "content-store: id not found"), + ContentError::SpanOutOfBounds => write!(f, "content-store: span out of bounds"), + } + } +} + +/// The content-addressed **cold** store (read side). 
+/// +/// Lives in the zero-dep contract so any consumer can declare it without pulling +/// Arrow/Lance. Implemented downstream by a Lance text table (and, in-RAM, by the +/// AriGraph `EpisodicMemory` / `WitnessCorpus` acting as the cold tier). +/// `resolve` returns a borrow into the backing store (mmap'd Lance buffer or +/// in-RAM `Bytes`), so reads are zero-copy at the membrane. +pub trait ContentStore { + /// Resolve the full content bytes for an id. `None` if absent. COLD path only. + fn resolve(&self, id: ContentId) -> Option<&[u8]>; + + /// Resolve a span's bytes (cold). Default composes [`resolve`](Self::resolve) + /// with a bounds check. + fn resolve_span(&self, span: SourceSpan) -> Result<&[u8], ContentError> { + let bytes = self.resolve(span.content).ok_or(ContentError::NotFound)?; + bytes + .get(span.start as usize..span.end as usize) + .ok_or(ContentError::SpanOutOfBounds) + } + + /// Whether an id is present without committing to a borrow shape. + fn contains(&self, id: ContentId) -> bool { + self.resolve(id).is_some() + } +} + +/// The content-addressed store (write side, membrane-only). +/// +/// Ingest is idempotent by construction: identical bytes ⇒ same [`ContentId`] ⇒ +/// dedup (the many-episodes → one-source rule). Writing happens at the cold +/// membrane during ingestion, never on the hot path. +pub trait ContentSink { + /// Store `bytes`, returning their content address. Idempotent. + fn put(&mut self, bytes: &[u8]) -> ContentId; + + /// Store a string slice. + fn put_str(&mut self, s: &str) -> ContentId { + self.put(s.as_bytes()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::HashMap; + + /// Reference in-RAM impl (the cold tier mirror) used to exercise the contract. 
+ #[derive(Default)] + struct MemStore { + map: HashMap<ContentId, Vec<u8>>, + } + impl ContentStore for MemStore { + fn resolve(&self, id: ContentId) -> Option<&[u8]> { + self.map.get(&id).map(Vec::as_slice) + } + } + impl ContentSink for MemStore { + fn put(&mut self, bytes: &[u8]) -> ContentId { + let id = ContentId::of(bytes); + self.map.entry(id).or_insert_with(|| bytes.to_vec()); + id + } + } + + #[test] + fn content_address_is_stable_and_dedups() { + let a = ContentId::of_str("the same source document"); + let b = ContentId::of_str("the same source document"); + assert_eq!(a, b); // identical bytes ⇒ identical id (dedup key) + assert_ne!(a, ContentId::of_str("a different document")); + } + + #[test] + fn put_is_idempotent_one_row_per_source() { + let mut s = MemStore::default(); + let id1 = s.put_str("shared OSINT source"); + let id2 = s.put_str("shared OSINT source"); // many episodes → one source + assert_eq!(id1, id2); + assert_eq!(s.map.len(), 1); + } + + #[test] + fn resolve_span_returns_the_cited_slice() { + let mut s = MemStore::default(); + let id = s.put_str("Alice met Bob in Paris."); + let span = SourceSpan::new(id, 10, 13); // "Bob" + assert_eq!(s.resolve_span(span).unwrap(), b"Bob"); + assert!(span.is_cited()); + } + + #[test] + fn out_of_bounds_and_missing_fail() { + let mut s = MemStore::default(); + let id = s.put_str("short"); + assert_eq!(s.resolve_span(SourceSpan::new(id, 0, 999)), Err(ContentError::SpanOutOfBounds)); + assert_eq!( + s.resolve_span(SourceSpan::new(ContentId(123), 0, 1)), + Err(ContentError::NotFound) + ); + } + + #[test] + fn uncited_span_is_rejected_by_the_gate() { + // sentinel content, or empty span ⇒ not a citation + assert!(!SourceSpan::new(ContentId(0), 0, 5).is_cited()); + assert!(!SourceSpan::new(ContentId(7), 5, 5).is_cited()); + assert!(SourceSpan::new(ContentId(7), 0, 5).is_cited()); + } +} diff --git a/crates/lance-graph-contract/src/lib.rs b/crates/lance-graph-contract/src/lib.rs index 0356d9e9..a3248fcf 100644 --- 
a/crates/lance-graph-contract/src/lib.rs +++ b/crates/lance-graph-contract/src/lib.rs @@ -62,6 +62,7 @@ pub mod codegen_spine; pub mod cognitive_shader; pub mod collapse_gate; pub mod container; +pub mod content_store; pub mod counterfactual; pub mod crystal; pub mod cycle_accumulator; From 10b9bb5dc0d4fb78824e10cb1d75e82b6a87a1d0 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Jun 2026 15:39:47 +0000 Subject: [PATCH 2/3] =?UTF-8?q?feat(contract):=20ContentStore=20=E2=80=94?= =?UTF-8?q?=20content-addressed=20cold=20text=20store?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The content-addressed cold text/blob store for the AriGraph/OSINT episodic arc (D-CC-ARI-3). Zero-dep typed surface in lance-graph-contract: - ContentId(u64) = hash::fnv1a of the bytes (stable across versions — the correct content address; DefaultHasher must never key one; 0 = sentinel). - SourceSpan{ContentId,u32,u32} = the fixed-size Copy typed form of template-equivalence's (source_id,start,end); is_cited() = "no source span -> no claim". - ContentStore (cold resolve -> Option<&[u8]> zero-copy slice) + ContentSink (idempotent put -> dedup by content-address: many episodes -> one source). Hot/cold firewall (ADR-022): the hot path touches only the fixed-size ContentId/SourceSpan; bytes hydrate cold at the membrane. Nothing variable-length enters the 512 B node. Additive, zero-dep; +6 tests, clippy clean. Board: LATEST_STATE Contract Inventory. Consumers: rs-graph-llm/episodic-arc-task, template-equivalence. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01VGXeWN4XfVjteBVcVeuLo4 --- .claude/board/LATEST_STATE.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.claude/board/LATEST_STATE.md b/.claude/board/LATEST_STATE.md index 22be2f35..96fdd984 100644 --- a/.claude/board/LATEST_STATE.md +++ b/.claude/board/LATEST_STATE.md @@ -120,6 +120,8 @@ ## Current Contract Inventory (lance-graph-contract) +> **2026-06-21 — ADDED (content-store for the AriGraph/OSINT episodic arc)**: `lance_graph_contract::content_store::{ContentId, SourceSpan, ContentError, ContentStore, ContentSink}` — the content-addressed **cold text/blob store** contract. `ContentId(u64)` = `hash::fnv1a` of the bytes (stable across versions — the correct content address; `DefaultHasher` must never key one; `0` = sentinel). `SourceSpan{ContentId,u32,u32}` = the fixed-size, `Copy` typed form of `template-equivalence`'s `(source_id,start,end)` provenance; `is_cited()` = "no source span → no claim" (non-sentinel content + non-empty span). `ContentStore` (cold read: `resolve(id) -> Option<&[u8]>` zero-copy slice into the mmap/backing store; `resolve_span`/`contains` defaulted) + `ContentSink` (idempotent `put -> ContentId`, dedup by content-address: many episodes → one source row). **Hot/cold firewall (ADR-022)**: the hot path (SIMD sweep, AriGraph edge traversal) touches only the fixed-size `ContentId`/`SourceSpan`; bytes hydrate cold at the membrane (the fingerprint is the hot-path stand-in for text). Nothing variable-length enters the 512 B node. Additive, zero-dep; +6 tests (stable/dedup, idempotent put, resolve_span slice, OOB/missing errors, uncited-rejected); clippy clean. Consumers: `rs-graph-llm/episodic-arc-task` (replaces its local fnv1a), `template-equivalence` (typed provenance). Plan: `.claude/plans/arigraph-osint-episodic-v1.md` (D-CC-ARI-3). Branch `claude/content-store-contract-draft`. 
+ > **2026-06-18 — ADDED (probe-excel-compute-dag-v1 Inc 0, the `compute_dag` Core gap)**: `lance_graph_contract::class_view::{ComputeEdge, compute_dag_is_acyclic}` + `ClassView::compute_dag(class) -> &[ComputeEdge]` (default `&[]`, zero-fallback). `ComputeEdge {target: u8, inputs: &'static [u8]}` is the harvest-sourced recompute edge (`emitted_by` target ← `depends_on` inputs; field positions index the class `FieldMask`), `const`-constructible like `MethodSig`/`ActionDef` (the harvest IS the manifest). `compute_dag_is_acyclic` is the **registry-build gate** — a cyclic recompute DAG (formula loop / `@api.depends` cycle / self-loop) is rejected at build (Kahn over ≤64 positions, allocation-free; out-of-range positions ignored, no panic, mirrors `FieldMask::from_positions`). This is the Core home for computed-field recompute *dispatch* that EVERY computed-field AR consumer needs (Odoo `@api.depends`, Excel formulas, medcare lab-trends, woa calc, q2 cells — they reduce to a sheet; `E-EXCEL-SHADER-PROJECTION`) and the NNUE-incremental existence-proof shape (`E-CHESS-TENSOR-PROVEN`). **Layout-preserving**: a default trait method + a free fn, resolution metadata ABOVE the SoA, stores nothing on the row, zero `NODE_ROW_STRIDE`/`ENVELOPE_LAYOUT_VERSION` impact (core-gap-auditor's EXTEND-CORE, never an adapter-state hack). The instance recompute that consumes it is gated per-cell by the cycle-aware `write_row` (`E-SOA-CYCLE-OWNERSHIP`). Additive, zero-dep; +4 tests (default-empty, acyclic-chain, cycle/self-loop/3-cycle rejected, out-of-range ignored); 10/10 class_view, clippy/fmt clean. Sibling `ClassView::constraints` (`validation_kind`-sourced) deferred to Inc-follow-up. Plan: `.claude/plans/probe-excel-compute-dag-v1.md`. Branch `claude/particle-wave-click-epiphany`. 
> **2026-06-18 — ADDED (D-DO-ARM-1, the OGAR DO arm)**: `lance_graph_contract::action::{ActionState, StateGuard, ActionDef, ClassActions, actions_for, effective_actions, ActionInvocation}` — the Perdurant DO arm completing the OGAR IR (the action-axis sibling of `codegen_manifest`'s `MethodSig`/THINK). Both the 4-agent `sale_order` AR→DO probe (runtime-archaeologist) AND the merged cross-repo PR survey (ruff/OGAR/lance-graph/openproject/tesseract) agreed this was the ONE missing wire: the THINK arm (`classid → ClassView`, `has_function → MethodSig`) is converged + merged; the DO-arm `ActionInvocation`/`ActionDef` type was ABSENT. **`ActionDef`** (static, `const`-constructible, all `&'static`/`Copy`): `predicate` (= harvested `has_function` method), `object_class` (classid), `exec` (`ExecTarget` incl `SurrealQl`), `guard` (`StateGuard` = KausalSpec field==value), `required_role` (RBAC), `overrides` (OGAR `classid→ClassView` inheritance). **`ClassActions`+`actions_for`** (zero-fallback) mirror `ClassMethods`/`methods_for`. **`effective_actions(parent, child)`** = OGAR inheritance on the action axis (child overrides parent by predicate). **`ActionInvocation`** (dynamic, `Copy`): lifecycle `ActionState{Pending→Committed|Failed|Cancelled}` (sticky terminals), S2.5 `cycle` stamp, idempotency/trace keys, HLC `emitted_at_millis`. **`ActionInvocation::commit(def, actor, impact, now)`** is the gated egress — RBAC FIRST (`auth::ActorContext` must hold `required_role` or be admin → else `Failed`), THEN MUL impact (`mul::GateDecision`: `Flow→Committed`+stamped, `Hold→`Pending/escalate, `Block→Cancelled`). This IS "commit to the external consumer (odoo/openproject/woa/tesseract) after the cycle decides sound." Dispatched via `UnifiedStep`/`ExecTarget`, NOT a per-crate endpoint. Additive, zero-dep. +5 tests green. Consumer reference: `docs/OGAR_CONSUMER_API.md`. Branch `claude/soa-write-deinterlace-inc2`. 
From 6103438b00d8eb9cbad306b307bf3da89c777b91 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Jun 2026 15:48:59 +0000 Subject: [PATCH 3/3] fix(content_store): saturating SourceSpan::len (codex P2 #581) SourceSpan's fields are public, so a consumer can build end < start (bypassing new()'s clamp); the old `end - start` panicked in debug and wrapped to a huge u32 in release, inconsistent with is_empty(). Use saturating_sub so len() reports 0 for a malformed span, matching is_empty()/is_cited(). +1 test (malformed_span_len_saturates_not_panics). Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01VGXeWN4XfVjteBVcVeuLo4 --- .../lance-graph-contract/src/content_store.rs | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/crates/lance-graph-contract/src/content_store.rs b/crates/lance-graph-contract/src/content_store.rs index 24f9e808..43f5c0ec 100644 --- a/crates/lance-graph-contract/src/content_store.rs +++ b/crates/lance-graph-contract/src/content_store.rs @@ -91,10 +91,13 @@ impl SourceSpan { Self { content, start, end: end.max(start) } } - /// Span length in bytes. + /// Span length in bytes. Saturating: a malformed span (`end < start`, only + /// constructible by bypassing [`new`](Self::new) via the public fields) + /// reports `0`, consistent with [`is_empty`](Self::is_empty) — never panics + /// (debug) or wraps to a huge value (release). #[must_use] pub fn len(self) -> u32 { - self.end - self.start + self.end.saturating_sub(self.start) } /// Whether the span covers zero bytes. @@ -237,4 +240,14 @@ mod tests { assert!(!SourceSpan::new(ContentId(7), 5, 5).is_cited()); assert!(SourceSpan::new(ContentId(7), 0, 5).is_cited()); } + + #[test] + fn malformed_span_len_saturates_not_panics() { + // Public fields let a consumer build end < start, bypassing new()'s clamp. + // len() must saturate to 0 (consistent with is_empty), never panic/wrap. 
+ let bad = SourceSpan { content: ContentId(7), start: 13, end: 0 }; + assert_eq!(bad.len(), 0); + assert!(bad.is_empty()); + assert!(!bad.is_cited()); + } }