Merge pull request #191 from AdaWorldAPI/claude/continue-ndarray-x0Oaw

AdaWorldAPI · web-flow · commit c10e1e03d4f4 · 2026-05-21T13:54:00.000+02:00
fix+test: cpu_tier_for_cpu cross-arch + Pillar 12/13/14 drift-checks
diff --git a/src/hpc/pillar/hhtl_contraction.rs b/src/hpc/pillar/hhtl_contraction.rs
@@ -448,4 +448,96 @@ mod tests {
         assert!((r1.psd_rate - r2.psd_rate).abs() < 1e-12);
         assert!((r1.lognorm_concentration - r2.lognorm_concentration).abs() < 1e-12);
     }
+
+    /// Drift-detection: the pillar's `bundle_step` independently re-derives
+    /// the bit-mixing bundle operator. The production code path at
+    /// `crate::hpc::dn_tree::bundle_into` (PR #189, exposed `pub(crate)`)
+    /// is the substrate the pillar is defending. This test runs both on
+    /// seed-aligned SplitMix64 RNGs and asserts the first 16 u64 words of
+    /// production's `GraphHV.channels[0]` agree bit-exactly with the
+    /// pillar's `[u64; WORDS]` output.
+    ///
+    /// # Why this is a bit-exact (not ε-tolerant) check
+    ///
+    /// Per the substrate's bit-exactness contract (W1a + the data-flow
+    /// rules), bundling is a *gated XOR* (Bernoulli-mixture per bit) —
+    /// the mask draws come from `SplitMix64` which is bit-deterministic.
+    /// Both pillar's `probability_mask` and production's
+    /// `make_probability_mask` consume the same number of `next_u64()`
+    /// draws at lr=0.25 (n=ceil(-log2(0.25))=2 per word), so the masks
+    /// for the first `WORDS` words align exactly across the two
+    /// functions. The remaining 240 words of channel 0 (and channels 1/2)
+    /// consume extra RNG draws on the production side; those don't affect
+    /// the first WORDS=16 words because each word is independent.
+    ///
+    /// # Why not lr=0.5
+    ///
+    /// Production's `make_probability_mask(0.5)` has a latent
+    /// infinite-recursion bug: `p >= 0.5` recurses with `1.0 - 0.5 = 0.5`
+    /// forever. Pillar's `probability_mask` uses `p > 0.5` (strict) and
+    /// falls through to the AND-cascade at p=0.5. Real production usage
+    /// (DNConfig default lr=0.03, boost up to ~30 → effective_lr~0.9)
+    /// never hits 0.5 exactly, so the bug is dormant. This drift-check
+    /// uses lr=0.25 where both implementations agree; the lr=0.5 case
+    /// is recorded as a follow-up.
+    #[test]
+    fn pillar_13_matches_production_bundle_into() {
+        use crate::hpc::cam_index::GraphHV;
+        use crate::hpc::dn_tree::{bundle_into, SplitMix64 as DnSplitMix64};
+
+        const N_TRIALS: u32 = 16;
+        const TEST_LR: f64 = 0.25;
+
+        // Both SplitMix64 implementations use identical algorithm (same
+        // multiplier constants 0x9E3779B97F4A7C15, 0xBF58476D1CE4E5B9,
+        // 0x94D049BB133111EB and same shift sequence), so identical seeds
+        // → identical sequences. Both functions consume the same number
+        // of next_u64() draws per word at p=0.25 (n=ceil(-log2(0.25))=2),
+        // so the mask sequences align bit-exactly across the first WORDS
+        // positions of each call.
+        //
+        // The RNGs MUST be re-seeded per trial because production's
+        // bundle_into consumes 48× more RNG draws per call (3 channels ×
+        // 256 words × 2 draws = 1536) than pillar's bundle_step (16 words
+        // × 2 draws = 32). Without re-seeding, post-trial-0 RNG states
+        // diverge.
+
+        for trial in 0..N_TRIALS {
+            // Per-trial seed for both bundling RNGs (must be the same so
+            // masks align). Inputs come from a separate stream so the
+            // bundling RNG state isn't disturbed by input generation.
+            let trial_seed = PILLAR_13_SEED.wrapping_add(trial as u64);
+            let mut rng_pillar = SplitMix64::new(trial_seed);
+            let mut rng_prod = DnSplitMix64::new(trial_seed);
+
+            let mut rng_inputs = SplitMix64::new(trial_seed.wrapping_mul(0x9E37_79B9_7F4A_7C15));
+            let x = random_bits(&mut rng_inputs);
+            let y = random_bits(&mut rng_inputs);
+
+            // Pillar side: WORDS=16 u64 mixing; 0.25 is exact in f32, so the cast is lossless
+            let out_pillar = bundle_step(&x, &y, TEST_LR as f32, &mut rng_pillar);
+
+            // Production side: pack x/y into channel 0 of a GraphHV,
+            // zero the rest. Pillar's `bundle_step(x, y, lr, rng)` is
+            // "keep x where mask=0, take y where mask=1"; production's
+            // `bundle_into` contract is the same with `current` ↔ x and
+            // `hv` ↔ y (per src/hpc/dn_tree.rs line 125). boost=1.0 means
+            // effective_lr = lr * 1.0 = TEST_LR (matching pillar).
+            let mut hv_x = GraphHV::zero();
+            let mut hv_y = GraphHV::zero();
+            hv_x.channels[0].words[..WORDS].copy_from_slice(&x);
+            hv_y.channels[0].words[..WORDS].copy_from_slice(&y);
+            let hv_out = bundle_into(&hv_x, &hv_y, TEST_LR, 1.0, &mut rng_prod);
+
+            // Compare first WORDS=16 u64 words bit-exactly
+            for w in 0..WORDS {
+                assert_eq!(
+                    out_pillar[w], hv_out.channels[0].words[w],
+                    "Pillar/bundle_into drift at trial {trial} word {w}: \
+                     pillar=0x{:016x} prod=0x{:016x}",
+                    out_pillar[w], hv_out.channels[0].words[w]
+                );
+            }
+        }
+    }
 }
diff --git a/src/hpc/pillar/ogit_lattice.rs b/src/hpc/pillar/ogit_lattice.rs
@@ -450,4 +450,108 @@ mod tests {
         assert!((r1.psd_rate - r2.psd_rate).abs() < 1e-12);
         assert!((r1.lognorm_concentration - r2.lognorm_concentration).abs() < 1e-12);
     }
+
+    /// Drift-detection: the pillar's `transitive_closure` independently
+    /// derives the partial-order closure on synthetic DAGs. The production
+    /// code path at `crate::hpc::ogit_bridge::schema::OntologySchema::is_ancestor`
+    /// (PR #189, exposed `pub`) is the substrate the pillar is defending.
+    ///
+    /// This test generates a small **single-parent** tree (production's
+    /// `OntologySchema.parent: Option<Box<str>>` is single-parent, so the
+    /// drift-check operates on a strict subset of pillar's DAG family),
+    /// builds it as Turtle source, runs both:
+    ///   - an inlined, N-sized Warshall closure (standing in for pillar's
+    ///     `transitive_closure`) on the equivalent boolean direct-edge matrix
+    ///   - production's `is_ancestor(a, d)` on the parsed `OntologySchema`
+    /// and asserts agreement on EVERY (ancestor, descendant) pair.
+    ///
+    /// # Pillar/production closure axes
+    ///
+    /// Pillar `le[i * N + j] = true` means "type `i` ≤ type `j`" (i.e.,
+    /// `i` extends/is-subclass-of `j`). Production
+    /// `is_ancestor(a, d) = true` means "a is an ancestor of d" (i.e.,
+    /// d extends/is-subclass-of a). So the equivalence is:
+    /// `pillar.le[i][j] == production.is_ancestor(types[j], types[i])`.
+    #[cfg(feature = "ogit_bridge")]
+    #[test]
+    fn pillar_14_matches_production_is_ancestor() {
+        use crate::hpc::ogit_bridge::schema::OntologySchema;
+        use crate::hpc::ogit_bridge::turtle_parser::TurtleParser;
+
+        // Small N — Turtle parsing scales linearly but we want a fast test.
+        const N: usize = 8;
+
+        // Type names: ogit:T0, ogit:T1, …, ogit:T{N-1}
+        let names: Vec<String> = (0..N).map(|i| format!("ogit:T{i}")).collect();
+
+        // Generate a deterministic single-parent tree. Type 0 is the root
+        // (parent[0] stays at the never-read usize::MAX sentinel); type k>0
+        // picks its parent from {0, …, k-1}. Seed-anchored for reproducibility.
+        let mut rng = SplitMix64::new(PILLAR_14_SEED);
+        let mut parent = vec![usize::MAX; N];
+        for k in 1..N {
+            // Sample from 0..k (exclusive) by modulo; the range is tiny so
+            // modulo-bias is negligible and reproducibility matters more.
+            parent[k] = (rng.next_u64() as usize) % k;
+        }
+
+        // Build Turtle source and parse to OntologySchema.
+        let mut src = String::from(
+            "@prefix ogit: <http://www.purl.org/ogit/> .\n\
+             @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n",
+        );
+        src.push_str(&format!("{} a rdfs:Class .\n", names[0]));
+        for k in 1..N {
+            src.push_str(&format!("{} a rdfs:Class ; rdfs:subClassOf {} .\n", names[k], names[parent[k]]));
+        }
+        let triples = TurtleParser::parse(&src).unwrap();
+        let schema = OntologySchema::from_triples(&triples).unwrap();
+
+        // Build the equivalent direct-edge boolean matrix in pillar's
+        // [N × N] flat layout. direct[k * N + parent[k]] = true.
+        let mut direct = vec![false; N * N];
+        for k in 1..N {
+            direct[k * N + parent[k]] = true;
+        }
+        // Compute the reflexive-transitive closure inline (Warshall) rather
+        // than calling the full Pillar 14 version, which is N_TYPES-sized.
+        let mut le = vec![false; N * N];
+        for i in 0..N {
+            le[i * N + i] = true;
+            for j in 0..N {
+                if direct[i * N + j] {
+                    le[i * N + j] = true;
+                }
+            }
+        }
+        for kk in 0..N {
+            for i in 0..N {
+                if !le[i * N + kk] {
+                    continue;
+                }
+                for j in 0..N {
+                    if le[kk * N + j] {
+                        le[i * N + j] = true;
+                    }
+                }
+            }
+        }
+
+        // Cross-check every (ancestor, descendant) pair, i == j included (le is reflexive).
+        let mut total = 0u32;
+        for i in 0..N {
+            for j in 0..N {
+                let pillar_says = le[i * N + j]; // i extends j (j is ancestor of i)
+                let prod_says = schema.is_ancestor(&names[j], &names[i]);
+                assert_eq!(
+                    pillar_says, prod_says,
+                    "Pillar/is_ancestor drift on pair (ancestor={}, descendant={}): \
+                     pillar.le[{i}][{j}]={pillar_says} production.is_ancestor={prod_says}",
+                    names[j], names[i]
+                );
+                total += 1;
+            }
+        }
+        eprintln!("Pillar 14 ↔ is_ancestor agreement: {total} pair-checks pass over N={N} single-parent tree");
+    }
 }
diff --git a/src/hpc/pillar/splat_invariants.rs b/src/hpc/pillar/splat_invariants.rs
@@ -437,4 +437,52 @@ mod tests {
         assert!((r1.psd_rate - r2.psd_rate).abs() < 1e-12);
         assert!((r1.lognorm_concentration - r2.lognorm_concentration).abs() < 1e-12);
     }
+
+    /// Drift-detection: the pillar's `covariance_from_scale_quat`
+    /// independently re-derives `Σ = R(q) · diag(s²) · R(q)ᵀ`. The
+    /// production code path at `crate::hpc::splat3d::spd3::Spd3::from_scale_quat`
+    /// is the substrate the pillar is *defending*. This test runs both on
+    /// the same SplitMix64-seeded sample of 256 `(scale, quat)` pairs and
+    /// asserts absolute error < `1e-5` on a11, a12, a13, a22, a23, a33.
+    ///
+    /// Any divergence ≥ ε indicates one of two failure modes:
+    ///   (a) production drifted from the canonical quaternion-rotation
+    ///       formula (the pillar definition wins by design — fix the
+    ///       production code), or
+    ///   (b) the pillar itself drifted (audit `covariance_from_scale_quat`
+    ///       against Kerbl 2023 Eq. 3 before changing).
+    ///
+    /// This is the *coupling* the per-pillar docstring promises:
+    /// production and pillar share no code, but they share a CI gate
+    /// that compares them point-for-point.
+    #[test]
+    fn pillar_12_matches_production_spd3_from_scale_quat() {
+        use crate::hpc::splat3d::spd3::Spd3;
+
+        const N: u32 = 256;
+        let mut rng = SplitMix64::new(PILLAR_12_SEED);
+        let mut max_abs_err: f32 = 0.0;
+
+        for _ in 0..N {
+            let s = [sample_scale_axis(&mut rng), sample_scale_axis(&mut rng), sample_scale_axis(&mut rng)];
+            let q = sample_unit_quaternion(&mut rng);
+
+            let pillar = covariance_from_scale_quat(s, q);
+            let prod = Spd3::from_scale_quat(s, q);
+            let prod_ut = [prod.a11, prod.a12, prod.a13, prod.a22, prod.a23, prod.a33];
+
+            for (i, (&p, &pr)) in pillar.iter().zip(prod_ut.iter()).enumerate() {
+                let err = (p - pr).abs();
+                if err > max_abs_err {
+                    max_abs_err = err;
+                }
+                assert!(
+                    err < 1e-5,
+                    "Pillar/Spd3 drift at lane {i}: pillar={p:.7} prod={pr:.7} err={err:.2e} s={s:?} quat={q:?}"
+                );
+            }
+        }
+
+        eprintln!("Pillar 12 ↔ Spd3::from_scale_quat agreement: max_abs_err={max_abs_err:.3e} over {N} pairs");
+    }
 }
diff --git a/src/simd_runtime/cpu_ops.rs b/src/simd_runtime/cpu_ops.rs
@@ -243,29 +243,42 @@ pub fn cpu_ops_for_tier(name: &str) -> Option<&'static CpuOps> {
     }
 }
 
-/// Lookup by GCC CPU codename (e.g. `"sapphirerapids"`,
-/// `"neoverse-v2"`, `"apple-m2"`). Maps the canonical GCC name to the
-/// dispatch tier the CPU lands in, sourced from the scrape recorded
-/// in the matrix doc § M.
+/// Look up a [`CpuOps`] by GCC CPU codename (e.g. `"sapphirerapids"`,
+/// `"neoverse-v2"`, `"apple-m2"`) for the **current build target**.
 ///
-/// Used for "what would this CPU pick?" introspection without
-/// touching CPUID on the running host — e.g. cross-compilation
-/// reports, deployment-planning tools, integration tests that want
-/// to assert tier selection for a named target without running on
-/// that silicon.
+/// Returns `Some(&'static CpuOps)` only when the named CPU's tier is
+/// reachable from the current `target_arch` (e.g. an x86_64 CPU name
+/// on an x86_64 build, an aarch64 CPU name on an aarch64 build).
+/// Cross-arch lookups — e.g. `cpu_ops_for_cpu("apple-m2")` on an
+/// x86_64 build — return `None` because the underlying NEON kernel
+/// fn pointers are compiled out and there is no honest `CpuOps` to
+/// return.
+///
+/// For pure introspection ("what tier would this CPU pick?", with no
+/// intent to call kernels), use [`cpu_tier_for_cpu`] instead — it is
+/// `cfg`-free and works for any build target.
 ///
 /// Returns `None` for unknown CPU names. Only modern (V8.2-A+ on
 /// aarch64, AVX-512+ or AVX-VNNI+ on x86_64) names are mapped — older
 /// silicon falls through to `cpu_ops_for_tier("scalar")` by
 /// convention if you really need it.
 pub fn cpu_ops_for_cpu(name: &str) -> Option<&'static CpuOps> {
-    cpu_ops_for_tier(cpu_to_tier(name)?)
+    cpu_ops_for_tier(cpu_tier_for_cpu(name)?)
 }
 
-/// Maps a GCC CPU codename to the [`CpuOps`] tier it lands in. Data
-/// from the scrape recorded in `.claude/knowledge/agnostic-surface-cpu-matrix.md`
-/// § M (aarch64) plus the GCC i386 cpu definitions for x86_64.
-fn cpu_to_tier(cpu: &str) -> Option<&'static str> {
+/// Look up the dispatch tier name (e.g. `"amx_int8"`, `"avx512vnni"`,
+/// `"neon"`) for a GCC CPU codename. Data from the scrape recorded
+/// in `.claude/knowledge/agnostic-surface-cpu-matrix.md` § M
+/// (aarch64) plus the GCC i386 cpu definitions for x86_64.
+///
+/// `cfg`-free — works for any build target regardless of `target_arch`.
+/// This is the right entry point for cross-target introspection:
+/// deployment-planning tools, cross-compilation reports, integration
+/// tests that assert "apple-m2 lands at the neon tier" without
+/// actually building for that silicon.
+///
+/// Returns `None` for unknown CPU names.
+pub fn cpu_tier_for_cpu(cpu: &str) -> Option<&'static str> {
     Some(match cpu {
         // x86_64 — AMX-INT8 hosts
         "sapphirerapids" | "graniterapids" | "graniterapids-d" | "emeraldrapids" => "amx_int8",
@@ -336,24 +349,41 @@ mod tests {
     }
 
     #[test]
-    fn cpu_ops_for_cpu_data_driven_lookup() {
-        // Spot-check the GCC-scraped mapping (matrix doc § M).
-        assert_eq!(cpu_to_tier("sapphirerapids"), Some("amx_int8"));
-        assert_eq!(cpu_to_tier("graniterapids"), Some("amx_int8"));
-        assert_eq!(cpu_to_tier("cascadelake"), Some("avx512vnni"));
-        assert_eq!(cpu_to_tier("znver4"), Some("avx512vnni"));
-        assert_eq!(cpu_to_tier("znver5"), Some("avx512vnni"));
-        assert_eq!(cpu_to_tier("alderlake"), Some("avxvnni"));
-        assert_eq!(cpu_to_tier("arrowlake"), Some("avxvnni"));
-        assert_eq!(cpu_to_tier("haswell"), Some("avx2_fma"));
-        assert_eq!(cpu_to_tier("znver3"), Some("avx2_fma"));
-
-        assert_eq!(cpu_to_tier("apple-m2"), Some("neon"));
-        assert_eq!(cpu_to_tier("neoverse-v2"), Some("neon"));
-        assert_eq!(cpu_to_tier("oryon-1"), Some("neon"));
-        assert_eq!(cpu_to_tier("grace"), Some("neon"));
-
-        assert_eq!(cpu_to_tier("totally-fake-cpu"), None);
+    fn cpu_tier_for_cpu_data_driven_lookup() {
+        // Spot-check the GCC-scraped mapping (matrix doc § M). This
+        // function is cfg-free — every assertion must hold on every
+        // build host, regardless of target_arch.
+        assert_eq!(cpu_tier_for_cpu("sapphirerapids"), Some("amx_int8"));
+        assert_eq!(cpu_tier_for_cpu("graniterapids"), Some("amx_int8"));
+        assert_eq!(cpu_tier_for_cpu("cascadelake"), Some("avx512vnni"));
+        assert_eq!(cpu_tier_for_cpu("znver4"), Some("avx512vnni"));
+        assert_eq!(cpu_tier_for_cpu("znver5"), Some("avx512vnni"));
+        assert_eq!(cpu_tier_for_cpu("alderlake"), Some("avxvnni"));
+        assert_eq!(cpu_tier_for_cpu("arrowlake"), Some("avxvnni"));
+        assert_eq!(cpu_tier_for_cpu("haswell"), Some("avx2_fma"));
+        assert_eq!(cpu_tier_for_cpu("znver3"), Some("avx2_fma"));
+
+        assert_eq!(cpu_tier_for_cpu("apple-m2"), Some("neon"));
+        assert_eq!(cpu_tier_for_cpu("neoverse-v2"), Some("neon"));
+        assert_eq!(cpu_tier_for_cpu("oryon-1"), Some("neon"));
+        assert_eq!(cpu_tier_for_cpu("grace"), Some("neon"));
+
+        assert_eq!(cpu_tier_for_cpu("totally-fake-cpu"), None);
+    }
+
+    /// Regression for the cross-arch-introspection bug Codex flagged
+    /// on PR #187: `cpu_tier_for_cpu` MUST return the same Some-string
+    /// regardless of the build target. Previously, ARM CPU names like
+    /// `"apple-m2"` fell to `None` on an x86_64 build because the public
+    /// `cpu_ops_for_cpu` lookup piped through the cfg-gated `cpu_ops_for_tier`.
+    #[test]
+    fn cpu_tier_for_cpu_is_cross_arch() {
+        // These four must resolve on EVERY build target (x86_64, aarch64,
+        // wasm, etc.) — no cfg gating on this surface.
+        assert_eq!(cpu_tier_for_cpu("apple-m2"), Some("neon"));
+        assert_eq!(cpu_tier_for_cpu("sapphirerapids"), Some("amx_int8"));
+        assert_eq!(cpu_tier_for_cpu("neoverse-v2"), Some("neon"));
+        assert_eq!(cpu_tier_for_cpu("alderlake"), Some("avxvnni"));
     }
 
     #[test]
diff --git a/src/simd_runtime/mod.rs b/src/simd_runtime/mod.rs
@@ -92,6 +92,6 @@ pub mod vnni_dot;
 // consumers can `use crate::simd_runtime::*` and get every op flat.
 pub use add_mul::{add_mul_f32, add_mul_f64};
 pub use casts::{bf16_to_f32_batch, cast_f16_to_f32_batch, cast_f32_to_f16_batch, f32_to_bf16_batch_rne};
-pub use cpu_ops::{cpu_ops, cpu_ops_for_cpu, cpu_ops_for_tier, CpuOps};
+pub use cpu_ops::{cpu_ops, cpu_ops_for_cpu, cpu_ops_for_tier, cpu_tier_for_cpu, CpuOps};
 pub use matmul::{gemm_u8_i8, matmul_bf16_to_f32, matmul_f32, matmul_i8_to_i32};
 pub use vnni_dot::vnni_dot_u8_i8;