Skip to content

Commit c10e1e0

Browse files
authored
Merge pull request #191 from AdaWorldAPI/claude/continue-ndarray-x0Oaw
fix+test: cpu_tier_for_cpu cross-arch + Pillar 12/13/14 drift-checks
2 parents 77955ed + 0e61ea1 commit c10e1e0

5 files changed

Lines changed: 307 additions & 33 deletions

File tree

src/hpc/pillar/hhtl_contraction.rs

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -448,4 +448,96 @@ mod tests {
448448
assert!((r1.psd_rate - r2.psd_rate).abs() < 1e-12);
449449
assert!((r1.lognorm_concentration - r2.lognorm_concentration).abs() < 1e-12);
450450
}
451+
452+
/// Drift-detection: the pillar's `bundle_step` independently re-derives
453+
/// the bit-mixing bundle operator. The production code path at
454+
/// `crate::hpc::dn_tree::bundle_into` (PR #189, exposed `pub(crate)`)
455+
/// is the substrate the pillar is defending. This test runs both on
456+
/// seed-aligned SplitMix64 RNGs and asserts the first 16 u64 words of
457+
/// production's `GraphHV.channels[0]` agree bit-exactly with the
458+
/// pillar's `[u64; WORDS]` output.
459+
///
460+
/// # Why this is a bit-exact (not ε-tolerant) check
461+
///
462+
/// Per the substrate's bit-exactness contract (W1a + the data-flow
463+
/// rules), bundling is a *gated XOR* (Bernoulli-mixture per bit) —
464+
/// the mask draws come from `SplitMix64` which is bit-deterministic.
465+
/// Both pillar's `probability_mask` and production's
466+
/// `make_probability_mask` consume the same number of `next_u64()`
467+
/// draws at lr=0.25 (n=ceil(-log2(0.25))=2 per word), so the masks
468+
/// for the first `WORDS` words align exactly across the two
469+
/// functions. The remaining 240 words of channel 0 (and channels 1/2)
470+
/// consume extra RNG draws on the production side; those don't affect
471+
/// the first WORDS=16 words because each word is independent.
472+
///
473+
/// # Why not lr=0.5
474+
///
475+
/// Production's `make_probability_mask(0.5)` has a latent
476+
/// infinite-recursion bug: `p >= 0.5` recurses with `1.0 - 0.5 = 0.5`
477+
/// forever. Pillar's `probability_mask` uses `p > 0.5` (strict) and
478+
/// falls through to the AND-cascade at p=0.5. Real production usage
479+
/// (DNConfig default lr=0.03, boost up to ~30 → effective_lr~0.9)
480+
/// never hits 0.5 exactly, so the bug is dormant. This drift-check
481+
/// uses lr=0.25 where both implementations agree; the lr=0.5 case
482+
/// is recorded as a follow-up.
483+
#[test]
484+
fn pillar_13_matches_production_bundle_into() {
485+
use crate::hpc::cam_index::GraphHV;
486+
use crate::hpc::dn_tree::{bundle_into, SplitMix64 as DnSplitMix64};
487+
488+
const N_TRIALS: u32 = 16;
489+
const TEST_LR: f64 = 0.25;
490+
491+
// Both SplitMix64 implementations use identical algorithm (same
492+
// multiplier constants 0x9E3779B97F4A7C15, 0xBF58476D1CE4E5B9,
493+
// 0x94D049BB133111EB and same shift sequence), so identical seeds
494+
// → identical sequences. Both functions consume the same number
495+
// of next_u64() draws per word at p=0.25 (n=ceil(-log2(0.25))=2),
496+
// so the mask sequences align bit-exactly across the first WORDS
497+
// positions of each call.
498+
//
499+
// The RNGs MUST be re-seeded per trial because production's
500+
// bundle_into consumes 48× more RNG draws per call (3 channels ×
501+
// 256 words × 2 draws = 1536) than pillar's bundle_step (16 words
502+
// × 2 draws = 32). Without re-seeding, post-trial-0 RNG states
503+
// diverge.
504+
505+
for trial in 0..N_TRIALS {
506+
// Per-trial seed for both bundling RNGs (must be the same so
507+
// masks align). Inputs come from a separate stream so the
508+
// bundling RNG state isn't disturbed by input generation.
509+
let trial_seed = PILLAR_13_SEED.wrapping_add(trial as u64);
510+
let mut rng_pillar = SplitMix64::new(trial_seed);
511+
let mut rng_prod = DnSplitMix64::new(trial_seed);
512+
513+
let mut rng_inputs = SplitMix64::new(trial_seed.wrapping_mul(0x9E37_79B9_7F4A_7C15));
514+
let x = random_bits(&mut rng_inputs);
515+
let y = random_bits(&mut rng_inputs);
516+
517+
// Pillar side: WORDS=16 u64 mixing
518+
let out_pillar = bundle_step(&x, &y, TEST_LR as f32, &mut rng_pillar);
519+
520+
// Production side: pack x/y into channel 0 of a GraphHV,
521+
// zero the rest. Pillar's `bundle(x, y, lr)` is "keep x where
522+
// mask=0, take y where mask=1"; production's `bundle_into`
523+
// contract is the same with `current` ↔ x and `hv` ↔ y
524+
// (per src/hpc/dn_tree.rs line 125). boost=1.0 means
525+
// effective_lr = lr * 1.0 = TEST_LR (matching pillar).
526+
let mut hv_x = GraphHV::zero();
527+
let mut hv_y = GraphHV::zero();
528+
hv_x.channels[0].words[..WORDS].copy_from_slice(&x);
529+
hv_y.channels[0].words[..WORDS].copy_from_slice(&y);
530+
let hv_out = bundle_into(&hv_x, &hv_y, TEST_LR, 1.0, &mut rng_prod);
531+
532+
// Compare first WORDS=16 u64 words bit-exactly
533+
for w in 0..WORDS {
534+
assert_eq!(
535+
out_pillar[w], hv_out.channels[0].words[w],
536+
"Pillar/bundle_into drift at trial {trial} word {w}: \
537+
pillar=0x{:016x} prod=0x{:016x}",
538+
out_pillar[w], hv_out.channels[0].words[w]
539+
);
540+
}
541+
}
542+
}
451543
}

src/hpc/pillar/ogit_lattice.rs

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -450,4 +450,108 @@ mod tests {
450450
assert!((r1.psd_rate - r2.psd_rate).abs() < 1e-12);
451451
assert!((r1.lognorm_concentration - r2.lognorm_concentration).abs() < 1e-12);
452452
}
453+
454+
/// Drift-detection: the pillar's `transitive_closure` independently
455+
/// derives the partial-order closure on synthetic DAGs. The production
456+
/// code path at `crate::hpc::ogit_bridge::schema::OntologySchema::is_ancestor`
457+
/// (PR #189, exposed `pub`) is the substrate the pillar is defending.
458+
///
459+
/// This test generates a small **single-parent** tree (production's
460+
/// `OntologySchema.parent: Option<Box<str>>` is single-parent, so the
461+
/// drift-check operates on a strict subset of pillar's DAG family),
462+
/// builds it as Turtle source, runs both:
463+
/// - an inline Warshall closure (standing in for pillar's `N_TYPES`-sized
464+
/// `transitive_closure`, which is not called here) on the direct-edge matrix
465+
/// - production's `is_ancestor(a, d)` on the parsed `OntologySchema`
466+
/// and asserts agreement on EVERY (ancestor, descendant) pair.
467+
///
468+
/// # Pillar/production closure axes
469+
///
470+
/// Pillar `le[i * N + j] = true` means "type `i` ≤ type `j`" (i.e.,
471+
/// `i` extends/is-subclass-of `j`). Production
472+
/// `is_ancestor(a, d) = true` means "a is an ancestor of d" (i.e.,
473+
/// d extends/is-subclass-of a). So the equivalence is:
474+
/// `pillar.le[i][j] == production.is_ancestor(types[j], types[i])`.
475+
#[cfg(feature = "ogit_bridge")]
476+
#[test]
477+
fn pillar_14_matches_production_is_ancestor() {
478+
use crate::hpc::ogit_bridge::schema::OntologySchema;
479+
use crate::hpc::ogit_bridge::turtle_parser::TurtleParser;
480+
481+
// Small N — Turtle parsing scales linearly but we want a fast test.
482+
const N: usize = 8;
483+
484+
// Type names: ogit:T0, ogit:T1, …, ogit:T{N-1}
485+
let names: Vec<String> = (0..N).map(|i| format!("ogit:T{i}")).collect();
486+
487+
// Generate a deterministic single-parent tree. Type 0 is the root;
488+
// type k>0 picks its parent uniformly from {0, …, k-1}. Seed-anchored so
489+
// the test is reproducible.
490+
let mut rng = SplitMix64::new(PILLAR_14_SEED);
491+
let mut parent = vec![usize::MAX; N];
492+
for k in 1..N {
493+
// Uniform sample over {0, …, k-1}; range is small so modulo-bias
494+
// is negligible and reproducibility matters more than rigor.
495+
parent[k] = (rng.next_u64() as usize) % k;
496+
}
497+
498+
// Build Turtle source and parse to OntologySchema.
499+
let mut src = String::from(
500+
"@prefix ogit: <http://www.purl.org/ogit/> .\n\
501+
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n",
502+
);
503+
src.push_str(&format!("{} a rdfs:Class .\n", names[0]));
504+
for k in 1..N {
505+
src.push_str(&format!("{} a rdfs:Class ; rdfs:subClassOf {} .\n", names[k], names[parent[k]]));
506+
}
507+
let triples = TurtleParser::parse(&src).unwrap();
508+
let schema = OntologySchema::from_triples(&triples).unwrap();
509+
510+
// Build the equivalent direct-edge boolean matrix in pillar's
511+
// [N × N] flat layout. direct[k * N + parent[k]] = true.
512+
let mut direct = vec![false; N * N];
513+
for k in 1..N {
514+
direct[k * N + parent[k]] = true;
515+
}
516+
// Compute the reflexive-transitive closure inline with Warshall's
517+
// algorithm; pillar's own closure is N_TYPES-sized, so it is not called here.
518+
let mut le = vec![false; N * N];
519+
for i in 0..N {
520+
le[i * N + i] = true;
521+
for j in 0..N {
522+
if direct[i * N + j] {
523+
le[i * N + j] = true;
524+
}
525+
}
526+
}
527+
for kk in 0..N {
528+
for i in 0..N {
529+
if !le[i * N + kk] {
530+
continue;
531+
}
532+
for j in 0..N {
533+
if le[kk * N + j] {
534+
le[i * N + j] = true;
535+
}
536+
}
537+
}
538+
}
539+
540+
// Cross-check every (ancestor, descendant) pair.
541+
let mut total = 0u32;
542+
for i in 0..N {
543+
for j in 0..N {
544+
let pillar_says = le[i * N + j]; // i extends j (j is ancestor of i)
545+
let prod_says = schema.is_ancestor(&names[j], &names[i]);
546+
assert_eq!(
547+
pillar_says, prod_says,
548+
"Pillar/is_ancestor drift on pair (ancestor={}, descendant={}): \
549+
pillar.le[{i}][{j}]={pillar_says} production.is_ancestor={prod_says}",
550+
names[j], names[i]
551+
);
552+
total += 1;
553+
}
554+
}
555+
eprintln!("Pillar 14 ↔ is_ancestor agreement: {total} pair-checks pass over N={N} single-parent tree");
556+
}
453557
}

src/hpc/pillar/splat_invariants.rs

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -437,4 +437,52 @@ mod tests {
437437
assert!((r1.psd_rate - r2.psd_rate).abs() < 1e-12);
438438
assert!((r1.lognorm_concentration - r2.lognorm_concentration).abs() < 1e-12);
439439
}
440+
441+
/// Drift-detection: the pillar's `covariance_from_scale_quat`
442+
/// independently re-derives `Σ = R(q) · diag(s²) · R(q)ᵀ`. The
443+
/// production code path at `crate::hpc::splat3d::spd3::Spd3::from_scale_quat`
444+
/// is the substrate the pillar is *defending*. This test runs both
445+
/// on the same SplitMix64-seeded sample of 256 `(scale, quat)` pairs
446+
/// and asserts agreement to within `1e-5` per upper-triangle entry.
447+
///
448+
/// Any divergence ≥ ε indicates one of two failure modes:
449+
/// (a) production drifted from the canonical quaternion-rotation
450+
/// formula (the pillar definition wins by design — fix the
451+
/// production code), or
452+
/// (b) the pillar itself drifted (audit `covariance_from_scale_quat`
453+
/// against Kerbl 2023 Eq. 3 before changing).
454+
///
455+
/// This is the *coupling* the per-pillar docstring promises:
456+
/// production and pillar share no code, but they share a CI gate
457+
/// that compares them point-for-point.
458+
#[test]
459+
fn pillar_12_matches_production_spd3_from_scale_quat() {
460+
use crate::hpc::splat3d::spd3::Spd3;
461+
462+
const N: u32 = 256;
463+
let mut rng = SplitMix64::new(PILLAR_12_SEED);
464+
let mut max_abs_err: f32 = 0.0;
465+
466+
for _ in 0..N {
467+
let s = [sample_scale_axis(&mut rng), sample_scale_axis(&mut rng), sample_scale_axis(&mut rng)];
468+
let q = sample_unit_quaternion(&mut rng);
469+
470+
let pillar = covariance_from_scale_quat(s, q);
471+
let prod = Spd3::from_scale_quat(s, q);
472+
let prod_ut = [prod.a11, prod.a12, prod.a13, prod.a22, prod.a23, prod.a33];
473+
474+
for (i, (&p, &pr)) in pillar.iter().zip(prod_ut.iter()).enumerate() {
475+
let err = (p - pr).abs();
476+
if err > max_abs_err {
477+
max_abs_err = err;
478+
}
479+
assert!(
480+
err < 1e-5,
481+
"Pillar/Spd3 drift at lane {i}: pillar={p:.7} prod={pr:.7} err={err:.2e} s={s:?} quat={q:?}"
482+
);
483+
}
484+
}
485+
486+
eprintln!("Pillar 12 ↔ Spd3::from_scale_quat agreement: max_abs_err={max_abs_err:.3e} over {N} pairs");
487+
}
440488
}

src/simd_runtime/cpu_ops.rs

Lines changed: 62 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -243,29 +243,42 @@ pub fn cpu_ops_for_tier(name: &str) -> Option<&'static CpuOps> {
243243
}
244244
}
245245

246-
/// Lookup by GCC CPU codename (e.g. `"sapphirerapids"`,
247-
/// `"neoverse-v2"`, `"apple-m2"`). Maps the canonical GCC name to the
248-
/// dispatch tier the CPU lands in, sourced from the scrape recorded
249-
/// in the matrix doc § M.
246+
/// Lookup a [`CpuOps`] by GCC CPU codename (e.g. `"sapphirerapids"`,
247+
/// `"neoverse-v2"`, `"apple-m2"`) on the **current build host**.
250248
///
251-
/// Used for "what would this CPU pick?" introspection without
252-
/// touching CPUID on the running host — e.g. cross-compilation
253-
/// reports, deployment-planning tools, integration tests that want
254-
/// to assert tier selection for a named target without running on
255-
/// that silicon.
249+
/// Returns `Some(&'static CpuOps)` only when the named CPU's tier is
250+
/// reachable from the current `target_arch` (e.g. an x86_64 CPU name
251+
/// on an x86_64 build, an aarch64 CPU name on an aarch64 build).
252+
/// Cross-arch lookups — e.g. `cpu_ops_for_cpu("apple-m2")` on an
253+
/// x86_64 build — return `None` because the underlying NEON kernel
254+
/// fn pointers are compiled out and there is no honest `CpuOps` to
255+
/// return.
256+
///
257+
/// For pure introspection ("what tier would this CPU pick?", with no
258+
/// intent to call kernels), use [`cpu_tier_for_cpu`] instead — it is
259+
/// `cfg`-free and works on any build host.
256260
///
257261
/// Returns `None` for unknown CPU names. Only modern (V8.2-A+ on
258262
/// aarch64, AVX2+FMA or newer on x86_64) names are mapped — for older
259263
/// silicon this returns `None`; call `cpu_ops_for_tier("scalar")`
260264
/// directly if you really need it.
261265
pub fn cpu_ops_for_cpu(name: &str) -> Option<&'static CpuOps> {
262-
cpu_ops_for_tier(cpu_to_tier(name)?)
266+
cpu_ops_for_tier(cpu_tier_for_cpu(name)?)
263267
}
264268

265-
/// Maps a GCC CPU codename to the [`CpuOps`] tier it lands in. Data
266-
/// from the scrape recorded in `.claude/knowledge/agnostic-surface-cpu-matrix.md`
267-
/// § M (aarch64) plus the GCC i386 cpu definitions for x86_64.
268-
fn cpu_to_tier(cpu: &str) -> Option<&'static str> {
269+
/// Lookup the dispatch tier name (e.g. `"amx_int8"`, `"avx512vnni"`,
270+
/// `"neon"`) for a GCC CPU codename. Data from the scrape recorded
271+
/// in `.claude/knowledge/agnostic-surface-cpu-matrix.md` § M
272+
/// (aarch64) plus the GCC i386 cpu definitions for x86_64.
273+
///
274+
/// `cfg`-free — works on any build host regardless of `target_arch`.
275+
/// This is the right entry point for cross-target introspection:
276+
/// deployment-planning tools, cross-compilation reports, integration
277+
/// tests that assert "apple-m2 lands at the neon tier" without
278+
/// actually building for that silicon.
279+
///
280+
/// Returns `None` for unknown CPU names.
281+
pub fn cpu_tier_for_cpu(cpu: &str) -> Option<&'static str> {
269282
Some(match cpu {
270283
// x86_64 — AMX-INT8 hosts
271284
"sapphirerapids" | "graniterapids" | "graniterapids-d" | "emeraldrapids" => "amx_int8",
@@ -336,24 +349,41 @@ mod tests {
336349
}
337350

338351
#[test]
339-
fn cpu_ops_for_cpu_data_driven_lookup() {
340-
// Spot-check the GCC-scraped mapping (matrix doc § M).
341-
assert_eq!(cpu_to_tier("sapphirerapids"), Some("amx_int8"));
342-
assert_eq!(cpu_to_tier("graniterapids"), Some("amx_int8"));
343-
assert_eq!(cpu_to_tier("cascadelake"), Some("avx512vnni"));
344-
assert_eq!(cpu_to_tier("znver4"), Some("avx512vnni"));
345-
assert_eq!(cpu_to_tier("znver5"), Some("avx512vnni"));
346-
assert_eq!(cpu_to_tier("alderlake"), Some("avxvnni"));
347-
assert_eq!(cpu_to_tier("arrowlake"), Some("avxvnni"));
348-
assert_eq!(cpu_to_tier("haswell"), Some("avx2_fma"));
349-
assert_eq!(cpu_to_tier("znver3"), Some("avx2_fma"));
350-
351-
assert_eq!(cpu_to_tier("apple-m2"), Some("neon"));
352-
assert_eq!(cpu_to_tier("neoverse-v2"), Some("neon"));
353-
assert_eq!(cpu_to_tier("oryon-1"), Some("neon"));
354-
assert_eq!(cpu_to_tier("grace"), Some("neon"));
355-
356-
assert_eq!(cpu_to_tier("totally-fake-cpu"), None);
352+
fn cpu_tier_for_cpu_data_driven_lookup() {
353+
// Spot-check the GCC-scraped mapping (matrix doc § M). This
354+
// function is cfg-free — every assertion must hold on every
355+
// build host, regardless of target_arch.
356+
assert_eq!(cpu_tier_for_cpu("sapphirerapids"), Some("amx_int8"));
357+
assert_eq!(cpu_tier_for_cpu("graniterapids"), Some("amx_int8"));
358+
assert_eq!(cpu_tier_for_cpu("cascadelake"), Some("avx512vnni"));
359+
assert_eq!(cpu_tier_for_cpu("znver4"), Some("avx512vnni"));
360+
assert_eq!(cpu_tier_for_cpu("znver5"), Some("avx512vnni"));
361+
assert_eq!(cpu_tier_for_cpu("alderlake"), Some("avxvnni"));
362+
assert_eq!(cpu_tier_for_cpu("arrowlake"), Some("avxvnni"));
363+
assert_eq!(cpu_tier_for_cpu("haswell"), Some("avx2_fma"));
364+
assert_eq!(cpu_tier_for_cpu("znver3"), Some("avx2_fma"));
365+
366+
assert_eq!(cpu_tier_for_cpu("apple-m2"), Some("neon"));
367+
assert_eq!(cpu_tier_for_cpu("neoverse-v2"), Some("neon"));
368+
assert_eq!(cpu_tier_for_cpu("oryon-1"), Some("neon"));
369+
assert_eq!(cpu_tier_for_cpu("grace"), Some("neon"));
370+
371+
assert_eq!(cpu_tier_for_cpu("totally-fake-cpu"), None);
372+
}
373+
374+
/// Regression for the cross-arch-introspection bug Codex flagged
375+
/// on PR #187: `cpu_tier_for_cpu` MUST return the same Some-string
376+
/// regardless of the build host. Previously the only public lookup,
377+
/// `cpu_ops_for_cpu`, returned `None` for ARM CPU names like `"apple-m2"`
378+
/// on an x86_64 build because it piped through the cfg-gated `cpu_ops_for_tier`.
379+
#[test]
380+
fn cpu_tier_for_cpu_is_cross_arch() {
381+
// These four must resolve on EVERY build host (x86_64, aarch64,
382+
// wasm, etc.) — no cfg gating on this surface.
383+
assert_eq!(cpu_tier_for_cpu("apple-m2"), Some("neon"));
384+
assert_eq!(cpu_tier_for_cpu("sapphirerapids"), Some("amx_int8"));
385+
assert_eq!(cpu_tier_for_cpu("neoverse-v2"), Some("neon"));
386+
assert_eq!(cpu_tier_for_cpu("alderlake"), Some("avxvnni"));
357387
}
358388

359389
#[test]

src/simd_runtime/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,6 @@ pub mod vnni_dot;
9292
// consumers can `use crate::simd_runtime::*` and get every op flat.
9393
pub use add_mul::{add_mul_f32, add_mul_f64};
9494
pub use casts::{bf16_to_f32_batch, cast_f16_to_f32_batch, cast_f32_to_f16_batch, f32_to_bf16_batch_rne};
95-
pub use cpu_ops::{cpu_ops, cpu_ops_for_cpu, cpu_ops_for_tier, CpuOps};
95+
pub use cpu_ops::{cpu_ops, cpu_ops_for_cpu, cpu_ops_for_tier, cpu_tier_for_cpu, CpuOps};
9696
pub use matmul::{gemm_u8_i8, matmul_bf16_to_f32, matmul_f32, matmul_i8_to_i32};
9797
pub use vnni_dot::vnni_dot_u8_i8;

0 commit comments

Comments
 (0)