From f0c5d825e3ed0b03b03d402a37e95a4c5bb32ebd Mon Sep 17 00:00:00 2001
From: booth-algo <kevinlauofficial01@gmail.com>
Date: Thu, 14 May 2026 12:35:38 +0100
Subject: [PATCH 1/2] Clarify sliced emulator entry points

---
 aten/e2e_runner.py                       | 337 ++--------------------
 aten/plena_frontend.py                   | 133 +--------
 aten/sliced_emulator_runner.py           | 351 +++++++++++++++++++++++
 aten/tests/test_plena_compiler.py        |  14 +-
 aten/tests/test_quantization_ablation.py |   6 +-
 docs/ARCHITECTURE.md                     |   4 +-
 docs/ATEN_TREE.md                        |   8 +-
 docs/COMPILATION_PIPELINES.md            |  10 +-
 generator/README.md                      |   9 +-
 generator/tests/test_generator_e2e.py    |  12 +-
 10 files changed, 410 insertions(+), 474 deletions(-)
 create mode 100644 aten/sliced_emulator_runner.py

diff --git a/aten/e2e_runner.py b/aten/e2e_runner.py
index 70ffc48..7d6d4a4 100644
--- a/aten/e2e_runner.py
+++ b/aten/e2e_runner.py
@@ -1,322 +1,27 @@
-"""ATen-backed end-to-end runner.
+"""Compatibility wrapper for the old ATen e2e runner module name.
 
-This wraps the verified ATen compilation path:
-
-    HuggingFace model -> PlenaCompiler + ops.* -> ISA -> emulator -> golden check
-
-The symbolic generator path is separate and remains under ``generator.runner
-codegen``.
-
-Usage:
-    python -m compiler.aten.e2e_runner AICrossSim/clm-60m --seq-len 32
+Use ``compiler.aten.sliced_emulator_runner`` for new code.
 """
 
-import sys
-import time
-from pathlib import Path
-
-# ---------------------------------------------------------------------------
-# Repo root bootstrap — mirror the same sys.path setup used by the existing
-# test infrastructure so imports resolve regardless of cwd.
-# ---------------------------------------------------------------------------
-_COMPILER_ROOT = Path(__file__).resolve().parents[1]  # PLENA_Compiler/
-_REPO_ROOT = _COMPILER_ROOT.parent
-for _p in [str(_REPO_ROOT), str(_REPO_ROOT / "tools"), str(_COMPILER_ROOT)]:
-    if _p not in sys.path:
-        sys.path.insert(0, _p)
-
-
-def run_aten_e2e(
-    model_id: str,
-    seq_len: int = 64,
-    num_layers: int = 1,
-    build_dir: str | None = None,
-    layer_idx: int = 0,
-    hidden_size: int = 64,
-    inter_dim: int = 128,
-    trust_remote_code: bool = False,
-    partial_load: bool = False,
-) -> dict:
-    """Run a HF model through the ATen compilation path end-to-end.
-
-    Steps:
-      1. Load model config + layer weights from HuggingFace
-      2. Build ISA via PlenaCompiler + ops.* (numerically verified path)
-      3. Set up sim environment (ASM + HBM weights + FPRAM constants)
-      4. Run Rust emulator
-      5. Compare VRAM output against golden PyTorch reference
-
-    Returns dict with:
-        passed:             bool
-        allclose_match_rate: float (percentage)
-        max_error:          float
-        mae:                float
-        mse:                float
-        elapsed_s:          float (wall-clock seconds)
-        model_id:           str
-        layer_idx:          int
-        num_layers:         int
-        seq_len:            int
-        hidden_size:        int
-        inter_dim:          int
-        build_dir:          str
-    """
-    from transactional_emulator.testbench.emulator_runner import compare_emulator_output
-    from transactional_emulator.testbench.model_layer_test_builder import (
-        build_and_run_decoder_test,
-        build_and_run_multi_layer_test,
-        get_model_dims,
-        slice_dims_for_sim,
-    )
-
-    t0 = time.time()
-
-    # Resolve build directory
-    if build_dir is None:
-        safe_name = model_id.replace("/", "_")
-        build_dir = str(
-            Path("/tmp") / f"aten_e2e_{safe_name}_sl{seq_len}_l{layer_idx}"
-        )
-    build_path = Path(build_dir)
-
-    # ------------------------------------------------------------------
-    # [1/5] Probe model config
-    # ------------------------------------------------------------------
-    print(f"[1/5] Probing model config: {model_id}")
-    try:
-        full_dims = get_model_dims(model_id)
-    except (OSError, ConnectionError) as exc:
-        print(f"[SKIP] HuggingFace model '{model_id}' unavailable: {exc}")
-        return {
-            "passed": False,
-            "error": str(exc),
-            "model_id": model_id,
-        }
-    sim_dims = slice_dims_for_sim(full_dims, hidden_slice=hidden_size, inter_slice=inter_dim)
-    print(f"       Full dims: hidden={full_dims.hidden_size}, inter={full_dims.inter_dim}, "
-          f"heads={full_dims.num_heads}, kv_heads={full_dims.num_kv_heads}, head_dim={full_dims.head_dim}")
-    print(f"       Sim  dims: hidden={sim_dims.hidden_size}, inter={sim_dims.inter_dim}")
-
-    # ------------------------------------------------------------------
-    # [2/5] Build ISA + golden reference + sim env via build_and_run_decoder_test
-    #
-    # We call the proven function directly — it handles:
-    #   - Weight loading + slicing
-    #   - PlenaCompiler ISA generation
-    #   - create_sim_env + create_mem_for_sim
-    #   - Golden reference computation
-    #   - Emulator execution + comparison
-    #
-    # For multi-layer: iterate layers (each is independent at sim scale).
-    # ------------------------------------------------------------------
-    results_per_layer = []
-
-    if num_layers == 1:
-        # Single layer: use proven single-layer path (with RoPE)
-        current_layer = layer_idx
-        asm_name = f"aten_{model_id.split('/')[-1]}_l{current_layer}"
-        layer_build = build_path / f"layer_{current_layer}"
-
-        print(f"\n[2/5] Building ISA for layer {current_layer} via PlenaCompiler + ops.*")
-        print(f"[3/5] Setting up sim environment: {layer_build}")
-        print("[4/5] Running Rust transactional emulator")
-
-        extra_kwargs = {}
-        if trust_remote_code:
-            extra_kwargs["trust_remote_code"] = True
-        if partial_load:
-            extra_kwargs["partial_load"] = True
-
-        try:
-            build_and_run_decoder_test(
-                model_id=model_id,
-                asm_name=asm_name,
-                build_dir=layer_build,
-                layer_idx=current_layer,
-                seq_len=seq_len,
-                hidden_size=hidden_size,
-                inter_dim=inter_dim,
-                **extra_kwargs,
-            )
-            comp_results, _comp_params = compare_emulator_output(layer_build)
-            results_per_layer.append({
-                "layer": current_layer,
-                "passed": True,
-                "allclose_match_rate": comp_results["allclose_match_rate"],
-                "max_error": comp_results["max_error"],
-                "mae": comp_results["mae"],
-                "mse": comp_results["mse"],
-            })
-        except SystemExit as e:
-            if e.code == 0:
-                return {
-                    "passed": False,
-                    "error": "HuggingFace model unavailable (skipped)",
-                    "model_id": model_id,
-                }
-            try:
-                comp_results, _comp_params = compare_emulator_output(layer_build)
-                results_per_layer.append({
-                    "layer": current_layer,
-                    "passed": False,
-                    "allclose_match_rate": comp_results["allclose_match_rate"],
-                    "max_error": comp_results["max_error"],
-                    "mae": comp_results["mae"],
-                    "mse": comp_results["mse"],
-                })
-            except Exception:
-                results_per_layer.append({
-                    "layer": current_layer,
-                    "passed": False,
-                    "error": f"Emulator comparison failed after exit code {e.code}",
-                })
-    else:
-        # Multi-layer: chain N layers with residual connections (no RoPE)
-        asm_name = f"aten_{model_id.split('/')[-1]}_chain{num_layers}"
-        chain_build = build_path / f"chain_{num_layers}layers"
-
-        print(f"\n[2/5] Building chained {num_layers}-layer ISA via PlenaCompiler + ops.*")
-        print(f"[3/5] Setting up sim environment: {chain_build}")
-        print("[4/5] Running Rust transactional emulator")
-
-        extra_kwargs = {}
-        if trust_remote_code:
-            extra_kwargs["trust_remote_code"] = True
-        if partial_load:
-            extra_kwargs["partial_load"] = True
-
-        try:
-            build_and_run_multi_layer_test(
-                model_id=model_id,
-                asm_name=asm_name,
-                build_dir=chain_build,
-                num_layers=num_layers,
-                layer_idx_start=layer_idx,
-                seq_len=seq_len,
-                hidden_size=hidden_size,
-                inter_dim=inter_dim,
-                **extra_kwargs,
-            )
-            comp_results, _comp_params = compare_emulator_output(chain_build)
-            results_per_layer.append({
-                "layer": f"chain_{num_layers}",
-                "passed": True,
-                "allclose_match_rate": comp_results["allclose_match_rate"],
-                "max_error": comp_results["max_error"],
-                "mae": comp_results["mae"],
-                "mse": comp_results["mse"],
-            })
-        except SystemExit as e:
-            if e.code == 0:
-                return {
-                    "passed": False,
-                    "error": "HuggingFace model unavailable (skipped)",
-                    "model_id": model_id,
-                }
-            try:
-                comp_results, _comp_params = compare_emulator_output(chain_build)
-                results_per_layer.append({
-                    "layer": f"chain_{num_layers}",
-                    "passed": False,
-                    "allclose_match_rate": comp_results["allclose_match_rate"],
-                    "max_error": comp_results["max_error"],
-                    "mae": comp_results["mae"],
-                    "mse": comp_results["mse"],
-                })
-            except Exception:
-                results_per_layer.append({
-                    "layer": f"chain_{num_layers}",
-                    "passed": False,
-                    "error": f"Emulator comparison failed after exit code {e.code}",
-                })
-
-    elapsed = time.time() - t0
-
-    # ------------------------------------------------------------------
-    # [5/5] Aggregate results
-    # ------------------------------------------------------------------
-    print(f"\n[5/5] Results summary ({elapsed:.1f}s elapsed)")
-    all_passed = all(r.get("passed", False) for r in results_per_layer)
-
-    # Use first layer's metrics for the top-level result
-    first = results_per_layer[0] if results_per_layer else {}
-
-    summary = {
-        "passed": all_passed,
-        "allclose_match_rate": first.get("allclose_match_rate", 0.0),
-        "max_error": first.get("max_error", float("inf")),
-        "mae": first.get("mae", float("inf")),
-        "mse": first.get("mse", float("inf")),
-        "elapsed_s": elapsed,
-        "model_id": model_id,
-        "layer_idx": layer_idx,
-        "num_layers": num_layers,
-        "seq_len": seq_len,
-        "hidden_size": hidden_size,
-        "inter_dim": inter_dim,
-        "build_dir": str(build_path),
-        "layers": results_per_layer,
-    }
-
-    for r in results_per_layer:
-        status = "PASS" if r.get("passed") else "FAIL"
-        match = r.get("allclose_match_rate", "N/A")
-        if isinstance(match, float):
-            match = f"{match:.2f}%"
-        print(f"  Layer {r.get('layer', '?')}: [{status}] allclose={match}")
-
-    if all_passed:
-        print(f"\n[ATen e2e PASSED] {model_id} — {num_layers} layer(s), "
-              f"allclose={first.get('allclose_match_rate', 0):.2f}%")
-    else:
-        print(f"\n[ATen e2e FAILED] {model_id} — see per-layer results above")
-
-    return summary
-
-
-# ---------------------------------------------------------------------------
-# CLI entry point
-# ---------------------------------------------------------------------------
-def main():
-    import argparse
-
-    parser = argparse.ArgumentParser(
-        description="Run HF model through ATen compilation path (PlenaCompiler + ops.*)",
-        prog="python -m compiler.aten.e2e_runner",
-    )
-    parser.add_argument("model_id", help="HuggingFace model ID (e.g. AICrossSim/clm-60m)")
-    parser.add_argument("--seq-len", type=int, default=64,
-                        help="Sequence length (default: 64)")
-    parser.add_argument("--num-layers", type=int, default=1,
-                        help="Number of decoder layers to test (default: 1)")
-    parser.add_argument("--layer-idx", type=int, default=0,
-                        help="Starting layer index (default: 0)")
-    parser.add_argument("--hidden-size", type=int, default=64,
-                        help="Hidden dimension clipped to sim limits (default: 64)")
-    parser.add_argument("--inter-dim", type=int, default=128,
-                        help="FFN intermediate dimension clipped to sim limits (default: 128)")
-    parser.add_argument("--build-dir", type=str, default=None,
-                        help="Build directory for sim artifacts (default: /tmp/aten_e2e_...)")
-    parser.add_argument("--trust-remote-code", action="store_true",
-                        help="Trust remote code for HF model loading")
-    parser.add_argument("--partial-load", action="store_true",
-                        help="Load only needed weight shards (for large models)")
-
-    args = parser.parse_args()
-
-    result = run_aten_e2e(
-        model_id=args.model_id,
-        seq_len=args.seq_len,
-        num_layers=args.num_layers,
-        build_dir=args.build_dir,
-        layer_idx=args.layer_idx,
-        hidden_size=args.hidden_size,
-        inter_dim=args.inter_dim,
-        trust_remote_code=args.trust_remote_code,
-        partial_load=args.partial_load,
-    )
-
-    sys.exit(0 if result["passed"] else 1)
+from compiler.aten import sliced_emulator_runner as _impl
+
+for _name, _value in vars(_impl).items():
+    if _name not in {
+        "__builtins__",
+        "__cached__",
+        "__doc__",
+        "__file__",
+        "__loader__",
+        "__name__",
+        "__package__",
+        "__spec__",
+    }:
+        globals()[_name] = _value
+
+for _compat_local in ("_impl", "_name", "_value"):
+    globals().pop(_compat_local, None)
+
+__all__ = [name for name in globals() if not name.startswith("__")]
 
 
 if __name__ == "__main__":
diff --git a/aten/plena_frontend.py b/aten/plena_frontend.py
index 47c584d..2242fc7 100644
--- a/aten/plena_frontend.py
+++ b/aten/plena_frontend.py
@@ -31,6 +31,7 @@
     "_ksplit_matmul",
     "_make_rotate_half_matrix",
     "compile_hf_model",
+    "compile_native_hf_decoder",
     "quantize_to_mxfp",
 ]
 
@@ -262,7 +263,7 @@ def _register_layer_inputs(prog, layer_idx: int, weights: LayerWeights) -> Layer
 # ---------------------------------------------------------------------------
 # Main compilation function
 # ---------------------------------------------------------------------------
-def compile_hf_model(
+def compile_native_hf_decoder(
     model,
     seq_len: int = 64,
     num_layers: int | None = None,
@@ -273,7 +274,7 @@ def compile_hf_model(
     golden_precision: str = "hardware",
     verbose: bool = False,
 ) -> dict:
-    """Compile a HuggingFace decoder model to PLENA ISA and simulation metadata."""
+    """Compile a HuggingFace decoder model at native dimensions to PLENA ISA metadata."""
     def _verbose(message: str = ""):
         if verbose:
             print(message)
@@ -541,129 +542,5 @@ def _verbose(message: str = ""):
         "info": info,
         "golden_precision": golden_precision,
     }
-<<<<<<< Updated upstream
-=======
-
-
-# ---------------------------------------------------------------------------
-# Convenience: compile + run emulator + compare
-# ---------------------------------------------------------------------------
-def compile_and_run(
-    model,
-    build_dir,
-    **kwargs,
-) -> dict:
-    """Compile, run emulator, and compare against golden.
-
-    Convenience wrapper that calls compile_hf_model, sets up simulation
-    environment, runs the Rust transactional emulator, and compares output.
-
-    Args:
-        model:     nn.Module (HF CausalLM model, already loaded)
-        build_dir: Directory for simulation artifacts
-        **kwargs:  Forwarded to compile_hf_model (seq_len, hidden_size, etc.)
-
-    Returns:
-        dict with compilation info + comparison results including
-        'allclose_match_rate' percentage.
-    """
-    from transactional_emulator.tools.create_sim_env import create_sim_env
-    from sim_env_utils.build_env import create_mem_for_sim
-    from transactional_emulator.testbench.emulator_runner import (
-        run_and_assert,
-        compare_emulator_output,
-    )
-
-    result = compile_hf_model(model, **kwargs)
-    build_dir = Path(build_dir)
-    build_dir.mkdir(parents=True, exist_ok=True)
-
-    mlen = kwargs.get("mlen", 64)
-    blen = kwargs.get("blen", 4)
-    asm_name = f"model_{result['info']['model_type']}_{result['info']['num_layers']}L"
-
-    # Write sim env files
-    create_sim_env(
-        result["input_tensors"],
-        result["isa"],
-        {"original_output": result["golden_output"]},
-        result["fp_preload"],
-        build_dir=str(build_dir),
-    )
-
-    create_mem_for_sim(
-        data_size=256,
-        mode="behave_sim",
-        asm=asm_name,
-        data=None,
-        specified_data_order=result["data_order"],
-        build_path=build_dir,
-    )
-
-    with open(build_dir / "comparison_params.json", "w") as f:
-        json.dump(result["comparison_params"], f, indent=2)
-
-    with open(build_dir / "generated_asm_code.asm", "w") as f:
-        f.write(result["isa"])
-
-    print(f"\nSimulation environment created: {build_dir}")
-    print(f"  Result location: VRAM row {result['comparison_params']['start_row_idx']}")
-    print(f"  Layers: {result['info']['num_layers']}, data_order: {result['data_order']}")
-
-    # Run emulator and compare (don't exit on failure — VRAM stage comparison follows)
-    from transactional_emulator.testbench.emulator_runner import update_plena_config, run_emulator
-    update_plena_config(vlen=mlen, mlen=mlen, blen=blen, verbose=False)
-    print("\n--- Running Rust transactional emulator ---")
-    run_emulator(build_dir)
-
-    print("\n--- Comparing emulator output vs golden ---")
-    comp_results, _params = compare_emulator_output(build_dir)
-    from transactional_emulator.tools.check_mem import print_comparison_results
-    print_comparison_results(comp_results, verbose=True, comparison_params=_params)
-
-    if comp_results["allclose_pass"]:
-        print(f"\n[ATen-style {asm_name} test PASSED - ISA generated + emulator verified]")
-    else:
-        print(f"\n[ATen-style {asm_name} test FAILED - emulator numerical check failed]")
-
-    # Three-way comparison
-    golden = result["golden_output"]
-    hf_gt = result["hf_ground_truth"]
-    print("\n--- Three-way comparison ---")
-    if hf_gt is not None and golden is not None:
-        # HF float32 vs golden (MXFP8 + BF16)
-        n = min(hf_gt.numel(), golden.numel())
-        allclose_hf_vs_gold = (
-            torch.isclose(hf_gt.float().flatten()[:n],
-                          golden.float().flatten()[:n], atol=1e-2)
-            .float().mean().item() * 100
-        )
-        print(f"  HF float32 vs golden (MXFP8+BF16):  {allclose_hf_vs_gold:.2f}% allclose")
-    # Emulator vs golden: reported by compare_emulator_output
-    emu_match = comp_results.get("allclose_match_rate", None)
-    if emu_match is not None:
-        print(f"  Emulator vs golden (MXFP8+BF16):    {emu_match:.2f}% allclose")
-
-    # VRAM stage comparison: validates each pipeline segment using
-    # emulator's own intermediates as golden input (immune to accumulation drift)
-    try:
-        from compiler.aten.vram_stage_compare import compare_stages
-        emulator_dir = Path(__file__).parent.parent.parent / "transactional_emulator"
-        vram_path = str(emulator_dir / "vram_dump.bin")
-        print("\n--- VRAM stage comparison (authoritative) ---")
-        stage_results = compare_stages(
-            vram_path=vram_path,
-            build_dir=str(build_dir),
-            hidden=result["info"]["hidden_size"],
-            inter=result["info"].get("inter_dim", result["info"]["hidden_size"] * 4),
-            num_heads=result["info"]["num_heads"],
-            num_kv_heads=result["info"]["num_kv_heads"],
-        )
-        stage_pass = stage_results.get("norm+FFN+norm", 0) >= 99.0
-        comp_results["vram_stage_allclose"] = stage_results.get("norm+FFN+norm", None)
-        comp_results["vram_stage_pass"] = stage_pass
-    except Exception as e:
-        print(f"  (skipped: {e})")
-
-    return {**result["info"], **comp_results}
->>>>>>> Stashed changes
+# Backwards-compatible alias for older callers.
+compile_hf_model = compile_native_hf_decoder
diff --git a/aten/sliced_emulator_runner.py b/aten/sliced_emulator_runner.py
new file mode 100644
index 0000000..1abb36f
--- /dev/null
+++ b/aten/sliced_emulator_runner.py
@@ -0,0 +1,351 @@
+"""ATen-backed sliced-dimension emulator runner.
+
+This wraps the simulator-scale testbench path:
+
+    HuggingFace model -> PlenaCompiler + ops.* -> ISA -> emulator -> golden check
+
+It delegates to ``transactional_emulator.testbench.sliced_layer_test_builder``
+and defaults to sliced dimensions (hidden=64, inter=128). This is the right
+entry point for quick emulator checks that use real weights but simulator-sized
+tensors.
+
+It is not the native full-decoder frontend.  For native hidden/intermediate
+dimensions, use ``compiler.aten.plena_frontend.compile_native_hf_decoder``.
+The symbolic generator path is separate and remains under ``generator.runner
+codegen``.
+
+Usage:
+    python -m compiler.aten.sliced_emulator_runner AICrossSim/clm-60m --seq-len 32
+"""
+
+import sys
+import time
+from pathlib import Path
+
+# ---------------------------------------------------------------------------
+# Repo root bootstrap — mirror the same sys.path setup used by the existing
+# test infrastructure so imports resolve regardless of cwd.
+# ---------------------------------------------------------------------------
+_COMPILER_ROOT = Path(__file__).resolve().parents[1]  # PLENA_Compiler/
+_REPO_ROOT = _COMPILER_ROOT.parent
+for _p in [str(_REPO_ROOT), str(_REPO_ROOT / "tools"), str(_COMPILER_ROOT)]:
+    if _p not in sys.path:
+        sys.path.insert(0, _p)
+
+
+def run_sliced_emulator_check(
+    model_id: str,
+    seq_len: int = 64,
+    num_layers: int = 1,
+    build_dir: str | None = None,
+    layer_idx: int = 0,
+    hidden_size: int = 64,
+    inter_dim: int = 128,
+    trust_remote_code: bool = False,
+    partial_load: bool = False,
+) -> dict:
+    """Run a HF model through the sliced ATen emulator path.
+
+    Steps:
+      1. Load model config + layer weights from HuggingFace
+      2. Build ISA via PlenaCompiler + ops.* at simulator-scale dimensions
+      3. Set up sim environment (ASM + HBM weights + FPRAM constants)
+      4. Run Rust emulator
+      5. Compare VRAM output against golden PyTorch reference
+
+    Returns dict with:
+        passed:             bool
+        allclose_match_rate: float (percentage)
+        max_error:          float
+        mae:                float
+        mse:                float
+        elapsed_s:          float (wall-clock seconds)
+        model_id:           str
+        layer_idx:          int
+        num_layers:         int
+        seq_len:            int
+        hidden_size:        int
+        inter_dim:          int
+        build_dir:          str
+    """
+    from transactional_emulator.testbench.emulator_runner import compare_emulator_output
+    from transactional_emulator.testbench.sliced_layer_test_builder import (
+        build_and_run_sliced_decoder_chain_test,
+        build_and_run_sliced_decoder_layer_test,
+        get_model_dims,
+        slice_dims_for_sim,
+    )
+
+    t0 = time.time()
+
+    # Resolve build directory
+    if build_dir is None:
+        safe_name = model_id.replace("/", "_")
+        build_dir = str(
+            Path("/tmp") / f"aten_sliced_{safe_name}_sl{seq_len}_l{layer_idx}"
+        )
+    build_path = Path(build_dir)
+
+    # ------------------------------------------------------------------
+    # [1/5] Probe model config
+    # ------------------------------------------------------------------
+    print(f"[1/5] Probing model config: {model_id}")
+    try:
+        full_dims = get_model_dims(model_id)
+    except (OSError, ConnectionError) as exc:
+        print(f"[SKIP] HuggingFace model '{model_id}' unavailable: {exc}")
+        return {
+            "passed": False,
+            "error": str(exc),
+            "model_id": model_id,
+        }
+    sim_dims = slice_dims_for_sim(full_dims, hidden_slice=hidden_size, inter_slice=inter_dim)
+    print(f"       Full dims: hidden={full_dims.hidden_size}, inter={full_dims.inter_dim}, "
+          f"heads={full_dims.num_heads}, kv_heads={full_dims.num_kv_heads}, head_dim={full_dims.head_dim}")
+    print(f"       Sim  dims: hidden={sim_dims.hidden_size}, inter={sim_dims.inter_dim}")
+
+    if hidden_size > full_dims.head_dim:
+        error = (
+            "aten.sliced_emulator_runner uses the sliced_layer_test_builder single-head/sim-sliced "
+            f"harness. Requested hidden_size={hidden_size}, but the model head_dim is "
+            f"{full_dims.head_dim}; native multi-head dimensions must go through "
+            "compiler.aten.plena_frontend.compile_native_hf_decoder."
+        )
+        print(f"[ERROR] {error}")
+        return {
+            "passed": False,
+            "error": error,
+            "model_id": model_id,
+            "seq_len": seq_len,
+            "num_layers": num_layers,
+            "hidden_size": hidden_size,
+            "inter_dim": inter_dim,
+            "build_dir": str(build_path),
+        }
+
+    # ------------------------------------------------------------------
+    # [2/5] Build ISA + golden reference + sim env via build_and_run_sliced_decoder_layer_test
+    #
+    # We call the proven function directly — it handles:
+    #   - Weight loading + slicing
+    #   - PlenaCompiler ISA generation
+    #   - create_sim_env + create_mem_for_sim
+    #   - Golden reference computation
+    #   - Emulator execution + comparison
+    #
+    # For multi-layer: a single chained build covers all N layers (no per-layer loop).
+    # ------------------------------------------------------------------
+    results_per_layer = []
+
+    if num_layers == 1:
+        # Single layer: use proven single-layer path (with RoPE)
+        current_layer = layer_idx
+        asm_name = f"aten_{model_id.split('/')[-1]}_l{current_layer}"
+        layer_build = build_path / f"layer_{current_layer}"
+
+        print(f"\n[2/5] Building ISA for layer {current_layer} via PlenaCompiler + ops.*")
+        print(f"[3/5] Setting up sim environment: {layer_build}")
+        print("[4/5] Running Rust transactional emulator")
+
+        extra_kwargs = {}
+        if trust_remote_code:
+            extra_kwargs["trust_remote_code"] = True
+        if partial_load:
+            extra_kwargs["partial_load"] = True
+
+        try:
+            build_and_run_sliced_decoder_layer_test(
+                model_id=model_id,
+                asm_name=asm_name,
+                build_dir=layer_build,
+                layer_idx=current_layer,
+                seq_len=seq_len,
+                hidden_size=hidden_size,
+                inter_dim=inter_dim,
+                **extra_kwargs,
+            )
+            comp_results, _comp_params = compare_emulator_output(layer_build)
+            results_per_layer.append({
+                "layer": current_layer,
+                "passed": True,
+                "allclose_match_rate": comp_results["allclose_match_rate"],
+                "max_error": comp_results["max_error"],
+                "mae": comp_results["mae"],
+                "mse": comp_results["mse"],
+            })
+        except SystemExit as e:
+            if e.code == 0:
+                return {
+                    "passed": False,
+                    "error": "HuggingFace model unavailable (skipped)",
+                    "model_id": model_id,
+                }
+            try:
+                comp_results, _comp_params = compare_emulator_output(layer_build)
+                results_per_layer.append({
+                    "layer": current_layer,
+                    "passed": False,
+                    "allclose_match_rate": comp_results["allclose_match_rate"],
+                    "max_error": comp_results["max_error"],
+                    "mae": comp_results["mae"],
+                    "mse": comp_results["mse"],
+                })
+            except Exception:
+                results_per_layer.append({
+                    "layer": current_layer,
+                    "passed": False,
+                    "error": f"Emulator comparison failed after exit code {e.code}",
+                })
+    else:
+        # Multi-layer: chain N layers with residual connections (no RoPE)
+        asm_name = f"aten_{model_id.split('/')[-1]}_chain{num_layers}"
+        chain_build = build_path / f"chain_{num_layers}layers"
+
+        print(f"\n[2/5] Building chained {num_layers}-layer ISA via PlenaCompiler + ops.*")
+        print(f"[3/5] Setting up sim environment: {chain_build}")
+        print("[4/5] Running Rust transactional emulator")
+
+        extra_kwargs = {}
+        if trust_remote_code:
+            extra_kwargs["trust_remote_code"] = True
+        if partial_load:
+            extra_kwargs["partial_load"] = True
+
+        try:
+            build_and_run_sliced_decoder_chain_test(
+                model_id=model_id,
+                asm_name=asm_name,
+                build_dir=chain_build,
+                num_layers=num_layers,
+                layer_idx_start=layer_idx,
+                seq_len=seq_len,
+                hidden_size=hidden_size,
+                inter_dim=inter_dim,
+                **extra_kwargs,
+            )
+            comp_results, _comp_params = compare_emulator_output(chain_build)
+            results_per_layer.append({
+                "layer": f"chain_{num_layers}",
+                "passed": True,
+                "allclose_match_rate": comp_results["allclose_match_rate"],
+                "max_error": comp_results["max_error"],
+                "mae": comp_results["mae"],
+                "mse": comp_results["mse"],
+            })
+        except SystemExit as e:
+            if e.code == 0:
+                return {
+                    "passed": False,
+                    "error": "HuggingFace model unavailable (skipped)",
+                    "model_id": model_id,
+                }
+            try:
+                comp_results, _comp_params = compare_emulator_output(chain_build)
+                results_per_layer.append({
+                    "layer": f"chain_{num_layers}",
+                    "passed": False,
+                    "allclose_match_rate": comp_results["allclose_match_rate"],
+                    "max_error": comp_results["max_error"],
+                    "mae": comp_results["mae"],
+                    "mse": comp_results["mse"],
+                })
+            except Exception:
+                results_per_layer.append({
+                    "layer": f"chain_{num_layers}",
+                    "passed": False,
+                    "error": f"Emulator comparison failed after exit code {e.code}",
+                })
+
+    elapsed = time.time() - t0
+
+    # ------------------------------------------------------------------
+    # [5/5] Aggregate results
+    # ------------------------------------------------------------------
+    print(f"\n[5/5] Results summary ({elapsed:.1f}s elapsed)")
+    all_passed = all(r.get("passed", False) for r in results_per_layer)
+
+    # Use first layer's metrics for the top-level result
+    first = results_per_layer[0] if results_per_layer else {}
+
+    summary = {
+        "passed": all_passed,
+        "allclose_match_rate": first.get("allclose_match_rate", 0.0),
+        "max_error": first.get("max_error", float("inf")),
+        "mae": first.get("mae", float("inf")),
+        "mse": first.get("mse", float("inf")),
+        "elapsed_s": elapsed,
+        "model_id": model_id,
+        "layer_idx": layer_idx,
+        "num_layers": num_layers,
+        "seq_len": seq_len,
+        "hidden_size": hidden_size,
+        "inter_dim": inter_dim,
+        "build_dir": str(build_path),
+        "layers": results_per_layer,
+    }
+
+    for r in results_per_layer:
+        status = "PASS" if r.get("passed") else "FAIL"
+        match = r.get("allclose_match_rate", "N/A")
+        if isinstance(match, float):
+            match = f"{match:.2f}%"
+        print(f"  Layer {r.get('layer', '?')}: [{status}] allclose={match}")
+
+    if all_passed:
+        print(f"\n[Sliced emulator PASSED] {model_id} — {num_layers} layer(s), "
+              f"allclose={first.get('allclose_match_rate', 0):.2f}%")
+    else:
+        print(f"\n[Sliced emulator FAILED] {model_id} — see per-layer results above")
+
+    return summary
+
+
+# ---------------------------------------------------------------------------
+# CLI entry point
+# ---------------------------------------------------------------------------
+def main():
+    """Parse CLI flags, run the sliced emulator check, and exit 0 on pass, 1 on failure."""
+    import argparse
+
+    cli = argparse.ArgumentParser(
+        prog="python -m compiler.aten.sliced_emulator_runner",
+        description="Run HF model through the sim-sliced ATen emulator harness",
+    )
+    cli.add_argument("model_id", help="HuggingFace model ID (e.g. AICrossSim/clm-60m)")
+    # The integer knobs all share one shape, so register them from a table.
+    for flag, default, text in (
+        ("--seq-len", 64, "Sequence length (default: 64)"),
+        ("--num-layers", 1, "Number of decoder layers to test (default: 1)"),
+        ("--layer-idx", 0, "Starting layer index (default: 0)"),
+        ("--hidden-size", 64, "Sliced hidden dimension for this harness (default: 64)"),
+        ("--inter-dim", 128, "FFN intermediate dimension clipped to sim limits (default: 128)"),
+    ):
+        cli.add_argument(flag, type=int, default=default, help=text)
+    cli.add_argument("--build-dir", type=str, default=None,
+                     help="Build directory for sim artifacts (default: /tmp/aten_sliced_...)")
+    cli.add_argument("--trust-remote-code", action="store_true",
+                     help="Trust remote code for HF model loading")
+    cli.add_argument("--partial-load", action="store_true",
+                     help="Load only needed weight shards (for large models)")
+    opts = cli.parse_args()
+
+    # Forward every flag by keyword so the CLI-to-runner mapping stays explicit.
+    outcome = run_sliced_emulator_check(
+        model_id=opts.model_id,
+        seq_len=opts.seq_len,
+        num_layers=opts.num_layers,
+        build_dir=opts.build_dir,
+        layer_idx=opts.layer_idx,
+        hidden_size=opts.hidden_size,
+        inter_dim=opts.inter_dim,
+        trust_remote_code=opts.trust_remote_code,
+        partial_load=opts.partial_load,
+    )
+    sys.exit(0 if outcome["passed"] else 1)
+
+# Backwards-compatible alias: older callers used the name ``run_aten_e2e``.
+run_aten_e2e = run_sliced_emulator_check
+
+
+if __name__ == "__main__":
+    main()
diff --git a/aten/tests/test_plena_compiler.py b/aten/tests/test_plena_compiler.py
index a10932e..1011f4d 100644
--- a/aten/tests/test_plena_compiler.py
+++ b/aten/tests/test_plena_compiler.py
@@ -204,9 +204,9 @@ def test_rotate_half_matrix_identity():
     print("  PASS test_rotate_half_matrix_identity")
 
 
-def test_compile_hf_model_golden_vs_hf():
+def test_compile_native_hf_decoder_golden_vs_hf():
     """Golden (MXFP8+BF16) should closely match HF float32 at native dims."""
-    from compiler.aten.plena_frontend import compile_hf_model
+    from compiler.aten.plena_frontend import compile_native_hf_decoder
     from transformers import AutoModelForCausalLM
 
     model = AutoModelForCausalLM.from_pretrained(
@@ -214,7 +214,7 @@ def test_compile_hf_model_golden_vs_hf():
     )
     model.eval()
 
-    r = compile_hf_model(model, seq_len=64, num_layers=1)
+    r = compile_native_hf_decoder(model, seq_len=64, num_layers=1)
     golden = r["golden_output"]
     hf = r["hf_ground_truth"]
 
@@ -226,7 +226,7 @@ def test_compile_hf_model_golden_vs_hf():
 
     assert pct >= 95.0, f"Golden vs HF allclose {pct:.1f}% < 95%"
     assert cos.item() >= 0.99, f"Golden vs HF cosine {cos.item():.4f} < 0.99"
-    print(f"  PASS test_compile_hf_model_golden_vs_hf ({pct:.1f}% allclose, cos={cos.item():.4f})")
+    print(f"  PASS test_compile_native_hf_decoder_golden_vs_hf ({pct:.1f}% allclose, cos={cos.item():.4f})")
 
 
 def test_native_compile_assembles():
@@ -234,7 +234,7 @@ def test_native_compile_assembles():
     import os
     import tempfile
 
-    from compiler.aten.plena_frontend import compile_hf_model
+    from compiler.aten.plena_frontend import compile_native_hf_decoder
     from transformers import AutoModelForCausalLM
 
     model = AutoModelForCausalLM.from_pretrained(
@@ -242,7 +242,7 @@ def test_native_compile_assembles():
     )
     model.eval()
 
-    r = compile_hf_model(model, seq_len=64, num_layers=1)
+    r = compile_native_hf_decoder(model, seq_len=64, num_layers=1)
     isa = r["isa"]
 
     # Assemble — should not raise ValueError (u32 overflow)
@@ -290,7 +290,7 @@ def test_native_compile_assembles():
         test_fix_large_immediates_roundtrip,
         test_fix_large_immediates_preserves_relative_adds,
         test_rotate_half_matrix_identity,
-        test_compile_hf_model_golden_vs_hf,
+        test_compile_native_hf_decoder_golden_vs_hf,
         test_native_compile_assembles,
     ]
 
diff --git a/aten/tests/test_quantization_ablation.py b/aten/tests/test_quantization_ablation.py
index 13762f1..50bb0be 100644
--- a/aten/tests/test_quantization_ablation.py
+++ b/aten/tests/test_quantization_ablation.py
@@ -1,6 +1,6 @@
 """Ablation study proving HF-vs-golden accuracy gap is from MXFP8 weight quantization.
 
-Runs compile_hf_model in four precision modes and compares golden output
+Runs compile_native_hf_decoder in four precision modes and compares golden output
 against the HF float32 ground truth. Expected result:
 
     hardware       (MXFP8 + BF16)  ~52% allclose  ← full HW gap
@@ -25,13 +25,13 @@
 
 def _run_ablation(num_layers: int) -> dict[str, dict]:
     from transformers import AutoModelForCausalLM
-    from compiler.aten.plena_frontend import compile_hf_model
+    from compiler.aten.plena_frontend import compile_native_hf_decoder
 
     model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
     results = {}
 
     for mode in MODES:
-        result = compile_hf_model(
+        result = compile_native_hf_decoder(
             model, seq_len=64, num_layers=num_layers, golden_precision=mode,
         )
         golden = result["golden_output"]
diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
index a0ff701..8310520 100644
--- a/docs/ARCHITECTURE.md
+++ b/docs/ARCHITECTURE.md
@@ -26,8 +26,8 @@ PLENA_Compiler/
 |   +-- reset_reg_asm.py     #   Register reset helpers
 |
 |-- aten/                    # Pipeline 1: ATen compilation backend
-|   |-- plena_frontend.py    #   HF model -> PLENA program -> ISA text
-|   |-- e2e_runner.py        #   ATen e2e runner: PlenaCompiler -> emulator -> golden
+|   |-- plena_frontend.py    #   native HF decoder -> PLENA program -> ISA text
+|   |-- sliced_emulator_runner.py #   sliced HF weights -> emulator -> golden
 |   |-- plena/               #   Canonical PlenaCompiler implementation package
 |   |   |-- compiler.py      #     PlenaCompiler composition class
 |   |   |-- memory_state.py  #     Tensor/input/FP memory state
diff --git a/docs/ATEN_TREE.md b/docs/ATEN_TREE.md
index e5800da..4c7af08 100644
--- a/docs/ATEN_TREE.md
+++ b/docs/ATEN_TREE.md
@@ -9,8 +9,8 @@ aten/
 |-- native_ops.yaml             # Operator registry spec: signatures and dispatch targets
 |-- isa_builder.py              # Typed ISA instruction/register builder and legalization
 |-- model_extract.py            # HuggingFace model config/layer/embedding extraction helpers
-|-- plena_frontend.py           # HF decoder model -> PLENA program -> ISA text
-|-- e2e_runner.py               # HF model -> ATen compiler -> emulator -> golden check
+|-- plena_frontend.py           # native HF decoder -> PLENA program -> ISA text
+|-- sliced_emulator_runner.py   # sliced HF weights -> emulator -> golden check
 |-- reference.py                # CPU golden/reference math and MXFP/BF16 helpers
 |-- vram_stage_compare.py       # Debug tooling for VRAM stage comparisons
 |
@@ -70,8 +70,8 @@ Key points:
 
 - `aten/plena/` is the canonical compiler implementation package.
 - `aten/ops/` is the ATen-style dispatcher surface.
-- `aten/plena_frontend.py` is the HuggingFace/ATen frontend that drives model
+- `aten/plena_frontend.py` is the native HuggingFace/ATen frontend that drives model
   compilation.
-- `aten/e2e_runner.py` runs the ATen compiler path through the emulator and
+- `aten/sliced_emulator_runner.py` runs the sliced-dimension ATen compiler path through the emulator and
   golden comparison.
 - The old `aten/plena_compiler.py` compatibility facade has been removed.
diff --git a/docs/COMPILATION_PIPELINES.md b/docs/COMPILATION_PIPELINES.md
index 69e2ccc..b907f11 100644
--- a/docs/COMPILATION_PIPELINES.md
+++ b/docs/COMPILATION_PIPELINES.md
@@ -66,18 +66,18 @@ backends, and weight-handling strategies.
 | File | Role |
 |------|------|
 | `aten/plena/` | Canonical PlenaCompiler implementation package |
-| `aten/plena_frontend.py` | HuggingFace model frontend that drives ATen compilation |
+| `aten/plena_frontend.py` | Native HuggingFace decoder frontend (`compile_native_hf_decoder`) |
 | `aten/ops/plena/*.py` | Registered ATen op implementations (linear, attention, ffn, norm, conv, softmax, embedding) |
 | `aten/ops/cpu/*.py` | CPU reference fallbacks |
 | `aten/ops/registry.py` | Op dispatch registry |
-| `aten/e2e_runner.py` | E2E harness: model load -> compile -> emulate -> verify |
+| `aten/sliced_emulator_runner.py` | Sliced-dimension emulator harness: model load -> compile -> emulate -> verify |
 | `sim_env_utils/build_env.py` | Simulation environment builder |
 
 ### Entry points
 
-- **Single-layer tests**: `model_layer_test_builder.py::build_and_run_decoder_test`
-- **Full-model E2E**: `aten/e2e_runner.py::run_aten_e2e`
-- **CLI**: `python -m compiler.aten.e2e_runner <model> --seq-len 32 --num-layers 1`
+- **Sliced single-layer tests**: `sliced_layer_test_builder.py::build_and_run_sliced_decoder_layer_test`
+- **Sliced emulator CLI**: `python -m compiler.aten.sliced_emulator_runner <model> --seq-len 32 --num-layers 1`
+- **Native decoder compile**: `aten/plena_frontend.py::compile_native_hf_decoder`
 
 ### Test suite
 
diff --git a/generator/README.md b/generator/README.md
index aae7c39..2cfbc19 100644
--- a/generator/README.md
+++ b/generator/README.md
@@ -6,13 +6,16 @@ The generator path is the symbolic codegen and utilization-analysis pipeline:
 HF config -> symbolic graph -> scheduler -> ASM
 ```
 
-It is separate from the ATen e2e compiler path. For numerically verified ATen
-compilation, use:
+It is separate from the ATen compiler/emulator path. For quick numerically verified
+sim-sliced ATen checks, use:
 
 ```bash
-python -m compiler.aten.e2e_runner AICrossSim/clm-60m --seq-len 64 --num-layers 1
+python -m compiler.aten.sliced_emulator_runner AICrossSim/clm-60m --seq-len 64 --num-layers 1
 ```
 
+For native hidden/intermediate dimensions, use
+`compiler.aten.plena_frontend.compile_native_hf_decoder` from Python.
+
 Run symbolic codegen:
 
 ```bash
diff --git a/generator/tests/test_generator_e2e.py b/generator/tests/test_generator_e2e.py
index 2ef75aa..7234df4 100644
--- a/generator/tests/test_generator_e2e.py
+++ b/generator/tests/test_generator_e2e.py
@@ -512,20 +512,20 @@ def run_test_aten(
     seq_len: int = 64,
     num_layers: int = 1,
 ) -> int:
-    """Run the ATen-backed e2e pipeline (PlenaCompiler + ops.*).
+    """Run the sim-sliced ATen-backed e2e pipeline (PlenaCompiler + ops.*).
 
     Unlike ``run_test`` which uses the generator's own codegen path and has
-    numerical verification deferred, this immediately gets full numerical
-    correctness via the mature ATen compilation backend.
+    numerical verification deferred, this immediately gets emulator numerical
+    coverage through ``compiler.aten.sliced_emulator_runner`` at simulator-sliced dims.
     """
-    from compiler.aten.e2e_runner import run_aten_e2e
+    from compiler.aten.sliced_emulator_runner import run_sliced_emulator_check
 
     print("=" * 80)
     print(f"Generator e2e harness (ATen backend) — {model_id} — "
           f"seq_len={seq_len}, num_layers={num_layers}")
     print("=" * 80)
 
-    result = run_aten_e2e(
+    result = run_sliced_emulator_check(
         model_id=model_id,
         seq_len=seq_len,
         num_layers=num_layers,
@@ -550,7 +550,7 @@ def run_test_aten(
     _ap.add_argument("--num-layers", type=int, default=None,
                      help="Override num_hidden_layers (e.g. 1 for fast e2e runs, ~22x less ASM)")
     _ap.add_argument("--aten", action="store_true",
-                     help="Use ATen backend (PlenaCompiler + ops.*) instead of generator codegen")
+                     help="Use sim-sliced ATen harness instead of generator codegen")
     _args = _ap.parse_args()
     if _args.aten:
         sys.exit(run_test_aten(

From 618f000f19790bfbe22ddc8616976b85a9e90af2 Mon Sep 17 00:00:00 2001
From: booth-algo <kevinlauofficial01@gmail.com>
Date: Thu, 14 May 2026 19:47:39 +0100
Subject: [PATCH 2/2] fix: validate final layer in VRAM stage compare

---
 aten/vram_stage_compare.py | 59 +++++++++++++++++++++++++++-----------
 1 file changed, 42 insertions(+), 17 deletions(-)

diff --git a/aten/vram_stage_compare.py b/aten/vram_stage_compare.py
index f169fb3..a206868 100644
--- a/aten/vram_stage_compare.py
+++ b/aten/vram_stage_compare.py
@@ -11,8 +11,10 @@
         vram_path="transactional_emulator/vram_dump.bin",
         build_dir="/tmp/smolvlm2_1layer_f32regs",
         hidden=576, inter=1536, num_heads=9, num_kv_heads=3,
+        layer_idx=0,
     )
 """
+import re
 import struct
 import numpy as np
 import torch
@@ -47,14 +49,36 @@ def _mse(a, b):
     return ((a.float() - b.float()) ** 2).mean().item()
 
 
+def _infer_final_layer_idx(build: Path) -> int:
+    """Return the highest ``<layer>`` among the ``W_o_<layer>.pt`` files in ``build``."""
+    # The glob also accepts non-numeric suffixes, so filter with a strict regex.
+    hits = (re.fullmatch(r"W_o_(\d+)\.pt", p.name) for p in build.glob("W_o_*.pt"))
+    layer_ids = [int(hit.group(1)) for hit in hits if hit]
+    if layer_ids:
+        return max(layer_ids)
+    # Reached when no file in ``build`` carries a numeric layer suffix.
+    raise FileNotFoundError(f"No W_o_<layer>.pt files found in {build}")
+
+
+def _read_alloc_addr(asm: str, name: str) -> int | None:
+    """Return the VRAM address in ``name``'s allocation comment, or None if absent."""
+    # re.escape keeps any regex metacharacters in ``name`` literal.
+    pattern = rf"Allocate VRAM Matrix {re.escape(name)}: .*?VRAM\[(\d+)\]"
+    found = re.search(pattern, asm)
+    return None if found is None else int(found.group(1))
+
+
 def compare_stages(vram_path, build_dir, hidden, inter, num_heads, num_kv_heads,
-                   seq_len=64, mlen=64, head_dim=64, eps=1e-5, verbose=True):
+                   seq_len=64, mlen=64, head_dim=64, eps=1e-5, verbose=True,
+                   layer_idx=None):
     """Compare each pipeline stage using emulator's own VRAM intermediates.
 
     Args:
         vram_path: path to the emulator's vram_dump.bin
         build_dir: path to the build directory with weight .pt files
         hidden, inter, num_heads, num_kv_heads: model dimensions
+        layer_idx: decoder layer to validate. Defaults to the last layer found
+                   in build_dir, which is the layer that feeds the final output.
 
     Returns:
         dict of stage results with allclose percentages
@@ -63,40 +87,39 @@ def compare_stages(vram_path, build_dir, hidden, inter, num_heads, num_kv_heads,
     _to_inter = lambda x: x.to(torch.bfloat16)
     _from_inter = lambda x: x.float()
 
-    # VRAM addresses (from ISA comments — these are model-dependent)
-    # For SmolVLM2 1-layer: X=12288, scratch=233472, Q=270400, O_full=307264, O_proj=356416
-    # For clm-60m: different addresses. We compute from VRAM layout.
-    tiles = hidden // mlen
-    x_addr = 3 * mlen * mlen + 0  # after COS, SIN, mask (for native mode)
+    if layer_idx is None:
+        layer_idx = _infer_final_layer_idx(build)
+
     # Read final output address from comparison_params
     import json
     params = json.load(open(build / "comparison_params.json"))
     final_addr = params["start_row_idx"] * mlen
 
-    results = {}
+    results = {"layer_idx": layer_idx}
 
     # --- Load weights ---
-    W_o = quantize_to_mxfp(torch.load(build / "W_o_0.pt", weights_only=True))
-    W_gate = quantize_to_mxfp(torch.load(build / "W_gate_0.pt", weights_only=True))
-    W_up = quantize_to_mxfp(torch.load(build / "W_up_0.pt", weights_only=True))
-    W_down = quantize_to_mxfp(torch.load(build / "W_down_0.pt", weights_only=True))
+    W_o = quantize_to_mxfp(torch.load(build / f"W_o_{layer_idx}.pt", weights_only=True))
+    W_gate = quantize_to_mxfp(torch.load(build / f"W_gate_{layer_idx}.pt", weights_only=True))
+    W_up = quantize_to_mxfp(torch.load(build / f"W_up_{layer_idx}.pt", weights_only=True))
+    W_down = quantize_to_mxfp(torch.load(build / f"W_down_{layer_idx}.pt", weights_only=True))
 
     # --- Stage 1: O_full (attention output) ---
     # Find O_full address from ISA comments
     o_full_addr = final_addr - 2 * seq_len * hidden
-    import re
     asm_path = build / "generated_asm_code.asm"
     if asm_path.exists():
         with open(asm_path) as f:
             asm = f.read()
-        m = re.search(r'Allocate VRAM Matrix O_full_0.*?VRAM\[(\d+)\]', asm)
-        if m:
-            o_full_addr = int(m.group(1))
-        m2 = re.search(r'Allocate VRAM Matrix residual_scratch.*?VRAM\[(\d+)\]', asm)
-        scratch_addr = int(m2.group(1)) if m2 else None
+        parsed_o_full_addr = _read_alloc_addr(asm, f"O_full_{layer_idx}")
+        if parsed_o_full_addr is not None:
+            o_full_addr = parsed_o_full_addr
+        scratch_addr = _read_alloc_addr(asm, "residual_scratch")
     else:
         scratch_addr = None
 
+    if verbose:
+        print(f"  Validating layer {layer_idx}")
+
     O_full = _read_bf16_matrix(vram_path, o_full_addr, seq_len, hidden)
 
     # --- Stage 2: O_proj = O_full @ W_o ---
@@ -163,11 +186,13 @@ def compare_stages(vram_path, build_dir, hidden, inter, num_heads, num_kv_heads,
     import sys
     vram = sys.argv[1] if len(sys.argv) > 1 else "transactional_emulator/vram_dump.bin"
     build = sys.argv[2] if len(sys.argv) > 2 else "/tmp/smolvlm2_1layer_f32regs"
+    layer_idx = int(sys.argv[3]) if len(sys.argv) > 3 else None
 
     print("=== VRAM Stage Comparison ===")
     results = compare_stages(
         vram_path=vram,
         build_dir=build,
         hidden=576, inter=1536, num_heads=9, num_kv_heads=3,
+        layer_idx=layer_idx,
     )
     print(f"\nOverall: {'PASS' if results.get('norm+FFN+norm', 0) >= 99.0 else 'FAIL'}")