From f0c5d825e3ed0b03b03d402a37e95a4c5bb32ebd Mon Sep 17 00:00:00 2001 From: booth-algo Date: Thu, 14 May 2026 12:35:38 +0100 Subject: [PATCH 1/2] Clarify sliced emulator entry points --- aten/e2e_runner.py | 337 ++-------------------- aten/plena_frontend.py | 133 +-------- aten/sliced_emulator_runner.py | 351 +++++++++++++++++++++++ aten/tests/test_plena_compiler.py | 14 +- aten/tests/test_quantization_ablation.py | 6 +- docs/ARCHITECTURE.md | 4 +- docs/ATEN_TREE.md | 8 +- docs/COMPILATION_PIPELINES.md | 10 +- generator/README.md | 9 +- generator/tests/test_generator_e2e.py | 12 +- 10 files changed, 410 insertions(+), 474 deletions(-) create mode 100644 aten/sliced_emulator_runner.py diff --git a/aten/e2e_runner.py b/aten/e2e_runner.py index 70ffc48..7d6d4a4 100644 --- a/aten/e2e_runner.py +++ b/aten/e2e_runner.py @@ -1,322 +1,27 @@ -"""ATen-backed end-to-end runner. +"""Compatibility wrapper for the old ATen e2e runner module name. -This wraps the verified ATen compilation path: - - HuggingFace model -> PlenaCompiler + ops.* -> ISA -> emulator -> golden check - -The symbolic generator path is separate and remains under ``generator.runner -codegen``. - -Usage: - python -m compiler.aten.e2e_runner AICrossSim/clm-60m --seq-len 32 +Use ``compiler.aten.sliced_emulator_runner`` for new code. """ -import sys -import time -from pathlib import Path - -# --------------------------------------------------------------------------- -# Repo root bootstrap — mirror the same sys.path setup used by the existing -# test infrastructure so imports resolve regardless of cwd. -# --------------------------------------------------------------------------- -_COMPILER_ROOT = Path(__file__).resolve().parents[1] # PLENA_Compiler/ -_REPO_ROOT = _COMPILER_ROOT.parent -for _p in [str(_REPO_ROOT), str(_REPO_ROOT / "tools"), str(_COMPILER_ROOT)]: - if _p not in sys.path: - sys.path.insert(0, _p) - - -def run_aten_e2e( - model_id: str, - seq_len: int = 64, - num_layers: int = 1, - build_dir: str | None = None, - layer_idx: int = 0, - hidden_size: int = 64, - inter_dim: int = 128, - trust_remote_code: bool = False, - partial_load: bool = False, -) -> dict: - """Run a HF model through the ATen compilation path end-to-end. - - Steps: - 1. Load model config + layer weights from HuggingFace - 2. Build ISA via PlenaCompiler + ops.* (numerically verified path) - 3. Set up sim environment (ASM + HBM weights + FPRAM constants) - 4. Run Rust emulator - 5. Compare VRAM output against golden PyTorch reference - - Returns dict with: - passed: bool - allclose_match_rate: float (percentage) - max_error: float - mae: float - mse: float - elapsed_s: float (wall-clock seconds) - model_id: str - layer_idx: int - num_layers: int - seq_len: int - hidden_size: int - inter_dim: int - build_dir: str - """ - from transactional_emulator.testbench.emulator_runner import compare_emulator_output - from transactional_emulator.testbench.model_layer_test_builder import ( - build_and_run_decoder_test, - build_and_run_multi_layer_test, - get_model_dims, - slice_dims_for_sim, - ) - - t0 = time.time() - - # Resolve build directory - if build_dir is None: - safe_name = model_id.replace("/", "_") - build_dir = str( - Path("/tmp") / f"aten_e2e_{safe_name}_sl{seq_len}_l{layer_idx}" - ) - build_path = Path(build_dir) - - # ------------------------------------------------------------------ - # [1/5] Probe model config - # ------------------------------------------------------------------ - print(f"[1/5] Probing model config: {model_id}") - try: - full_dims = get_model_dims(model_id) - except (OSError, ConnectionError) as exc: - print(f"[SKIP] HuggingFace model '{model_id}' unavailable: {exc}") - return { - "passed": False, - "error": str(exc), - "model_id": model_id, - } - sim_dims = slice_dims_for_sim(full_dims, hidden_slice=hidden_size, inter_slice=inter_dim) - print(f" Full dims: hidden={full_dims.hidden_size}, inter={full_dims.inter_dim}, " - f"heads={full_dims.num_heads}, kv_heads={full_dims.num_kv_heads}, head_dim={full_dims.head_dim}") - print(f" Sim dims: hidden={sim_dims.hidden_size}, inter={sim_dims.inter_dim}") - - # ------------------------------------------------------------------ - # [2/5] Build ISA + golden reference + sim env via build_and_run_decoder_test - # - # We call the proven function directly — it handles: - # - Weight loading + slicing - # - PlenaCompiler ISA generation - # - create_sim_env + create_mem_for_sim - # - Golden reference computation - # - Emulator execution + comparison - # - # For multi-layer: iterate layers (each is independent at sim scale). - # ------------------------------------------------------------------ - results_per_layer = [] - - if num_layers == 1: - # Single layer: use proven single-layer path (with RoPE) - current_layer = layer_idx - asm_name = f"aten_{model_id.split('/')[-1]}_l{current_layer}" - layer_build = build_path / f"layer_{current_layer}" - - print(f"\n[2/5] Building ISA for layer {current_layer} via PlenaCompiler + ops.*") - print(f"[3/5] Setting up sim environment: {layer_build}") - print("[4/5] Running Rust transactional emulator") - - extra_kwargs = {} - if trust_remote_code: - extra_kwargs["trust_remote_code"] = True - if partial_load: - extra_kwargs["partial_load"] = True - - try: - build_and_run_decoder_test( - model_id=model_id, - asm_name=asm_name, - build_dir=layer_build, - layer_idx=current_layer, - seq_len=seq_len, - hidden_size=hidden_size, - inter_dim=inter_dim, - **extra_kwargs, - ) - comp_results, _comp_params = compare_emulator_output(layer_build) - results_per_layer.append({ - "layer": current_layer, - "passed": True, - "allclose_match_rate": comp_results["allclose_match_rate"], - "max_error": comp_results["max_error"], - "mae": comp_results["mae"], - "mse": comp_results["mse"], - }) - except SystemExit as e: - if e.code == 0: - return { - "passed": False, - "error": "HuggingFace model unavailable (skipped)", - "model_id": model_id, - } - try: - comp_results, _comp_params = compare_emulator_output(layer_build) - results_per_layer.append({ - "layer": current_layer, - "passed": False, - "allclose_match_rate": comp_results["allclose_match_rate"], - "max_error": comp_results["max_error"], - "mae": comp_results["mae"], - "mse": comp_results["mse"], - }) - except Exception: - results_per_layer.append({ - "layer": current_layer, - "passed": False, - "error": f"Emulator comparison failed after exit code {e.code}", - }) - else: - # Multi-layer: chain N layers with residual connections (no RoPE) - asm_name = f"aten_{model_id.split('/')[-1]}_chain{num_layers}" - chain_build = build_path / f"chain_{num_layers}layers" - - print(f"\n[2/5] Building chained {num_layers}-layer ISA via PlenaCompiler + ops.*") - print(f"[3/5] Setting up sim environment: {chain_build}") - print("[4/5] Running Rust transactional emulator") - - extra_kwargs = {} - if trust_remote_code: - extra_kwargs["trust_remote_code"] = True - if partial_load: - extra_kwargs["partial_load"] = True - - try: - build_and_run_multi_layer_test( - model_id=model_id, - asm_name=asm_name, - build_dir=chain_build, - num_layers=num_layers, - layer_idx_start=layer_idx, - seq_len=seq_len, - hidden_size=hidden_size, - inter_dim=inter_dim, - **extra_kwargs, - ) - comp_results, _comp_params = compare_emulator_output(chain_build) - results_per_layer.append({ - "layer": f"chain_{num_layers}", - "passed": True, - "allclose_match_rate": comp_results["allclose_match_rate"], - "max_error": comp_results["max_error"], - "mae": comp_results["mae"], - "mse": comp_results["mse"], - }) - except SystemExit as e: - if e.code == 0: - return { - "passed": False, - "error": "HuggingFace model unavailable (skipped)", - "model_id": model_id, - } - try: - comp_results, _comp_params = compare_emulator_output(chain_build) - results_per_layer.append({ - "layer": f"chain_{num_layers}", - "passed": False, - "allclose_match_rate": comp_results["allclose_match_rate"], - "max_error": comp_results["max_error"], - "mae": comp_results["mae"], - "mse": comp_results["mse"], - }) - except Exception: - results_per_layer.append({ - "layer": f"chain_{num_layers}", - "passed": False, - "error": f"Emulator comparison failed after exit code {e.code}", - }) - - elapsed = time.time() - t0 - - # ------------------------------------------------------------------ - # [5/5] Aggregate results - # ------------------------------------------------------------------ - print(f"\n[5/5] Results summary ({elapsed:.1f}s elapsed)") - all_passed = all(r.get("passed", False) for r in results_per_layer) - - # Use first layer's metrics for the top-level result - first = results_per_layer[0] if results_per_layer else {} - - summary = { - "passed": all_passed, - "allclose_match_rate": first.get("allclose_match_rate", 0.0), - "max_error": first.get("max_error", float("inf")), - "mae": first.get("mae", float("inf")), - "mse": first.get("mse", float("inf")), - "elapsed_s": elapsed, - "model_id": model_id, - "layer_idx": layer_idx, - "num_layers": num_layers, - "seq_len": seq_len, - "hidden_size": hidden_size, - "inter_dim": inter_dim, - "build_dir": str(build_path), - "layers": results_per_layer, - } - - for r in results_per_layer: - status = "PASS" if r.get("passed") else "FAIL" - match = r.get("allclose_match_rate", "N/A") - if isinstance(match, float): - match = f"{match:.2f}%" - print(f" Layer {r.get('layer', '?')}: [{status}] allclose={match}") - - if all_passed: - print(f"\n[ATen e2e PASSED] {model_id} — {num_layers} layer(s), " - f"allclose={first.get('allclose_match_rate', 0):.2f}%") - else: - print(f"\n[ATen e2e FAILED] {model_id} — see per-layer results above") - - return summary - - -# --------------------------------------------------------------------------- -# CLI entry point -# --------------------------------------------------------------------------- -def main(): - import argparse - - parser = argparse.ArgumentParser( - description="Run HF model through ATen compilation path (PlenaCompiler + ops.*)", - prog="python -m compiler.aten.e2e_runner", - ) - parser.add_argument("model_id", help="HuggingFace model ID (e.g. AICrossSim/clm-60m)") - parser.add_argument("--seq-len", type=int, default=64, - help="Sequence length (default: 64)") - parser.add_argument("--num-layers", type=int, default=1, - help="Number of decoder layers to test (default: 1)") - parser.add_argument("--layer-idx", type=int, default=0, - help="Starting layer index (default: 0)") - parser.add_argument("--hidden-size", type=int, default=64, - help="Hidden dimension clipped to sim limits (default: 64)") - parser.add_argument("--inter-dim", type=int, default=128, - help="FFN intermediate dimension clipped to sim limits (default: 128)") - parser.add_argument("--build-dir", type=str, default=None, - help="Build directory for sim artifacts (default: /tmp/aten_e2e_...)") - parser.add_argument("--trust-remote-code", action="store_true", - help="Trust remote code for HF model loading") - parser.add_argument("--partial-load", action="store_true", - help="Load only needed weight shards (for large models)") - - args = parser.parse_args() - - result = run_aten_e2e( - model_id=args.model_id, - seq_len=args.seq_len, - num_layers=args.num_layers, - build_dir=args.build_dir, - layer_idx=args.layer_idx, - hidden_size=args.hidden_size, - inter_dim=args.inter_dim, - trust_remote_code=args.trust_remote_code, - partial_load=args.partial_load, - ) - - sys.exit(0 if result["passed"] else 1) +from compiler.aten import sliced_emulator_runner as _impl + +for _name, _value in vars(_impl).items(): + if _name not in { + "__builtins__", + "__cached__", + "__doc__", + "__file__", + "__loader__", + "__name__", + "__package__", + "__spec__", + }: + globals()[_name] = _value + +for _compat_local in ("_impl", "_name", "_value"): + globals().pop(_compat_local, None) + +__all__ = [name for name in globals() if not name.startswith("__")] if __name__ == "__main__": diff --git a/aten/plena_frontend.py b/aten/plena_frontend.py index 47c584d..2242fc7 100644 --- a/aten/plena_frontend.py +++ b/aten/plena_frontend.py @@ -31,6 +31,7 @@ "_ksplit_matmul", "_make_rotate_half_matrix", "compile_hf_model", + "compile_native_hf_decoder", "quantize_to_mxfp", ] @@ -262,7 +263,7 @@ def _register_layer_inputs(prog, layer_idx: int, weights: LayerWeights) -> Layer # --------------------------------------------------------------------------- # Main compilation function # --------------------------------------------------------------------------- -def compile_hf_model( +def compile_native_hf_decoder( model, seq_len: int = 64, num_layers: int | None = None, @@ -273,7 +274,7 @@ def compile_hf_model( golden_precision: str = "hardware", verbose: bool = False, ) -> dict: - """Compile a HuggingFace decoder model to PLENA ISA and simulation metadata.""" + """Compile a HuggingFace decoder model at native dimensions to PLENA ISA metadata.""" def _verbose(message: str = ""): if verbose: print(message) @@ -541,129 +542,5 @@ def _verbose(message: str = ""): "info": info, "golden_precision": golden_precision, } -<<<<<<< Updated upstream -======= - - -# --------------------------------------------------------------------------- -# Convenience: compile + run emulator + compare -# --------------------------------------------------------------------------- -def compile_and_run( - model, - build_dir, - **kwargs, -) -> dict: - """Compile, run emulator, and compare against golden. - - Convenience wrapper that calls compile_hf_model, sets up simulation - environment, runs the Rust transactional emulator, and compares output. - - Args: - model: nn.Module (HF CausalLM model, already loaded) - build_dir: Directory for simulation artifacts - **kwargs: Forwarded to compile_hf_model (seq_len, hidden_size, etc.) - - Returns: - dict with compilation info + comparison results including - 'allclose_match_rate' percentage. - """ - from transactional_emulator.tools.create_sim_env import create_sim_env - from sim_env_utils.build_env import create_mem_for_sim - from transactional_emulator.testbench.emulator_runner import ( - run_and_assert, - compare_emulator_output, - ) - - result = compile_hf_model(model, **kwargs) - build_dir = Path(build_dir) - build_dir.mkdir(parents=True, exist_ok=True) - - mlen = kwargs.get("mlen", 64) - blen = kwargs.get("blen", 4) - asm_name = f"model_{result['info']['model_type']}_{result['info']['num_layers']}L" - - # Write sim env files - create_sim_env( - result["input_tensors"], - result["isa"], - {"original_output": result["golden_output"]}, - result["fp_preload"], - build_dir=str(build_dir), - ) - - create_mem_for_sim( - data_size=256, - mode="behave_sim", - asm=asm_name, - data=None, - specified_data_order=result["data_order"], - build_path=build_dir, - ) - - with open(build_dir / "comparison_params.json", "w") as f: - json.dump(result["comparison_params"], f, indent=2) - - with open(build_dir / "generated_asm_code.asm", "w") as f: - f.write(result["isa"]) - - print(f"\nSimulation environment created: {build_dir}") - print(f" Result location: VRAM row {result['comparison_params']['start_row_idx']}") - print(f" Layers: {result['info']['num_layers']}, data_order: {result['data_order']}") - - # Run emulator and compare (don't exit on failure — VRAM stage comparison follows) - from transactional_emulator.testbench.emulator_runner import update_plena_config, run_emulator - update_plena_config(vlen=mlen, mlen=mlen, blen=blen, verbose=False) - print("\n--- Running Rust transactional emulator ---") - run_emulator(build_dir) - - print("\n--- Comparing emulator output vs golden ---") - comp_results, _params = compare_emulator_output(build_dir) - from transactional_emulator.tools.check_mem import print_comparison_results - print_comparison_results(comp_results, verbose=True, comparison_params=_params) - - if comp_results["allclose_pass"]: - print(f"\n[ATen-style {asm_name} test PASSED - ISA generated + emulator verified]") - else: - print(f"\n[ATen-style {asm_name} test FAILED - emulator numerical check failed]") - - # Three-way comparison - golden = result["golden_output"] - hf_gt = result["hf_ground_truth"] - print("\n--- Three-way comparison ---") - if hf_gt is not None and golden is not None: - # HF float32 vs golden (MXFP8 + BF16) - n = min(hf_gt.numel(), golden.numel()) - allclose_hf_vs_gold = ( - torch.isclose(hf_gt.float().flatten()[:n], - golden.float().flatten()[:n], atol=1e-2) - .float().mean().item() * 100 - ) - print(f" HF float32 vs golden (MXFP8+BF16): {allclose_hf_vs_gold:.2f}% allclose") - # Emulator vs golden: reported by compare_emulator_output - emu_match = comp_results.get("allclose_match_rate", None) - if emu_match is not None: - print(f" Emulator vs golden (MXFP8+BF16): {emu_match:.2f}% allclose") - - # VRAM stage comparison: validates each pipeline segment using - # emulator's own intermediates as golden input (immune to accumulation drift) - try: - from compiler.aten.vram_stage_compare import compare_stages - emulator_dir = Path(__file__).parent.parent.parent / "transactional_emulator" - vram_path = str(emulator_dir / "vram_dump.bin") - print("\n--- VRAM stage comparison (authoritative) ---") - stage_results = compare_stages( - vram_path=vram_path, - build_dir=str(build_dir), - hidden=result["info"]["hidden_size"], - inter=result["info"].get("inter_dim", result["info"]["hidden_size"] * 4), - num_heads=result["info"]["num_heads"], - num_kv_heads=result["info"]["num_kv_heads"], - ) - stage_pass = stage_results.get("norm+FFN+norm", 0) >= 99.0 - comp_results["vram_stage_allclose"] = stage_results.get("norm+FFN+norm", None) - comp_results["vram_stage_pass"] = stage_pass - except Exception as e: - print(f" (skipped: {e})") - - return {**result["info"], **comp_results} ->>>>>>> Stashed changes +# Backwards-compatible alias for older callers. +compile_hf_model = compile_native_hf_decoder diff --git a/aten/sliced_emulator_runner.py b/aten/sliced_emulator_runner.py new file mode 100644 index 0000000..1abb36f --- /dev/null +++ b/aten/sliced_emulator_runner.py @@ -0,0 +1,351 @@ +"""ATen-backed sliced-dimension emulator runner. + +This wraps the simulator-scale testbench path: + + HuggingFace model -> PlenaCompiler + ops.* -> ISA -> emulator -> golden check + +It delegates to ``transactional_emulator.testbench.sliced_layer_test_builder`` +and defaults to sliced dimensions (hidden=64, inter=128). This is the right +entry point for quick emulator checks that use real weights but simulator-sized +tensors. + +It is not the native full-decoder frontend. For native hidden/intermediate +dimensions, use ``compiler.aten.plena_frontend.compile_native_hf_decoder``. +The symbolic generator path is separate and remains under ``generator.runner +codegen``. + +Usage: + python -m compiler.aten.sliced_emulator_runner AICrossSim/clm-60m --seq-len 32 +""" + +import sys +import time +from pathlib import Path + +# --------------------------------------------------------------------------- +# Repo root bootstrap — mirror the same sys.path setup used by the existing +# test infrastructure so imports resolve regardless of cwd. +# --------------------------------------------------------------------------- +_COMPILER_ROOT = Path(__file__).resolve().parents[1] # PLENA_Compiler/ +_REPO_ROOT = _COMPILER_ROOT.parent +for _p in [str(_REPO_ROOT), str(_REPO_ROOT / "tools"), str(_COMPILER_ROOT)]: + if _p not in sys.path: + sys.path.insert(0, _p) + + +def run_sliced_emulator_check( + model_id: str, + seq_len: int = 64, + num_layers: int = 1, + build_dir: str | None = None, + layer_idx: int = 0, + hidden_size: int = 64, + inter_dim: int = 128, + trust_remote_code: bool = False, + partial_load: bool = False, +) -> dict: + """Run a HF model through the sliced ATen emulator path. + + Steps: + 1. Load model config + layer weights from HuggingFace + 2. Build ISA via PlenaCompiler + ops.* at simulator-scale dimensions + 3. Set up sim environment (ASM + HBM weights + FPRAM constants) + 4. Run Rust emulator + 5. Compare VRAM output against golden PyTorch reference + + Returns dict with: + passed: bool + allclose_match_rate: float (percentage) + max_error: float + mae: float + mse: float + elapsed_s: float (wall-clock seconds) + model_id: str + layer_idx: int + num_layers: int + seq_len: int + hidden_size: int + inter_dim: int + build_dir: str + """ + from transactional_emulator.testbench.emulator_runner import compare_emulator_output + from transactional_emulator.testbench.sliced_layer_test_builder import ( + build_and_run_sliced_decoder_chain_test, + build_and_run_sliced_decoder_layer_test, + get_model_dims, + slice_dims_for_sim, + ) + + t0 = time.time() + + # Resolve build directory + if build_dir is None: + safe_name = model_id.replace("/", "_") + build_dir = str( + Path("/tmp") / f"aten_sliced_{safe_name}_sl{seq_len}_l{layer_idx}" + ) + build_path = Path(build_dir) + + # ------------------------------------------------------------------ + # [1/5] Probe model config + # ------------------------------------------------------------------ + print(f"[1/5] Probing model config: {model_id}") + try: + full_dims = get_model_dims(model_id) + except (OSError, ConnectionError) as exc: + print(f"[SKIP] HuggingFace model '{model_id}' unavailable: {exc}") + return { + "passed": False, + "error": str(exc), + "model_id": model_id, + } + sim_dims = slice_dims_for_sim(full_dims, hidden_slice=hidden_size, inter_slice=inter_dim) + print(f" Full dims: hidden={full_dims.hidden_size}, inter={full_dims.inter_dim}, " + f"heads={full_dims.num_heads}, kv_heads={full_dims.num_kv_heads}, head_dim={full_dims.head_dim}") + print(f" Sim dims: hidden={sim_dims.hidden_size}, inter={sim_dims.inter_dim}") + + if hidden_size > full_dims.head_dim: + error = ( + "aten.sliced_emulator_runner uses the sliced_layer_test_builder single-head/sim-sliced " + f"harness. Requested hidden_size={hidden_size}, but the model head_dim is " + f"{full_dims.head_dim}; native multi-head dimensions must go through " + "compiler.aten.plena_frontend.compile_native_hf_decoder." + ) + print(f"[ERROR] {error}") + return { + "passed": False, + "error": error, + "model_id": model_id, + "seq_len": seq_len, + "num_layers": num_layers, + "hidden_size": hidden_size, + "inter_dim": inter_dim, + "build_dir": str(build_path), + } + + # ------------------------------------------------------------------ + # [2/5] Build ISA + golden reference + sim env via build_and_run_sliced_decoder_layer_test + # + # We call the proven function directly — it handles: + # - Weight loading + slicing + # - PlenaCompiler ISA generation + # - create_sim_env + create_mem_for_sim + # - Golden reference computation + # - Emulator execution + comparison + # + # For multi-layer: iterate layers (each is independent at sim scale). + # ------------------------------------------------------------------ + results_per_layer = [] + + if num_layers == 1: + # Single layer: use proven single-layer path (with RoPE) + current_layer = layer_idx + asm_name = f"aten_{model_id.split('/')[-1]}_l{current_layer}" + layer_build = build_path / f"layer_{current_layer}" + + print(f"\n[2/5] Building ISA for layer {current_layer} via PlenaCompiler + ops.*") + print(f"[3/5] Setting up sim environment: {layer_build}") + print("[4/5] Running Rust transactional emulator") + + extra_kwargs = {} + if trust_remote_code: + extra_kwargs["trust_remote_code"] = True + if partial_load: + extra_kwargs["partial_load"] = True + + try: + build_and_run_sliced_decoder_layer_test( + model_id=model_id, + asm_name=asm_name, + build_dir=layer_build, + layer_idx=current_layer, + seq_len=seq_len, + hidden_size=hidden_size, + inter_dim=inter_dim, + **extra_kwargs, + ) + comp_results, _comp_params = compare_emulator_output(layer_build) + results_per_layer.append({ + "layer": current_layer, + "passed": True, + "allclose_match_rate": comp_results["allclose_match_rate"], + "max_error": comp_results["max_error"], + "mae": comp_results["mae"], + "mse": comp_results["mse"], + }) + except SystemExit as e: + if e.code == 0: + return { + "passed": False, + "error": "HuggingFace model unavailable (skipped)", + "model_id": model_id, + } + try: + comp_results, _comp_params = compare_emulator_output(layer_build) + results_per_layer.append({ + "layer": current_layer, + "passed": False, + "allclose_match_rate": comp_results["allclose_match_rate"], + "max_error": comp_results["max_error"], + "mae": comp_results["mae"], + "mse": comp_results["mse"], + }) + except Exception: + results_per_layer.append({ + "layer": current_layer, + "passed": False, + "error": f"Emulator comparison failed after exit code {e.code}", + }) + else: + # Multi-layer: chain N layers with residual connections (no RoPE) + asm_name = f"aten_{model_id.split('/')[-1]}_chain{num_layers}" + chain_build = build_path / f"chain_{num_layers}layers" + + print(f"\n[2/5] Building chained {num_layers}-layer ISA via PlenaCompiler + ops.*") + print(f"[3/5] Setting up sim environment: {chain_build}") + print("[4/5] Running Rust transactional emulator") + + extra_kwargs = {} + if trust_remote_code: + extra_kwargs["trust_remote_code"] = True + if partial_load: + extra_kwargs["partial_load"] = True + + try: + build_and_run_sliced_decoder_chain_test( + model_id=model_id, + asm_name=asm_name, + build_dir=chain_build, + num_layers=num_layers, + layer_idx_start=layer_idx, + seq_len=seq_len, + hidden_size=hidden_size, + inter_dim=inter_dim, + **extra_kwargs, + ) + comp_results, _comp_params = compare_emulator_output(chain_build) + results_per_layer.append({ + "layer": f"chain_{num_layers}", + "passed": True, + "allclose_match_rate": comp_results["allclose_match_rate"], + "max_error": comp_results["max_error"], + "mae": comp_results["mae"], + "mse": comp_results["mse"], + }) + except SystemExit as e: + if e.code == 0: + return { + "passed": False, + "error": "HuggingFace model unavailable (skipped)", + "model_id": model_id, + } + try: + comp_results, _comp_params = compare_emulator_output(chain_build) + results_per_layer.append({ + "layer": f"chain_{num_layers}", + "passed": False, + "allclose_match_rate": comp_results["allclose_match_rate"], + "max_error": comp_results["max_error"], + "mae": comp_results["mae"], + "mse": comp_results["mse"], + }) + except Exception: + results_per_layer.append({ + "layer": f"chain_{num_layers}", + "passed": False, + "error": f"Emulator comparison failed after exit code {e.code}", + }) + + elapsed = time.time() - t0 + + # ------------------------------------------------------------------ + # [5/5] Aggregate results + # ------------------------------------------------------------------ + print(f"\n[5/5] Results summary ({elapsed:.1f}s elapsed)") + all_passed = all(r.get("passed", False) for r in results_per_layer) + + # Use first layer's metrics for the top-level result + first = results_per_layer[0] if results_per_layer else {} + + summary = { + "passed": all_passed, + "allclose_match_rate": first.get("allclose_match_rate", 0.0), + "max_error": first.get("max_error", float("inf")), + "mae": first.get("mae", float("inf")), + "mse": first.get("mse", float("inf")), + "elapsed_s": elapsed, + "model_id": model_id, + "layer_idx": layer_idx, + "num_layers": num_layers, + "seq_len": seq_len, + "hidden_size": hidden_size, + "inter_dim": inter_dim, + "build_dir": str(build_path), + "layers": results_per_layer, + } + + for r in results_per_layer: + status = "PASS" if r.get("passed") else "FAIL" + match = r.get("allclose_match_rate", "N/A") + if isinstance(match, float): + match = f"{match:.2f}%" + print(f" Layer {r.get('layer', '?')}: [{status}] allclose={match}") + + if all_passed: + print(f"\n[Sliced emulator PASSED] {model_id} — {num_layers} layer(s), " + f"allclose={first.get('allclose_match_rate', 0):.2f}%") + else: + print(f"\n[Sliced emulator FAILED] {model_id} — see per-layer results above") + + return summary + + +# --------------------------------------------------------------------------- +# CLI entry point +# --------------------------------------------------------------------------- +def main(): + import argparse + + parser = argparse.ArgumentParser( + description="Run HF model through the sim-sliced ATen emulator harness", + prog="python -m compiler.aten.sliced_emulator_runner", + ) + parser.add_argument("model_id", help="HuggingFace model ID (e.g. AICrossSim/clm-60m)") + parser.add_argument("--seq-len", type=int, default=64, + help="Sequence length (default: 64)") + parser.add_argument("--num-layers", type=int, default=1, + help="Number of decoder layers to test (default: 1)") + parser.add_argument("--layer-idx", type=int, default=0, + help="Starting layer index (default: 0)") + parser.add_argument("--hidden-size", type=int, default=64, + help="Sliced hidden dimension for this harness (default: 64)") + parser.add_argument("--inter-dim", type=int, default=128, + help="FFN intermediate dimension clipped to sim limits (default: 128)") + parser.add_argument("--build-dir", type=str, default=None, + help="Build directory for sim artifacts (default: /tmp/aten_sliced_...)") + parser.add_argument("--trust-remote-code", action="store_true", + help="Trust remote code for HF model loading") + parser.add_argument("--partial-load", action="store_true", + help="Load only needed weight shards (for large models)") + + args = parser.parse_args() + + result = run_sliced_emulator_check( + model_id=args.model_id, + seq_len=args.seq_len, + num_layers=args.num_layers, + build_dir=args.build_dir, + layer_idx=args.layer_idx, + hidden_size=args.hidden_size, + inter_dim=args.inter_dim, + trust_remote_code=args.trust_remote_code, + partial_load=args.partial_load, + ) + sys.exit(0 if result["passed"] else 1) + +# Backwards-compatible alias for older callers. +run_aten_e2e = run_sliced_emulator_check + + +if __name__ == "__main__": + main() diff --git a/aten/tests/test_plena_compiler.py b/aten/tests/test_plena_compiler.py index a10932e..1011f4d 100644 --- a/aten/tests/test_plena_compiler.py +++ b/aten/tests/test_plena_compiler.py @@ -204,9 +204,9 @@ def test_rotate_half_matrix_identity(): print(" PASS test_rotate_half_matrix_identity") -def test_compile_hf_model_golden_vs_hf(): +def test_compile_native_hf_decoder_golden_vs_hf(): """Golden (MXFP8+BF16) should closely match HF float32 at native dims.""" - from compiler.aten.plena_frontend import compile_hf_model + from compiler.aten.plena_frontend import compile_native_hf_decoder from transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained( @@ -214,7 +214,7 @@ def test_compile_hf_model_golden_vs_hf(): ) model.eval() - r = compile_hf_model(model, seq_len=64, num_layers=1) + r = compile_native_hf_decoder(model, seq_len=64, num_layers=1) golden = r["golden_output"] hf = r["hf_ground_truth"] @@ -226,7 +226,7 @@ def test_compile_hf_model_golden_vs_hf(): assert pct >= 95.0, f"Golden vs HF allclose {pct:.1f}% < 95%" assert cos.item() >= 0.99, f"Golden vs HF cosine {cos.item():.4f} < 0.99" - print(f" PASS test_compile_hf_model_golden_vs_hf ({pct:.1f}% allclose, cos={cos.item():.4f})") + print(f" PASS test_compile_native_hf_decoder_golden_vs_hf ({pct:.1f}% allclose, cos={cos.item():.4f})") def test_native_compile_assembles(): @@ -234,7 +234,7 @@ def test_native_compile_assembles(): import os import tempfile - from compiler.aten.plena_frontend import compile_hf_model + from compiler.aten.plena_frontend import compile_native_hf_decoder from transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained( @@ -242,7 +242,7 @@ def test_native_compile_assembles(): ) model.eval() - r = compile_hf_model(model, seq_len=64, num_layers=1) + r = compile_native_hf_decoder(model, seq_len=64, num_layers=1) isa = r["isa"] # Assemble — should not raise ValueError (u32 overflow) @@ -290,7 +290,7 @@ def test_native_compile_assembles(): test_fix_large_immediates_roundtrip, test_fix_large_immediates_preserves_relative_adds, test_rotate_half_matrix_identity, - test_compile_hf_model_golden_vs_hf, + test_compile_native_hf_decoder_golden_vs_hf, test_native_compile_assembles, ] diff --git a/aten/tests/test_quantization_ablation.py b/aten/tests/test_quantization_ablation.py index 13762f1..50bb0be 100644 --- a/aten/tests/test_quantization_ablation.py +++ b/aten/tests/test_quantization_ablation.py @@ -1,6 +1,6 @@ """Ablation study proving HF-vs-golden accuracy gap is from MXFP8 weight quantization. -Runs compile_hf_model in four precision modes and compares golden output +Runs compile_native_hf_decoder in four precision modes and compares golden output against the HF float32 ground truth. Expected result: hardware (MXFP8 + BF16) ~52% allclose ← full HW gap @@ -25,13 +25,13 @@ def _run_ablation(num_layers: int) -> dict[str, dict]: from transformers import AutoModelForCausalLM - from compiler.aten.plena_frontend import compile_hf_model + from compiler.aten.plena_frontend import compile_native_hf_decoder model = AutoModelForCausalLM.from_pretrained(MODEL_ID) results = {} for mode in MODES: - result = compile_hf_model( + result = compile_native_hf_decoder( model, seq_len=64, num_layers=num_layers, golden_precision=mode, ) golden = result["golden_output"] diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index a0ff701..8310520 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -26,8 +26,8 @@ PLENA_Compiler/ | +-- reset_reg_asm.py # Register reset helpers | |-- aten/ # Pipeline 1: ATen compilation backend -| |-- plena_frontend.py # HF model -> PLENA program -> ISA text -| |-- e2e_runner.py # ATen e2e runner: PlenaCompiler -> emulator -> golden +| |-- plena_frontend.py # native HF decoder -> PLENA program -> ISA text +| |-- sliced_emulator_runner.py # sliced HF weights -> emulator -> golden | |-- plena/ # Canonical PlenaCompiler implementation package | | |-- compiler.py # PlenaCompiler composition class | | |-- memory_state.py # Tensor/input/FP memory state diff --git a/docs/ATEN_TREE.md b/docs/ATEN_TREE.md index e5800da..4c7af08 100644 --- a/docs/ATEN_TREE.md +++ b/docs/ATEN_TREE.md @@ -9,8 +9,8 @@ aten/ |-- native_ops.yaml # Operator registry spec: signatures and dispatch targets |-- isa_builder.py # Typed ISA instruction/register builder and legalization |-- model_extract.py # HuggingFace model config/layer/embedding extraction helpers -|-- plena_frontend.py # HF decoder model -> PLENA program -> ISA text -|-- e2e_runner.py # HF model -> ATen compiler -> emulator -> golden check +|-- plena_frontend.py # native HF decoder -> PLENA program -> ISA text +|-- sliced_emulator_runner.py # sliced HF weights -> emulator -> golden check |-- reference.py # CPU golden/reference math and MXFP/BF16 helpers |-- vram_stage_compare.py # Debug tooling for VRAM stage comparisons | @@ -70,8 +70,8 @@ Key points: - `aten/plena/` is the canonical compiler implementation package. - `aten/ops/` is the ATen-style dispatcher surface. -- `aten/plena_frontend.py` is the HuggingFace/ATen frontend that drives model +- `aten/plena_frontend.py` is the native HuggingFace/ATen frontend that drives model compilation. -- `aten/e2e_runner.py` runs the ATen compiler path through the emulator and +- `aten/sliced_emulator_runner.py` runs the sliced-dimension ATen compiler path through the emulator and golden comparison. - The old `aten/plena_compiler.py` compatibility facade has been removed. diff --git a/docs/COMPILATION_PIPELINES.md b/docs/COMPILATION_PIPELINES.md index 69e2ccc..b907f11 100644 --- a/docs/COMPILATION_PIPELINES.md +++ b/docs/COMPILATION_PIPELINES.md @@ -66,18 +66,18 @@ backends, and weight-handling strategies. | File | Role | |------|------| | `aten/plena/` | Canonical PlenaCompiler implementation package | -| `aten/plena_frontend.py` | HuggingFace model frontend that drives ATen compilation | +| `aten/plena_frontend.py` | Native HuggingFace decoder frontend (`compile_native_hf_decoder`) | | `aten/ops/plena/*.py` | Registered ATen op implementations (linear, attention, ffn, norm, conv, softmax, embedding) | | `aten/ops/cpu/*.py` | CPU reference fallbacks | | `aten/ops/registry.py` | Op dispatch registry | -| `aten/e2e_runner.py` | E2E harness: model load -> compile -> emulate -> verify | +| `aten/sliced_emulator_runner.py` | Sliced-dimension emulator harness: model load -> compile -> emulate -> verify | | `sim_env_utils/build_env.py` | Simulation environment builder | ### Entry points -- **Single-layer tests**: `model_layer_test_builder.py::build_and_run_decoder_test` -- **Full-model E2E**: `aten/e2e_runner.py::run_aten_e2e` -- **CLI**: `python -m compiler.aten.e2e_runner --seq-len 32 --num-layers 1` +- **Sliced single-layer tests**: `sliced_layer_test_builder.py::build_and_run_sliced_decoder_layer_test` +- **Sliced emulator CLI**: `python -m compiler.aten.sliced_emulator_runner --seq-len 32 --num-layers 1` +- **Native decoder compile**: `aten/plena_frontend.py::compile_native_hf_decoder` ### Test suite diff --git a/generator/README.md b/generator/README.md index aae7c39..2cfbc19 100644 --- a/generator/README.md +++ b/generator/README.md @@ -6,13 +6,16 @@ The generator path is the symbolic codegen and utilization-analysis pipeline: HF config -> symbolic graph -> scheduler -> ASM ``` -It is separate from the ATen e2e compiler path. For numerically verified ATen -compilation, use: +It is separate from the ATen compiler/emulator path. For quick numerically verified +sim-sliced ATen checks, use: ```bash -python -m compiler.aten.e2e_runner AICrossSim/clm-60m --seq-len 64 --num-layers 1 +python -m compiler.aten.sliced_emulator_runner AICrossSim/clm-60m --seq-len 64 --num-layers 1 ``` +For native hidden/intermediate dimensions, use +`compiler.aten.plena_frontend.compile_native_hf_decoder` from Python. + Run symbolic codegen: ```bash diff --git a/generator/tests/test_generator_e2e.py b/generator/tests/test_generator_e2e.py index 2ef75aa..7234df4 100644 --- a/generator/tests/test_generator_e2e.py +++ b/generator/tests/test_generator_e2e.py @@ -512,20 +512,20 @@ def run_test_aten( seq_len: int = 64, num_layers: int = 1, ) -> int: - """Run the ATen-backed e2e pipeline (PlenaCompiler + ops.*). + """Run the sim-sliced ATen-backed e2e pipeline (PlenaCompiler + ops.*). Unlike ``run_test`` which uses the generator's own codegen path and has - numerical verification deferred, this immediately gets full numerical - correctness via the mature ATen compilation backend. + numerical verification deferred, this immediately gets emulator numerical + coverage through ``compiler.aten.sliced_emulator_runner`` at simulator-sliced dims. """ - from compiler.aten.e2e_runner import run_aten_e2e + from compiler.aten.sliced_emulator_runner import run_sliced_emulator_check print("=" * 80) print(f"Generator e2e harness (ATen backend) — {model_id} — " f"seq_len={seq_len}, num_layers={num_layers}") print("=" * 80) - result = run_aten_e2e( + result = run_sliced_emulator_check( model_id=model_id, seq_len=seq_len, num_layers=num_layers, @@ -550,7 +550,7 @@ def run_test_aten( _ap.add_argument("--num-layers", type=int, default=None, help="Override num_hidden_layers (e.g. 1 for fast e2e runs, ~22x less ASM)") _ap.add_argument("--aten", action="store_true", - help="Use ATen backend (PlenaCompiler + ops.*) instead of generator codegen") + help="Use sim-sliced ATen harness instead of generator codegen") _args = _ap.parse_args() if _args.aten: sys.exit(run_test_aten( From 618f000f19790bfbe22ddc8616976b85a9e90af2 Mon Sep 17 00:00:00 2001 From: booth-algo Date: Thu, 14 May 2026 19:47:39 +0100 Subject: [PATCH 2/2] fix: validate final layer in VRAM stage compare --- aten/vram_stage_compare.py | 59 +++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 17 deletions(-) diff --git a/aten/vram_stage_compare.py b/aten/vram_stage_compare.py index f169fb3..a206868 100644 --- a/aten/vram_stage_compare.py +++ b/aten/vram_stage_compare.py @@ -11,8 +11,10 @@ vram_path="transactional_emulator/vram_dump.bin", build_dir="/tmp/smolvlm2_1layer_f32regs", hidden=576, inter=1536, num_heads=9, num_kv_heads=3, + layer_idx=0, ) """ +import re import struct import numpy as np import torch @@ -47,14 +49,36 @@ def _mse(a, b): return ((a.float() - b.float()) ** 2).mean().item() +def _infer_final_layer_idx(build: Path) -> int: + indices = [] + for path in build.glob("W_o_*.pt"): + match = re.fullmatch(r"W_o_(\d+)\.pt", path.name) + if match: + indices.append(int(match.group(1))) + if not indices: + raise FileNotFoundError(f"No W_o_.pt files found in {build}") + return max(indices) + + +def _read_alloc_addr(asm: str, name: str) -> int | None: + match = re.search( + rf"Allocate VRAM Matrix {re.escape(name)}: .*?VRAM\[(\d+)\]", + asm, + ) + return int(match.group(1)) if match else None + + def compare_stages(vram_path, build_dir, hidden, inter, num_heads, num_kv_heads, - seq_len=64, mlen=64, head_dim=64, eps=1e-5, verbose=True): + seq_len=64, mlen=64, head_dim=64, eps=1e-5, verbose=True, + layer_idx=None): """Compare each pipeline stage using emulator's own VRAM intermediates. Args: vram_path: path to the emulator's vram_dump.bin build_dir: path to the build directory with weight .pt files hidden, inter, num_heads, num_kv_heads: model dimensions + layer_idx: decoder layer to validate. Defaults to the last layer found + in build_dir, which is the layer that feeds the final output. Returns: dict of stage results with allclose percentages @@ -63,40 +87,39 @@ def compare_stages(vram_path, build_dir, hidden, inter, num_heads, num_kv_heads, _to_inter = lambda x: x.to(torch.bfloat16) _from_inter = lambda x: x.float() - # VRAM addresses (from ISA comments — these are model-dependent) - # For SmolVLM2 1-layer: X=12288, scratch=233472, Q=270400, O_full=307264, O_proj=356416 - # For clm-60m: different addresses. We compute from VRAM layout. - tiles = hidden // mlen - x_addr = 3 * mlen * mlen + 0 # after COS, SIN, mask (for native mode) + if layer_idx is None: + layer_idx = _infer_final_layer_idx(build) + # Read final output address from comparison_params import json params = json.load(open(build / "comparison_params.json")) final_addr = params["start_row_idx"] * mlen - results = {} + results = {"layer_idx": layer_idx} # --- Load weights --- - W_o = quantize_to_mxfp(torch.load(build / "W_o_0.pt", weights_only=True)) - W_gate = quantize_to_mxfp(torch.load(build / "W_gate_0.pt", weights_only=True)) - W_up = quantize_to_mxfp(torch.load(build / "W_up_0.pt", weights_only=True)) - W_down = quantize_to_mxfp(torch.load(build / "W_down_0.pt", weights_only=True)) + W_o = quantize_to_mxfp(torch.load(build / f"W_o_{layer_idx}.pt", weights_only=True)) + W_gate = quantize_to_mxfp(torch.load(build / f"W_gate_{layer_idx}.pt", weights_only=True)) + W_up = quantize_to_mxfp(torch.load(build / f"W_up_{layer_idx}.pt", weights_only=True)) + W_down = quantize_to_mxfp(torch.load(build / f"W_down_{layer_idx}.pt", weights_only=True)) # --- Stage 1: O_full (attention output) --- # Find O_full address from ISA comments o_full_addr = final_addr - 2 * seq_len * hidden - import re asm_path = build / "generated_asm_code.asm" if asm_path.exists(): with open(asm_path) as f: asm = f.read() - m = re.search(r'Allocate VRAM Matrix O_full_0.*?VRAM\[(\d+)\]', asm) - if m: - o_full_addr = int(m.group(1)) - m2 = re.search(r'Allocate VRAM Matrix residual_scratch.*?VRAM\[(\d+)\]', asm) - scratch_addr = int(m2.group(1)) if m2 else None + parsed_o_full_addr = _read_alloc_addr(asm, f"O_full_{layer_idx}") + if parsed_o_full_addr is not None: + o_full_addr = parsed_o_full_addr + scratch_addr = _read_alloc_addr(asm, "residual_scratch") else: scratch_addr = None + if verbose: + print(f" Validating layer {layer_idx}") + O_full = _read_bf16_matrix(vram_path, o_full_addr, seq_len, hidden) # --- Stage 2: O_proj = O_full @ W_o --- @@ -163,11 +186,13 @@ def compare_stages(vram_path, build_dir, hidden, inter, num_heads, num_kv_heads, import sys vram = sys.argv[1] if len(sys.argv) > 1 else "transactional_emulator/vram_dump.bin" build = sys.argv[2] if len(sys.argv) > 2 else "/tmp/smolvlm2_1layer_f32regs" + layer_idx = int(sys.argv[3]) if len(sys.argv) > 3 else None print("=== VRAM Stage Comparison ===") results = compare_stages( vram_path=vram, build_dir=build, hidden=576, inter=1536, num_heads=9, num_kv_heads=3, + layer_idx=layer_idx, ) print(f"\nOverall: {'PASS' if results.get('norm+FFN+norm', 0) >= 99.0 else 'FAIL'}")