PSAL-POSTECH · YWHyuk · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026
diff --git a/AsmParser/tog_generator.py b/AsmParser/tog_generator.py
@@ -1,3 +1,9 @@
+# DEPRECATED (timing path): legacy ONNX Tile-Operation-Graph producer. Builds
+# the TOG and serializes it to ONNX for the C++ TileGraphParser. Superseded by
+# the C++ trace pipeline (PyTorchSimFrontend/mlir/passes/build_skeleton.py +
+# lower_to_emitc.py + cycle_table.py -> a compiled trace .so). Kept live so the
+# current pipeline does not break; to be retired once the trace pipeline (P3+)
+# stabilizes. See docs/design/togsim_cpp_trace.md.
 import os
 import sys
 import importlib.util

diff --git a/CLAUDE.md b/CLAUDE.md
@@ -58,6 +58,12 @@ export TORCHSIM_DUMP_MLIR_IR=1
 export TORCHSIM_DUMP_LLVM_IR=1
 ```
 
+**To find the first op at which a wrong result diverges** (per-kernel CPU cross-check;
+a sub-option of functional mode, so `pytorchsim_functional_mode` must be on). Set
+`pytorchsim_functional_verify_per_kernel: 1` in the config YAML, clear the codegen cache, and re-run. Each compiled kernel's
+output is compared to a CPU golden, and the run stops at the first divergent
+kernel, naming the op and the offending indices. See `docs/per-kernel-functional-verify.md`.
+
 ## Key environment variables
 
 Read in `PyTorchSimFrontend/extension_config.py`:
@@ -85,11 +91,13 @@ Note: `TOGSIM_CONFIG` is **overwritten** while inside a `with TOGSimulator(confi
 Located under `configs/*.yml`:
 
 - `num_cores`, `core_freq_mhz`, `num_systolic_array_per_core`
+- `sa_weight_buffer_depth` (per-SA resident weight slots; **must be > 0** — the simulator errors on 0. Raise it to effectively disable the preload run-ahead throttle. Defaults to 2 if the key is absent.)
 - `vpu_num_lanes`, `vpu_spad_size_kb_per_lane`, `vpu_vector_length_bits`
 - `dram_type` (`ramulator2` | `simple`), `dram_channels`, `dram_freq_mhz`, `ramulator_config_path`
 - `icnt_type` (`simple` | `booksim`), `icnt_latency_cycles`, `icnt_freq_mhz`, `icnt_config_path`
 - `l2d_type` (e.g., `datacache`), `l2d_config` (AccelSim-format cache config string)
 - `pytorchsim_functional_mode` (Spike on/off), `pytorchsim_timing_mode`
+- `pytorchsim_functional_verify_per_kernel` (debug: per-kernel CPU cross-check; see `docs/per-kernel-functional-verify.md`)
 - `codegen_mapping_strategy`: `heuristic` | `autotune` | `external-then-heuristic` | `external-then-autotune`
 - `codegen_external_mapping_file` (key `"M_N_K"` → `{TILE_M, TILE_K, TILE_N}` JSON)
 - `codegen_compiler_optimization`: `"all"` | `"none"` | a list from `{fusion, reduction_epilogue, reduction_reduction, prologue, single_batch_conv, multi_tile_conv, subtile}`

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
@@ -5,7 +5,7 @@
 import torch
 
 from PyTorchSimFrontend import extension_config
-from torch._inductor.codecache import get_hash, write
+from torch._inductor.codecache import get_hash, write, write_atomic
 from torch._inductor.async_compile import AsyncCompile
 from AsmParser.tog_generator import tog_generator
 from PyTorchSimFrontend.mlir.mlir_caller_codegen import MLIRKernelCallerCodeGen
@@ -23,6 +23,13 @@ def get_write_path(src_code):
     return os.path.join(extension_config.get_dump_path(), hash_prefix(get_hash(src_code.strip())))
 
 
+_HEADER_BY_HASH = {}  # module-level map: get_hash(stripped source) -> (spike_header, gem5_header)
+def store_header(src_code, spike_header, gem5_header):  # record both headers under the hash of the stripped source
+    _HEADER_BY_HASH[get_hash(src_code.strip())] = (spike_header, gem5_header)  # a later store for the same source overwrites the entry
+def get_header(src_code):  # look up the headers recorded for this source (keyed by hash of the stripped text)
+    return _HEADER_BY_HASH.get(get_hash(src_code.strip()))  # (spike_header, gem5_header) tuple, or None if nothing was stored
+
+
 def get_lock_path(write_path):
     """Return lock file path for the given write_path (per-source_code lock)."""
     return os.path.join(write_path, ".compile.lock")
@@ -128,84 +135,96 @@ def load(cls, source_code,
         vlen = kwargs['vlen']
         vlenb = vlen // 8
         write_path = get_write_path(source_code)
-        key, input_path = write(source_code, "mlir", specified_dir=write_path)
-        # Run the Python out-of-line MLIR passes (MLIR bindings) on the kernel
-        # .mlir in place, before mlir-opt. Currently lowers torchsim.vlane_idx
-        # (replaces the old C++ -global-idx pass); add more in passes/__init__.py.
+        os.makedirs(write_path, exist_ok=True)
+        global_var_header = kwargs.get("global_var_header")
+        if global_var_header is not None:
+            write_atomic(os.path.join(write_path, "global_var.h"), global_var_header)
+        gem5_global_var_header = kwargs.get("gem5_global_var_header")
+        if gem5_global_var_header is not None:
+            write_atomic(os.path.join(write_path, "gem5_global_var.h"), gem5_global_var_header)
+        # The compile rewrites the kernel .mlir in place (run_python_passes) and reads
+        # it back (mlir-opt). Two compiles of the same source -- the autotune's chosen
+        # candidate and the final kernel -- share a write_path, so hold the per-path
+        # lock across the whole build to keep them from interleaving, and skip the
+        # rebuild when a prior build already finished (its tile_graph.onnx exists).
+        from filelock import FileLock
         from PyTorchSimFrontend.mlir.passes import (
             run_python_passes, run_module_passes, POST_OPT_PASSES,
             run_standard_lowering, run_tog,
         )
-        run_python_passes(input_path, vectorlane=vectorlane_size)
-        new_input_path = os.path.splitext(input_path)[0]
-        raw_tog_path = new_input_path + "_tog.py"
         tog_path = os.path.join(write_path, "tile_graph.onnx")
-        sample_mlir_path = new_input_path + "_sample"
-        validation_binary_path = os.path.join(write_path, validation_binary_name)
-        gem5_cmds = mlir_gem5_compile_command(new_input_path, sample_mlir_path, raw_tog_path, vectorlane_size)
-
-        from filelock import FileLock
-        os.makedirs(write_path, exist_ok=True)
         lock = FileLock(get_lock_path(write_path), timeout=LOCK_TIMEOUT)
-
-        if spad_info is not None:
-            link_option = f"-Wl,--section-start=.spad=0x{spad_info['spad_vaddr']:x}"
-        else:
-            link_option = ""
-        # Generate LLVM kernel calller and binary for validation
-        if extension_config.pytorchsim_functional_mode:
+        with lock:
+            key, input_path = write(source_code, "mlir", specified_dir=write_path)
+            if os.path.isfile(tog_path):
+                return key
+            # Run the Python out-of-line MLIR passes (MLIR bindings) on the kernel
+            # .mlir in place, before mlir-opt. Currently lowers torchsim.vlane_idx
+            # (replaces the old C++ -global-idx pass); add more in passes/__init__.py.
+            run_python_passes(input_path, vectorlane=vectorlane_size)
+            new_input_path = os.path.splitext(input_path)[0]
+            raw_tog_path = new_input_path + "_tog.py"
+            sample_mlir_path = new_input_path + "_sample"
+            validation_binary_path = os.path.join(write_path, validation_binary_name)
+            gem5_cmds = mlir_gem5_compile_command(new_input_path, sample_mlir_path, raw_tog_path, vectorlane_size)
+
+            if spad_info is not None:
+                link_option = f"-Wl,--section-start=.spad=0x{spad_info['spad_vaddr']:x}"
+            else:
+                link_option = ""
+            # Compile a validation binary and measure its .spad section to reject
+            # over-spad tilings (SpadOverflowError) -- this must run even in
+            # timing-only / autotune (non-functional) mode, so a tiling that does not
+            # fit the spad is scored infeasible instead of wedging TOGSim. The Spike
+            # *execution* itself stays gated on functional_mode (run_spike, below).
             # Use custom malloc to avoid size error
             new_link_option = link_option + " -Wl,--wrap=malloc -Wl,--wrap=free"
             cmds = mlir_compile_command(new_input_path, vectorlane_size, vlen=vlen)
             opt_pad_cmd = shlex.split(cmds[0])
             translate_cmd = shlex.split(cmds[1])
             llc_cmd = shlex.split(cmds[2])
             llc_asm_cmd = shlex.split(cmds[3])
-            with lock:
-                try:
-                    # loop-padding (mlir-opt) -> Python fine-grained + vcix (one parse/print)
-                    subprocess.check_call(opt_pad_cmd)
-                    run_module_passes(new_input_path + "_padded.mlir",
-                                      new_input_path + "_custom.mlir",
-                                      POST_OPT_PASSES, vectorlane=vectorlane_size, vlen=vlen)
-                    # Standard MLIR -> LLVM-dialect lowering (registered upstream
-                    # passes) runs in-process via the bindings PassManager, picking
-                    # up after the custom mlir-opt passes (memref-to-gemmini).
-                    run_standard_lowering(new_input_path + "_custom.mlir", new_input_path + "_llvm.mlir")
-                    subprocess.check_call(translate_cmd)
-                    subprocess.check_call(llc_cmd)
-                    subprocess.check_call(llc_asm_cmd)
-                except subprocess.CalledProcessError as e:
-                    logger.error(f"Command failed with exit code {e.returncode}")
-                    logger.error(f"Error output: {e.output.decode() if isinstance(e.output, bytes) else e.output}")
-                    assert(0)
-
-                val_llvm_caller = MLIRKernelCallerCodeGen(extension_config.pytorchsim_functional_mode, arg_attributes)
-                val_llvm_caller.generate_wrapper_file(write_path, validation_wrapper_name)
-                val_llvm_caller.compile_wih_kernel(write_path, key, validation_wrapper_name,
-                                                   validation_binary_name, new_link_option)
-
-                stack_size = val_llvm_caller.parse_stack_sizes(f"{write_path}/{key}.s", vlenb=vlenb)
-                spad_size =  val_llvm_caller.get_spad_size(validation_binary_path)
-                spad_usage = stack_size + spad_size # Spad usage per lane
-                if extension_config.CONFIG_SPAD_INFO["spad_size"] < spad_usage:
-                    logger.debug(
-                        f"Scratchpad size exceeded: required {spad_usage} bytes, "
-                        f"but only {extension_config.CONFIG_SPAD_INFO['spad_size']} bytes available."
-                    )
-                    raise SpadOverflowError()
-
-        # Skip if TOG file already exists
-        if os.path.isfile(tog_path):
-            return key
+            try:
+                # loop-padding (mlir-opt) -> Python fine-grained + vcix (one parse/print)
+                subprocess.check_call(opt_pad_cmd)
+                run_module_passes(new_input_path + "_padded.mlir",
+                                  new_input_path + "_custom.mlir",
+                                  POST_OPT_PASSES, vectorlane=vectorlane_size, vlen=vlen)
+                # Standard MLIR -> LLVM-dialect lowering (registered upstream
+                # passes) runs in-process via the bindings PassManager, picking
+                # up after the custom mlir-opt passes (memref-to-gemmini).
+                run_standard_lowering(new_input_path + "_custom.mlir", new_input_path + "_llvm.mlir")
+                subprocess.check_call(translate_cmd)
+                subprocess.check_call(llc_cmd)
+                subprocess.check_call(llc_asm_cmd)
+            except subprocess.CalledProcessError as e:
+                logger.error(f"Command failed with exit code {e.returncode}")
+                logger.error(f"Error output: {e.output.decode() if isinstance(e.output, bytes) else e.output}")
+                assert(0)
 
-        # Launch tile graph generator
-        gem5_pad_cmd = shlex.split(gem5_cmds[0])
-        gem5_translate_cmd = shlex.split(gem5_cmds[1])
-        gem5_llc_cmd = shlex.split(gem5_cmds[2])
+            val_llvm_caller = MLIRKernelCallerCodeGen(extension_config.pytorchsim_functional_mode, arg_attributes)
+            val_llvm_caller.generate_wrapper_file(write_path, validation_wrapper_name)
+            val_llvm_caller.compile_wih_kernel(write_path, key, validation_wrapper_name,
+                                               validation_binary_name, new_link_option)
+
+            # Only the .spad section consumes the scratchpad; the stack frame lives in main memory (sp in the -m region, not the scratchpad vaddr) so it is not charged against the per-lane spad budget.
+            spad_usage = val_llvm_caller.get_spad_size(validation_binary_path)
+            # Budget per dispatch = half the spad: two work-items run concurrently
+            # (double-buffer), so each must fit in spad/2 or they deadlock competing for
+            # the shared spad. Matches the GEMM tiling gate (max_spad_size = spad/2).
+            spad_budget = extension_config.CONFIG_SPAD_INFO["spad_size"] // 2
+            if spad_budget < spad_usage:
+                logger.debug(
+                    f"Scratchpad size exceeded: required {spad_usage} bytes, but only "
+                    f"{spad_budget} bytes (spad/2, double-buffer budget) available."
+                )
+                raise SpadOverflowError()
+
+            # Launch tile graph generator
+            gem5_pad_cmd = shlex.split(gem5_cmds[0])
+            gem5_translate_cmd = shlex.split(gem5_cmds[1])
+            gem5_llc_cmd = shlex.split(gem5_cmds[2])
 
-        lock = FileLock(get_lock_path(write_path), timeout=LOCK_TIMEOUT)
-        with lock:
             try:
                 # mlir-opt now runs only loop-padding/dma-fine-grained/pytorchsim-to-vcix
                 # and writes the post-vcix IR. The tile-operation-graph pass is ported
@@ -241,8 +260,19 @@ def load(cls, source_code,
             # Run cyclesim
             cyclesim = CycleSimulator()
             cycle_list = cyclesim.compile_and_simulate(os.path.join(write_path, cycle_binary_name), vectorlane_size, silent_mode=silent_mode)
+            # Snapshot for the P3-trace hook below: generate_tile_graph consumes
+            # cycle_list in place (cycle_list.pop(0) per tile), leaving it empty.
+            cycle_list_for_trace = list(cycle_list)
 
             # Create TOG
+            # DEPRECATED (timing path): this ONNX-TOG producer -- run_tog ->
+            # tog_generator.generate_tile_graph -> ONNX -> C++ TileGraphParser --
+            # is being superseded by the C++ trace pipeline (build_skeleton +
+            # lower_to_emitc -> compiled .so, + the cycle_table sidecar). The
+            # per-tile cycle_list / x_offset / w_offset computed here are exactly
+            # what cycle_table.build_cycle_table will reuse, so both paths stay
+            # cycle-consistent during the transition. Kept live (pipeline must not
+            # break); to be retired once the trace pipeline (P3+) stabilizes.
             w_offset, x_offset = vectorlane_size, vectorlane_size
             if kwargs['loop_size'] is not None and kwargs['loop_size'][-3] < vectorlane_size:
                 x_offset = kwargs['loop_size'][-3]
@@ -258,6 +288,33 @@ def load(cls, source_code,
                 w_offset=w_offset, # FIXME.
                 vector_lane=vectorlane_size
             )
+
+            # Trace pipeline (DEFAULT): emit the compiled trace producer .so + the
+            # cycle-table TSV from the post-vcix IR and gem5 cycle_list/offsets. This
+            # is the default simulation path (the C++ TOG). The legacy ONNX TOG is
+            # DEPRECATED and only an opt-in fallback (TORCHSIM_LEGACY_TOG=1); in that case
+            # the .so is unused, so it is not emitted. Best-effort: never breaks the compile.
+            if os.environ.get("TORCHSIM_LEGACY_TOG") != "1":
+                try:
+                    import mlir.ir as ir
+                    from PyTorchSimFrontend.mlir.passes import (
+                        build_skeleton as _bs, cycle_table as _ct, lower_to_emitc as _l2e)
+                    pv = sample_mlir_path + "_postvcix.mlir"
+                    _ctx = ir.Context(); _ctx.allow_unregistered_dialects = True
+                    with _ctx:
+                        _mod = ir.Module.parse(open(pv).read(), _ctx)
+                        _bs.build_skeleton(_mod)
+                        _ntiles = len(_ct._compute_types(_mod))
+                        # align lengths: gem5 gives one numCycles per compute node;
+                        # pad with the last value / truncate if the count differs from _ntiles.
+                        _cl = list(cycle_list_for_trace)
+                        if _cl and len(_cl) != _ntiles:
+                            _cl = (_cl + [_cl[-1]] * _ntiles)[:_ntiles]
+                        _tbl = _ct.build_cycle_table(_mod, _cl, x_offset, w_offset)
+                    _ct.dump_cycle_table_tsv(_tbl, os.path.join(write_path, "trace_cycles.tsv"))
+                    _l2e.build_trace_so(pv, os.path.join(write_path, "trace.so"))
+                except Exception as e:
+                    logger.warning(f"[P3-trace] trace .so/sidecar dump skipped: {e}")
         return key
 
 class CustomAsyncCompile(AsyncCompile):

diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
@@ -57,6 +57,13 @@ def __getattr__(name):
         return config_yaml['pytorchsim_functional_mode']
     if name == "pytorchsim_timing_mode":
         return config_yaml['pytorchsim_timing_mode']
+    # Sub-option of functional mode: per-kernel CPU cross-check. When set (and
+    # functional mode is on), every realized buffer produced by Spike is compared
+    # against a CPU golden to localize the first kernel whose value diverges.
+    # Auto-disabled when functional mode is off (no Spike values to verify).
+    if name == "pytorchsim_functional_verify_per_kernel":
+        return bool(config_yaml.get('pytorchsim_functional_verify_per_kernel', False)) \
+            and bool(config_yaml['pytorchsim_functional_mode'])
 
     # Mapping strategy
     if name == "codegen_mapping_strategy":