Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
c437316
[Docs] C++ trace pipeline design (runtime-tag pairing, ABI)
YWHyuk Jun 24, 2026
29a4c34
[TOGSim] C++ trace pipeline: front end, runtime, loader, bridge, Core…
YWHyuk Jun 24, 2026
b2d73da
[TOGSim] Per-iteration tag pairing for multi-tile-K and conv
YWHyuk Jun 24, 2026
551b2cb
[TOGSim] Work-item outlining and ABI v12 dispatch
YWHyuk Jun 24, 2026
11716ec
[TOGSim] SRAM-capacity and SA weight-buffer throttle for the trace path
YWHyuk Jun 24, 2026
f1f5ec0
[Tooling] TOGSim trace timeline (Perfetto) and the trace emits it needs
YWHyuk Jun 24, 2026
83d0f2f
[TOGSim] Make the C++ trace path the default and stabilize it
YWHyuk Jun 24, 2026
63c6d24
[TOGSim] Make the trace runtime test self-contained
YWHyuk Jun 24, 2026
1512705
[Frontend] Trace cache-safe replay and compile-race fixes
YWHyuk Jun 24, 2026
064bb27
[TOGSim] Redesign trace-bridge dependency, barrier, SRAM-version, and…
YWHyuk Jun 24, 2026
ed5c747
[Frontend] Run the spad-overflow check in timing-only mode, budget at…
YWHyuk Jun 25, 2026
4e459b9
[Frontend] Generate the trace.cpp ABI/API banner from togsim_runtime.h
YWHyuk Jun 25, 2026
df61f76
[TOGSim] Pick 1- vs 2-dispatch concurrency by per-dispatch spad footp…
YWHyuk Jun 25, 2026
f05ac8a
[Frontend] Budget fused-epilogue spad buffers honestly in GEMM tile s…
YWHyuk Jun 25, 2026
7148a48
[Frontend] Stop charging the kernel stack frame against the spad budget
YWHyuk Jun 25, 2026
4f9018b
[CI] Bump spike pin to v1.0.3
YWHyuk Jun 25, 2026
e2d5608
[Frontend] Add per-kernel CPU functional verify sub-option
YWHyuk Jun 25, 2026
8b36968
[Docs] Document per-kernel functional verify
YWHyuk Jun 25, 2026
618e4fc
[TOGSim] Drop the ABI version changelog comment
YWHyuk Jun 25, 2026
5c65b78
[Frontend] Budget fused-prologue spad buffers in BMM tile selection
YWHyuk Jun 25, 2026
c9ad81f
[Frontend] Condense BMM prologue spad-budget comments to one line
YWHyuk Jun 25, 2026
90b5560
Merge pull request #278 from PSAL-POSTECH/fix/bmm-prologue-spad-budget
YWHyuk Jun 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions AsmParser/tog_generator.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# DEPRECATED (timing path): legacy ONNX Tile-Operation-Graph producer. Builds
# the TOG and serializes it to ONNX for the C++ TileGraphParser. Superseded by
# the C++ trace pipeline (PyTorchSimFrontend/mlir/passes/build_skeleton.py +
# lower_to_emitc.py + cycle_table.py -> a compiled trace .so). Kept live so the
# current pipeline does not break; to be retired once the trace pipeline (P3+)
# stabilizes. See docs/design/togsim_cpp_trace.md.
import os
import sys
import importlib.util
Expand Down
8 changes: 8 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,12 @@ export TORCHSIM_DUMP_MLIR_IR=1
export TORCHSIM_DUMP_LLVM_IR=1
```

**To find which op a wrong result first diverges at** (per-kernel CPU cross-check;
sub-option of functional mode). Set `pytorchsim_functional_verify_per_kernel: 1`
in the config YAML, clear the codegen cache, and re-run: each compiled kernel's
output is compared to a CPU golden and the run stops at the first divergent
kernel, naming the op and offending indices. See `docs/per-kernel-functional-verify.md`.

## Key environment variables

Read in `PyTorchSimFrontend/extension_config.py`:
Expand Down Expand Up @@ -85,11 +91,13 @@ Note: `TOGSIM_CONFIG` is **overwritten** while inside a `with TOGSimulator(confi
Located under `configs/*.yml`:

- `num_cores`, `core_freq_mhz`, `num_systolic_array_per_core`
- `sa_weight_buffer_depth` (per-SA resident weight slots; **must be > 0** — the simulator errors on 0. Raise it to effectively disable the preload run-ahead throttle. Defaults to 2 if the key is absent.)
- `vpu_num_lanes`, `vpu_spad_size_kb_per_lane`, `vpu_vector_length_bits`
- `dram_type` (`ramulator2` | `simple`), `dram_channels`, `dram_freq_mhz`, `ramulator_config_path`
- `icnt_type` (`simple` | `booksim`), `icnt_latency_cycles`, `icnt_freq_mhz`, `icnt_config_path`
- `l2d_type` (e.g., `datacache`), `l2d_config` (AccelSim-format cache config string)
- `pytorchsim_functional_mode` (Spike on/off), `pytorchsim_timing_mode`
- `pytorchsim_functional_verify_per_kernel` (debug: per-kernel CPU cross-check; see `docs/per-kernel-functional-verify.md`)
- `codegen_mapping_strategy`: `heuristic` | `autotune` | `external-then-heuristic` | `external-then-autotune`
- `codegen_external_mapping_file` (key `"M_N_K"` → `{TILE_M, TILE_K, TILE_N}` JSON)
- `codegen_compiler_optimization`: `"all"` | `"none"` | a list from `{fusion, reduction_epilogue, reduction_reduction, prologue, single_batch_conv, multi_tile_conv, subtile}`
Expand Down
185 changes: 121 additions & 64 deletions PyTorchSimFrontend/extension_codecache.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import torch

from PyTorchSimFrontend import extension_config
from torch._inductor.codecache import get_hash, write
from torch._inductor.codecache import get_hash, write, write_atomic
from torch._inductor.async_compile import AsyncCompile
from AsmParser.tog_generator import tog_generator
from PyTorchSimFrontend.mlir.mlir_caller_codegen import MLIRKernelCallerCodeGen
Expand All @@ -23,6 +23,13 @@ def get_write_path(src_code):
return os.path.join(extension_config.get_dump_path(), hash_prefix(get_hash(src_code.strip())))


_HEADER_BY_HASH = {}
def store_header(src_code, spike_header, gem5_header):
_HEADER_BY_HASH[get_hash(src_code.strip())] = (spike_header, gem5_header)
def get_header(src_code):
return _HEADER_BY_HASH.get(get_hash(src_code.strip()))


def get_lock_path(write_path):
    """Build the path of the compile lock file that lives inside *write_path*.

    The lock file is named ``.compile.lock``; there is one per write_path,
    i.e. one per kernel source.
    """
    lock_file_name = ".compile.lock"
    return os.path.join(write_path, lock_file_name)
Expand Down Expand Up @@ -128,84 +135,96 @@ def load(cls, source_code,
vlen = kwargs['vlen']
vlenb = vlen // 8
write_path = get_write_path(source_code)
key, input_path = write(source_code, "mlir", specified_dir=write_path)
# Run the Python out-of-line MLIR passes (MLIR bindings) on the kernel
# .mlir in place, before mlir-opt. Currently lowers torchsim.vlane_idx
# (replaces the old C++ -global-idx pass); add more in passes/__init__.py.
os.makedirs(write_path, exist_ok=True)
global_var_header = kwargs.get("global_var_header")
if global_var_header is not None:
write_atomic(os.path.join(write_path, "global_var.h"), global_var_header)
gem5_global_var_header = kwargs.get("gem5_global_var_header")
if gem5_global_var_header is not None:
write_atomic(os.path.join(write_path, "gem5_global_var.h"), gem5_global_var_header)
# The compile rewrites the kernel .mlir in place (run_python_passes) and reads
# it back (mlir-opt). Two compiles of the same source -- the autotune's chosen
# candidate and the final kernel -- share a write_path, so hold the per-path
# lock across the whole build to keep them from interleaving, and skip the
# rebuild when a prior build already finished (its tile_graph.onnx exists).
from filelock import FileLock
from PyTorchSimFrontend.mlir.passes import (
run_python_passes, run_module_passes, POST_OPT_PASSES,
run_standard_lowering, run_tog,
)
run_python_passes(input_path, vectorlane=vectorlane_size)
new_input_path = os.path.splitext(input_path)[0]
raw_tog_path = new_input_path + "_tog.py"
tog_path = os.path.join(write_path, "tile_graph.onnx")
sample_mlir_path = new_input_path + "_sample"
validation_binary_path = os.path.join(write_path, validation_binary_name)
gem5_cmds = mlir_gem5_compile_command(new_input_path, sample_mlir_path, raw_tog_path, vectorlane_size)

from filelock import FileLock
os.makedirs(write_path, exist_ok=True)
lock = FileLock(get_lock_path(write_path), timeout=LOCK_TIMEOUT)

if spad_info is not None:
link_option = f"-Wl,--section-start=.spad=0x{spad_info['spad_vaddr']:x}"
else:
link_option = ""
# Generate LLVM kernel caller and binary for validation
if extension_config.pytorchsim_functional_mode:
with lock:
key, input_path = write(source_code, "mlir", specified_dir=write_path)
if os.path.isfile(tog_path):
return key
# Run the Python out-of-line MLIR passes (MLIR bindings) on the kernel
# .mlir in place, before mlir-opt. Currently lowers torchsim.vlane_idx
# (replaces the old C++ -global-idx pass); add more in passes/__init__.py.
run_python_passes(input_path, vectorlane=vectorlane_size)
new_input_path = os.path.splitext(input_path)[0]
raw_tog_path = new_input_path + "_tog.py"
sample_mlir_path = new_input_path + "_sample"
validation_binary_path = os.path.join(write_path, validation_binary_name)
gem5_cmds = mlir_gem5_compile_command(new_input_path, sample_mlir_path, raw_tog_path, vectorlane_size)

if spad_info is not None:
link_option = f"-Wl,--section-start=.spad=0x{spad_info['spad_vaddr']:x}"
else:
link_option = ""
# Compile a validation binary and measure its .spad section to reject
# over-spad tilings (SpadOverflowError) -- this must run even in
# timing-only / autotune (non-functional) mode, so a tiling that does not
# fit the spad is scored infeasible instead of wedging TOGSim. The Spike
# *execution* itself stays gated on functional_mode (run_spike, below).
# Use custom malloc to avoid size error
new_link_option = link_option + " -Wl,--wrap=malloc -Wl,--wrap=free"
cmds = mlir_compile_command(new_input_path, vectorlane_size, vlen=vlen)
opt_pad_cmd = shlex.split(cmds[0])
translate_cmd = shlex.split(cmds[1])
llc_cmd = shlex.split(cmds[2])
llc_asm_cmd = shlex.split(cmds[3])
with lock:
try:
# loop-padding (mlir-opt) -> Python fine-grained + vcix (one parse/print)
subprocess.check_call(opt_pad_cmd)
run_module_passes(new_input_path + "_padded.mlir",
new_input_path + "_custom.mlir",
POST_OPT_PASSES, vectorlane=vectorlane_size, vlen=vlen)
# Standard MLIR -> LLVM-dialect lowering (registered upstream
# passes) runs in-process via the bindings PassManager, picking
# up after the custom mlir-opt passes (memref-to-gemmini).
run_standard_lowering(new_input_path + "_custom.mlir", new_input_path + "_llvm.mlir")
subprocess.check_call(translate_cmd)
subprocess.check_call(llc_cmd)
subprocess.check_call(llc_asm_cmd)
except subprocess.CalledProcessError as e:
logger.error(f"Command failed with exit code {e.returncode}")
logger.error(f"Error output: {e.output.decode() if isinstance(e.output, bytes) else e.output}")
assert(0)

val_llvm_caller = MLIRKernelCallerCodeGen(extension_config.pytorchsim_functional_mode, arg_attributes)
val_llvm_caller.generate_wrapper_file(write_path, validation_wrapper_name)
val_llvm_caller.compile_wih_kernel(write_path, key, validation_wrapper_name,
validation_binary_name, new_link_option)

stack_size = val_llvm_caller.parse_stack_sizes(f"{write_path}/{key}.s", vlenb=vlenb)
spad_size = val_llvm_caller.get_spad_size(validation_binary_path)
spad_usage = stack_size + spad_size # Spad usage per lane
if extension_config.CONFIG_SPAD_INFO["spad_size"] < spad_usage:
logger.debug(
f"Scratchpad size exceeded: required {spad_usage} bytes, "
f"but only {extension_config.CONFIG_SPAD_INFO['spad_size']} bytes available."
)
raise SpadOverflowError()

# Skip if TOG file already exists
if os.path.isfile(tog_path):
return key
try:
# loop-padding (mlir-opt) -> Python fine-grained + vcix (one parse/print)
subprocess.check_call(opt_pad_cmd)
run_module_passes(new_input_path + "_padded.mlir",
new_input_path + "_custom.mlir",
POST_OPT_PASSES, vectorlane=vectorlane_size, vlen=vlen)
# Standard MLIR -> LLVM-dialect lowering (registered upstream
# passes) runs in-process via the bindings PassManager, picking
# up after the custom mlir-opt passes (memref-to-gemmini).
run_standard_lowering(new_input_path + "_custom.mlir", new_input_path + "_llvm.mlir")
subprocess.check_call(translate_cmd)
subprocess.check_call(llc_cmd)
subprocess.check_call(llc_asm_cmd)
except subprocess.CalledProcessError as e:
logger.error(f"Command failed with exit code {e.returncode}")
logger.error(f"Error output: {e.output.decode() if isinstance(e.output, bytes) else e.output}")
assert(0)

# Launch tile graph generator
gem5_pad_cmd = shlex.split(gem5_cmds[0])
gem5_translate_cmd = shlex.split(gem5_cmds[1])
gem5_llc_cmd = shlex.split(gem5_cmds[2])
val_llvm_caller = MLIRKernelCallerCodeGen(extension_config.pytorchsim_functional_mode, arg_attributes)
val_llvm_caller.generate_wrapper_file(write_path, validation_wrapper_name)
val_llvm_caller.compile_wih_kernel(write_path, key, validation_wrapper_name,
validation_binary_name, new_link_option)

# Only the .spad section consumes the scratchpad; the stack frame lives in main memory (sp in the -m region, not the scratchpad vaddr) so it is not charged against the per-lane spad budget.
spad_usage = val_llvm_caller.get_spad_size(validation_binary_path)
# Budget per dispatch = half the spad: two work-items run concurrently
# (double-buffer), so each must fit in spad/2 or they deadlock competing for
# the shared spad. Matches the GEMM tiling gate (max_spad_size = spad/2).
spad_budget = extension_config.CONFIG_SPAD_INFO["spad_size"] // 2
if spad_budget < spad_usage:
logger.debug(
f"Scratchpad size exceeded: required {spad_usage} bytes, but only "
f"{spad_budget} bytes (spad/2, double-buffer budget) available."
)
raise SpadOverflowError()

# Launch tile graph generator
gem5_pad_cmd = shlex.split(gem5_cmds[0])
gem5_translate_cmd = shlex.split(gem5_cmds[1])
gem5_llc_cmd = shlex.split(gem5_cmds[2])

lock = FileLock(get_lock_path(write_path), timeout=LOCK_TIMEOUT)
with lock:
try:
# mlir-opt now runs only loop-padding/dma-fine-grained/pytorchsim-to-vcix
# and writes the post-vcix IR. The tile-operation-graph pass is ported
Expand Down Expand Up @@ -241,8 +260,19 @@ def load(cls, source_code,
# Run cyclesim
cyclesim = CycleSimulator()
cycle_list = cyclesim.compile_and_simulate(os.path.join(write_path, cycle_binary_name), vectorlane_size, silent_mode=silent_mode)
# Snapshot for the P3-trace hook below: generate_tile_graph consumes
# cycle_list in place (cycle_list.pop(0) per tile), leaving it empty.
cycle_list_for_trace = list(cycle_list)

# Create TOG
# DEPRECATED (timing path): this ONNX-TOG producer -- run_tog ->
# tog_generator.generate_tile_graph -> ONNX -> C++ TileGraphParser --
# is being superseded by the C++ trace pipeline (build_skeleton +
# lower_to_emitc -> compiled .so, + the cycle_table sidecar). The
# per-tile cycle_list / x_offset / w_offset computed here are exactly
# what cycle_table.build_cycle_table will reuse, so both paths stay
# cycle-consistent during the transition. Kept live (pipeline must not
# break); to be retired once the trace pipeline (P3+) stabilizes.
w_offset, x_offset = vectorlane_size, vectorlane_size
if kwargs['loop_size'] is not None and kwargs['loop_size'][-3] < vectorlane_size:
x_offset = kwargs['loop_size'][-3]
Expand All @@ -258,6 +288,33 @@ def load(cls, source_code,
w_offset=w_offset, # FIXME.
vector_lane=vectorlane_size
)

# Trace pipeline (DEFAULT): emit the compiled trace producer .so + the
# cycle-table TSV from the post-vcix IR and gem5 cycle_list/offsets. This
# is the default simulation path (the C++ TOG); the legacy ONNX TOG is
# DEPRECATED, an opt-in fallback via TORCHSIM_LEGACY_TOG=1, in which case the
# .so is unused so skip emitting it. Best-effort: never breaks the compile.
if os.environ.get("TORCHSIM_LEGACY_TOG") != "1":
try:
import mlir.ir as ir
from PyTorchSimFrontend.mlir.passes import (
build_skeleton as _bs, cycle_table as _ct, lower_to_emitc as _l2e)
pv = sample_mlir_path + "_postvcix.mlir"
_ctx = ir.Context(); _ctx.allow_unregistered_dialects = True
with _ctx:
_mod = ir.Module.parse(open(pv).read(), _ctx)
_bs.build_skeleton(_mod)
_ntiles = len(_ct._compute_types(_mod))
# align lengths: gem5 gives one numCycles per compute node;
# pad with the last value / truncate if it disagrees.
_cl = list(cycle_list_for_trace)
if _cl and len(_cl) != _ntiles:
_cl = (_cl + [_cl[-1]] * _ntiles)[:_ntiles]
_tbl = _ct.build_cycle_table(_mod, _cl, x_offset, w_offset)
_ct.dump_cycle_table_tsv(_tbl, os.path.join(write_path, "trace_cycles.tsv"))
_l2e.build_trace_so(pv, os.path.join(write_path, "trace.so"))
except Exception as e:
logger.warning(f"[P3-trace] trace .so/sidecar dump skipped: {e}")
return key

class CustomAsyncCompile(AsyncCompile):
Expand Down
7 changes: 7 additions & 0 deletions PyTorchSimFrontend/extension_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,13 @@ def __getattr__(name):
return config_yaml['pytorchsim_functional_mode']
if name == "pytorchsim_timing_mode":
return config_yaml['pytorchsim_timing_mode']
# Sub-option of functional mode: per-kernel CPU cross-check. When set (and
# functional mode is on), every realized buffer produced by Spike is compared
# against a CPU golden to localize the first kernel whose value diverges.
# Auto-disabled when functional mode is off (no Spike values to verify).
if name == "pytorchsim_functional_verify_per_kernel":
return bool(config_yaml.get('pytorchsim_functional_verify_per_kernel', False)) \
and bool(config_yaml['pytorchsim_functional_mode'])

# Mapping strategy
if name == "codegen_mapping_strategy":
Expand Down
Loading
Loading