Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions AsmParser/tog_generator.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# DEPRECATED (timing path): legacy ONNX Tile-Operation-Graph producer. Builds
# the TOG and serializes it to ONNX for the C++ TileGraphParser. Superseded by
# the C++ trace pipeline (PyTorchSimFrontend/mlir/passes/build_skeleton.py +
# lower_to_emitc.py + cycle_table.py -> a compiled trace .so). Kept live so the
# current pipeline does not break; to be retired once the trace pipeline (P3+)
# stabilizes. See docs/design/togsim_cpp_trace.md.
import os
import sys
import importlib.util
Expand Down
1 change: 1 addition & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ Note: `TOGSIM_CONFIG` is **overwritten** while inside a `with TOGSimulator(confi
Located under `configs/*.yml`:

- `num_cores`, `core_freq_mhz`, `num_systolic_array_per_core`
- `sa_weight_buffer_depth` (per-SA resident weight slots; **must be > 0** — the simulator errors on 0. Raise it to effectively disable the preload run-ahead throttle. Defaults to 2 if the key is absent.)
- `vpu_num_lanes`, `vpu_spad_size_kb_per_lane`, `vpu_vector_length_bits`
- `dram_type` (`ramulator2` | `simple`), `dram_channels`, `dram_freq_mhz`, `ramulator_config_path`
- `icnt_type` (`simple` | `booksim`), `icnt_latency_cycles`, `icnt_freq_mhz`, `icnt_config_path`
Expand Down
129 changes: 90 additions & 39 deletions PyTorchSimFrontend/extension_codecache.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import torch

from PyTorchSimFrontend import extension_config
from torch._inductor.codecache import get_hash, write
from torch._inductor.codecache import get_hash, write, write_atomic
from torch._inductor.async_compile import AsyncCompile
from AsmParser.tog_generator import tog_generator
from PyTorchSimFrontend.mlir.mlir_caller_codegen import MLIRKernelCallerCodeGen
Expand All @@ -23,6 +23,13 @@ def get_write_path(src_code):
return os.path.join(extension_config.get_dump_path(), hash_prefix(get_hash(src_code.strip())))


_HEADER_BY_HASH = {}
def store_header(src_code, spike_header, gem5_header):
_HEADER_BY_HASH[get_hash(src_code.strip())] = (spike_header, gem5_header)
def get_header(src_code):
return _HEADER_BY_HASH.get(get_hash(src_code.strip()))


def get_lock_path(write_path):
    """Build the path of the compile lock file located inside ``write_path``.

    There is one lock file per write_path, i.e. one lock per source code.
    """
    lock_file_name = ".compile.lock"
    return os.path.join(write_path, lock_file_name)
Expand Down Expand Up @@ -128,40 +135,52 @@ def load(cls, source_code,
vlen = kwargs['vlen']
vlenb = vlen // 8
write_path = get_write_path(source_code)
key, input_path = write(source_code, "mlir", specified_dir=write_path)
# Run the Python out-of-line MLIR passes (MLIR bindings) on the kernel
# .mlir in place, before mlir-opt. Currently lowers torchsim.vlane_idx
# (replaces the old C++ -global-idx pass); add more in passes/__init__.py.
os.makedirs(write_path, exist_ok=True)
global_var_header = kwargs.get("global_var_header")
if global_var_header is not None:
write_atomic(os.path.join(write_path, "global_var.h"), global_var_header)
gem5_global_var_header = kwargs.get("gem5_global_var_header")
if gem5_global_var_header is not None:
write_atomic(os.path.join(write_path, "gem5_global_var.h"), gem5_global_var_header)
# The compile rewrites the kernel .mlir in place (run_python_passes) and reads
# it back (mlir-opt). Two compiles of the same source -- the autotune's chosen
# candidate and the final kernel -- share a write_path, so hold the per-path
# lock across the whole build to keep them from interleaving, and skip the
# rebuild when a prior build already finished (its tile_graph.onnx exists).
from filelock import FileLock
from PyTorchSimFrontend.mlir.passes import (
run_python_passes, run_module_passes, POST_OPT_PASSES,
run_standard_lowering, run_tog,
)
run_python_passes(input_path, vectorlane=vectorlane_size)
new_input_path = os.path.splitext(input_path)[0]
raw_tog_path = new_input_path + "_tog.py"
tog_path = os.path.join(write_path, "tile_graph.onnx")
sample_mlir_path = new_input_path + "_sample"
validation_binary_path = os.path.join(write_path, validation_binary_name)
gem5_cmds = mlir_gem5_compile_command(new_input_path, sample_mlir_path, raw_tog_path, vectorlane_size)

from filelock import FileLock
os.makedirs(write_path, exist_ok=True)
lock = FileLock(get_lock_path(write_path), timeout=LOCK_TIMEOUT)

if spad_info is not None:
link_option = f"-Wl,--section-start=.spad=0x{spad_info['spad_vaddr']:x}"
else:
link_option = ""
# Generate LLVM kernel caller and binary for validation
if extension_config.pytorchsim_functional_mode:
# Use custom malloc to avoid size error
new_link_option = link_option + " -Wl,--wrap=malloc -Wl,--wrap=free"
cmds = mlir_compile_command(new_input_path, vectorlane_size, vlen=vlen)
opt_pad_cmd = shlex.split(cmds[0])
translate_cmd = shlex.split(cmds[1])
llc_cmd = shlex.split(cmds[2])
llc_asm_cmd = shlex.split(cmds[3])
with lock:
with lock:
key, input_path = write(source_code, "mlir", specified_dir=write_path)
if os.path.isfile(tog_path):
return key
# Run the Python out-of-line MLIR passes (MLIR bindings) on the kernel
# .mlir in place, before mlir-opt. Currently lowers torchsim.vlane_idx
# (replaces the old C++ -global-idx pass); add more in passes/__init__.py.
run_python_passes(input_path, vectorlane=vectorlane_size)
new_input_path = os.path.splitext(input_path)[0]
raw_tog_path = new_input_path + "_tog.py"
sample_mlir_path = new_input_path + "_sample"
validation_binary_path = os.path.join(write_path, validation_binary_name)
gem5_cmds = mlir_gem5_compile_command(new_input_path, sample_mlir_path, raw_tog_path, vectorlane_size)

if spad_info is not None:
link_option = f"-Wl,--section-start=.spad=0x{spad_info['spad_vaddr']:x}"
else:
link_option = ""
# Generate LLVM kernel caller and binary for validation
if extension_config.pytorchsim_functional_mode:
# Use custom malloc to avoid size error
new_link_option = link_option + " -Wl,--wrap=malloc -Wl,--wrap=free"
cmds = mlir_compile_command(new_input_path, vectorlane_size, vlen=vlen)
opt_pad_cmd = shlex.split(cmds[0])
translate_cmd = shlex.split(cmds[1])
llc_cmd = shlex.split(cmds[2])
llc_asm_cmd = shlex.split(cmds[3])
try:
# loop-padding (mlir-opt) -> Python fine-grained + vcix (one parse/print)
subprocess.check_call(opt_pad_cmd)
Expand Down Expand Up @@ -195,17 +214,11 @@ def load(cls, source_code,
)
raise SpadOverflowError()

# Skip if TOG file already exists
if os.path.isfile(tog_path):
return key

# Launch tile graph generator
gem5_pad_cmd = shlex.split(gem5_cmds[0])
gem5_translate_cmd = shlex.split(gem5_cmds[1])
gem5_llc_cmd = shlex.split(gem5_cmds[2])
# Launch tile graph generator
gem5_pad_cmd = shlex.split(gem5_cmds[0])
gem5_translate_cmd = shlex.split(gem5_cmds[1])
gem5_llc_cmd = shlex.split(gem5_cmds[2])

lock = FileLock(get_lock_path(write_path), timeout=LOCK_TIMEOUT)
with lock:
try:
# mlir-opt now runs only loop-padding/dma-fine-grained/pytorchsim-to-vcix
# and writes the post-vcix IR. The tile-operation-graph pass is ported
Expand Down Expand Up @@ -241,8 +254,19 @@ def load(cls, source_code,
# Run cyclesim
cyclesim = CycleSimulator()
cycle_list = cyclesim.compile_and_simulate(os.path.join(write_path, cycle_binary_name), vectorlane_size, silent_mode=silent_mode)
# Snapshot for the P3-trace hook below: generate_tile_graph consumes
# cycle_list in place (cycle_list.pop(0) per tile), leaving it empty.
cycle_list_for_trace = list(cycle_list)

# Create TOG
# DEPRECATED (timing path): this ONNX-TOG producer -- run_tog ->
# tog_generator.generate_tile_graph -> ONNX -> C++ TileGraphParser --
# is being superseded by the C++ trace pipeline (build_skeleton +
# lower_to_emitc -> compiled .so, + the cycle_table sidecar). The
# per-tile cycle_list / x_offset / w_offset computed here are exactly
# what cycle_table.build_cycle_table will reuse, so both paths stay
# cycle-consistent during the transition. Kept live (pipeline must not
# break); to be retired once the trace pipeline (P3+) stabilizes.
w_offset, x_offset = vectorlane_size, vectorlane_size
if kwargs['loop_size'] is not None and kwargs['loop_size'][-3] < vectorlane_size:
x_offset = kwargs['loop_size'][-3]
Expand All @@ -258,6 +282,33 @@ def load(cls, source_code,
w_offset=w_offset, # FIXME.
vector_lane=vectorlane_size
)

# Trace pipeline (DEFAULT): emit the compiled trace producer .so + the
# cycle-table TSV from the post-vcix IR and gem5 cycle_list/offsets. This
# is the default simulation path (the C++ TOG); the legacy ONNX TOG is
# DEPRECATED, an opt-in fallback via TORCHSIM_LEGACY_TOG=1, in which case the
# .so is unused so skip emitting it. Best-effort: never breaks the compile.
if os.environ.get("TORCHSIM_LEGACY_TOG") != "1":
try:
import mlir.ir as ir
from PyTorchSimFrontend.mlir.passes import (
build_skeleton as _bs, cycle_table as _ct, lower_to_emitc as _l2e)
pv = sample_mlir_path + "_postvcix.mlir"
_ctx = ir.Context(); _ctx.allow_unregistered_dialects = True
with _ctx:
_mod = ir.Module.parse(open(pv).read(), _ctx)
_bs.build_skeleton(_mod)
_ntiles = len(_ct._compute_types(_mod))
# align lengths: gem5 gives one numCycles per compute node;
# pad with the last value / truncate if it disagrees.
_cl = list(cycle_list_for_trace)
if _cl and len(_cl) != _ntiles:
_cl = (_cl + [_cl[-1]] * _ntiles)[:_ntiles]
_tbl = _ct.build_cycle_table(_mod, _cl, x_offset, w_offset)
_ct.dump_cycle_table_tsv(_tbl, os.path.join(write_path, "trace_cycles.tsv"))
_l2e.build_trace_so(pv, os.path.join(write_path, "trace.so"))
except Exception as e:
logger.warning(f"[P3-trace] trace .so/sidecar dump skipped: {e}")
return key

class CustomAsyncCompile(AsyncCompile):
Expand Down
7 changes: 5 additions & 2 deletions PyTorchSimFrontend/mlir/mlir_autotune.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def __str__(self) -> str:
def make_run_fn(
self, input_tensors: torch.Tensor, output_tensors: torch.Tensor
) -> Callable[[], None]:
from PyTorchSimFrontend.extension_codecache import CustomAsyncCompile
from PyTorchSimFrontend.extension_codecache import CustomAsyncCompile, get_header
custom_async_compile = CustomAsyncCompile()

# Check already cached result.
Expand All @@ -80,12 +80,15 @@ def cached_run_fn(*args, autotune_subprocess_timeout_sec=None, **kwargs):
return cached_run_fn

# Run a candidate code
_headers = get_header(self.source_code)
_header_kwargs = {} if _headers is None else {
"global_var_header": _headers[0], "gem5_global_var_header": _headers[1]}
run_method = custom_async_compile.mlir(
self.source_code, vectorlane_size=self.extra_args["vector_lane"],
loop_size=self.extra_args["loop_size"], spad_info=self.extra_args["spad_info"],
vlen=self.extra_args["vlen"], arg_attributes=self.extra_args["arg_attributes"],
origins=self.extra_args["origins"], silent_mode=True,
autotune=self.extra_args['autotune'])
autotune=self.extra_args['autotune'], **_header_kwargs)

args = [
tensor
Expand Down
Loading