From 7a30eb2f423ca1967eff7a275c9f7fb122105d2d Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang
Date: Thu, 25 Jun 2026 23:43:10 +0900
Subject: [PATCH] [Frontend] Make the C++ trace the sole main TOG path; drop legacy ONNX TOG

The main compile/sim path no longer generates or selects the legacy ONNX
Tile-Operation-Graph. extension_codecache emits only trace.so +
trace_cycles.tsv (the build-skip now keys on trace.so), and
TOGSimulator.run_standalone always drives TOGSim with --trace_so. The
TORCHSIM_LEGACY_TOG opt-in is removed from the frontend. The ONNX
--models_list branch is kept solely for the STONNE sparse path
(extension_op.py); TOGSim's C++ ONNX parser is untouched (separate PR).

origins (which FX nodes a kernel came from) is preserved: logged per
kernel run and recorded as a trailing "# origins:" line in
trace_cycles.tsv -- the legacy ONNX TOG carried this as node metadata,
and the C++ cycle-table loader stops at the comment so the current
parser is unaffected.

Also drop the dead tog_file param from mlir_gem5_compile_command,
migrate scripts/chiplet.sh to --trace_so/--cycle_table (the trace path
stubs per-tensor addresses and --attributes_list is no longer a
Simulator option), and refresh the CLAUDE.md TOG-generation notes.
--- CLAUDE.md | 4 +- PyTorchSimFrontend/extension_codecache.py | 86 +++++++------------ PyTorchSimFrontend/mlir/passes/cycle_table.py | 12 ++- Simulator/simulator.py | 28 +++--- scripts/chiplet.sh | 13 +-- 5 files changed, 66 insertions(+), 77 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 1a042cce..5975ee60 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -21,7 +21,7 @@ The pipeline runs in that order on every `torch.compile` invocation; you'll see | `Simulator/simulator.py` | Python drivers: `FunctionalSimulator` (Spike), `CycleSimulator` (Gem5), `TOGSimulator` (the cycle-accurate one + multi-tenant context manager) | | `Scheduler/scheduler.py` | Poisson arrival generator + scheduling utilities for multi-tenant runs | | `TOGSim/` | C++ TOGSim source. `src/Simulator.cc`, `Core.cc`, `Dram.cc`, `Interconnect.cc`, `L2Cache.cc`, `Tile.cc`, `TileGraph.cc` are the core models. Externals: ramulator2, booksim, stonneCore, onnx, protobuf, spdlog, yaml-cpp | -| `AsmParser/` | `tog_generator.py`, `onnx_utility.py` — TOG generation from ONNX/ASM | +| `AsmParser/` | `tog_generator.py`, `onnx_utility.py` — legacy ONNX TOG generation; now used only by the STONNE sparse path (the main path emits a C++ `trace.so` instead) | | `configs/` | TOGSim hardware configs (YAML). The default is `systolic_ws_128x128_c1_simple_noc_tpuv3.yml`. Naming pattern: `systolic_ws__c__.yml` | | `tests/` | Op- and model-level tests organized under `ops//` (elementwise, reduce, gemm, conv, attention, view, sort, sparsity, misc, fusion), `models//` (Llama, Mixtral8x7B, DeepSeek, Diffusion, MoE, MLP, MobileNet, Yolov5) plus single-file model tests (test_resnet, test_transformer, test_vit, test_mlp, test_single_perceptron), and `system/` (scheduler, eager, hetro, stonne, vectorops). 
Shared helper: `tests/_utils.py` | | `experiments/artifact/` | Paper reproduction scripts (`cycle_validation/run_cycle.sh`, `speedup/run_speedup.sh`) | @@ -130,7 +130,7 @@ Conan deps for TOGSim: `boost/1.79.0`, `robin-hood-hashing/3.11.5`, `spdlog/1.11 - **Adding a new op (Inductor lowering):** `PyTorchSimFrontend/mlir/mlir_ops.py`, `mlir_lowering.py`, plus a new `mlir__template.py` if it needs its own MLIR template. Decomposition rules: `mlir_decomposition.py`. Scheduling: `mlir_scheduling.py`. Autotune: `mlir_autotune.py`. - **Adding a PyTorch device op:** `PyTorchSimDevice/csrc/aten/native/*` (Minimal/Extra split mirrors `torch_openreg`). - **TOGSim hardware model changes:** `TOGSim/src/{Core,Dram,Interconnect,L2Cache,Tile,TileGraph}.cc` + matching `include/*.h`. -- **TOG generation:** `AsmParser/tog_generator.py` builds the raw graph and serializes it via `AsmParser/onnx_utility.py` to **ONNX, which is the on-disk TOG format** consumed by TOGSim. +- **TOG generation:** the main path compiles each kernel to a C++ **`trace.so`** (`mlir/passes/build_skeleton.py` + `lower_to_emitc.py`) plus a `trace_cycles.tsv` cycle table, which TOGSim turns into a TileGraph via `trace_to_tilegraph`. `AsmParser/tog_generator.py` + `onnx_utility.py` (the legacy ONNX TOG) remain only for the **STONNE sparse path** (`extension_op.py`). - **Eager fallback registration:** `torch.npu.register_eager_to_compile([...])` — see `tests/system/test_eager.py`. - **Per-run results:** `togsim_results/>.log` (stats) and `.trace` (instruction trace). The path is also printed at the end of every run. - **Wrapper codegen path:** printed as `Wrapper Codegen Path = /tmp/torchinductor_//...py` — useful for inspecting generated kernel code and tensor names for `SRAM_BUFFER_PLAN_PATH`. 
diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index 89bd7c89..5e6857ee 100644 --- a/PyTorchSimFrontend/extension_codecache.py +++ b/PyTorchSimFrontend/extension_codecache.py @@ -7,7 +7,6 @@ from PyTorchSimFrontend import extension_config from torch._inductor.codecache import get_hash, write, write_atomic from torch._inductor.async_compile import AsyncCompile -from AsmParser.tog_generator import tog_generator from PyTorchSimFrontend.mlir.mlir_caller_codegen import MLIRKernelCallerCodeGen from Simulator.simulator import FunctionalSimulator, CycleSimulator, TOGSimulator @@ -81,7 +80,7 @@ def mlir_compile_command(filename, vectorlane_size, vlen=256): """, ).strip()] -def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_size, vlen=256): +def mlir_gem5_compile_command(filename, sample_filename, vectorlane_size, vlen=256): # See mlir_compile_command: -dma-fine-grained and -test-pytorchsim-to-vcix are # Python passes run in-process; mlir-opt runs only loop-padding here. return [re.sub(r"[ \n]+", " ", @@ -146,17 +145,17 @@ def load(cls, source_code, # it back (mlir-opt). Two compiles of the same source -- the autotune's chosen # candidate and the final kernel -- share a write_path, so hold the per-path # lock across the whole build to keep them from interleaving, and skip the - # rebuild when a prior build already finished (its tile_graph.onnx exists). + # rebuild when a prior build already finished (its trace.so exists). 
from filelock import FileLock from PyTorchSimFrontend.mlir.passes import ( run_python_passes, run_module_passes, POST_OPT_PASSES, run_standard_lowering, run_tog, ) - tog_path = os.path.join(write_path, "tile_graph.onnx") + trace_so_path = os.path.join(write_path, "trace.so") lock = FileLock(get_lock_path(write_path), timeout=LOCK_TIMEOUT) with lock: key, input_path = write(source_code, "mlir", specified_dir=write_path) - if os.path.isfile(tog_path): + if os.path.isfile(trace_so_path): return key # Run the Python out-of-line MLIR passes (MLIR bindings) on the kernel # .mlir in place, before mlir-opt. Currently lowers torchsim.vlane_idx @@ -166,7 +165,7 @@ def load(cls, source_code, raw_tog_path = new_input_path + "_tog.py" sample_mlir_path = new_input_path + "_sample" validation_binary_path = os.path.join(write_path, validation_binary_name) - gem5_cmds = mlir_gem5_compile_command(new_input_path, sample_mlir_path, raw_tog_path, vectorlane_size) + gem5_cmds = mlir_gem5_compile_command(new_input_path, sample_mlir_path, vectorlane_size) if spad_info is not None: link_option = f"-Wl,--section-start=.spad=0x{spad_info['spad_vaddr']:x}" @@ -260,61 +259,39 @@ def load(cls, source_code, # Run cyclesim cyclesim = CycleSimulator() cycle_list = cyclesim.compile_and_simulate(os.path.join(write_path, cycle_binary_name), vectorlane_size, silent_mode=silent_mode) - # Snapshot for the P3-trace hook below: generate_tile_graph consumes - # cycle_list in place (cycle_list.pop(0) per tile), leaving it empty. cycle_list_for_trace = list(cycle_list) - # Create TOG - # DEPRECATED (timing path): this ONNX-TOG producer -- run_tog -> - # tog_generator.generate_tile_graph -> ONNX -> C++ TileGraphParser -- - # is being superseded by the C++ trace pipeline (build_skeleton + - # lower_to_emitc -> compiled .so, + the cycle_table sidecar). 
The - # per-tile cycle_list / x_offset / w_offset computed here are exactly - # what cycle_table.build_cycle_table will reuse, so both paths stay - # cycle-consistent during the transition. Kept live (pipeline must not - # break); to be retired once the trace pipeline (P3+) stabilizes. + # Per-tile cycle offsets, shared with the trace cycle-table below. w_offset, x_offset = vectorlane_size, vectorlane_size if kwargs['loop_size'] is not None and kwargs['loop_size'][-3] < vectorlane_size: x_offset = kwargs['loop_size'][-3] if kwargs['loop_size'] is not None and kwargs['loop_size'][-1] < vectorlane_size: w_offset = kwargs['loop_size'][-1] w_offset = 0 # max(w_offset - x_offset, 0) - tile_graph_generator = tog_generator(origins) - tile_graph_generator.load_file(raw_tog_path) - tile_graph_generator.generate_tile_graph( - tog_path, - cycle_list=cycle_list, - x_offset=x_offset, # FIXME. - w_offset=w_offset, # FIXME. - vector_lane=vectorlane_size - ) - - # Trace pipeline (DEFAULT): emit the compiled trace producer .so + the - # cycle-table TSV from the post-vcix IR and gem5 cycle_list/offsets. This - # is the default simulation path (the C++ TOG); the legacy ONNX TOG is - # DEPRECATED, an opt-in fallback via TORCHSIM_LEGACY_TOG=1, in which case the - # .so is unused so skip emitting it. Best-effort: never breaks the compile. - if os.environ.get("TORCHSIM_LEGACY_TOG") != "1": - try: - import mlir.ir as ir - from PyTorchSimFrontend.mlir.passes import ( - build_skeleton as _bs, cycle_table as _ct, lower_to_emitc as _l2e) - pv = sample_mlir_path + "_postvcix.mlir" - _ctx = ir.Context(); _ctx.allow_unregistered_dialects = True - with _ctx: - _mod = ir.Module.parse(open(pv).read(), _ctx) - _bs.build_skeleton(_mod) - _ntiles = len(_ct._compute_types(_mod)) - # align lengths: gem5 gives one numCycles per compute node; - # pad with the last value / truncate if it disagrees. 
- _cl = list(cycle_list_for_trace) - if _cl and len(_cl) != _ntiles: - _cl = (_cl + [_cl[-1]] * _ntiles)[:_ntiles] - _tbl = _ct.build_cycle_table(_mod, _cl, x_offset, w_offset) - _ct.dump_cycle_table_tsv(_tbl, os.path.join(write_path, "trace_cycles.tsv")) - _l2e.build_trace_so(pv, os.path.join(write_path, "trace.so")) - except Exception as e: - logger.warning(f"[P3-trace] trace .so/sidecar dump skipped: {e}") + + # Trace pipeline (sole sim path): emit the compiled trace producer .so + + # the cycle-table TSV from the post-vcix IR and gem5 cycle_list/offsets; + # TOGSim builds its C++ TOG from this via trace_to_tilegraph. + try: + import mlir.ir as ir + from PyTorchSimFrontend.mlir.passes import ( + build_skeleton as _bs, cycle_table as _ct, lower_to_emitc as _l2e) + pv = sample_mlir_path + "_postvcix.mlir" + _ctx = ir.Context(); _ctx.allow_unregistered_dialects = True + with _ctx: + _mod = ir.Module.parse(open(pv).read(), _ctx) + _bs.build_skeleton(_mod) + _ntiles = len(_ct._compute_types(_mod)) + # align lengths: gem5 gives one numCycles per compute node; + # pad with the last value / truncate if it disagrees. 
+ _cl = list(cycle_list_for_trace) + if _cl and len(_cl) != _ntiles: + _cl = (_cl + [_cl[-1]] * _ntiles)[:_ntiles] + _tbl = _ct.build_cycle_table(_mod, _cl, x_offset, w_offset) + _ct.dump_cycle_table_tsv(_tbl, os.path.join(write_path, "trace_cycles.tsv"), origins=origins) + _l2e.build_trace_so(pv, os.path.join(write_path, "trace.so")) + except Exception as e: + logger.warning(f"[P3-trace] trace .so/sidecar dump skipped: {e}") return key class CustomAsyncCompile(AsyncCompile): @@ -339,6 +316,9 @@ def task(): def run_kernel_simulation(*args, autotune_subprocess_timeout_sec=None, **kwargs): # Wait for compilation key = future.result() + if not autotune and origins: + logger.info("[kernel %s] origins: %s", + hash_prefix(key), ", ".join(sorted(str(o) for o in origins))) from filelock import FileLock result_path = os.path.join(extension_config.get_dump_path(), hash_prefix(key)) lock = FileLock(get_lock_path(result_path), timeout=LOCK_TIMEOUT) diff --git a/PyTorchSimFrontend/mlir/passes/cycle_table.py b/PyTorchSimFrontend/mlir/passes/cycle_table.py index 40dd3459..53682131 100644 --- a/PyTorchSimFrontend/mlir/passes/cycle_table.py +++ b/PyTorchSimFrontend/mlir/passes/cycle_table.py @@ -93,11 +93,19 @@ def load_cycle_table(path): return json.load(fh) -def dump_cycle_table_tsv(table, path): +def dump_cycle_table_tsv(table, path, origins=None): """Plain `cycleoverlapping` per line, in tile_id order -- the trivial format the C++ `--cycle_table` loader (main.cc, P3 trace pipeline) reads with - ifstream (no JSON dependency in TOGSim).""" + ifstream (no JSON dependency in TOGSim). + + `origins` (the FX nodes this kernel came from) is recorded as a trailing + `# origins: ...` comment after the data rows -- the legacy ONNX TOG carried + this as node metadata. 
The C++ loader's `while (ct >> c >> o)` stops at the + `#` once all (cycle, overlapping) rows are read, so the comment is safe with + the current parser; a future TOGSim change can promote it to a real field.""" with open(path, "w") as fh: for cycle, overlapping in table: fh.write("%d\t%d\n" % (int(cycle), int(overlapping))) + if origins: + fh.write("# origins: %s\n" % ", ".join(sorted(str(o) for o in origins))) return path diff --git a/Simulator/simulator.py b/Simulator/simulator.py index a4517285..7378665d 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -319,7 +319,9 @@ def _send_command(self, command_type, device_index, stream_index, tog_path="", a command_type: Type of command ("LAUNCH_KERNEL" or "DEVICE_SYNC") device_index: Device index stream_index: Stream index - tog_path: Path to TOG file (ONNX model) - empty for DEVICE_SYNC + tog_path: kernel-dir handle; TOGSim derives trace.so/trace_cycles.tsv from + its directory (the ONNX file itself is only read on the STONNE sparse + path) - empty for DEVICE_SYNC attribute_path: Path to attribute file - empty for DEVICE_SYNC timestamp: Timestamp in nanoseconds (default: 0) @@ -410,7 +412,8 @@ def launch_kernel(self, device_index, stream_index, tog_path, attribute_path, ti Args: device_index: Device index stream_index: Stream index - tog_path: Path to TOG file (ONNX model) + tog_path: kernel-dir handle; TOGSim derives trace.so from its directory + (the ONNX file itself is only read on the STONNE sparse path) attribute_path: Path to attribute file timestamp: Timestamp in nanoseconds (default: 0) @@ -523,7 +526,8 @@ def run_standalone( For streaming multiple kernels, use launch_kernel() instead. 
Args: - model_path: Path to TOG file (ONNX model) + model_path: kernel-dir handle; trace.so/trace_cycles.tsv are derived from + its directory (the ONNX file itself is only read on the STONNE sparse path) attribute_path: Path to attribute file autotune_mode: If True, run in autotune mode (silent) config_path: Path to TOGSim config file (required) @@ -560,22 +564,16 @@ def run_standalone( os.fsync(trace_file.fileno()) try: - # The C++ TOG (trace) path is the DEFAULT: drive the simulation from the - # emitted trace.so. The legacy ONNX TOG is the opt-in fallback via - # TORCHSIM_LEGACY_TOG=1. Each autotune candidate compiles to its own - # write_path (keyed by its retiled source), so its trace.so/cycle_table sit - # next to its tile_graph.onnx -- benchmark it through the trace path too. - # Fall back to legacy only if the .so was not emitted. + # Drive the simulation from the emitted trace.so (the C++ TOG path). + # The ONNX --models_list path remains only for callers that pass an ONNX + # TOG without a trace.so (the STONNE sparse path); the normal compile + # always emits trace.so next to write_path. 
trace_so = os.path.join(os.path.dirname(str(model_path)), "trace.so") cycle_tsv = os.path.join(os.path.dirname(str(model_path)), "trace_cycles.tsv") base_cmd = TOGSimulator.get_togsim_command(config_path, togsim_path) - use_trace = (os.environ.get("TORCHSIM_LEGACY_TOG") != "1" - and os.path.exists(trace_so)) - if os.environ.get("TORCHSIM_LEGACY_TOG") == "1": - logger.warning("TORCHSIM_LEGACY_TOG=1 selects the DEPRECATED legacy ONNX TOG path") - if use_trace: + if os.path.exists(trace_so): cmd = f"{base_cmd} --trace_so {trace_so} --cycle_table {cycle_tsv}" - else: # DEPRECATED: legacy ONNX TOG path + else: # ONNX TOG path (STONNE sparse) cmd = f"{base_cmd} --models_list {trace_file_path}" if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL: cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}" diff --git a/scripts/chiplet.sh b/scripts/chiplet.sh index e622874b..40aa77c4 100755 --- a/scripts/chiplet.sh +++ b/scripts/chiplet.sh @@ -35,7 +35,12 @@ for ATTRIBUTE in "$@"; do fi ATTRIBUTE_FILES+=("$ATTRIBUTE_FILE") done -MODELS_LIST="$GEMM_PATH/tile_graph.onnx" +# Trace (C++ TOG) path. NOTE: TOGSim currently stubs per-tensor addresses for the +# trace path (build_trace_tilegraph), so chiplet NoC/DRAM-partition accuracy is +# approximate until the trace path consumes real addresses; --attributes_list is +# no longer a Simulator option. 
+TRACE_SO="$GEMM_PATH/trace.so" +CYCLE_TABLE="$GEMM_PATH/trace_cycles.tsv" ATTRIBUTE_PATH="$GEMM_PATH/runtime_0000/attribute" for CONFIG in "${CONFIG_LIST[@]}"; do @@ -49,8 +54,8 @@ for CONFIG in "${CONFIG_LIST[@]}"; do OUTPUT_FILE="$RESULTS_DIR/${CONFIG_NAME}_result.txt" # Run Simulator - echo "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" - "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --log_level trace --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" > "$OUTPUT_FILE" & + echo "$SIMULATOR_PATH" --config "$CONFIG" --trace_so "$TRACE_SO" --cycle_table "$CYCLE_TABLE" + "$SIMULATOR_PATH" --config "$CONFIG" --trace_so "$TRACE_SO" --cycle_table "$CYCLE_TABLE" --log_level trace > "$OUTPUT_FILE" & echo "[TOGSim] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\"" done done @@ -63,8 +68,8 @@ for CONFIG in "${CONFIG_LIST2[@]}"; do OUTPUT_FILE="$RESULTS_DIR/${CONFIG_NAME}_result.txt" # Run Simulator - # echo "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" - "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --log_level trace --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" > "$OUTPUT_FILE" & + # echo "$SIMULATOR_PATH" --config "$CONFIG" --trace_so "$TRACE_SO" --cycle_table "$CYCLE_TABLE" + "$SIMULATOR_PATH" --config "$CONFIG" --trace_so "$TRACE_SO" --cycle_table "$CYCLE_TABLE" --log_level trace > "$OUTPUT_FILE" & echo "[TOGSim] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\"" done wait \ No newline at end of file