From 7a30eb2f423ca1967eff7a275c9f7fb122105d2d Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 25 Jun 2026 23:43:10 +0900
Subject: [PATCH] [Frontend] Make the C++ trace the sole main TOG path; drop
 legacy ONNX TOG

The main compile/sim path no longer generates or selects the legacy ONNX
Tile-Operation-Graph. extension_codecache emits only trace.so + trace_cycles.tsv
(the build-skip now keys on trace.so), and TOGSimulator.run_standalone always
drives TOGSim with --trace_so. The TORCHSIM_LEGACY_TOG opt-in is removed from the
frontend. The ONNX --models_list branch is kept solely for the STONNE sparse path
(extension_op.py); TOGSim's C++ ONNX parser is untouched (separate PR).

Kernel origins (the FX nodes a kernel came from) are preserved: they are logged
on each kernel run and recorded as a trailing "# origins:" line in
trace_cycles.tsv. The legacy ONNX TOG carried this as node metadata; the C++
cycle-table loader stops reading at the comment, so the current parser is
unaffected.

Also drop the dead tog_file param from mlir_gem5_compile_command, migrate
scripts/chiplet.sh to --trace_so/--cycle_table (the trace path stubs per-tensor
addresses and --attributes_list is no longer a Simulator option), and refresh
the CLAUDE.md TOG-generation notes.
---
 CLAUDE.md                                     |  4 +-
 PyTorchSimFrontend/extension_codecache.py     | 86 +++++++------------
 PyTorchSimFrontend/mlir/passes/cycle_table.py | 12 ++-
 Simulator/simulator.py                        | 28 +++---
 scripts/chiplet.sh                            | 13 +--
 5 files changed, 66 insertions(+), 77 deletions(-)
diff --git a/CLAUDE.md b/CLAUDE.md
index 1a042cce..5975ee60 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -21,7 +21,7 @@ The pipeline runs in that order on every `torch.compile` invocation; you'll see
 | `Simulator/simulator.py` | Python drivers: `FunctionalSimulator` (Spike), `CycleSimulator` (Gem5), `TOGSimulator` (the cycle-accurate one + multi-tenant context manager) |
 | `Scheduler/scheduler.py` | Poisson arrival generator + scheduling utilities for multi-tenant runs |
 | `TOGSim/` | C++ TOGSim source. `src/Simulator.cc`, `Core.cc`, `Dram.cc`, `Interconnect.cc`, `L2Cache.cc`, `Tile.cc`, `TileGraph.cc` are the core models. Externals: ramulator2, booksim, stonneCore, onnx, protobuf, spdlog, yaml-cpp |
-| `AsmParser/` | `tog_generator.py`, `onnx_utility.py` — TOG generation from ONNX/ASM |
+| `AsmParser/` | `tog_generator.py`, `onnx_utility.py` — legacy ONNX TOG generation; now used only by the STONNE sparse path (the main path emits a C++ `trace.so` instead) |
 | `configs/` | TOGSim hardware configs (YAML). The default is `systolic_ws_128x128_c1_simple_noc_tpuv3.yml`. Naming pattern: `systolic_ws_<size>_c<cores>_<noc>_<target>.yml` |
 | `tests/` | Op- and model-level tests organized under `ops/<family>/` (elementwise, reduce, gemm, conv, attention, view, sort, sparsity, misc, fusion), `models/<name>/` (Llama, Mixtral8x7B, DeepSeek, Diffusion, MoE, MLP, MobileNet, Yolov5) plus single-file model tests (test_resnet, test_transformer, test_vit, test_mlp, test_single_perceptron), and `system/` (scheduler, eager, hetro, stonne, vectorops). Shared helper: `tests/_utils.py` |
 | `experiments/artifact/` | Paper reproduction scripts (`cycle_validation/run_cycle.sh`, `speedup/run_speedup.sh`) |
@@ -130,7 +130,7 @@ Conan deps for TOGSim: `boost/1.79.0`, `robin-hood-hashing/3.11.5`, `spdlog/1.11
 - **Adding a new op (Inductor lowering):** `PyTorchSimFrontend/mlir/mlir_ops.py`, `mlir_lowering.py`, plus a new `mlir_<op>_template.py` if it needs its own MLIR template. Decomposition rules: `mlir_decomposition.py`. Scheduling: `mlir_scheduling.py`. Autotune: `mlir_autotune.py`.
 - **Adding a PyTorch device op:** `PyTorchSimDevice/csrc/aten/native/*` (Minimal/Extra split mirrors `torch_openreg`).
 - **TOGSim hardware model changes:** `TOGSim/src/{Core,Dram,Interconnect,L2Cache,Tile,TileGraph}.cc` + matching `include/*.h`.
-- **TOG generation:** `AsmParser/tog_generator.py` builds the raw graph and serializes it via `AsmParser/onnx_utility.py` to **ONNX, which is the on-disk TOG format** consumed by TOGSim.
+- **TOG generation:** the main path compiles each kernel to a C++ **`trace.so`** (`mlir/passes/build_skeleton.py` + `lower_to_emitc.py`) plus a `trace_cycles.tsv` cycle table, which TOGSim turns into a TileGraph via `trace_to_tilegraph`. `AsmParser/tog_generator.py` + `onnx_utility.py` (the legacy ONNX TOG) remain only for the **STONNE sparse path** (`extension_op.py`).
 - **Eager fallback registration:** `torch.npu.register_eager_to_compile([...])` — see `tests/system/test_eager.py`.
 - **Per-run results:** `togsim_results/<YYYYMMDD_HHMMSS_<hash>>.log` (stats) and `.trace` (instruction trace). The path is also printed at the end of every run.
 - **Wrapper codegen path:** printed as `Wrapper Codegen Path = /tmp/torchinductor_<user>/<hash>/...py` — useful for inspecting generated kernel code and tensor names for `SRAM_BUFFER_PLAN_PATH`.
diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index 89bd7c89..5e6857ee 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -7,7 +7,6 @@
 from PyTorchSimFrontend import extension_config
 from torch._inductor.codecache import get_hash, write, write_atomic
 from torch._inductor.async_compile import AsyncCompile
-from AsmParser.tog_generator import tog_generator
 from PyTorchSimFrontend.mlir.mlir_caller_codegen import MLIRKernelCallerCodeGen
 from Simulator.simulator import FunctionalSimulator, CycleSimulator, TOGSimulator
 
@@ -81,7 +80,7 @@ def mlir_compile_command(filename, vectorlane_size, vlen=256):
         """,
     ).strip()]
 
-def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_size, vlen=256):
+def mlir_gem5_compile_command(filename, sample_filename, vectorlane_size, vlen=256):
     # See mlir_compile_command: -dma-fine-grained and -test-pytorchsim-to-vcix are
     # Python passes run in-process; mlir-opt runs only loop-padding here.
     return [re.sub(r"[ \n]+", " ",
@@ -146,17 +145,17 @@ def load(cls, source_code,
         # it back (mlir-opt). Two compiles of the same source -- the autotune's chosen
         # candidate and the final kernel -- share a write_path, so hold the per-path
         # lock across the whole build to keep them from interleaving, and skip the
-        # rebuild when a prior build already finished (its tile_graph.onnx exists).
+        # rebuild when a prior build already finished (its trace.so exists).
         from filelock import FileLock
         from PyTorchSimFrontend.mlir.passes import (
             run_python_passes, run_module_passes, POST_OPT_PASSES,
             run_standard_lowering, run_tog,
         )
-        tog_path = os.path.join(write_path, "tile_graph.onnx")
+        trace_so_path = os.path.join(write_path, "trace.so")
         lock = FileLock(get_lock_path(write_path), timeout=LOCK_TIMEOUT)
         with lock:
             key, input_path = write(source_code, "mlir", specified_dir=write_path)
-            if os.path.isfile(tog_path):
+            if os.path.isfile(trace_so_path):
                 return key
             # Run the Python out-of-line MLIR passes (MLIR bindings) on the kernel
             # .mlir in place, before mlir-opt. Currently lowers torchsim.vlane_idx
@@ -166,7 +165,7 @@ def load(cls, source_code,
             raw_tog_path = new_input_path + "_tog.py"
             sample_mlir_path = new_input_path + "_sample"
             validation_binary_path = os.path.join(write_path, validation_binary_name)
-            gem5_cmds = mlir_gem5_compile_command(new_input_path, sample_mlir_path, raw_tog_path, vectorlane_size)
+            gem5_cmds = mlir_gem5_compile_command(new_input_path, sample_mlir_path, vectorlane_size)
 
             if spad_info is not None:
                 link_option = f"-Wl,--section-start=.spad=0x{spad_info['spad_vaddr']:x}"
@@ -260,61 +259,39 @@ def load(cls, source_code,
             # Run cyclesim
             cyclesim = CycleSimulator()
             cycle_list = cyclesim.compile_and_simulate(os.path.join(write_path, cycle_binary_name), vectorlane_size, silent_mode=silent_mode)
-            # Snapshot for the P3-trace hook below: generate_tile_graph consumes
-            # cycle_list in place (cycle_list.pop(0) per tile), leaving it empty.
             cycle_list_for_trace = list(cycle_list)
 
-            # Create TOG
-            # DEPRECATED (timing path): this ONNX-TOG producer -- run_tog ->
-            # tog_generator.generate_tile_graph -> ONNX -> C++ TileGraphParser --
-            # is being superseded by the C++ trace pipeline (build_skeleton +
-            # lower_to_emitc -> compiled .so, + the cycle_table sidecar). The
-            # per-tile cycle_list / x_offset / w_offset computed here are exactly
-            # what cycle_table.build_cycle_table will reuse, so both paths stay
-            # cycle-consistent during the transition. Kept live (pipeline must not
-            # break); to be retired once the trace pipeline (P3+) stabilizes.
+            # Per-tile cycle offsets, shared with the trace cycle-table below.
             w_offset, x_offset = vectorlane_size, vectorlane_size
             if kwargs['loop_size'] is not None and kwargs['loop_size'][-3] < vectorlane_size:
                 x_offset = kwargs['loop_size'][-3]
             if kwargs['loop_size'] is not None and kwargs['loop_size'][-1] < vectorlane_size:
                 w_offset = kwargs['loop_size'][-1]
             w_offset = 0 # max(w_offset - x_offset, 0)
-            tile_graph_generator = tog_generator(origins)
-            tile_graph_generator.load_file(raw_tog_path)
-            tile_graph_generator.generate_tile_graph(
-                tog_path,
-                cycle_list=cycle_list,
-                x_offset=x_offset, # FIXME.
-                w_offset=w_offset, # FIXME.
-                vector_lane=vectorlane_size
-            )
-
-            # Trace pipeline (DEFAULT): emit the compiled trace producer .so + the
-            # cycle-table TSV from the post-vcix IR and gem5 cycle_list/offsets. This
-            # is the default simulation path (the C++ TOG); the legacy ONNX TOG is
-            # DEPRECATED, an opt-in fallback via TORCHSIM_LEGACY_TOG=1, in which case the
-            # .so is unused so skip emitting it. Best-effort: never breaks the compile.
-            if os.environ.get("TORCHSIM_LEGACY_TOG") != "1":
-                try:
-                    import mlir.ir as ir
-                    from PyTorchSimFrontend.mlir.passes import (
-                        build_skeleton as _bs, cycle_table as _ct, lower_to_emitc as _l2e)
-                    pv = sample_mlir_path + "_postvcix.mlir"
-                    _ctx = ir.Context(); _ctx.allow_unregistered_dialects = True
-                    with _ctx:
-                        _mod = ir.Module.parse(open(pv).read(), _ctx)
-                        _bs.build_skeleton(_mod)
-                        _ntiles = len(_ct._compute_types(_mod))
-                        # align lengths: gem5 gives one numCycles per compute node;
-                        # pad with the last value / truncate if it disagrees.
-                        _cl = list(cycle_list_for_trace)
-                        if _cl and len(_cl) != _ntiles:
-                            _cl = (_cl + [_cl[-1]] * _ntiles)[:_ntiles]
-                        _tbl = _ct.build_cycle_table(_mod, _cl, x_offset, w_offset)
-                    _ct.dump_cycle_table_tsv(_tbl, os.path.join(write_path, "trace_cycles.tsv"))
-                    _l2e.build_trace_so(pv, os.path.join(write_path, "trace.so"))
-                except Exception as e:
-                    logger.warning(f"[P3-trace] trace .so/sidecar dump skipped: {e}")
+
+            # Trace pipeline (sole sim path): emit the compiled trace producer .so +
+            # the cycle-table TSV from the post-vcix IR and gem5 cycle_list/offsets;
+            # TOGSim builds its C++ TOG from this via trace_to_tilegraph.
+            try:
+                import mlir.ir as ir
+                from PyTorchSimFrontend.mlir.passes import (
+                    build_skeleton as _bs, cycle_table as _ct, lower_to_emitc as _l2e)
+                pv = sample_mlir_path + "_postvcix.mlir"
+                _ctx = ir.Context(); _ctx.allow_unregistered_dialects = True
+                with _ctx:
+                    _mod = ir.Module.parse(open(pv).read(), _ctx)
+                    _bs.build_skeleton(_mod)
+                    _ntiles = len(_ct._compute_types(_mod))
+                    # align lengths: gem5 gives one numCycles per compute node;
+                    # pad with the last value / truncate if it disagrees.
+                    _cl = list(cycle_list_for_trace)
+                    if _cl and len(_cl) != _ntiles:
+                        _cl = (_cl + [_cl[-1]] * _ntiles)[:_ntiles]
+                    _tbl = _ct.build_cycle_table(_mod, _cl, x_offset, w_offset)
+                _ct.dump_cycle_table_tsv(_tbl, os.path.join(write_path, "trace_cycles.tsv"), origins=origins)
+                _l2e.build_trace_so(pv, os.path.join(write_path, "trace.so"))
+            except Exception as e:
+                logger.warning(f"[P3-trace] trace .so/sidecar dump skipped: {e}")
         return key
 
 class CustomAsyncCompile(AsyncCompile):
@@ -339,6 +316,9 @@ def task():
         def run_kernel_simulation(*args, autotune_subprocess_timeout_sec=None, **kwargs):
             # Wait for compilation
             key = future.result()
+            if not autotune and origins:
+                logger.info("[kernel %s] origins: %s",
+                            hash_prefix(key), ", ".join(sorted(str(o) for o in origins)))
             from filelock import FileLock
             result_path = os.path.join(extension_config.get_dump_path(), hash_prefix(key))
             lock = FileLock(get_lock_path(result_path), timeout=LOCK_TIMEOUT)
diff --git a/PyTorchSimFrontend/mlir/passes/cycle_table.py b/PyTorchSimFrontend/mlir/passes/cycle_table.py
index 40dd3459..53682131 100644
--- a/PyTorchSimFrontend/mlir/passes/cycle_table.py
+++ b/PyTorchSimFrontend/mlir/passes/cycle_table.py
@@ -93,11 +93,19 @@ def load_cycle_table(path):
         return json.load(fh)
 
 
-def dump_cycle_table_tsv(table, path):
+def dump_cycle_table_tsv(table, path, origins=None):
     """Plain `cycle<TAB>overlapping` per line, in tile_id order -- the trivial
     format the C++ `--cycle_table` loader (main.cc, P3 trace pipeline) reads with
-    ifstream (no JSON dependency in TOGSim)."""
+    ifstream (no JSON dependency in TOGSim).
+
+    `origins` (the FX nodes this kernel came from) is recorded as a trailing
+    `# origins: ...` comment after the data rows -- the legacy ONNX TOG carried
+    this as node metadata. The C++ loader's `while (ct >> c >> o)` stops at the
+    `#` once all (cycle, overlapping) rows are read, so the comment is safe with
+    the current parser; a future TOGSim change can promote it to a real field."""
     with open(path, "w") as fh:
         for cycle, overlapping in table:
             fh.write("%d\t%d\n" % (int(cycle), int(overlapping)))
+        if origins:
+            fh.write("# origins: %s\n" % ", ".join(sorted(str(o) for o in origins)))
     return path
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index a4517285..7378665d 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -319,7 +319,9 @@ def _send_command(self, command_type, device_index, stream_index, tog_path="", a
             command_type: Type of command ("LAUNCH_KERNEL" or "DEVICE_SYNC")
             device_index: Device index
             stream_index: Stream index
-            tog_path: Path to TOG file (ONNX model) - empty for DEVICE_SYNC
+            tog_path: kernel-dir handle; TOGSim derives trace.so/trace_cycles.tsv from
+                its directory (the ONNX file itself is only read on the STONNE sparse
+                path) - empty for DEVICE_SYNC
             attribute_path: Path to attribute file - empty for DEVICE_SYNC
             timestamp: Timestamp in nanoseconds (default: 0)
 
@@ -410,7 +412,8 @@ def launch_kernel(self, device_index, stream_index, tog_path, attribute_path, ti
         Args:
             device_index: Device index
             stream_index: Stream index
-            tog_path: Path to TOG file (ONNX model)
+            tog_path: kernel-dir handle; TOGSim derives trace.so from its directory
+                (the ONNX file itself is only read on the STONNE sparse path)
             attribute_path: Path to attribute file
             timestamp: Timestamp in nanoseconds (default: 0)
 
@@ -523,7 +526,8 @@ def run_standalone(
         For streaming multiple kernels, use launch_kernel() instead.
 
         Args:
-            model_path: Path to TOG file (ONNX model)
+            model_path: kernel-dir handle; trace.so/trace_cycles.tsv are derived from
+                its directory (the ONNX file itself is only read on the STONNE sparse path)
             attribute_path: Path to attribute file
             autotune_mode: If True, run in autotune mode (silent)
             config_path: Path to TOGSim config file (required)
@@ -560,22 +564,16 @@ def run_standalone(
             os.fsync(trace_file.fileno())
 
         try:
-            # The C++ TOG (trace) path is the DEFAULT: drive the simulation from the
-            # emitted trace.so. The legacy ONNX TOG is the opt-in fallback via
-            # TORCHSIM_LEGACY_TOG=1. Each autotune candidate compiles to its own
-            # write_path (keyed by its retiled source), so its trace.so/cycle_table sit
-            # next to its tile_graph.onnx -- benchmark it through the trace path too.
-            # Fall back to legacy only if the .so was not emitted.
+            # Drive the simulation from the emitted trace.so (the C++ TOG path).
+            # The ONNX --models_list path remains only for callers that pass an ONNX
+            # TOG without a trace.so (the STONNE sparse path); the normal compile
+            # always emits trace.so next to write_path.
             trace_so = os.path.join(os.path.dirname(str(model_path)), "trace.so")
             cycle_tsv = os.path.join(os.path.dirname(str(model_path)), "trace_cycles.tsv")
             base_cmd = TOGSimulator.get_togsim_command(config_path, togsim_path)
-            use_trace = (os.environ.get("TORCHSIM_LEGACY_TOG") != "1"
-                         and os.path.exists(trace_so))
-            if os.environ.get("TORCHSIM_LEGACY_TOG") == "1":
-                logger.warning("TORCHSIM_LEGACY_TOG=1 selects the DEPRECATED legacy ONNX TOG path")
-            if use_trace:
+            if os.path.exists(trace_so):
                 cmd = f"{base_cmd} --trace_so {trace_so} --cycle_table {cycle_tsv}"
-            else:  # DEPRECATED: legacy ONNX TOG path
+            else:  # ONNX TOG path (STONNE sparse)
                 cmd = f"{base_cmd} --models_list {trace_file_path}"
             if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL:
                 cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}"
diff --git a/scripts/chiplet.sh b/scripts/chiplet.sh
index e622874b..40aa77c4 100755
--- a/scripts/chiplet.sh
+++ b/scripts/chiplet.sh
@@ -35,7 +35,12 @@ for ATTRIBUTE in "$@"; do
     fi
     ATTRIBUTE_FILES+=("$ATTRIBUTE_FILE")
 done
-MODELS_LIST="$GEMM_PATH/tile_graph.onnx"
+# Trace (C++ TOG) path. NOTE: TOGSim currently stubs per-tensor addresses for the
+# trace path (build_trace_tilegraph), so chiplet NoC/DRAM-partition accuracy is
+# approximate until the trace path consumes real addresses; --attributes_list is
+# no longer a Simulator option.
+TRACE_SO="$GEMM_PATH/trace.so"
+CYCLE_TABLE="$GEMM_PATH/trace_cycles.tsv"
 ATTRIBUTE_PATH="$GEMM_PATH/runtime_0000/attribute"
 
 for CONFIG in "${CONFIG_LIST[@]}"; do
@@ -49,8 +54,8 @@ for CONFIG in "${CONFIG_LIST[@]}"; do
         OUTPUT_FILE="$RESULTS_DIR/${CONFIG_NAME}_result.txt"
 
         # Run Simulator
-        echo "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME"
-        "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --log_level trace --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" > "$OUTPUT_FILE" &
+        echo "$SIMULATOR_PATH" --config "$CONFIG" --trace_so "$TRACE_SO" --cycle_table "$CYCLE_TABLE"
+        "$SIMULATOR_PATH" --config "$CONFIG" --trace_so "$TRACE_SO" --cycle_table "$CYCLE_TABLE" --log_level trace > "$OUTPUT_FILE" &
         echo "[TOGSim] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\""
     done
 done
@@ -63,8 +67,8 @@ for CONFIG in "${CONFIG_LIST2[@]}"; do
     OUTPUT_FILE="$RESULTS_DIR/${CONFIG_NAME}_result.txt"
 
     # Run Simulator
-    # echo "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME"
-    "$SIMULATOR_PATH" --config "$CONFIG" --models_list "$MODELS_LIST" --log_level trace --attributes_list "$ATTRIBUTE_PATH/$ATTRIBUTE_NAME" > "$OUTPUT_FILE" &
+    # echo "$SIMULATOR_PATH" --config "$CONFIG" --trace_so "$TRACE_SO" --cycle_table "$CYCLE_TABLE"
+    "$SIMULATOR_PATH" --config "$CONFIG" --trace_so "$TRACE_SO" --cycle_table "$CYCLE_TABLE" --log_level trace > "$OUTPUT_FILE" &
     echo "[TOGSim] for $CONFIG stored to \"$(pwd)/$OUTPUT_FILE\""
 done
 wait
\ No newline at end of file