Skip to content

Commit 90848a0

Browse files
committed
added sgemm_tcu roofline perf plotting
1 parent 4c8deaf commit 90848a0

3 files changed

Lines changed: 66 additions & 24 deletions

File tree

AGENTS.md

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ Wait until build completes before running anything else in parallel terminals.
6161
CONFIGS="-DEXT_TCU_ENABLE" ./ci/blackbox.sh --driver=simx --app=sgemm_tcu --threads=8
6262
```
6363

64-
- `make tests` and `make -C tests/regression` build test binaries using their default macros. If you intend to run a test with non-default `NUM_THREADS`, data types, or feature flags, rebuild that specific test explicitly before invoking it.
64+
- `make tests` and `make -C tests/regression` build test binaries using their default macros. If you intend to run a test with a non-default thread count, data types, or feature flags, pass the overrides via `CONFIGS` when invoking `blackbox.sh`.
6565

6666
## Testing & Debugging
6767

@@ -94,12 +94,18 @@ make -C tests/regression/<test-name>
9494
./ci/blackbox.sh --driver=rtlsim --app=<test-name> --debug=1 --log=run.log
9595
```
9696

97-
When using non-default compile-time macros, split the flow into an explicit rebuild step and a run step:
97+
When using non-default compile-time macros, pass them directly via `CONFIGS` in the same command that runs the test:
9898

9999
```bash
100-
make -C tests/regression/<test-name> clean
101-
CONFIGS="-DNUM_THREADS=8 -DITYPE=fp16 -DOTYPE=fp32 -DEXT_TCU_ENABLE" make -C tests/regression/<test-name>
102-
CONFIGS="-DEXT_TCU_ENABLE" ./ci/blackbox.sh --driver=simx --app=<test-name> --threads=8 --args="..."
100+
CONFIGS="-DNUM_THREADS=8 -DITYPE=fp16 -DOTYPE=fp32 -DEXT_TCU_ENABLE" \
101+
./ci/blackbox.sh --driver=simx --app=<test-name> --threads=8 --args="..."
102+
```
103+
104+
### Roofline Perf Plot
105+
106+
Example: run the `sgemm_tcu` test with perf collection and generate a roofline plot (peak vs. actual FLOPS, compute vs. memory bandwidth):
107+
```bash
108+
/usr/bin/python3 ../perf/roofline.py --app=sgemm_tcu --driver=simx --cores=1 --warps=4 --threads=8 --issue-width=2 --n=32 --perf=1 --by-cycle --output=sgemm_tcu_roofline.png
103109
```
104110

105111
For multi-suite coverage, `ci/regression.sh` is the canonical source of tested configurations. Use it to discover supported parameter combinations before inventing ad hoc ones.
@@ -133,8 +139,8 @@ For parameters not exposed as explicit flags, use `CONFIGS` with `-D...` overrid
133139

134140
Example: To enable TCU, select FEDP backend, and set I/O data types:
135141
```bash
136-
make -C tests/regression/sgemm_tcu clean && CONFIGS="-DNUM_THREADS=4 -DITYPE=bf16 -DOTYPE=fp32" make -C tests/regression/sgemm_tcu
137-
CONFIGS="-DNUM_THREADS=4 -DEXT_TCU_ENABLE -DTCU_TYPE_TFR" ./ci/blackbox.sh --driver=rtlsim --app=sgemm_tcu
142+
CONFIGS="-DNUM_THREADS=4 -DEXT_TCU_ENABLE -DTCU_TYPE_TFR -DITYPE=bf16 -DOTYPE=fp32" \
143+
./ci/blackbox.sh --driver=rtlsim --app=sgemm_tcu --threads=4
138144
```
139145

140146
### Editing default config files

docs/install_vortex.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,12 @@
99
sudo apt-get install build-essential cmake ccache zlib1g-dev libtinfo-dev libncurses-dev uuid-dev libboost-serialization-dev libpng-dev libhwloc-dev
1010
```
1111

12+
(Optional) for roofline/perf plotting:
13+
14+
```
15+
sudo apt-get install python3-numpy python3-matplotlib
16+
```
17+
1218
2. Download the Vortex codebase:
1319

1420
```
@@ -42,6 +48,12 @@ Note: depending on the system, some of the toolchain may need to be recompiled f
4248
sudo yum install libpng-devel boost boost-devel boost-serialization libuuid-devel opencl-headers hwloc hwloc-devel gmp-devel compat-hwloc1
4349
```
4450

51+
(Optional) for roofline/perf plotting:
52+
53+
```
54+
sudo yum install python3-numpy python3-matplotlib
55+
```
56+
4557
2. Upgrade GCC to 11:
4658

4759
```

perf/roofline.py

Lines changed: 41 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,16 @@
1313
# limitations under the License.
1414
#
1515
# Vortex SGEMM Roofline Plotter
16-
# Runs the regression/sgemmx kernel via ci/blackbox.sh and plots the roofline.
16+
# Runs a SGEMM-style regression kernel via ci/blackbox.sh and plots roofline metrics.
1717
#
1818
# Usage (from build dir):
19-
# python3 tests/regression/sgemmx/roofline.py [--driver=rtlsim] [--cores=1]
19+
# python3 perf/roofline.py [--app=sgemmx] [--driver=rtlsim] [--cores=1]
2020
# [--warps=4] [--threads=4] [--n=128]
2121
# [--freq=<auto>] [--bw=51.2] [--perf=1]
2222
# [--output=roofline.png]
2323
#
2424
# Usage (from source tree, targeting a specific build dir):
25-
# python3 tests/regression/sgemmx/roofline.py --build-dir=build_test32 ...
25+
# python3 perf/roofline.py --build-dir=build_test32 ...
2626

2727
import argparse
2828
import os
@@ -64,6 +64,9 @@ def parse_args():
6464
p.add_argument("--driver", default="rtlsim",
6565
choices=["rtlsim", "simx", "opae", "xrt"],
6666
help="Vortex driver")
67+
p.add_argument("--app", default="sgemmx",
68+
choices=["sgemmx", "sgemm_tcu"],
69+
help="Regression app to run")
6770
p.add_argument("--cores", type=int, default=1,
6871
help="Number of cores (NUM_CORES)")
6972
p.add_argument("--warps", type=int, default=4,
@@ -74,6 +77,10 @@ def parse_args():
7477
help="Issue width (ISSUE_WIDTH)")
7578
p.add_argument("--n", type=int, default=32,
7679
help="Square matrix dimension N (SGEMM computes N×N × N×N)")
80+
p.add_argument("--m", type=int, default=None,
81+
help="M dimension for non-square SGEMM (default: use --n)")
82+
p.add_argument("--k", type=int, default=None,
83+
help="K dimension for non-square SGEMM (default: use --n)")
7784
p.add_argument("--freq", type=float, default=0,
7885
help="Pipeline clock frequency in MHz "
7986
"(0 = per-cycle mode, implies --by-cycle; "
@@ -129,15 +136,28 @@ def find_blackbox(args):
129136
return bb, bdir
130137

131138

132-
def run_sgemmx_capture(args):
133-
"""Run sgemmx via blackbox.sh and return captured stdout+stderr."""
139+
def _matrix_dims(args):
140+
m_dim = args.m if args.m is not None else args.n
141+
n_dim = args.n
142+
k_dim = args.k if args.k is not None else args.n
143+
return m_dim, n_dim, k_dim
144+
145+
146+
def run_app_capture(args):
147+
"""Run the selected app via blackbox.sh and return captured stdout+stderr."""
134148
blackbox, cwd = find_blackbox(args)
135149

150+
m_dim, n_dim, k_dim = _matrix_dims(args)
151+
if args.app == "sgemm_tcu":
152+
app_args = f"-m{m_dim} -n{n_dim} -k{k_dim}"
153+
else:
154+
app_args = f"-n{n_dim}"
155+
136156
cmd = [
137157
blackbox,
138158
f"--driver={args.driver}",
139-
"--app=sgemmx",
140-
f"--args=-n{args.n}",
159+
f"--app={args.app}",
160+
f"--args={app_args}",
141161
]
142162
if args.perf:
143163
cmd.append(f"--perf={args.perf}")
@@ -155,6 +175,8 @@ def run_sgemmx_capture(args):
155175
configs.append(f"-DPLATFORM_MEMORY_NUM_BANKS={args.mem_banks}")
156176
if args.mem_data_size is not None:
157177
configs.append(f"-DPLATFORM_MEMORY_DATA_SIZE={args.mem_data_size}")
178+
if args.app == "sgemm_tcu":
179+
configs.append("-DEXT_TCU_ENABLE")
158180

159181
env = os.environ.copy()
160182
existing = env.get("CONFIGS", "")
@@ -248,7 +270,7 @@ def parse_perf(output):
248270
# ────────────────────────────────────────────────────────────────────────────
249271

250272
def compute_metrics(args, perf):
251-
n = args.n
273+
m_dim, n_dim, k_dim = _matrix_dims(args)
252274
freq_hz = args.freq * 1e6 # MHz → Hz
253275
num_hw_threads = args.cores * args.warps * args.threads
254276
# Compute throughput threads: one warp issues per cycle per core, executing
@@ -257,14 +279,14 @@ def compute_metrics(args, perf):
257279
num_compute_threads = args.cores * args.threads
258280

259281
# --- Workload FLOPs ---
260-
# SGEMM: C = A × B → 2· FLOPs (N³ muls + N³ adds, or N³ FMAs)
261-
flops = 2.0 * n ** 3
282+
# SGEMM: C[M×N] = A[M×K] × B[K×N] → 2·M·N·K FLOPs
283+
flops = 2.0 * m_dim * n_dim * k_dim
262284

263285
# --- Arithmetic intensity ---
264286
# Ideal (Roofline "capacity" model): one-pass, perfect reuse
265-
# load A ( f32) + load B ( f32) + store C ( f32)
266-
bytes_ideal = 3 * n * n * 4 # bytes, float32
267-
ai_ideal = flops / bytes_ideal # FLOP/byte (= N/6)
287+
# load A (M·K f32) + load B (K·N f32) + store C (M·N f32)
288+
bytes_ideal = (m_dim * k_dim + k_dim * n_dim + m_dim * n_dim) * 4
289+
ai_ideal = flops / bytes_ideal
268290

269291
# Actual (from profiler cache-line accounting), if available
270292
if perf.get("actual_bytes") is not None:
@@ -443,9 +465,9 @@ def plot_roofline(args, perf, m, outfile):
443465
ax.set_xlabel("Arithmetic Intensity (FLOP / byte)", fontsize=12)
444466
ax.set_ylabel(y_label, fontsize=12)
445467
ax.set_title(
446-
f"Vortex SGEMM Roofline — "
468+
f"Vortex SGEMM Roofline — {args.app}"
447469
f"{args.cores}C / {args.warps}W / {args.threads}T, "
448-
f"N={args.n}, {args.driver}, {domain_tag}",
470+
f"M={_matrix_dims(args)[0]}, N={_matrix_dims(args)[1]}, K={_matrix_dims(args)[2]}, {args.driver}, {domain_tag}",
449471
fontsize=12, fontweight="bold",
450472
)
451473
ax.legend(fontsize=9, loc="upper left")
@@ -513,12 +535,14 @@ def _annotate(ax, ai, gfl, color, label):
513535
# ────────────────────────────────────────────────────────────────────────────
514536

515537
def print_metrics(args, perf, m):
538+
m_dim, n_dim, k_dim = _matrix_dims(args)
516539
sep = "═" * 56
517540
print(f"\n{sep}")
518541
print(" Vortex SGEMM Roofline Metrics")
519542
print(sep)
543+
print(f" App : {args.app}")
520544
print(f" Config : {args.cores}C / {args.warps}W / {args.threads}T")
521-
print(f" Matrix size : {args.n} × {args.n}")
545+
print(f" Matrix size : M={m_dim}, N={n_dim}, K={k_dim}")
522546
print(f" Driver : {args.driver}")
523547
if args.by_cycle:
524548
print(f" Domain : per-cycle")
@@ -577,7 +601,7 @@ def main():
577601
freq = read_platform_clock(bdir)
578602
args.freq = float(freq) if freq else 1.0
579603

580-
output = run_sgemmx_capture(args)
604+
output = run_app_capture(args)
581605
perf = parse_perf(output)
582606

583607
print(f"\nParsed ({perf['cores_seen']} core(s)): "

0 commit comments

Comments
 (0)