1313# limitations under the License.
1414#
1515# Vortex SGEMM Roofline Plotter
16- # Runs the regression/sgemmx kernel via ci/blackbox.sh and plots the roofline.
16+ # Runs a SGEMM-style regression kernel via ci/blackbox.sh and plots roofline metrics.
1717#
1818# Usage (from build dir):
19- # python3 tests/regression/sgemmx/roofline.py [--driver=rtlsim] [--cores=1]
19+ # python3 perf/roofline.py [--app=sgemmx] [--driver=rtlsim] [--cores=1]
2020# [--warps=4] [--threads=4] [--n=128]
2121# [--freq=<auto>] [--bw=51.2] [--perf=1]
2222# [--output=roofline.png]
2323#
2424# Usage (from source tree, targeting a specific build dir):
25- # python3 tests/regression/sgemmx/roofline.py --build-dir=build_test32 ...
25+ # python3 perf/roofline.py --build-dir=build_test32 ...
2626
2727import argparse
2828import os
@@ -64,6 +64,9 @@ def parse_args():
6464 p .add_argument ("--driver" , default = "rtlsim" ,
6565 choices = ["rtlsim" , "simx" , "opae" , "xrt" ],
6666 help = "Vortex driver" )
67+ p .add_argument ("--app" , default = "sgemmx" ,
68+ choices = ["sgemmx" , "sgemm_tcu" ],
69+ help = "Regression app to run" )
6770 p .add_argument ("--cores" , type = int , default = 1 ,
6871 help = "Number of cores (NUM_CORES)" )
6972 p .add_argument ("--warps" , type = int , default = 4 ,
@@ -74,6 +77,10 @@ def parse_args():
7477 help = "Issue width (ISSUE_WIDTH)" )
7578 p .add_argument ("--n" , type = int , default = 32 ,
7679 help = "Square matrix dimension N (SGEMM computes N×N × N×N)" )
80+ p .add_argument ("--m" , type = int , default = None ,
81+ help = "M dimension for non-square SGEMM (default: use --n)" )
82+ p .add_argument ("--k" , type = int , default = None ,
83+ help = "K dimension for non-square SGEMM (default: use --n)" )
7784 p .add_argument ("--freq" , type = float , default = 0 ,
7885 help = "Pipeline clock frequency in MHz "
7986 "(0 = per-cycle mode, implies --by-cycle; "
@@ -129,15 +136,28 @@ def find_blackbox(args):
129136 return bb , bdir
130137
131138
132- def run_sgemmx_capture (args ):
133- """Run sgemmx via blackbox.sh and return captured stdout+stderr."""
139+ def _matrix_dims (args ):
140+ m_dim = args .m if args .m is not None else args .n
141+ n_dim = args .n
142+ k_dim = args .k if args .k is not None else args .n
143+ return m_dim , n_dim , k_dim
144+
145+
146+ def run_app_capture (args ):
147+ """Run the selected app via blackbox.sh and return captured stdout+stderr."""
134148 blackbox , cwd = find_blackbox (args )
135149
150+ m_dim , n_dim , k_dim = _matrix_dims (args )
151+ if args .app == "sgemm_tcu" :
152+ app_args = f"-m{ m_dim } -n{ n_dim } -k{ k_dim } "
153+ else :
154+ app_args = f"-n{ n_dim } "
155+
136156 cmd = [
137157 blackbox ,
138158 f"--driver={ args .driver } " ,
139- "--app=sgemmx " ,
140- f"--args=-n { args . n } " ,
159+ f "--app={ args . app } " ,
160+ f"--args={ app_args } " ,
141161 ]
142162 if args .perf :
143163 cmd .append (f"--perf={ args .perf } " )
@@ -155,6 +175,8 @@ def run_sgemmx_capture(args):
155175 configs .append (f"-DPLATFORM_MEMORY_NUM_BANKS={ args .mem_banks } " )
156176 if args .mem_data_size is not None :
157177 configs .append (f"-DPLATFORM_MEMORY_DATA_SIZE={ args .mem_data_size } " )
178+ if args .app == "sgemm_tcu" :
179+ configs .append ("-DEXT_TCU_ENABLE" )
158180
159181 env = os .environ .copy ()
160182 existing = env .get ("CONFIGS" , "" )
@@ -248,7 +270,7 @@ def parse_perf(output):
248270# ────────────────────────────────────────────────────────────────────────────
249271
250272def compute_metrics (args , perf ):
251- n = args . n
273+ m_dim , n_dim , k_dim = _matrix_dims ( args )
252274 freq_hz = args .freq * 1e6 # MHz → Hz
253275 num_hw_threads = args .cores * args .warps * args .threads
254276 # Compute throughput threads: one warp issues per cycle per core, executing
@@ -257,14 +279,14 @@ def compute_metrics(args, perf):
257279 num_compute_threads = args .cores * args .threads
258280
259281 # --- Workload FLOPs ---
260- # SGEMM: C = A × B → 2·N³ FLOPs (N³ muls + N³ adds, or N³ FMAs)
261- flops = 2.0 * n ** 3
282+ # SGEMM: C[M×N] = A[M×K] × B[K×N] → 2·M·N·K FLOPs
283+ flops = 2.0 * m_dim * n_dim * k_dim
262284
263285 # --- Arithmetic intensity ---
264286 # Ideal (Roofline "capacity" model): one-pass, perfect reuse
265- # load A (N² f32) + load B (N² f32) + store C (N² f32)
266- bytes_ideal = 3 * n * n * 4 # bytes, float32
267- ai_ideal = flops / bytes_ideal # FLOP/byte (= N/6)
287+ # load A (M·K f32) + load B (K·N f32) + store C (M·N f32)
288+ bytes_ideal = ( m_dim * k_dim + k_dim * n_dim + m_dim * n_dim ) * 4
289+ ai_ideal = flops / bytes_ideal
268290
269291 # Actual (from profiler cache-line accounting), if available
270292 if perf .get ("actual_bytes" ) is not None :
@@ -443,9 +465,9 @@ def plot_roofline(args, perf, m, outfile):
443465 ax .set_xlabel ("Arithmetic Intensity (FLOP / byte)" , fontsize = 12 )
444466 ax .set_ylabel (y_label , fontsize = 12 )
445467 ax .set_title (
446- f"Vortex SGEMM Roofline — "
468+ f"Vortex SGEMM Roofline — { args . app } — "
447469 f"{ args .cores } C / { args .warps } W / { args .threads } T, "
448- f"N={ args . n } , { args .driver } , { domain_tag } " ,
470+ f"M= { _matrix_dims ( args )[ 0 ] } , N={ _matrix_dims ( args )[ 1 ] } , K= { _matrix_dims ( args )[ 2 ] } , { args .driver } , { domain_tag } " ,
449471 fontsize = 12 , fontweight = "bold" ,
450472 )
451473 ax .legend (fontsize = 9 , loc = "upper left" )
@@ -513,12 +535,14 @@ def _annotate(ax, ai, gfl, color, label):
513535# ────────────────────────────────────────────────────────────────────────────
514536
515537def print_metrics (args , perf , m ):
538+ m_dim , n_dim , k_dim = _matrix_dims (args )
516539 sep = "═" * 56
517540 print (f"\n { sep } " )
518541 print (" Vortex SGEMM Roofline Metrics" )
519542 print (sep )
543+ print (f" App : { args .app } " )
520544 print (f" Config : { args .cores } C / { args .warps } W / { args .threads } T" )
521- print (f" Matrix size : { args . n } × { args . n } " )
545+ print (f" Matrix size : M= { m_dim } , N= { n_dim } , K= { k_dim } " )
522546 print (f" Driver : { args .driver } " )
523547 if args .by_cycle :
524548 print (f" Domain : per-cycle" )
@@ -577,7 +601,7 @@ def main():
577601 freq = read_platform_clock (bdir )
578602 args .freq = float (freq ) if freq else 1.0
579603
580- output = run_sgemmx_capture (args )
604+ output = run_app_capture (args )
581605 perf = parse_perf (output )
582606
583607 print (f"\n Parsed ({ perf ['cores_seen' ]} core(s)): "
0 commit comments