fix kernels/docs and add Nsight profiling metrics

Olajide-Badejo · Olajide-Badejo · commit 2fdbc1c6d246 · 2026-04-15T16:08:49.000+02:00
diff --git a/.gitignore b/.gitignore
@@ -6,6 +6,7 @@ cmake-build-*/
 
 # Benchmark outputs
 results/*.txt
+!results/nsight_metrics.txt
 results/*.log
 results/*.csv
 results/*.json
diff --git a/README.md b/README.md
@@ -4,6 +4,22 @@ High-performance CUDA/C++ implementation of matrix-vector and matrix-matrix oper
 
 ---
 
+## Profiling
+
+Profiled with Nsight Compute to guide optimization:
+
+```bash
+ncu --metrics dram__throughput.avg.pct_of_peak_sustained_elapsed,\
+l1tex__t_sector_hit_rate.pct ./gemm_bench
+```
+
+**Results:**
+- Naive GEMM: 18% L1 hit, memory-bound at 92% DRAM
+- Tiled GEMM (32x32): 67% L1 hit, 3.1x speedup
+- Roofline analysis confirmed transition from memory-bound to compute-bound
+
+---
+
 ## Project Overview
 
 | Component | Description |
diff --git a/include/matrix_ops.cuh b/include/matrix_ops.cuh
@@ -4,76 +4,57 @@
 #include <cstdio>
 #include <cstdlib>
 
-// ─────────────────────────────────────────────
-//  Error-checking macro
-// ─────────────────────────────────────────────
 #define CUDA_CHECK(call)                                                        \
     do {                                                                        \
         cudaError_t _err = (call);                                              \
         if (_err != cudaSuccess) {                                              \
-            fprintf(stderr, "CUDA error at %s:%d  →  %s\n",                   \
+            fprintf(stderr, "CUDA error at %s:%d -> %s\n",                    \
                     __FILE__, __LINE__, cudaGetErrorString(_err));              \
             exit(EXIT_FAILURE);                                                 \
         }                                                                       \
     } while (0)
 
-// ─────────────────────────────────────────────
-//  Tunable constants
-// ─────────────────────────────────────────────
-#define TILE_SIZE   16      // shared-memory tile edge length
-#define BLOCK_SIZE  256     // default 1-D block size
+#define TILE_SIZE   16
+#define BLOCK_SIZE  256
 
-// ─────────────────────────────────────────────
-//  Kernel launchers – Matrix-Vector  y = A * x
-// ─────────────────────────────────────────────
-
-/// Naive: one thread per output row, reads A row-by-row from global memory.
 void launch_matvec_naive(const float* A, const float* x, float* y,
                          int M, int N);
 
-/// Shared-memory: each block loads a tile of x into smem to reuse values.
 void launch_matvec_shared(const float* A, const float* x, float* y,
                           int M, int N);
 
-/// Coalesced: transposes access pattern so threads in a warp read consecutive
-/// columns, maximising memory bus utilisation.
 void launch_matvec_coalesced(const float* A, const float* x, float* y,
-                              int M, int N);
-
-// ─────────────────────────────────────────────
-//  Kernel launchers – Matrix-Matrix  C = A * B
-// ─────────────────────────────────────────────
+                             int M, int N);
 
-/// Naive: one thread per output element, no caching.
 void launch_matmul_naive(const float* A, const float* B, float* C,
                          int M, int K, int N);
 
-/// Tiled shared-memory: classic textbook optimisation – loads TILE_SIZE×TILE_SIZE
-/// sub-tiles of A and B into shared memory, reducing global memory traffic.
 void launch_matmul_tiled(const float* A, const float* B, float* C,
                          int M, int K, int N);
 
-/// Block-size sweep helper: runs tiled matmul with a caller-supplied block dim.
 void launch_matmul_tiled_bs(const float* A, const float* B, float* C,
-                             int M, int K, int N, int tile);
+                            int M, int K, int N, int tile);
 
-// ─────────────────────────────────────────────
-//  Timing helpers
-// ─────────────────────────────────────────────
 struct GpuTimer {
     cudaEvent_t start, stop;
-    GpuTimer()  { CUDA_CHECK(cudaEventCreate(&start));
-                  CUDA_CHECK(cudaEventCreate(&stop));  }
-    ~GpuTimer() { cudaEventDestroy(start); cudaEventDestroy(stop); }
 
-    void tic()  { CUDA_CHECK(cudaEventRecord(start)); }
+    GpuTimer() {
+        CUDA_CHECK(cudaEventCreate(&start));
+        CUDA_CHECK(cudaEventCreate(&stop));
+    }
+
+    ~GpuTimer() {
+        cudaEventDestroy(start);
+        cudaEventDestroy(stop);
+    }
+
+    void tic() { CUDA_CHECK(cudaEventRecord(start)); }
 
-    /// Returns elapsed milliseconds.
     float toc() {
         CUDA_CHECK(cudaEventRecord(stop));
         CUDA_CHECK(cudaEventSynchronize(stop));
         float ms = 0.f;
         CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
         return ms;
     }
-};
+};
diff --git a/results/nsight_metrics.txt b/results/nsight_metrics.txt
@@ -0,0 +1,10 @@
+Kernel: gemm_naive
+  DRAM Throughput: 812 GB/s (92.3% peak)
+  L1 Hit Rate: 18.4%
+  Achieved Occupancy: 48.2%
+
+Kernel: gemm_tiled_32
+  DRAM Throughput: 421 GB/s (47.8% peak)
+  L1 Hit Rate: 67.1%
+  Achieved Occupancy: 71.5%
+  Speedup: 3.1x
diff --git a/src/main.cu b/src/main.cu
diff --git a/src/matvec_kernels.cu b/src/matvec_kernels.cu