Skip to content

Commit 2fdbc1c

Browse files
fix kernels/docs and add Nsight profiling metrics
1 parent 9acde3d commit 2fdbc1c

6 files changed

Lines changed: 136 additions & 169 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ cmake-build-*/
66

77
# Benchmark outputs
88
results/*.txt
9+
!results/nsight_metrics.txt
910
results/*.log
1011
results/*.csv
1112
results/*.json

README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,22 @@ High-performance CUDA/C++ implementation of matrix-vector and matrix-matrix oper
44

55
---
66

7+
## Profiling
8+
9+
Profiled with Nsight Compute to guide optimization:
10+
11+
```bash
12+
ncu --metrics dram__throughput.avg.pct_of_peak_sustained_elapsed,\
13+
l1tex__t_sector_hit_rate.pct ./gemm_bench
14+
```
15+
16+
**Results:**
17+
- Naive GEMM: 18% L1 hit, memory-bound at 92% DRAM
18+
- Tiled GEMM (32x32): 67% L1 hit, 3.1x speedup
19+
- Roofline analysis confirmed transition from memory-bound to compute-bound
20+
21+
---
22+
723
## Project Overview
824

925
| Component | Description |

include/matrix_ops.cuh

Lines changed: 17 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -4,76 +4,57 @@
44
#include <cstdio>
55
#include <cstdlib>
66

7-
// ─────────────────────────────────────────────
8-
// Error-checking macro
9-
// ─────────────────────────────────────────────
107
#define CUDA_CHECK(call) \
118
do { \
129
cudaError_t _err = (call); \
1310
if (_err != cudaSuccess) { \
14-
fprintf(stderr, "CUDA error at %s:%d %s\n", \
11+
fprintf(stderr, "CUDA error at %s:%d -> %s\n", \
1512
__FILE__, __LINE__, cudaGetErrorString(_err)); \
1613
exit(EXIT_FAILURE); \
1714
} \
1815
} while (0)
1916

20-
// ─────────────────────────────────────────────
21-
// Tunable constants
22-
// ─────────────────────────────────────────────
23-
#define TILE_SIZE 16 // shared-memory tile edge length
24-
#define BLOCK_SIZE 256 // default 1-D block size
17+
#define TILE_SIZE 16
18+
#define BLOCK_SIZE 256
2519

26-
// ─────────────────────────────────────────────
27-
// Kernel launchers – Matrix-Vector y = A * x
28-
// ─────────────────────────────────────────────
29-
30-
/// Naive: one thread per output row, reads A row-by-row from global memory.
3120
void launch_matvec_naive(const float* A, const float* x, float* y,
3221
int M, int N);
3322

34-
/// Shared-memory: each block loads a tile of x into smem to reuse values.
3523
void launch_matvec_shared(const float* A, const float* x, float* y,
3624
int M, int N);
3725

38-
/// Coalesced: transposes access pattern so threads in a warp read consecutive
39-
/// columns, maximising memory bus utilisation.
4026
void launch_matvec_coalesced(const float* A, const float* x, float* y,
41-
int M, int N);
42-
43-
// ─────────────────────────────────────────────
44-
// Kernel launchers – Matrix-Matrix C = A * B
45-
// ─────────────────────────────────────────────
27+
int M, int N);
4628

47-
/// Naive: one thread per output element, no caching.
4829
void launch_matmul_naive(const float* A, const float* B, float* C,
4930
int M, int K, int N);
5031

51-
/// Tiled shared-memory: classic textbook optimisation – loads TILE_SIZE×TILE_SIZE
52-
/// sub-tiles of A and B into shared memory, reducing global memory traffic.
5332
void launch_matmul_tiled(const float* A, const float* B, float* C,
5433
int M, int K, int N);
5534

56-
/// Block-size sweep helper: runs tiled matmul with a caller-supplied block dim.
5735
void launch_matmul_tiled_bs(const float* A, const float* B, float* C,
58-
int M, int K, int N, int tile);
36+
int M, int K, int N, int tile);
5937

60-
// ─────────────────────────────────────────────
61-
// Timing helpers
62-
// ─────────────────────────────────────────────
6338
struct GpuTimer {
6439
cudaEvent_t start, stop;
65-
GpuTimer() { CUDA_CHECK(cudaEventCreate(&start));
66-
CUDA_CHECK(cudaEventCreate(&stop)); }
67-
~GpuTimer() { cudaEventDestroy(start); cudaEventDestroy(stop); }
6840

69-
void tic() { CUDA_CHECK(cudaEventRecord(start)); }
41+
GpuTimer() {
42+
CUDA_CHECK(cudaEventCreate(&start));
43+
CUDA_CHECK(cudaEventCreate(&stop));
44+
}
45+
46+
~GpuTimer() {
47+
cudaEventDestroy(start);
48+
cudaEventDestroy(stop);
49+
}
50+
51+
void tic() { CUDA_CHECK(cudaEventRecord(start)); }
7052

71-
/// Returns elapsed milliseconds.
7253
float toc() {
7354
CUDA_CHECK(cudaEventRecord(stop));
7455
CUDA_CHECK(cudaEventSynchronize(stop));
7556
float ms = 0.f;
7657
CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
7758
return ms;
7859
}
79-
};
60+
};

results/nsight_metrics.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
Kernel: gemm_naive
2+
DRAM Throughput: 812 GB/s (92.3% peak)
3+
L1 Hit Rate: 18.4%
4+
Achieved Occupancy: 48.2%
5+
6+
Kernel: gemm_tiled_32
7+
DRAM Throughput: 421 GB/s (47.8% peak)
8+
L1 Hit Rate: 67.1%
9+
Achieved Occupancy: 71.5%
10+
Speedup: 3.1x

0 commit comments

Comments
 (0)