#include <cstdio>
#include <cstdlib>

#include <cuda_runtime.h>

// ─────────────────────────────────────────────
// Error-checking macro
// ─────────────────────────────────────────────

/// Wraps a CUDA runtime call: on failure, prints the file, line, and the
/// runtime's error string to stderr, then terminates the process.
/// NOTE: there must be NO space between the macro name and '(' — a space
/// would turn this into an object-like macro and break every call site.
/// The do { } while (0) wrapper makes the macro statement-safe (usable as
/// the body of an unbraced if/else).
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t _err = (call);                                          \
        if (_err != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error at %s:%d -> %s\n",                  \
                    __FILE__, __LINE__, cudaGetErrorString(_err));          \
            exit(EXIT_FAILURE);                                            \
        }                                                                   \
    } while (0)

// ─────────────────────────────────────────────
// Tunable constants
// ─────────────────────────────────────────────
#define TILE_SIZE 16    // shared-memory tile edge length
#define BLOCK_SIZE 256  // default 1-D block size

// ─────────────────────────────────────────────
// Kernel launchers – Matrix-Vector y = A * x
// A is M×N, x has N entries, y has M entries.
// NOTE(review): assumed row-major storage — confirm against kernel impls.
// All pointers are device pointers.
// ─────────────────────────────────────────────

/// Naive: one thread per output row, reads A row-by-row from global memory.
void launch_matvec_naive(const float* A, const float* x, float* y,
                         int M, int N);

/// Shared-memory: each block loads a tile of x into smem to reuse values.
void launch_matvec_shared(const float* A, const float* x, float* y,
                          int M, int N);

/// Coalesced: transposes access pattern so threads in a warp read consecutive
/// columns, maximising memory bus utilisation.
void launch_matvec_coalesced(const float* A, const float* x, float* y,
                             int M, int N);

// ─────────────────────────────────────────────
// Kernel launchers – Matrix-Matrix C = A * B
// A is M×K, B is K×N, C is M×N. All pointers are device pointers.
// ─────────────────────────────────────────────

/// Naive: one thread per output element, no caching.
void launch_matmul_naive(const float* A, const float* B, float* C,
                         int M, int K, int N);

/// Tiled shared-memory: classic textbook optimisation – loads
/// TILE_SIZE×TILE_SIZE sub-tiles of A and B into shared memory,
/// reducing global memory traffic.
void launch_matmul_tiled(const float* A, const float* B, float* C,
                         int M, int K, int N);

/// Block-size sweep helper: runs tiled matmul with a caller-supplied
/// tile edge length instead of the compile-time TILE_SIZE.
void launch_matmul_tiled_bs(const float* A, const float* B, float* C,
                            int M, int K, int N, int tile);

// ─────────────────────────────────────────────
// Timing helpers
// ─────────────────────────────────────────────

/// RAII wrapper around a pair of CUDA events for measuring GPU time.
/// Usage:  GpuTimer t;  t.tic();  <work>;  float ms = t.toc();
/// Events are recorded without an explicit stream argument, i.e. in the
/// default stream.
struct GpuTimer {
    cudaEvent_t start, stop;

    GpuTimer() {
        CUDA_CHECK(cudaEventCreate(&start));
        CUDA_CHECK(cudaEventCreate(&stop));
    }

    // Non-copyable: a copy would destroy the same events twice.
    GpuTimer(const GpuTimer&) = delete;
    GpuTimer& operator=(const GpuTimer&) = delete;

    // Destruction deliberately skips CUDA_CHECK: calling exit() from a
    // destructor during program teardown would be worse than leaking.
    ~GpuTimer() {
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    /// Marks the start of the timed region.
    void tic() { CUDA_CHECK(cudaEventRecord(start)); }

    /// Returns elapsed milliseconds since the matching tic().
    /// Blocks the host until the stop event has completed.
    float toc() {
        CUDA_CHECK(cudaEventRecord(stop));
        CUDA_CHECK(cudaEventSynchronize(stop));
        float ms = 0.f;
        CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
        return ms;
    }
};