Skip to content

Commit fa4a9e2

Browse files
feat: initialize GPU matrix ops benchmark suite
0 parents  commit fa4a9e2

16 files changed

Lines changed: 1222 additions & 0 deletions

.gitattributes

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
* text=auto eol=lf
2+
3+
*.cu text eol=lf
4+
*.cuh text eol=lf
5+
*.cpp text eol=lf
6+
*.h text eol=lf
7+
*.md text eol=lf
8+
*.txt text eol=lf
9+
Makefile text eol=lf
10+
CMakeLists.txt text eol=lf

.gitignore

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Build artifacts
2+
bin/
3+
obj/
4+
build/
5+
cmake-build-*/
6+
7+
# Benchmark outputs
8+
results/*.txt
9+
results/*.log
10+
results/*.csv
11+
results/*.json
12+
13+
# CUDA profiling outputs
14+
*.ncu-rep
15+
*.nvprof
16+
*.nsys-rep
17+
*.qdrep
18+
19+
# OS/editor noise
20+
.DS_Store
21+
Thumbs.db
22+
.vscode/
23+
.idea/
24+
*.swp

CITATION.cff

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
cff-version: 1.2.0
2+
title: "GPU-Based Matrix Operations"
3+
message: "If you use this project in academic work, please cite it using this metadata."
4+
type: software
5+
authors:
6+
- family-names: Badejo
7+
given-names: Olajide
8+
repository-code: "https://github.com/Olajide-Badejo/GPU-Based-Matrix-Operations"
9+
license: MIT
10+
abstract: >
11+
CUDA/C++ benchmark suite for matrix-vector and matrix-matrix kernels with
12+
comparative evaluation of naive, shared-memory, and coalesced/tiled variants.
13+
keywords:
14+
- CUDA
15+
- GPU
16+
- matrix multiplication
17+
- memory coalescing
18+
- shared memory

CMakeLists.txt

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
cmake_minimum_required(VERSION 3.18)
2+
project(gpu_matrix_ops LANGUAGES CXX CUDA)
3+
4+
# ── C++ / CUDA standards ──────────────────────────────────────────────────────
5+
set(CMAKE_CXX_STANDARD 17)
6+
set(CMAKE_CUDA_STANDARD 17)
7+
8+
# ── CUDA architecture – defaults to the list below; override via -DCUDA_ARCH=86 etc. ──
9+
if(NOT DEFINED CUDA_ARCH)
10+
# Common targets: 60=Pascal, 70=Volta, 75=Turing, 80=Ampere, 86=RTX30xx,
11+
# 89=Ada (RTX40xx), 90=Hopper
12+
set(CUDA_ARCH "60;70;75;80;86;89" CACHE STRING "CUDA gencode arch list")
13+
endif()
14+
15+
# Assign rather than append: once project() has enabled CUDA, CMake may already
16+
# have seeded CMAKE_CUDA_ARCHITECTURES, and appending would keep that extra entry.
17+
set(CMAKE_CUDA_ARCHITECTURES "${CUDA_ARCH}")
18+
19+
# ── Compiler flags ────────────────────────────────────────────────────────────
# NOTE(review): the two Release assignments below replace the existing
# per-configuration flag strings instead of appending to them, so whatever
# CMake or the user had put in those variables is discarded. -march=native
# also ties the host code to the machine the build runs on — confirm both
# are intended.
20+
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native")
21+
set(CMAKE_CUDA_FLAGS_RELEASE "-O3 --use_fast_math")
22+
23+
# Extra CUDA flags for all build types (appended to the existing
# CMAKE_CUDA_FLAGS; -Xcompiler=-Wall forwards -Wall to the host compiler).
24+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \
25+
--expt-relaxed-constexpr \
26+
-Xcompiler=-Wall")
27+
28+
# ── Include paths ─────────────────────────────────────────────────────────────
29+
include_directories(${CMAKE_SOURCE_DIR}/include)
30+
31+
# ── Sources ───────────────────────────────────────────────────────────────────
32+
set(SOURCES
33+
src/main.cu
34+
src/matvec_kernels.cu
35+
src/matmul_kernels.cu
36+
src/cpu_ops.cpp
37+
)
38+
39+
# ── Executable ────────────────────────────────────────────────────────────────
40+
add_executable(gpu_matrix_ops ${SOURCES})
41+
42+
# CUDA_SEPARABLE_COMPILATION builds the .cu files as relocatable device code,
# which is what lets device code in one .cu file reference device code defined
# in another. NOTE(review): mixing .cu and .cpp sources in one target does not
# by itself require it — confirm the kernels need cross-file device linking.
# RUNTIME_OUTPUT_DIRECTORY places the executable in <build dir>/bin.
43+
set_target_properties(gpu_matrix_ops PROPERTIES
44+
CUDA_SEPARABLE_COMPILATION ON
45+
RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin
46+
)
47+
48+
target_compile_options(gpu_matrix_ops PRIVATE
49+
# CUDA-only options; each generator expression is quoted so it reaches CMake
50+
# as a single argument instead of being split at whitespace.
51+
"$<$<COMPILE_LANGUAGE:CUDA>:--ptxas-options=-v>"  # verbose register/smem usage
52+
"$<$<COMPILE_LANGUAGE:CUDA>:-lineinfo>"           # line info for Nsight profiling
53+
)
54+
55+
# ── Install ───────────────────────────────────────────────────────────────────
56+
install(TARGETS gpu_matrix_ops DESTINATION bin)
57+
58+
# ── Summary ───────────────────────────────────────────────────────────────────
59+
message(STATUS "CUDA architectures : ${CMAKE_CUDA_ARCHITECTURES}")
60+
message(STATUS "Build type : ${CMAKE_BUILD_TYPE}")

CONTRIBUTING.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# Contributing
2+
3+
Thanks for your interest in improving this project.
4+
5+
## Development Principles
6+
7+
- Keep kernels correct first, then optimize.
8+
- Always compare GPU output against CPU reference (`max_abs_diff`).
9+
- Report benchmark settings with enough detail to reproduce:
10+
- GPU model
11+
- CUDA version
12+
- driver version
13+
- matrix sizes
14+
- iteration counts
15+
- build flags / architecture target
16+
17+
## Local Workflow
18+
19+
1. Create a branch for your change.
20+
2. Keep commits small and focused.
21+
3. Run a Release build and execute the benchmark binary.
22+
4. Include before/after metrics for performance-related changes.
23+
24+
## Performance Change Checklist
25+
26+
- State expected impact (latency, throughput, occupancy, memory traffic).
27+
- Validate at multiple problem sizes (small, medium, large).
28+
- Include error metrics against CPU reference.
29+
- Mention any architecture-specific assumptions (e.g., SM target).
30+
31+
## Coding Style
32+
33+
- C++17 / CUDA C++.
34+
- Prefer clear names and short, meaningful comments.
35+
- Avoid introducing dependencies unless justified by measurable benefit.

LICENSE

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2026 Olajide Badejo
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

Makefile

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# ─────────────────────────────────────────────────────────────────────────────
2+
# Makefile — GPU Matrix Ops
3+
# Usage:
4+
# make # build with optimisations
5+
# make DEBUG=1 # build with debug info / no optimisation
6+
# make run # build + execute
7+
# make clean # remove build artefacts
8+
# make profile # build with Nsight-profiling flags
9+
# ─────────────────────────────────────────────────────────────────────────────
10+
11+
NVCC := nvcc
12+
CXX := g++
13+
14+
# Target SM version: defaults to 86 (Ampere RTX 30-series); override with e.g. `make SM=75`
15+
SM ?= 86
16+
17+
# ── Directories ───────────────────────────────────────────────────────────────
18+
SRCDIR := src
19+
INCDIR := include
20+
BINDIR := bin
21+
OBJDIR := obj
22+
23+
# ── Sources ───────────────────────────────────────────────────────────────────
24+
CU_SRCS := $(SRCDIR)/main.cu \
25+
$(SRCDIR)/matvec_kernels.cu \
26+
$(SRCDIR)/matmul_kernels.cu
27+
CPP_SRCS := $(SRCDIR)/cpu_ops.cpp
28+
29+
CU_OBJS := $(patsubst $(SRCDIR)/%.cu, $(OBJDIR)/%.cu.o, $(CU_SRCS))
30+
CPP_OBJS := $(patsubst $(SRCDIR)/%.cpp, $(OBJDIR)/%.cpp.o, $(CPP_SRCS))
31+
32+
TARGET := $(BINDIR)/gpu_matrix_ops
33+
34+
# ── Flags ─────────────────────────────────────────────────────────────────────
35+
COMMON_FLAGS := -I$(INCDIR)
36+
37+
ifdef DEBUG
38+
NVCC_FLAGS := -g -G -O0 -arch=sm_$(SM) $(COMMON_FLAGS)
39+
CXX_FLAGS := -g -O0 $(COMMON_FLAGS)
40+
else
41+
NVCC_FLAGS := -O3 --use_fast_math -arch=sm_$(SM) \
42+
--expt-relaxed-constexpr \
43+
-lineinfo \
44+
--ptxas-options=-v \
45+
$(COMMON_FLAGS)
46+
CXX_FLAGS := -O3 -march=native $(COMMON_FLAGS)
47+
endif
48+
49+
# Optional profiling flags. ifdef is evaluated while the Makefile is parsed, so
# PROFILE only takes effect when it is already set at that point, e.g. on the
# command line (make PROFILE=1) or in the environment.
# NOTE(review): -G is also used by the DEBUG build above; confirm that device
# debug code is really wanted in a profiling build.
ifdef PROFILE
50+
NVCC_FLAGS += -lineinfo -G
51+
endif
52+
53+
LINK_FLAGS := -lcudart
54+
55+
# ── Default target ─────────────────────────────────────────────────────────────
56+
# Command-style targets that never produce a file of the same name.
.PHONY: all run clean profile info
57+
58+
all: $(TARGET)
59+
60+
$(TARGET): $(CU_OBJS) $(CPP_OBJS)
61+
@mkdir -p $(BINDIR)
62+
$(NVCC) $(NVCC_FLAGS) $^ -o $@ $(LINK_FLAGS)
63+
@echo ""
64+
@echo " Built: $@"
65+
66+
$(OBJDIR)/%.cu.o: $(SRCDIR)/%.cu
67+
@mkdir -p $(OBJDIR)
68+
$(NVCC) $(NVCC_FLAGS) -dc -c $< -o $@
69+
70+
$(OBJDIR)/%.cpp.o: $(SRCDIR)/%.cpp
71+
@mkdir -p $(OBJDIR)
72+
$(CXX) $(CXX_FLAGS) -c $< -o $@
73+
74+
run: all
75+
@echo ""
76+
./$(TARGET)
77+
78+
# Profiling build. The flag is added as a target-specific NVCC_FLAGS value:
# a target-specific "PROFILE=1" is set too late for the parse-time
# "ifdef PROFILE" test and therefore never changed the flags. -G is left out
# here because it turns off device-code optimisation.
profile: NVCC_FLAGS += -lineinfo
79+
profile: all
80+
@echo " Binary built with profiling flags."
81+
@echo " Run with: ncu --set full ./$(TARGET)"
82+
@echo " or: nvprof ./$(TARGET)"
83+
84+
clean:
85+
rm -rf $(OBJDIR) $(BINDIR)
86+
@echo " Cleaned."
87+
88+
# Show which SM version is being compiled for
89+
info:
90+
@echo " Target SM : sm_$(SM)"
91+
@echo " NVCC : $(shell which $(NVCC))"
92+
@echo " CUDA ver : $(shell $(NVCC) --version | tail -1)"

0 commit comments

Comments
 (0)