Olajide-Badejo
diff --git a/‎.gitattributes‎
Lines changed: 10 additions & 0 deletions b/‎.gitattributes‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 24 additions & 0 deletions b/‎.gitignore‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎CITATION.cff‎
Lines changed: 18 additions & 0 deletions b/‎CITATION.cff‎
Lines changed: 18 additions & 0 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 60 additions & 0 deletions b/‎CMakeLists.txt‎
Lines changed: 60 additions & 0 deletions
diff --git a/‎CONTRIBUTING.md‎
Lines changed: 35 additions & 0 deletions b/‎CONTRIBUTING.md‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎LICENSE‎
Lines changed: 21 additions & 0 deletions b/‎LICENSE‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎Makefile‎
Lines changed: 92 additions & 0 deletions b/‎Makefile‎
Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,10 @@
+* text=auto eol=lf
+
+*.cu text eol=lf
+*.cuh text eol=lf
+*.cpp text eol=lf
+*.h text eol=lf
+*.md text eol=lf
+*.txt text eol=lf
+Makefile text eol=lf
+CMakeLists.txt text eol=lf
@@ -0,0 +1,24 @@
+# Build artifacts
+bin/
+obj/
+build/
+cmake-build-*/
+
+# Benchmark outputs
+results/*.txt
+results/*.log
+results/*.csv
+results/*.json
+
+# CUDA profiling outputs
+*.ncu-rep
+*.nvprof
+*.nsys-rep
+*.qdrep
+
+# OS/editor noise
+.DS_Store
+Thumbs.db
+.vscode/
+.idea/
+*.swp
@@ -0,0 +1,18 @@
+cff-version: 1.2.0
+title: "GPU-Based Matrix Operations"
+message: "If you use this project in academic work, please cite it using this metadata."
+type: software
+authors:
+  - family-names: Badejo
+    given-names: Olajide
+repository-code: "https://github.com/Olajide-Badejo/GPU-Based-Matrix-Operations"
+license: MIT
+abstract: >
+  CUDA/C++ benchmark suite for matrix-vector and matrix-matrix kernels with
+  comparative evaluation of naive, shared-memory, and coalesced/tiled variants.
+keywords:
+  - CUDA
+  - GPU
+  - matrix multiplication
+  - memory coalescing
+  - shared memory
@@ -0,0 +1,60 @@
+cmake_minimum_required(VERSION 3.18)
+project(gpu_matrix_ops LANGUAGES CXX CUDA)
+
+# ── C++ / CUDA standards ──────────────────────────────────────────────────────
+# Require C++17 for both host and device code; fail at configure time rather
+# than silently decaying to an older standard.
+set(CMAKE_CXX_STANDARD           17)
+set(CMAKE_CXX_STANDARD_REQUIRED  ON)
+set(CMAKE_CUDA_STANDARD          17)
+set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+
+# ── CUDA architecture – fixed default list, override via -DCUDA_ARCH=86 etc. ──
+if(NOT DEFINED CUDA_ARCH)
+    # Common targets: 60=Pascal, 70=Volta, 75=Turing, 80=Ampere, 86=RTX30xx,
+    #                 89=Ada (RTX40xx), 90=Hopper
+    set(CUDA_ARCH "60;70;75;80;86;89" CACHE STRING "CUDA gencode arch list")
+endif()
+
+# project() has already filled CMAKE_CUDA_ARCHITECTURES with the compiler's
+# default, so assign the requested list outright; appending to it would keep
+# that default as an extra, unrequested target.
+set(CMAKE_CUDA_ARCHITECTURES "${CUDA_ARCH}")
+
+# ── Compiler flags ────────────────────────────────────────────────────────────
+# Append to the Release flags instead of overwriting them, so the toolchain
+# defaults (typically "-O3 -DNDEBUG" for GCC/Clang/nvcc) and any values passed
+# on the command line are preserved.
+option(GPU_MATRIX_OPS_NATIVE_ARCH
+       "Tune host code for the build machine (-march=native)" ON)
+if(GPU_MATRIX_OPS_NATIVE_ARCH)
+    string(APPEND CMAKE_CXX_FLAGS_RELEASE " -march=native")
+endif()
+string(APPEND CMAKE_CUDA_FLAGS_RELEASE " --use_fast_math")
+
+# Extra CUDA flags for all build types
+string(APPEND CMAKE_CUDA_FLAGS " --expt-relaxed-constexpr -Xcompiler=-Wall")
+
+# ── Executable ────────────────────────────────────────────────────────────────
+# Sources are listed explicitly so that added/removed files show up in diffs.
+add_executable(gpu_matrix_ops
+    src/main.cu
+    src/matvec_kernels.cu
+    src/matmul_kernels.cu
+    src/cpu_ops.cpp
+)
+
+# ── Include paths ─────────────────────────────────────────────────────────────
+# Scoped to the target (not the whole directory) and rooted at this project's
+# source dir, so the path stays correct when built as a subproject.
+target_include_directories(gpu_matrix_ops PRIVATE
+    "${PROJECT_SOURCE_DIR}/include"
+)
+
+# Compile CUDA sources with separable compilation and place the executable
+# under <build>/bin.
+# NOTE(review): separable compilation is presumably enabled because device code
+# is shared across the .cu files — confirm it is actually required.
+set_property(TARGET gpu_matrix_ops PROPERTY CUDA_SEPARABLE_COMPILATION ON)
+set_property(TARGET gpu_matrix_ops PROPERTY
+    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
+
+# CUDA-only diagnostic flags. The generator expression is quoted so that its
+# ';'-separated option list is passed as one argument and cannot be split.
+#   --ptxas-options=-v : verbose register/smem usage
+#   -lineinfo          : line info for Nsight profiling
+target_compile_options(gpu_matrix_ops PRIVATE
+    "$<$<COMPILE_LANGUAGE:CUDA>:--ptxas-options=-v;-lineinfo>"
+)
+
+# ── Install ───────────────────────────────────────────────────────────────────
+install(TARGETS gpu_matrix_ops DESTINATION bin)
+
+# ── Summary ───────────────────────────────────────────────────────────────────
+message(STATUS "CUDA architectures : ${CMAKE_CUDA_ARCHITECTURES}")
+
+# CMAKE_BUILD_TYPE is empty for multi-config generators and when the user did
+# not pass one; report that explicitly instead of printing a blank value.
+get_property(gpu_matrix_ops_is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
+if(gpu_matrix_ops_is_multi_config)
+    message(STATUS "Build type         : multi-config (${CMAKE_CONFIGURATION_TYPES})")
+elseif(CMAKE_BUILD_TYPE)
+    message(STATUS "Build type         : ${CMAKE_BUILD_TYPE}")
+else()
+    message(STATUS "Build type         : (unset; pass -DCMAKE_BUILD_TYPE=Release to apply the Release flags)")
+endif()
@@ -0,0 +1,35 @@
+# Contributing
+
+Thanks for your interest in improving this project.
+
+## Development Principles
+
+- Keep kernels correct first, then optimize.
+- Always compare GPU output against CPU reference (`max_abs_diff`).
+- Report benchmark settings with enough detail to reproduce:
+  - GPU model
+  - CUDA version
+  - driver version
+  - matrix sizes
+  - iteration counts
+  - build flags / architecture target
+
+## Local Workflow
+
+1. Create a branch for your change.
+2. Keep commits small and focused.
+3. Run a Release build and execute the benchmark binary.
+4. Include before/after metrics for performance-related changes.
+
+## Performance Change Checklist
+
+- State expected impact (latency, throughput, occupancy, memory traffic).
+- Validate at multiple problem sizes (small, medium, large).
+- Include error metrics against CPU reference.
+- Mention any architecture-specific assumptions (e.g., SM target).
+
+## Coding Style
+
+- C++17 / CUDA C++.
+- Prefer clear names and short, meaningful comments.
+- Avoid introducing dependencies unless justified by measurable benefit.
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2026 Olajide Badejo
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -0,0 +1,92 @@
+# ─────────────────────────────────────────────────────────────────────────────
+#  Makefile  —  GPU Matrix Ops
+#  Usage:
+#    make            # build with optimisations
+#    make DEBUG=1    # build with debug info / no optimisation
+#    make run        # build + execute
+#    make clean      # remove build artefacts
+#    make profile    # build with Nsight-profiling flags
+#    make info       # print the target SM and nvcc path/version
+# ─────────────────────────────────────────────────────────────────────────────
+
+# ── Toolchain ─────────────────────────────────────────────────────────────────
+NVCC := nvcc
+CXX := g++
+
+# Target SM version; defaults to 86 (Ampere RTX 30-series) unless SM is already
+# set, e.g. `make SM=75`.
+SM ?= 86
+
+# ── Directories ───────────────────────────────────────────────────────────────
+SRCDIR := src
+INCDIR := include
+BINDIR := bin
+OBJDIR := obj
+
+# ── Sources ───────────────────────────────────────────────────────────────────
+CU_SRCS := $(addprefix $(SRCDIR)/,main.cu matvec_kernels.cu matmul_kernels.cu)
+CPP_SRCS := $(addprefix $(SRCDIR)/,cpu_ops.cpp)
+
+# Each object lives in $(OBJDIR) and keeps its source extension before ".o"
+CU_OBJS := $(CU_SRCS:$(SRCDIR)/%.cu=$(OBJDIR)/%.cu.o)
+CPP_OBJS := $(CPP_SRCS:$(SRCDIR)/%.cpp=$(OBJDIR)/%.cpp.o)
+
+# Final executable
+TARGET := $(BINDIR)/gpu_matrix_ops
+
+# ── Flags ─────────────────────────────────────────────────────────────────────
+# Include path shared by the host and device compilers.
+COMMON_FLAGS := -I$(INCDIR)
+
+# DEBUG=1 selects unoptimised builds with host (-g) and device (-G) debug info;
+# otherwise an optimised build with line info and ptxas statistics is produced.
+ifdef DEBUG
+  NVCC_FLAGS  := -g -G -O0 -arch=sm_$(SM) $(COMMON_FLAGS)
+  CXX_FLAGS   := -g -O0 $(COMMON_FLAGS)
+else
+  NVCC_FLAGS  := -O3 --use_fast_math -arch=sm_$(SM) \
+                 --expt-relaxed-constexpr \
+                 -lineinfo \
+                 --ptxas-options=-v \
+                 $(COMMON_FLAGS)
+  CXX_FLAGS   := -O3 -march=native $(COMMON_FLAGS)
+endif
+
+# Profiling build: requested with `make PROFILE=1` or via the `profile` goal.
+# Conditionals are evaluated while the Makefile is parsed, before any
+# target-specific variable is in effect, so the goal list is checked as well.
+ifneq (,$(strip $(PROFILE) $(filter profile,$(MAKECMDGOALS))))
+  # Ensure -lineinfo is present exactly once. -G is intentionally not added:
+  # it turns off device-code optimisation and takes precedence over -lineinfo,
+  # which would make profiling results unrepresentative.
+  NVCC_FLAGS  := $(filter-out -lineinfo,$(NVCC_FLAGS)) -lineinfo
+endif
+
+# CUDA runtime library passed at link time.
+LINK_FLAGS := -lcudart
+
+# ── Default target ─────────────────────────────────────────────────────────────
+# None of these goals produce a file of the same name; declaring them phony
+# keeps a stray file called e.g. `info` or `clean` from masking the rule.
+.PHONY: all run clean profile info
+
+all: $(TARGET)
+
+# Link all objects with nvcc.
+$(TARGET): $(CU_OBJS) $(CPP_OBJS)
+	@mkdir -p $(BINDIR)
+	$(NVCC) $(NVCC_FLAGS) $^ -o $@ $(LINK_FLAGS)
+	@echo ""
+	@echo "  Built: $@"
+
+# CUDA sources: compiled with -dc (relocatable device code).
+$(OBJDIR)/%.cu.o: $(SRCDIR)/%.cu
+	@mkdir -p $(OBJDIR)
+	$(NVCC) $(NVCC_FLAGS) -dc -c $< -o $@
+
+# Host-only C++ sources: compiled with the host compiler.
+$(OBJDIR)/%.cpp.o: $(SRCDIR)/%.cpp
+	@mkdir -p $(OBJDIR)
+	$(CXX) $(CXX_FLAGS) -c $< -o $@
+
+# Build, then execute the benchmark binary.
+run: all
+	@echo ""
+	./$(TARGET)
+
+profile: PROFILE=1
+profile: all
+	@echo "  Binary built with profiling flags."
+	@echo "  Run with: ncu --set full ./$(TARGET)"
+	@echo "         or: nvprof ./$(TARGET)"
+
+# Remove the object and binary directories.
+clean:
+	rm -rf $(OBJDIR) $(BINDIR)
+	@echo "  Cleaned."
+
+# Show which SM version is being compiled for
+info:
+	@echo "  Target SM : sm_$(SM)"
+	@echo "  NVCC      : $(shell which $(NVCC))"
+	@echo "  CUDA ver  : $(shell $(NVCC) --version | tail -1)"