-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCMakeLists.txt
More file actions
199 lines (173 loc) · 7.37 KB
/
CMakeLists.txt
File metadata and controls
199 lines (173 loc) · 7.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# Minimum 3.18 (first release with robust CMAKE_CUDA_ARCHITECTURES handling);
# the range form opts in to NEW policy behavior up to 3.29 when available.
cmake_minimum_required(VERSION 3.18...3.29)
project(matrix-ops-lib
VERSION 0.1.0
DESCRIPTION "GPU-accelerated matrix operations library with CUDA tensor core support"
LANGUAGES CXX CUDA
)
# ============================================================================
# C++ / CUDA standards
# ============================================================================
# Require exactly C++17 / CUDA C++17 with no compiler fallback, and disable
# vendor extensions so the host compiler emits -std=c++17, not -std=gnu++17.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_EXTENSIONS OFF)
# ============================================================================
# Default build type
# ============================================================================
# Default to Release, but only for single-config generators: multi-config
# generators (Visual Studio, Xcode, Ninja Multi-Config) ignore
# CMAKE_BUILD_TYPE and choose the configuration at build time, so forcing a
# value there would be misleading. FORCE is safe here because we only write
# the cache entry when the user supplied no value at all.
get_property(_matlib_is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
if(NOT _matlib_is_multi_config AND NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
# Populate the drop-down in cmake-gui / ccmake.
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS Debug Release RelWithDebInfo MinSizeRel)
endif()
# ============================================================================
# CUDA architecture selection
# Defaults to Ampere (SM 8.0) + Hopper (SM 9.0). Override with:
# cmake -DCMAKE_CUDA_ARCHITECTURES="70;80;89;90" ..
# ============================================================================
# Respect any value the user (or a parent project / toolchain file) already
# provided; otherwise fall back to the Ampere+Hopper pair.
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES "80;90")
message(STATUS "CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES} (default)")
endif()
# ============================================================================
# Find CUDA libraries
# ============================================================================
# CUDAToolkit provides the CUDA::cudart / CUDA::cublas / CUDA::cusolver /
# CUDA::cusparse imported targets linked below.
find_package(CUDAToolkit REQUIRED)
message(STATUS "CUDA version: ${CUDAToolkit_VERSION}")
# ============================================================================
# Custom CMake modules
# ============================================================================
# Use PROJECT_SOURCE_DIR, not CMAKE_SOURCE_DIR: the latter points at the top
# of the whole build tree and breaks when this project is consumed via
# add_subdirectory / FetchContent from a parent project.
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
include(TensorCoreDetection)
# ============================================================================
# Compiler flags
# ============================================================================
# Enable all CUDA warnings and optimise for the target architecture.
# Per-config flags are expressed as $<CONFIG:...> generator expressions
# rather than an if(CMAKE_BUILD_TYPE ...) test: CMAKE_BUILD_TYPE is empty
# under multi-config generators, which would silently select the wrong
# branch at configure time.
set(CUDA_COMMON_FLAGS
--use_fast_math # Enable FP32 fast math intrinsics
--expt-relaxed-constexpr # Allow constexpr in device code
--expt-extended-lambda # Allow lambdas in device code
-Xcompiler=-Wall # Host compiler warnings
-Xcompiler=-Wextra
-Xcompiler=-Wno-unused-parameter
$<$<CONFIG:Debug>:-g;-G> # Device-side debug info
$<$<NOT:$<CONFIG:Debug>>:-O3;--ptxas-options=-v> # Optimised builds + register/spill report
)
# ============================================================================
# Core library sources
# ============================================================================
set(MATLIB_SOURCES
src/blas/gemm/gemm_naive.cu
src/blas/gemm/gemm_tiled.cu
src/blas/gemm/gemm_registers.cu
src/blas/gemm/gemm_tensor_core.cu
src/blas/gemm/gemm_bf16.cu
src/blas/gemm/gemm_dispatcher.cu
src/blas/gemm/fp_convert.cu
src/blas/gemv.cu
src/blas/trsm.cu
src/blas/batched_gemm.cu
src/sparse/spmv_csr.cu
src/sparse/merge_path_spmv.cu
)
# Static library (used by tests and benchmarks).
add_library(matlib_static STATIC ${MATLIB_SOURCES})
# Namespaced alias so consumers get a hard configure-time error on typos
# instead of a silent link failure.
add_library(matlib::matlib ALIAS matlib_static)
# PROJECT_SOURCE_DIR keeps these paths correct when this project is built
# as a subproject of a larger tree.
target_include_directories(matlib_static PUBLIC
"${PROJECT_SOURCE_DIR}/include"
"${PROJECT_SOURCE_DIR}/kernels"
)
# PUBLIC: the CUDA runtime/BLAS types appear in the public headers' API,
# so consumers need these usage requirements too.
target_link_libraries(matlib_static PUBLIC
CUDA::cudart
CUDA::cublas
CUDA::cusolver
CUDA::cusparse
)
# Apply the CUDA flag set only to CUDA translation units.
target_compile_options(matlib_static PRIVATE
"$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_COMMON_FLAGS}>"
)
# ============================================================================
# Unit tests
# ============================================================================
option(MATLIB_BUILD_TESTS "Build unit and integration tests" ON)
if(MATLIB_BUILD_TESTS)
find_package(GTest QUIET)
if(NOT GTest_FOUND)
# Automatically download GoogleTest via FetchContent.
include(FetchContent)
# On MSVC, make gtest link the shared CRT so it matches the default
# runtime of the rest of the build.
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
FetchContent_Declare(
googletest
GIT_REPOSITORY https://github.com/google/googletest.git
GIT_TAG v1.14.0
)
FetchContent_MakeAvailable(googletest)
endif()
enable_testing()
# Helper to add a test executable with common setup.
# A function (not a macro) keeps its arguments properly scoped; target
# creation and test registration still affect the directory as before.
#   name   - target and CTest test name
#   source - single test source file
function(add_matlib_test name source)
add_executable(${name} ${source})
target_include_directories(${name} PRIVATE "${PROJECT_SOURCE_DIR}/include")
target_link_libraries(${name} PRIVATE
matlib_static
GTest::gtest_main
CUDA::cudart
CUDA::cublas
CUDA::cusolver
)
add_test(NAME ${name} COMMAND ${name})
endfunction()
add_matlib_test(test_gemm tests/unit/test_gemm.cpp)
add_matlib_test(test_trsm tests/unit/test_trsm.cpp)
add_matlib_test(test_tensor_core tests/unit/test_tensor_core.cpp)
# NOTE(review): test_tensor_gemm builds the SAME source as test_tensor_core
# above — likely a copy/paste slip; confirm whether it should point at
# tests/unit/test_tensor_gemm.cpp. Kept as-is to preserve behavior.
add_matlib_test(test_tensor_gemm tests/unit/test_tensor_core.cpp)
add_matlib_test(test_bf16_gemm tests/unit/test_bf16_gemm.cpp)
add_matlib_test(test_accuracy tests/numerical/test_accuracy.cpp)
add_matlib_test(test_conditioning tests/numerical/test_conditioning.cpp)
add_matlib_test(test_cublas_compat tests/integration/test_cublas_compat.cpp)
endif()
# ============================================================================
# Benchmarks
# ============================================================================
option(MATLIB_BUILD_BENCHMARKS "Build benchmark executables" ON)
if(MATLIB_BUILD_BENCHMARKS)
# Helper to add a benchmark executable with common setup.
# A function (not a macro) keeps its arguments properly scoped.
#   name   - benchmark target name
#   source - single benchmark source file
function(add_matlib_bench name source)
add_executable(${name} ${source})
target_include_directories(${name} PRIVATE "${PROJECT_SOURCE_DIR}/include")
target_link_libraries(${name} PRIVATE
matlib_static
CUDA::cudart
CUDA::cublas
)
endfunction()
add_matlib_bench(gemm_bench benchmarks/gemm_bench.cpp)
add_matlib_bench(gemm_sweep benchmarks/gemm_sweep.cpp)
add_matlib_bench(compare_cublas benchmarks/compare_cublas.cpp)
add_matlib_bench(kernel_search benchmarks/autotuning/kernel_search.cpp)
endif()
# ============================================================================
# Python bindings (optional - requires pybind11)
# ============================================================================
# OFF by default since it pulls in a pybind11 dependency. The PythonBindings
# module is expected in the cmake/ directory added to CMAKE_MODULE_PATH
# above — presumably it defines the extension target; confirm its contents.
option(MATLIB_BUILD_PYTHON "Build Python pybind11 extension" OFF)
if(MATLIB_BUILD_PYTHON)
include(PythonBindings)
endif()
# ============================================================================
# Install
# ============================================================================
# GNUInstallDirs supplies platform-correct, user-overridable destinations
# (e.g. lib64 on some Linux distros) instead of hardcoded lib/include.
include(GNUInstallDirs)
install(TARGETS matlib_static
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
install(DIRECTORY include/matlib DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
install(FILES include/matlib.hpp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
# ============================================================================
# Summary
# ============================================================================
# Configure-time report of the effective options. Note: Build type is empty
# under multi-config generators, which select the config at build time.
# HAS_TC / HAS_BF16 are presumably set by the TensorCoreDetection module
# included above — they are not defined anywhere in this file; verify the
# module exports exactly these names.
message(STATUS "")
message(STATUS "matrix-ops-lib configuration summary:")
message(STATUS " Build type : ${CMAKE_BUILD_TYPE}")
message(STATUS " CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
message(STATUS " FP16 tensor cores : ${HAS_TC}")
message(STATUS " BF16 tensor cores : ${HAS_BF16}")
message(STATUS " Build tests : ${MATLIB_BUILD_TESTS}")
message(STATUS " Build benchmarks : ${MATLIB_BUILD_BENCHMARKS}")
message(STATUS " Build Python ext : ${MATLIB_BUILD_PYTHON}")
message(STATUS "")