Skip to content

Commit 46d397a

Browse files
committed
chore: version 0.3.0
1 parent adbaa8f commit 46d397a

15 files changed

Lines changed: 1567 additions & 370 deletions
Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
# This starter workflow is for a CMake project running on multiple platforms. There is a different starter workflow if you just want a single platform.
2-
# See: https://github.com/actions/starter-workflows/blob/main/ci/cmake-single-platform.yml
31
name: CMake on multiple platforms
42

53
on:
@@ -13,18 +11,16 @@ jobs:
1311
runs-on: ${{ matrix.os }}
1412

1513
strategy:
16-
# Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable.
1714
fail-fast: false
1815

19-
# Set up a matrix to run the following 3 configurations:
20-
# 1. <Windows, Release, latest MSVC compiler toolchain on the default runner image, default generator>
21-
# 2. <Linux, Release, latest GCC compiler toolchain on the default runner image, default generator>
22-
# 3. <Linux, Release, latest Clang compiler toolchain on the default runner image, default generator>
23-
#
24-
# To add more build types (Release, Debug, RelWithDebInfo, etc.) customize the build_type list.
16+
# Matrix: OS × build type (Release, Debug) × compiler
17+
# ubuntu-latest → x86-64 (GCC, Clang)
18+
# ubuntu-24.04-arm → ARM64 / NEON (GCC, Clang)
19+
# macos-latest → Apple Silicon ARM64 (Clang)
20+
# windows-latest → x86-64 (MSVC)
2521
matrix:
26-
os: [ubuntu-latest, windows-latest, macos-latest]
27-
build_type: [Release]
22+
os: [ubuntu-latest, ubuntu-24.04-arm, windows-latest, macos-latest]
23+
build_type: [Release, Debug]
2824
c_compiler: [gcc, clang, cl]
2925
include:
3026
- os: windows-latest
@@ -36,6 +32,12 @@ jobs:
3632
- os: ubuntu-latest
3733
c_compiler: clang
3834
cpp_compiler: clang++
35+
- os: ubuntu-24.04-arm
36+
c_compiler: gcc
37+
cpp_compiler: g++
38+
- os: ubuntu-24.04-arm
39+
c_compiler: clang
40+
cpp_compiler: clang++
3941
- os: macos-latest
4042
c_compiler: clang
4143
cpp_compiler: clang++
@@ -46,6 +48,8 @@ jobs:
4648
c_compiler: clang
4749
- os: ubuntu-latest
4850
c_compiler: cl
51+
- os: ubuntu-24.04-arm
52+
c_compiler: cl
4953
- os: macos-latest
5054
c_compiler: cl
5155
- os: macos-latest
@@ -55,15 +59,12 @@ jobs:
5559
- uses: actions/checkout@v4
5660

5761
- name: Set reusable strings
58-
# Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file.
5962
id: strings
6063
shell: bash
6164
run: |
6265
echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT"
6366
6467
- name: Configure CMake
65-
# Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
66-
# See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
6768
run: >
6869
cmake -B ${{ steps.strings.outputs.build-output-dir }}
6970
-DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }}
@@ -73,11 +74,8 @@ jobs:
7374
-DVECTOR_MATH_BUILD_TEST=ON
7475
7576
- name: Build
76-
# Build your program with the given configuration. Note that --config is needed because the default Windows generator is a multi-config generator (Visual Studio generator).
7777
run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
7878

7979
- name: Test
8080
working-directory: ${{ steps.strings.outputs.build-output-dir }}
81-
# Execute tests defined by the CMake configuration. Note that --build-config is needed because the default Windows generator is a multi-config generator (Visual Studio generator).
82-
# See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
83-
run: ctest --build-config ${{ matrix.build_type }}
81+
run: ctest --build-config ${{ matrix.build_type }} --output-on-failure

CHANGELOG.md

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,33 @@
11
# Changelog
22

3+
## [0.3.0] - 2026-03-18
4+
5+
### Added
6+
- **`Vector4d` SIMD arithmetic** — all operators now have AVX (x86) and NEON (AArch64) paths instead of falling back to the scalar `Vector4<double>` base class:
7+
- `operator+` / `operator-`: `_mm256_add/sub_pd` · `vaddq/vsubq_f64`
8+
- unary `operator-`: `_mm256_xor_pd` with sign-bit mask · `vnegq_f64`
9+
- `operator*(scalar)` / `operator/(scalar)`: `_mm256_mul/div_pd` · `vmulq_n_f64`
10+
- `dot()` (instance + static): AVX hadd trick (`_mm256_hadd_pd` + 128-bit extract) · `vpaddq_f64` + `vaddvq_f64`
11+
- **`Matrix4d` SIMD add / subtract / negate / scalar-multiply** — new overrides replace the 16-element scalar loop with 4 AVX 256-bit ops (one per row) or 8 NEON 128-bit ops:
12+
- `operator+(Matrix4d)` / `operator-(Matrix4d)`: `_mm256_add/sub_pd` · `vaddq/vsubq_f64`
13+
- unary `operator-()`: `_mm256_xor_pd` · `vnegq_f64`
14+
- `operator*(double)`: `_mm256_mul_pd` + `_mm256_set1_pd` · `vmulq_n_f64`
15+
- **FMA3 in `Matrix4d` mat×mat and mat×vec** (x86) — `mul + add` pairs replaced with `_mm256_fmadd_pd` when compiled with `-mfma` (`__FMA__` defined); reduces 7 instructions to 4 per accumulation step
16+
- **`Matrix4f * Vector4f` ARM NEON** — previously fell back to scalar; now uses two `vpaddq_f32` passes to compute all four dot products simultaneously
17+
- **`-mfma` compiler flag** added to the x86 CMake path (gcc/clang: `-mavx -mfma`; MSVC: `/arch:AVX2`)
18+
- Benchmarks for all new operations: `BM_Vector4dSIMDAdd`, `BM_Vector4dSIMDScalarMultiply`, `BM_Vector4dSIMDDot`, `BM_Matrix4dSIMDAdd`, `BM_Matrix4dSIMDScalarMultiply`, `BM_Matrix4fByVectorGeneric`, with scalar and GLM baselines
19+
- **AArch64 NEON** full implementation for `Matrix4d` matrix–matrix and matrix–vector multiply (`float64x2_t`, `vfmaq_f64`, `vpaddq_f64`)
20+
- `Matrix4d::lookAt` optimized inline override — avoids generic `Vec<>` loop overhead and intermediate temporaries
21+
- CMake **install support**: `GNUInstallDirs`, `CMakePackageConfigHelpers`, package config files (`vector_mathConfig.cmake`, `vector_mathConfigVersion.cmake`), and `INSTALL_INTERFACE` include paths
22+
- Benchmark suite expanded: `Matrix4f`, `Matrix4d`, `Quaternion`, and GLM comparison benchmarks; removed dummy `BM_StringCreation`
23+
24+
### Changed
25+
- `Matrix4d` and `Matrix4f` implementations moved from `.cpp` translation units to **header-only inline** methods — `src/matrix4d.cpp` and `src/matrix4f.cpp` removed
26+
- `Matrix4::identity()` now returns a cached `static const` instance (computed once via IIFE) instead of allocating a local array on every call
27+
- CI: added **`ubuntu-24.04-arm`** runner (AArch64 NEON coverage); added **Debug** build type alongside Release; `ctest` now runs with `--output-on-failure`
28+
29+
---
30+
331
## [0.2.0] - 2026-03-17
432

533
### Added

CLAUDE.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,10 @@ Architecture is detected at compile time:
5555

5656
On x86, `Matrix4f` uses SSE 128-bit intrinsics (4×float) and `Matrix4d` uses AVX 256-bit intrinsics (4×double). On AArch64, both types use NEON. On 32-bit ARM (ARMv7), `Matrix4f` uses NEON while `Matrix4d` falls back to scalar operations.
5757

58-
### Known issues
58+
### Known limitations
5959

60-
- `matrix4d` AVX implementation is broken (commit `f7bf612`). The scalar fallback is used on ARM; the AVX path may mix `_mm256_add_ps` (32-bit) with `_mm256_add_pd` (64-bit) incorrectly.
60+
- `Matrix4d` ARM 32-bit (ARMv7) uses scalar fallback — `float64x2_t` is AArch64-only. AArch64 (Apple Silicon, `ubuntu-24.04-arm`) uses the full NEON implementation.
61+
- `Matrix4f` vector-multiply `#else` fallback (non-x86, non-ARM) uses a reinterpret cast (`*(Vector4f*)&toReturn`) rather than the copy constructor; technically UB but harmless in practice.
6162

6263
### Dependencies (auto-fetched by CMake via FetchContent)
6364

CMakeLists.txt

Lines changed: 53 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,25 @@
11
cmake_minimum_required(VERSION 3.22.1)
22

3-
project(vector_math VERSION 0.2.0)
3+
project(vector_math VERSION 0.3.0)
44

55
configure_file(src/vector_math_config.h.in vector_math_config.h)
66

77
set(CMAKE_CXX_STANDARD 20)
88

9+
include(GNUInstallDirs)
10+
include(CMakePackageConfigHelpers)
11+
912
message(STATUS ${CMAKE_SYSTEM_PROCESSOR})
1013

1114
# enable avx simd extension for x86 processor family
1215
if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)|(x86_64)")
1316
message(STATUS "Enabling AVX support")
1417
if(MSVC AND NOT MSVC_VERSION LESS 1600)
15-
message(STATUS "Enabling AVX support for MSVC")
16-
set( CMAKE_CXX_FLAGS "/arch:AVX")
18+
message(STATUS "Enabling AVX2+FMA support for MSVC")
19+
set( CMAKE_CXX_FLAGS "/arch:AVX2")
1720
else()
18-
message(STATUS "Enabling AVX support for gcc/clang")
19-
set( CMAKE_CXX_FLAGS "-mavx")
21+
message(STATUS "Enabling AVX+FMA support for gcc/clang")
22+
set( CMAKE_CXX_FLAGS "-mavx -mfma")
2023
endif()
2124
else ()
2225
endif ()
@@ -31,17 +34,54 @@ message(STATUS "Building ${PROJECT_NAME}...")
3134

3235
add_library(${PROJECT_NAME}
3336
src/vector_math.cpp
34-
src/matrix4f.cpp
35-
src/matrix4d.cpp
3637
)
3738

38-
target_include_directories (vector_math PUBLIC
39-
"$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/inc>"
40-
)
41-
4239
target_include_directories(vector_math PUBLIC
43-
"${PROJECT_BINARY_DIR}"
44-
)
40+
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/inc>
41+
$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
42+
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
43+
)
44+
45+
# ── Install ───────────────────────────────────────────────────────────────────
46+
47+
install(TARGETS vector_math
48+
EXPORT vector_mathTargets
49+
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
50+
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
51+
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
52+
)
53+
54+
install(DIRECTORY inc/
55+
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
56+
)
57+
58+
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/vector_math_config.h
59+
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
60+
)
61+
62+
install(EXPORT vector_mathTargets
63+
FILE vector_mathTargets.cmake
64+
NAMESPACE vector_math::
65+
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/vector_math
66+
)
67+
68+
configure_package_config_file(
69+
cmake/vector_mathConfig.cmake.in
70+
${CMAKE_CURRENT_BINARY_DIR}/vector_mathConfig.cmake
71+
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/vector_math
72+
)
73+
74+
write_basic_package_version_file(
75+
${CMAKE_CURRENT_BINARY_DIR}/vector_mathConfigVersion.cmake
76+
VERSION ${PROJECT_VERSION}
77+
COMPATIBILITY SameMajorVersion
78+
)
79+
80+
install(FILES
81+
${CMAKE_CURRENT_BINARY_DIR}/vector_mathConfig.cmake
82+
${CMAKE_CURRENT_BINARY_DIR}/vector_mathConfigVersion.cmake
83+
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/vector_math
84+
)
4585

4686
if (VECTOR_MATH_BUILD_TEST)
4787
include(CTest)

README.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ A C++17 vector and matrix mathematics library with SIMD acceleration for x86/x64
1616
## Requirements
1717

1818
- CMake 3.22.1+
19-
- C++17 compiler (GCC, Clang, MSVC)
19+
- C++20 compiler (GCC, Clang, MSVC)
2020

2121
## Building
2222

@@ -52,6 +52,7 @@ Architecture is detected automatically at compile time:
5252
| Architecture | Intrinsics | Types accelerated |
5353
|---|---|---|
5454
| x86/x64 | SSE / AVX / FMA (gcc/clang: `-mavx -mfma`; MSVC: `/arch:AVX2`) | `Matrix4f` (SSE), `Matrix4d` (AVX) |
55-
| ARM | NEON | scalar fallback (in progress) |
55+
| AArch64 | NEON | `Matrix4f` (NEON), `Matrix4d` (NEON) |
56+
| ARM 32-bit | NEON | `Matrix4f` (NEON), `Matrix4d` (scalar fallback) |
5657

5758
`Vector4f` and `Matrix4f` use `alignas(16)` to satisfy SIMD alignment requirements.

0 commit comments

Comments
 (0)