rusoleal
diff --git a/‎.github/workflows/cmake-multi-platform.yml‎
Lines changed: 16 additions & 18 deletions b/‎.github/workflows/cmake-multi-platform.yml‎
Lines changed: 16 additions & 18 deletions
diff --git a/‎CHANGELOG.md‎
Lines changed: 28 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎CLAUDE.md‎
Lines changed: 3 additions & 2 deletions b/‎CLAUDE.md‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 53 additions & 13 deletions b/‎CMakeLists.txt‎
Lines changed: 53 additions & 13 deletions
diff --git a/‎README.md‎
Lines changed: 3 additions & 2 deletions b/‎README.md‎
Lines changed: 3 additions & 2 deletions
@@ -1,5 +1,3 @@
-# This starter workflow is for a CMake project running on multiple platforms. There is a different starter workflow if you just want a single platform.
-# See: https://github.com/actions/starter-workflows/blob/main/ci/cmake-single-platform.yml
 name: CMake on multiple platforms
 
 on:
@@ -13,18 +11,16 @@ jobs:
     runs-on: ${{ matrix.os }}
 
     strategy:
-      # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable.
       fail-fast: false
 
-      # Set up a matrix to run the following 3 configurations:
-      # 1. <Windows, Release, latest MSVC compiler toolchain on the default runner image, default generator>
-      # 2. <Linux, Release, latest GCC compiler toolchain on the default runner image, default generator>
-      # 3. <Linux, Release, latest Clang compiler toolchain on the default runner image, default generator>
-      #
-      # To add more build types (Release, Debug, RelWithDebInfo, etc.) customize the build_type list.
+      # Matrix: OS × build type (Release, Debug) × compiler
+      # ubuntu-latest        → x86-64 (GCC, Clang)
+      # ubuntu-24.04-arm     → ARM64 / NEON (GCC, Clang)
+      # macos-latest         → Apple Silicon ARM64 (Clang)
+      # windows-latest       → x86-64 (MSVC)
       matrix:
-        os: [ubuntu-latest, windows-latest, macos-latest]
-        build_type: [Release]
+        os: [ubuntu-latest, ubuntu-24.04-arm, windows-latest, macos-latest]
+        build_type: [Release, Debug]
         c_compiler: [gcc, clang, cl]
         include:
           - os: windows-latest
@@ -36,6 +32,12 @@ jobs:
           - os: ubuntu-latest
             c_compiler: clang
             cpp_compiler: clang++
+          - os: ubuntu-24.04-arm
+            c_compiler: gcc
+            cpp_compiler: g++
+          - os: ubuntu-24.04-arm
+            c_compiler: clang
+            cpp_compiler: clang++
           - os: macos-latest
             c_compiler: clang
             cpp_compiler: clang++
@@ -46,6 +48,8 @@ jobs:
             c_compiler: clang
           - os: ubuntu-latest
             c_compiler: cl
+          - os: ubuntu-24.04-arm
+            c_compiler: cl
           - os: macos-latest
             c_compiler: cl
           - os: macos-latest
@@ -55,15 +59,12 @@ jobs:
     - uses: actions/checkout@v4
 
     - name: Set reusable strings
-      # Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file.
       id: strings
       shell: bash
       run: |
         echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT"
 
     - name: Configure CMake
-      # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
-      # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
       run: >
         cmake -B ${{ steps.strings.outputs.build-output-dir }}
         -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }}
@@ -73,11 +74,8 @@ jobs:
         -DVECTOR_MATH_BUILD_TEST=ON
 
     - name: Build
-      # Build your program with the given configuration. Note that --config is needed because the default Windows generator is a multi-config generator (Visual Studio generator).
       run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
 
     - name: Test
       working-directory: ${{ steps.strings.outputs.build-output-dir }}
-      # Execute tests defined by the CMake configuration. Note that --build-config is needed because the default Windows generator is a multi-config generator (Visual Studio generator).
-      # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
-      run: ctest --build-config ${{ matrix.build_type }}
+      run: ctest --build-config ${{ matrix.build_type }} --output-on-failure
@@ -1,5 +1,33 @@
 # Changelog
 
+## [0.3.0] - 2026-03-18
+
+### Added
+- **`Vector4d` SIMD arithmetic** — all operators now have AVX (x86) and NEON (AArch64) paths instead of falling back to the scalar `Vector4<double>` base class:
+  - `operator+` / `operator-`: `_mm256_add/sub_pd` · `vaddq/vsubq_f64`
+  - unary `operator-`: `_mm256_xor_pd` with sign-bit mask · `vnegq_f64`
+  - `operator*(scalar)` / `operator/(scalar)`: `_mm256_mul/div_pd` · `vmulq_n_f64`
+  - `dot()` (instance + static): AVX hadd trick (`_mm256_hadd_pd` + 128-bit extract) · `vpaddq_f64` + `vaddvq_f64`
+- **`Matrix4d` SIMD add / subtract / negate / scalar-multiply** — new overrides replace the 16-element scalar loop with 4 AVX 256-bit ops (one per row) or 8 NEON 128-bit ops:
+  - `operator+(Matrix4d)` / `operator-(Matrix4d)`: `_mm256_add/sub_pd` · `vaddq/vsubq_f64`
+  - unary `operator-()`: `_mm256_xor_pd` · `vnegq_f64`
+  - `operator*(double)`: `_mm256_mul_pd` + `_mm256_set1_pd` · `vmulq_n_f64`
+- **FMA3 in `Matrix4d` mat×mat and mat×vec** (x86) — `mul + add` pairs replaced with `_mm256_fmadd_pd` when compiled with `-mfma` (`__FMA__` defined); reduces 7 instructions to 4 per accumulation step
+- **`Matrix4f * Vector4f` ARM NEON** — previously fell back to scalar; now uses two `vpaddq_f32` passes to compute all four dot products simultaneously
+- **`-mfma` compiler flag** added to the x86 CMake path (gcc/clang: `-mavx -mfma`; MSVC: `/arch:AVX2`)
+- Benchmarks for all new operations: `BM_Vector4dSIMDAdd`, `BM_Vector4dSIMDScalarMultiply`, `BM_Vector4dSIMDDot`, `BM_Matrix4dSIMDAdd`, `BM_Matrix4dSIMDScalarMultiply`, `BM_Matrix4fByVectorGeneric`, with scalar and GLM baselines
+- **AArch64 NEON** full implementation for `Matrix4d` matrix–matrix and matrix–vector multiply (`float64x2_t`, `vfmaq_f64`, `vpaddq_f64`)
+- `Matrix4d::lookAt` optimized inline override — avoids generic `Vec<>` loop overhead and intermediate temporaries
+- CMake **install support**: `GNUInstallDirs`, `CMakePackageConfigHelpers`, package config files (`vector_mathConfig.cmake`, `vector_mathConfigVersion.cmake`), and `INSTALL_INTERFACE` include paths
+- Benchmark suite expanded: `Matrix4f`, `Matrix4d`, `Quaternion`, and GLM comparison benchmarks; removed dummy `BM_StringCreation`
+
+### Changed
+- `Matrix4d` and `Matrix4f` implementations moved from `.cpp` translation units to **header-only inline** methods — `src/matrix4d.cpp` and `src/matrix4f.cpp` removed
+- `Matrix4::identity()` now returns a cached `static const` instance (computed once via IIFE) instead of allocating a local array on every call
+- CI: added **`ubuntu-24.04-arm`** runner (AArch64 NEON coverage); added **Debug** build type alongside Release; `ctest` now runs with `--output-on-failure`
+
+---
+
 ## [0.2.0] - 2026-03-17
 
 ### Added
 
@@ -55,9 +55,10 @@ Architecture is detected at compile time:
 
 `Matrix4f` uses SSE 128-bit intrinsics (4×float). `Matrix4d` uses AVX 256-bit intrinsics (4×double). On AArch64 both types use NEON; on 32-bit ARM (ARMv7) `Matrix4f` uses NEON while `Matrix4d` falls back to scalar operations.
 
-### Known issues
+### Known limitations
 
-- `matrix4d` AVX implementation is broken (commit `f7bf612`). The scalar fallback is used on ARM; the AVX path may mix `_mm256_add_ps` (32-bit) with `_mm256_add_pd` (64-bit) incorrectly.
+- `Matrix4d` ARM 32-bit (ARMv7) uses scalar fallback — `float64x2_t` is AArch64-only. AArch64 (Apple Silicon, `ubuntu-24.04-arm`) uses the full NEON implementation.
+- `Matrix4f` vector-multiply `#else` fallback (non-x86, non-ARM) uses a reinterpret cast (`*(Vector4f*)&toReturn`) rather than the copy constructor; this is technically undefined behavior (UB), though harmless in practice.
 
 ### Dependencies (auto-fetched by CMake via FetchContent)
 
 
@@ -1,22 +1,25 @@
 cmake_minimum_required(VERSION 3.22.1)
 
-project(vector_math VERSION 0.2.0)
+project(vector_math VERSION 0.3.0)
 
 configure_file(src/vector_math_config.h.in vector_math_config.h)
 
 set(CMAKE_CXX_STANDARD 20)
 
+include(GNUInstallDirs)
+include(CMakePackageConfigHelpers)
+
 message(STATUS ${CMAKE_SYSTEM_PROCESSOR})
 
 # enable avx simd extension for x86 processor family
 if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)|(x86_64)")
     message(STATUS "Enabling AVX support")
     if(MSVC AND NOT MSVC_VERSION LESS 1600)
-        message(STATUS "Enabling AVX support for MSVC")
-        set( CMAKE_CXX_FLAGS "/arch:AVX")
+        message(STATUS "Enabling AVX2+FMA support for MSVC")
+        set( CMAKE_CXX_FLAGS "/arch:AVX2")
     else()
-        message(STATUS "Enabling AVX support for gcc/clang")
-        set( CMAKE_CXX_FLAGS "-mavx")
+        message(STATUS "Enabling AVX+FMA support for gcc/clang")
+        set( CMAKE_CXX_FLAGS "-mavx -mfma")
     endif()
 else ()
 endif ()
@@ -31,17 +34,54 @@ message(STATUS "Building ${PROJECT_NAME}...")
 
 add_library(${PROJECT_NAME}
     src/vector_math.cpp
-    src/matrix4f.cpp
-    src/matrix4d.cpp
 )
 
-target_include_directories (vector_math PUBLIC 
-                            "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/inc>"
-                            )
-
 target_include_directories(vector_math PUBLIC
-                           "${PROJECT_BINARY_DIR}"
-                           )
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/inc>
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
+)
+
+# ── Install ───────────────────────────────────────────────────────────────────
+
+install(TARGETS vector_math
+    EXPORT vector_mathTargets
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+)
+
+install(DIRECTORY inc/
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+)
+
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/vector_math_config.h
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+)
+
+install(EXPORT vector_mathTargets
+    FILE vector_mathTargets.cmake
+    NAMESPACE vector_math::
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/vector_math
+)
+
+configure_package_config_file(
+    cmake/vector_mathConfig.cmake.in
+    ${CMAKE_CURRENT_BINARY_DIR}/vector_mathConfig.cmake
+    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/vector_math
+)
+
+write_basic_package_version_file(
+    ${CMAKE_CURRENT_BINARY_DIR}/vector_mathConfigVersion.cmake
+    VERSION ${PROJECT_VERSION}
+    COMPATIBILITY SameMajorVersion
+)
+
+install(FILES
+    ${CMAKE_CURRENT_BINARY_DIR}/vector_mathConfig.cmake
+    ${CMAKE_CURRENT_BINARY_DIR}/vector_mathConfigVersion.cmake
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/vector_math
+)
 
 if (VECTOR_MATH_BUILD_TEST)
     include(CTest)
 
@@ -16,7 +16,7 @@ A C++17 vector and matrix mathematics library with SIMD acceleration for x86/x64
 ## Requirements
 
 - CMake 3.22.1+
-- C++17 compiler (GCC, Clang, MSVC)
+- C++20 compiler (GCC, Clang, MSVC)
 
 ## Building
 
@@ -52,6 +52,7 @@ Architecture is detected automatically at compile time:
 | Architecture | Intrinsics | Types accelerated |
 |---|---|---|
 | x86/x64 | SSE / AVX + FMA (gcc/clang: `-mavx -mfma`; MSVC: `/arch:AVX2`) | `Matrix4f` (SSE), `Matrix4d` (AVX + FMA) |
-| ARM | NEON | scalar fallback (in progress) |
+| AArch64 | NEON | `Matrix4f` (NEON), `Matrix4d` (NEON) |
+| ARM 32-bit | NEON | `Matrix4f` (NEON), `Matrix4d` (scalar fallback) |
 
 `Vector4f` and `Matrix4f` use `alignas(16)` to satisfy SIMD alignment requirements.