From 34cbef1544118152a5d569aa18b26d13c625f60e Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sun, 7 Jun 2026 17:14:14 -0400 Subject: [PATCH 1/3] removing simde (standalone) --- CMakeLists.txt | 17 +-- README.md | 4 +- cmake_modules/simde.cmake | 13 -- headers/VarIntG8IU.h | 4 +- headers/common.h | 4 +- headers/fastpfor_neon.h | 261 ++++++++++++++++++++++++++++++++++++++ src/streamvbyte.c | 4 +- src/varintdecode.c | 4 +- 8 files changed, 274 insertions(+), 37 deletions(-) delete mode 100644 cmake_modules/simde.cmake create mode 100644 headers/fastpfor_neon.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ddb29f9..bd756ac1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,10 +40,9 @@ if( SUPPORT_SSE42 ) MESSAGE( STATUS "SSE 4.2 support detected" ) else() if (SUPPORT_NEON) - include(simde) - MESSAGE(STATUS "USING SIMDE FOR SIMD OPERATIONS") + MESSAGE(STATUS "Using native ARM NEON intrinsics for SIMD operations") else () - MESSAGE(STATUS "SIMDE and SSE 4.2 support not detected") + MESSAGE(STATUS "Neither SSE 4.2 nor ARM NEON support detected") endif () endif() @@ -133,14 +132,6 @@ target_link_libraries(partitionbylength PRIVATE FastPFOR) add_executable(csv2maropu src/csv2maropu.cpp) target_link_libraries(csv2maropu PRIVATE FastPFOR) -if (SUPPORT_NEON) - target_link_libraries(FastPFOR PUBLIC simde) - target_link_libraries(gapstats PUBLIC simde) - target_link_libraries(partitionbylength PUBLIC simde) - target_link_libraries(csv2maropu PUBLIC simde) -else() - message(STATUS "SIMDE not used") -endif() add_executable(entropy src/entropy.cpp) target_link_libraries(entropy FastPFOR) @@ -236,7 +227,5 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/fastpfor.pc" if (SUPPORT_NEON) - message(WARNING "Building with emulation with SIMDE for ARM NEON support.") - message(WARNING "We do not actually support ARM NEON natively.") - message(WARNING "If you actually want native ARM NEON support, please consider providing a patch.") + message(STATUS "Building with native ARM NEON support (no SIMD emulation layer).") endif() diff --git a/README.md b/README.md index 43194aa5..6d1f8a67 100644 --- a/README.md +++ b/README.md @@ -135,8 +135,8 @@ On an x64 platform, your processor should support SSSE3. This includes almost ev sold after 2006. (Note: the key schemes require merely SSE2.) Some specific binaries will only run if your processor supports SSE4.1. They have been purely used for specific tests however. -We also support ARM platforms through SIMDe, by wrapping. The performance might be poor. If you would -like to contribute native ARM support, please provide a pull request. +We also support ARM platforms (aarch64 / ARM64) natively: the SIMD code is mapped directly to ARM NEON +intrinsics, with no emulation layer or external dependency. ## Building with CMake diff --git a/cmake_modules/simde.cmake b/cmake_modules/simde.cmake deleted file mode 100644 index 2a49903d..00000000 --- a/cmake_modules/simde.cmake +++ /dev/null @@ -1,13 +0,0 @@ -include(FetchContent) -FetchContent_Declare( - simde - GIT_REPOSITORY https://github.com/simd-everywhere/simde.git - GIT_TAG c6ddddc4a5bee9913b60de6757227aa078192663 -) -FetchContent_MakeAvailable(simde) - -add_library(simde INTERFACE IMPORTED GLOBAL) -target_include_directories(simde INTERFACE "${simde_SOURCE_DIR}") - -# Enables native aliases. Not ideal but makes it easier to convert old code. -target_compile_definitions(simde INTERFACE SIMDE_ENABLE_NATIVE_ALIASES) diff --git a/headers/VarIntG8IU.h b/headers/VarIntG8IU.h index 337073d8..222ff5b8 100644 --- a/headers/VarIntG8IU.h +++ b/headers/VarIntG8IU.h @@ -15,8 +15,8 @@ #if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) #include #elif defined(__aarch64__) -/* GCC-compatible compiler, targeting ARM with NEON */ -#include +/* GCC-compatible compiler, targeting ARM with native NEON */ +#include "fastpfor_neon.h" #endif #include "codecs.h" #ifdef __GNUC__ diff --git a/headers/common.h b/headers/common.h index b1c97b5d..9827997f 100644 --- a/headers/common.h +++ b/headers/common.h @@ -13,7 +13,7 @@ #if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) #include #elif defined(__GNUC__) && defined(__aarch64__) -#include +#include "fastpfor_neon.h" #endif #include @@ -53,7 +53,7 @@ #if (defined(_M_IX86) || defined(_M_AMD64)) #include #elif defined(_M_ARM64) -#include +#include "fastpfor_neon.h" #endif #define __attribute__(n) diff --git a/headers/fastpfor_neon.h b/headers/fastpfor_neon.h new file mode 100644 index 00000000..c2b41392 --- /dev/null +++ b/headers/fastpfor_neon.h @@ -0,0 +1,261 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire + */ + +/** + * Native ARM NEON implementations of the (small) subset of x86 SSE intrinsics + * used by FastPFOR. This replaces the SIMDe emulation layer on aarch64 targets: + * every operation below maps directly to native NEON instructions. + * + * The mappings follow the well-known SSE->NEON correspondences (the same ones + * used by projects such as sse2neon). Shift-by-amount operations are expressed + * with the NEON variable-shift instruction so they accept both compile-time + * constants (which the compiler folds to immediate shifts) and runtime counts. + * + * This header is valid in both C99 and C++ so it can be shared by the C codecs + * (streamvbyte.c, varintdecode.c) and the C++ headers. + */ +#ifndef FASTPFOR_NEON_H_ +#define FASTPFOR_NEON_H_ + +#if !(defined(__aarch64__) || (defined(_MSC_VER) && defined(_M_ARM64))) +#error "fastpfor_neon.h is only for ARM (aarch64 / ARM64) targets" +#endif + +#include +#include +#include + +typedef int64x2_t __m128i; +typedef float32x4_t __m128; + +/* ----------------------------- load / store ----------------------------- */ + +static inline __m128i _mm_loadu_si128(const __m128i *p) { + return vld1q_s64((const int64_t *)p); +} +static inline __m128i _mm_lddqu_si128(const __m128i *p) { + return vld1q_s64((const int64_t *)p); +} +static inline __m128i _mm_load_si128(const __m128i *p) { + return vld1q_s64((const int64_t *)p); +} +static inline __m128i _mm_loadl_epi64(const __m128i *p) { + return vcombine_s64(vld1_s64((const int64_t *)p), vdup_n_s64(0)); +} +static inline void _mm_storeu_si128(__m128i *p, __m128i a) { + vst1q_s64((int64_t *)p, a); +} +static inline void _mm_store_si128(__m128i *p, __m128i a) { + vst1q_s64((int64_t *)p, a); +} +static inline void _mm_stream_si128(__m128i *p, __m128i a) { + vst1q_s64((int64_t *)p, a); +} +static inline void _mm_storel_epi64(__m128i *p, __m128i a) { + vst1_s64((int64_t *)p, vget_low_s64(a)); +} + +/* --------------------------------- set ---------------------------------- */ + +static inline __m128i _mm_setzero_si128(void) { return vdupq_n_s64(0); } +static inline __m128i _mm_set1_epi32(int a) { + return vreinterpretq_s64_s32(vdupq_n_s32(a)); +} +static inline __m128i _mm_set1_epi16(short a) { + return vreinterpretq_s64_s16(vdupq_n_s16(a)); +} +static inline __m128i _mm_set1_epi8(signed char a) { + return vreinterpretq_s64_s8(vdupq_n_s8(a)); +} +static inline __m128i _mm_set_epi64x(int64_t e1, int64_t e0) { + int64_t d[2]; + d[0] = e0; + d[1] = e1; + return vld1q_s64(d); +} +static inline __m128i +_mm_set_epi8(signed char e15, signed char e14, signed char e13, signed char e12, + signed char e11, signed char e10, signed char e9, signed char e8, + signed char e7, signed char e6, signed char e5, signed char e4, + signed char e3, signed char e2, signed char e1, signed char e0) { + int8_t d[16]; + d[0] = e0; d[1] = e1; d[2] = e2; d[3] = e3; + d[4] = e4; d[5] = e5; d[6] = e6; d[7] = e7; + d[8] = e8; d[9] = e9; d[10] = e10; d[11] = e11; + d[12] = e12; d[13] = e13; d[14] = e14; d[15] = e15; + return vreinterpretq_s64_s8(vld1q_s8(d)); +} +static inline __m128i +_mm_setr_epi8(signed char e0, signed char e1, signed char e2, signed char e3, + signed char e4, signed char e5, signed char e6, signed char e7, + signed char e8, signed char e9, signed char e10, signed char e11, + signed char e12, signed char e13, signed char e14, + signed char e15) { + int8_t d[16]; + d[0] = e0; d[1] = e1; d[2] = e2; d[3] = e3; + d[4] = e4; d[5] = e5; d[6] = e6; d[7] = e7; + d[8] = e8; d[9] = e9; d[10] = e10; d[11] = e11; + d[12] = e12; d[13] = e13; d[14] = e14; d[15] = e15; + return vreinterpretq_s64_s8(vld1q_s8(d)); +} +static inline __m128i _mm_setr_epi16(short e0, short e1, short e2, short e3, + short e4, short e5, short e6, short e7) { + int16_t d[8]; + d[0] = e0; d[1] = e1; d[2] = e2; d[3] = e3; + d[4] = e4; d[5] = e5; d[6] = e6; d[7] = e7; + return vreinterpretq_s64_s16(vld1q_s16(d)); +} + +/* ----------------------------- bitwise / arith -------------------------- */ + +static inline __m128i _mm_and_si128(__m128i a, __m128i b) { + return vandq_s64(a, b); +} +static inline __m128i _mm_or_si128(__m128i a, __m128i b) { + return vorrq_s64(a, b); +} +static inline __m128i _mm_add_epi32(__m128i a, __m128i b) { + return vreinterpretq_s64_s32( + vaddq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); +} +static inline __m128i _mm_sub_epi32(__m128i a, __m128i b) { + return vreinterpretq_s64_s32( + vsubq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); +} +static inline __m128i _mm_mullo_epi32(__m128i a, __m128i b) { + return vreinterpretq_s64_s32( + vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); +} +static inline __m128i _mm_mullo_epi16(__m128i a, __m128i b) { + return vreinterpretq_s64_s16( + vmulq_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b))); +} + +/* --------------------------------- shifts ------------------------------- */ +/* Variable-shift form: accepts runtime counts; the compiler lowers a + * constant count to a native immediate shift. A right shift is a left shift + * by a negative amount (NEON semantics); counts >= element width yield 0, + * matching SSE. */ + +static inline __m128i _mm_slli_epi32(__m128i a, int imm) { + return vreinterpretq_s64_u32( + vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(imm))); +} +static inline __m128i _mm_srli_epi32(__m128i a, int imm) { + return vreinterpretq_s64_u32( + vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(-imm))); +} +static inline __m128i _mm_srli_epi16(__m128i a, int imm) { + return vreinterpretq_s64_u16( + vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16((int16_t)-imm))); +} +static inline __m128i _mm_slli_epi64(__m128i a, int imm) { + return vreinterpretq_s64_u64( + vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(imm))); +} +static inline __m128i _mm_srli_epi64(__m128i a, int imm) { + return vreinterpretq_s64_u64( + vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(-imm))); +} + +/* Whole-register byte shifts. The byte count is always a compile-time + * constant in FastPFOR, so vextq_u8 (which needs an immediate) is used. */ +#define _mm_srli_si128(a, imm) \ + vreinterpretq_s64_u8( \ + vextq_u8(vreinterpretq_u8_s64(a), vdupq_n_u8(0), (imm))) +#define _mm_slli_si128(a, imm) \ + vreinterpretq_s64_u8( \ + vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s64(a), (16 - (imm)))) + +/* ------------------------------- compares ------------------------------- */ + +static inline __m128i _mm_cmplt_epi32(__m128i a, __m128i b) { + return vreinterpretq_s64_u32( + vcltq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); +} +static inline __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) { + return vreinterpretq_s64_u8( + vceqq_u8(vreinterpretq_u8_s64(a), vreinterpretq_u8_s64(b))); +} + +/* ------------------------------- shuffles ------------------------------- */ + +static inline __m128i _mm_shuffle_epi8(__m128i a, __m128i b) { + /* pshufb: a byte of the index with its high bit set produces 0. Masking the + * index with 0x8F leaves the low nibble and the high bit; vqtbl1q_u8 then + * yields 0 for any index >= 16 (i.e. when the high bit was set). */ + uint8x16_t tbl = vreinterpretq_u8_s64(a); + uint8x16_t idx = vandq_u8(vreinterpretq_u8_s64(b), vdupq_n_u8(0x8F)); + return vreinterpretq_s64_u8(vqtbl1q_u8(tbl, idx)); +} +static inline __m128i _mm_shuffle_epi32(__m128i a, const int imm) { + uint32_t t[4]; + uint32_t r[4]; + vst1q_u32(t, vreinterpretq_u32_s64(a)); + r[0] = t[imm & 3]; + r[1] = t[(imm >> 2) & 3]; + r[2] = t[(imm >> 4) & 3]; + r[3] = t[(imm >> 6) & 3]; + return vreinterpretq_s64_u32(vld1q_u32(r)); +} +static inline __m128i _mm_blend_epi16(__m128i a, __m128i b, const int imm) { + uint16_t m[8]; + int i; + for (i = 0; i < 8; i++) + m[i] = ((imm >> i) & 1) ? (uint16_t)0xFFFF : (uint16_t)0; + return vreinterpretq_s64_u16(vbslq_u16(vld1q_u16(m), + vreinterpretq_u16_s64(b), + vreinterpretq_u16_s64(a))); +} + +/* --------------------------- extract / convert -------------------------- */ + +#define _mm_extract_epi32(a, imm) \ + vgetq_lane_s32(vreinterpretq_s32_s64(a), (imm)) + +static inline int _mm_cvtsi128_si32(__m128i a) { + return vgetq_lane_s32(vreinterpretq_s32_s64(a), 0); +} +static inline __m128i _mm_cvtepu8_epi16(__m128i a) { + return vreinterpretq_s64_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s64(a)))); +} +static inline __m128i _mm_cvtepu16_epi32(__m128i a) { + return vreinterpretq_s64_u32( + vmovl_u16(vget_low_u16(vreinterpretq_u16_s64(a)))); +} +static inline __m128i _mm_cvtepi8_epi32(__m128i a) { + int16x8_t t16 = vmovl_s8(vget_low_s8(vreinterpretq_s8_s64(a))); + return vreinterpretq_s64_s32(vmovl_s16(vget_low_s16(t16))); +} + +/* -------------------------------- masks --------------------------------- */ + +static inline int _mm_movemask_epi8(__m128i a) { + uint8x16_t input = vreinterpretq_u8_s64(a); + uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); + uint32x4_t paired16 = + vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); + uint64x2_t paired32 = + vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); + uint8x16_t paired64 = + vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); + return vgetq_lane_u8(paired64, 0) | ((int)vgetq_lane_u8(paired64, 8) << 8); +} + +/* --------------------------------- float -------------------------------- */ + +static inline __m128 _mm_castsi128_ps(__m128i a) { + return vreinterpretq_f32_s64(a); +} +static inline int _mm_movemask_ps(__m128 a) { + static const int32_t shifts[4] = {0, 1, 2, 3}; + uint32x4_t signs = vshrq_n_u32(vreinterpretq_u32_f32(a), 31); + uint32x4_t weighted = vshlq_u32(signs, vld1q_s32(shifts)); + return (int)vaddvq_u32(weighted); +} + +#endif /* FASTPFOR_NEON_H_ */ diff --git a/src/streamvbyte.c b/src/streamvbyte.c index 1f9dc6b0..94376e3f 100644 --- a/src/streamvbyte.c +++ b/src/streamvbyte.c @@ -10,7 +10,7 @@ #if (defined(_M_IX86) || defined(_M_AMD64)) #include #elif defined(_M_ARM64) - #include + #include "fastpfor_neon.h" #endif #include @@ -21,7 +21,7 @@ #include #elif defined(__aarch64__) /* GCC-compatible compiler, targeting ARM with NEON */ - #include + #include "fastpfor_neon.h" #elif defined(__GNUC__) && defined(__IWMMXT__) /* GCC-compatible compiler, targeting ARM with WMMX */ #include diff --git a/src/varintdecode.c b/src/varintdecode.c index 7807e12d..15f0cff9 100644 --- a/src/varintdecode.c +++ b/src/varintdecode.c @@ -8,7 +8,7 @@ #if (defined(_M_IX86) || defined(_M_AMD64)) #include #elif defined(_M_ARM64) - #include + #include "fastpfor_neon.h" #endif #elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) /* GCC-compatible compiler, targeting x86/x86-64 */ @@ -16,7 +16,7 @@ #elif defined(__aarch64__) /* GCC-compatible compiler, targeting ARM with NEON */ - #include + #include "fastpfor_neon.h" #elif defined(__GNUC__) && defined(__IWMMXT__) /* GCC-compatible compiler, targeting ARM with WMMX */ #include From e06e8a15605a026a1a2d6223b652dd0d4f749ecd Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sun, 7 Jun 2026 17:19:23 -0400 Subject: [PATCH 2/3] optimization --- headers/fastpfor_neon.h | 51 ++++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/headers/fastpfor_neon.h b/headers/fastpfor_neon.h index c2b41392..a2117669 100644 --- a/headers/fastpfor_neon.h +++ b/headers/fastpfor_neon.h @@ -193,22 +193,45 @@ static inline __m128i _mm_shuffle_epi8(__m128i a, __m128i b) { return vreinterpretq_s64_u8(vqtbl1q_u8(tbl, idx)); } static inline __m128i _mm_shuffle_epi32(__m128i a, const int imm) { - uint32_t t[4]; - uint32_t r[4]; - vst1q_u32(t, vreinterpretq_u32_s64(a)); - r[0] = t[imm & 3]; - r[1] = t[(imm >> 2) & 3]; - r[2] = t[(imm >> 4) & 3]; - r[3] = t[(imm >> 6) & 3]; - return vreinterpretq_s64_u32(vld1q_u32(r)); + /* Permute the four 32-bit lanes. `imm` is a compile-time constant at every + * call site, so the broadcast fast paths fold away and the general case + * builds a constant byte index that compiles to a single TBL. */ + uint32x4_t v = vreinterpretq_u32_s64(a); + if (imm == 0x00) + return vreinterpretq_s64_u32(vdupq_laneq_u32(v, 0)); + if (imm == 0x55) + return vreinterpretq_s64_u32(vdupq_laneq_u32(v, 1)); + if (imm == 0xAA) + return vreinterpretq_s64_u32(vdupq_laneq_u32(v, 2)); + if (imm == 0xFF) + return vreinterpretq_s64_u32(vdupq_laneq_u32(v, 3)); + { + const uint8_t b0 = (uint8_t)((imm & 3) * 4); + const uint8_t b1 = (uint8_t)(((imm >> 2) & 3) * 4); + const uint8_t b2 = (uint8_t)(((imm >> 4) & 3) * 4); + const uint8_t b3 = (uint8_t)(((imm >> 6) & 3) * 4); + const uint8x16_t idx = { + b0, (uint8_t)(b0 + 1), (uint8_t)(b0 + 2), (uint8_t)(b0 + 3), + b1, (uint8_t)(b1 + 1), (uint8_t)(b1 + 2), (uint8_t)(b1 + 3), + b2, (uint8_t)(b2 + 1), (uint8_t)(b2 + 2), (uint8_t)(b2 + 3), + b3, (uint8_t)(b3 + 1), (uint8_t)(b3 + 2), (uint8_t)(b3 + 3)}; + return vreinterpretq_s64_u8(vqtbl1q_u8(vreinterpretq_u8_s64(a), idx)); + } } static inline __m128i _mm_blend_epi16(__m128i a, __m128i b, const int imm) { - uint16_t m[8]; - int i; - for (i = 0; i < 8; i++) - m[i] = ((imm >> i) & 1) ? (uint16_t)0xFFFF : (uint16_t)0; - return vreinterpretq_s64_u16(vbslq_u16(vld1q_u16(m), - vreinterpretq_u16_s64(b), + /* Per-16-bit-lane select from `a` (0) or `b` (1). `imm` is constant at every + * call site, so this compound-literal mask folds to a constant vector load + * feeding a single BSL, with no stack round-trip. */ + const uint16x8_t mask = { + (imm & 0x01) ? (uint16_t)0xFFFF : (uint16_t)0, + (imm & 0x02) ? (uint16_t)0xFFFF : (uint16_t)0, + (imm & 0x04) ? (uint16_t)0xFFFF : (uint16_t)0, + (imm & 0x08) ? (uint16_t)0xFFFF : (uint16_t)0, + (imm & 0x10) ? (uint16_t)0xFFFF : (uint16_t)0, + (imm & 0x20) ? (uint16_t)0xFFFF : (uint16_t)0, + (imm & 0x40) ? (uint16_t)0xFFFF : (uint16_t)0, + (imm & 0x80) ? (uint16_t)0xFFFF : (uint16_t)0}; + return vreinterpretq_s64_u16(vbslq_u16(mask, vreinterpretq_u16_s64(b), vreinterpretq_u16_s64(a))); } From a1232cc0d5ea639b5d6eff4476c889f3d08b0fb5 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sun, 7 Jun 2026 17:39:06 -0400 Subject: [PATCH 3/3] minor tweak to cmake --- CMakeLists.txt | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bd756ac1..34b42808 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,9 +47,15 @@ else() endif() +# GNUInstallDirs defines CMAKE_INSTALL_INCLUDEDIR (and friends), which is used +# below in the INSTALL_INTERFACE include path. It must be included before that +# expansion, otherwise the variable is empty and the exported target ends up +# with a bogus "/fastpfor" include directory that breaks find_package(). +include(GNUInstallDirs) + # library target add_library(FastPFOR STATIC) -target_include_directories(FastPFOR PUBLIC +target_include_directories(FastPFOR PUBLIC $ $ ) @@ -193,8 +199,7 @@ if(FASTPFOR_WITH_TEST) enable_testing() add_test("FastPFOR_unittest" FastPFOR_unittest) endif() - -include(GNUInstallDirs) + install(TARGETS FastPFOR EXPORT FastPFORExport ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"