From 34cbef1544118152a5d569aa18b26d13c625f60e Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Sun, 7 Jun 2026 17:14:14 -0400
Subject: [PATCH 1/3] removing simde (standalone)

---
 CMakeLists.txt            |  17 +--
 README.md                 |   4 +-
 cmake_modules/simde.cmake |  13 --
 headers/VarIntG8IU.h      |   4 +-
 headers/common.h          |   4 +-
 headers/fastpfor_neon.h   | 261 ++++++++++++++++++++++++++++++++++++++
 src/streamvbyte.c         |   4 +-
 src/varintdecode.c        |   4 +-
 8 files changed, 274 insertions(+), 37 deletions(-)
 delete mode 100644 cmake_modules/simde.cmake
 create mode 100644 headers/fastpfor_neon.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5ddb29f9..bd756ac1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -40,10 +40,9 @@ if( SUPPORT_SSE42 )
     MESSAGE( STATUS "SSE 4.2 support detected" )
 else()
     if (SUPPORT_NEON)
-        include(simde)
-        MESSAGE(STATUS "USING SIMDE FOR SIMD OPERATIONS")
+        MESSAGE(STATUS "Using native ARM NEON intrinsics for SIMD operations")
     else ()
-        MESSAGE(STATUS "SIMDE and SSE 4.2 support not detected")
+        MESSAGE(STATUS "Neither SSE 4.2 nor ARM NEON support detected")
     endif ()
 endif()
 
@@ -133,14 +132,6 @@ target_link_libraries(partitionbylength PRIVATE FastPFOR)
 add_executable(csv2maropu src/csv2maropu.cpp)
 target_link_libraries(csv2maropu PRIVATE FastPFOR)
 
-if (SUPPORT_NEON)
-    target_link_libraries(FastPFOR PUBLIC simde)
-    target_link_libraries(gapstats PUBLIC simde)
-    target_link_libraries(partitionbylength PUBLIC simde)
-    target_link_libraries(csv2maropu PUBLIC simde)
-else()
-    message(STATUS "SIMDE not used")
-endif()
 
 add_executable(entropy src/entropy.cpp)
 target_link_libraries(entropy FastPFOR)
@@ -236,7 +227,5 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/fastpfor.pc"
 
 
 if (SUPPORT_NEON)
-    message(WARNING "Building with emulation with SIMDE for ARM NEON support.")
-    message(WARNING "We do not actually support ARM NEON natively.")
-    message(WARNING "If you actually want native ARM NEON support, please consider providing a patch.")
+    message(STATUS "Building with native ARM NEON support (no SIMD emulation layer).")
 endif()
diff --git a/README.md b/README.md
index 43194aa5..6d1f8a67 100644
--- a/README.md
+++ b/README.md
@@ -135,8 +135,8 @@ On an x64 platform, your processor should support SSSE3. This includes almost ev
 sold after 2006. (Note: the key schemes require merely SSE2.)  Some specific binaries will only run if your processor 
 supports SSE4.1. They have been purely used for specific tests however.
 
-We also support ARM platforms through SIMDe, by wrapping. The performance might be poor. If you would
-like to contribute native ARM support, please provide a pull request.
+We also support ARM platforms (aarch64 / ARM64) natively: the SIMD code is mapped directly to ARM NEON
+intrinsics, with no emulation layer or external dependency.
 
 ## Building with CMake
 
diff --git a/cmake_modules/simde.cmake b/cmake_modules/simde.cmake
deleted file mode 100644
index 2a49903d..00000000
--- a/cmake_modules/simde.cmake
+++ /dev/null
@@ -1,13 +0,0 @@
-include(FetchContent)
-FetchContent_Declare(
-        simde
-        GIT_REPOSITORY https://github.com/simd-everywhere/simde.git
-        GIT_TAG c6ddddc4a5bee9913b60de6757227aa078192663
-)
-FetchContent_MakeAvailable(simde)
-
-add_library(simde INTERFACE IMPORTED GLOBAL)
-target_include_directories(simde INTERFACE "${simde_SOURCE_DIR}")
-
-# Enables native aliases. Not ideal but makes it easier to convert old code.
-target_compile_definitions(simde INTERFACE SIMDE_ENABLE_NATIVE_ALIASES)
diff --git a/headers/VarIntG8IU.h b/headers/VarIntG8IU.h
index 337073d8..222ff5b8 100644
--- a/headers/VarIntG8IU.h
+++ b/headers/VarIntG8IU.h
@@ -15,8 +15,8 @@
 #if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
 #include <emmintrin.h>
 #elif defined(__aarch64__)
-/* GCC-compatible compiler, targeting ARM with NEON */
-#include <simde/x86/sse3.h>
+/* GCC-compatible compiler, targeting ARM with native NEON */
+#include "fastpfor_neon.h"
 #endif
 #include "codecs.h"
 #ifdef __GNUC__
diff --git a/headers/common.h b/headers/common.h
index b1c97b5d..9827997f 100644
--- a/headers/common.h
+++ b/headers/common.h
@@ -13,7 +13,7 @@
 #if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
 #include <immintrin.h>
 #elif defined(__GNUC__) && defined(__aarch64__)
-#include <simde/x86/sse4.1.h>
+#include "fastpfor_neon.h"
 #endif
 
 #include <stdio.h>
@@ -53,7 +53,7 @@
 #if (defined(_M_IX86) || defined(_M_AMD64))
 #include <intrin.h>
 #elif defined(_M_ARM64)
-#include <simde/x86/sse4.1.h>
+#include "fastpfor_neon.h"
 #endif
 
 #define __attribute__(n)
diff --git a/headers/fastpfor_neon.h b/headers/fastpfor_neon.h
new file mode 100644
index 00000000..c2b41392
--- /dev/null
+++ b/headers/fastpfor_neon.h
@@ -0,0 +1,261 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire
+ */
+
+/**
+ * Native ARM NEON implementations of the (small) subset of x86 SSE intrinsics
+ * used by FastPFOR. This replaces the SIMDe emulation layer on aarch64 targets:
+ * every operation below maps directly to native NEON instructions.
+ *
+ * The mappings follow the well-known SSE->NEON correspondences (the same ones
+ * used by projects such as sse2neon). Shift-by-amount operations are expressed
+ * with the NEON variable-shift instruction so they accept both compile-time
+ * constants (which the compiler folds to immediate shifts) and runtime counts.
+ *
+ * This header is valid in both C99 and C++ so it can be shared by the C codecs
+ * (streamvbyte.c, varintdecode.c) and the C++ headers.
+ */
+#ifndef FASTPFOR_NEON_H_
+#define FASTPFOR_NEON_H_
+
+#if !(defined(__aarch64__) || (defined(_MSC_VER) && defined(_M_ARM64)))
+#error "fastpfor_neon.h is only for ARM (aarch64 / ARM64) targets"
+#endif
+
+#include <arm_neon.h>
+#include <stddef.h>
+#include <stdint.h>
+
+typedef int64x2_t __m128i;
+typedef float32x4_t __m128;
+
+/* ----------------------------- load / store ----------------------------- */
+
+static inline __m128i _mm_loadu_si128(const __m128i *p) {
+  return vld1q_s64((const int64_t *)p);
+}
+static inline __m128i _mm_lddqu_si128(const __m128i *p) {
+  return vld1q_s64((const int64_t *)p);
+}
+static inline __m128i _mm_load_si128(const __m128i *p) {
+  return vld1q_s64((const int64_t *)p);
+}
+static inline __m128i _mm_loadl_epi64(const __m128i *p) {
+  return vcombine_s64(vld1_s64((const int64_t *)p), vdup_n_s64(0));
+}
+static inline void _mm_storeu_si128(__m128i *p, __m128i a) {
+  vst1q_s64((int64_t *)p, a);
+}
+static inline void _mm_store_si128(__m128i *p, __m128i a) {
+  vst1q_s64((int64_t *)p, a);
+}
+static inline void _mm_stream_si128(__m128i *p, __m128i a) {
+  vst1q_s64((int64_t *)p, a);
+}
+static inline void _mm_storel_epi64(__m128i *p, __m128i a) {
+  vst1_s64((int64_t *)p, vget_low_s64(a));
+}
+
+/* --------------------------------- set ---------------------------------- */
+
+static inline __m128i _mm_setzero_si128(void) { return vdupq_n_s64(0); }
+static inline __m128i _mm_set1_epi32(int a) {
+  return vreinterpretq_s64_s32(vdupq_n_s32(a));
+}
+static inline __m128i _mm_set1_epi16(short a) {
+  return vreinterpretq_s64_s16(vdupq_n_s16(a));
+}
+static inline __m128i _mm_set1_epi8(signed char a) {
+  return vreinterpretq_s64_s8(vdupq_n_s8(a));
+}
+static inline __m128i _mm_set_epi64x(int64_t e1, int64_t e0) {
+  int64_t d[2];
+  d[0] = e0;
+  d[1] = e1;
+  return vld1q_s64(d);
+}
+static inline __m128i
+_mm_set_epi8(signed char e15, signed char e14, signed char e13, signed char e12,
+             signed char e11, signed char e10, signed char e9, signed char e8,
+             signed char e7, signed char e6, signed char e5, signed char e4,
+             signed char e3, signed char e2, signed char e1, signed char e0) {
+  int8_t d[16];
+  d[0] = e0;   d[1] = e1;   d[2] = e2;   d[3] = e3;
+  d[4] = e4;   d[5] = e5;   d[6] = e6;   d[7] = e7;
+  d[8] = e8;   d[9] = e9;   d[10] = e10; d[11] = e11;
+  d[12] = e12; d[13] = e13; d[14] = e14; d[15] = e15;
+  return vreinterpretq_s64_s8(vld1q_s8(d));
+}
+static inline __m128i
+_mm_setr_epi8(signed char e0, signed char e1, signed char e2, signed char e3,
+              signed char e4, signed char e5, signed char e6, signed char e7,
+              signed char e8, signed char e9, signed char e10, signed char e11,
+              signed char e12, signed char e13, signed char e14,
+              signed char e15) {
+  int8_t d[16];
+  d[0] = e0;   d[1] = e1;   d[2] = e2;   d[3] = e3;
+  d[4] = e4;   d[5] = e5;   d[6] = e6;   d[7] = e7;
+  d[8] = e8;   d[9] = e9;   d[10] = e10; d[11] = e11;
+  d[12] = e12; d[13] = e13; d[14] = e14; d[15] = e15;
+  return vreinterpretq_s64_s8(vld1q_s8(d));
+}
+static inline __m128i _mm_setr_epi16(short e0, short e1, short e2, short e3,
+                                     short e4, short e5, short e6, short e7) {
+  int16_t d[8];
+  d[0] = e0; d[1] = e1; d[2] = e2; d[3] = e3;
+  d[4] = e4; d[5] = e5; d[6] = e6; d[7] = e7;
+  return vreinterpretq_s64_s16(vld1q_s16(d));
+}
+
+/* ----------------------------- bitwise / arith -------------------------- */
+
+static inline __m128i _mm_and_si128(__m128i a, __m128i b) {
+  return vandq_s64(a, b);
+}
+static inline __m128i _mm_or_si128(__m128i a, __m128i b) {
+  return vorrq_s64(a, b);
+}
+static inline __m128i _mm_add_epi32(__m128i a, __m128i b) {
+  return vreinterpretq_s64_s32(
+      vaddq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
+}
+static inline __m128i _mm_sub_epi32(__m128i a, __m128i b) {
+  return vreinterpretq_s64_s32(
+      vsubq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
+}
+static inline __m128i _mm_mullo_epi32(__m128i a, __m128i b) {
+  return vreinterpretq_s64_s32(
+      vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
+}
+static inline __m128i _mm_mullo_epi16(__m128i a, __m128i b) {
+  return vreinterpretq_s64_s16(
+      vmulq_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)));
+}
+
+/* --------------------------------- shifts ------------------------------- */
+/* Variable-shift form: accepts runtime counts; the compiler lowers a
+ * constant count to a native immediate shift. A right shift is a left shift
+ * by a negative amount (NEON semantics). NEON reads only the signed low byte
+ * of each count, so counts from the element width up to 127 yield 0 as SSE. */
+
+static inline __m128i _mm_slli_epi32(__m128i a, int imm) {
+  return vreinterpretq_s64_u32(
+      vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(imm)));
+}
+static inline __m128i _mm_srli_epi32(__m128i a, int imm) {
+  return vreinterpretq_s64_u32(
+      vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(-imm)));
+}
+static inline __m128i _mm_srli_epi16(__m128i a, int imm) {
+  return vreinterpretq_s64_u16(
+      vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16((int16_t)-imm)));
+}
+static inline __m128i _mm_slli_epi64(__m128i a, int imm) {
+  return vreinterpretq_s64_u64(
+      vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(imm)));
+}
+static inline __m128i _mm_srli_epi64(__m128i a, int imm) {
+  return vreinterpretq_s64_u64(
+      vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(-imm)));
+}
+
+/* Whole-register byte shifts. FastPFOR always passes a compile-time constant,
+ * so vextq_u8 (immediate in 0..15) is used: imm 0..15 (srli), 1..16 (slli). */
+#define _mm_srli_si128(a, imm)                                                 \
+  vreinterpretq_s64_u8(                                                        \
+      vextq_u8(vreinterpretq_u8_s64(a), vdupq_n_u8(0), (imm)))
+#define _mm_slli_si128(a, imm)                                                 \
+  vreinterpretq_s64_u8(                                                        \
+      vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s64(a), (16 - (imm))))
+
+/* ------------------------------- compares ------------------------------- */
+
+static inline __m128i _mm_cmplt_epi32(__m128i a, __m128i b) {
+  return vreinterpretq_s64_u32(
+      vcltq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
+}
+static inline __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) {
+  return vreinterpretq_s64_u8(
+      vceqq_u8(vreinterpretq_u8_s64(a), vreinterpretq_u8_s64(b)));
+}
+
+/* ------------------------------- shuffles ------------------------------- */
+
+static inline __m128i _mm_shuffle_epi8(__m128i a, __m128i b) {
+  /* pshufb: a byte of the index with its high bit set produces 0. Masking the
+   * index with 0x8F leaves the low nibble and the high bit; vqtbl1q_u8 then
+   * yields 0 for any index >= 16 (i.e. when the high bit was set). */
+  uint8x16_t tbl = vreinterpretq_u8_s64(a);
+  uint8x16_t idx = vandq_u8(vreinterpretq_u8_s64(b), vdupq_n_u8(0x8F));
+  return vreinterpretq_s64_u8(vqtbl1q_u8(tbl, idx));
+}
+static inline __m128i _mm_shuffle_epi32(__m128i a, const int imm) {
+  uint32_t t[4];
+  uint32_t r[4];
+  vst1q_u32(t, vreinterpretq_u32_s64(a));
+  r[0] = t[imm & 3];
+  r[1] = t[(imm >> 2) & 3];
+  r[2] = t[(imm >> 4) & 3];
+  r[3] = t[(imm >> 6) & 3];
+  return vreinterpretq_s64_u32(vld1q_u32(r));
+}
+static inline __m128i _mm_blend_epi16(__m128i a, __m128i b, const int imm) {
+  uint16_t m[8];
+  int i;
+  for (i = 0; i < 8; i++)
+    m[i] = ((imm >> i) & 1) ? (uint16_t)0xFFFF : (uint16_t)0;
+  return vreinterpretq_s64_u16(vbslq_u16(vld1q_u16(m),
+                                         vreinterpretq_u16_s64(b),
+                                         vreinterpretq_u16_s64(a)));
+}
+
+/* --------------------------- extract / convert -------------------------- */
+
+#define _mm_extract_epi32(a, imm)                                              \
+  vgetq_lane_s32(vreinterpretq_s32_s64(a), (imm))
+
+static inline int _mm_cvtsi128_si32(__m128i a) {
+  return vgetq_lane_s32(vreinterpretq_s32_s64(a), 0);
+}
+static inline __m128i _mm_cvtepu8_epi16(__m128i a) {
+  return vreinterpretq_s64_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s64(a))));
+}
+static inline __m128i _mm_cvtepu16_epi32(__m128i a) {
+  return vreinterpretq_s64_u32(
+      vmovl_u16(vget_low_u16(vreinterpretq_u16_s64(a))));
+}
+static inline __m128i _mm_cvtepi8_epi32(__m128i a) {
+  int16x8_t t16 = vmovl_s8(vget_low_s8(vreinterpretq_s8_s64(a)));
+  return vreinterpretq_s64_s32(vmovl_s16(vget_low_s16(t16)));
+}
+
+/* -------------------------------- masks --------------------------------- */
+
+static inline int _mm_movemask_epi8(__m128i a) {
+  uint8x16_t input = vreinterpretq_u8_s64(a);
+  uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
+  uint32x4_t paired16 =
+      vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
+  uint64x2_t paired32 =
+      vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
+  uint8x16_t paired64 =
+      vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
+  return vgetq_lane_u8(paired64, 0) | ((int)vgetq_lane_u8(paired64, 8) << 8);
+}
+
+/* --------------------------------- float -------------------------------- */
+
+static inline __m128 _mm_castsi128_ps(__m128i a) {
+  return vreinterpretq_f32_s64(a);
+}
+static inline int _mm_movemask_ps(__m128 a) {
+  static const int32_t shifts[4] = {0, 1, 2, 3};
+  uint32x4_t signs = vshrq_n_u32(vreinterpretq_u32_f32(a), 31);
+  uint32x4_t weighted = vshlq_u32(signs, vld1q_s32(shifts));
+  return (int)vaddvq_u32(weighted);
+}
+
+#endif /* FASTPFOR_NEON_H_ */
diff --git a/src/streamvbyte.c b/src/streamvbyte.c
index 1f9dc6b0..94376e3f 100644
--- a/src/streamvbyte.c
+++ b/src/streamvbyte.c
@@ -10,7 +10,7 @@
     #if (defined(_M_IX86) || defined(_M_AMD64))
     #include <intrin.h>
     #elif defined(_M_ARM64)
-    #include <simde/x86/sse4.1.h>
+    #include "fastpfor_neon.h"
     #endif
 
     #include <iso646.h>
@@ -21,7 +21,7 @@
     #include <x86intrin.h>
 #elif defined(__aarch64__)
     /* GCC-compatible compiler, targeting ARM with NEON */
-    #include <simde/x86/sse4.1.h>
+    #include "fastpfor_neon.h"
 #elif defined(__GNUC__) && defined(__IWMMXT__)
     /* GCC-compatible compiler, targeting ARM with WMMX */
     #include <mmintrin.h>
diff --git a/src/varintdecode.c b/src/varintdecode.c
index 7807e12d..15f0cff9 100644
--- a/src/varintdecode.c
+++ b/src/varintdecode.c
@@ -8,7 +8,7 @@
     #if (defined(_M_IX86) || defined(_M_AMD64))
     #include <intrin.h>
     #elif defined(_M_ARM64)
-    #include <simde/x86/sse4.1.h>
+    #include "fastpfor_neon.h"
     #endif
 #elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
     /* GCC-compatible compiler, targeting x86/x86-64 */
@@ -16,7 +16,7 @@
 
 #elif defined(__aarch64__)
     /* GCC-compatible compiler, targeting ARM with NEON */
-    #include <simde/x86/sse4.1.h>
+    #include "fastpfor_neon.h"
 #elif defined(__GNUC__) && defined(__IWMMXT__)
     /* GCC-compatible compiler, targeting ARM with WMMX */
     #include <mmintrin.h>

From e06e8a15605a026a1a2d6223b652dd0d4f749ecd Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Sun, 7 Jun 2026 17:19:23 -0400
Subject: [PATCH 2/3] optimization

---
 headers/fastpfor_neon.h | 51 ++++++++++++++++++++++++++++++-----------
 1 file changed, 37 insertions(+), 14 deletions(-)

diff --git a/headers/fastpfor_neon.h b/headers/fastpfor_neon.h
index c2b41392..a2117669 100644
--- a/headers/fastpfor_neon.h
+++ b/headers/fastpfor_neon.h
@@ -193,22 +193,45 @@ static inline __m128i _mm_shuffle_epi8(__m128i a, __m128i b) {
   return vreinterpretq_s64_u8(vqtbl1q_u8(tbl, idx));
 }
 static inline __m128i _mm_shuffle_epi32(__m128i a, const int imm) {
-  uint32_t t[4];
-  uint32_t r[4];
-  vst1q_u32(t, vreinterpretq_u32_s64(a));
-  r[0] = t[imm & 3];
-  r[1] = t[(imm >> 2) & 3];
-  r[2] = t[(imm >> 4) & 3];
-  r[3] = t[(imm >> 6) & 3];
-  return vreinterpretq_s64_u32(vld1q_u32(r));
+  /* Permute the four 32-bit lanes: result lane i is source lane
+   * (imm >> 2*i) & 3. The byte index is a plain array read with vld1q_u8, not a
+   * brace-initialised vector (a GNU extension MSVC's NEON types lack). */
+  uint32x4_t v = vreinterpretq_u32_s64(a);
+  if (imm == 0x00)
+    return vreinterpretq_s64_u32(vdupq_laneq_u32(v, 0));
+  if (imm == 0x55)
+    return vreinterpretq_s64_u32(vdupq_laneq_u32(v, 1));
+  if (imm == 0xAA)
+    return vreinterpretq_s64_u32(vdupq_laneq_u32(v, 2));
+  if (imm == 0xFF)
+    return vreinterpretq_s64_u32(vdupq_laneq_u32(v, 3));
+  {
+    const uint8_t b0 = (uint8_t)((imm & 3) * 4);
+    const uint8_t b1 = (uint8_t)(((imm >> 2) & 3) * 4);
+    const uint8_t b2 = (uint8_t)(((imm >> 4) & 3) * 4);
+    const uint8_t b3 = (uint8_t)(((imm >> 6) & 3) * 4);
+    const uint8_t idx[16] = {
+        b0, (uint8_t)(b0 + 1), (uint8_t)(b0 + 2), (uint8_t)(b0 + 3),
+        b1, (uint8_t)(b1 + 1), (uint8_t)(b1 + 2), (uint8_t)(b1 + 3),
+        b2, (uint8_t)(b2 + 1), (uint8_t)(b2 + 2), (uint8_t)(b2 + 3),
+        b3, (uint8_t)(b3 + 1), (uint8_t)(b3 + 2), (uint8_t)(b3 + 3)};
+    return vreinterpretq_s64_u8(vqtbl1q_u8(vreinterpretq_u8_s64(a), vld1q_u8(idx)));
+  }
 }
 static inline __m128i _mm_blend_epi16(__m128i a, __m128i b, const int imm) {
-  uint16_t m[8];
-  int i;
-  for (i = 0; i < 8; i++)
-    m[i] = ((imm >> i) & 1) ? (uint16_t)0xFFFF : (uint16_t)0;
-  return vreinterpretq_s64_u16(vbslq_u16(vld1q_u16(m),
-                                         vreinterpretq_u16_s64(b),
+  /* Per-16-bit-lane select: bit i of `imm` clear takes lane i of `a`, set takes
+   * `b`. The mask is a plain array read with vld1q_u16, not a brace-initialised
+   * vector (a GNU extension that MSVC's NEON vector types do not support). */
+  const uint16_t m[8] = {
+      (imm & 0x01) ? (uint16_t)0xFFFF : (uint16_t)0,
+      (imm & 0x02) ? (uint16_t)0xFFFF : (uint16_t)0,
+      (imm & 0x04) ? (uint16_t)0xFFFF : (uint16_t)0,
+      (imm & 0x08) ? (uint16_t)0xFFFF : (uint16_t)0,
+      (imm & 0x10) ? (uint16_t)0xFFFF : (uint16_t)0,
+      (imm & 0x20) ? (uint16_t)0xFFFF : (uint16_t)0,
+      (imm & 0x40) ? (uint16_t)0xFFFF : (uint16_t)0,
+      (imm & 0x80) ? (uint16_t)0xFFFF : (uint16_t)0};
+  return vreinterpretq_s64_u16(vbslq_u16(vld1q_u16(m), vreinterpretq_u16_s64(b),
                                          vreinterpretq_u16_s64(a)));
 }
 

From a1232cc0d5ea639b5d6eff4476c889f3d08b0fb5 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Sun, 7 Jun 2026 17:39:06 -0400
Subject: [PATCH 3/3] minor tweak to cmake

---
 CMakeLists.txt | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bd756ac1..34b42808 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,9 +47,15 @@ else()
 endif()
 
 
+# GNUInstallDirs defines CMAKE_INSTALL_INCLUDEDIR (and friends), which is used
+# below in the INSTALL_INTERFACE include path. It must be included before that
+# expansion, otherwise the variable is empty and the exported target ends up
+# with a bogus "/fastpfor" include directory that breaks find_package().
+include(GNUInstallDirs)
+
 # library target
 add_library(FastPFOR STATIC)
-target_include_directories(FastPFOR PUBLIC 
+target_include_directories(FastPFOR PUBLIC
     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/headers>
     $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/fastpfor>
 )
@@ -193,8 +199,7 @@ if(FASTPFOR_WITH_TEST)
   enable_testing()
   add_test("FastPFOR_unittest" FastPFOR_unittest)
 endif()
- 
-include(GNUInstallDirs)
+
 install(TARGETS FastPFOR
     EXPORT FastPFORExport
     ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"