fix(build): only enable SIMDE_BACKEND for non-x86 architectures (#254)

darvid · web-flow · commit 5bc8cbe2a993 · 2026-02-11T11:45:35.000-05:00
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -188,6 +188,7 @@ jobs:
         uses: astral-sh/setup-uv@v5
         with:
           enable-cache: true
+          version: "0.9.x"
 
       - name: Remove project venv
         run: python -c "import pathlib, shutil; p = pathlib.Path('.venv'); shutil.rmtree(p) if p.exists() else None"
@@ -337,28 +338,28 @@ jobs:
             python_id: cp314t
             platform_id: musllinux_aarch64
 
-          # 🍎 macOS x86_64
-          - os: macos-13
+          # 🍎 macOS x86_64 (cross-compiled on ARM runner via Rosetta 2)
+          - os: macos-15
             host_python: "3.12"
             python_id: cp310
             platform_id: macosx_x86_64
-          - os: macos-13
+          - os: macos-15
             host_python: "3.12"
             python_id: cp311
             platform_id: macosx_x86_64
-          - os: macos-13
+          - os: macos-15
             host_python: "3.12"
             python_id: cp312
             platform_id: macosx_x86_64
-          - os: macos-13
+          - os: macos-15
             host_python: "3.12"
             python_id: cp313
             platform_id: macosx_x86_64
-          - os: macos-13
+          - os: macos-15
             host_python: "3.12"
             python_id: cp314
             platform_id: macosx_x86_64
-          - os: macos-13
+          - os: macos-15
             host_python: "3.12"
             python_id: cp314t
             platform_id: macosx_x86_64
@@ -425,6 +426,7 @@ jobs:
         uses: astral-sh/setup-uv@v5
         with:
           enable-cache: true
+          version: "0.9.x"
 
       - name: Remove project venv
         run: python -c "import pathlib, shutil; p = pathlib.Path('.venv'); shutil.rmtree(p) if p.exists() else None"
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -261,16 +261,30 @@ if(HS_BUILD_REQUIRED)
   else()
     set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}")
     
-    # Architecture-specific compiler flags
-    if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|aarch64|arm64)")
-      # ARM architecture - use conservative flags with SIMDE_BACKEND
+    # Architecture-specific compiler flags and SIMDE_BACKEND selection.
+    # SIMDE_BACKEND is only enabled for non-x86 architectures (ARM, etc.)
+    # where vectorscan has no native SIMD support. On x86-64, the native
+    # backend provides runtime CPU feature detection (SSE4.2/AVX2/AVX512)
+    # which is critical for performance. Enabling SIMDE_BACKEND on x86-64
+    # disables all higher ISA code paths and caps performance at SSE2
+    # level (~10-15x slower). See: https://github.com/darvid/python-hyperscan/issues/253
+    #
+    # For macOS cross-compilation (e.g. building x86_64 on an ARM runner),
+    # CMAKE_OSX_ARCHITECTURES names the TARGET arch and takes priority over
+    # CMAKE_SYSTEM_PROCESSOR (which, unless set explicitly, mirrors the HOST).
+    set(HS_USE_SIMDE_BACKEND OFF)
+    set(_HS_TARGET_ARCH "${CMAKE_SYSTEM_PROCESSOR}")
+    if(APPLE AND CMAKE_OSX_ARCHITECTURES)
+      set(_HS_TARGET_ARCH "${CMAKE_OSX_ARCHITECTURES}")
+    endif()
+    if(_HS_TARGET_ARCH MATCHES "(arm|aarch64|arm64)")
       set(HS_CMAKE_COMMON_FLAGS "-fPIC")
-    elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86|X86|amd64|AMD64|x86_64|i[3-6]86)")
-      # x86/x86_64 architecture - use compatible x86-64 baseline
+      set(HS_USE_SIMDE_BACKEND ON)
+    elseif(_HS_TARGET_ARCH MATCHES "(x86|X86|amd64|AMD64|x86_64|i[3-6]86)")
       set(HS_CMAKE_COMMON_FLAGS "-march=x86-64 -fPIC")
     else()
-      # Other architectures - rely on SIMDE_BACKEND for portability
       set(HS_CMAKE_COMMON_FLAGS "-fPIC")
+      set(HS_USE_SIMDE_BACKEND ON)
     endif()
     
     
@@ -444,10 +458,15 @@ if(HS_BUILD_REQUIRED)
   set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${HS_CMAKE_C_FLAGS}")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HS_CMAKE_CXX_FLAGS}")
 
+  # Forward CMAKE_OSX_ARCHITECTURES to ExternalProject for cross-compilation
+  if(APPLE AND CMAKE_OSX_ARCHITECTURES)
+    list(APPEND HS_CMAKE_ARGS -DCMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES})
+  endif()
+
   if(USE_VECTORSCAN)
     list(
       APPEND HS_CMAKE_ARGS
-      -DSIMDE_BACKEND=ON
+      -DSIMDE_BACKEND=${HS_USE_SIMDE_BACKEND}
       -DRAGEL=${RAGEL_EXECUTABLE}
       -DPCRE_BUILD_SOURCE=ON
       -DBUILD_STATIC_LIBS=ON
@@ -472,6 +491,21 @@ if(HS_BUILD_REQUIRED)
     set(HS_TARGETS --target hs --target hs_runtime --target chimera --target pcre)
   endif()
 
+  # Vectorscan 5.4.12 passes -march=x86-64-v2 in cflags-x86.cmake and
+  # archdetect.cmake, but GCC <11 (e.g. manylinux2014 devtoolset) does
+  # not recognize that value. Patch it to "nehalem", which provides the
+  # same SSE4.2 baseline and is also accepted by those older GCC releases.
+  # Uses perl instead of sed to avoid BSD/GNU sed -i syntax differences.
+  if(USE_VECTORSCAN AND NOT HS_USE_SIMDE_BACKEND)
+    set(HS_PATCH_COMMAND
+      perl -pi -e "s/x86-64-v2/nehalem/g"
+        ${hyperscan_SOURCE_DIR}/cmake/cflags-x86.cmake
+        ${hyperscan_SOURCE_DIR}/cmake/archdetect.cmake
+    )
+  else()
+    set(HS_PATCH_COMMAND "")
+  endif()
+
   ExternalProject_Add(
     libhs
     GIT_REPOSITORY ${HYPERSCAN_REPO}
@@ -483,6 +517,7 @@ if(HS_BUILD_REQUIRED)
     SOURCE_DIR ${hyperscan_SOURCE_DIR}
     BINARY_DIR ${hyperscan_BINARY_DIR}
     STAMP_DIR ${hyperscan_STAMP_DIR}
+    PATCH_COMMAND ${HS_PATCH_COMMAND}
     INSTALL_COMMAND ""
     CMAKE_GENERATOR ${HS_GENERATOR}
     CMAKE_ARGS ${HS_CMAKE_ARGS} -Wno-dev
diff --git a/tools/bench_regression.py b/tools/bench_regression.py
@@ -0,0 +1,195 @@
+#!/usr/bin/env python
+"""Benchmark to reproduce performance regression reported in #253.
+
+Simulates the reporter's workload: 50 patterns scanning 500KB documents
+in block mode. Reports throughput (MB/s) and average time per scan.
+
+Usage:
+    python tools/bench_regression.py
+    python tools/bench_regression.py --patterns 100 --doc-size 1048576
+"""
+
+import argparse
+import os
+import random
+import statistics
+import string
+import time
+
+import hyperscan
+
+
+def generate_patterns(count):
+    """Return ``count`` byte-string regexes: fixed templates, then keywords.
+
+    Indices below ten take a hand-written expression (email, IPv4, URL, ...);
+    each later index ``i`` becomes a word-bounded keyword with ``i`` appended,
+    cycling through the keyword table. A non-positive ``count`` yields ``[]``.
+    """
+    fixed = (
+        rb"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b",
+        rb"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b",
+        rb"\b(https?|ftp)://[^\s/$.?#].[^\s]*\b",
+        rb"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b",
+        rb"\b[A-Z][a-z]+\s[A-Z][a-z]+\b",
+        rb"[0-9a-fA-F]{32}",
+        rb"\b(error|warning|critical|fatal)\b",
+        rb"\b\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\b",
+        rb"<[^>]+>",
+        rb"\$\d+[\.,]?\d*",
+    )
+    vocab = (
+        b"password", b"secret", b"token", b"api.key", b"authorization",
+        b"credential", b"private", b"admin", b"root", b"config",
+        b"database", b"server", b"client", b"session", b"cookie",
+        b"header", b"payload", b"request", b"response",
+        b"encrypt", b"decrypt", b"hash", b"salt", b"certificate",
+        b"key.file", b"login", b"logout", b"access",
+        b"permission", b"role", b"user", b"account",
+        b"profile", b"setting", b"option", b"enable",
+        b"disable", b"start", b"stop", b"create",
+        b"delete", b"update", b"select",
+    )
+    head = list(fixed[:max(count, 0)])
+    tail = [
+        rb"\b" + vocab[n % len(vocab)] + str(n).encode() + rb"\b"
+        for n in range(len(fixed), count)
+    ]
+    return head + tail
+
+
+def generate_document(size):
+    """Return ``size`` bytes of reproducible, log-like ASCII filler text.
+
+    Lines of 5-20 space-separated tokens are drawn with a fixed seed (42).
+    """
+    vocab = (
+        "the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog",
+        "server", "error", "warning", "request", "response", "data",
+        "user", "admin", "config", "session", "token", "password",
+        "https://example.com/path", "192.168.1.100", "test@email.com",
+        "2025-01-15T10:30:00", "$1,234.56", "abcdef1234567890abcdef",
+        "authorization", "credential", "certificate", "encrypted",
+    )
+    rnd = random.Random(42)
+    out, produced = [], 0
+    while produced < size:
+        out.append(" ".join(rnd.choices(vocab, k=rnd.randint(5, 20))) + "\n")
+        produced += len(out[-1])
+    return "".join(out)[:size].encode("utf-8")
+
+
+def run_benchmark(db, document, num_scans, warmup=3):
+    """Time ``num_scans`` scans of ``document`` after ``warmup`` untimed runs.
+
+    Returns ``(times, match_count)``: per-scan durations in seconds and the
+    number of match callbacks fired during the timed scans only.
+    """
+    hits = [0]
+
+    def on_match(id, start, end, flags, ctx):
+        hits[0] += 1
+
+    def scan_once():
+        began = time.perf_counter()
+        db.scan(document, match_event_handler=on_match)
+        return time.perf_counter() - began
+
+    for _ in range(warmup):
+        scan_once()
+    hits[0] = 0  # discard matches reported while warming up
+    durations = [scan_once() for _ in range(num_scans)]
+    return durations, hits[0]
+
+
+def main():
+    """Parse the CLI options, run the benchmark and print a report.
+
+    Compiles ``--patterns`` expressions into a block-mode database, scans a
+    synthetic document ``--scans`` times and prints timing statistics. A
+    mean scan time above 10 ms is reported as a likely regression; that
+    limit is fixed and does not scale with ``--patterns``/``--doc-size``.
+    """
+    parser = argparse.ArgumentParser(
+        description="Benchmark for hyperscan regression #253"
+    )
+    for flag, default, text in (
+        ("--patterns", 50, "Number of regex patterns (default: 50)"),
+        ("--doc-size", 500_000, "Document size in bytes (default: 500000)"),
+        ("--scans", 100, "Number of scans to perform (default: 100)"),
+        ("--warmup", 5, "Number of warmup scans (default: 5)"),
+    ):
+        parser.add_argument(flag, type=int, default=default, help=text)
+    args = parser.parse_args()
+
+    # Fail fast with a usage error instead of a traceback further down:
+    # e.g. statistics.mean() raises StatisticsError when --scans is 0.
+    for name in ("patterns", "doc_size", "scans"):
+        if getattr(args, name) < 1:
+            parser.error(f"--{name.replace('_', '-')} must be at least 1")
+
+    print("=" * 60)
+    print("hyperscan regression benchmark (#253)")
+    print("=" * 60)
+
+    db = hyperscan.Database(mode=hyperscan.HS_MODE_BLOCK)
+    patterns = generate_patterns(args.patterns)
+    db.compile(
+        expressions=patterns,
+        ids=list(range(len(patterns))),
+        flags=[hyperscan.HS_FLAG_CASELESS | hyperscan.HS_FLAG_SINGLEMATCH]
+        * len(patterns),
+    )
+
+    print(f"engine info:     {db.info().decode()}")
+    print(f"database size:   {db.size():,} bytes")
+    print(f"pattern count:   {args.patterns}")
+    print(f"document size:   {args.doc_size:,} bytes")
+    print(f"scan iterations: {args.scans}")
+    print(f"warmup scans:    {args.warmup}")
+    print()
+
+    document = generate_document(args.doc_size)
+
+    print("running benchmark...")
+    times, match_count = run_benchmark(db, document, args.scans, args.warmup)
+
+    avg_time = statistics.mean(times)
+    median_time = statistics.median(times)
+    stdev_time = statistics.stdev(times) if len(times) > 1 else 0
+    min_time = min(times)
+    max_time = max(times)
+    mib = args.doc_size / (1024 * 1024)  # "MB/s" below is really MiB/s
+    throughput_avg = mib / avg_time if avg_time > 0 else float("inf")
+    throughput_median = mib / median_time if median_time > 0 else float("inf")
+
+    print()
+    print("-" * 60)
+    print("results")
+    print("-" * 60)
+    print(f"total matches:      {match_count:,}")
+    print(f"avg time/scan:      {avg_time * 1000:.3f} ms")
+    print(f"median time/scan:   {median_time * 1000:.3f} ms")
+    print(f"min time/scan:      {min_time * 1000:.3f} ms")
+    print(f"max time/scan:      {max_time * 1000:.3f} ms")
+    print(f"stdev:              {stdev_time * 1000:.3f} ms")
+    print(f"throughput (avg):   {throughput_avg:.1f} MB/s")
+    print(f"throughput (median):{throughput_median:.1f} MB/s")
+    print()
+
+    if avg_time * 1000 > 10:
+        print("!! REGRESSION DETECTED !!")
+        print(
+            f"avg scan time {avg_time*1000:.1f}ms is way above the "
+            f"expected ~3ms baseline from v0.7.21"
+        )
+        print(
+            "likely cause: SIMDE_BACKEND=ON forcing SSE2-only code "
+            "paths on x86-64"
+        )
+    else:
+        print("performance looks healthy")
+
+
+if __name__ == "__main__":
+    main()