Skip to content

Commit 5bc8cbe

Browse files
authored
fix(build): only enable SIMDE_BACKEND for non-x86 architectures (#254)
1 parent 3ab92ae commit 5bc8cbe

3 files changed

Lines changed: 246 additions & 14 deletions

File tree

.github/workflows/build.yml

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,7 @@ jobs:
188188
uses: astral-sh/setup-uv@v5
189189
with:
190190
enable-cache: true
191+
version: "0.9.x"
191192

192193
- name: Remove project venv
193194
run: python -c "import pathlib, shutil; p = pathlib.Path('.venv'); shutil.rmtree(p) if p.exists() else None"
@@ -337,28 +338,28 @@ jobs:
337338
python_id: cp314t
338339
platform_id: musllinux_aarch64
339340

340-
# 🍎 macOS x86_64
341-
- os: macos-13
341+
# 🍎 macOS x86_64 (cross-compiled on ARM runner via Rosetta 2)
342+
- os: macos-15
342343
host_python: "3.12"
343344
python_id: cp310
344345
platform_id: macosx_x86_64
345-
- os: macos-13
346+
- os: macos-15
346347
host_python: "3.12"
347348
python_id: cp311
348349
platform_id: macosx_x86_64
349-
- os: macos-13
350+
- os: macos-15
350351
host_python: "3.12"
351352
python_id: cp312
352353
platform_id: macosx_x86_64
353-
- os: macos-13
354+
- os: macos-15
354355
host_python: "3.12"
355356
python_id: cp313
356357
platform_id: macosx_x86_64
357-
- os: macos-13
358+
- os: macos-15
358359
host_python: "3.12"
359360
python_id: cp314
360361
platform_id: macosx_x86_64
361-
- os: macos-13
362+
- os: macos-15
362363
host_python: "3.12"
363364
python_id: cp314t
364365
platform_id: macosx_x86_64
@@ -425,6 +426,7 @@ jobs:
425426
uses: astral-sh/setup-uv@v5
426427
with:
427428
enable-cache: true
429+
version: "0.9.x"
428430

429431
- name: Remove project venv
430432
run: python -c "import pathlib, shutil; p = pathlib.Path('.venv'); shutil.rmtree(p) if p.exists() else None"

CMakeLists.txt

Lines changed: 42 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -261,16 +261,30 @@ if(HS_BUILD_REQUIRED)
261261
else()
262262
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}")
263263

264-
# Architecture-specific compiler flags
265-
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|aarch64|arm64)")
266-
# ARM architecture - use conservative flags with SIMDE_BACKEND
264+
# Architecture-specific compiler flags and SIMDE_BACKEND selection.
265+
# SIMDE_BACKEND is only enabled for non-x86 architectures (ARM, etc.)
266+
# where vectorscan has no native SIMD support. On x86-64, the native
267+
# backend provides runtime CPU feature detection (SSE4.2/AVX2/AVX512)
268+
# which is critical for performance. Enabling SIMDE_BACKEND on x86-64
269+
# disables all higher ISA code paths and caps performance at SSE2
270+
# level (~10-15x slower). See: https://github.com/darvid/python-hyperscan/issues/253
271+
#
272+
# For macOS cross-compilation (e.g. building x86_64 on an ARM runner),
273+
# CMAKE_OSX_ARCHITECTURES names the TARGET arch and takes priority over
274+
# CMAKE_SYSTEM_PROCESSOR (which, without a toolchain file, still reports the HOST CPU).
275+
set(HS_USE_SIMDE_BACKEND OFF)
276+
set(_HS_TARGET_ARCH "${CMAKE_SYSTEM_PROCESSOR}")
277+
if(APPLE AND CMAKE_OSX_ARCHITECTURES)
278+
set(_HS_TARGET_ARCH "${CMAKE_OSX_ARCHITECTURES}")
279+
endif()
280+
if(_HS_TARGET_ARCH MATCHES "(arm|aarch64|arm64)")
267281
set(HS_CMAKE_COMMON_FLAGS "-fPIC")
268-
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86|X86|amd64|AMD64|x86_64|i[3-6]86)")
269-
# x86/x86_64 architecture - use compatible x86-64 baseline
282+
set(HS_USE_SIMDE_BACKEND ON)
283+
elseif(_HS_TARGET_ARCH MATCHES "(x86|X86|amd64|AMD64|x86_64|i[3-6]86)")
270284
set(HS_CMAKE_COMMON_FLAGS "-march=x86-64 -fPIC")
271285
else()
272-
# Other architectures - rely on SIMDE_BACKEND for portability
273286
set(HS_CMAKE_COMMON_FLAGS "-fPIC")
287+
set(HS_USE_SIMDE_BACKEND ON)
274288
endif()
275289

276290

@@ -444,10 +458,15 @@ if(HS_BUILD_REQUIRED)
444458
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${HS_CMAKE_C_FLAGS}")
445459
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HS_CMAKE_CXX_FLAGS}")
446460

461+
# Forward CMAKE_OSX_ARCHITECTURES to ExternalProject for cross-compilation
462+
if(APPLE AND CMAKE_OSX_ARCHITECTURES)
463+
list(APPEND HS_CMAKE_ARGS -DCMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES})
464+
endif()
465+
447466
if(USE_VECTORSCAN)
448467
list(
449468
APPEND HS_CMAKE_ARGS
450-
-DSIMDE_BACKEND=ON
469+
-DSIMDE_BACKEND=${HS_USE_SIMDE_BACKEND}
451470
-DRAGEL=${RAGEL_EXECUTABLE}
452471
-DPCRE_BUILD_SOURCE=ON
453472
-DBUILD_STATIC_LIBS=ON
@@ -472,6 +491,21 @@ if(HS_BUILD_REQUIRED)
472491
set(HS_TARGETS --target hs --target hs_runtime --target chimera --target pcre)
473492
endif()
474493

494+
# Vectorscan 5.4.12 uses -march=x86-64-v2 in cflags-x86.cmake and
495+
# archdetect.cmake, but GCC <11 (e.g. manylinux2014 devtoolset) does
496+
# not recognize this value. Patch it to use "nehalem" which provides
497+
# the same SSE4.2 baseline and is accepted by GCC 4.9 and newer.
498+
# Uses perl instead of sed to avoid BSD/GNU sed -i syntax differences.
499+
if(USE_VECTORSCAN AND NOT HS_USE_SIMDE_BACKEND)
500+
set(HS_PATCH_COMMAND
501+
perl -pi -e "s/x86-64-v2/nehalem/g"
502+
${hyperscan_SOURCE_DIR}/cmake/cflags-x86.cmake
503+
${hyperscan_SOURCE_DIR}/cmake/archdetect.cmake
504+
)
505+
else()
506+
set(HS_PATCH_COMMAND "")
507+
endif()
508+
475509
ExternalProject_Add(
476510
libhs
477511
GIT_REPOSITORY ${HYPERSCAN_REPO}
@@ -483,6 +517,7 @@ if(HS_BUILD_REQUIRED)
483517
SOURCE_DIR ${hyperscan_SOURCE_DIR}
484518
BINARY_DIR ${hyperscan_BINARY_DIR}
485519
STAMP_DIR ${hyperscan_STAMP_DIR}
520+
PATCH_COMMAND ${HS_PATCH_COMMAND}
486521
INSTALL_COMMAND ""
487522
CMAKE_GENERATOR ${HS_GENERATOR}
488523
CMAKE_ARGS ${HS_CMAKE_ARGS} -Wno-dev

tools/bench_regression.py

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
#!/usr/bin/env python
2+
"""Benchmark to reproduce performance regression reported in #253.
3+
4+
Simulates the reporter's workload: 50 patterns scanning 500KB documents
5+
in block mode. Reports throughput (MB/s) and average time per scan.
6+
7+
Usage:
8+
python tools/bench_regression.py
9+
python tools/bench_regression.py --patterns 100 --doc-size 1048576
10+
"""
11+
12+
import argparse
13+
import os
14+
import random
15+
import statistics
16+
import string
17+
import time
18+
19+
import hyperscan
20+
21+
22+
def generate_patterns(count):
    """Generate regex patterns for benchmarking.

    The first ten patterns are fixed templates (e-mail, IPv4 address, URL,
    phone number, ...). Every pattern beyond those is a keyword taken
    cyclically from a fixed list with the pattern index appended and
    wrapped in ``\\b`` word boundaries, so all returned patterns are
    distinct.

    Args:
        count: Number of patterns to produce. A value of zero or less
            yields an empty list.

    Returns:
        A list of ``count`` byte-string expressions.
    """
    templates = [
        # The TLD class is [A-Za-z]; a "|" inside a character class is a
        # literal pipe, not an alternation, so it must not appear here.
        rb"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
        rb"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b",
        rb"\b(https?|ftp)://[^\s/$.?#].[^\s]*\b",
        rb"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b",
        rb"\b[A-Z][a-z]+\s[A-Z][a-z]+\b",
        rb"[0-9a-fA-F]{32}",
        rb"\b(error|warning|critical|fatal)\b",
        rb"\b\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\b",
        rb"<[^>]+>",
        rb"\$\d+[\.,]?\d*",
    ]
    keyword_bases = [
        b"password", b"secret", b"token", b"api.key",
        b"authorization", b"credential", b"private",
        b"admin", b"root", b"config", b"database",
        b"server", b"client", b"session", b"cookie",
        b"header", b"payload", b"request", b"response",
        b"encrypt", b"decrypt", b"hash", b"salt",
        b"certificate", b"key.file", b"login", b"logout",
        b"access", b"permission", b"role", b"user",
        b"account", b"profile", b"setting", b"option",
        b"enable", b"disable", b"start", b"stop",
        b"create", b"delete", b"update", b"select",
    ]

    patterns = []
    for i in range(count):
        if i < len(templates):
            patterns.append(templates[i])
        else:
            # The index suffix keeps patterns unique even after the
            # keyword list wraps around.
            base = keyword_bases[i % len(keyword_bases)]
            suffix = str(i).encode()
            patterns.append(rb"\b" + base + suffix + rb"\b")
    return patterns
59+
60+
61+
def generate_document(size):
    """Build a reproducible synthetic text document.

    Lines of 5 to 20 space-separated vocabulary entries are appended until
    at least ``size`` characters exist; the text is then cut to ``size``
    characters and UTF-8 encoded. The vocabulary is pure ASCII, so the
    result is also ``size`` bytes long. A fixed RNG seed makes every call
    with the same ``size`` return the same bytes.
    """
    vocabulary = [
        "the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog",
        "server", "error", "warning", "request", "response", "data",
        "user", "admin", "config", "session", "token", "password",
        "https://example.com/path", "192.168.1.100", "test@email.com",
        "2025-01-15T10:30:00", "$1,234.56", "abcdef1234567890abcdef",
        "authorization", "credential", "certificate", "encrypted",
    ]
    generator = random.Random(42)  # fixed seed keeps the document stable
    lines = []
    produced = 0
    while produced < size:
        # Draw the word count first, then the words, in that order.
        word_count = generator.randint(5, 20)
        text_line = " ".join(generator.choices(vocabulary, k=word_count)) + "\n"
        lines.append(text_line)
        produced += len(text_line)
    text = "".join(lines)
    return text[:size].encode("utf-8")
80+
81+
82+
def run_benchmark(db, document, num_scans, warmup=3):
    """Time repeated scans of ``document`` with ``db``.

    ``warmup`` untimed scans run first and their matches are discarded.
    Then ``num_scans`` scans are timed individually with
    ``time.perf_counter``.

    Returns:
        A ``(times, match_count)`` tuple: the per-scan durations in
        seconds, and the number of match callbacks seen during the timed
        scans only.
    """
    hits = 0

    def on_match(id, start, end, flags, ctx):
        nonlocal hits
        hits += 1

    # Untimed warmup passes.
    remaining = warmup
    while remaining > 0:
        db.scan(document, match_event_handler=on_match)
        remaining -= 1

    # Drop whatever the warmup scans counted.
    hits = 0
    durations = []
    clock = time.perf_counter
    for _ in range(num_scans):
        began = clock()
        db.scan(document, match_event_handler=on_match)
        finished = clock()
        durations.append(finished - began)

    return durations, hits
103+
104+
105+
def main():
    """Command-line entry point for the regression benchmark.

    Parses the options, compiles a block-mode database from the generated
    patterns, scans a synthetic document repeatedly, and prints timing and
    throughput statistics. A warning is printed when the average scan time
    exceeds 10 ms.
    """
    parser = argparse.ArgumentParser(
        description="Benchmark for hyperscan regression #253"
    )
    parser.add_argument(
        "--patterns", type=int, default=50,
        help="Number of regex patterns (default: 50)",
    )
    parser.add_argument(
        "--doc-size", type=int, default=500_000,
        help="Document size in bytes (default: 500000)",
    )
    parser.add_argument(
        "--scans", type=int, default=100,
        help="Number of scans to perform (default: 100)",
    )
    parser.add_argument(
        "--warmup", type=int, default=5,
        help="Number of warmup scans (default: 5)",
    )
    args = parser.parse_args()

    # With no timed scans the statistics calls below would raise
    # StatisticsError on an empty list, so reject that before doing any work.
    if args.scans < 1:
        parser.error("--scans must be at least 1")

    print("=" * 60)
    print("hyperscan regression benchmark (#253)")
    print("=" * 60)

    db = hyperscan.Database(mode=hyperscan.HS_MODE_BLOCK)
    patterns = generate_patterns(args.patterns)
    pattern_flags = hyperscan.HS_FLAG_CASELESS | hyperscan.HS_FLAG_SINGLEMATCH
    db.compile(
        expressions=patterns,
        ids=list(range(len(patterns))),
        flags=[pattern_flags] * len(patterns),
    )

    print(f"engine info: {db.info().decode()}")
    print(f"database size: {db.size():,} bytes")
    print(f"pattern count: {args.patterns}")
    print(f"document size: {args.doc_size:,} bytes")
    print(f"scan iterations: {args.scans}")
    print(f"warmup scans: {args.warmup}")
    print()

    document = generate_document(args.doc_size)

    print("running benchmark...")
    times, match_count = run_benchmark(
        db, document, args.scans, args.warmup
    )

    avg_time = statistics.mean(times)
    median_time = statistics.median(times)
    # stdev needs at least two samples.
    stdev_time = statistics.stdev(times) if len(times) > 1 else 0
    min_time = min(times)
    max_time = max(times)
    doc_mb = args.doc_size / (1024 * 1024)
    throughput_avg = doc_mb / avg_time if avg_time > 0 else float("inf")
    throughput_median = (
        doc_mb / median_time if median_time > 0 else float("inf")
    )

    print()
    print("-" * 60)
    print("results")
    print("-" * 60)
    print(f"total matches: {match_count:,}")
    print(f"avg time/scan: {avg_time * 1000:.3f} ms")
    print(f"median time/scan: {median_time * 1000:.3f} ms")
    print(f"min time/scan: {min_time * 1000:.3f} ms")
    print(f"max time/scan: {max_time * 1000:.3f} ms")
    print(f"stdev: {stdev_time * 1000:.3f} ms")
    print(f"throughput (avg): {throughput_avg:.1f} MB/s")
    print(f"throughput (median):{throughput_median:.1f} MB/s")
    print()

    # NOTE(review): the 10 ms threshold and the ~3 ms baseline quoted below
    # presumably refer to the default workload (50 patterns, 500 KB
    # document); larger --patterns/--doc-size values may trip this check
    # without a real regression.
    if avg_time * 1000 > 10:
        print("!! REGRESSION DETECTED !!")
        print(
            f"avg scan time {avg_time*1000:.1f}ms is way above the "
            f"expected ~3ms baseline from v0.7.21"
        )
        print(
            "likely cause: SIMDE_BACKEND=ON forcing SSE2-only code "
            "paths on x86-64"
        )
    else:
        print("performance looks healthy")
192+
193+
194+
if __name__ == "__main__":
195+
main()

0 commit comments

Comments
 (0)