-
Notifications
You must be signed in to change notification settings - Fork 194
Optimise getValueOfBits and insertBits with BMI2 PEXT/PDEP (#717) #796
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: devel
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -145,6 +145,13 @@ option( | |
| ) | ||
| message(STATUS "NUMA awareness is turned ${QUEST_ENABLE_NUMA}. Set QUEST_ENABLE_NUMA to modify.") | ||
|
|
||
| option( | ||
| QUEST_ENABLE_BMI2 | ||
| "Whether QuEST will accelerate CPU bit gather/scatter with x86 BMI2 (PEXT/PDEP) intrinsics (issue #717). Turned OFF by default; when ON, the resulting binary requires a BMI2-capable CPU at runtime." | ||
| OFF | ||
| ) | ||
| message(STATUS "BMI2 bitwise acceleration is turned ${QUEST_ENABLE_BMI2}. Set QUEST_ENABLE_BMI2 to modify.") | ||
|
|
||
|
|
||
| # Distribution | ||
| option( | ||
|
|
@@ -402,13 +409,35 @@ else() | |
| set(WARNING_FLAG -Wall) | ||
| endif() | ||
|
|
||
| target_compile_options(QuEST | ||
| target_compile_options(QuEST | ||
| PRIVATE | ||
| $<$<COMPILE_LANGUAGE:CXX>:${WARNING_FLAG}> | ||
| $<$<COMPILE_LANGUAGE:C>:${WARNING_FLAG}> | ||
| ) | ||
|
|
||
|
|
||
| # ================================================== | ||
| # CPU bit-manipulation acceleration (BMI2, issue #717) | ||
| # ================================================== | ||
| # The PEXT/PDEP fast paths in quest/src/core/bitwise.hpp are guarded by `#if defined(__BMI2__)`, | ||
| # which the compiler only defines when BMI2 codegen is enabled. We add -mbmi2 ONLY when the user opts | ||
| # in via QUEST_ENABLE_BMI2 (OFF by default), so a default build stays portable and runs on any x86 CPU | ||
| # (it compiles the byte-identical scalar fallback). Without the opt-in, -mbmi2 is never added, so the | ||
| # library is free of BMI2 instructions and cannot SIGILL on a pre-BMI2 CPU. The generator expression | ||
| # scopes the flag to C++ host translation units, so CUDA/HIP device compilation is unaffected (and the | ||
| # intrinsics are additionally #ifdef-guarded against __CUDA_ARCH__/__HIP_DEVICE_COMPILE__). A user who | ||
| # instead supplies their own -march=native still gets the fast path on their own CPU. | ||
| if (QUEST_ENABLE_BMI2) | ||
| include(CheckCXXCompilerFlag) | ||
| check_cxx_compiler_flag("-mbmi2" QUEST_COMPILER_SUPPORTS_MBMI2) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| if (QUEST_COMPILER_SUPPORTS_MBMI2) | ||
| target_compile_options(QuEST PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-mbmi2>) | ||
| else() | ||
| message(WARNING "QUEST_ENABLE_BMI2=ON but the compiler does not accept -mbmi2; building the scalar fallback.") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. TODO: error instead of warn here, unless we change |
||
| endif() | ||
| endif() | ||
|
|
||
|
|
||
|
|
||
| # ============================ | ||
| # Link optional dependencies | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,15 @@ | ||
| # @author Tyson Jones | ||
|
|
||
| add_all_local_examples() | ||
|
|
||
| # The issue-#717 bitwise micro-benchmark builds with -mbmi2 (so its PEXT/PDEP path is enabled) only | ||
| # when the user opts in via QUEST_ENABLE_BMI2 — same switch the library uses. Without the opt-in it | ||
| # compiles the scalar fallback and prints "BMI2 fast path: INACTIVE" (never SIGILLs). add_example() | ||
| # names the target <filename>_<ext>; the flag is scoped to this one target. | ||
| if (QUEST_ENABLE_BMI2 AND TARGET benchmark_bitwise_bmi2_cpp) | ||
| include(CheckCXXCompilerFlag) | ||
| check_cxx_compiler_flag("-mbmi2" QUEST_EXAMPLE_SUPPORTS_MBMI2) | ||
| if (QUEST_EXAMPLE_SUPPORTS_MBMI2) | ||
| target_compile_options(benchmark_bitwise_bmi2_cpp PRIVATE -mbmi2) | ||
| endif() | ||
| endif() |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,153 @@ | ||
| /** @file | ||
| * A quick, self-contained micro-benchmark of the BMI2 PEXT/PDEP fast paths added for issue #717, | ||
| * comparing them against the original scalar bit gather/scatter loops. It prints per-call timings | ||
| * so QuEST's CI can compare the speedup across its tested platforms and compilers. | ||
| * | ||
| * The two scalar routines below mirror getValueOfBits() and insertBitsWithMaskedValues() from | ||
| * quest/src/core/bitwise.hpp; the BMI2 routines are the single-instruction _pext_u64 / _pdep_u64 | ||
| * paths. This file deliberately depends on nothing but the C++ standard library (and <immintrin.h> | ||
| * when targeting x86 BMI2), so it compiles and runs on every platform — emitting the scalar | ||
| * timings alone where BMI2 is unavailable, never raising SIGILL. | ||
| * | ||
| * Build note: this target is compiled with -mbmi2 (see examples/automated/CMakeLists.txt) so the | ||
| * intrinsic path is enabled; the QuEST library itself enables -mbmi2 the same way in the top-level | ||
| * CMakeLists.txt. Whether the fast path was compiled in is printed at runtime. | ||
| * | ||
| * @author (issue #717 contribution) | ||
| */ | ||
|
|
||
| #include <cstdint> | ||
| #include <cstdio> | ||
| #include <chrono> | ||
|
|
||
| #if defined(__BMI2__) && (defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_IX86)) | ||
| #include <immintrin.h> | ||
| #define BENCH_USE_BMI2 | ||
| #endif | ||
|
|
||
| using std::uint64_t; | ||
|
|
||
| // --- scalar references (mirroring quest/src/core/bitwise.hpp) ------------------------------------- | ||
|
|
||
| // getValueOfBits: gather the bits at the given (strictly increasing) positions into the low bits. | ||
| static inline uint64_t scalarGather(uint64_t number, const int* inds, int n) { | ||
| uint64_t value = 0; | ||
| for (int i=0; i<n; i++) | ||
| value |= ((number >> inds[i]) & 1ULL) << i; | ||
| return value; | ||
| } | ||
|
|
||
| // insertBitsWithMaskedValues: spread number's low bits into the positions NOT named by inds (i.e. | ||
| // insert a 0 at each increasing index), then OR in the precomputed value mask. | ||
| static inline uint64_t scalarScatter(uint64_t number, const int* inds, int n, uint64_t valueMask) { | ||
| uint64_t r = number; | ||
| for (int i=0; i<n; i++) { | ||
| uint64_t lo = r & ((1ULL << inds[i]) - 1); | ||
| uint64_t hi = r & ~((1ULL << inds[i]) - 1); | ||
| r = (hi << 1) | lo; | ||
| } | ||
| return valueMask | r; | ||
| } | ||
|
|
||
| static inline uint64_t makePosMask(const int* inds, int n) { | ||
| uint64_t m = 0; | ||
| for (int i=0; i<n; i++) | ||
| m |= 1ULL << inds[i]; | ||
| return m; | ||
| } | ||
|
|
||
| // --- timing harness ------------------------------------------------------------------------------ | ||
|
|
||
| static double nsPerCall(uint64_t iters, double seconds) { | ||
| return 1e9 * seconds / (double) iters; | ||
| } | ||
|
|
||
| template <typename F> | ||
| static double timeMin(uint64_t iters, int reps, F&& fn) { | ||
| double best = 1e300; | ||
| for (int r=0; r<reps; r++) { | ||
| auto t0 = std::chrono::steady_clock::now(); | ||
| fn(iters); | ||
| auto t1 = std::chrono::steady_clock::now(); | ||
| double s = std::chrono::duration<double>(t1 - t0).count(); | ||
| if (s < best) best = s; | ||
| } | ||
| return best; | ||
| } | ||
|
|
||
| int main() { | ||
|
|
||
| printf("QuEST issue #717 - BMI2 PEXT/PDEP bitwise micro-benchmark\n"); | ||
| #ifdef BENCH_USE_BMI2 | ||
| printf("BMI2 fast path: ACTIVE (compiled with -mbmi2)\n\n"); | ||
| #else | ||
| printf("BMI2 fast path: INACTIVE (x86 BMI2 not targeted; scalar timings only)\n\n"); | ||
| #endif | ||
|
|
||
| const uint64_t iters = 8000000; // keeps total runtime well under a second | ||
| const int reps = 3; | ||
| const int counts[] = {3, 6}; // representative qubit-arity per gate | ||
|
|
||
| printf("%-8s %-4s %14s %14s %10s\n", "op", "k", "scalar ns/call", "bmi2 ns/call", "speedup"); | ||
|
|
||
| for (int ci=0; ci<2; ci++) { | ||
| int k = counts[ci]; | ||
|
|
||
| // a fixed, strictly-increasing index set and a value mask consistent with it | ||
| int inds[8]; | ||
| for (int i=0; i<k; i++) inds[i] = 3*i + 1; | ||
| uint64_t posMask = makePosMask(inds, k); | ||
| uint64_t valueMask = posMask & 0xA5A5A5A5A5A5A5A5ULL; | ||
|
|
||
| volatile uint64_t sink = 0; | ||
|
|
||
| // ---- gather (getValueOfBits) ---- | ||
| double sg = timeMin(iters, reps, [&](uint64_t N){ | ||
| uint64_t acc = 0; | ||
| for (uint64_t n=0; n<N; n++) acc ^= scalarGather(n, inds, k); | ||
| sink ^= acc; | ||
| }); | ||
| #ifdef BENCH_USE_BMI2 | ||
| double bg = timeMin(iters, reps, [&](uint64_t N){ | ||
| uint64_t acc = 0; | ||
| for (uint64_t n=0; n<N; n++) acc ^= (uint64_t) _pext_u64(n, posMask); | ||
| sink ^= acc; | ||
| }); | ||
| printf("%-8s %-4d %14.3f %14.3f %9.2fx\n", "gather", k, | ||
| nsPerCall(iters, sg), nsPerCall(iters, bg), sg/bg); | ||
| #else | ||
| printf("%-8s %-4d %14.3f %14s %10s\n", "gather", k, nsPerCall(iters, sg), "-", "-"); | ||
| #endif | ||
|
|
||
| // ---- scatter (insertBitsWithMaskedValues) ---- | ||
| double ss = timeMin(iters, reps, [&](uint64_t N){ | ||
| uint64_t acc = 0; | ||
| for (uint64_t n=0; n<N; n++) acc ^= scalarScatter(n, inds, k, valueMask); | ||
| sink ^= acc; | ||
| }); | ||
| #ifdef BENCH_USE_BMI2 | ||
| double bs = timeMin(iters, reps, [&](uint64_t N){ | ||
| uint64_t acc = 0; | ||
| for (uint64_t n=0; n<N; n++) acc ^= (valueMask | (uint64_t) _pdep_u64(n, ~posMask)); | ||
| sink ^= acc; | ||
| }); | ||
| printf("%-8s %-4d %14.3f %14.3f %9.2fx\n", "scatter", k, | ||
| nsPerCall(iters, ss), nsPerCall(iters, bs), ss/bs); | ||
| #else | ||
| printf("%-8s %-4d %14.3f %14s %10s\n", "scatter", k, nsPerCall(iters, ss), "-", "-"); | ||
| #endif | ||
|
|
||
| #ifdef BENCH_USE_BMI2 | ||
| // sanity: the intrinsic and scalar paths must agree (bit-for-bit) for these sorted indices | ||
| bool ok = true; | ||
| for (uint64_t n=0; n<4096 && ok; n++) { | ||
| if ((uint64_t)_pext_u64(n, posMask) != scalarGather(n, inds, k)) ok = false; | ||
| if ((valueMask | (uint64_t)_pdep_u64(n, ~posMask)) != scalarScatter(n, inds, k, valueMask)) ok = false; | ||
| } | ||
| printf(" (k=%d results verified bit-identical to scalar: %s)\n", k, ok ? "yes" : "NO"); | ||
| #endif | ||
| (void) sink; | ||
| } | ||
|
|
||
| return 0; | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
TODO: consider enabling by default when detectedly supported by compiler