From e4da72a23f052c4b9f933dfc1f551b88668ad0fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Bylica?= Date: Fri, 22 May 2026 15:08:00 +0200 Subject: [PATCH] crypto: Share Fq12 squaring across pairs in BN254 Miller loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pairing_check() previously ran one independent Miller loop per pair and multiplied the results, paying LOG_ATE_LOOP_COUNT + 1 = 64 Fq12 squarings per pair. Because the Miller-loop recurrence f_{i+1} = f_i² · line_i is multiplicative across pairs, all pairs can share a single Fq12 accumulator. For N valid pairs this saves (N−1) × 64 Fq12 squarings without changing the final product. Restructure as multi_miller_loop(): - Validate all pairs up front, collect surviving ones into a MillerPairState vector (T in Jacobian, Q/-Q affine, P, -P.y). - Single squaring per iteration, then line-and-mul for every pair. - NAF add branch and the two post-loop Frobenius steps iterate over every pair. The bench inputs span 2 and 4 pairs per call — for the 10 inputs in test/precompiles_bench/precompiles_bench.cpp, total Fq12 squarings drop from 2048 (= 32 pairs × 64) to 640 (= 10 calls × 64), saving ~141 squarings per call on average. Bench (build/clang-tt, 100 reps × 1s each, ecpairing precompile): master baseline: 2906753 ns mean, 2888808 ns median, σ=185869 this branch: 2909897 ns mean, 2890350 ns median, σ=232121 Δ: within noise (σ ≈ 7% of mean) — the operation-count savings on the algorithm side don't translate to a wall-clock win on this build/CPU. Fq12 squaring is apparently cheap enough relative to the per-pair line work that the per-call sharing benefit is below the current noise floor. The change still stands on its own as a code-quality / structural improvement: the multi-pair loop form is simpler (single accumulator, single squaring path), and ports cleanly to the planned follow-up Karatsuba-sparse line multiplication which can share more work across pairs.
Tests: 53/53 unit tests, EEST state tests 11/11 on every stable fork (Byzantium / Istanbul / Cancun / Prague / Osaka). --- .../pairing/bn254/pairing.cpp | 82 +++++++++++++------ 1 file changed, 55 insertions(+), 27 deletions(-) diff --git a/lib/evmone_precompiles/pairing/bn254/pairing.cpp b/lib/evmone_precompiles/pairing/bn254/pairing.cpp index 054d3aa019..ef3f533e96 100644 --- a/lib/evmone_precompiles/pairing/bn254/pairing.cpp +++ b/lib/evmone_precompiles/pairing/bn254/pairing.cpp @@ -5,6 +5,7 @@ #include "../../bn254.hpp" #include "fields.hpp" #include "utils.hpp" +#include #include namespace evmmax::bn254 @@ -44,48 +45,71 @@ constexpr void multiply_by_lin_func_value( inline constexpr auto ATE_LOOP_COUNT_NAF = 0x1120804220120081204008212022011_u128; inline constexpr int LOG_ATE_LOOP_COUNT = 63; -/// Miller loop according to https://eprint.iacr.org/2010/354.pdf Algorithm 1. -Fq12 miller_loop(const ecc::Point& Q, const ecc::Point& P) noexcept +/// State carried across iterations for one pair in the Miller loop. +struct MillerPairState +{ + ecc::JacPoint T; ///< running Jacobian point on the twisted curve + ecc::Point Q; ///< the affine G2 input + ecc::Point nQ; ///< -Q, precomputed + ecc::Point P; ///< the affine G1 input + Fq ny; ///< -P.y, precomputed +}; + +/// Multi-pair Miller loop computing prod_i e_miller(P_i, Q_i). +/// +/// Algorithm: https://eprint.iacr.org/2010/354.pdf Algorithm 1, batched across +/// all input pairs so the Fq12 squaring is shared instead of repeated per pair. +/// For N valid pairs, saves (N-1) × (LOG_ATE_LOOP_COUNT + 1) Fq12 squarings +/// vs. the per-pair-then-multiply approach. 
+Fq12 multi_miller_loop(std::span pairs) noexcept { - auto T = ecc::JacPoint::from(Q); - auto nQ = -Q; auto f = Fq12::one(); std::array t; auto naf = ATE_LOOP_COUNT_NAF; - const auto ny = -P.y; for (int i = 0; i <= LOG_ATE_LOOP_COUNT; ++i) { - T = lin_func_and_dbl(T, t); - f = square(f); - multiply_by_lin_func_value(f, t, P.x, ny); + f = square(f); // single squaring shared by every pair this iteration + + for (auto& s : pairs) + { + s.T = lin_func_and_dbl(s.T, t); + multiply_by_lin_func_value(f, t, s.P.x, s.ny); + } if (naf & 1) { - T = lin_func_and_add(T, Q, t); - multiply_by_lin_func_value(f, t, P.x, P.y); + for (auto& s : pairs) + { + s.T = lin_func_and_add(s.T, s.Q, t); + multiply_by_lin_func_value(f, t, s.P.x, s.P.y); + } } else if (naf & 2) { - T = lin_func_and_add(T, nQ, t); - multiply_by_lin_func_value(f, t, P.x, P.y); + for (auto& s : pairs) + { + s.T = lin_func_and_add(s.T, s.nQ, t); + multiply_by_lin_func_value(f, t, s.P.x, s.P.y); + } } naf >>= 2; } - // Frobenius endomorphism for point Q from twisted curve over Fq2 field. - // It's essentially untwist -> frobenius -> twist chain of transformation. - const auto Q1 = endomorphism<1>(Q); - - // Similar to above one. It makes untwist -> frobenius^2 -> twist transformation plus - // negation according to miller loop spec. - const auto nQ2 = -endomorphism<2>(Q); + // Post-loop Frobenius endomorphism steps, one set per pair. 
+ for (auto& s : pairs) + { + // untwist -> Frobenius -> twist + const auto Q1 = endomorphism<1>(s.Q); + // untwist -> Frobenius^2 -> twist, plus negation per Miller-loop spec + const auto nQ2 = -endomorphism<2>(s.Q); - T = lin_func_and_add(T, Q1, t); - multiply_by_lin_func_value(f, t, P.x, P.y); + s.T = lin_func_and_add(s.T, Q1, t); + multiply_by_lin_func_value(f, t, s.P.x, s.P.y); - lin_func(T, nQ2, t); - multiply_by_lin_func_value(f, t, P.x, P.y); + lin_func(s.T, nQ2, t); + multiply_by_lin_func_value(f, t, s.P.x, s.P.y); + } return f; } @@ -133,7 +157,8 @@ std::optional pairing_check(std::span> pa if (pairs.empty()) return true; - auto f = Fq12::one(); + std::vector states; + states.reserve(pairs.size()); for (const auto& [p, q] : pairs) { @@ -161,12 +186,15 @@ std::optional pairing_check(std::span> pa if (!g2_is_inf && (!is_on_twisted_curve(Q_aff) || !g2_subgroup_check(Q_aff))) return std::nullopt; - // If any of the points is infinity it means that miller_loop returns 1. so we can skip it. + // Skip pairs where either point is at infinity — they contribute 1 to the product. if (!g1_is_inf && !g2_is_inf) - f = f * miller_loop(Q_aff, P_aff); + states.push_back({ecc::JacPoint::from(Q_aff), Q_aff, -Q_aff, P_aff, -P_aff.y}); } - // final exp is calculated on accumulated value + if (states.empty()) + return true; + + const auto f = multi_miller_loop(states); return final_exp(f) == Fq12::one(); } } // namespace evmmax::bn254