From e4da72a23f052c4b9f933dfc1f551b88668ad0fc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Bylica?= <pawel@hepcolgum.band>
Date: Fri, 22 May 2026 15:08:00 +0200
Subject: [PATCH] crypto: Share Fq12 squaring across pairs in BN254 Miller loop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

pairing_check() previously ran one independent Miller loop per pair
and multiplied the results, paying LOG_ATE_LOOP_COUNT + 1 = 64 Fq12
squarings per pair. Because the Miller-loop recurrence

    f_{i+1} = f_i² · line_i

is multiplicative across pairs, all pairs can share a single Fq12
accumulator. For N valid pairs this saves (N−1) × 64 Fq12 squarings
without changing the final product.

Restructure as multi_miller_loop():
  - Validate all pairs up front, collect surviving ones into a
    MillerPairState vector (T in Jacobian, Q/-Q affine, P, -P.y).
  - Single squaring per iteration, then line-and-mul for every pair.
  - NAF add branch and the two post-loop Frobenius steps iterate
    over every pair.

The bench inputs span 2 and 4 pairs per call — for the 10 inputs in
test/precompiles_bench/precompiles_bench.cpp, total Fq12 squarings
drop from 2048 (= 32 pairs × 64) to 640 (= 10 calls × 64), saving
~141 squarings per call on average.

Bench (build/clang-tt, 100 reps × 1s each, ecpairing precompile):
  master baseline:  2906753 ns mean, 2888808 ns median, σ=185869
  this branch:      2909897 ns mean, 2890350 ns median, σ=232121
  Δ: within noise (σ ≈ 7% of mean) — measurable savings on the
  algorithm side don't translate to a wall-clock win on this build/
  CPU. Fq12 squaring is apparently cheap enough relative to the
  per-pair line work that the per-call sharing benefit is below the
  current noise floor.

Code-quality / structural improvement on its own: the multi-pair loop
form is simpler (single accumulator, single squaring path), and ports
cleanly to the planned follow-up Karatsuba-sparse line multiplication
which can share more work across pairs.

Tests: 53/53 unit tests, EEST state tests 11/11 on every stable fork
(Byzantium / Istanbul / Cancun / Prague / Osaka).
---
 .../pairing/bn254/pairing.cpp                 | 82 +++++++++++++------
 1 file changed, 55 insertions(+), 27 deletions(-)
diff --git a/lib/evmone_precompiles/pairing/bn254/pairing.cpp b/lib/evmone_precompiles/pairing/bn254/pairing.cpp
index 054d3aa019..ef3f533e96 100644
--- a/lib/evmone_precompiles/pairing/bn254/pairing.cpp
+++ b/lib/evmone_precompiles/pairing/bn254/pairing.cpp
@@ -5,6 +5,7 @@
 #include "../../bn254.hpp"
 #include "fields.hpp"
 #include "utils.hpp"
+#include <span>
 #include <vector>
 
 namespace evmmax::bn254
@@ -44,48 +45,71 @@ constexpr void multiply_by_lin_func_value(
 inline constexpr auto ATE_LOOP_COUNT_NAF = 0x1120804220120081204008212022011_u128;
 inline constexpr int LOG_ATE_LOOP_COUNT = 63;
 
-/// Miller loop according to https://eprint.iacr.org/2010/354.pdf Algorithm 1.
-Fq12 miller_loop(const ecc::Point<Fq2>& Q, const ecc::Point<Fq>& P) noexcept
+/// State carried across iterations for one pair in the Miller loop.
+struct MillerPairState
+{
+    ecc::JacPoint<Fq2> T;  ///< running Jacobian point on the twisted curve
+    ecc::Point<Fq2> Q;     ///< the affine G2 input
+    ecc::Point<Fq2> nQ;    ///< -Q, precomputed
+    ecc::Point<Fq> P;      ///< the affine G1 input
+    Fq ny;                 ///< -P.y, precomputed
+};
+
+/// Multi-pair Miller loop computing prod_i e_miller(P_i, Q_i).
+///
+/// Algorithm: https://eprint.iacr.org/2010/354.pdf Algorithm 1, batched across
+/// all input pairs so the Fq12 squaring is shared instead of repeated per pair.
+/// For N valid pairs, saves (N-1) × (LOG_ATE_LOOP_COUNT + 1) Fq12 squarings
+/// vs. the per-pair-then-multiply approach.
+Fq12 multi_miller_loop(std::span<MillerPairState> pairs) noexcept
 {
-    auto T = ecc::JacPoint<Fq2>::from(Q);
-    auto nQ = -Q;
     auto f = Fq12::one();
     std::array<Fq2, 3> t;
     auto naf = ATE_LOOP_COUNT_NAF;
-    const auto ny = -P.y;
 
     for (int i = 0; i <= LOG_ATE_LOOP_COUNT; ++i)
     {
-        T = lin_func_and_dbl(T, t);
-        f = square(f);
-        multiply_by_lin_func_value(f, t, P.x, ny);
+        f = square(f);  // single squaring shared by every pair this iteration
+
+        for (auto& s : pairs)
+        {
+            s.T = lin_func_and_dbl(s.T, t);
+            multiply_by_lin_func_value(f, t, s.P.x, s.ny);
+        }
 
         if (naf & 1)
         {
-            T = lin_func_and_add(T, Q, t);
-            multiply_by_lin_func_value(f, t, P.x, P.y);
+            for (auto& s : pairs)
+            {
+                s.T = lin_func_and_add(s.T, s.Q, t);
+                multiply_by_lin_func_value(f, t, s.P.x, s.P.y);
+            }
         }
         else if (naf & 2)
         {
-            T = lin_func_and_add(T, nQ, t);
-            multiply_by_lin_func_value(f, t, P.x, P.y);
+            for (auto& s : pairs)
+            {
+                s.T = lin_func_and_add(s.T, s.nQ, t);
+                multiply_by_lin_func_value(f, t, s.P.x, s.P.y);
+            }
         }
         naf >>= 2;
     }
 
-    // Frobenius endomorphism for point Q from twisted curve over Fq2 field.
-    // It's essentially untwist -> frobenius -> twist chain of transformation.
-    const auto Q1 = endomorphism<1>(Q);
-
-    // Similar to above one. It makes untwist -> frobenius^2 -> twist transformation plus
-    // negation according to miller loop spec.
-    const auto nQ2 = -endomorphism<2>(Q);
+    // Post-loop Frobenius endomorphism steps, one set per pair.
+    for (auto& s : pairs)
+    {
+        // untwist -> Frobenius -> twist
+        const auto Q1 = endomorphism<1>(s.Q);
+        // untwist -> Frobenius^2 -> twist, plus negation per Miller-loop spec
+        const auto nQ2 = -endomorphism<2>(s.Q);
 
-    T = lin_func_and_add(T, Q1, t);
-    multiply_by_lin_func_value(f, t, P.x, P.y);
+        s.T = lin_func_and_add(s.T, Q1, t);
+        multiply_by_lin_func_value(f, t, s.P.x, s.P.y);
 
-    lin_func(T, nQ2, t);
-    multiply_by_lin_func_value(f, t, P.x, P.y);
+        lin_func(s.T, nQ2, t);
+        multiply_by_lin_func_value(f, t, s.P.x, s.P.y);
+    }
 
     return f;
 }
@@ -133,7 +157,8 @@ std::optional<bool> pairing_check(std::span<const std::pair<Point, ExtPoint>> pa
     if (pairs.empty())
         return true;
 
-    auto f = Fq12::one();
+    std::vector<MillerPairState> states;
+    states.reserve(pairs.size());
 
     for (const auto& [p, q] : pairs)
     {
@@ -161,12 +186,15 @@ std::optional<bool> pairing_check(std::span<const std::pair<Point, ExtPoint>> pa
         if (!g2_is_inf && (!is_on_twisted_curve(Q_aff) || !g2_subgroup_check(Q_aff)))
             return std::nullopt;
 
-        // If any of the points is infinity it means that miller_loop returns 1. so we can skip it.
+        // Skip pairs where either point is at infinity — they contribute 1 to the product.
         if (!g1_is_inf && !g2_is_inf)
-            f = f * miller_loop(Q_aff, P_aff);
+            states.push_back({ecc::JacPoint<Fq2>::from(Q_aff), Q_aff, -Q_aff, P_aff, -P_aff.y});
     }
 
-    // final exp is calculated on accumulated value
+    if (states.empty())
+        return true;
+
+    const auto f = multi_miller_loop(states);
     return final_exp(f) == Fq12::one();
 }
 }  // namespace evmmax::bn254