From 94833302e3c3b0fb32717c59bbfe8b11bdd6be8e Mon Sep 17 00:00:00 2001
From: Ivana <ivana.gyro@gmail.com>
Date: Sat, 30 May 2026 09:38:21 +0000
Subject: [PATCH 1/2] fix(sum): use pairwise summation for floating-point
 linalg::Sum

linalg::Sum accumulated every element into one scalar in a serial loop,
whose worst-case rounding error grows as O(N * eps). The Accuracy test in
tests/linalg_test/sum_test.cpp (10000 identical floating-point values whose
bit patterns are chosen to expose round-off) failed under naive accumulation
and was disabled.

Add a pairwise-summation primitive (pairwise_sum.hpp): PairwiseSum mirrors
NumPy's np.add.reduce -- a straight loop below 8 elements, an eight-accumulator
unrolled block up to 128, and a halve-and-recurse split rounded to a multiple
of eight above that. Worst-case error growth drops to O(log N * eps) at
essentially the same cost. PairwiseSum consumes any random-access range, so it
also serves strided reductions later.

Rewrite the Float/Double/ComplexFloat/ComplexDouble branches of Sum_internal to
call PairwiseSum over the contiguous storage span and re-enable the accuracy
test. Integer branches keep the straight loop since they do not round. The GPU
path (cuReduce_gpu) already performs a block/tree reduction with bounded error,
so it is left unchanged.

For the disabled test's values the new code lands within 2 ULP of the
reference, against the 4-ULP tolerance of EXPECT_NUMBER_EQ; naive accumulation
was 108 ULP off.

Co-authored-by: Claude <noreply@anthropic.com>
---
 .../linalg_internal_cpu/Sum_internal.cpp      | 23 +++----
 .../linalg_internal_cpu/pairwise_sum.hpp      | 62 +++++++++++++++++++
 tests/linalg_test/sum_test.cpp                |  2 +-
 3 files changed, 70 insertions(+), 17 deletions(-)
 create mode 100644 src/backend/linalg_internal_cpu/pairwise_sum.hpp
diff --git a/src/backend/linalg_internal_cpu/Sum_internal.cpp b/src/backend/linalg_internal_cpu/Sum_internal.cpp
index df871390d..7fef6385c 100644
--- a/src/backend/linalg_internal_cpu/Sum_internal.cpp
+++ b/src/backend/linalg_internal_cpu/Sum_internal.cpp
@@ -1,8 +1,11 @@
 #include "Sum_internal.hpp"
 
+#include <span>
+
 #include "boost/smart_ptr/intrusive_ptr.hpp"
 
 #include "backend/Storage.hpp"
+#include "backend/linalg_internal_cpu/pairwise_sum.hpp"
 #include "cytnx_error.hpp"
 #include "Type.hpp"
 
@@ -88,10 +91,7 @@ namespace cytnx {
       cytnx_double *_ten = (cytnx_double *)ten->data();
       cytnx_double *_out = (cytnx_double *)out->data();
 
-      _out[0] = 0;
-      for (cytnx_uint64 n = 0; n < Nelem; n++) {
-        _out[0] += _ten[n];
-      }
+      _out[0] = PairwiseSum(std::span<const cytnx_double>(_ten, Nelem));
     }
 
     void Sum_internal_f(boost::intrusive_ptr<Storage_base> &out,
@@ -99,20 +99,14 @@ namespace cytnx {
       cytnx_float *_ten = (cytnx_float *)ten->data();
       cytnx_float *_out = (cytnx_float *)out->data();
 
-      _out[0] = 0;
-      for (cytnx_uint64 n = 0; n < Nelem; n++) {
-        _out[0] += _ten[n];
-      }
+      _out[0] = PairwiseSum(std::span<const cytnx_float>(_ten, Nelem));
     }
     void Sum_internal_cd(boost::intrusive_ptr<Storage_base> &out,
                          const boost::intrusive_ptr<Storage_base> &ten, const cytnx_uint64 &Nelem) {
       cytnx_complex128 *_ten = (cytnx_complex128 *)ten->data();
       cytnx_complex128 *_out = (cytnx_complex128 *)out->data();
 
-      _out[0] = 0;
-      for (cytnx_uint64 n = 0; n < Nelem; n++) {
-        _out[0] += _ten[n];
-      }
+      _out[0] = PairwiseSum(std::span<const cytnx_complex128>(_ten, Nelem));
     }
 
     void Sum_internal_cf(boost::intrusive_ptr<Storage_base> &out,
@@ -120,10 +114,7 @@ namespace cytnx {
       cytnx_complex64 *_ten = (cytnx_complex64 *)ten->data();
       cytnx_complex64 *_out = (cytnx_complex64 *)out->data();
 
-      _out[0] = 0;
-      for (cytnx_uint64 n = 0; n < Nelem; n++) {
-        _out[0] += _ten[n];
-      }
+      _out[0] = PairwiseSum(std::span<const cytnx_complex64>(_ten, Nelem));
     }
 
     void Sum_internal_b(boost::intrusive_ptr<Storage_base> &out,
diff --git a/src/backend/linalg_internal_cpu/pairwise_sum.hpp b/src/backend/linalg_internal_cpu/pairwise_sum.hpp
new file mode 100644
index 000000000..2680307ff
--- /dev/null
+++ b/src/backend/linalg_internal_cpu/pairwise_sum.hpp
@@ -0,0 +1,62 @@
+#ifndef CYTNX_BACKEND_LINALG_INTERNAL_CPU_PAIRWISE_SUM_H_
+#define CYTNX_BACKEND_LINALG_INTERNAL_CPU_PAIRWISE_SUM_H_
+
+#include <cstddef>
+#include <iterator>
+#include <ranges>
+
+namespace cytnx {
+  namespace linalg_internal {
+
+    // Recursive (divide-and-conquer) core of the pairwise summation, matching
+    // NumPy's np.add.reduce: a straight loop for the smallest blocks, an
+    // eight-accumulator unrolled loop up to 128 elements, and a split into two
+    // halves (rounded to a multiple of eight) above that. Worst-case rounding
+    // error grows as O(log N * eps) instead of the O(N * eps) of a naive serial
+    // accumulation, at essentially the same cost.
+    template <class T, std::random_access_iterator It>
+    T PairwiseSumBlocks(It first, std::size_t n) {
+      if (n < 8) {
+        T res = T(0);
+        for (std::size_t i = 0; i < n; ++i) res += first[static_cast<std::ptrdiff_t>(i)];
+        return res;
+      }
+      if (n <= 128) {
+        T r0 = first[0], r1 = first[1], r2 = first[2], r3 = first[3];
+        T r4 = first[4], r5 = first[5], r6 = first[6], r7 = first[7];
+        std::size_t i = 8;
+        for (; i + 8 <= n; i += 8) {
+          auto p = first + static_cast<std::ptrdiff_t>(i);
+          r0 += p[0];
+          r1 += p[1];
+          r2 += p[2];
+          r3 += p[3];
+          r4 += p[4];
+          r5 += p[5];
+          r6 += p[6];
+          r7 += p[7];
+        }
+        T res = ((r0 + r1) + (r2 + r3)) + ((r4 + r5) + (r6 + r7));
+        for (; i < n; ++i) res += first[static_cast<std::ptrdiff_t>(i)];
+        return res;
+      }
+      std::size_t half = n / 2;
+      half -= half % 8;
+      return PairwiseSumBlocks<T>(first, half) +
+             PairwiseSumBlocks<T>(first + static_cast<std::ptrdiff_t>(half), n - half);
+    }
+
+    // Pairwise sum over a random-access range. The element type is deduced from
+    // the range. A contiguous std::span sums every element; pass a strided view
+    // (see stride_view.hpp) to sum a strided sequence such as a matrix diagonal.
+    template <std::ranges::random_access_range R>
+    std::ranges::range_value_t<R> PairwiseSum(R&& range) {
+      using T = std::ranges::range_value_t<R>;
+      return PairwiseSumBlocks<T>(std::ranges::begin(range),
+                                  static_cast<std::size_t>(std::ranges::size(range)));
+    }
+
+  }  // namespace linalg_internal
+}  // namespace cytnx
+
+#endif  // CYTNX_BACKEND_LINALG_INTERNAL_CPU_PAIRWISE_SUM_H_
diff --git a/tests/linalg_test/sum_test.cpp b/tests/linalg_test/sum_test.cpp
index d130ec71c..23b7a4aed 100644
--- a/tests/linalg_test/sum_test.cpp
+++ b/tests/linalg_test/sum_test.cpp
@@ -49,7 +49,7 @@ namespace cytnx {
    * Note: `cytnx_bool` is not supported for the `linalg::Sum()` function.
    * This test also assesses the accuracy of summing floating-point numbers.
    */
-  TYPED_TEST(LinalgSumHomogeneousValuesTest, DISABLED_Accuracy) {
+  TYPED_TEST(LinalgSumHomogeneousValuesTest, Accuracy) {
     TypeParam value = LinalgSumHomogeneousValuesTest<TypeParam>::value;
     int element_number = 10000;
     unsigned int dtype = Type_class().cy_typeid(value);

From e7a9b3e73eebc07ba2a76b5ab89aa69e22f7432d Mon Sep 17 00:00:00 2001
From: Ivana <ivana.gyro@gmail.com>
Date: Tue, 9 Jun 2026 15:13:57 +0000
Subject: [PATCH 2/2] test(sum): cover PairwiseSumBlocks branches and
 heterogeneous magnitudes

The existing LinalgSumHomogeneousValuesTest.Accuracy uses element_number =
10000, which only reaches the recursive-split branch of PairwiseSumBlocks, and
the equal-magnitude input distribution is the one regime where naive serial
summation stays nearly exact -- neither demonstrates nor protects the actual
accuracy benefit pairwise summation buys.

Add two focused tests:

* LinalgSumBoundaryTest.EachPairwiseSumBranch iterates over sizes 1, 7, 8, 9,
  15, 128, 129, 137, 1024 and verifies linalg::Sum on a homogeneous-1.0 tensor
  is exactly N. The sizes straddle the n < 8 / n <= 128 / n > 128 thresholds
  and include the n % 8 != 0 tails inside the 8-accumulator block, so a
  regression that drops or double-counts an element in any branch fails
  immediately rather than only on the n = 10000 recursive case.

* LinalgSumHeterogeneousMagnitudeTest.RecoversTermsLostByNaiveAccumulation_*
  sums [+L, 1, 1, ..., 1, -L] where L is far above the precision threshold
  (1e16 for double > 2^53, 1e8 for float > 2^23). The exact sum is N - 2.
  Under naive accumulation the running total reaches L on the first element
  and every subsequent +1 vanishes in IEEE 754 rounding (1.0 is below the unit
  in the last place at that magnitude), so naive collapses to ~0. Pairwise
  keeps small values together in the tree until the +/- L cancellation at the
  top, so most of the small terms survive. The contract asserted is N/2 <
  result <= N: tight enough that a serial-accumulation regression collapses to
  ~0 and fails, loose enough to accommodate the handful of 1's that the
  unrolled accumulator holding L still rounds away.

Co-authored-by: Claude <noreply@anthropic.com>
---
 tests/linalg_test/sum_test.cpp | 77 ++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/tests/linalg_test/sum_test.cpp b/tests/linalg_test/sum_test.cpp
index 23b7a4aed..e222cd2e4 100644
--- a/tests/linalg_test/sum_test.cpp
+++ b/tests/linalg_test/sum_test.cpp
@@ -64,4 +64,81 @@ namespace cytnx {
 
     EXPECT_NUMBER_EQ(sum_result.at<TypeParam>({0}), value * static_cast<TypeParam>(element_number));
   }
+
+  /**
+   * Exercises every branch of `PairwiseSumBlocks` -- the recursive core that
+   * `linalg::Sum` dispatches floating-point reductions through.
+   *
+   *   * n < 8         : straight serial loop
+   *   * 8 <= n <= 128 : 8-accumulator unrolled body, optionally with a scalar
+   *                     tail when n % 8 != 0
+   *   * n > 128       : recursive split into two halves rounded to a multiple
+   *                     of 8
+   *
+   * The original Accuracy test only covers the n = 10000 recursive-split case;
+   * a regression in either of the small-n branches would not be caught there.
+   * Sizes 7/8/9/15/128/129/137 straddle the thresholds (including the off-by-one
+   * tail cases). The expected result is exact, so any branch that drops or
+   * double-counts an element fails immediately.
+   */
+  TEST(LinalgSumBoundaryTest, EachPairwiseSumBranch) {
+    const cytnx_double value = 1.0;
+    for (int n : {1, 7, 8, 9, 15, 128, 129, 137, 1024}) {
+      Tensor tensor(/* shape */ {static_cast<unsigned long>(n)}, Type.Double, Device.cpu,
+                    /* init_zero */ false);
+      tensor.fill(value);
+      Tensor sum_result = linalg::Sum(tensor);
+      EXPECT_EQ(sum_result.shape().size(), 1);
+      EXPECT_EQ(sum_result.shape()[0], 1);
+      EXPECT_DOUBLE_EQ(sum_result.at<cytnx_double>({0}), value * static_cast<cytnx_double>(n))
+        << "n=" << n;
+    }
+  }
+
+  /**
+   * The dynamic-range case that motivates pairwise summation, and the one
+   * input distribution where naive serial accumulation visibly fails.
+   *
+   * Both arrays are [+L, 1, 1, ..., 1, -L] with L far above the precision
+   * threshold (2^53 for double, 2^23 for float); the exact sum is N - 2. Under
+   * naive accumulation the running total reaches L on the first element and
+   * the subsequent +1's vanish in IEEE 754 rounding -- 1.0 is below the unit
+   * in the last place at that magnitude -- so naive returns ~0. Pairwise
+   * keeps small values together in the tree until the cancellation of +/-L at
+   * the top, so the small terms survive (modulo a handful of 1's in the
+   * unrolled-accumulator blocks that hold L itself).
+   *
+   * The contract this asserts is qualitative on purpose: any reasonable
+   * pairwise implementation must land much closer to N - 2 than to 0. A
+   * serial-accumulation regression would collapse to ~0 and fail
+   * `EXPECT_GT(result, N / 2)` immediately; the exact pairwise result depends
+   * on which accumulators receive +/-L (a few small terms are lost there).
+   */
+  TEST(LinalgSumHeterogeneousMagnitudeTest, RecoversTermsLostByNaiveAccumulation_Double) {
+    constexpr int N = 1024;
+    Tensor tensor(/* shape */ {static_cast<unsigned long>(N)}, Type.Double, Device.cpu,
+                  /* init_zero */ false);
+    tensor.fill(static_cast<cytnx_double>(1));
+    tensor.at<cytnx_double>({0}) = 1e16;
+    tensor.at<cytnx_double>({N - 1}) = -1e16;
+    Tensor sum_result = linalg::Sum(tensor);
+    const cytnx_double result = sum_result.at<cytnx_double>({0});
+    EXPECT_GT(result, static_cast<cytnx_double>(N / 2));
+    EXPECT_LE(result, static_cast<cytnx_double>(N));
+  }
+
+  TEST(LinalgSumHeterogeneousMagnitudeTest, RecoversTermsLostByNaiveAccumulation_Float) {
+    constexpr int N = 1024;
+    Tensor tensor(/* shape */ {static_cast<unsigned long>(N)}, Type.Float, Device.cpu,
+                  /* init_zero */ false);
+    tensor.fill(static_cast<cytnx_float>(1));
+    // float has ~7.2 decimal digits of precision; 1e8 already exceeds 2^23, so
+    // a serial `1e8 + 1` collapses to 1e8 and the unit terms are lost.
+    tensor.at<cytnx_float>({0}) = 1e8f;
+    tensor.at<cytnx_float>({N - 1}) = -1e8f;
+    Tensor sum_result = linalg::Sum(tensor);
+    const cytnx_float result = sum_result.at<cytnx_float>({0});
+    EXPECT_GT(result, static_cast<cytnx_float>(N / 2));
+    EXPECT_LE(result, static_cast<cytnx_float>(N));
+  }
 }  // namespace cytnx