From 94833302e3c3b0fb32717c59bbfe8b11bdd6be8e Mon Sep 17 00:00:00 2001 From: Ivana Date: Sat, 30 May 2026 09:38:21 +0000 Subject: [PATCH 1/2] fix(sum): use pairwise summation for floating-point linalg::Sum linalg::Sum accumulated every element into one scalar in a serial loop, whose worst-case rounding error grows as O(N * eps). The Accuracy test in tests/linalg_test/sum_test.cpp (10000 identical floating-point values whose bit patterns are chosen to expose round-off) failed under naive accumulation and was disabled. Add a pairwise-summation primitive (pairwise_sum.hpp): PairwiseSum mirrors NumPy's np.add.reduce -- a straight loop below 8 elements, an eight-accumulator unrolled block up to 128, and a halve-and-recurse split rounded to a multiple of eight above that. Worst-case error growth drops to O(log N * eps) at essentially the same cost. PairwiseSum consumes any random-access range, so it also serves strided reductions later. Rewrite the Float/Double/ComplexFloat/ComplexDouble branches of Sum_internal to call PairwiseSum over the contiguous storage span and re-enable the accuracy test. Integer branches keep the straight loop since they do not round. The GPU path (cuReduce_gpu) already performs a block/tree reduction with bounded error, so it is left unchanged. For the disabled test's values the new code lands within 2 ULP of the reference, against the 4-ULP tolerance of EXPECT_NUMBER_EQ; naive accumulation was 108 ULP off. Co-authored-by: Claude --- .../linalg_internal_cpu/Sum_internal.cpp | 23 +++---- .../linalg_internal_cpu/pairwise_sum.hpp | 62 +++++++++++++++++++ tests/linalg_test/sum_test.cpp | 2 +- 3 files changed, 70 insertions(+), 17 deletions(-) create mode 100644 src/backend/linalg_internal_cpu/pairwise_sum.hpp diff --git a/src/backend/linalg_internal_cpu/Sum_internal.cpp b/src/backend/linalg_internal_cpu/Sum_internal.cpp index df871390d..7fef6385c 100644 --- a/src/backend/linalg_internal_cpu/Sum_internal.cpp +++ b/src/backend/linalg_internal_cpu/Sum_internal.cpp @@ -1,8 +1,11 @@ #include "Sum_internal.hpp" +#include + #include "boost/smart_ptr/intrusive_ptr.hpp" #include "backend/Storage.hpp" +#include "backend/linalg_internal_cpu/pairwise_sum.hpp" #include "cytnx_error.hpp" #include "Type.hpp" @@ -88,10 +91,7 @@ namespace cytnx { cytnx_double *_ten = (cytnx_double *)ten->data(); cytnx_double *_out = (cytnx_double *)out->data(); - _out[0] = 0; - for (cytnx_uint64 n = 0; n < Nelem; n++) { - _out[0] += _ten[n]; - } + _out[0] = PairwiseSum(std::span(_ten, Nelem)); } void Sum_internal_f(boost::intrusive_ptr &out, @@ -99,20 +99,14 @@ namespace cytnx { cytnx_float *_ten = (cytnx_float *)ten->data(); cytnx_float *_out = (cytnx_float *)out->data(); - _out[0] = 0; - for (cytnx_uint64 n = 0; n < Nelem; n++) { - _out[0] += _ten[n]; - } + _out[0] = PairwiseSum(std::span(_ten, Nelem)); } void Sum_internal_cd(boost::intrusive_ptr &out, const boost::intrusive_ptr &ten, const cytnx_uint64 &Nelem) { cytnx_complex128 *_ten = (cytnx_complex128 *)ten->data(); cytnx_complex128 *_out = (cytnx_complex128 *)out->data(); - _out[0] = 0; - for (cytnx_uint64 n = 0; n < Nelem; n++) { - _out[0] += _ten[n]; - } + _out[0] = PairwiseSum(std::span(_ten, Nelem)); } void Sum_internal_cf(boost::intrusive_ptr &out, @@ -120,10 +114,7 @@ namespace cytnx { cytnx_complex64 *_ten = (cytnx_complex64 *)ten->data(); cytnx_complex64 *_out = (cytnx_complex64 *)out->data(); - _out[0] = 0; - for (cytnx_uint64 n = 0; n < Nelem; n++) { - _out[0] += _ten[n]; - } + _out[0] = PairwiseSum(std::span(_ten, Nelem)); } void Sum_internal_b(boost::intrusive_ptr &out, diff --git a/src/backend/linalg_internal_cpu/pairwise_sum.hpp b/src/backend/linalg_internal_cpu/pairwise_sum.hpp new file mode 100644 index 000000000..2680307ff --- /dev/null +++ b/src/backend/linalg_internal_cpu/pairwise_sum.hpp @@ -0,0 +1,62 @@ +#ifndef CYTNX_BACKEND_LINALG_INTERNAL_CPU_PAIRWISE_SUM_H_ +#define CYTNX_BACKEND_LINALG_INTERNAL_CPU_PAIRWISE_SUM_H_ + +#include +#include +#include + +namespace cytnx { + namespace linalg_internal { + + // Recursive (divide-and-conquer) core of the pairwise summation, matching + // NumPy's np.add.reduce: a straight loop for the smallest blocks, an + // eight-accumulator unrolled loop up to 128 elements, and a split into two + // halves (rounded to a multiple of eight) above that. Worst-case rounding + // error grows as O(log N * eps) instead of the O(N * eps) of a naive serial + // accumulation, at essentially the same cost. + template + T PairwiseSumBlocks(It first, std::size_t n) { + if (n < 8) { + T res = T(0); + for (std::size_t i = 0; i < n; ++i) res += first[static_cast(i)]; + return res; + } + if (n <= 128) { + T r0 = first[0], r1 = first[1], r2 = first[2], r3 = first[3]; + T r4 = first[4], r5 = first[5], r6 = first[6], r7 = first[7]; + std::size_t i = 8; + for (; i + 8 <= n; i += 8) { + auto p = first + static_cast(i); + r0 += p[0]; + r1 += p[1]; + r2 += p[2]; + r3 += p[3]; + r4 += p[4]; + r5 += p[5]; + r6 += p[6]; + r7 += p[7]; + } + T res = ((r0 + r1) + (r2 + r3)) + ((r4 + r5) + (r6 + r7)); + for (; i < n; ++i) res += first[static_cast(i)]; + return res; + } + std::size_t half = n / 2; + half -= half % 8; + return PairwiseSumBlocks(first, half) + + PairwiseSumBlocks(first + static_cast(half), n - half); + } + + // Pairwise sum over a random-access range. The element type is deduced from + // the range. A contiguous std::span sums every element; pass a strided view + // (see stride_view.hpp) to sum a strided sequence such as a matrix diagonal. + template + std::ranges::range_value_t PairwiseSum(R&& range) { + using T = std::ranges::range_value_t; + return PairwiseSumBlocks(std::ranges::begin(range), + static_cast(std::ranges::size(range))); + } + + } // namespace linalg_internal +} // namespace cytnx + +#endif // CYTNX_BACKEND_LINALG_INTERNAL_CPU_PAIRWISE_SUM_H_ diff --git a/tests/linalg_test/sum_test.cpp b/tests/linalg_test/sum_test.cpp index d130ec71c..23b7a4aed 100644 --- a/tests/linalg_test/sum_test.cpp +++ b/tests/linalg_test/sum_test.cpp @@ -49,7 +49,7 @@ namespace cytnx { * Note: `cytnx_bool` is not supported for the `linalg::Sum()` function. * This test also assesses the accuracy of summing floating-point numbers. */ - TYPED_TEST(LinalgSumHomogeneousValuesTest, DISABLED_Accuracy) { + TYPED_TEST(LinalgSumHomogeneousValuesTest, Accuracy) { TypeParam value = LinalgSumHomogeneousValuesTest::value; int element_number = 10000; unsigned int dtype = Type_class().cy_typeid(value); From e7a9b3e73eebc07ba2a76b5ab89aa69e22f7432d Mon Sep 17 00:00:00 2001 From: Ivana Date: Tue, 9 Jun 2026 15:13:57 +0000 Subject: [PATCH 2/2] test(sum): cover PairwiseSumBlocks branches and heterogeneous magnitudes The existing LinalgSumHomogeneousValuesTest.Accuracy uses element_number = 10000, which only reaches the recursive-split branch of PairwiseSumBlocks, and the equal-magnitude input distribution is the one regime where naive serial summation stays nearly exact -- neither demonstrates nor protects the actual accuracy benefit pairwise summation buys. Add two focused tests: * LinalgSumBoundaryTest.EachPairwiseSumBranch iterates over sizes 1, 7, 8, 9, 15, 128, 129, 137, 1024 and verifies linalg::Sum on a homogeneous-1.0 tensor is exactly N. The sizes straddle the n < 8 / n <= 128 / n > 128 thresholds and include the n % 8 != 0 tails inside the 8-accumulator block, so a regression that drops or double-counts an element in any branch fails immediately rather than only on the n = 10000 recursive case. * LinalgSumHeterogeneousMagnitudeTest.RecoversTermsLostByNaiveAccumulation_* sums [+L, 1, 1, ..., 1, -L] where L is far above the precision threshold (1e16 for double > 2^53, 1e8 for float > 2^23). The exact sum is N - 2. Under naive accumulation the running total reaches L on the first element and every subsequent +1 vanishes in IEEE 754 rounding (1.0 is below the unit in the last place at that magnitude), so naive collapses to ~0. Pairwise keeps small values together in the tree until the +/- L cancellation at the top, so most of the small terms survive. The contract asserted is N/2 < result <= N: tight enough that a serial-accumulation regression collapses to ~0 and fails, loose enough to accommodate the handful of 1's that the unrolled accumulator holding L still rounds away. Co-authored-by: Claude --- tests/linalg_test/sum_test.cpp | 77 ++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/tests/linalg_test/sum_test.cpp b/tests/linalg_test/sum_test.cpp index 23b7a4aed..e222cd2e4 100644 --- a/tests/linalg_test/sum_test.cpp +++ b/tests/linalg_test/sum_test.cpp @@ -64,4 +64,81 @@ namespace cytnx { EXPECT_NUMBER_EQ(sum_result.at({0}), value * static_cast(element_number)); } + + /** + * Exercises every branch of `PairwiseSumBlocks` -- the recursive core that + * `linalg::Sum` dispatches floating-point reductions through. + * + * * n < 8 : straight serial loop + * * 8 <= n <= 128 : 8-accumulator unrolled body, optionally with a scalar + * tail when n % 8 != 0 + * * n > 128 : recursive split into two halves rounded to a multiple + * of 8 + * + * The original Accuracy test only covers the n = 10000 recursive-split case; + * a regression in either of the small-n branches would not be caught there. + * Sizes 7/8/9/15/128/129/137 straddle the thresholds (including the off-by-one + * tail cases). The expected result is exact, so any branch that drops or + * double-counts an element fails immediately. + */ + TEST(LinalgSumBoundaryTest, EachPairwiseSumBranch) { + const cytnx_double value = 1.0; + for (int n : {1, 7, 8, 9, 15, 128, 129, 137, 1024}) { + Tensor tensor(/* shape */ {static_cast(n)}, Type.Double, Device.cpu, + /* init_zero */ false); + tensor.fill(value); + Tensor sum_result = linalg::Sum(tensor); + EXPECT_EQ(sum_result.shape().size(), 1); + EXPECT_EQ(sum_result.shape()[0], 1); + EXPECT_DOUBLE_EQ(sum_result.at({0}), value * static_cast(n)) + << "n=" << n; + } + } + + /** + * The dynamic-range case that motivates pairwise summation, and the one + * input distribution where naive serial accumulation visibly fails. + * + * Both arrays are [+L, 1, 1, ..., 1, -L] with L far above the precision + * threshold (2^53 for double, 2^23 for float); the exact sum is N - 2. Under + * naive accumulation the running total reaches L on the first element and + * the subsequent +1's vanish in IEEE 754 rounding -- 1.0 is below the unit + * in the last place at that magnitude -- so naive returns ~0. Pairwise + * keeps small values together in the tree until the cancellation of +/-L at + * the top, so the small terms survive (modulo a handful of 1's in the + * unrolled-accumulator blocks that hold L itself). + * + * The contract this asserts is qualitative on purpose: any reasonable + * pairwise implementation must land much closer to N - 2 than to 0. A + * serial-accumulation regression would collapse to ~0 and fail + * `EXPECT_GT(result, N / 2)` immediately; the exact pairwise result depends + * on which accumulators receive +/-L (a few small terms are lost there). + */ + TEST(LinalgSumHeterogeneousMagnitudeTest, RecoversTermsLostByNaiveAccumulation_Double) { + constexpr int N = 1024; + Tensor tensor(/* shape */ {static_cast(N)}, Type.Double, Device.cpu, + /* init_zero */ false); + tensor.fill(static_cast(1)); + tensor.at({0}) = 1e16; + tensor.at({N - 1}) = -1e16; + Tensor sum_result = linalg::Sum(tensor); + const cytnx_double result = sum_result.at({0}); + EXPECT_GT(result, static_cast(N / 2)); + EXPECT_LE(result, static_cast(N)); + } + + TEST(LinalgSumHeterogeneousMagnitudeTest, RecoversTermsLostByNaiveAccumulation_Float) { + constexpr int N = 1024; + Tensor tensor(/* shape */ {static_cast(N)}, Type.Float, Device.cpu, + /* init_zero */ false); + tensor.fill(static_cast(1)); + // float has ~7.2 decimal digits of precision; 1e8 already exceeds 2^23, so + // a serial `1e8 + 1` collapses to 1e8 and the unit terms are lost. + tensor.at({0}) = 1e8f; + tensor.at({N - 1}) = -1e8f; + Tensor sum_result = linalg::Sum(tensor); + const cytnx_float result = sum_result.at({0}); + EXPECT_GT(result, static_cast(N / 2)); + EXPECT_LE(result, static_cast(N)); + } } // namespace cytnx