first and maybe the last commit

ceesb · ceesb · commit 4bd62ede3c87 · 2025-03-18T22:51:25.000-07:00
diff --git a/README.md b/README.md
@@ -0,0 +1,40 @@
+# Incremental stats
+
+A few incremental 1st order statistics in numpy. Currently:
+
+- Correlation (Pearson)
+- Covariance
+- Variance
+- Mean
+- Welch-t
+
+## Set up a venv
+
+    mkdir venv
+    virtualenv -p `which python3` venv
+    source venv/bin/activate
+
+## Option 1: Checkout hackable project
+
+    python -m pip install --upgrade pip
+    git clone https://github.com/ceeesb/python-incrementalstats
+    pip install -e python-incrementalstats
+    cd python-incrementalstats
+
+Now all the changes you make to this project's source code are "live".
+
+## Option 2: Build a wheel
+
+    git clone https://github.com/ceeesb/python-incrementalstats
+    cd python-incrementalstats
+    python -m build
+
+Now you can distribute or install the wheel created in the `dist` folder.
+
+    $ ls dist/
+    incrementalstats-0.0.1-py3-none-any.whl  incrementalstats-0.0.1.tar.gz
+    pip install dist/incrementalstats-0.0.1-py3-none-any.whl
+
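+## Run the tests
+
+The Welch-t test checks its result against `scipy`, which is not listed in `requirements.txt`, so install it first:
+
+    pip install scipy
+    python -m unittest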
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,18 @@
+# Declare the build backend explicitly so `python -m build` does not depend on
+# the installer's fallback behaviour; setuptools >= 61 understands [project].
+[build-system]
+requires = ["setuptools>=61"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "incrementalstats"
+version = "0.0.1"
+authors = [
+  { name = "Cees-Bart Breunesse", email = "ceeesb@gmail.com" },
+]
+description = "A few incremental statistics using numpy"
+readme = "README.md"
+requires-python = ">=3.7"
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+]
+# Every module in the package imports numpy, so installing the wheel must pull it in.
+dependencies = [
+    "numpy",
+]
+
+# Same repository as the clone URL given in README.md.
+[project.urls]
+"Homepage" = "https://github.com/ceeesb/python-incrementalstats"
+"Bug Tracker" = "https://github.com/ceeesb/python-incrementalstats/issues"
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1 @@
+numpy
diff --git a/src/incrementalstats/__init__.py b/src/incrementalstats/__init__.py
@@ -0,0 +1,3 @@
+from .mean_var import IncrementalMeanVariance
+from .covariance_correlation import IncrementalCovarianceCorrelation
+from .welcht import IncrementalWelcht
diff --git a/src/incrementalstats/covariance_correlation.py b/src/incrementalstats/covariance_correlation.py
@@ -0,0 +1,97 @@
+from __future__ import annotations
+import numpy as np
+
+from .mean_var import IncrementalMeanVariance
+
+class IncrementalCovarianceCorrelation:
+    """Incrementally computes vectorized covariance and correlation.
+
+    When feeding every row of an MxN matrix A and every row of an MxO matrix B, this code
+    updates a co-variance NxO matrix C where every cell C[r,c] is the covariance between
+    A[:,r] and B[:,c].
+
+    This code is useful when the matrices A and B cannot be present
+    in memory at one given time.
+
+    ```
+    import numpy as np
+    from incrementalstats import IncrementalCovarianceCorrelation
+
+    nX = 100
+    nY1 = 30
+    nY2 = 50
+
+    m1 = np.random.randn(nX, nY1)
+    m2 = np.random.randn(nX, nY2)
+
+    ic = IncrementalCovarianceCorrelation(nY1, nY2)
+
+    for row in range(nX):
+        ic.update(m1[row,:], m2[row,:])
+
+    reference_covariance = (1 / (nX - 1)) * np.matmul((m1 - np.mean(m1, axis=0)).T, (m2 - np.mean(m2, axis=0)))
+    reference_m1_stddev = np.std(m1, axis=0, ddof=1)
+    reference_m2_stddev = np.std(m2, axis=0, ddof=1)
+    reference_1_over_m1_stddev = (1 / reference_m1_stddev).reshape(reference_m1_stddev.size, 1)
+    reference_1_over_m2_stddev = (1 / reference_m2_stddev).reshape(1, reference_m2_stddev.size)
+    reference_correlation = reference_covariance * reference_1_over_m1_stddev * reference_1_over_m2_stddev
+
+    assert (np.allclose(ic.getCovariance(), reference_covariance))
+    assert (np.allclose(ic.getCorrelation(), reference_correlation))
+
+    ```
+    """
+
+    def __init__(self, nX, nY):
+        """Initialize with the #columns for matrices A and B"""
+        self.nX = nX
+        self.nY = nY
+        self.imX = IncrementalMeanVariance(nX)
+        self.imY = IncrementalMeanVariance(nY)
+        # Unscaled co-moment sum; getCovariance() divides it by (n - 1).
+        self.cov = np.zeros((nX, nY), dtype=np.float64)
+        self.n = 0
+
+    def update(self, x, y):
+        """Updates the covariance matrix with a single row of matrix A and a single row of matrix B.
+
+        Raises ValueError when len(x) or len(y) does not match the column counts
+        given to the constructor.
+        """
+        if len(x) != self.nX:
+            raise ValueError("wrong x length")
+        if len(y) != self.nY:
+            raise ValueError("wrong y length")
+
+        self.n += 1
+        f = (self.n - 1) / self.n
+
+        # Deviations are taken against the means of the rows seen *before* this
+        # one, so the co-moment must be updated before the means are.
+        mfX = (x - self.imX.mean) * f
+        mfY = y - self.imY.mean
+
+        self.cov += np.tensordot(mfX, mfY, axes=0)
+
+        self.imX.update(x)
+        self.imY.update(y)
+
+    def add(self, x: "IncrementalCovarianceCorrelation"):
+        """Merges another object of IncrementalCovarianceCorrelation into this co-variance matrix. This is useful in
+        parallelized computations, where different nodes compute co-variances over different
+        ranges of rows.
+
+        Merging an accumulator that has seen no rows is a no-op. Raises ValueError
+        when the two accumulators were created with different column counts.
+        """
+        if x.nX != self.nX or x.nY != self.nY:
+            raise ValueError("wrong dimensions")
+        if x.n == 0:
+            # Nothing to merge; this also avoids 0/0 when both sides are empty.
+            return
+
+        n = self.n + x.n
+        # Weight of the cross term between the two partial means.
+        f = self.n * x.n / n
+
+        deltaX = self.imX.mean - x.imX.mean
+        deltaX = deltaX.reshape(deltaX.size, 1) * f
+
+        deltaY = self.imY.mean - x.imY.mean
+        deltaY = deltaY.reshape(1, deltaY.size)
+
+        # The means are read above, so they must be merged only after the co-moment.
+        self.cov += x.cov + deltaX * deltaY
+        self.n = n
+        self.imX.add(x.imX)
+        self.imY.add(x.imY)
+
+    def getCovariance(self):
+        """Returns the scaled co-variance matrix with 1 degree of freedom.
+
+        Raises ValueError when fewer than 2 rows have been seen.
+        """
+        if self.n < 2:
+            raise ValueError("not enough data")
+        return 1 / (self.n - 1) * self.cov
+
+    def getCorrelation(self):
+        """Returns Pearson's correlation matrix.
+
+        Raises ValueError when fewer than 2 rows have been seen. Columns with
+        zero variance are divided by zero and yield inf/nan entries.
+        """
+        cov = self.getCovariance()
+        sX = 1 / np.sqrt(self.imX.getVariance())
+        sX = sX.reshape(sX.size, 1)
+        sY = 1 / np.sqrt(self.imY.getVariance())
+        sY = sY.reshape(1, sY.size)
+        return cov * sX * sY
diff --git a/src/incrementalstats/mean_var.py b/src/incrementalstats/mean_var.py
@@ -0,0 +1,74 @@
+from __future__ import annotations
+import numpy as np
+
+class IncrementalMeanVariance:
+    """Incrementally computes vectorized mean and variance.
+
+    When feeding every row of an MxN matrix, this code
+    computes the mean/variance over axis 0 of this matrix, as
+    demonstrated in the example below.
+
+    This code is useful when the matrix cannot be present
+    in memory at one given time.
+
+    ```
+    import numpy as np
+    from incrementalstats import IncrementalMeanVariance
+
+    nX = 100
+    nY = 10
+    m = np.random.randn(nX, nY)
+
+    im = IncrementalMeanVariance(nY)
+    for row in range(nX):
+        im.update(m[row,:])
+
+    reference_mean = m.mean(axis=0)
+    reference_variance = m.var(axis=0, ddof=1)
+    assert np.allclose(im.getMean(), reference_mean)
+    assert np.allclose(im.getVariance(), reference_variance)
+    ```
+
+    """
+
+    def __init__(self, ncolumns):
+        """Initialize with the #columns of the hypothetical matrix M over
+        which we will compute the mean / variance"""
+        self.nX = ncolumns
+        self.mean = np.zeros(ncolumns, dtype=np.float64)
+        # Running sum of squared deviations; getVariance() divides it by (n - 1).
+        self.var = np.zeros(ncolumns, dtype=np.float64)
+        self.n = 0
+
+    def add(self, other: "IncrementalMeanVariance"):
+        """Merges another object of IncrementalMeanVariance into this mean/variance. This is useful in
+        parallelized computations, where different nodes compute mean/variance over different
+        ranges of rows.
+
+        Merging an accumulator that has seen no rows is a no-op. Raises ValueError
+        when the two accumulators were created with different column counts.
+        """
+        if other.nX != self.nX:
+            raise ValueError("wrong length")
+        if other.n == 0:
+            # Nothing to merge; without this guard two empty accumulators
+            # would compute 0/0 and turn mean and var into NaN.
+            return
+
+        n = self.n + other.n
+        delta = other.mean - self.mean
+        self.mean += other.n * (delta / n)
+        self.var += other.var + self.n * other.n * delta ** 2 / n
+        self.n = n
+
+    def update(self, row):
+        """Updates the mean/variance with a single row.
+
+        Raises ValueError when len(row) differs from the column count given
+        to the constructor.
+        """
+        if len(row) != self.nX:
+            raise ValueError("wrong length")
+
+        self.n += 1
+        # Deviation from the old mean, then from the updated mean; their
+        # product is the increment of the sum of squared deviations.
+        before = row - self.mean
+        self.mean += before / self.n
+        after = row - self.mean
+        self.var += before * after
+
+    def getMean(self):
+        """Returns a copy of the current mean"""
+        return self.mean.copy()
+
+    def getVariance(self):
+        """Returns the current variance with 1 degree of freedom.
+
+        Raises ValueError when fewer than 2 rows have been seen.
+        """
+        if self.n < 2:
+            raise ValueError("not enough data")
+        return 1 / (self.n - 1) * self.var
diff --git a/src/incrementalstats/welcht.py b/src/incrementalstats/welcht.py
@@ -0,0 +1,50 @@
+# function tval(m1, m2, v1, v2, n1, n2)
+#     x = (m1 - m2) / sqrt((v1 / n1) + (v2 / n2) + eps(0.0))
+#     if isnan(x)
+#         return 0.0
+#     else
+#         return x
+#     end
+# end
+
+
+from __future__ import annotations
+import numpy as np
+
+from .mean_var import IncrementalMeanVariance
+
+class IncrementalWelcht:
+    """
+    Incremental Welch-t between 2 groups.
+
+    Rows are fed one at a time together with the group they belong to; a
+    separate running mean/variance is kept per group, and getWelcht() combines
+    them into one t value per column.
+    """
+
+    def __init__(self, nsamples):
+        """Initialize with the #samples we're computing Welch-t over"""
+        self.mv0 = IncrementalMeanVariance(nsamples)
+        self.mv1 = IncrementalMeanVariance(nsamples)
+        # Total number of rows seen across both groups.
+        self.n = 0
+
+    def add(self, other: "IncrementalWelcht"):
+        """Merges another IncrementalWelcht into this one, group by group.
+
+        A group for which `other` has no rows is skipped, so merging partial
+        results that never saw one of the groups does not disturb the
+        statistics accumulated here.
+        """
+        if other.mv0.n:
+            self.mv0.add(other.mv0)
+        if other.mv1.n:
+            self.mv1.add(other.mv1)
+        self.n = self.mv0.n + self.mv1.n
+
+    def update(self, group, row):
+        """Adds a single row to group 0 when `group == 0`, otherwise to group 1."""
+        if group == 0:
+            self.mv0.update(row)
+        else:
+            self.mv1.update(row)
+        self.n = self.mv0.n + self.mv1.n
+
+    def getWelcht(self):
+        """Returns the Welch-t value per column, group 0 versus group 1.
+
+        Each group needs at least 2 rows; otherwise the underlying
+        getVariance() call raises.
+        """
+        m0 = self.mv0.getMean()
+        v0 = self.mv0.getVariance()
+        n0 = self.mv0.n
+        m1 = self.mv1.getMean()
+        v1 = self.mv1.getVariance()
+        n1 = self.mv1.n
+
+        # The small constant keeps the denominator non-zero when both groups
+        # have zero variance in a column.
+        x = (m0 - m1) / np.sqrt((v0 / n0) + (v1 / n1) + 1e-12)
+
+        # Any remaining NaN is reported as 0.
+        x = np.nan_to_num(x, copy=False)
+
+        return x
diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/test_incremental_stats.py b/tests/test_incremental_stats.py
@@ -0,0 +1,73 @@
+import unittest
+import numpy as np
+
+from incrementalstats import IncrementalMeanVariance, IncrementalCovarianceCorrelation
+
+class IncrementalMeanVarianceTest(unittest.TestCase):
+    """Checks IncrementalMeanVariance against numpy, both streamed and merged."""
+
+    def testCorrectness(self):
+        rows, cols = 10000, 10
+        data = np.random.randn(rows, cols)
+        expected_mean = data.mean(axis=0)
+        expected_variance = data.var(axis=0, ddof=1)
+
+        # One accumulator sees every row; the other two each see one half.
+        whole = IncrementalMeanVariance(cols)
+        first_half = IncrementalMeanVariance(cols)
+        second_half = IncrementalMeanVariance(cols)
+        midpoint = rows // 2
+        for index, sample in enumerate(data):
+            whole.update(sample)
+            part = first_half if index < midpoint else second_half
+            part.update(sample)
+
+        self.assertTrue(np.allclose(whole.getMean(), expected_mean))
+        self.assertTrue(np.allclose(whole.getVariance(), expected_variance))
+
+        # Merging the two halves must give the same result as streaming all rows.
+        first_half.add(second_half)
+
+        self.assertTrue(np.allclose(first_half.getMean(), expected_mean))
+        self.assertTrue(np.allclose(first_half.getVariance(), expected_variance))
+
+class IncrementalCovarianceCorrelationTest(unittest.TestCase):
+    """Checks IncrementalCovarianceCorrelation against numpy, both streamed and merged."""
+
+    def testCorrectness(self):
+        rows, cols_a, cols_b = 10000, 3, 5
+
+        a = np.random.randn(rows, cols_a)
+        b = np.random.randn(rows, cols_b)
+
+        # Reference values computed directly from the full matrices.
+        centered_a = a - np.mean(a, axis=0)
+        centered_b = b - np.mean(b, axis=0)
+        expected_covariance = (1 / (rows - 1)) * np.matmul(centered_a.T, centered_b)
+        inverse_std_a = (1 / np.std(a, axis=0, ddof=1)).reshape(cols_a, 1)
+        inverse_std_b = (1 / np.std(b, axis=0, ddof=1)).reshape(1, cols_b)
+        expected_correlation = expected_covariance * inverse_std_a * inverse_std_b
+
+        # One accumulator sees every row; the other two each see one half.
+        whole = IncrementalCovarianceCorrelation(cols_a, cols_b)
+        first_half = IncrementalCovarianceCorrelation(cols_a, cols_b)
+        second_half = IncrementalCovarianceCorrelation(cols_a, cols_b)
+        midpoint = rows // 2
+        for index, (row_a, row_b) in enumerate(zip(a, b)):
+            whole.update(row_a, row_b)
+            part = first_half if index < midpoint else second_half
+            part.update(row_a, row_b)
+
+        self.assertTrue(np.allclose(whole.getCovariance(), expected_covariance))
+        self.assertTrue(np.allclose(whole.getCorrelation(), expected_correlation))
+
+        # Merging the two halves must give the same result as streaming all rows.
+        first_half.add(second_half)
+
+        self.assertTrue(np.allclose(first_half.getCovariance(), expected_covariance))
+        self.assertTrue(np.allclose(first_half.getCorrelation(), expected_correlation))
+
+        # The per-matrix mean/variance trackers must agree after the merge too.
+        self.assertTrue(np.allclose(first_half.imX.getMean(), whole.imX.getMean()))
+        self.assertTrue(np.allclose(first_half.imY.getMean(), whole.imY.getMean()))
+        self.assertTrue(np.allclose(first_half.imX.getVariance(), whole.imX.getVariance()))
+        self.assertTrue(np.allclose(first_half.imY.getVariance(), whole.imY.getVariance()))
+
+        
+# Allow running this test module directly as a script, with verbose per-test output.
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/tests/test_welcht.py b/tests/test_welcht.py
@@ -0,0 +1,29 @@
+import unittest
+import numpy as np
+import scipy
+
+from incrementalstats import IncrementalWelcht
+
+class IncrementalWelchtTest(unittest.TestCase):
+    """Checks IncrementalWelcht against scipy's Welch t-test."""
+
+    def testCorrectness(self):
+        # Import the submodule explicitly: a bare `import scipy` is not
+        # guaranteed to expose `scipy.stats` on every SciPy release.
+        from scipy import stats
+
+        nsamples = 5
+        x = np.random.randn(100, nsamples)
+        y = np.random.randn(110, nsamples)
+
+        # equal_var=False selects Welch's t-test; only the statistic is compared.
+        t_stat, _ = stats.ttest_ind(x, y, equal_var=False)
+
+        iw = IncrementalWelcht(nsamples)
+
+        for v in x:
+            iw.update(0, v)
+
+        for v in y:
+            iw.update(1, v)
+
+        t_stat2 = iw.getWelcht()
+
+        self.assertTrue(np.allclose(t_stat, t_stat2))
+
+
+# Allow running this test module directly as a script, with verbose per-test output.
+if __name__ == "__main__":
+    unittest.main(verbosity=2)

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+from .mean_var import IncrementalMeanVariance`
	`2`	`+from .covariance_correlation import IncrementalCovarianceCorrelation`
	`3`	`+from .welcht import IncrementalWelcht`