Skip to content

Commit 4bd62ed

Browse files
committed
first and maybe the last commit
1 parent 56d6aeb commit 4bd62ed

10 files changed

Lines changed: 385 additions & 0 deletions

File tree

README.md

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Incremental stats
2+
3+
A few incremental (streaming) statistics implemented with numpy. Currently:
4+
5+
- Correlation (Pearson)
6+
- Covariance
7+
- Variance
8+
- Mean
9+
- Welch-t
10+
11+
## Setup a venv
12+
13+
mkdir venv
14+
virtualenv -p `which python3` venv
15+
source venv/bin/activate
16+
17+
## Option 1: Checkout hackable project
18+
19+
python -m pip install --upgrade pip
20+
git clone https://github.com/ceeesb/python-incrementalstats
21+
pip install -e python-incrementalstats
22+
cd python-incrementalstats
23+
24+
Now all the changes you make to this project source code are "live".
25+
26+
## Option 2: Build a wheel
27+
28+
git clone https://github.com/ceeesb/python-incrementalstats
29+
cd python-incrementalstats
30+
python -m build
31+
32+
Now you can distribute or install the wheel created in the `dist` folder.
33+
34+
$ ls dist/
35+
incrementalstats-0.0.1-py3-none-any.whl incrementalstats-0.0.1.tar.gz
36+
pip install dist/incrementalstats-0.0.1-py3-none-any.whl
37+
38+
## Run the tests
39+
40+
python -m unittest

pyproject.toml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
[project]
2+
name = "incrementalstats"
3+
version = "0.0.1"
4+
authors = [
5+
{ name = "Cees-Bart Breunesse", email = "ceeesb@gmail.com" },
6+
]
7+
description = "A few incremental statistics using numpy"
8+
readme = "README.md"
9+
requires-python = ">=3.7"
10+
classifiers = [
11+
"Programming Language :: Python :: 3",
12+
"License :: OSI Approved :: MIT License",
13+
"Operating System :: OS Independent",
14+
]
15+
16+
[project.urls]
17+
"Homepage" = "https://github.com/xxx/pyincrementalstats"
18+
"Bug Tracker" = "https://github.com/xxx/pyincrementalstats/issues"

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
numpy

src/incrementalstats/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .mean_var import IncrementalMeanVariance
2+
from .covariance_correlation import IncrementalCovarianceCorrelation
3+
from .welcht import IncrementalWelcht
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
from __future__ import annotations
2+
import numpy as np
3+
4+
from .mean_var import IncrementalMeanVariance
5+
6+
class IncrementalCovarianceCorrelation:
    """Incrementally computes vectorized covariance and correlation.

    When feeding every row of an MxN matrix A and every row of an MxO matrix B,
    this class maintains an NxO co-moment matrix from which the covariance
    matrix C is derived, where every cell C[r, c] is the covariance between
    A[:, r] and B[:, c].

    This is useful when the matrices A and B cannot be held in memory in
    their entirety at one time.

    ```
    import numpy as np
    from incrementalstats import IncrementalCovarianceCorrelation

    nX = 100
    nY1 = 30
    nY2 = 50

    m1 = np.random.randn(nX, nY1)
    m2 = np.random.randn(nX, nY2)

    ic = IncrementalCovarianceCorrelation(nY1, nY2)

    for row in range(nX):
        ic.update(m1[row, :], m2[row, :])

    reference_covariance = (1 / (nX - 1)) * np.matmul(
        (m1 - np.mean(m1, axis=0)).T, (m2 - np.mean(m2, axis=0)))
    reference_m1_stddev = np.std(m1, axis=0, ddof=1)
    reference_m2_stddev = np.std(m2, axis=0, ddof=1)
    reference_1_over_m1_stddev = (1 / reference_m1_stddev).reshape(reference_m1_stddev.size, 1)
    reference_1_over_m2_stddev = (1 / reference_m2_stddev).reshape(1, reference_m2_stddev.size)
    reference_correlation = (
        reference_covariance * reference_1_over_m1_stddev * reference_1_over_m2_stddev)

    assert np.allclose(ic.getCovariance(), reference_covariance)
    assert np.allclose(ic.getCorrelation(), reference_correlation)
    ```
    """

    def __init__(self, nX, nY):
        """Initialize with the number of columns of matrices A (nX) and B (nY)."""
        self.nX = nX
        self.nY = nY
        # Per-column running mean / squared-deviation sums for A and B.
        self.imX = IncrementalMeanVariance(nX)
        self.imY = IncrementalMeanVariance(nY)
        # Unscaled co-moment matrix; it is divided by (n - 1) only when
        # read through getCovariance() / getCorrelation().
        self.cov = np.zeros((nX, nY), dtype=np.float64)
        # Number of row pairs consumed so far.
        self.n = 0

    def update(self, x, y):
        """Update the co-moment matrix with one row of A and one row of B.

        Args:
            x: row of matrix A, of length nX.
            y: row of matrix B, of length nY.

        Raises:
            ValueError: if x or y does not have the configured length.
                Nothing is modified in that case.
        """
        if len(x) != self.nX:
            raise ValueError("wrong x length")
        if len(y) != self.nY:
            raise ValueError("wrong y length")

        self.n += 1
        f = (self.n - 1) / self.n

        # Deviations are taken against the means *before* this row is
        # folded in; the (n - 1) / n factor accounts for the mean shift.
        mfX = (x - self.imX.mean) * f
        mfY = y - self.imY.mean

        # axes=0 yields the outer product, an (nX, nY) matrix.
        self.cov += np.tensordot(mfX, mfY, axes=0)

        self.imX.update(x)
        self.imY.update(y)

    def add(self, x: "IncrementalCovarianceCorrelation"):
        """Merge another accumulator into this one.

        This is useful in parallelized computations, where different nodes
        compute co-variances over different ranges of rows.

        Args:
            x: accumulator built with the same nX / nY as this one. It is
                not modified.

        Raises:
            ValueError: if the column counts of the two accumulators differ.
        """
        if x.nX != self.nX or x.nY != self.nY:
            raise ValueError("dimension mismatch")
        if x.n == 0:
            # Nothing to merge; this also avoids dividing by a zero total
            # count when both accumulators are still empty.
            return

        n = self.n + x.n
        # Equal to (n_a * n_b**2 + n_b * n_a**2) / n**2, simplified.
        f = self.n * x.n / n

        deltaX = (self.imX.mean - x.imX.mean).reshape(-1, 1) * f
        deltaY = (self.imY.mean - x.imY.mean).reshape(1, -1)

        self.cov += x.cov + deltaX * deltaY
        self.n = n
        self.imX.add(x.imX)
        self.imY.add(x.imY)

    def getCovariance(self):
        """Return the covariance matrix scaled by 1 / (n - 1).

        Raises:
            ValueError: if fewer than two rows have been consumed.
        """
        if self.n < 2:
            raise ValueError("not enough data")
        return 1 / (self.n - 1) * self.cov

    def getCorrelation(self):
        """Return Pearson's correlation matrix.

        Columns with zero variance divide by zero here, so the matching
        cells come out as inf / nan.

        Raises:
            ValueError: if fewer than two rows have been consumed.
        """
        if self.n < 2:
            raise ValueError("not enough data")
        sX = (1 / np.sqrt(self.imX.getVariance())).reshape(-1, 1)
        sY = (1 / np.sqrt(self.imY.getVariance())).reshape(1, -1)
        return 1 / (self.n - 1) * self.cov * sX * sY

src/incrementalstats/mean_var.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
from __future__ import annotations
2+
import numpy as np
3+
4+
class IncrementalMeanVariance:
    """Incrementally computes vectorized mean and variance.

    When feeding every row of an MxN matrix, this code
    computes the mean/variance over axis 0 of this matrix, as
    demonstrated in the example below.

    This is useful when the matrix cannot be held in memory in its
    entirety at one time.

    ```
    import numpy as np
    from incrementalstats import IncrementalMeanVariance

    nX = 100
    nY = 10
    m = np.random.randn(nX, nY)

    im = IncrementalMeanVariance(nY)
    for row in range(nX):
        im.update(m[row, :])

    reference_mean = m.mean(axis=0)
    reference_variance = m.var(axis=0, ddof=1)
    assert np.allclose(im.getMean(), reference_mean)
    assert np.allclose(im.getVariance(), reference_variance)
    ```
    """

    def __init__(self, ncolumns):
        """Initialize with the number of columns of the hypothetical matrix M
        over which the mean / variance will be computed."""
        self.nX = ncolumns
        self.mean = np.zeros(ncolumns, dtype=np.float64)
        # Despite its name this holds the running sum of squared deviations
        # from the mean; getVariance() applies the 1 / (n - 1) scaling.
        self.var = np.zeros(ncolumns, dtype=np.float64)
        # Number of rows consumed so far.
        self.n = 0

    def add(self, other: "IncrementalMeanVariance"):
        """Merge another accumulator into this one.

        This is useful in parallelized computations, where different nodes
        compute mean/variance over different ranges of rows.

        Args:
            other: accumulator with the same number of columns. It is not
                modified.

        Raises:
            ValueError: if the column counts of the two accumulators differ.
        """
        if other.nX != self.nX:
            raise ValueError("wrong length")
        if other.n == 0:
            # Nothing to merge. Without this guard, merging two empty
            # accumulators evaluates 0 / 0 and poisons the mean with NaN.
            return

        n = self.n + other.n
        delta = other.mean - self.mean
        self.mean += other.n * (delta / n)
        self.var += other.var + self.n * other.n * delta ** 2 / n
        self.n = n

    def update(self, row):
        """Update the mean/variance with a single row.

        Args:
            row: sequence of length nX.

        Raises:
            ValueError: if row does not have the configured length.
        """
        if len(row) != self.nX:
            raise ValueError("wrong length")

        self.n += 1
        # Deviation from the mean before and after folding the row in; their
        # product is this row's contribution to the squared-deviation sum.
        before = row - self.mean
        self.mean += before / self.n
        after = row - self.mean
        self.var += before * after

    def getMean(self):
        """Return a copy of the current mean."""
        return self.mean.copy()

    def getVariance(self):
        """Return the current variance, scaled by 1 / (n - 1).

        Raises:
            ValueError: if fewer than two rows have been consumed.
        """
        if self.n < 2:
            raise ValueError("not enough data")
        return 1 / (self.n - 1) * self.var

src/incrementalstats/welcht.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# function tval(m1, m2, v1, v2, n1, n2)
2+
# x = (m1 - m2) / sqrt((v1 / n1) + (v2 / n2) + eps(0.0))
3+
# if isnan(x)
4+
# return 0.0
5+
# else
6+
# return x
7+
# end
8+
# end
9+
10+
11+
from __future__ import annotations
12+
import numpy as np
13+
14+
from .mean_var import IncrementalMeanVariance
15+
16+
class IncrementalWelcht:
    """Incremental Welch-t between 2 groups.

    Rows are fed one at a time together with a group label. Each group keeps
    its own running mean/variance, from which the per-column t-statistic is
    derived on demand.
    """

    def __init__(self, nsamples):
        """Initialize with the number of samples (columns) per row."""
        self.mv0 = IncrementalMeanVariance(nsamples)
        self.mv1 = IncrementalMeanVariance(nsamples)
        # Total number of rows consumed across both groups.
        self.n = 0

    def add(self, other: "IncrementalWelcht"):
        """Merge another accumulator into this one, group by group.

        Args:
            other: accumulator built with the same number of samples.
        """
        self.mv0.add(other.mv0)
        self.mv1.add(other.mv1)
        self.n = self.mv0.n + self.mv1.n

    def update(self, group, row):
        """Feed a single row into one of the two groups.

        Args:
            group: 0 selects the first group; any other value selects the
                second group.
            row: sequence with one value per sample.
        """
        if group == 0:
            self.mv0.update(row)
        else:
            self.mv1.update(row)
        self.n = self.mv0.n + self.mv1.n

    def getWelcht(self, eps=1e-12):
        """Return the per-sample Welch t-statistic of group 0 versus group 1.

        Args:
            eps: small constant added under the square root so that two
                zero-variance groups do not divide by zero.

        Returns:
            Array with one t-value per sample; NaN entries are replaced by 0.

        Raises:
            Exception: "not enough data", propagated from
                IncrementalMeanVariance.getVariance when either group holds
                fewer than two rows.
        """
        m0 = self.mv0.getMean()
        v0 = self.mv0.getVariance()
        n0 = self.mv0.n
        m1 = self.mv1.getMean()
        v1 = self.mv1.getVariance()
        n1 = self.mv1.n

        t = (m0 - m1) / np.sqrt((v0 / n0) + (v1 / n1) + eps)

        # t is a fresh array, so it is safe to sanitise it in place.
        return np.nan_to_num(t, copy=False)

tests/__init__.py

Whitespace-only changes.

tests/test_incremental_stats.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
import unittest
2+
import numpy as np
3+
4+
from incrementalstats import IncrementalMeanVariance, IncrementalCovarianceCorrelation
5+
6+
class IncrementalMeanVarianceTest(unittest.TestCase):
    """Checks IncrementalMeanVariance against numpy's batch mean/variance."""

    def testCorrectness(self):
        """Row-by-row updates and a two-way merge must both match numpy."""
        nX = 10000
        nY = 10
        # Seeded so that a failure can be reproduced.
        rng = np.random.default_rng(0)
        m = rng.standard_normal((nX, nY))
        numpy_mean = m.mean(axis=0)
        numpy_variance = m.var(axis=0, ddof=1)

        im = IncrementalMeanVariance(nY)
        im1 = IncrementalMeanVariance(nY)
        im2 = IncrementalMeanVariance(nY)
        half = nX // 2
        for index, row in enumerate(m):
            im.update(row)
            # First half goes to im1, second half to im2, to exercise add().
            if index < half:
                im1.update(row)
            else:
                im2.update(row)

        # Tolerances match np.allclose defaults.
        tol = dict(rtol=1e-5, atol=1e-8)
        np.testing.assert_allclose(im.getMean(), numpy_mean, **tol)
        np.testing.assert_allclose(im.getVariance(), numpy_variance, **tol)

        im1.add(im2)

        np.testing.assert_allclose(im1.getMean(), numpy_mean, **tol)
        np.testing.assert_allclose(im1.getVariance(), numpy_variance, **tol)
31+
32+
class IncrementalCovarianceCorrelationTest(unittest.TestCase):
    """Checks IncrementalCovarianceCorrelation against a numpy batch reference."""

    def testCorrectness(self):
        """Row-by-row updates and a two-way merge must both match numpy."""
        nX = 10000
        nY1 = 3
        nY2 = 5

        # Seeded so that a failure can be reproduced.
        rng = np.random.default_rng(0)
        m1 = rng.standard_normal((nX, nY1))
        m2 = rng.standard_normal((nX, nY2))

        numpy_covariance = (1 / (nX - 1)) * np.matmul(
            (m1 - np.mean(m1, axis=0)).T, (m2 - np.mean(m2, axis=0)))
        numpy_m1_stddev = np.std(m1, axis=0, ddof=1)
        numpy_m2_stddev = np.std(m2, axis=0, ddof=1)
        numpy_1_over_m1_stddev = (1 / numpy_m1_stddev).reshape(numpy_m1_stddev.size, 1)
        numpy_1_over_m2_stddev = (1 / numpy_m2_stddev).reshape(1, numpy_m2_stddev.size)
        numpy_correlation = numpy_covariance * numpy_1_over_m1_stddev * numpy_1_over_m2_stddev

        ic = IncrementalCovarianceCorrelation(nY1, nY2)
        ic1 = IncrementalCovarianceCorrelation(nY1, nY2)
        ic2 = IncrementalCovarianceCorrelation(nY1, nY2)

        half = nX // 2
        for index in range(nX):
            row1 = m1[index, :]
            row2 = m2[index, :]
            ic.update(row1, row2)
            # First half goes to ic1, second half to ic2, to exercise add().
            if index < half:
                ic1.update(row1, row2)
            else:
                ic2.update(row1, row2)

        # Tolerances match np.allclose defaults.
        tol = dict(rtol=1e-5, atol=1e-8)
        np.testing.assert_allclose(ic.getCovariance(), numpy_covariance, **tol)
        np.testing.assert_allclose(ic.getCorrelation(), numpy_correlation, **tol)

        ic1.add(ic2)

        np.testing.assert_allclose(ic1.getCovariance(), numpy_covariance, **tol)
        np.testing.assert_allclose(ic1.getCorrelation(), numpy_correlation, **tol)

        # The merged accumulator's per-matrix statistics must agree as well.
        np.testing.assert_allclose(ic1.imX.getMean(), ic.imX.getMean(), **tol)
        np.testing.assert_allclose(ic1.imY.getMean(), ic.imY.getMean(), **tol)
        np.testing.assert_allclose(ic1.imX.getVariance(), ic.imX.getVariance(), **tol)
        np.testing.assert_allclose(ic1.imY.getVariance(), ic.imY.getVariance(), **tol)
71+
72+
# Allow running this test module directly, with per-test output.
if __name__ == "__main__":
    unittest.main(verbosity=2)

tests/test_welcht.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import unittest
2+
import numpy as np
3+
import scipy
4+
5+
from incrementalstats import IncrementalWelcht
6+
7+
class IncrementalWelchtTest(unittest.TestCase):
    """Checks IncrementalWelcht against scipy's Welch t-test."""

    def testCorrectness(self):
        """The incremental t-statistic must match scipy.stats.ttest_ind."""
        # Import the submodule explicitly: a bare `import scipy` does not
        # guarantee that `scipy.stats` is loaded.
        from scipy import stats

        nsamples = 5
        # Seeded so that a failure can be reproduced.
        rng = np.random.default_rng(0)
        x = rng.standard_normal((100, nsamples))
        y = rng.standard_normal((110, nsamples))

        # equal_var=False selects Welch's t-test.
        t_stat, _ = stats.ttest_ind(x, y, equal_var=False)

        iw = IncrementalWelcht(nsamples)

        for v in x:
            iw.update(0, v)

        for v in y:
            iw.update(1, v)

        t_stat2 = iw.getWelcht()

        # Tolerances match np.allclose defaults.
        np.testing.assert_allclose(t_stat2, t_stat, rtol=1e-5, atol=1e-8)
26+
27+
28+
# Allow running this test module directly, with per-test output.
if __name__ == "__main__":
    unittest.main(verbosity=2)

0 commit comments

Comments
 (0)