Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,20 @@ jobs:
with:
submodules: recursive

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'

- name: Install dependencies (Linux)
if: runner.os == 'Linux'
run: |
sudo apt-get update
sudo apt-get install -y g++ cmake python3

- name: Install dependencies
run: |
python3 -m pip install --upgrade pip
python3 -m pip install pytest numpy

- name : Build project
run: |
Expand All @@ -33,3 +42,8 @@ jobs:
run: |
cd build
ctest --output-on-failure

- name : Test python module
run: |
cd mcmpy/tests
pytest
7 changes: 7 additions & 0 deletions docs/api/data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,13 @@ to infer either the best minimally complex (MCM) or the optimal basis representa

The actual number of datapoints in the dataset (read-only).

.. py:attribute:: N_synthetic
:type: int

The synthetic number of datapoints in the dataset.

This attribute can be changed to analyze the dataset as if it were either larger or smaller than its actual size.

.. py:attribute:: N_unique
:type: int

Expand Down
9 changes: 9 additions & 0 deletions include/data/dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,14 @@ class Data {
*/
Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>& _dataset, int n_var, int n_states, int n_samples);

/**
* Change the value for the number of datapoints in the dataset that is used for analysis.
* This can be used to analyze the dataset as if it were larger or smaller than its actual size.
*
* @param n_datapoints The synthetic number of datapoints in the dataset.
*/
void set_N_synthetic(int n_datapoints);

/**
* Calculate the entropy of the dataset.
*
Expand Down Expand Up @@ -128,6 +136,7 @@ class Data {
int N; // Number of datapoints
int N_unique; // Number of different datapoints
int n_ints; // Number of 128bit integers necessary to represent the data
int N_synthetic; // The synthetic number of datapoints in the dataset

std::vector<__uint128_t> pow_q; // Vector containing the first n powers of q used to speed up the calculation of the evidence
};
5 changes: 4 additions & 1 deletion mcmpy/include/py_dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

namespace py = pybind11;

std::vector<__uint128_t> convert_spin_op_from_py(const py::array_t<uint8_t>& spin_op, int q, int n_ints);
std::vector<__uint128_t> convert_spin_op_from_py(const py::array_t<uint8_t>& spin_op, int q, int n_ints, int n);

class PyData {
public:
Expand Down Expand Up @@ -65,9 +65,12 @@ class PyData {

int get_n() {return this->data.n;};
int get_N() {return this->data.N;};
int get_N_synthetic() {return this->data.N_synthetic;};
int get_N_unique() {return this->data.N_unique;};
int get_q() {return this->data.q;};

void set_N_synthetic(int n_datapoints) {this->data.set_N_synthetic(n_datapoints);};

Data data;
};

Expand Down
14 changes: 8 additions & 6 deletions mcmpy/src/py_dataset.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "py_dataset.h"

std::vector<__uint128_t> convert_spin_op_from_py(const py::array_t<uint8_t>& spin_op, int q, int n_ints){
std::vector<__uint128_t> convert_spin_op_from_py(const py::array_t<uint8_t>& spin_op, int q, int n_ints, int n){
py::buffer_info buff = spin_op.request();

// Check if there is only one dimension
Expand All @@ -9,9 +9,9 @@ std::vector<__uint128_t> convert_spin_op_from_py(const py::array_t<uint8_t>& spi
throw std::invalid_argument("The spin operator should be given as a 1D numpy array.");
}
// Check if the system size is valid
int n = buff.shape[0];
if (n > 128){
throw std::invalid_argument("The maximum system size is 128.");
int n_entries = buff.shape[0];
if (n_entries != n){
throw std::invalid_argument("The given spin operator doesn't contain n elements.");
}

std::vector<uint8_t> conv_spin_op(n, 0);
Expand Down Expand Up @@ -154,7 +154,7 @@ double PyData::entropy(int base){
}

double PyData::entropy_of_spin_op(const py::array_t<int8_t>& op){
std::vector<__uint128_t> spin_op = convert_spin_op_from_py(op, this->data.q, this->data.n_ints);
std::vector<__uint128_t> spin_op = convert_spin_op_from_py(op, this->data.q, this->data.n_ints, this->data.n);
return calc_entropy_of_spin_op(this->data, spin_op);
}

Expand Down Expand Up @@ -187,8 +187,10 @@ void bind_data_class(py::module &m) {

.def("entropy", &PyData::entropy, py::arg("base") = -1)
.def("entropy_of_spin_operator", &PyData::entropy_of_spin_op, py::arg("spin_op"))

.def_property_readonly("n", &PyData::get_n)
.def_property_readonly("q", &PyData::get_q)
.def_property_readonly("N", &PyData::get_N)
.def_property_readonly("N_unique", &PyData::get_N_unique);
.def_property_readonly("N_unique", &PyData::get_N_unique)
.def_property("N_synthetic", &PyData::get_N_synthetic, &PyData::set_N_synthetic);
}
3 changes: 3 additions & 0 deletions mcmpy/src/py_partition.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ std::vector<__uint128_t> convert_partition_from_py_2d_array(py::array_t<int8_t>&
element = 1;
for (int j = 0; j < n; j++){
if(ptr[i*n + j]){
if(ptr[i*n + j] != 1){
throw std::invalid_argument("Entries of the 2D array should be either 0 or 1.");
}
partition[i] += element;
}
element <<= 1;
Expand Down
27 changes: 27 additions & 0 deletions mcmpy/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import pytest
from mcmpy import Data, MCM, MCMSearch

# Path to the binary SCOTUS voting dataset (9 variables, 895 samples),
# relative to the mcmpy/tests directory from which pytest is invoked.
_SCOTUS_PATH = "../../input/US_SupremeCourt_n9_N895.dat"


def _load_scotus(q):
    """Load the SCOTUS dataset as a Data object with q states per variable."""
    return Data(_SCOTUS_PATH, 9, q)


# Data fixtures: the same SCOTUS dataset interpreted with q = 2..5 states.

@pytest.fixture
def scotus_data_q2():
    return _load_scotus(2)

@pytest.fixture
def scotus_data_q3():
    return _load_scotus(3)

@pytest.fixture
def scotus_data_q4():
    return _load_scotus(4)

@pytest.fixture
def scotus_data_q5():
    return _load_scotus(5)


# MCM fixtures

@pytest.fixture
def opt_mcm_scotus_q2():
    # Two-component optimal MCM for the binary SCOTUS data in the original basis.
    return MCM(9, [[1, 0, 1, 1, 1, 0, 1, 0, 0], [0, 1, 0, 0, 0, 1, 0, 1, 1]])
52 changes: 52 additions & 0 deletions mcmpy/tests/test_basis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import pytest
import numpy as np
from mcmpy import Basis


def test_init_default():
    """A Basis built without an explicit matrix defaults to the n one-body operators."""
    num_vars, num_states = 3, 5
    default_basis = Basis(num_vars, num_states)

    # The default basis matrix is the identity: operator i touches variable i only.
    assert np.array_equal(default_basis.matrix, np.identity(num_vars))
    assert default_basis.n == num_vars
    assert default_basis.q == num_states

def test_init_array():
    """A Basis constructed from a 2D array stores the operators column-wise."""
    num_vars, num_states = 4, 3
    ops = np.array([[1, 2, 0, 0],
                    [2, 2, 0, 0],
                    [0, 0, 1, 0],
                    [0, 0, 2, 1]])
    basis = Basis(num_vars, num_states, ops)

    # Columns of the input represent operators, so the stored matrix
    # should equal the transpose of what was passed in.
    assert np.array_equal(basis.matrix, ops.T)
    assert basis.n == num_vars
    assert basis.q == num_states

    # Resetting should restore the default identity (one-body) basis.
    basis.set_default()
    assert np.array_equal(basis.matrix, np.identity(num_vars))

def test_init_file():
    """A Basis can be constructed directly from a basis definition file."""
    num_vars, num_states = 4, 3
    basis = Basis(num_vars, num_states, "../../tests/basis2.dat")

    # Matrix expected from the contents of tests/basis2.dat.
    expected = [[1, 2, 0, 0],
                [2, 2, 0, 0],
                [0, 0, 1, 2],
                [0, 0, 0, 1]]
    assert np.array_equal(basis.matrix, expected)
    assert basis.n == num_vars
    assert basis.q == num_states

def test_set_from_file():
    """An existing Basis can be overwritten from a basis definition file."""
    num_vars, num_states = 4, 3

    basis = Basis(num_vars, num_states)
    basis.set_from_file("../../tests/basis2.dat")

    # Matrix expected from the contents of tests/basis2.dat.
    expected = [[1, 2, 0, 0],
                [2, 2, 0, 0],
                [0, 0, 1, 2],
                [0, 0, 0, 1]]
    assert np.array_equal(basis.matrix, expected)
    assert basis.n == num_vars
    assert basis.q == num_states
92 changes: 92 additions & 0 deletions mcmpy/tests/test_basis_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import pytest
import numpy as np
from mcmpy import Data, MCM, MCMSearch, Basis, BasisSearch

# Best basis search of supreme court

# Test against previous code
# (https://github.com/clelidm/MinCompSpin_ExhaustiveSearch) by C. de Mulatier;
# (https://github.com/AaronDC60/MinCompSpin_discrete) by Aaron De Clercq.

class TestBasisSearch:
    """Regression tests for the exhaustive best-basis search on the SCOTUS dataset.

    Reference results were obtained with previous implementations:
    - https://github.com/clelidm/MinCompSpin_ExhaustiveSearch (C. de Mulatier)
    - https://github.com/AaronDC60/MinCompSpin_discrete (Aaron De Clercq)
    """

    def setup_class(self):
        # Shared search objects, constructed once for the whole test class.
        self.searcher = BasisSearch()
        self.mcmsearcher = MCMSearch()

    def test_exhaustive_search(self):
        # Run an exhaustive basis search on the binary (q=2) SCOTUS data.
        dataset = Data("../../input/US_SupremeCourt_n9_N895.dat", 9, 2)
        opt_basis_1 = self.searcher.exhaustive(dataset)
        # The searcher stores its result; get_basis() should return the same basis.
        opt_basis_2 = self.searcher.get_basis()
        # Expected optimal basis matrix from the reference implementations.
        assert np.all(opt_basis_1.matrix == [[0,0,0,1,1,1,0,0,0],
                                             [0,0,0,0,0,0,1,0,0],
                                             [0,0,0,0,0,1,0,1,0],
                                             [1,0,0,0,0,0,0,0,0],
                                             [0,0,0,1,0,0,0,0,0],
                                             [0,1,0,0,0,0,0,0,0],
                                             [1,0,0,0,1,0,0,0,1],
                                             [0,1,1,0,0,0,1,0,0],
                                             [0,0,1,0,0,0,0,1,0]])

        # Returned and stored results must agree.
        assert np.all(opt_basis_1.matrix == opt_basis_2.matrix)
        assert opt_basis_1.n == 9
        assert opt_basis_2.n == 9

        assert opt_basis_1.q == 2
        assert opt_basis_2.q == 2

    def get_best_mcm(self, q):
        # Helper (not a test): best MCM found in the optimal basis for a given q.
        dataset = Data("../../input/US_SupremeCourt_n9_N895.dat", 9, q)
        # Calculate the best basis
        opt_basis = self.searcher.exhaustive(dataset)
        # Transform data
        transformed_data = opt_basis.gauge_transform_data(dataset)
        # Get best mcm in the new basis
        best_mcm = self.mcmsearcher.exhaustive(transformed_data)

        return best_mcm

    def test_mcm_opt_basis_q2(self):
        best_mcm = self.get_best_mcm(2)

        # Check the mcm and log-evidence against the reference values
        assert np.all(best_mcm.array == [[1,0,0,0,0,0,0,0,0],
                                         [0,1,1,0,0,0,1,0,0],
                                         [0,0,0,1,1,1,0,1,1]])

        assert np.isclose(best_mcm.get_best_log_evidence(), -3154.42)

    def test_mcm_opt_basis_q3(self):
        best_mcm = self.get_best_mcm(3)

        # Check the mcm and log-evidence against the reference values
        assert np.all(best_mcm.array == [[1,0,0,0,1,0,0,1,0],
                                         [0,1,0,1,0,0,1,0,0],
                                         [0,0,1,0,0,1,0,0,1]])

        assert np.isclose(best_mcm.get_best_log_evidence(), -3587.68)

    def test_mcm_opt_basis_q4(self):
        best_mcm = self.get_best_mcm(4)

        # Check the mcm and log-evidence against the reference values
        assert np.all(best_mcm.array == [[1,0,0,0,0,0,0,0,0],
                                         [0,1,0,0,0,0,1,0,0],
                                         [0,0,1,0,0,1,0,0,0],
                                         [0,0,0,1,0,0,0,0,1],
                                         [0,0,0,0,1,0,0,1,0]])

        assert np.isclose(best_mcm.get_best_log_evidence(), -3763.43)

    def test_mcm_opt_basis_q5(self):
        best_mcm = self.get_best_mcm(5)

        # Check the mcm and log-evidence against the reference values
        assert np.all(best_mcm.array == [[1,0,0,0,0,0,0,0,0],
                                         [0,1,0,0,0,0,0,0,0],
                                         [0,0,1,0,0,1,0,0,0],
                                         [0,0,0,1,0,0,1,0,0],
                                         [0,0,0,0,1,0,0,1,0],
                                         [0,0,0,0,0,0,0,0,1]])

        assert np.isclose(best_mcm.get_best_log_evidence(), -3848.06)
59 changes: 59 additions & 0 deletions mcmpy/tests/test_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import pytest
import numpy as np
from mcmpy import Data

# Test information theoretic criteria against previous code
# (https://github.com/clelidm/MinCompSpin_ExhaustiveSearch) by C. de Mulatier;
# Results of the optimal MCM for the binary SCOTUS data in the original basis

def test_log_evidence(scotus_data_q2, opt_mcm_scotus_q2):
    """Log-evidence of the optimal binary-SCOTUS MCM matches the reference value."""
    log_e = scotus_data_q2.log_evidence(opt_mcm_scotus_q2)
    assert np.isclose(log_e, -3300.4)

def test_log_likelihood(scotus_data_q2, opt_mcm_scotus_q2):
    """Maximum log-likelihood matches the reference value."""
    log_l = scotus_data_q2.log_likelihood(opt_mcm_scotus_q2)
    assert np.isclose(log_l, -3194.36)

def test_parametric_complexity(scotus_data_q2, opt_mcm_scotus_q2):
    """Parametric complexity term matches the reference value."""
    c_param = scotus_data_q2.complexity_parametric(opt_mcm_scotus_q2)
    assert np.isclose(c_param, 114.056)

def test_geometric_complexity(scotus_data_q2, opt_mcm_scotus_q2):
    """Geometric complexity term matches the reference value."""
    c_geom = scotus_data_q2.complexity_geometric(opt_mcm_scotus_q2)
    assert np.isclose(c_geom, -8.95092)

def test_total_complexity(scotus_data_q2, opt_mcm_scotus_q2):
    """Total complexity (parametric + geometric) matches the reference value."""
    total = (scotus_data_q2.complexity_parametric(opt_mcm_scotus_q2)
             + scotus_data_q2.complexity_geometric(opt_mcm_scotus_q2))
    assert np.isclose(total, 105.105)

def test_mdl(scotus_data_q2, opt_mcm_scotus_q2):
    """Minimum description length matches the reference value."""
    mdl = scotus_data_q2.minimum_description_length(opt_mcm_scotus_q2)
    assert np.isclose(mdl, -3299.46)

def test_log_evidence_icc(scotus_data_q2, opt_mcm_scotus_q2):
    """Per-component (ICC) log-evidence values match the reference values."""
    assert np.allclose(scotus_data_q2.log_evidence_icc(opt_mcm_scotus_q2),
                       [-1754.41, -1545.98])

def test_log_likelihood_icc(scotus_data_q2, opt_mcm_scotus_q2):
    """Per-component (ICC) log-likelihood values match the reference values."""
    assert np.allclose(scotus_data_q2.log_likelihood_icc(opt_mcm_scotus_q2),
                       [-1686.28, -1508.08])

def test_parametric_complexity_icc(scotus_data_q2, opt_mcm_scotus_q2):
    """Per-component (ICC) parametric complexities match the reference values."""
    assert np.allclose(scotus_data_q2.complexity_parametric_icc(opt_mcm_scotus_q2),
                       [76.8637, 37.1921])

def test_geometric_complexity_icc(scotus_data_q2, opt_mcm_scotus_q2):
    """Per-component (ICC) geometric complexities match the reference values."""
    assert np.allclose(scotus_data_q2.complexity_geometric_icc(opt_mcm_scotus_q2),
                       [-9.58359, 0.632678])


# Input array test
def test_partition_input(scotus_data_q2):
    """Malformed partition arguments raise ValueError with a descriptive message."""
    # A 3D array is neither a 1D labelling nor a 2D indicator matrix.
    with pytest.raises(ValueError, match="The partition should be a 1D or 2D array."):
        scotus_data_q2.log_evidence(np.ones((2, 2, 2)))

    # 2D indicator matrices must be binary; an entry of 2 is rejected.
    with pytest.raises(ValueError, match="Entries of the 2D array should be either 0 or 1."):
        scotus_data_q2.log_evidence([[1, 2, 0], [0, 0, 1]])


def test_entropy_of_op_input(scotus_data_q2):
    """Malformed spin-operator arguments raise ValueError with a descriptive message."""
    # Spin operators must be 1D, not 2D.
    with pytest.raises(ValueError, match="The spin operator should be given as a 1D numpy array."):
        scotus_data_q2.entropy_of_spin_operator(np.ones((2, 2)))

    # For q=2 data each entry must lie in [0, q-1]; a 2 is out of range.
    with pytest.raises(ValueError, match="The vector should only contain values between 0 and q-1."):
        scotus_data_q2.entropy_of_spin_operator([1, 2, 0, 1, 0, 1, 0, 1, 1])

    # The operator must have exactly n (= 9) entries; 7 entries are rejected.
    with pytest.raises(ValueError, match="The given spin operator doesn't contain n elements."):
        scotus_data_q2.entropy_of_spin_operator([1, 0, 1, 0, 1, 0, 1])

Loading
Loading