diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d5adf7a..0cb2f88 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,11 +19,20 @@ jobs: with: submodules: recursive + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + - name: Install dependencies (Linux) if: runner.os == 'Linux' run: | sudo apt-get update - sudo apt-get install -y g++ cmake python3 + + - name: Install dependencies + run: | + python3 -m pip install --upgrade pip + python3 -m pip install pytest numpy - name : Build project run: | @@ -33,3 +42,8 @@ jobs: run: | cd build ctest --output-on-failure + + - name : Test python module + run: | + cd mcmpy/tests + pytest diff --git a/docs/api/data.rst b/docs/api/data.rst index 85bbaee..2630757 100644 --- a/docs/api/data.rst +++ b/docs/api/data.rst @@ -240,6 +240,13 @@ to infer either the best minimally complex (MCM) or the optimal basis representa The actual number of datapoints in the dataset (read-only). + .. py:attribute:: N_synthetic + :type: int + + The synthetic number of datapoints in the dataset. + + This attribute can be changed to perform an analysis of the dataset as if it is either larger or smaller. + .. py:attribute:: N_unique :type: int diff --git a/include/data/dataset.h b/include/data/dataset.h index 2c7a21c..c8cd02b 100644 --- a/include/data/dataset.h +++ b/include/data/dataset.h @@ -31,6 +31,14 @@ class Data { */ Data(const std::vector, unsigned int>>& _dataset, int n_var, int n_states, int n_samples); + /** + * Change the value for the number of datapoints in the dataset that is used for analysis. + * This can be changed to perform an analysis of the dataset as if it is larger or smaller. + * + * @param n_datapoints The synthetic number of datapoints in the dataset. + */ + void set_N_synthetic(int n_datapoints); + /** * Calculate the entropy of the dataset. * @@ -128,6 +136,7 @@ class Data { int N; // Number of datapoints int N_unique; // Number of different datapoints int n_ints; // Number of 128bit integers necessary to represent the data + int N_synthetic; // The synthetic number of datapoints in the dataset std::vector<__uint128_t> pow_q; // Vector containing the first n powers of q used to speed up the calculation of the evidence }; diff --git a/mcmpy/include/py_dataset.h b/mcmpy/include/py_dataset.h index c4ca24f..86093df 100644 --- a/mcmpy/include/py_dataset.h +++ b/mcmpy/include/py_dataset.h @@ -12,7 +12,7 @@ namespace py = pybind11; -std::vector<__uint128_t> convert_spin_op_from_py(const py::array_t& spin_op, int q, int n_ints); +std::vector<__uint128_t> convert_spin_op_from_py(const py::array_t& spin_op, int q, int n_ints, int n); class PyData { public: @@ -65,9 +65,12 @@ class PyData { int get_n() {return this->data.n;}; int get_N() {return this->data.N;}; + int get_N_synthetic() {return this->data.N_synthetic;}; int get_N_unique() {return this->data.N_unique;}; int get_q() {return this->data.q;}; + void set_N_synthetic(int n_datapoints) {this->data.set_N_synthetic(n_datapoints);}; + Data data; }; diff --git a/mcmpy/src/py_dataset.cpp b/mcmpy/src/py_dataset.cpp index de74447..310ea82 100644 --- a/mcmpy/src/py_dataset.cpp +++ b/mcmpy/src/py_dataset.cpp @@ -1,6 +1,6 @@ #include "py_dataset.h" -std::vector<__uint128_t> convert_spin_op_from_py(const py::array_t& spin_op, int q, int n_ints){ +std::vector<__uint128_t> convert_spin_op_from_py(const py::array_t& spin_op, int q, int n_ints, int n){ py::buffer_info buff = spin_op.request(); // Check if there is only one dimension @@ -9,9 +9,9 @@ std::vector<__uint128_t> convert_spin_op_from_py(const py::array_t& spi throw std::invalid_argument("The spin operator should be given as a 1D numpy array."); } // Check if the system size is valid - int n = buff.shape[0]; - if (n > 128){ - throw std::invalid_argument("The maximum system size is 128."); + int n_entries = buff.shape[0]; + if (n_entries != n){ + throw std::invalid_argument("The given spin operator doesn't contain n elements."); } std::vector conv_spin_op(n, 0); @@ -154,7 +154,7 @@ double PyData::entropy(int base){ } double PyData::entropy_of_spin_op(const py::array_t& op){ - std::vector<__uint128_t> spin_op = convert_spin_op_from_py(op, this->data.q, this->data.n_ints); + std::vector<__uint128_t> spin_op = convert_spin_op_from_py(op, this->data.q, this->data.n_ints, this->data.n); return calc_entropy_of_spin_op(this->data, spin_op); } @@ -187,8 +187,10 @@ void bind_data_class(py::module &m) { .def("entropy", &PyData::entropy, py::arg("base") = -1) .def("entropy_of_spin_operator", &PyData::entropy_of_spin_op, py::arg("spin_op")) + .def_property_readonly("n", &PyData::get_n) .def_property_readonly("q", &PyData::get_q) .def_property_readonly("N", &PyData::get_N) - .def_property_readonly("N_unique", &PyData::get_N_unique); + .def_property_readonly("N_unique", &PyData::get_N_unique) + .def_property("N_synthetic", &PyData::get_N_synthetic, &PyData::set_N_synthetic); } diff --git a/mcmpy/src/py_partition.cpp b/mcmpy/src/py_partition.cpp index 399d531..3fdfd4e 100644 --- a/mcmpy/src/py_partition.cpp +++ b/mcmpy/src/py_partition.cpp @@ -76,6 +76,9 @@ std::vector<__uint128_t> convert_partition_from_py_2d_array(py::array_t& element = 1; for (int j = 0; j < n; j++){ if(ptr[i*n + j]){ + if(ptr[i*n + j] != 1){ + throw std::invalid_argument("Entries of the 2D array should be either 0 or 1."); + } partition[i] += element; } element <<= 1; diff --git a/mcmpy/tests/conftest.py b/mcmpy/tests/conftest.py new file mode 100644 index 0000000..d77727d --- /dev/null +++ b/mcmpy/tests/conftest.py @@ -0,0 +1,27 @@ +import pytest +from mcmpy import Data, MCM, MCMSearch + +# Data + +@pytest.fixture +def scotus_data_q2(): + return Data("../../input/US_SupremeCourt_n9_N895.dat", 9, 2) + +@pytest.fixture +def scotus_data_q3(): + return Data("../../input/US_SupremeCourt_n9_N895.dat", 9, 3) + +@pytest.fixture +def scotus_data_q4(): + return Data("../../input/US_SupremeCourt_n9_N895.dat", 9, 4) + +@pytest.fixture +def scotus_data_q5(): + return Data("../../input/US_SupremeCourt_n9_N895.dat", 9, 5) + + +# MCM + +@pytest.fixture +def opt_mcm_scotus_q2(): + return MCM(9, [[1,0,1,1,1,0,1,0,0], [0,1,0,0,0,1,0,1,1]]) diff --git a/mcmpy/tests/test_basis.py b/mcmpy/tests/test_basis.py new file mode 100644 index 0000000..6d1afcd --- /dev/null +++ b/mcmpy/tests/test_basis.py @@ -0,0 +1,52 @@ +import pytest +import numpy as np +from mcmpy import Basis + + +def test_init_default(): + n = 3 + q = 5 + basis = Basis(n,q) + + # Default basis should be the n one-body spin operators + assert np.all(basis.matrix == np.eye(n)) + assert(basis.n == n) + assert(basis.q == q) + +def test_init_array(): + n = 4 + q = 3 + array = np.array([[1,2,0,0],[2,2,0,0],[0,0,1,0],[0,0,2,1]]) + basis = Basis(n,q, array) + + # Check the matrix representation + # Should be equal to the transpose of the input given that columns represent operators + assert np.all(basis.matrix == array.T) + assert(basis.n == n) + assert(basis.q == q) + + # Check reset to default + basis.set_default() + assert np.all(basis.matrix == np.eye(n)) + +def test_init_file(): + n = 4 + q = 3 + basis = Basis(n,q, "../../tests/basis2.dat") + + # Check the matrix representation + assert np.all(basis.matrix == [[1,2,0,0],[2,2,0,0],[0,0,1,2],[0,0,0,1]]) + assert(basis.n == n) + assert(basis.q == q) + +def test_set_from_file(): + n = 4 + q = 3 + + basis = Basis(n,q) + basis.set_from_file("../../tests/basis2.dat") + + # Check the matrix representation + assert np.all(basis.matrix == [[1,2,0,0],[2,2,0,0],[0,0,1,2],[0,0,0,1]]) + assert(basis.n == n) + assert(basis.q == q) diff --git a/mcmpy/tests/test_basis_search.py b/mcmpy/tests/test_basis_search.py new file mode 100644 index 0000000..62ead4e --- /dev/null +++ b/mcmpy/tests/test_basis_search.py @@ -0,0 +1,92 @@ +import pytest +import numpy as np +from mcmpy import Data, MCM, MCMSearch, Basis, BasisSearch + +# Best basis search of supreme court + +# Test against previous code +# (https://github.com/clelidm/MinCompSpin_ExhaustiveSearch) by C. de Mulatier; +# (https://github.com/AaronDC60/MinCompSpin_discrete) by Aaron De Clercq. + +class TestBasisSearch: + + def setup_class(self): + self.searcher = BasisSearch() + self.mcmsearcher = MCMSearch() + + def test_exhaustive_search(self): + dataset = Data("../../input/US_SupremeCourt_n9_N895.dat", 9, 2) + opt_basis_1 = self.searcher.exhaustive(dataset) + opt_basis_2 = self.searcher.get_basis() + assert np.all(opt_basis_1.matrix == [[0,0,0,1,1,1,0,0,0], + [0,0,0,0,0,0,1,0,0], + [0,0,0,0,0,1,0,1,0], + [1,0,0,0,0,0,0,0,0], + [0,0,0,1,0,0,0,0,0], + [0,1,0,0,0,0,0,0,0], + [1,0,0,0,1,0,0,0,1], + [0,1,1,0,0,0,1,0,0], + [0,0,1,0,0,0,0,1,0]]) + + assert np.all(opt_basis_1.matrix == opt_basis_2.matrix) + assert opt_basis_1.n == 9 + assert opt_basis_2.n == 9 + + assert opt_basis_1.q == 2 + assert opt_basis_2.q == 2 + + def get_best_mcm(self, q): + dataset = Data("../../input/US_SupremeCourt_n9_N895.dat", 9, q) + # Calculate the best basis + opt_basis = self.searcher.exhaustive(dataset) + # Transform data + transformed_data = opt_basis.gauge_transform_data(dataset) + # Get best mcm in the new basis + best_mcm = self.mcmsearcher.exhaustive(transformed_data) + + return best_mcm + + def test_mcm_opt_basis_q2(self): + best_mcm = self.get_best_mcm(2) + + # Check the mcm and log-evidence + assert np.all(best_mcm.array == [[1,0,0,0,0,0,0,0,0], + [0,1,1,0,0,0,1,0,0], + [0,0,0,1,1,1,0,1,1]]) + + assert np.isclose(best_mcm.get_best_log_evidence(), -3154.42) + + def test_mcm_opt_basis_q3(self): + best_mcm = self.get_best_mcm(3) + + # Check the mcm and log-evidence + assert np.all(best_mcm.array == [[1,0,0,0,1,0,0,1,0], + [0,1,0,1,0,0,1,0,0], + [0,0,1,0,0,1,0,0,1]]) + + assert np.isclose(best_mcm.get_best_log_evidence(), -3587.68) + + def test_mcm_opt_basis_q4(self): + best_mcm = self.get_best_mcm(4) + + # Check the mcm and log-evidence + assert np.all(best_mcm.array == [[1,0,0,0,0,0,0,0,0], + [0,1,0,0,0,0,1,0,0], + [0,0,1,0,0,1,0,0,0], + [0,0,0,1,0,0,0,0,1], + [0,0,0,0,1,0,0,1,0]]) + + assert np.isclose(best_mcm.get_best_log_evidence(), -3763.43) + + def test_mcm_opt_basis_q5(self): + best_mcm = self.get_best_mcm(5) + + # Check the mcm and log-evidence + assert np.all(best_mcm.array == [[1,0,0,0,0,0,0,0,0], + [0,1,0,0,0,0,0,0,0], + [0,0,1,0,0,1,0,0,0], + [0,0,0,1,0,0,1,0,0], + [0,0,0,0,1,0,0,1,0], + [0,0,0,0,0,0,0,0,1]]) + + assert np.isclose(best_mcm.get_best_log_evidence(), -3848.06) diff --git a/mcmpy/tests/test_data.py b/mcmpy/tests/test_data.py new file mode 100644 index 0000000..3ac8e2b --- /dev/null +++ b/mcmpy/tests/test_data.py @@ -0,0 +1,59 @@ +import pytest +import numpy as np +from mcmpy import Data + +# Test information theoretic criteria against previous code +# (https://github.com/clelidm/MinCompSpin_ExhaustiveSearch) by C. de Mulatier; +# Results of the optimal MCM for the binary SCOTUS data in the original basis + +def test_log_evidence(scotus_data_q2, opt_mcm_scotus_q2): + assert np.isclose(scotus_data_q2.log_evidence(opt_mcm_scotus_q2), -3300.4) + +def test_log_likelihood(scotus_data_q2, opt_mcm_scotus_q2): + assert np.isclose(scotus_data_q2.log_likelihood(opt_mcm_scotus_q2), -3194.36) + +def test_parametric_complexity(scotus_data_q2, opt_mcm_scotus_q2): + assert np.isclose(scotus_data_q2.complexity_parametric(opt_mcm_scotus_q2), 114.056) + +def test_geometric_complexity(scotus_data_q2, opt_mcm_scotus_q2): + assert np.isclose(scotus_data_q2.complexity_geometric(opt_mcm_scotus_q2), -8.95092) + +def test_total_complexity(scotus_data_q2, opt_mcm_scotus_q2): + parametric_complexity = scotus_data_q2.complexity_parametric(opt_mcm_scotus_q2) + geometric_complexity = scotus_data_q2.complexity_geometric(opt_mcm_scotus_q2) + assert np.isclose(parametric_complexity + geometric_complexity , 105.105) + +def test_mdl(scotus_data_q2, opt_mcm_scotus_q2): + assert np.isclose(scotus_data_q2.minimum_description_length(opt_mcm_scotus_q2), -3299.46) + +def test_log_evidence_icc(scotus_data_q2, opt_mcm_scotus_q2): + assert np.all(np.isclose(scotus_data_q2.log_evidence_icc(opt_mcm_scotus_q2), [-1754.41, -1545.98])) + +def test_log_likelihood_icc(scotus_data_q2, opt_mcm_scotus_q2): + assert np.all(np.isclose(scotus_data_q2.log_likelihood_icc(opt_mcm_scotus_q2), [-1686.28, -1508.08])) + +def test_parametric_complexity_icc(scotus_data_q2, opt_mcm_scotus_q2): + assert np.all(np.isclose(scotus_data_q2.complexity_parametric_icc(opt_mcm_scotus_q2), [76.8637, 37.1921])) + +def test_geometric_complexity_icc(scotus_data_q2, opt_mcm_scotus_q2): + assert np.all(np.isclose(scotus_data_q2.complexity_geometric_icc(opt_mcm_scotus_q2), [-9.58359, 0.632678])) + + +# Input array test +def test_partition_input(scotus_data_q2): + with pytest.raises(ValueError, match="The partition should be a 1D or 2D array."): + scotus_data_q2.log_evidence(np.ones((2,2,2))) + + with pytest.raises(ValueError, match="Entries of the 2D array should be either 0 or 1."): + scotus_data_q2.log_evidence([[1,2,0], [0,0,1]]) + +def test_entropy_of_op_input(scotus_data_q2): + with pytest.raises(ValueError, match="The spin operator should be given as a 1D numpy array."): + scotus_data_q2.entropy_of_spin_operator(np.ones((2,2))) + + with pytest.raises(ValueError, match="The vector should only contain values between 0 and q-1."): + scotus_data_q2.entropy_of_spin_operator([1,2,0,1,0,1,0,1,1]) + + with pytest.raises(ValueError, match="The given spin operator doesn't contain n elements."): + scotus_data_q2.entropy_of_spin_operator([1,0,1,0,1,0,1]) + diff --git a/mcmpy/tests/test_mcm.py b/mcmpy/tests/test_mcm.py new file mode 100644 index 0000000..8fa33bd --- /dev/null +++ b/mcmpy/tests/test_mcm.py @@ -0,0 +1,125 @@ +import pytest +import numpy as np +from mcmpy import MCM, Data + +@pytest.fixture +def model(): + return MCM(8, [0,1,2,0,1,-1,5]) + +# Test input with specific partition +def test_mcm_array(model): + assert np.all(model.array == [[1,0,0,1,0,0,0,0], + [0,1,0,0,1,0,0,0], + [0,0,1,0,0,0,0,0], + [0,0,0,0,0,0,1,0]]) + +def test_mcm_gray_code(model): + assert np.all(model.array_gray_code == [0,1,2,0,1,-1,3,-1]) + +def test_read_only_properties(model): + assert(model.n == 8) + assert(model.n_icc == 4) + assert(model.rank == 6) + assert(model.is_optimized is False) + +def test_move_in_invalid(model): + with pytest.raises(ValueError, match="The variable index should be between 0 and n-1."): + model.move_variable_in(8,5) + + with pytest.raises(ValueError, match="This variable is already present in the partition."): + model.move_variable_in(1,6) + +def test_move_in(model): + model.move_variable_in(5,2) + model.move_variable_in(7,7) + + # Check gray code + assert np.all(model.array_gray_code == [0,1,2,0,1,2,3,4]) + + # Check binary array + assert np.all(model.array == [[1,0,0,1,0,0,0,0], + [0,1,0,0,1,0,0,0], + [0,0,1,0,0,1,0,0], + [0,0,0,0,0,0,1,0], + [0,0,0,0,0,0,0,1]]) + + # Check properties + assert(model.n == 8) + assert(model.n_icc == 5) + assert(model.rank == 8) + assert(model.is_optimized is False) + +def test_move_out_invalid(model): + with pytest.raises(ValueError, match="The variable index should be between 0 and n-1."): + model.move_variable_out(8) + + with pytest.raises(ValueError, match="This variable is not present in the partition."): + model.move_variable_out(7) + +def test_move_out(model): + model.move_variable_out(2) + + # Check gray code + assert np.all(model.array_gray_code == [0,1,-1,0,1,-1,2,-1]) + + # Check binary array + assert np.all(model.array == [[1,0,0,1,0,0,0,0], + [0,1,0,0,1,0,0,0], + [0,0,0,0,0,0,1,0]]) + + # Check properties + assert(model.n == 8) + assert(model.n_icc == 3) + assert(model.rank == 5) + assert(model.is_optimized is False) + +def test_move(model): + model.move_variable(6,0) + + # Check gray code + assert np.all(model.array_gray_code == [0,1,2,0,1,-1,0,-1]) + + # Check binary array + assert np.all(model.array == [[1,0,0,1,0,0,1,0], + [0,1,0,0,1,0,0,0], + [0,0,1,0,0,0,0,0]]) + + # Check properties + assert(model.n == 8) + assert(model.n_icc == 3) + assert(model.rank == 6) + assert(model.is_optimized is False) + +def test_properties(): + # Independent model + model = MCM(12, "independent") + assert(model.n == 12) + assert(model.n_icc == 12) + assert(model.rank == 12) + assert(model.is_optimized is False) + + # Complete model + model = MCM(12, "complete") + assert(model.n == 12) + assert(model.n_icc == 1) + assert(model.rank == 12) + assert(model.is_optimized is False) + +def test_data_object(scotus_data_q2, opt_mcm_scotus_q2): + data = opt_mcm_scotus_q2.generate_data_object(10_000, scotus_data_q2) + + assert data.N == 10_000 + assert data.N_synthetic == 10_000 + assert data.q == scotus_data_q2.q + assert data.n == scotus_data_q2.n + +def test_data_file(scotus_data_q2, opt_mcm_scotus_q2, tmp_path): + filepath = str(tmp_path / "test.dat") + opt_mcm_scotus_q2.generate_data_file(10_000, scotus_data_q2, filepath) + # Create data object from file + data = Data(filepath, scotus_data_q2.n, scotus_data_q2.q) + + assert data.N == 10_000 + assert data.N_synthetic == 10_000 + assert data.q == scotus_data_q2.q + assert data.n == scotus_data_q2.n diff --git a/mcmpy/tests/test_mcm_search.py b/mcmpy/tests/test_mcm_search.py new file mode 100644 index 0000000..0e9ef7d --- /dev/null +++ b/mcmpy/tests/test_mcm_search.py @@ -0,0 +1,183 @@ +import pytest +import numpy as np +from mcmpy import Data, MCM, MCMSearch + +@pytest.fixture +def mcm_searcher(): + return MCMSearch() + +## Exhaustive search + +# Test against previous code +# (https://github.com/clelidm/MinCompSpin_ExhaustiveSearch) by C. de Mulatier; +# (https://github.com/AaronDC60/MinCompSpin_discrete) by Aaron De Clercq. + +class TestMCMSearch: + + def setup_class(self): + scotus_data = Data("../../input/US_SupremeCourt_n9_N895.dat", 9, 2) + self.searcher = MCMSearch() + + # MCM used as starting partition in the hierarchical greedy merging algorithm + mcm_in_merging = MCM(9, [0,1,0,3,4,5,6,7,8]) + + self.opt_mcms = {} + self.mcms_out = {} + self.mcms_in = {} + + self.opt_mcms["simulated_annealing"] = self.searcher.simulated_annealing(scotus_data) + self.mcms_out["simulated_annealing"] = self.searcher.get_mcm_out() + self.mcms_in["simulated_annealing"] = self.searcher.get_mcm_in() + + self.opt_mcms["hierarchical_greedy_merging"] = self.searcher.hierarchical_greedy_merging(scotus_data, mcm_in_merging) + self.mcms_out["hierarchical_greedy_merging"] = self.searcher.get_mcm_out() + self.mcms_in["hierarchical_greedy_merging"] = self.searcher.get_mcm_in() + + self.opt_mcms["hierarchical_greedy_divisive"] = self.searcher.hierarchical_greedy_divisive(scotus_data) + self.mcms_out["hierarchical_greedy_divisive"] = self.searcher.get_mcm_out() + self.mcms_in["hierarchical_greedy_divisive"] = self.searcher.get_mcm_in() + + self.opt_mcms["exhaustive"] = self.searcher.exhaustive(scotus_data) + self.mcms_out["exhaustive"] = self.searcher.get_mcm_out() + + def test_log_ev_trajectory_exhaustive(self): + # Exhaustive search stores all parititons + # Number of partitions for n variables is given by Bell number of n + assert len(self.searcher.log_evidence_trajectory) == 21_147 + + @pytest.mark.parametrize("method", ["exhaustive", "simulated_annealing", "hierarchical_greedy_merging", "hierarchical_greedy_divisive"]) + def test_opt_mcms_structure(self, method): + assert self.opt_mcms[method].n == 9 + assert self.opt_mcms[method].n_icc == 2 + assert self.opt_mcms[method].rank == 9 + assert self.opt_mcms[method].is_optimized is True + + if method != "simulated_annealing": + # Simulated annealing inverts the order sometimes + assert np.all(self.opt_mcms[method].array == [[1,0,1,1,1,0,1,0,0], [0,1,0,0,0,1,0,1,1]]) + assert np.all(self.opt_mcms[method].array_gray_code == [0,1,0,0,0,1,0,1,1]) + else: + assert [1,0,1,1,1,0,1,0,0] in self.opt_mcms[method].array + assert [0,1,0,0,0,1,0,1,1] in self.opt_mcms[method].array + + @pytest.mark.parametrize("method", ["exhaustive", "simulated_annealing", "hierarchical_greedy_merging", "hierarchical_greedy_divisive"]) + def test_mcms_out_structure(self, method): + assert self.mcms_out[method].n == 9 + assert self.mcms_out[method].n_icc == 2 + assert self.mcms_out[method].rank == 9 + assert self.mcms_out[method].is_optimized is True + + if method != "simulated_annealing": + # Simulated annealing inverts the order sometimes + assert np.all(self.mcms_out[method].array == [[1,0,1,1,1,0,1,0,0], [0,1,0,0,0,1,0,1,1]]) + assert np.all(self.mcms_out[method].array_gray_code == [0,1,0,0,0,1,0,1,1]) + else: + assert [1,0,1,1,1,0,1,0,0] in self.mcms_out[method].array + assert [0,1,0,0,0,1,0,1,1] in self.mcms_out[method].array + + @pytest.mark.parametrize("method", ["exhaustive", "simulated_annealing", "hierarchical_greedy_merging", "hierarchical_greedy_divisive"]) + def test_opt_mcm_log_ev(self, method): + log_ev_per_icc = self.opt_mcms[method].get_best_log_evidence_icc() + log_ev = self.opt_mcms[method].get_best_log_evidence() + + assert np.isclose(log_ev, -3300.4) + assert np.allclose(log_ev_per_icc, [-1754.41, -1545.98]) or np.allclose(log_ev_per_icc, [-1545.98, -1754.41]) + + @pytest.mark.parametrize("method", ["exhaustive", "simulated_annealing", "hierarchical_greedy_merging", "hierarchical_greedy_divisive"]) + def test_opt_mcm_log_ev(self, method): + log_ev_per_icc = self.opt_mcms[method].get_best_log_evidence_icc() + log_ev = self.opt_mcms[method].get_best_log_evidence() + + assert np.isclose(log_ev, -3300.4) + assert np.allclose(log_ev_per_icc, [-1754.41, -1545.98]) or np.allclose(log_ev_per_icc, [-1545.98, -1754.41]) + + def test_mcm_in_invalid(self): + # Try before any search has been run + search = MCMSearch() + with pytest.raises(RuntimeError, match="No search has been ran yet."): + search.get_mcm_in() + + def test_mcm_in_exhaustive(self): + with pytest.raises(RuntimeError, match="Exhaustive search does not have an initial MCM."): + self.searcher.get_mcm_in() + + def test_mcm_in_greedy_divisive(self): + # Starts by default from the complete model + assert self.mcms_in["hierarchical_greedy_divisive"].n == 9 + assert self.mcms_in["hierarchical_greedy_divisive"].n_icc == 1 + assert self.mcms_in["hierarchical_greedy_divisive"].rank == 9 + assert self.mcms_in["hierarchical_greedy_divisive"].is_optimized is False + + def test_mcm_in_greedy_merging(self): + # mcm_in used is defined in the setup (mcm_in_merging) + assert self.mcms_in["hierarchical_greedy_merging"].n == 9 + assert self.mcms_in["hierarchical_greedy_merging"].n_icc == 8 + assert self.mcms_in["hierarchical_greedy_merging"].rank == 9 + assert self.mcms_in["hierarchical_greedy_merging"].is_optimized is False + + assert np.all(self.mcms_in["hierarchical_greedy_merging"].array_gray_code == [0,1,0,2,3,4,5,6,7]) + + def test_mcm_in_sim_annealing(self): + # Starts by default from the independent model + assert self.mcms_in["simulated_annealing"].n == 9 + assert self.mcms_in["simulated_annealing"].n_icc == 9 + assert self.mcms_in["simulated_annealing"].rank == 9 + assert self.mcms_in["simulated_annealing"].is_optimized is False + +# Check the optimal structure and log-evidence for the SCOTUS dataset with different values of q + +def test_scotus_q_3(mcm_searcher, scotus_data_q3): + opt_mcm = mcm_searcher.exhaustive(scotus_data_q3) + log_ev = opt_mcm.get_best_log_evidence() + + # MCM properties + assert opt_mcm.n == 9 + assert opt_mcm.n_icc == 2 + assert opt_mcm.rank == 9 + assert opt_mcm.is_optimized is True + + # MCM structure + assert np.all(opt_mcm.array == [[1,0,1,1,1,0,1,0,0], [0,1,0,0,0,1,0,1,1]]) + assert np.all(opt_mcm.array_gray_code == [0,1,0,0,0,1,0,1,1]) + + # MCM log-evidence + assert np.allclose(log_ev, -3714.65) + +def test_scotus_q_4(mcm_searcher, scotus_data_q4): + opt_mcm = mcm_searcher.exhaustive(scotus_data_q4) + log_ev = opt_mcm.get_best_log_evidence() + + # MCM properties + assert opt_mcm.n == 9 + assert opt_mcm.n_icc == 3 + assert opt_mcm.rank == 9 + assert opt_mcm.is_optimized is True + + # MCM structure + assert np.all(opt_mcm.array == [[1,0,1,0,1,0,0,0,0], + [0,1,0,0,0,1,0,1,1], + [0,0,0,1,0,0,1,0,0]]) + assert np.all(opt_mcm.array_gray_code == [0,1,0,2,0,1,2,1,1]) + + # MCM log-evidence + assert np.allclose(log_ev, -4033.14) + +def test_scotus_q_5(mcm_searcher, scotus_data_q5): + opt_mcm = mcm_searcher.exhaustive(scotus_data_q5) + log_ev = opt_mcm.get_best_log_evidence() + + # MCM properties + assert opt_mcm.n == 9 + assert opt_mcm.n_icc == 4 + assert opt_mcm.rank == 9 + assert opt_mcm.is_optimized is True + + # MCM structure + assert np.all(opt_mcm.array == [[1,0,1,0,1,0,0,0,0], + [0,1,0,0,0,0,0,0,0], + [0,0,0,1,0,0,1,0,0], + [0,0,0,0,0,1,0,1,1]]) + assert np.all(opt_mcm.array_gray_code == [0,1,0,2,0,3,2,3,3]) + + # MCM log-evidence + assert np.allclose(log_ev, -4261.89) diff --git a/src/basis/basis.cpp b/src/basis/basis.cpp index 5083d74..6e8b937 100644 --- a/src/basis/basis.cpp +++ b/src/basis/basis.cpp @@ -156,6 +156,10 @@ void Basis::set_basis_default() { // Set the default base: s1, s2, ..., sn __uint128_t element = 1; for (int i = 0; i < this->n; i++){ + // Reset operator + std::fill(this->basis_ops[i].begin(), this->basis_ops[i].end(), 0); + std::fill(this->basis_ops_matrix[i].begin(), this->basis_ops_matrix[i].end(), 0); + // log2(q) 128 bit integer representation this->basis_ops[i][0] = element; element <<= 1; @@ -185,7 +189,7 @@ void Basis::print_details() { std::string op_string; for (std::vector<__uint128_t> op : this->basis_ops){ op_string = convert_128bit_vec_to_string(op, this->n); - std::cout << "Operator " << i << ": \t" << op_string << "\t Indices: "; + std::cout << "Operator " << i << ": \t" << op_string << "\t Indices of variables involved: "; // Find the indices of the variables present in the spin operator for (size_t j = 0; j < this->n; ++j){ if (op_string[j] != '0') {std::cout << j << "\t";} diff --git a/src/data/complexity.cpp b/src/data/complexity.cpp index 26aad50..dbbf3f0 100644 --- a/src/data/complexity.cpp +++ b/src/data/complexity.cpp @@ -22,7 +22,7 @@ double Data::calc_param_complexity_icc(__uint128_t component){ // Determine the size of the component int r = bit_count(component); __uint128_t n_param = this->pow_q[r] - 1; - return (n_param / 2.) * log(this->N / (2 * M_PI)); + return (n_param / 2.) * log(this->N_synthetic / (2 * M_PI)); } double Data::calc_param_complexity(std::vector<__uint128_t>& partition){ diff --git a/src/data/dataset.cpp b/src/data/dataset.cpp index 36158fb..37d35f4 100644 --- a/src/data/dataset.cpp +++ b/src/data/dataset.cpp @@ -17,6 +17,7 @@ Data::Data(const std::string& filename, int n_var, int n_states){ this->n_ints = ceil(log2(q)); // Process the dataset this->N = processing(filename, n_var, n_ints, n_states, dataset); + this->N_synthetic = this->N; this->N_unique = dataset.size(); // Precompute the powers of q to speed up the calculation of the evidence (q^r) @@ -34,6 +35,7 @@ Data::Data(const std::vector, unsigned int>>& this->q = n_states; this->dataset = _dataset; this->N = n_samples; + this->N_synthetic = this->N; this->N_unique = _dataset.size(); // Calculate the number of integers necessary to represent the data @@ -48,6 +50,14 @@ Data::Data(const std::vector, unsigned int>>& } } +void Data::set_N_synthetic(int n_datapoints){ + // Check if the given number of datapoints is valid + if (n_datapoints < 1){ + throw std::invalid_argument("The number of datapoints should be a positive number."); + } + this->N_synthetic = n_datapoints; +} + double Data::entropy(int base){ // Default base is q if (base == -1){ diff --git a/src/data/evidence.cpp b/src/data/evidence.cpp index 73905d5..87201c7 100644 --- a/src/data/evidence.cpp +++ b/src/data/evidence.cpp @@ -5,21 +5,23 @@ double Data::calc_log_ev_icc(__uint128_t component){ double log_evidence = 0; // Determine the size of the component int r = bit_count(component); + double N_syn = this->N_synthetic; + double alpha = N_syn / this->N; // Contributions from the datapoint frequencies std::unordered_map, unsigned int, HashVector128> counts = build_histogram(*this, component); std::unordered_map, unsigned int, HashVector128>::iterator count_iter = counts.begin(); while (count_iter != counts.end()){ - log_evidence += (lgamma(count_iter->second + 0.5) - 0.5 * log(M_PI)); + log_evidence += (lgamma(alpha * count_iter->second + 0.5) - 0.5 * log(M_PI)); ++count_iter; } // Calculate prefactor if (r > 25){ // Approximate for large components because lgamma overflows - log_evidence -= (r * log(this->q) * this->N); + log_evidence -= (r * log(this->q) * this->N_synthetic); } else{ - log_evidence += lgamma(this->pow_q[r]/2.) - lgamma(this->N + pow_q[r]/2.); + log_evidence += lgamma(this->pow_q[r]/2.) - lgamma(this->N_synthetic + pow_q[r]/2.); } return log_evidence; @@ -37,7 +39,7 @@ double Data::calc_log_ev(std::vector<__uint128_t>& partition){ } } // Contribution from variables not in the model - log_ev -= this->N * (this->n - r) * log(this->q); + log_ev -= this->N_synthetic * (this->n - r) * log(this->q); return log_ev; } diff --git a/src/data/likelihood.cpp b/src/data/likelihood.cpp index e61d62c..fa0187e 100644 --- a/src/data/likelihood.cpp +++ b/src/data/likelihood.cpp @@ -4,13 +4,14 @@ double Data::calc_log_likelihood_icc(__uint128_t component){ double log_likelihood = 0; double N_datapoints = this->N; + double alpha = this->N_synthetic / N_datapoints; // Determine the size of the component int r = bit_count(component); // Get the datapoint frequencies std::unordered_map, unsigned int, HashVector128> counts = build_histogram(*this, component); std::unordered_map, unsigned int, HashVector128>::iterator count_iter = counts.begin(); while (count_iter != counts.end()){ - log_likelihood += count_iter->second * log(count_iter->second / N_datapoints); + log_likelihood += alpha * count_iter->second * log(count_iter->second / N_datapoints); ++count_iter; } return log_likelihood; @@ -28,7 +29,7 @@ double Data::calc_log_likelihood(std::vector<__uint128_t>& partition){ } } // Contribution from variables not in the model - log_likelihood -= this->N * (this->n - r) * log(this->q); + log_likelihood -= this->N_synthetic * (this->n - r) * log(this->q); return log_likelihood; } \ No newline at end of file diff --git a/src/search/mcm_search/annealing.cpp b/src/search/mcm_search/annealing.cpp index 80e4ba7..26b669d 100644 --- a/src/search/mcm_search/annealing.cpp +++ b/src/search/mcm_search/annealing.cpp @@ -51,15 +51,16 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n *this->output_file << "Number of variables: " << data.n << "\n"; *this->output_file << "Number of states per variable: " << data.q << "\n"; *this->output_file << "Number of datapoints: " << data.N << "\n"; + *this->output_file << "Number of synthetic datapoints: " << data.N_synthetic << "\n"; *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n"; *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n"; *this->output_file << "Initial partition \n"; *this->output_file << "----------------- \n\n"; - *this->output_file << "Log-evidence: " << mcm_tmp.log_ev << " = " << mcm_tmp.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n\n"; + *this->output_file << "Log-evidence: " << mcm_tmp.log_ev << " = " << mcm_tmp.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n"; - print_partition_details_to_file(*this->output_file, mcm_tmp, data.N, data.q); + print_partition_details_to_file(*this->output_file, mcm_tmp, data.N_synthetic, data.q); *this->output_file << "\n"; *this->output_file << "Start annealing \n"; @@ -114,7 +115,7 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n // Write to output file if (this->output_file){ - *this->output_file << "Iteration " << i << " \t\t Temperature: " << settings.temp << " \t\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N * log(this->data->q)) << "\n"; + *this->output_file << "Iteration " << i << " \t\t Temperature: " << settings.temp << " \t\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_synthetic * log(this->data->q)) << "\n"; } } else{steps_since_improve++;} @@ -161,10 +162,10 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n *this->output_file << "\nFinal partition \n"; *this->output_file << "----------------- \n\n"; - *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n"; - *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N * log(data.q)) << " q-its/datapoint \n\n"; + *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n"; + *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n"; - print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q); + print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q); *this->output_file << "\n"; this->output_file.reset(); } diff --git a/src/search/mcm_search/div_and_conq.cpp b/src/search/mcm_search/div_and_conq.cpp index 96966b8..361f987 100644 --- a/src/search/mcm_search/div_and_conq.cpp +++ b/src/search/mcm_search/div_and_conq.cpp @@ -50,15 +50,16 @@ MCM MCMSearch::divide_and_conquer(Data& data, MCM* init_mcm, std::string file_na *this->output_file << "Number of variables: " << data.n << "\n"; *this->output_file << "Number of states per variable: " << data.q << "\n"; *this->output_file << "Number of datapoints: " << data.N << "\n"; + *this->output_file << "Number of synthetic datapoints: " << data.N_synthetic << "\n"; *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n"; *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n"; *this->output_file << "Initial partition \n"; *this->output_file << "----------------- \n\n"; - *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n\n"; + *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n"; - print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q); + print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q); *this->output_file << "\n"; *this->output_file << "Start dividing \n"; @@ -83,10 +84,10 @@ MCM MCMSearch::divide_and_conquer(Data& data, MCM* init_mcm, std::string file_na *this->output_file << "\nFinal partition \n"; *this->output_file << "----------------- \n\n"; - *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n"; - *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N * log(data.q)) << " q-its/datapoint \n\n"; + *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n"; + *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n"; - print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q); + print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q); *this->output_file << "\n"; this->output_file.reset(); } @@ -174,8 +175,7 @@ int MCMSearch::division(int move_from, int move_to){ this->log_evidence_trajectory.push_back(this->mcm_out.log_ev); if (this->output_file){ - std::cout << "Should only print with output file"; - *this->output_file << "Splitting component " << move_from << "\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N * log(this->data->q)) << "\n"; + *this->output_file << "Splitting component " << move_from << "\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_synthetic * log(this->data->q)) << "\n"; *this->output_file << "\t Component " << move_from << " : \t" + int_to_string(this->mcm_out.partition[move_from], this->mcm_out.n) << "\n"; *this->output_file << "\t Component " << move_to << " : \t" + int_to_string(this->mcm_out.partition[move_to], this->mcm_out.n) << "\n\n"; } diff --git a/src/search/mcm_search/greedy.cpp b/src/search/mcm_search/greedy.cpp index 6acb93f..a176b94 100644 --- a/src/search/mcm_search/greedy.cpp +++ b/src/search/mcm_search/greedy.cpp @@ -49,15 +49,16 @@ MCM MCMSearch::greedy_search(Data& data, MCM* init_mcm, std::string file_name){ *this->output_file << "Number of variables: " << data.n << "\n"; *this->output_file << "Number of states per variable: " << data.q << "\n"; *this->output_file << "Number of datapoints: " << data.N << "\n"; + *this->output_file << "Number of synthetic datapoints: " << data.N_synthetic << "\n"; *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n"; *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n"; *this->output_file << "Initial partition \n"; *this->output_file << "----------------- \n\n"; - *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n\n"; + *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n"; - print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q); + print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q); *this->output_file << "\n"; } @@ -76,10 +77,10 @@ MCM MCMSearch::greedy_search(Data& data, MCM* init_mcm, std::string file_name){ *this->output_file << "\nFinal partition \n"; *this->output_file << "----------------- \n\n"; - *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n"; - *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N * log(data.q)) << " q-its/datapoint \n\n"; + *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n"; + *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n"; - print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q); + print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q); *this->output_file << "\n"; this->output_file.reset(); } @@ -150,7 +151,7 @@ void MCMSearch::hierarchical_merging(){ // Write to the output file if (this->output_file){ - *this->output_file << "Merging components " << best_i << " and " << best_j << " \t Log-evidence (q-its/datapoint): "<< (this->mcm_out.log_ev) / (this->data->N * log(this->data->q)) << "\n"; + *this->output_file << "Merging components " << best_i << " and " << best_j << " \t Log-evidence (q-its/datapoint): "<< (this->mcm_out.log_ev) / (this->data->N_synthetic * log(this->data->q)) << "\n"; } } } diff --git a/src/utilities/partition.cpp b/src/utilities/partition.cpp index 99e5569..675a29f 100644 --- a/src/utilities/partition.cpp +++ b/src/utilities/partition.cpp @@ -1,32 +1,6 @@ #include "utilities/partition.h" #include "utilities/miscellaneous.h" -/* -std::vector generate_random_partition(int n){ - // Check if the number of variables is a valid number - if (n > 128){ - throw std::domain_error("The maximum system size is 128 variables."); - } - if (n < 0){ - throw std::domain_error("The number of variables should be a positive number."); - } - - std::vector growth_string(n, 0); - - int index; - int max_index = 2; - for (int i = 1; i < n; i++){ - index = rand()/(RAND_MAX/max_index); - growth_string[i] = index; - - if (index == (max_index-1)){ - max_index += 1; - } - } - - return growth_string; -}*/ - std::vector<__uint128_t> generate_random_partition(int n){ // Check if the number of variables is a valid number if (n > 128){ diff --git a/tests/basis/basis.cpp b/tests/basis/basis.cpp index 9bf0303..4c5c4c8 100644 --- a/tests/basis/basis.cpp +++ b/tests/basis/basis.cpp @@ -49,13 +49,10 @@ TEST(basis, init_n){ Basis basis(n, q); exp_spin_ops = {{1,0,0}, {2,0,0}, {4,0,0}}; - spin_ops = basis.get_basis(); - exp_spin_ops_array = {{1,0,0}, {0,1,0}, {0,0,1}}; - spin_ops_array = basis.get_basis_matrix(); - EXPECT_EQ(exp_spin_ops, spin_ops); - EXPECT_EQ(exp_spin_ops_array, spin_ops_array); + EXPECT_EQ(exp_spin_ops, basis.get_basis()); + EXPECT_EQ(exp_spin_ops_array, basis.get_basis_matrix()); EXPECT_EQ(basis.get_n(), n); EXPECT_EQ(basis.get_q(), q); EXPECT_EQ(basis.get_n_ints(), 3); @@ -186,6 +183,14 @@ TEST(basis, init_spin_ops){ catch(std::invalid_argument const & err) { EXPECT_EQ(err.what(), std::string("Given basis is not linearly independent.")); } + + // Reset to default basis + exp_spin_ops = {{1,0}, {2,0}, {4,0}, {8,0}}; + exp_matrix = {{1,0,0,0}, {0,1,0,0}, {0,0,1,0}, {0,0,0,1}}; + basis.set_basis_default(); + + EXPECT_EQ(basis.get_basis(), exp_spin_ops); + EXPECT_EQ(basis.get_basis_matrix(), exp_matrix); } TEST(basis, init_file){