From f6052920346bf266403e0c0295f553c3ccf76d26 Mon Sep 17 00:00:00 2001
From: AaronDC60 <aaron.de.clercq1@gmail.com>
Date: Fri, 27 Jun 2025 17:35:57 +0200
Subject: [PATCH 01/21] Add option to change the number of datapoints

---
 include/data/dataset.h                 |  9 +++++++++
 mcmpy/include/py_dataset.h             |  3 +++
 mcmpy/src/py_dataset.cpp               |  4 +++-
 src/data/complexity.cpp                |  2 +-
 src/data/dataset.cpp                   | 10 ++++++++++
 src/data/evidence.cpp                  |  9 +++++----
 src/data/likelihood.cpp                |  5 +++--
 src/search/mcm_search/annealing.cpp    | 13 +++++++------
 src/search/mcm_search/div_and_conq.cpp | 14 +++++++-------
 src/search/mcm_search/greedy.cpp       | 13 +++++++------
 src/utilities/partition.cpp            | 26 --------------------------
 11 files changed, 55 insertions(+), 53 deletions(-)

diff --git a/include/data/dataset.h b/include/data/dataset.h
index 2c7a21c..653460a 100644
--- a/include/data/dataset.h
+++ b/include/data/dataset.h
@@ -31,6 +31,14 @@ class Data {
      */
     Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>& _dataset, int n_var, int n_states, int n_samples);
 
+    /**
+     * Change the number of assumed datapoints in the dataset.
+     * This can be changed to perform an analysis of the dataset under the assumption that it is larger or smaller.
+     * 
+     * @param n_datapoints          The new number of assumed datapoints in the dataset.
+     */
+    void set_N_assumed(int n_datapoints);
+
     /**
      * Calculate the entropy of the dataset.
      * 
@@ -128,6 +136,7 @@ class Data {
     int N; // Number of datapoints
     int N_unique; // Number of different datapoints
     int n_ints; // Number of 128bit integers necessary to represent the data
+    int N_assumed; // The assumed number of datapoints in the dataset
 
     std::vector<__uint128_t> pow_q; // Vector containing the first n powers of q used to speed up the calculation of the evidence
 };
diff --git a/mcmpy/include/py_dataset.h b/mcmpy/include/py_dataset.h
index c4ca24f..f138340 100644
--- a/mcmpy/include/py_dataset.h
+++ b/mcmpy/include/py_dataset.h
@@ -65,9 +65,12 @@ class PyData {
 
     int get_n() {return this->data.n;};
     int get_N() {return this->data.N;};
+    int get_N_assumed() {return this->data.N_assumed;};
     int get_N_unique() {return this->data.N_unique;};
     int get_q() {return this->data.q;};
 
+    void set_N_assumed(int n_datapoints) {this->data.set_N_assumed(n_datapoints);};
+
     Data data;
 };
 
diff --git a/mcmpy/src/py_dataset.cpp b/mcmpy/src/py_dataset.cpp
index de74447..8ecaf11 100644
--- a/mcmpy/src/py_dataset.cpp
+++ b/mcmpy/src/py_dataset.cpp
@@ -187,8 +187,10 @@ void bind_data_class(py::module &m) {
 
         .def("entropy", &PyData::entropy, py::arg("base") = -1)
         .def("entropy_of_spin_operator", &PyData::entropy_of_spin_op, py::arg("spin_op"))
+
         .def_property_readonly("n", &PyData::get_n)
         .def_property_readonly("q", &PyData::get_q)
         .def_property_readonly("N", &PyData::get_N)
-        .def_property_readonly("N_unique", &PyData::get_N_unique);
+        .def_property_readonly("N_unique", &PyData::get_N_unique)
+        .def_property("N_assumed", &PyData::get_N_assumed, &PyData::set_N_assumed);
 }
diff --git a/src/data/complexity.cpp b/src/data/complexity.cpp
index 26aad50..5562997 100644
--- a/src/data/complexity.cpp
+++ b/src/data/complexity.cpp
@@ -22,7 +22,7 @@ double Data::calc_param_complexity_icc(__uint128_t component){
     // Determine the size of the component
     int r = bit_count(component);
     __uint128_t n_param = this->pow_q[r] - 1;
-    return (n_param / 2.) * log(this->N / (2 * M_PI));
+    return (n_param / 2.) * log(this->N_assumed / (2 * M_PI));
 }
 
 double Data::calc_param_complexity(std::vector<__uint128_t>& partition){
diff --git a/src/data/dataset.cpp b/src/data/dataset.cpp
index 36158fb..b95dbb9 100644
--- a/src/data/dataset.cpp
+++ b/src/data/dataset.cpp
@@ -17,6 +17,7 @@ Data::Data(const std::string& filename, int n_var, int n_states){
     this->n_ints = ceil(log2(q));
     // Process the dataset
     this->N = processing(filename, n_var, n_ints, n_states, dataset);
+    this->N_assumed = this->N;
     this->N_unique = dataset.size();
 
     // Precompute the powers of q to speed up the calculation of the evidence (q^r)
@@ -34,6 +35,7 @@ Data::Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>&
     this->q = n_states;
     this->dataset = _dataset;
     this->N = n_samples;
+    this->N_assumed = this->N;
     this->N_unique = _dataset.size();
 
     // Calculate the number of integers necessary to represent the data
@@ -48,6 +50,14 @@ Data::Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>&
     }
 }
 
+void Data::set_N_assumed(int n_datapoints){
+    // Check if the given number of datapoints is valid
+    if (n_datapoints < 1){
+        throw std::invalid_argument("The number of datapoints should be a positive number.");
+    }
+    this->N_assumed = n_datapoints;
+}
+
 double Data::entropy(int base){
     // Default base is q
     if (base == -1){
diff --git a/src/data/evidence.cpp b/src/data/evidence.cpp
index 73905d5..1d6aef4 100644
--- a/src/data/evidence.cpp
+++ b/src/data/evidence.cpp
@@ -5,21 +5,22 @@ double Data::calc_log_ev_icc(__uint128_t component){
     double log_evidence = 0;
     // Determine the size of the component
     int r = bit_count(component);
+    double alpha = this->N_assumed / this->N;
     // Contributions from the datapoint frequencies
     std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128> counts = build_histogram(*this, component);
     std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128>::iterator count_iter = counts.begin();
     while (count_iter != counts.end()){
-        log_evidence += (lgamma(count_iter->second + 0.5) - 0.5 * log(M_PI));
+        log_evidence += (lgamma(alpha * count_iter->second + 0.5) - 0.5 * log(M_PI));
         ++count_iter;
     }
 
     // Calculate prefactor
     if (r > 25){
         // Approximate for large components because lgamma overflows
-        log_evidence -= (r * log(this->q) * this->N);
+        log_evidence -= (r * log(this->q) * this->N_assumed);
     }
     else{
-        log_evidence += lgamma(this->pow_q[r]/2.) - lgamma(this->N + pow_q[r]/2.);
+        log_evidence += lgamma(this->pow_q[r]/2.) - lgamma(this->N_assumed + pow_q[r]/2.);
     }
     
     return log_evidence;
@@ -37,7 +38,7 @@ double Data::calc_log_ev(std::vector<__uint128_t>& partition){
         }
     }
     // Contribution from variables not in the model
-    log_ev -= this->N * (this->n - r) * log(this->q);
+    log_ev -= this->N_assumed * (this->n - r) * log(this->q);
 
     return log_ev;
 }
diff --git a/src/data/likelihood.cpp b/src/data/likelihood.cpp
index e61d62c..e5b5e33 100644
--- a/src/data/likelihood.cpp
+++ b/src/data/likelihood.cpp
@@ -4,13 +4,14 @@
 double Data::calc_log_likelihood_icc(__uint128_t component){
     double log_likelihood = 0;
     double N_datapoints = this->N;
+    double alpha = this->N_assumed / this->N;
     // Determine the size of the component
     int r = bit_count(component);
     // Get the datapoint frequencies
     std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128> counts = build_histogram(*this, component);
     std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128>::iterator count_iter = counts.begin();
     while (count_iter != counts.end()){
-        log_likelihood += count_iter->second * log(count_iter->second / N_datapoints);
+        log_likelihood += alpha * count_iter->second * log(count_iter->second / N_datapoints);
         ++count_iter;
     }
     return log_likelihood;
@@ -28,7 +29,7 @@ double Data::calc_log_likelihood(std::vector<__uint128_t>& partition){
         }
     }
     // Contribution from variables not in the model
-    log_likelihood -= this->N * (this->n - r) * log(this->q);
+    log_likelihood -= this->N_assumed * (this->n - r) * log(this->q);
     
     return log_likelihood;
 }
\ No newline at end of file
diff --git a/src/search/mcm_search/annealing.cpp b/src/search/mcm_search/annealing.cpp
index 80e4ba7..ddced3a 100644
--- a/src/search/mcm_search/annealing.cpp
+++ b/src/search/mcm_search/annealing.cpp
@@ -51,15 +51,16 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n
         *this->output_file << "Number of variables: " << data.n << "\n";
         *this->output_file << "Number of states per variable: " << data.q << "\n";
         *this->output_file << "Number of datapoints: " << data.N << "\n";
+        *this->output_file << "Number of assumed datapoints: " << data.N_assumed << "\n";
         *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n";
         *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n";
 
         *this->output_file << "Initial partition \n";
         *this->output_file << "----------------- \n\n";
 
-        *this->output_file << "Log-evidence: " << mcm_tmp.log_ev << " = " << mcm_tmp.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << mcm_tmp.log_ev << " = " << mcm_tmp.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";
 
-        print_partition_details_to_file(*this->output_file, mcm_tmp, data.N, data.q);
+        print_partition_details_to_file(*this->output_file, mcm_tmp, data.N_assumed, data.q);
         *this->output_file << "\n";
 
         *this->output_file << "Start annealing \n";
@@ -114,7 +115,7 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n
 
             // Write to output file
             if (this->output_file){
-                *this->output_file << "Iteration " << i << " \t\t Temperature: " << settings.temp << " \t\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N * log(this->data->q)) << "\n";
+                *this->output_file << "Iteration " << i << " \t\t Temperature: " << settings.temp << " \t\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_assumed * log(this->data->q)) << "\n";
             }
         }
         else{steps_since_improve++;}
@@ -161,10 +162,10 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n
         *this->output_file << "\nFinal partition \n";
         *this->output_file << "----------------- \n\n";
 
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n";
-        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n";
+        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";
 
-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
         *this->output_file << "\n";
         this->output_file.reset();
     }
diff --git a/src/search/mcm_search/div_and_conq.cpp b/src/search/mcm_search/div_and_conq.cpp
index 96966b8..5e5e441 100644
--- a/src/search/mcm_search/div_and_conq.cpp
+++ b/src/search/mcm_search/div_and_conq.cpp
@@ -50,15 +50,16 @@ MCM MCMSearch::divide_and_conquer(Data& data, MCM* init_mcm, std::string file_na
         *this->output_file << "Number of variables: " << data.n << "\n";
         *this->output_file << "Number of states per variable: " << data.q << "\n";
         *this->output_file << "Number of datapoints: " << data.N << "\n";
+        *this->output_file << "Number of assumed datapoints: " << data.N_assumed << "\n";
         *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n";
         *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n";
 
         *this->output_file << "Initial partition \n";
         *this->output_file << "----------------- \n\n";
 
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";
 
-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
         *this->output_file << "\n";
     
         *this->output_file << "Start dividing \n";
@@ -83,10 +84,10 @@ MCM MCMSearch::divide_and_conquer(Data& data, MCM* init_mcm, std::string file_na
         *this->output_file << "\nFinal partition \n";
         *this->output_file << "----------------- \n\n";
 
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n";
-        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n";
+        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";
 
-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
         *this->output_file << "\n";
         this->output_file.reset();
     }
@@ -174,8 +175,7 @@ int MCMSearch::division(int move_from, int move_to){
     this->log_evidence_trajectory.push_back(this->mcm_out.log_ev);
 
     if (this->output_file){
-        std::cout << "Should only print with output file";
-        *this->output_file << "Splitting component " << move_from << "\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N * log(this->data->q)) << "\n";
+        *this->output_file << "Splitting component " << move_from << "\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_assumed * log(this->data->q)) << "\n";
         *this->output_file << "\t Component " << move_from << " : \t" + int_to_string(this->mcm_out.partition[move_from], this->mcm_out.n) << "\n";
         *this->output_file << "\t Component " << move_to << " : \t" + int_to_string(this->mcm_out.partition[move_to], this->mcm_out.n) << "\n\n";
     }
diff --git a/src/search/mcm_search/greedy.cpp b/src/search/mcm_search/greedy.cpp
index 6acb93f..2378003 100644
--- a/src/search/mcm_search/greedy.cpp
+++ b/src/search/mcm_search/greedy.cpp
@@ -49,15 +49,16 @@ MCM MCMSearch::greedy_search(Data& data, MCM* init_mcm, std::string file_name){
         *this->output_file << "Number of variables: " << data.n << "\n";
         *this->output_file << "Number of states per variable: " << data.q << "\n";
         *this->output_file << "Number of datapoints: " << data.N << "\n";
+        *this->output_file << "Number of assumed datapoints: " << data.N_assumed << "\n";
         *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n";
         *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n";
 
         *this->output_file << "Initial partition \n";
         *this->output_file << "----------------- \n\n";
 
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";
 
-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
         *this->output_file << "\n";
     }
 
@@ -76,10 +77,10 @@ MCM MCMSearch::greedy_search(Data& data, MCM* init_mcm, std::string file_name){
         *this->output_file << "\nFinal partition \n";
         *this->output_file << "----------------- \n\n";
 
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N * log(data.q)) << " q-its/datapoint \n";
-        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n";
+        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";
 
-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
         *this->output_file << "\n";
         this->output_file.reset();
     }
@@ -150,7 +151,7 @@ void MCMSearch::hierarchical_merging(){
 
             // Write to the output file
             if (this->output_file){
-                *this->output_file << "Merging components " << best_i << " and " << best_j << " \t Log-evidence (q-its/datapoint): "<<  (this->mcm_out.log_ev) / (this->data->N * log(this->data->q)) << "\n";
+                *this->output_file << "Merging components " << best_i << " and " << best_j << " \t Log-evidence (q-its/datapoint): "<<  (this->mcm_out.log_ev) / (this->data->N_assumed * log(this->data->q)) << "\n";
             }
         }
     }
diff --git a/src/utilities/partition.cpp b/src/utilities/partition.cpp
index 99e5569..675a29f 100644
--- a/src/utilities/partition.cpp
+++ b/src/utilities/partition.cpp
@@ -1,32 +1,6 @@
 #include "utilities/partition.h"
 #include "utilities/miscellaneous.h"
 
-/*
-std::vector<int> generate_random_partition(int n){
-    // Check if the number of variables is a valid number
-    if (n > 128){
-        throw std::domain_error("The maximum system size is 128 variables.");
-    }
-    if (n < 0){
-        throw std::domain_error("The number of variables should be a positive number.");
-    }
-
-    std::vector<int> growth_string(n, 0);
-
-    int index;
-    int max_index = 2;
-    for (int i = 1; i < n; i++){
-        index = rand()/(RAND_MAX/max_index);
-        growth_string[i] = index;
-
-        if (index == (max_index-1)){
-            max_index += 1;
-        }
-    }
-
-    return growth_string;
-}*/
-
 std::vector<__uint128_t> generate_random_partition(int n){
     // Check if the number of variables is a valid number
     if (n > 128){

From c96a71a881d6953ed62383f0faaf50bbbe86b8c5 Mon Sep 17 00:00:00 2001
From: AaronDC60 <aaron.de.clercq1@gmail.com>
Date: Fri, 27 Jun 2025 17:52:04 +0200
Subject: [PATCH 02/21] Update documentation

---
 docs/api/data.rst | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/docs/api/data.rst b/docs/api/data.rst
index 85bbaee..28c8f2e 100644
--- a/docs/api/data.rst
+++ b/docs/api/data.rst
@@ -240,6 +240,13 @@ to infer either the best minimally complex (MCM) or the optimal basis representa
       
       The actual number of datapoints in the dataset (read-only).
 
+   .. py:attribute:: N_assumed
+      :type: int
+
+      The assumed number of datapoints in the dataset.
+
+      This attribute can be changed to perform an analysis of the dataset under the assumption that it is either larger or smaller.
+
    .. py:attribute:: N_unique
       :type: int
       

From 49b08534280c0cc4247bdf48bc2f32040d31ae27 Mon Sep 17 00:00:00 2001
From: AaronDC60 <aaron.de.clercq1@gmail.com>
Date: Fri, 27 Jun 2025 19:05:54 +0200
Subject: [PATCH 03/21] Change variable name

---
 docs/api/data.rst                      |  6 +++---
 include/data/dataset.h                 | 10 +++++-----
 mcmpy/include/py_dataset.h             |  4 ++--
 mcmpy/src/py_dataset.cpp               |  2 +-
 src/data/complexity.cpp                |  2 +-
 src/data/dataset.cpp                   |  8 ++++----
 src/data/evidence.cpp                  |  8 ++++----
 src/data/likelihood.cpp                |  4 ++--
 src/search/mcm_search/annealing.cpp    | 14 +++++++-------
 src/search/mcm_search/div_and_conq.cpp | 14 +++++++-------
 src/search/mcm_search/greedy.cpp       | 14 +++++++-------
 11 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/docs/api/data.rst b/docs/api/data.rst
index 28c8f2e..2630757 100644
--- a/docs/api/data.rst
+++ b/docs/api/data.rst
@@ -240,12 +240,12 @@ to infer either the best minimally complex (MCM) or the optimal basis representa
       
       The actual number of datapoints in the dataset (read-only).
 
-   .. py:attribute:: N_assumed
+   .. py:attribute:: N_synthetic
       :type: int
 
-      The assumed number of datapoints in the dataset.
+      The synthetic number of datapoints in the dataset.
 
-      This attribute can be changed to perform an analysis of the dataset under the assumption that it is either larger or smaller.
+      This attribute can be changed to perform an analysis of the dataset as if it is either larger or smaller.
 
    .. py:attribute:: N_unique
       :type: int
diff --git a/include/data/dataset.h b/include/data/dataset.h
index 653460a..c8cd02b 100644
--- a/include/data/dataset.h
+++ b/include/data/dataset.h
@@ -32,12 +32,12 @@ class Data {
     Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>& _dataset, int n_var, int n_states, int n_samples);
 
     /**
-     * Change the number of assumed datapoints in the dataset.
-     * This can be changed to perform an analysis of the dataset under the assumption that it is larger or smaller.
+     * Change the value for the number of datapoints in the dataset that is used for analysis.
+     * This can be changed to perform an analysis of the dataset as if it is larger or smaller.
      * 
-     * @param n_datapoints          The new number of assumed datapoints in the dataset.
+     * @param n_datapoints          The synthetic number of datapoints in the dataset.
      */
-    void set_N_assumed(int n_datapoints);
+    void set_N_synthetic(int n_datapoints);
 
     /**
      * Calculate the entropy of the dataset.
@@ -136,7 +136,7 @@ class Data {
     int N; // Number of datapoints
     int N_unique; // Number of different datapoints
     int n_ints; // Number of 128bit integers necessary to represent the data
-    int N_assumed; // The assumed number of datapoints in the dataset
+    int N_synthetic; // The synthetic number of datapoints in the dataset
 
     std::vector<__uint128_t> pow_q; // Vector containing the first n powers of q used to speed up the calculation of the evidence
 };
diff --git a/mcmpy/include/py_dataset.h b/mcmpy/include/py_dataset.h
index f138340..e3a5648 100644
--- a/mcmpy/include/py_dataset.h
+++ b/mcmpy/include/py_dataset.h
@@ -65,11 +65,11 @@ class PyData {
 
     int get_n() {return this->data.n;};
     int get_N() {return this->data.N;};
-    int get_N_assumed() {return this->data.N_assumed;};
+    int get_N_synthetic() {return this->data.N_synthetic;};
     int get_N_unique() {return this->data.N_unique;};
     int get_q() {return this->data.q;};
 
-    void set_N_assumed(int n_datapoints) {this->data.set_N_assumed(n_datapoints);};
+    void set_N_synthetic(int n_datapoints) {this->data.set_N_synthetic(n_datapoints);};
 
     Data data;
 };
diff --git a/mcmpy/src/py_dataset.cpp b/mcmpy/src/py_dataset.cpp
index 8ecaf11..ade3dd7 100644
--- a/mcmpy/src/py_dataset.cpp
+++ b/mcmpy/src/py_dataset.cpp
@@ -192,5 +192,5 @@ void bind_data_class(py::module &m) {
         .def_property_readonly("q", &PyData::get_q)
         .def_property_readonly("N", &PyData::get_N)
         .def_property_readonly("N_unique", &PyData::get_N_unique)
-        .def_property("N_assumed", &PyData::get_N_assumed, &PyData::set_N_assumed);
+        .def_property("N_synthetic", &PyData::get_N_synthetic, &PyData::set_N_synthetic);
 }
diff --git a/src/data/complexity.cpp b/src/data/complexity.cpp
index 5562997..dbbf3f0 100644
--- a/src/data/complexity.cpp
+++ b/src/data/complexity.cpp
@@ -22,7 +22,7 @@ double Data::calc_param_complexity_icc(__uint128_t component){
     // Determine the size of the component
     int r = bit_count(component);
     __uint128_t n_param = this->pow_q[r] - 1;
-    return (n_param / 2.) * log(this->N_assumed / (2 * M_PI));
+    return (n_param / 2.) * log(this->N_synthetic / (2 * M_PI));
 }
 
 double Data::calc_param_complexity(std::vector<__uint128_t>& partition){
diff --git a/src/data/dataset.cpp b/src/data/dataset.cpp
index b95dbb9..37d35f4 100644
--- a/src/data/dataset.cpp
+++ b/src/data/dataset.cpp
@@ -17,7 +17,7 @@ Data::Data(const std::string& filename, int n_var, int n_states){
     this->n_ints = ceil(log2(q));
     // Process the dataset
     this->N = processing(filename, n_var, n_ints, n_states, dataset);
-    this->N_assumed = this->N;
+    this->N_synthetic = this->N;
     this->N_unique = dataset.size();
 
     // Precompute the powers of q to speed up the calculation of the evidence (q^r)
@@ -35,7 +35,7 @@ Data::Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>&
     this->q = n_states;
     this->dataset = _dataset;
     this->N = n_samples;
-    this->N_assumed = this->N;
+    this->N_synthetic = this->N;
     this->N_unique = _dataset.size();
 
     // Calculate the number of integers necessary to represent the data
@@ -50,12 +50,12 @@ Data::Data(const std::vector<std::pair<std::vector<__uint128_t>, unsigned int>>&
     }
 }
 
-void Data::set_N_assumed(int n_datapoints){
+void Data::set_N_synthetic(int n_datapoints){
     // Check if the given number of datapoints is valid
     if (n_datapoints < 1){
         throw std::invalid_argument("The number of datapoints should be a positive number.");
     }
-    this->N_assumed = n_datapoints;
+    this->N_synthetic = n_datapoints;
 }
 
 double Data::entropy(int base){
diff --git a/src/data/evidence.cpp b/src/data/evidence.cpp
index 1d6aef4..2f16a65 100644
--- a/src/data/evidence.cpp
+++ b/src/data/evidence.cpp
@@ -5,7 +5,7 @@ double Data::calc_log_ev_icc(__uint128_t component){
     double log_evidence = 0;
     // Determine the size of the component
     int r = bit_count(component);
-    double alpha = this->N_assumed / this->N;
+    double alpha = this->N_synthetic / this->N;
     // Contributions from the datapoint frequencies
     std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128> counts = build_histogram(*this, component);
     std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128>::iterator count_iter = counts.begin();
@@ -17,10 +17,10 @@ double Data::calc_log_ev_icc(__uint128_t component){
     // Calculate prefactor
     if (r > 25){
         // Approximate for large components because lgamma overflows
-        log_evidence -= (r * log(this->q) * this->N_assumed);
+        log_evidence -= (r * log(this->q) * this->N_synthetic);
     }
     else{
-        log_evidence += lgamma(this->pow_q[r]/2.) - lgamma(this->N_assumed + pow_q[r]/2.);
+        log_evidence += lgamma(this->pow_q[r]/2.) - lgamma(this->N_synthetic + pow_q[r]/2.);
     }
     
     return log_evidence;
@@ -38,7 +38,7 @@ double Data::calc_log_ev(std::vector<__uint128_t>& partition){
         }
     }
     // Contribution from variables not in the model
-    log_ev -= this->N_assumed * (this->n - r) * log(this->q);
+    log_ev -= this->N_synthetic * (this->n - r) * log(this->q);
 
     return log_ev;
 }
diff --git a/src/data/likelihood.cpp b/src/data/likelihood.cpp
index e5b5e33..d1bfafb 100644
--- a/src/data/likelihood.cpp
+++ b/src/data/likelihood.cpp
@@ -4,7 +4,7 @@
 double Data::calc_log_likelihood_icc(__uint128_t component){
     double log_likelihood = 0;
     double N_datapoints = this->N;
-    double alpha = this->N_assumed / this->N;
+    double alpha = this->N_synthetic / this->N;
     // Determine the size of the component
     int r = bit_count(component);
     // Get the datapoint frequencies
@@ -29,7 +29,7 @@ double Data::calc_log_likelihood(std::vector<__uint128_t>& partition){
         }
     }
     // Contribution from variables not in the model
-    log_likelihood -= this->N_assumed * (this->n - r) * log(this->q);
+    log_likelihood -= this->N_synthetic * (this->n - r) * log(this->q);
     
     return log_likelihood;
 }
\ No newline at end of file
diff --git a/src/search/mcm_search/annealing.cpp b/src/search/mcm_search/annealing.cpp
index ddced3a..26b669d 100644
--- a/src/search/mcm_search/annealing.cpp
+++ b/src/search/mcm_search/annealing.cpp
@@ -51,16 +51,16 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n
         *this->output_file << "Number of variables: " << data.n << "\n";
         *this->output_file << "Number of states per variable: " << data.q << "\n";
         *this->output_file << "Number of datapoints: " << data.N << "\n";
-        *this->output_file << "Number of assumed datapoints: " << data.N_assumed << "\n";
+        *this->output_file << "Number of synthetic datapoints: " << data.N_synthetic << "\n";
         *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n";
         *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n";
 
         *this->output_file << "Initial partition \n";
         *this->output_file << "----------------- \n\n";
 
-        *this->output_file << "Log-evidence: " << mcm_tmp.log_ev << " = " << mcm_tmp.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << mcm_tmp.log_ev << " = " << mcm_tmp.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";
 
-        print_partition_details_to_file(*this->output_file, mcm_tmp, data.N_assumed, data.q);
+        print_partition_details_to_file(*this->output_file, mcm_tmp, data.N_synthetic, data.q);
         *this->output_file << "\n";
 
         *this->output_file << "Start annealing \n";
@@ -115,7 +115,7 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n
 
             // Write to output file
             if (this->output_file){
-                *this->output_file << "Iteration " << i << " \t\t Temperature: " << settings.temp << " \t\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_assumed * log(this->data->q)) << "\n";
+                *this->output_file << "Iteration " << i << " \t\t Temperature: " << settings.temp << " \t\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_synthetic * log(this->data->q)) << "\n";
             }
         }
         else{steps_since_improve++;}
@@ -162,10 +162,10 @@ MCM MCMSearch::simulated_annealing(Data& data, MCM* init_mcm, std::string file_n
         *this->output_file << "\nFinal partition \n";
         *this->output_file << "----------------- \n\n";
 
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n";
-        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n";
+        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";
 
-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q);
         *this->output_file << "\n";
         this->output_file.reset();
     }
diff --git a/src/search/mcm_search/div_and_conq.cpp b/src/search/mcm_search/div_and_conq.cpp
index 5e5e441..361f987 100644
--- a/src/search/mcm_search/div_and_conq.cpp
+++ b/src/search/mcm_search/div_and_conq.cpp
@@ -50,16 +50,16 @@ MCM MCMSearch::divide_and_conquer(Data& data, MCM* init_mcm, std::string file_na
         *this->output_file << "Number of variables: " << data.n << "\n";
         *this->output_file << "Number of states per variable: " << data.q << "\n";
         *this->output_file << "Number of datapoints: " << data.N << "\n";
-        *this->output_file << "Number of assumed datapoints: " << data.N_assumed << "\n";
+        *this->output_file << "Number of synthetic datapoints: " << data.N_synthetic << "\n";
         *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n";
         *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n";
 
         *this->output_file << "Initial partition \n";
         *this->output_file << "----------------- \n\n";
 
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";
 
-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q);
         *this->output_file << "\n";
     
         *this->output_file << "Start dividing \n";
@@ -84,10 +84,10 @@ MCM MCMSearch::divide_and_conquer(Data& data, MCM* init_mcm, std::string file_na
         *this->output_file << "\nFinal partition \n";
         *this->output_file << "----------------- \n\n";
 
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n";
-        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n";
+        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";
 
-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q);
         *this->output_file << "\n";
         this->output_file.reset();
     }
@@ -175,7 +175,7 @@ int MCMSearch::division(int move_from, int move_to){
     this->log_evidence_trajectory.push_back(this->mcm_out.log_ev);
 
     if (this->output_file){
-        *this->output_file << "Splitting component " << move_from << "\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_assumed * log(this->data->q)) << "\n";
+        *this->output_file << "Splitting component " << move_from << "\t Log-evidence (q-its/datapoint): " << this->mcm_out.log_ev / (this->data->N_synthetic * log(this->data->q)) << "\n";
         *this->output_file << "\t Component " << move_from << " : \t" + int_to_string(this->mcm_out.partition[move_from], this->mcm_out.n) << "\n";
         *this->output_file << "\t Component " << move_to << " : \t" + int_to_string(this->mcm_out.partition[move_to], this->mcm_out.n) << "\n\n";
     }
diff --git a/src/search/mcm_search/greedy.cpp b/src/search/mcm_search/greedy.cpp
index 2378003..a176b94 100644
--- a/src/search/mcm_search/greedy.cpp
+++ b/src/search/mcm_search/greedy.cpp
@@ -49,16 +49,16 @@ MCM MCMSearch::greedy_search(Data& data, MCM* init_mcm, std::string file_name){
         *this->output_file << "Number of variables: " << data.n << "\n";
         *this->output_file << "Number of states per variable: " << data.q << "\n";
         *this->output_file << "Number of datapoints: " << data.N << "\n";
-        *this->output_file << "Number of assumed datapoints: " << data.N_assumed << "\n";
+        *this->output_file << "Number of synthetic datapoints: " << data.N_synthetic << "\n";
         *this->output_file << "Number of unique datapoint: " << data.N_unique << "\n";
         *this->output_file << "Entropy of the data: " << data.entropy() << " q-its \n\n";
 
         *this->output_file << "Initial partition \n";
         *this->output_file << "----------------- \n\n";
 
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";
 
-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q);
         *this->output_file << "\n";
     }
 
@@ -77,10 +77,10 @@ MCM MCMSearch::greedy_search(Data& data, MCM* init_mcm, std::string file_name){
         *this->output_file << "\nFinal partition \n";
         *this->output_file << "----------------- \n\n";
 
-        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_assumed * log(data.q)) << " q-its/datapoint \n";
-        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_assumed * log(data.q)) << " q-its/datapoint \n\n";
+        *this->output_file << "Log-evidence: " << this->mcm_out.log_ev << " = " << this->mcm_out.log_ev / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n";
+        *this->output_file << "Max-Log-likelihood: " << max_log_likelihood << " = " << max_log_likelihood / (data.N_synthetic * log(data.q)) << " q-its/datapoint \n\n";
 
-        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_assumed, data.q);
+        print_partition_details_to_file(*this->output_file, this->mcm_out, data.N_synthetic, data.q);
         *this->output_file << "\n";
         this->output_file.reset();
     }
@@ -151,7 +151,7 @@ void MCMSearch::hierarchical_merging(){
 
             // Write to the output file
             if (this->output_file){
-                *this->output_file << "Merging components " << best_i << " and " << best_j << " \t Log-evidence (q-its/datapoint): "<<  (this->mcm_out.log_ev) / (this->data->N_assumed * log(this->data->q)) << "\n";
+                *this->output_file << "Merging components " << best_i << " and " << best_j << " \t Log-evidence (q-its/datapoint): "<<  (this->mcm_out.log_ev) / (this->data->N_synthetic * log(this->data->q)) << "\n";
             }
         }
     }

From 7397dd90fe25f4035aba6569725b8f65c8daad4b Mon Sep 17 00:00:00 2001
From: AaronDC60 <aaron.de.clercq1@gmail.com>
Date: Fri, 27 Jun 2025 19:18:01 +0200
Subject: [PATCH 04/21] Fix calculation of alpha

---
 src/data/evidence.cpp   | 3 ++-
 src/data/likelihood.cpp | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/data/evidence.cpp b/src/data/evidence.cpp
index 2f16a65..87201c7 100644
--- a/src/data/evidence.cpp
+++ b/src/data/evidence.cpp
@@ -5,7 +5,8 @@ double Data::calc_log_ev_icc(__uint128_t component){
     double log_evidence = 0;
     // Determine the size of the component
     int r = bit_count(component);
-    double alpha = this->N_synthetic / this->N;
+    double N_syn = this->N_synthetic;
+    double alpha = N_syn / this->N;
     // Contributions from the datapoint frequencies
     std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128> counts = build_histogram(*this, component);
     std::unordered_map<std::vector<__uint128_t>, unsigned int, HashVector128>::iterator count_iter = counts.begin();
diff --git a/src/data/likelihood.cpp b/src/data/likelihood.cpp
index d1bfafb..fa0187e 100644
--- a/src/data/likelihood.cpp
+++ b/src/data/likelihood.cpp
@@ -4,7 +4,7 @@
 double Data::calc_log_likelihood_icc(__uint128_t component){
     double log_likelihood = 0;
     double N_datapoints = this->N;
-    double alpha = this->N_synthetic / this->N;
+    double alpha = this->N_synthetic / N_datapoints;
     // Determine the size of the component
     int r = bit_count(component);
     // Get the datapoint frequencies

From 19946e4094dd135ffc3f1e6e79ae2f7141a2e807 Mon Sep 17 00:00:00 2001
From: AaronDC60 <aaron.de.clercq1@gmail.com>
Date: Tue, 8 Jul 2025 20:39:31 +0200
Subject: [PATCH 05/21] Test pydata class

---
 mcmpy/include/py_dataset.h |  2 +-
 mcmpy/src/py_dataset.cpp   | 10 +++----
 mcmpy/src/py_partition.cpp |  3 ++
 mcmpy/tests/test_data.py   | 59 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 68 insertions(+), 6 deletions(-)
 create mode 100644 mcmpy/tests/test_data.py

diff --git a/mcmpy/include/py_dataset.h b/mcmpy/include/py_dataset.h
index e3a5648..86093df 100644
--- a/mcmpy/include/py_dataset.h
+++ b/mcmpy/include/py_dataset.h
@@ -12,7 +12,7 @@
 
 namespace py = pybind11;
 
-std::vector<__uint128_t> convert_spin_op_from_py(const py::array_t<uint8_t>& spin_op, int q, int n_ints);
+std::vector<__uint128_t> convert_spin_op_from_py(const py::array_t<uint8_t>& spin_op, int q, int n_ints, int n);
 
 class PyData {
 public:
diff --git a/mcmpy/src/py_dataset.cpp b/mcmpy/src/py_dataset.cpp
index ade3dd7..310ea82 100644
--- a/mcmpy/src/py_dataset.cpp
+++ b/mcmpy/src/py_dataset.cpp
@@ -1,6 +1,6 @@
 #include "py_dataset.h"
 
-std::vector<__uint128_t> convert_spin_op_from_py(const py::array_t<uint8_t>& spin_op, int q, int n_ints){
+std::vector<__uint128_t> convert_spin_op_from_py(const py::array_t<uint8_t>& spin_op, int q, int n_ints, int n){
     py::buffer_info buff = spin_op.request();
 
     // Check if there is only one dimension
@@ -9,9 +9,9 @@ std::vector<__uint128_t> convert_spin_op_from_py(const py::array_t<uint8_t>& spi
         throw std::invalid_argument("The spin operator should be given as a 1D numpy array.");
     }
     // Check if the system size is valid
-    int n = buff.shape[0];
-    if (n > 128){
-        throw std::invalid_argument("The maximum system size is 128.");
+    int n_entries = buff.shape[0];
+    if (n_entries != n){
+        throw std::invalid_argument("The given spin operator doesn't contain n elements.");
     }
 
     std::vector<uint8_t> conv_spin_op(n, 0);
@@ -154,7 +154,7 @@ double PyData::entropy(int base){
 }
 
 double PyData::entropy_of_spin_op(const py::array_t<int8_t>& op){
-    std::vector<__uint128_t> spin_op = convert_spin_op_from_py(op, this->data.q, this->data.n_ints);
+    std::vector<__uint128_t> spin_op = convert_spin_op_from_py(op, this->data.q, this->data.n_ints, this->data.n);
     return calc_entropy_of_spin_op(this->data, spin_op);
 }
 
diff --git a/mcmpy/src/py_partition.cpp b/mcmpy/src/py_partition.cpp
index 399d531..3fdfd4e 100644
--- a/mcmpy/src/py_partition.cpp
+++ b/mcmpy/src/py_partition.cpp
@@ -76,6 +76,9 @@ std::vector<__uint128_t> convert_partition_from_py_2d_array(py::array_t<int8_t>&
         element = 1;
         for (int j = 0; j < n; j++){
             if(ptr[i*n + j]){
+                if(ptr[i*n + j] != 1){
+                    throw std::invalid_argument("Entries of the 2D array should be either 0 or 1.");
+                }
                 partition[i] += element;
             }
             element <<= 1;
diff --git a/mcmpy/tests/test_data.py b/mcmpy/tests/test_data.py
new file mode 100644
index 0000000..3ac8e2b
--- /dev/null
+++ b/mcmpy/tests/test_data.py
@@ -0,0 +1,59 @@
+import pytest
+import numpy as np
+from mcmpy import Data
+
+# Test information theoretic criteria against previous code
+# (https://github.com/clelidm/MinCompSpin_ExhaustiveSearch) by C. de Mulatier;
+# Results of the optimal MCM for the binary SCOTUS data in the original basis 
+
+def test_log_evidence(scotus_data_q2, opt_mcm_scotus_q2):
+    assert np.isclose(scotus_data_q2.log_evidence(opt_mcm_scotus_q2), -3300.4)
+
+def test_log_likelihood(scotus_data_q2, opt_mcm_scotus_q2):
+    assert np.isclose(scotus_data_q2.log_likelihood(opt_mcm_scotus_q2), -3194.36)
+
+def test_parametric_complexity(scotus_data_q2, opt_mcm_scotus_q2):
+    assert np.isclose(scotus_data_q2.complexity_parametric(opt_mcm_scotus_q2), 114.056)
+
+def test_geometric_complexity(scotus_data_q2, opt_mcm_scotus_q2):
+    assert np.isclose(scotus_data_q2.complexity_geometric(opt_mcm_scotus_q2), -8.95092)
+
+def test_total_complexity(scotus_data_q2, opt_mcm_scotus_q2):
+    parametric_complexity = scotus_data_q2.complexity_parametric(opt_mcm_scotus_q2)
+    geometric_complexity = scotus_data_q2.complexity_geometric(opt_mcm_scotus_q2)
+    assert np.isclose(parametric_complexity + geometric_complexity , 105.105)
+
+def test_mdl(scotus_data_q2, opt_mcm_scotus_q2):
+    assert np.isclose(scotus_data_q2.minimum_description_length(opt_mcm_scotus_q2), -3299.46)
+
+def test_log_evidence_icc(scotus_data_q2, opt_mcm_scotus_q2):
+    assert np.all(np.isclose(scotus_data_q2.log_evidence_icc(opt_mcm_scotus_q2), [-1754.41, -1545.98]))
+
+def test_log_likelihood_icc(scotus_data_q2, opt_mcm_scotus_q2):
+    assert np.all(np.isclose(scotus_data_q2.log_likelihood_icc(opt_mcm_scotus_q2), [-1686.28, -1508.08]))
+
+def test_parametric_complexity_icc(scotus_data_q2, opt_mcm_scotus_q2):
+    assert np.all(np.isclose(scotus_data_q2.complexity_parametric_icc(opt_mcm_scotus_q2), [76.8637, 37.1921]))
+
+def test_geometric_complexity_icc(scotus_data_q2, opt_mcm_scotus_q2):
+    assert np.all(np.isclose(scotus_data_q2.complexity_geometric_icc(opt_mcm_scotus_q2), [-9.58359, 0.632678]))
+
+
+# Input array test
+def test_partition_input(scotus_data_q2):
+    with pytest.raises(ValueError, match="The partition should be a 1D or 2D array."):
+        scotus_data_q2.log_evidence(np.ones((2,2,2)))
+    
+    with pytest.raises(ValueError, match="Entries of the 2D array should be either 0 or 1."):
+        scotus_data_q2.log_evidence([[1,2,0], [0,0,1]])
+
+def test_entropy_of_op_input(scotus_data_q2):
+    with pytest.raises(ValueError, match="The spin operator should be given as a 1D numpy array."):
+        scotus_data_q2.entropy_of_spin_operator(np.ones((2,2)))
+
+    with pytest.raises(ValueError, match="The vector should only contain values between 0 and q-1."):
+        scotus_data_q2.entropy_of_spin_operator([1,2,0,1,0,1,0,1,1])
+
+    with pytest.raises(ValueError, match="The given spin operator doesn't contain n elements."):
+        scotus_data_q2.entropy_of_spin_operator([1,0,1,0,1,0,1])
+        

From dde949e571f0b90634e260c7c51160b73d84e108 Mon Sep 17 00:00:00 2001
From: AaronDC60 <aaron.de.clercq1@gmail.com>
Date: Tue, 8 Jul 2025 20:40:20 +0200
Subject: [PATCH 06/21] Test pymcm class

---
 mcmpy/tests/test_mcm.py | 127 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 127 insertions(+)
 create mode 100644 mcmpy/tests/test_mcm.py

diff --git a/mcmpy/tests/test_mcm.py b/mcmpy/tests/test_mcm.py
new file mode 100644
index 0000000..d09371a
--- /dev/null
+++ b/mcmpy/tests/test_mcm.py
@@ -0,0 +1,127 @@
+import pytest
+import numpy as np
+from mcmpy import MCM, Data
+
+@pytest.fixture
+def model():
+    return MCM(8, [0,1,2,0,1,-1,5])
+
+# Test input with specific partition
+def test_mcm_array(model):
+    assert np.all(model.array == [[1,0,0,1,0,0,0,0],
+                                  [0,1,0,0,1,0,0,0],
+                                  [0,0,1,0,0,0,0,0],
+                                  [0,0,0,0,0,0,1,0]])
+
+def test_mcm_gray_code(model):
+    assert np.all(model.array_gray_code == [0,1,2,0,1,-1,3,-1])
+
+def test_read_only_properties(model):
+    assert(model.n == 8)
+    assert(model.n_icc == 4)
+    assert(model.rank == 6)
+    assert(model.is_optimized is False)
+
+def test_move_in_invalid(model):
+    with pytest.raises(ValueError, match="The variable index should be between 0 and n-1."):
+        model.move_variable_in(8,5)
+
+    with pytest.raises(ValueError, match="This variable is already present in the partition."):
+        model.move_variable_in(1,6)
+
+def test_move_in(model):
+    model.move_variable_in(5,2)
+    model.move_variable_in(7,7)
+
+    # Check gray code
+    assert np.all(model.array_gray_code == [0,1,2,0,1,2,3,4])
+
+    # Check binary array
+    assert np.all(model.array == [[1,0,0,1,0,0,0,0],
+                                  [0,1,0,0,1,0,0,0],
+                                  [0,0,1,0,0,1,0,0],
+                                  [0,0,0,0,0,0,1,0],
+                                  [0,0,0,0,0,0,0,1]])
+
+    # Check properties
+    assert(model.n == 8)
+    assert(model.n_icc == 5)
+    assert(model.rank == 8)
+    assert(model.is_optimized is False)
+
+def test_move_out_invalid(model):
+    with pytest.raises(ValueError, match="The variable index should be between 0 and n-1."):
+        model.move_variable_out(8)
+
+    with pytest.raises(ValueError, match="This variable is not present in the partition."):
+        model.move_variable_out(7)
+
+def test_move_out(model):
+    model.move_variable_out(2)
+
+    # Check gray code
+    assert np.all(model.array_gray_code == [0,1,-1,0,1,-1,2,-1])
+
+    # Check binary array
+    assert np.all(model.array == [[1,0,0,1,0,0,0,0],
+                                  [0,1,0,0,1,0,0,0],
+                                  [0,0,0,0,0,0,1,0]])
+
+    # Check properties
+    assert(model.n == 8)
+    assert(model.n_icc == 3)
+    assert(model.rank == 5)
+    assert(model.is_optimized is False)    
+
+def test_move(model):
+    model.move_variable(6,0)
+
+    # Check gray code
+    assert np.all(model.array_gray_code == [0,1,2,0,1,-1,0,-1])
+
+    # Check binary array
+    assert np.all(model.array == [[1,0,0,1,0,0,1,0],
+                                  [0,1,0,0,1,0,0,0],
+                                  [0,0,1,0,0,0,0,0]])
+
+    # Check properties
+    assert(model.n == 8)
+    assert(model.n_icc == 3)
+    assert(model.rank == 6)
+    assert(model.is_optimized is False)
+
+def test_properties():
+    # Independent model
+    model = MCM(12, "independent")
+    assert(model.n == 12)
+    assert(model.n_icc == 12)
+    assert(model.rank == 12)
+    assert(model.is_optimized is False) 
+
+    # Complete model
+    model = MCM(12, "complete")
+    assert(model.n == 12)
+    assert(model.n_icc == 1)
+    assert(model.rank == 12)
+    assert(model.is_optimized is False)
+
+def test_data_object(scotus_data_q2, opt_mcm_scotus_q2):
+    data = opt_mcm_scotus_q2.generate_data_object(10_000, scotus_data_q2)
+
+    assert data.N == 10_000
+    assert data.N_synthetic == 10_000
+    assert data.q == scotus_data_q2.q
+    assert data.n == scotus_data_q2.n
+
+def test_data_file(scotus_data_q2, opt_mcm_scotus_q2, tmp_path):
+    filepath = str(tmp_path / "test.dat")
+    print(type(filepath))
+    opt_mcm_scotus_q2.generate_data_file(10_000, scotus_data_q2, filepath)
+    # Create data object from file
+    data = Data(filepath, scotus_data_q2.n, scotus_data_q2.q)
+
+    assert data.N == 10_000
+    assert data.N_synthetic == 10_000
+    assert data.q == scotus_data_q2.q
+    assert data.n == scotus_data_q2.n
+

From 54ce859382b4a5b453085c078706acb38ccd437e Mon Sep 17 00:00:00 2001
From: AaronDC60 <aaron.de.clercq1@gmail.com>
Date: Tue, 8 Jul 2025 20:43:47 +0200
Subject: [PATCH 07/21] Add conftest file

---
 mcmpy/tests/conftest.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 mcmpy/tests/conftest.py

diff --git a/mcmpy/tests/conftest.py b/mcmpy/tests/conftest.py
new file mode 100644
index 0000000..a583bf9
--- /dev/null
+++ b/mcmpy/tests/conftest.py
@@ -0,0 +1,27 @@
+import pytest
+from mcmpy import Data, MCM
+
+# Data
+
+@pytest.fixture
+def scotus_data_q2():
+    return Data("../../input/US_SupremeCourt_n9_N895.dat", 9, 2)
+
+@pytest.fixture
+def scotus_data_q3():
+    return Data("../../input/US_SupremeCourt_n9_N895.dat", 9, 3)
+
+@pytest.fixture
+def scotus_data_q4():
+    return Data("../../input/US_SupremeCourt_n9_N895.dat", 9, 4)
+
+@pytest.fixture
+def scotus_data_q5():
+    return Data("../../input/US_SupremeCourt_n9_N895.dat", 9, 5)
+
+
+# MCM
+
+@pytest.fixture
+def opt_mcm_scotus_q2():
+    return MCM(9, [[1,0,1,1,1,0,1,0,0], [0,1,0,0,0,1,0,1,1]])
\ No newline at end of file

From fbb4c529469a935c0e81bfda1414214a206b2847 Mon Sep 17 00:00:00 2001
From: AaronDC60 <aaron.de.clercq1@gmail.com>
Date: Sun, 13 Jul 2025 12:07:09 +0200
Subject: [PATCH 08/21] Test pymcmsearch class

---
 mcmpy/tests/conftest.py        |  18 ++--
 mcmpy/tests/test_mcm_search.py | 185 +++++++++++++++++++++++++++++++++
 2 files changed, 196 insertions(+), 7 deletions(-)
 create mode 100644 mcmpy/tests/test_mcm_search.py

diff --git a/mcmpy/tests/conftest.py b/mcmpy/tests/conftest.py
index a583bf9..2743275 100644
--- a/mcmpy/tests/conftest.py
+++ b/mcmpy/tests/conftest.py
@@ -1,27 +1,31 @@
 import pytest
-from mcmpy import Data, MCM
+from mcmpy import Data, MCM, MCMSearch
 
 # Data
 
-@pytest.fixture
+@pytest.fixture(scope="module")
 def scotus_data_q2():
     return Data("../../input/US_SupremeCourt_n9_N895.dat", 9, 2)
 
-@pytest.fixture
+@pytest.fixture(scope="module")
 def scotus_data_q3():
     return Data("../../input/US_SupremeCourt_n9_N895.dat", 9, 3)
 
-@pytest.fixture
+@pytest.fixture(scope="module")
 def scotus_data_q4():
     return Data("../../input/US_SupremeCourt_n9_N895.dat", 9, 4)
 
-@pytest.fixture
+@pytest.fixture(scope="module")
 def scotus_data_q5():
     return Data("../../input/US_SupremeCourt_n9_N895.dat", 9, 5)
 
 
 # MCM
 
-@pytest.fixture
+@pytest.fixture(scope="module")
 def opt_mcm_scotus_q2():
-    return MCM(9, [[1,0,1,1,1,0,1,0,0], [0,1,0,0,0,1,0,1,1]])
\ No newline at end of file
+    return MCM(9, [[1,0,1,1,1,0,1,0,0], [0,1,0,0,0,1,0,1,1]])
+
+@pytest.fixture(scope="module")
+def random_mcm():
+    return MCM(9, "random")
diff --git a/mcmpy/tests/test_mcm_search.py b/mcmpy/tests/test_mcm_search.py
new file mode 100644
index 0000000..8bd4267
--- /dev/null
+++ b/mcmpy/tests/test_mcm_search.py
@@ -0,0 +1,185 @@
+import pytest
+import numpy as np
+from mcmpy import Data, MCM, MCMSearch
+
+@pytest.fixture
+def mcm_searcher():
+    return MCMSearch()
+
+## Exhaustive search
+
+# Test against previous code
+# (https://github.com/clelidm/MinCompSpin_ExhaustiveSearch) by C. de Mulatier;
+# (https://github.com/AaronDC60/MinCompSpin_discrete) by Aaron De Clercq.
+
+class TestMCMSearch:
+
+    def setup_class(self):
+        scotus_data = Data("../../input/US_SupremeCourt_n9_N895.dat", 9, 2)
+        self.searcher = MCMSearch()
+
+        mcm_in_merging = MCM(9, [0,1,0,3,4,5,6,7,8])
+
+        self.opt_mcms = {}
+        self.mcms_out = {}
+        self.mcms_in = {}
+
+        self.opt_mcms["simulated_annealing"] = self.searcher.simulated_annealing(scotus_data)
+        self.mcms_out["simulated_annealing"] = self.searcher.get_mcm_out()
+        self.mcms_in["simulated_annealing"] = self.searcher.get_mcm_in()
+
+        self.opt_mcms["hierarchical_greedy_merging"] = self.searcher.hierarchical_greedy_merging(scotus_data, mcm_in_merging)
+        self.mcms_out["hierarchical_greedy_merging"] = self.searcher.get_mcm_out()
+        self.mcms_in["hierarchical_greedy_merging"] = self.searcher.get_mcm_in()
+
+        self.opt_mcms["hierarchical_greedy_divisive"] = self.searcher.hierarchical_greedy_divisive(scotus_data)
+        self.mcms_out["hierarchical_greedy_divisive"] = self.searcher.get_mcm_out()
+        self.mcms_in["hierarchical_greedy_divisive"] = self.searcher.get_mcm_in()
+
+        self.opt_mcms["exhaustive"] = self.searcher.exhaustive(scotus_data)
+        self.mcms_out["exhaustive"] = self.searcher.get_mcm_out()
+
+    def test_log_ev_trajectory_exhaustive(self):
+        # Exhaustive search stores all parititons
+        # Number of partitions for n variables is given by Bell number of n
+        assert len(self.searcher.log_evidence_trajectory) == 21_147
+
+    @pytest.mark.parametrize("method", ["exhaustive", "simulated_annealing",  "hierarchical_greedy_merging", "hierarchical_greedy_divisive"])
+    def test_opt_mcms_structure(self, method):
+        assert self.opt_mcms[method].n == 9
+        assert self.opt_mcms[method].n_icc == 2
+        assert self.opt_mcms[method].rank == 9
+        assert self.opt_mcms[method].is_optimized is True
+
+        if method != "simulated_annealing":
+            # Simulated annealing inverts the order sometimes
+            assert np.all(self.opt_mcms[method].array == [[1,0,1,1,1,0,1,0,0], [0,1,0,0,0,1,0,1,1]])
+            assert np.all(self.opt_mcms[method].array_gray_code == [0,1,0,0,0,1,0,1,1])
+        else:
+            assert [1,0,1,1,1,0,1,0,0] in self.opt_mcms[method].array
+            assert [0,1,0,0,0,1,0,1,1] in self.opt_mcms[method].array
+
+    @pytest.mark.parametrize("method", ["exhaustive", "simulated_annealing",  "hierarchical_greedy_merging", "hierarchical_greedy_divisive"])
+    def test_mcms_out_structure(self, method):
+        assert self.mcms_out[method].n == 9
+        assert self.mcms_out[method].n_icc == 2
+        assert self.mcms_out[method].rank == 9
+        assert self.mcms_out[method].is_optimized is True
+
+        if method != "simulated_annealing":
+            # Simulated annealing inverts the order sometimes
+            assert np.all(self.mcms_out[method].array == [[1,0,1,1,1,0,1,0,0], [0,1,0,0,0,1,0,1,1]])
+            assert np.all(self.mcms_out[method].array_gray_code == [0,1,0,0,0,1,0,1,1])
+        else:
+            assert [1,0,1,1,1,0,1,0,0] in self.mcms_out[method].array
+            assert [0,1,0,0,0,1,0,1,1] in self.mcms_out[method].array
+
+    @pytest.mark.parametrize("method", ["exhaustive", "simulated_annealing",  "hierarchical_greedy_merging", "hierarchical_greedy_divisive"])
+    def test_opt_mcm_log_ev(self, method):
+        log_ev_per_icc = self.opt_mcms[method].get_best_log_evidence_icc()
+        log_ev = self.opt_mcms[method].get_best_log_evidence()
+
+        assert np.isclose(log_ev, -3300.4)
+        assert np.allclose(log_ev_per_icc, [-1754.41, -1545.98]) or np.allclose(log_ev_per_icc, [-1545.98, -1754.41])
+       
+    @pytest.mark.parametrize("method", ["exhaustive", "simulated_annealing",  "hierarchical_greedy_merging", "hierarchical_greedy_divisive"])
+    def test_opt_mcm_log_ev(self, method):
+        log_ev_per_icc = self.opt_mcms[method].get_best_log_evidence_icc()
+        log_ev = self.opt_mcms[method].get_best_log_evidence()
+
+        assert np.isclose(log_ev, -3300.4)
+        assert np.allclose(log_ev_per_icc, [-1754.41, -1545.98]) or np.allclose(log_ev_per_icc, [-1545.98, -1754.41])
+
+    def test_mcm_in_invalid(self):
+        # Try before any search has been run
+        search = MCMSearch()
+        with pytest.raises(RuntimeError, match="No search has been ran yet."):
+            search.get_mcm_in()
+
+    def test_mcm_in_exhaustive(self):
+        with pytest.raises(RuntimeError, match="Exhaustive search does not have an initial MCM."):
+            self.searcher.get_mcm_in()
+
+    def test_mcm_in_greedy_divisive(self):
+        # Starts by default from the complete model
+        assert self.mcms_in["hierarchical_greedy_divisive"].n == 9
+        assert self.mcms_in["hierarchical_greedy_divisive"].n_icc == 1
+        assert self.mcms_in["hierarchical_greedy_divisive"].rank == 9
+        assert self.mcms_in["hierarchical_greedy_divisive"].is_optimized is False
+
+    def test_mcm_in_greedy_merging(self):
+        # mcm_in used is defined in the setup
+        assert self.mcms_in["hierarchical_greedy_merging"].n == 9
+        assert self.mcms_in["hierarchical_greedy_merging"].n_icc == 8
+        assert self.mcms_in["hierarchical_greedy_merging"].rank == 9
+        assert self.mcms_in["hierarchical_greedy_merging"].is_optimized is False
+
+        assert np.all(self.mcms_in["hierarchical_greedy_merging"].array_gray_code == [0,1,0,2,3,4,5,6,7])
+
+    def test_mcm_in_sim_annealing(self):
+        # Starts by default from the independent model
+        assert self.mcms_in["simulated_annealing"].n == 9
+        assert self.mcms_in["simulated_annealing"].n_icc == 9
+        assert self.mcms_in["simulated_annealing"].rank == 9
+        assert self.mcms_in["simulated_annealing"].is_optimized is False
+
+# Check the optimal structure and log-evidence for the SCOTUS dataset with different values of q
+
+def test_scotus_q_3(mcm_searcher, scotus_data_q3):
+    opt_mcm = mcm_searcher.exhaustive(scotus_data_q3)
+    log_ev = opt_mcm.get_best_log_evidence()
+
+    # MCM properties
+    assert opt_mcm.n == 9
+    assert opt_mcm.n_icc == 2
+    assert opt_mcm.rank == 9
+    assert opt_mcm.is_optimized is True
+
+    # MCM structure
+    assert np.all(opt_mcm.array == [[1,0,1,1,1,0,1,0,0], [0,1,0,0,0,1,0,1,1]])
+    assert np.all(opt_mcm.array_gray_code == [0,1,0,0,0,1,0,1,1])
+
+    # MCM log-evidence
+    assert np.allclose(log_ev, -3714.65)
+
+def test_scotus_q_4(mcm_searcher, scotus_data_q4):
+    opt_mcm = mcm_searcher.exhaustive(scotus_data_q4)
+    log_ev = opt_mcm.get_best_log_evidence()
+
+    # MCM properties
+    assert opt_mcm.n == 9
+    assert opt_mcm.n_icc == 3
+    assert opt_mcm.rank == 9
+    assert opt_mcm.is_optimized is True
+
+    # MCM structure
+    assert np.all(opt_mcm.array == [[1,0,1,0,1,0,0,0,0], 
+                                  [0,1,0,0,0,1,0,1,1], 
+                                  [0,0,0,1,0,0,1,0,0]])
+    assert np.all(opt_mcm.array_gray_code == [0,1,0,2,0,1,2,1,1])
+
+    # MCM log-evidence
+    assert np.allclose(log_ev, -4033.14)
+
+def test_scotus_q_5(mcm_searcher, scotus_data_q5):
+    opt_mcm = mcm_searcher.exhaustive(scotus_data_q5)
+    log_ev = opt_mcm.get_best_log_evidence()
+
+    # MCM properties
+    assert opt_mcm.n == 9
+    assert opt_mcm.n_icc == 4
+    assert opt_mcm.rank == 9
+    assert opt_mcm.is_optimized is True
+
+    # MCM structure
+    assert np.all(opt_mcm.array == [[1,0,1,0,1,0,0,0,0], 
+                                  [0,1,0,0,0,0,0,0,0],
+                                  [0,0,0,1,0,0,1,0,0],
+                                  [0,0,0,0,0,1,0,1,1]])
+    assert np.all(opt_mcm.array_gray_code == [0,1,0,2,0,3,2,3,3])
+
+    # MCM log-evidence
+    assert np.allclose(log_ev, -4261.89)
+
+
+

From f716307019637f75d36039060cc8b1baa94ed18d Mon Sep 17 00:00:00 2001
From: AaronDC60 <aaron.de.clercq1@gmail.com>
Date: Tue, 21 Apr 2026 18:59:19 +0200
Subject: [PATCH 09/21] Fix basis reset

---
 mcmpy/tests/conftest.py        | 14 +++++---------
 mcmpy/tests/test_mcm_search.py |  3 ---
 src/basis/basis.cpp            |  6 +++++-
 tests/basis/basis.cpp          | 15 ++++++++++-----
 4 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/mcmpy/tests/conftest.py b/mcmpy/tests/conftest.py
index 2743275..d77727d 100644
--- a/mcmpy/tests/conftest.py
+++ b/mcmpy/tests/conftest.py
@@ -3,29 +3,25 @@
 
 # Data
 
-@pytest.fixture(scope="module")
+@pytest.fixture
 def scotus_data_q2():
     return Data("../../input/US_SupremeCourt_n9_N895.dat", 9, 2)
 
-@pytest.fixture(scope="module")
+@pytest.fixture
 def scotus_data_q3():
     return Data("../../input/US_SupremeCourt_n9_N895.dat", 9, 3)
 
-@pytest.fixture(scope="module")
+@pytest.fixture
 def scotus_data_q4():
     return Data("../../input/US_SupremeCourt_n9_N895.dat", 9, 4)
 
-@pytest.fixture(scope="module")
+@pytest.fixture
 def scotus_data_q5():
     return Data("../../input/US_SupremeCourt_n9_N895.dat", 9, 5)
 
 
 # MCM
 
-@pytest.fixture(scope="module")
+@pytest.fixture
 def opt_mcm_scotus_q2():
     return MCM(9, [[1,0,1,1,1,0,1,0,0], [0,1,0,0,0,1,0,1,1]])
-
-@pytest.fixture(scope="module")
-def random_mcm():
-    return MCM(9, "random")
diff --git a/mcmpy/tests/test_mcm_search.py b/mcmpy/tests/test_mcm_search.py
index 8bd4267..fadbca8 100644
--- a/mcmpy/tests/test_mcm_search.py
+++ b/mcmpy/tests/test_mcm_search.py
@@ -180,6 +180,3 @@ def test_scotus_q_5(mcm_searcher, scotus_data_q5):
 
     # MCM log-evidence
     assert np.allclose(log_ev, -4261.89)
-
-
-
diff --git a/src/basis/basis.cpp b/src/basis/basis.cpp
index 5083d74..6e8b937 100644
--- a/src/basis/basis.cpp
+++ b/src/basis/basis.cpp
@@ -156,6 +156,10 @@ void Basis::set_basis_default() {
     // Set the default base: s1, s2, ..., sn
     __uint128_t element = 1;
     for (int i = 0; i < this->n; i++){
+        // Reset operator
+        std::fill(this->basis_ops[i].begin(), this->basis_ops[i].end(), 0);
+        std::fill(this->basis_ops_matrix[i].begin(), this->basis_ops_matrix[i].end(), 0);
+
         // log2(q) 128 bit integer representation
         this->basis_ops[i][0] = element;
         element <<= 1;
@@ -185,7 +189,7 @@ void Basis::print_details() {
     std::string op_string;
     for (std::vector<__uint128_t> op : this->basis_ops){
         op_string = convert_128bit_vec_to_string(op, this->n);
-        std::cout << "Operator " << i << ": \t" << op_string << "\t Indices: ";
+        std::cout << "Operator " << i << ": \t" << op_string << "\t Indices of variables involved: ";
         // Find the indices of the variables present in the spin operator
         for (size_t j = 0; j < this->n; ++j){
             if (op_string[j] != '0') {std::cout << j << "\t";}
diff --git a/tests/basis/basis.cpp b/tests/basis/basis.cpp
index 9bf0303..4c5c4c8 100644
--- a/tests/basis/basis.cpp
+++ b/tests/basis/basis.cpp
@@ -49,13 +49,10 @@ TEST(basis, init_n){
     Basis basis(n, q);
 
     exp_spin_ops = {{1,0,0}, {2,0,0}, {4,0,0}};
-    spin_ops = basis.get_basis();
-
     exp_spin_ops_array = {{1,0,0}, {0,1,0}, {0,0,1}};
-    spin_ops_array = basis.get_basis_matrix();
 
-    EXPECT_EQ(exp_spin_ops, spin_ops);
-    EXPECT_EQ(exp_spin_ops_array, spin_ops_array);
+    EXPECT_EQ(exp_spin_ops, basis.get_basis());
+    EXPECT_EQ(exp_spin_ops_array, basis.get_basis_matrix());
     EXPECT_EQ(basis.get_n(), n);
     EXPECT_EQ(basis.get_q(), q);
     EXPECT_EQ(basis.get_n_ints(), 3);
@@ -186,6 +183,14 @@ TEST(basis, init_spin_ops){
     catch(std::invalid_argument const & err) {
         EXPECT_EQ(err.what(), std::string("Given basis is not linearly independent."));
     }
+
+    // Reset to default basis
+    exp_spin_ops = {{1,0}, {2,0}, {4,0}, {8,0}};
+    exp_matrix = {{1,0,0,0}, {0,1,0,0}, {0,0,1,0}, {0,0,0,1}};
+    basis.set_basis_default();
+
+    EXPECT_EQ(basis.get_basis(), exp_spin_ops);
+    EXPECT_EQ(basis.get_basis_matrix(), exp_matrix);
 }
 
 TEST(basis, init_file){

From b40047b872bb3a45c85d796c7b53a428338bfff3 Mon Sep 17 00:00:00 2001
From: AaronDC60 <aaron.de.clercq1@gmail.com>
Date: Tue, 21 Apr 2026 20:55:20 +0200
Subject: [PATCH 10/21] Update mcm tests

---
 mcmpy/tests/test_mcm.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mcmpy/tests/test_mcm.py b/mcmpy/tests/test_mcm.py
index d09371a..8fa33bd 100644
--- a/mcmpy/tests/test_mcm.py
+++ b/mcmpy/tests/test_mcm.py
@@ -115,7 +115,6 @@ def test_data_object(scotus_data_q2, opt_mcm_scotus_q2):
 
 def test_data_file(scotus_data_q2, opt_mcm_scotus_q2, tmp_path):
     filepath = str(tmp_path / "test.dat")
-    print(type(filepath))
     opt_mcm_scotus_q2.generate_data_file(10_000, scotus_data_q2, filepath)
     # Create data object from file
     data = Data(filepath, scotus_data_q2.n, scotus_data_q2.q)
@@ -124,4 +123,3 @@ def test_data_file(scotus_data_q2, opt_mcm_scotus_q2, tmp_path):
     assert data.N_synthetic == 10_000
     assert data.q == scotus_data_q2.q
     assert data.n == scotus_data_q2.n
-

From 914a935d449c05eba4e23c5abea909ae504b6b95 Mon Sep 17 00:00:00 2001
From: AaronDC60 <aaron.de.clercq1@gmail.com>
Date: Tue, 21 Apr 2026 20:55:54 +0200
Subject: [PATCH 11/21] Update mcm search tests

---
 mcmpy/tests/test_mcm_search.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mcmpy/tests/test_mcm_search.py b/mcmpy/tests/test_mcm_search.py
index fadbca8..0e9ef7d 100644
--- a/mcmpy/tests/test_mcm_search.py
+++ b/mcmpy/tests/test_mcm_search.py
@@ -18,6 +18,7 @@ def setup_class(self):
         scotus_data = Data("../../input/US_SupremeCourt_n9_N895.dat", 9, 2)
         self.searcher = MCMSearch()
 
+        # MCM used as starting partition in the hierarchical greedy merging algorithm
         mcm_in_merging = MCM(9, [0,1,0,3,4,5,6,7,8])
 
         self.opt_mcms = {}
@@ -108,7 +109,7 @@ def test_mcm_in_greedy_divisive(self):
         assert self.mcms_in["hierarchical_greedy_divisive"].is_optimized is False
 
     def test_mcm_in_greedy_merging(self):
-        # mcm_in used is defined in the setup
+        # mcm_in used is defined in the setup (mcm_in_merging)
         assert self.mcms_in["hierarchical_greedy_merging"].n == 9
         assert self.mcms_in["hierarchical_greedy_merging"].n_icc == 8
         assert self.mcms_in["hierarchical_greedy_merging"].rank == 9

From 60cacdee94068466e2fceaeb755f3be9f5304566 Mon Sep 17 00:00:00 2001
From: AaronDC60 <aaron.de.clercq1@gmail.com>
Date: Tue, 21 Apr 2026 20:56:49 +0200
Subject: [PATCH 12/21] Add tests for basis class

---
 mcmpy/tests/test_basis.py | 52 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 mcmpy/tests/test_basis.py

diff --git a/mcmpy/tests/test_basis.py b/mcmpy/tests/test_basis.py
new file mode 100644
index 0000000..6d1afcd
--- /dev/null
+++ b/mcmpy/tests/test_basis.py
@@ -0,0 +1,52 @@
+import pytest
+import numpy as np
+from mcmpy import Basis
+
+
+def test_init_default():
+    n = 3
+    q = 5
+    basis = Basis(n,q)
+
+    # Default basis should be the n one-body spin operators
+    assert np.all(basis.matrix == np.eye(n))
+    assert(basis.n == n)
+    assert(basis.q == q)
+
+def test_init_array():
+    n = 4
+    q = 3
+    array = np.array([[1,2,0,0],[2,2,0,0],[0,0,1,0],[0,0,2,1]])
+    basis = Basis(n,q, array)
+
+    # Check the matrix representation
+    # Should be equal to the transpose of the input given that columns represent operators
+    assert np.all(basis.matrix == array.T)
+    assert(basis.n == n)
+    assert(basis.q == q)
+
+    # Check reset to default
+    basis.set_default()
+    assert np.all(basis.matrix == np.eye(n))
+
+def test_init_file():
+    n = 4
+    q = 3
+    basis = Basis(n,q, "../../tests/basis2.dat")
+
+    # Check the matrix representation
+    assert np.all(basis.matrix == [[1,2,0,0],[2,2,0,0],[0,0,1,2],[0,0,0,1]])
+    assert(basis.n == n)
+    assert(basis.q == q)
+
+def test_set_from_file():
+    n = 4
+    q = 3
+
+    basis = Basis(n,q)
+    basis.set_from_file("../../tests/basis2.dat")
+
+    # Check the matrix representation
+    assert np.all(basis.matrix == [[1,2,0,0],[2,2,0,0],[0,0,1,2],[0,0,0,1]])
+    assert(basis.n == n)
+    assert(basis.q == q)

From 337ce1797332485e91fa1cba75be71c0a7f108b1 Mon Sep 17 00:00:00 2001
From: AaronDC60 <aaron.de.clercq1@gmail.com>
Date: Tue, 21 Apr 2026 20:57:08 +0200
Subject: [PATCH 13/21] Add tests for basis search class

---
 mcmpy/tests/test_basis_search.py | 92 ++++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 mcmpy/tests/test_basis_search.py

diff --git a/mcmpy/tests/test_basis_search.py b/mcmpy/tests/test_basis_search.py
new file mode 100644
index 0000000..62ead4e
--- /dev/null
+++ b/mcmpy/tests/test_basis_search.py
@@ -0,0 +1,92 @@
+import pytest
+import numpy as np
+from mcmpy import Data, MCM, MCMSearch, Basis, BasisSearch
+
+# Best basis search of supreme court
+
+# Test against previous code
+# (https://github.com/clelidm/MinCompSpin_ExhaustiveSearch) by C. de Mulatier;
+# (https://github.com/AaronDC60/MinCompSpin_discrete) by Aaron De Clercq.
+
+class TestBasisSearch:
+
+    def setup_class(self):
+        self.searcher = BasisSearch()
+        self.mcmsearcher = MCMSearch()
+
+    def test_exhaustive_search(self):
+        dataset = Data("../../input/US_SupremeCourt_n9_N895.dat", 9, 2)
+        opt_basis_1 = self.searcher.exhaustive(dataset)
+        opt_basis_2 = self.searcher.get_basis()
+        assert np.all(opt_basis_1.matrix == [[0,0,0,1,1,1,0,0,0],
+                                           [0,0,0,0,0,0,1,0,0],
+                                           [0,0,0,0,0,1,0,1,0],
+                                           [1,0,0,0,0,0,0,0,0],
+                                           [0,0,0,1,0,0,0,0,0],
+                                           [0,1,0,0,0,0,0,0,0],
+                                           [1,0,0,0,1,0,0,0,1],
+                                           [0,1,1,0,0,0,1,0,0],
+                                           [0,0,1,0,0,0,0,1,0]])
+
+        assert np.all(opt_basis_1.matrix == opt_basis_2.matrix)
+        assert opt_basis_1.n == 9
+        assert opt_basis_2.n == 9
+
+        assert opt_basis_1.q == 2
+        assert opt_basis_2.q == 2
+
+    def get_best_mcm(self, q):
+        dataset = Data("../../input/US_SupremeCourt_n9_N895.dat", 9, q)
+        # Calculate the best basis
+        opt_basis = self.searcher.exhaustive(dataset)
+        # Transform data
+        transformed_data = opt_basis.gauge_transform_data(dataset)
+        # Get best mcm in the new basis
+        best_mcm = self.mcmsearcher.exhaustive(transformed_data)
+
+        return best_mcm
+    
+    def test_mcm_opt_basis_q2(self):
+        best_mcm = self.get_best_mcm(2)
+
+        # Check the mcm and log-evidence
+        assert np.all(best_mcm.array == [[1,0,0,0,0,0,0,0,0],
+                                         [0,1,1,0,0,0,1,0,0],
+                                         [0,0,0,1,1,1,0,1,1]])
+
+        assert np.isclose(best_mcm.get_best_log_evidence(), -3154.42)
+
+    def test_mcm_opt_basis_q3(self):
+        best_mcm = self.get_best_mcm(3)
+
+        # Check the mcm and log-evidence
+        assert np.all(best_mcm.array == [[1,0,0,0,1,0,0,1,0],
+                                         [0,1,0,1,0,0,1,0,0],
+                                         [0,0,1,0,0,1,0,0,1]])
+
+        assert np.isclose(best_mcm.get_best_log_evidence(), -3587.68)
+
+    def test_mcm_opt_basis_q4(self):
+        best_mcm = self.get_best_mcm(4)
+
+        # Check the mcm and log-evidence
+        assert np.all(best_mcm.array == [[1,0,0,0,0,0,0,0,0],
+                                         [0,1,0,0,0,0,1,0,0],
+                                         [0,0,1,0,0,1,0,0,0],
+                                         [0,0,0,1,0,0,0,0,1],
+                                         [0,0,0,0,1,0,0,1,0]])
+
+        assert np.isclose(best_mcm.get_best_log_evidence(), -3763.43)
+
+    def test_mcm_opt_basis_q5(self):
+        best_mcm = self.get_best_mcm(5)
+
+        # Check the mcm and log-evidence
+        assert np.all(best_mcm.array == [[1,0,0,0,0,0,0,0,0],
+                                         [0,1,0,0,0,0,0,0,0],
+                                         [0,0,1,0,0,1,0,0,0],
+                                         [0,0,0,1,0,0,1,0,0],
+                                         [0,0,0,0,1,0,0,1,0],
+                                         [0,0,0,0,0,0,0,0,1]])
+
+        assert np.isclose(best_mcm.get_best_log_evidence(), -3848.06)

From 66441cdb17761e03eeb95f8f0ee9ef488fdc6c00 Mon Sep 17 00:00:00 2001
From: AaronDC60 <aaron.de.clercq1@gmail.com>
Date: Tue, 21 Apr 2026 21:00:17 +0200
Subject: [PATCH 14/21] Add python tests to the ci

---
 .github/workflows/ci.yml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e65d426..198675a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -29,4 +29,9 @@ jobs:
       - name : Test project
         run: |
           cd build
-          ctest --output-on-failure
\ No newline at end of file
+          ctest --output-on-failure
+
+     - name : Test python module
+       run: |
+          cd mcmpy/tests
+          pytest

From 3d9c8ce5dfb101a70e3f346ac4bf9707a67e8680 Mon Sep 17 00:00:00 2001
From: AaronDC60 <aaron.de.clercq1@gmail.com>
Date: Tue, 21 Apr 2026 21:04:46 +0200
Subject: [PATCH 15/21] Fix ci

---
 .github/workflows/ci.yml | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 198675a..588d8df 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3,6 +3,9 @@ on:
   push:
     branches:
       - main
+  pull_request:
+    branches:
+      - main
   
 jobs:
   build:
@@ -30,8 +33,8 @@ jobs:
         run: |
           cd build
           ctest --output-on-failure
-
-     - name : Test python module
-       run: |
+      
+      - name : Test python module
+        run: |
           cd mcmpy/tests
-          pytest
+          pytest
\ No newline at end of file

From ecd1186ef794099dc754c28217792005f12def2c Mon Sep 17 00:00:00 2001
From: Aaron De Clercq <60502264+AaronDC60@users.noreply.github.com>
Date: Tue, 21 Apr 2026 21:14:07 +0200
Subject: [PATCH 16/21] Update ci.yml

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 811d746..8f7aa3a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -23,7 +23,7 @@ jobs:
         if: runner.os == 'Linux'
         run: |  
           sudo apt-get update
-          sudo apt-get install -y g++ cmake python3
+          sudo apt-get install -y g++ cmake python3 pytest
       
       - name : Build project
         run: |

From b4b23a47c75898d847d9498456d1077be13f4d83 Mon Sep 17 00:00:00 2001
From: Aaron De Clercq <60502264+AaronDC60@users.noreply.github.com>
Date: Tue, 21 Apr 2026 21:19:37 +0200
Subject: [PATCH 17/21] Update ci.yml

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 8f7aa3a..e1b2c8b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -23,7 +23,7 @@ jobs:
         if: runner.os == 'Linux'
         run: |  
           sudo apt-get update
-          sudo apt-get install -y g++ cmake python3 pytest
+          sudo apt-get install -y g++ cmake python3 python3-pytest
       
       - name : Build project
         run: |

From d4bba6bac3505e90c58780c719ba20fdd32e3076 Mon Sep 17 00:00:00 2001
From: Aaron De Clercq <60502264+AaronDC60@users.noreply.github.com>
Date: Tue, 21 Apr 2026 21:23:39 +0200
Subject: [PATCH 18/21] Update ci.yml

---
 .github/workflows/ci.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e1b2c8b..b0ad909 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -24,6 +24,11 @@ jobs:
         run: |  
           sudo apt-get update
           sudo apt-get install -y g++ cmake python3 python3-pytest
+          
+      - name: Install dependencies (macOS)
+        if: runner.os == 'macOS'
+        run: |
+          python3 -m pip install pytest
       
       - name : Build project
         run: |

From 1b96a06e7ecb29af0396fe3ca8665a2eb917649f Mon Sep 17 00:00:00 2001
From: Aaron De Clercq <60502264+AaronDC60@users.noreply.github.com>
Date: Tue, 21 Apr 2026 21:42:22 +0200
Subject: [PATCH 19/21] Update ci.yml

---
 .github/workflows/ci.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b0ad909..58691d5 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -19,6 +19,11 @@ jobs:
         with:
           submodules: recursive
 
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
       - name: Install dependencies (Linux)
         if: runner.os == 'Linux'
         run: |  
@@ -28,6 +33,7 @@ jobs:
       - name: Install dependencies (macOS)
         if: runner.os == 'macOS'
         run: |
+          python3 -m pip install --upgrade pip
           python3 -m pip install pytest
       
       - name : Build project

From ef0ab8707d9e421d3407db8e57a0e3c436f33ea1 Mon Sep 17 00:00:00 2001
From: Aaron De Clercq <60502264+AaronDC60@users.noreply.github.com>
Date: Tue, 21 Apr 2026 21:46:29 +0200
Subject: [PATCH 20/21] Update ci.yml

---
 .github/workflows/ci.yml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 58691d5..ef51b4e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -30,11 +30,10 @@ jobs:
           sudo apt-get update
           sudo apt-get install -y g++ cmake python3 python3-pytest
           
-      - name: Install dependencies (macOS)
-        if: runner.os == 'macOS'
+      - name: Install dependencies
         run: |
           python3 -m pip install --upgrade pip
-          python3 -m pip install pytest
+          python3 -m pip install pytest numpy
       
       - name : Build project
         run: |

From df36255a73097266dd3355997ec9c1795495dd81 Mon Sep 17 00:00:00 2001
From: Aaron De Clercq <60502264+AaronDC60@users.noreply.github.com>
Date: Tue, 21 Apr 2026 21:50:49 +0200
Subject: [PATCH 21/21] Update ci.yml

---
 .github/workflows/ci.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ef51b4e..0cb2f88 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -28,7 +28,6 @@ jobs:
         if: runner.os == 'Linux'
         run: |  
           sudo apt-get update
-          sudo apt-get install -y g++ cmake python3 python3-pytest
           
       - name: Install dependencies
         run: |