
Commit 1768e8d

cleanup, tests for model comparison
1 parent 6de4e76 commit 1768e8d

5 files changed

Lines changed: 173 additions & 97 deletions


codes/benchmark/bench_fcts.py

Lines changed: 0 additions & 31 deletions
@@ -1019,37 +1019,6 @@ def load_losses(model_identifier: str):
     )
 
 
-# def compare_MAE(metrics: dict, config: dict) -> None:
-#     """
-#     Compare the MAE of different surrogate models over the course of training.
-
-#     Args:
-#         metrics (dict): dictionary containing the benchmark metrics for each surrogate model.
-#         config (dict): Configuration dictionary.
-
-#     Returns:
-#         None
-#     """
-#     MAE = []
-#     labels = []
-#     train_durations = []
-#     device = config["devices"]
-#     device = device[0] if isinstance(device, list) else device
-
-#     for surr_name, _ in metrics.items():
-#         training_id = config["training_id"]
-#         surrogate_class = get_surrogate(surr_name)
-#         n_timesteps = metrics[surr_name]["timesteps"].shape[0]
-#         n_quantities = metrics[surr_name]["accuracy"]["absolute_errors"].shape[2]
-#         model_config = get_model_config(surr_name, config)
-#         model = surrogate_class(device, n_quantities, n_timesteps, model_config)
-#         model_identifier = f"{surr_name.lower()}_main"
-#         model.load(training_id, surr_name, model_identifier=model_identifier)
-#         MAE.append(model.MAE)
-#         labels.append(surr_name)
-#         train_durations.append(model.train_duration)
-
-
 def compare_relative_errors(metrics: dict[str, dict], config: dict) -> None:
     """
     Compare the relative errors over time for different surrogate models.

codes/tune/__init__.py

Lines changed: 14 additions & 1 deletion
@@ -11,7 +11,15 @@
     maybe_set_runtime_threshold,
     training_run,
 )
-from .postgres_fcts import _make_db_url, initialize_optuna_database
+from .postgres_fcts import (
+    _make_db_url,
+    initialize_optuna_database,
+    _check_postgres_running_local,
+    _start_postgres_server_local,
+    _check_remote_reachable,
+    _initialize_postgres_local,
+    _initialize_postgres_remote,
+)
 from .tune_utils import (
     build_study_names,
     copy_config,
@@ -37,4 +45,9 @@
     "yes_no",
     "_make_db_url",
     "initialize_optuna_database",
+    "_check_postgres_running_local",
+    "_start_postgres_server_local",
+    "_check_remote_reachable",
+    "_initialize_postgres_local",
+    "_initialize_postgres_remote",
 ]
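With the widened re-export, the Postgres helpers can be imported from codes.tune directly rather than from the private codes.tune.postgres_fcts module. A minimal sketch of what this enables, for illustration only (assuming the package layout shown above):

    # illustrative only: these names are re-exported per the diff above
    from codes.tune import (
        _check_postgres_running_local,
        _initialize_postgres_local,
        _make_db_url,
    )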

codes/utils/data_utils.py

Lines changed: 0 additions & 64 deletions
@@ -223,70 +223,6 @@ def check_and_load_data(
     )
 
 
-def normalize_data_old(
-    train_data: np.ndarray,
-    test_data: np.ndarray | None = None,
-    val_data: np.ndarray | None = None,
-    mode: str = "standardise",
-) -> tuple:
-    """
-    Normalize the data based on the training data statistics.
-
-    Args:
-        train_data (np.ndarray): Training data array.
-        test_data (np.ndarray, optional): Test data array.
-        val_data (np.ndarray, optional): Validation data array.
-        mode (str): Normalization mode, either "minmax" or "standardise".
-
-    Returns:
-        tuple: Normalized training data, test data, and validation data.
-    """
-    if mode not in ["minmax", "standardise"]:
-        raise ValueError("Mode must be either 'minmax' or 'standardise'")
-
-    if mode == "minmax":
-        # Compute min and max on the training data
-        data_min = np.min(train_data)
-        data_max = np.max(train_data)
-
-        data_info = {"min": float(data_min), "max": float(data_max), "mode": mode}
-
-        # Normalize the training data
-        train_data_norm = 2 * (train_data - data_min) / (data_max - data_min) - 1
-
-        if test_data is not None:
-            test_data_norm = 2 * (test_data - data_min) / (data_max - data_min) - 1
-        else:
-            test_data_norm = None
-
-        if val_data is not None:
-            val_data_norm = 2 * (val_data - data_min) / (data_max - data_min) - 1
-        else:
-            val_data_norm = None
-
-    elif mode == "standardise":
-        # Compute mean and std on the training data
-        mean = np.mean(train_data)
-        std = np.std(train_data)
-
-        data_info = {"mean": float(mean), "std": float(std), "mode": mode}
-
-        # Standardize the training data
-        train_data_norm = (train_data - mean) / std
-
-        if test_data is not None:
-            test_data_norm = (test_data - mean) / std
-        else:
-            test_data_norm = None
-
-        if val_data is not None:
-            val_data_norm = (val_data - mean) / std
-        else:
-            val_data_norm = None
-
-    return data_info, train_data_norm, test_data_norm, val_data_norm
-
-
 def normalize_data(
     train_data: np.ndarray,
     test_data: np.ndarray | None = None,

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@ line_length = 88
 
 [tool.coverage.run]
 source = ["codes"]
-omit = ["*/tests/*", "*/test_*", "*/bench_plots.py", "*/__init__.py"]
+omit = ["*/tests/*", "*/test_*", "*/bench_plots.py", "*/__init__.py", "*/evaluate_study.py", "*/evaluate_tuning.py"]
 
 [tool.coverage.report]
 exclude_lines = [
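The two added globs exclude the tuning evaluation scripts (evaluate_study.py, evaluate_tuning.py) from coverage measurement alongside the existing patterns. Under coverage.py's standard handling of omit in [tool.coverage.run], a run such as `coverage run -m pytest` followed by `coverage report` would not count lines from those files toward the totals.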

test/test_model_comparison.py

Lines changed: 158 additions & 0 deletions
@@ -0,0 +1,158 @@
+# test/test_compare_models.py
+import pytest
+from codes.benchmark import bench_fcts
+
+
+@pytest.fixture(autouse=True)
+def record_calls(monkeypatch):
+    """
+    Stub out all compare_* and plot_* functions so that calls
+    just record their names into a shared list, instead of doing any real work.
+    """
+    calls = []
+    names = [
+        "compare_relative_errors",
+        "compare_main_losses",
+        "compare_dynamic_accuracy",
+        "compare_inference_time",
+        "compare_interpolation",
+        "compare_extrapolation",
+        "compare_sparse",
+        "plot_all_generalization_errors",
+        "compare_batchsize",
+        "compare_UQ",
+        "tabular_comparison",
+    ]
+    for name in names:
+        monkeypatch.setattr(
+            bench_fcts,
+            name,
+            lambda *args, _n=name, **kw: calls.append(_n),
+        )
+    return calls
+
+
+def make_dummy_metrics():
+    """
+    Build a minimal metrics dict that contains the keys
+    your compare_models dispatcher will look up.
+    Values themselves are never inspected by our stubs.
+    """
+    return {
+        "M1": {
+            "accuracy": {"relative_errors": None},
+            "timesteps": None,
+            "n_params": 0,
+            # for each enabled branch add a dummy sub-dict:
+            "timing": {
+                "mean_inference_time_per_run": 1.0,
+                "std_inference_time_per_run": 0.1,
+            },
+            "gradients": {
+                "gradients": None,
+                "avg_correlation": 0.0,
+                "max_gradient": 0,
+                "max_error": 0,
+                "max_counts": 0,
+            },
+            "interpolation": {"intervals": [1], "model_errors": [0]},
+            "extrapolation": {"cutoffs": [1], "model_errors": [0]},
+            "sparse": {"n_train_samples": [10], "model_errors": [0]},
+            "batch_size": {"batch_sizes": [32], "model_errors": [0]},
+            "UQ": {
+                "pred_uncertainty": None,
+                "absolute_errors": None,
+                "relative_errors": None,
+                "axis_max": None,
+                "max_counts": None,
+                "correlation_metrics": None,
+                "weighted_diff": None,
+            },
+        }
+    }
+
+
+@pytest.mark.parametrize(
+    "flags, expected_sequence",
+    [
+        # all branches on
+        (
+            {
+                "losses": True,
+                "gradients": True,
+                "timing": True,
+                "interpolation": {"enabled": True},
+                "extrapolation": {"enabled": True},
+                "sparse": {"enabled": True},
+                "batch_scaling": {"enabled": True},
+                "uncertainty": {"enabled": True},
+            },
+            [
+                "compare_relative_errors",
+                "compare_main_losses",
+                "compare_dynamic_accuracy",
+                "compare_inference_time",
+                "compare_interpolation",
+                "compare_extrapolation",
+                "compare_sparse",
+                "plot_all_generalization_errors",  # only if int+ext+sparse all enabled
+                "compare_batchsize",
+                "compare_UQ",
+                "tabular_comparison",
+            ],
+        ),
+        # only the mandatory relative-errors + table
+        (
+            {
+                "losses": False,
+                "gradients": False,
+                "timing": False,
+                "interpolation": {"enabled": False},
+                "extrapolation": {"enabled": False},
+                "sparse": {"enabled": False},
+                "batch_scaling": {"enabled": False},
+                "uncertainty": {"enabled": False},
+            },
+            [
+                "compare_relative_errors",
+                "tabular_comparison",
+            ],
+        ),
+        # losses but nothing else
+        (
+            {
+                "losses": True,
+                "gradients": False,
+                "timing": False,
+                "interpolation": {"enabled": False},
+                "extrapolation": {"enabled": False},
+                "sparse": {"enabled": False},
+                "batch_scaling": {"enabled": False},
+                "uncertainty": {"enabled": False},
+            },
+            [
+                "compare_relative_errors",
+                "compare_main_losses",
+                "tabular_comparison",
+            ],
+        ),
+    ],
+)
+def test_compare_models_branching(record_calls, flags, expected_sequence):
+    cfg = {
+        "training_id": "test",
+        "devices": ["cpu"],  # for compare_main_losses
+        "losses": flags["losses"],
+        "gradients": flags["gradients"],
+        "timing": flags["timing"],
+        "interpolation": flags["interpolation"],
+        "extrapolation": flags["extrapolation"],
+        "sparse": flags["sparse"],
+        "batch_scaling": flags["batch_scaling"],
+        "uncertainty": flags["uncertainty"],
+    }
+    metrics = make_dummy_metrics()
+
+    bench_fcts.compare_models(metrics, cfg)
+
+    assert record_calls == expected_sequence
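One detail worth noting in the stub fixture above: the `_n=name` default argument binds the current loop value when each lambda is created, so every stub records its own function name rather than whichever name the loop finished on. A minimal, self-contained illustration of that Python closure behaviour (unrelated to this repository's code):

    # late binding: every closure reads the loop variable after the loop has ended
    fns = [lambda: name for name in ["a", "b"]]
    print([f() for f in fns])      # ['b', 'b']

    # default argument: the value is captured per iteration
    fns = [lambda _n=name: _n for name in ["a", "b"]]
    print([f() for f in fns])      # ['a', 'b']

Assuming the repository's usual pytest setup, the new cases can be run in isolation with `pytest test/test_model_comparison.py`.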
