Skip to content

Commit 46f65ef

Browse files
committed
implement hyperparameter finetuning mode
1 parent 184a566 commit 46f65ef

3 files changed

Lines changed: 208 additions & 28 deletions

File tree

codes/tune/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
)
77
from .optuna_fcts import (
88
MaxValidTrialsCallback,
9+
_count_valid_trials,
10+
build_fine_optuna_params,
911
create_objective,
1012
load_yaml_config,
1113
make_optuna_params,
@@ -31,6 +33,8 @@
3133

3234
__all__ = [
3335
"create_objective",
36+
"MaxValidTrialsCallback",
37+
"build_fine_optuna_params",
3438
"load_yaml_config",
3539
"make_optuna_params",
3640
"maybe_set_runtime_threshold",
@@ -51,4 +55,5 @@
5155
"_check_remote_reachable",
5256
"_initialize_postgres_local",
5357
"_initialize_postgres_remote",
58+
"_count_valid_trials",
5459
]

codes/tune/optuna_fcts.py

Lines changed: 119 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,60 @@ def objective(trial):
223223
return objective
224224

225225

226+
def create_objective(
    config: dict, study_name: str, device_queue: queue.Queue
) -> callable:
    """
    Build the Optuna objective closure for a single study.

    Args:
        config (dict): Configuration dictionary.
        study_name (str): Name of the study.
        device_queue (queue.Queue): Queue of available devices.

    Returns:
        function: Objective function for Optuna.
    """

    def objective(trial):
        # Reserve a device slot for the duration of this trial.
        device, slot_id = device_queue.get()
        try:
            return training_run(trial, device, slot_id, config, study_name)
        except torch.cuda.OutOfMemoryError as exc:
            torch.cuda.empty_cache()
            reason = repr(exc).strip() or "CUDA Out of Memory (no details provided)."
            trial.set_user_attr("exception", reason)
            tqdm.write(f"[Trial {trial.number}] resulted in an OOM error.")
            # raise optuna.TrialPruned(f"OOM error in trial {trial.number}")
            capped = float(config.get("loss_cap", 20))
            if config.get("multi_objective", False):
                # Multi-objective mode expects a (loss, runtime) tuple.
                return capped, float(10)
            # Single-objective mode reports the capped loss alone.
            return capped
        except optuna.TrialPruned as exc:
            # Record why the trial was pruned, then let Optuna handle it.
            trial.set_user_attr("exception", repr(exc).strip())
            raise
        except Exception as exc:
            torch.cuda.empty_cache()
            reason = repr(exc).strip() or "Unknown error occurred."
            tqdm.write(
                f"Trial {trial.number} failed due to an unexpected error: {reason}"
            )
            trial.set_user_attr("exception", reason)
            raise optuna.TrialPruned(f"Error in trial {trial.number}: {reason}")
        finally:
            # Always hand the device slot back, whatever happened above.
            device_queue.put((device, slot_id))

    return objective
278+
279+
226280
def training_run(
227281
trial: optuna.Trial, device: str, slot_id: int, config: dict, study_name: str
228282
) -> float | tuple[float, float]:
@@ -244,7 +298,6 @@ def training_run(
244298

245299
download_data(config["dataset"]["name"], verbose=False)
246300

247-
# Load full data and parameters
248301
(
249302
(train_data, test_data, _),
250303
(train_params, test_params, _),
@@ -263,21 +316,29 @@ def training_run(
263316
)
264317

265318
subset_factor = config["dataset"].get("subset_factor", 1)
266-
# Get the appropriate subset of the training data
267-
# We nevertheless use the full test data to measure performance.
268319
train_data = train_data[::subset_factor]
269320
train_params = train_params[::subset_factor] if train_params is not None else None
270321

271322
set_random_seeds(config["seed"], device=device)
272323
surr_name = config["surrogate"]["name"]
273-
suggested_params = make_optuna_params(trial, config["optuna_params"])
274-
n_params = train_params.shape[1] if train_params is not None else 0
275324

325+
# Load base (best) config from disk as you already do
326+
model_config = get_model_config(surr_name, config)
327+
328+
# Decide search space
329+
if config.get("fine", False):
330+
fine_space = config.get("fine_space")
331+
suggested_params = make_optuna_params(trial, fine_space)
332+
else:
333+
suggested_params = make_optuna_params(trial, config["optuna_params"])
334+
335+
n_params = train_params.shape[1] if train_params is not None else 0
276336
n_timesteps = train_data.shape[1]
277337
n_quantities = train_data.shape[2]
278338
surrogate_class = get_surrogate(surr_name)
279-
model_config = get_model_config(surr_name, config)
339+
280340
model_config.update(suggested_params)
341+
281342
model = surrogate_class(
282343
device=device,
283344
n_quantities=n_quantities,
@@ -312,30 +373,21 @@ def training_run(
312373
multi_objective=config["multi_objective"],
313374
)
314375

315-
# criterion = torch.nn.MSELoss()
316376
preds, targets = model.predict(test_loader, leave_log=True)
317377
p99_dex = torch.quantile(
318378
(preds - targets).abs().flatten(), float(config["target_percentile"])
319379
).item()
320-
# cap the loss to prevent exploding values
321380
p99_dex = min(p99_dex, config.get("loss_cap", 20))
322381

323-
# Extract the study name without the timestamp/suffix part
324382
parts = study_name.split("_")
325383
sname = "_".join(parts[:-1]) if len(parts) > 1 else study_name
326384

327385
savepath = os.path.join("tuned", sname, "models")
328386
os.makedirs(savepath, exist_ok=True)
329387
model_name = f"{surr_name.lower()}_{trial.number}"
330-
model.save(
331-
model_name=model_name,
332-
base_dir="",
333-
training_id=savepath,
334-
)
388+
model.save(model_name=model_name, base_dir="", training_id=savepath)
335389

336-
# Check if we're running multi-objective optimisation
337390
if config["multi_objective"]:
338-
# Measure inference time
339391
with _inference_time_lock:
340392
inference_times = measure_inference_time(model, test_loader)
341393
return p99_dex, np.mean(inference_times)
@@ -405,3 +457,54 @@ def is_bad(tr):
405457
f"\n[Study] Warmup complete. Runtime threshold set to {threshold:.1f}s "
406458
f"(mean = {mean_:.1f}s, std = {std_:.1f}s) over trials {used_trial_numbers}."
407459
)
460+
461+
462+
def _bounds_around(
463+
v: float, factor: float = 10.0, lo: float | None = None, hi: float | None = None
464+
) -> tuple[float, float]:
465+
low, high = float(v) / factor, float(v) * factor
466+
if lo is not None:
467+
low = max(low, lo)
468+
if hi is not None:
469+
high = min(high, hi)
470+
# avoid degenerate ranges
471+
if high <= low:
472+
eps = max(abs(v) * 1e-3, 1e-12)
473+
low, high = float(v) - eps, float(v) + eps
474+
return low, high
475+
476+
477+
def build_fine_optuna_params(model_config: dict) -> dict:
    """
    Derive a narrow Optuna search space around a previously tuned config.

    Only a fixed whitelist of continuous optimizer/scheduler hyperparameters
    is considered; each present, positive numeric value is turned into a
    log-uniform interval spanning one decade on either side of it.

    Args:
        model_config (dict): Best-known model configuration loaded from disk.

    Returns:
        dict: Mapping of parameter name to an Optuna float-space spec
            ({"type", "low", "high", "log"}); empty if nothing qualifies.
    """
    keys = (
        "learning_rate",
        "beta",
        "poly_power",
        "eta_min",
        "regularization_factor",
        "momentum",
    )
    space: dict[str, dict] = {}
    for k in keys:
        if k not in model_config:
            continue
        val = model_config[k]
        # bool is a subclass of int, but True/False are switches, not tunable
        # magnitudes — exclude them explicitly before the numeric check.
        if isinstance(val, bool) or not isinstance(val, (int, float)):
            continue
        # Non-positive values cannot anchor a log-scaled interval
        # (Optuna requires low > 0 when log=True), so skip them.
        if val <= 0:
            continue
        lo, hi = _bounds_around(
            val,
            factor=10.0,
            lo=1e-12 if k != "momentum" else 0.0,
            hi=0.999 if k == "momentum" else None,
        )
        space[k] = {"type": "float", "low": lo, "high": hi, "log": True}
    return space
501+
502+
503+
def _is_valid_trial(t: optuna.trial.FrozenTrial) -> bool:
    """A trial is valid if it finished (complete or pruned) and never
    recorded an exception in its user attributes."""
    finished = t.state in (TrialState.COMPLETE, TrialState.PRUNED)
    clean = "exception" not in t.user_attrs
    return finished and clean
507+
508+
509+
def _count_valid_trials(study: optuna.Study) -> int:
    """Count the trials in *study* that `_is_valid_trial` accepts."""
    valid = 0
    for trial in study.get_trials(deepcopy=False):
        if _is_valid_trial(trial):
            valid += 1
    return valid

run_tuning.py

Lines changed: 84 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,20 @@
11
import argparse
2+
import os
23
import queue
34
import sys
45
import time
56
from pathlib import Path
67

78
import optuna
9+
import yaml
810
from optuna.trial import TrialState
911
from tqdm import tqdm
1012

13+
from codes.benchmark import get_model_config
1114
from codes.tune import (
1215
MaxValidTrialsCallback,
16+
_count_valid_trials,
17+
build_fine_optuna_params,
1318
create_objective,
1419
initialize_optuna_database,
1520
load_yaml_config,
@@ -23,6 +28,16 @@ def run_single_study(config: dict, study_name: str, db_url: str):
2328
if not config.get("optuna_logging", False):
2429
optuna.logging.set_verbosity(optuna.logging.WARNING)
2530

31+
if config.get("fine", False):
32+
try:
33+
base_cfg = get_model_config(config["surrogate"]["name"], config)
34+
finetune_space = build_fine_optuna_params(base_cfg)
35+
n_fine = len(finetune_space)
36+
except Exception:
37+
n_fine = 0 # conservative fallback
38+
39+
config["n_trials"] = max(5 * n_fine, 5) # disregard YAML trials
40+
2641
if config["multi_objective"]:
2742
sampler = optuna.samplers.NSGAIISampler(
2843
seed=config["seed"], population_size=config["population_size"]
@@ -56,6 +71,13 @@ def run_single_study(config: dict, study_name: str, db_url: str):
5671
load_if_exists=True,
5772
)
5873

74+
have = _count_valid_trials(study)
75+
if have >= config["n_trials"]:
76+
print(
77+
f"[skip] {study_name}: already has {have} valid trials (target {config['n_trials']}). Skipping optimize()."
78+
)
79+
return
80+
5981
device_queue = queue.Queue()
6082
for slot_id, dev in enumerate(config["devices"]):
6183
device_queue.put((dev, slot_id))
@@ -110,7 +132,11 @@ def trial_complete_callback(study_: optuna.Study, trial_: optuna.trial.FrozenTri
110132

111133
def run_all_studies(config: dict, main_study_name: str, db_url: str):
112134
surrogates = config["surrogates"]
113-
global_params = config.get("global_optuna_params", {})
135+
global_params = (
136+
{} if config.get("fine", False) else config.get("global_optuna_params", {})
137+
)
138+
139+
fine_report: dict[str, dict] = {}
114140

115141
total_sub_studies = len(surrogates)
116142
with tqdm(
@@ -122,15 +148,47 @@ def run_all_studies(config: dict, main_study_name: str, db_url: str):
122148
)
123149

124150
for surr in surrogates:
125-
local = surr.get("optuna_params", {})
126-
for name, opts in global_params.items():
127-
if name in local:
128-
print(
129-
f"⚠️ Hyperparameter '{name}' defined globally and locally for {surr['name']}; using local."
130-
)
131-
else:
132-
local[name] = opts
133-
surr["optuna_params"] = local
151+
arch_name = surr["name"]
152+
if config.get("fine", False):
153+
# ignore manual search spaces
154+
surr["optuna_params"] = {}
155+
156+
# derive fine space from previously best config
157+
base_cfg = get_model_config(arch_name, config)
158+
fine_space = build_fine_optuna_params(base_cfg)
159+
n_fine = len(fine_space)
160+
n_trials_override = max(5 * n_fine, 5)
161+
162+
# CLI confirmation
163+
print(
164+
f"[fine] {arch_name}: found fine-tunable parameters: {list(fine_space.keys()) or 'none'}"
165+
)
166+
for k, spec in fine_space.items():
167+
print(f" - {k}: [{spec['low']:.3g}, {spec['high']:.3g}] (log)")
168+
print(f" -> running for {n_trials_override} trials\n")
169+
170+
# stash for YAML and pass along to run_single_study
171+
fine_report[arch_name] = {
172+
"trials": int(n_trials_override),
173+
"params": {
174+
k: {
175+
"low": float(v["low"]),
176+
"high": float(v["high"]),
177+
"log": bool(v.get("log", False)),
178+
}
179+
for k, v in fine_space.items()
180+
},
181+
}
182+
else:
183+
local = surr.get("optuna_params", {})
184+
for name, opts in global_params.items():
185+
if name in local:
186+
print(
187+
f"⚠️ Hyperparameter '{name}' defined globally and locally for {surr['name']}; using local."
188+
)
189+
else:
190+
local[name] = opts
191+
surr["optuna_params"] = local
134192

135193
arch_name = surr["name"]
136194
study_name = f"{main_study_name}_{arch_name.lower()}"
@@ -142,22 +200,36 @@ def run_all_studies(config: dict, main_study_name: str, db_url: str):
142200
"dataset": config["dataset"],
143201
"devices": config["devices"],
144202
"epochs": surr["epochs"],
145-
"n_trials": trials,
203+
"n_trials": trials if not n_trials_override else n_trials_override,
146204
"seed": config["seed"],
147205
"surrogate": {"name": arch_name},
148-
"optuna_params": surr["optuna_params"],
206+
"optuna_params": surr.get("optuna_params", {}),
149207
"prune": config.get("prune", True),
150208
"optuna_logging": config.get("optuna_logging", False),
151209
"use_optimal_params": config.get("use_optimal_params", False),
152210
"multi_objective": config.get("multi_objective", False),
153211
"population_size": config.get("population_size", 50),
154212
"target_percentile": config.get("target_percentile", 0.95),
213+
"fine": config.get("fine", False), # pass through
214+
"loss_cap": config.get("loss_cap", 20),
155215
}
156216

217+
if config.get("fine", False):
218+
sub_config["fine_space"] = fine_space
219+
157220
run_single_study(sub_config, study_name, db_url)
158221
arch_pbar.update(1)
159222
arch_pbar.set_postfix({"done": study_name})
160223

224+
# Write YAML summary once per main study (only in fine mode)
225+
if config.get("fine", False):
226+
out_dir = os.path.join("tuned", main_study_name)
227+
os.makedirs(out_dir, exist_ok=True)
228+
out_path = os.path.join(out_dir, "fine_summary.yaml")
229+
with open(out_path, "w", encoding="utf-8") as f:
230+
yaml.safe_dump(fine_report, f, sort_keys=True, default_flow_style=False)
231+
print(f"[fine] Wrote summary: {out_path}")
232+
161233

162234
def parse_arguments():
163235
parser = argparse.ArgumentParser(description="Run Optuna tuning studies.")

0 commit comments

Comments (0)