Skip to content

Commit e933fc0

Browse files
committed
Merge branch 'improve-surrogate-evaluation' of https://github.com/robin-janssen/CODES-Benchmark into improve-surrogate-evaluation
2 parents 363fb3e + 9e0ca37 commit e933fc0

7 files changed

Lines changed: 350 additions & 94 deletions

File tree

codes/surrogates/LatentPolynomial/latent_poly.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,7 @@ def total_loss(
399399
traj_loss = criterion(x_pred, x_true)
400400

401401
# identity loss (reconstruct x0)
402-
identity = self.identity_loss(x_true, params.to(self.device))
402+
identity = self.identity_loss(x_true, params)
403403

404404
# derivative losses: compute once
405405
d_pred = self.first_derivative(x_pred)
@@ -455,6 +455,7 @@ def identity_loss(self, x_true: Tensor, params: Tensor = None):
455455
# only reconstruct the initial state
456456
x0 = x_true[:, 0, :]
457457
if not self.config.coeff_network and params is not None:
458+
params = params.to(self.device)
458459
enc_input = torch.cat([x0, params], dim=1)
459460
else:
460461
enc_input = x0

codes/tune/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
)
77
from .optuna_fcts import (
88
MaxValidTrialsCallback,
9+
_count_valid_trials,
10+
build_fine_optuna_params,
911
create_objective,
1012
load_yaml_config,
1113
make_optuna_params,
@@ -31,6 +33,8 @@
3133

3234
__all__ = [
3335
"create_objective",
36+
"MaxValidTrialsCallback",
37+
"build_fine_optuna_params",
3438
"load_yaml_config",
3539
"make_optuna_params",
3640
"maybe_set_runtime_threshold",
@@ -51,4 +55,5 @@
5155
"_check_remote_reachable",
5256
"_initialize_postgres_local",
5357
"_initialize_postgres_remote",
58+
"_count_valid_trials",
5459
]

codes/tune/optuna_fcts.py

Lines changed: 119 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,60 @@ def objective(trial):
223223
return objective
224224

225225

226+
def create_objective(
    config: dict, study_name: str, device_queue: queue.Queue
) -> callable:
    """
    Create the objective function for Optuna.

    The returned closure checks a ``(device, slot_id)`` pair out of
    ``device_queue`` for the duration of one trial and always returns it to
    the queue afterwards, even when the trial fails.

    Args:
        config (dict): Configuration dictionary.
        study_name (str): Name of the study.
        device_queue (queue.Queue): Queue of available (device, slot_id) pairs.

    Returns:
        function: Objective function for Optuna.
    """

    def objective(trial):
        device, slot_id = device_queue.get()
        try:
            try:
                return training_run(trial, device, slot_id, config, study_name)
            except torch.cuda.OutOfMemoryError as e:
                torch.cuda.empty_cache()
                # str(e) (unlike repr(e), which is never empty) can be an
                # empty string, so the fallback message is actually reachable.
                msg = str(e).strip() or "CUDA Out of Memory (no details provided)."
                trial.set_user_attr("exception", msg)
                tqdm.write(f"[Trial {trial.number}] resulted in an OOM error.")
                # Return a capped, clearly-bad score instead of pruning so the
                # sampler learns that this region of the space is infeasible.
                if config.get("multi_objective", False):
                    # Multi-objective mode returns (loss, inference_time);
                    # 10 s is a deliberately pessimal inference-time penalty.
                    return float(config.get("loss_cap", 20)), float(10)
                # Single-objective mode returns just the capped loss.
                return float(config.get("loss_cap", 20))
            except optuna.TrialPruned as e:
                # Record why the trial was pruned, then let Optuna handle it.
                trial.set_user_attr("exception", repr(e).strip())
                raise
            except Exception as e:
                torch.cuda.empty_cache()
                msg = str(e).strip() or "Unknown error occurred."
                tqdm.write(
                    f"Trial {trial.number} failed due to an unexpected error: {msg}"
                )
                trial.set_user_attr("exception", msg)
                # Convert unexpected failures into pruned trials so the study
                # keeps running instead of aborting.
                raise optuna.TrialPruned(f"Error in trial {trial.number}: {msg}")
        finally:
            # Always hand the device slot back, whatever happened above.
            device_queue.put((device, slot_id))

    return objective
278+
279+
226280
def training_run(
227281
trial: optuna.Trial, device: str, slot_id: int, config: dict, study_name: str
228282
) -> float | tuple[float, float]:
@@ -244,7 +298,6 @@ def training_run(
244298

245299
download_data(config["dataset"]["name"], verbose=False)
246300

247-
# Load full data and parameters
248301
(
249302
(train_data, test_data, _),
250303
(train_params, test_params, _),
@@ -263,21 +316,29 @@ def training_run(
263316
)
264317

265318
subset_factor = config["dataset"].get("subset_factor", 1)
266-
# Get the appropriate subset of the training data
267-
# We nevertheless use the full test data to measure performance.
268319
train_data = train_data[::subset_factor]
269320
train_params = train_params[::subset_factor] if train_params is not None else None
270321

271322
set_random_seeds(config["seed"], device=device)
272323
surr_name = config["surrogate"]["name"]
273-
suggested_params = make_optuna_params(trial, config["optuna_params"])
274-
n_params = train_params.shape[1] if train_params is not None else 0
275324

325+
# Load base (best) config from disk as you already do
326+
model_config = get_model_config(surr_name, config)
327+
328+
# Decide search space
329+
if config.get("fine", False):
330+
fine_space = config.get("fine_space")
331+
suggested_params = make_optuna_params(trial, fine_space)
332+
else:
333+
suggested_params = make_optuna_params(trial, config["optuna_params"])
334+
335+
n_params = train_params.shape[1] if train_params is not None else 0
276336
n_timesteps = train_data.shape[1]
277337
n_quantities = train_data.shape[2]
278338
surrogate_class = get_surrogate(surr_name)
279-
model_config = get_model_config(surr_name, config)
339+
280340
model_config.update(suggested_params)
341+
281342
model = surrogate_class(
282343
device=device,
283344
n_quantities=n_quantities,
@@ -312,30 +373,21 @@ def training_run(
312373
multi_objective=config["multi_objective"],
313374
)
314375

315-
# criterion = torch.nn.MSELoss()
316376
preds, targets = model.predict(test_loader, leave_log=True)
317377
p99_dex = torch.quantile(
318378
(preds - targets).abs().flatten(), float(config["target_percentile"])
319379
).item()
320-
# cap the loss to prevent exploding values
321380
p99_dex = min(p99_dex, config.get("loss_cap", 20))
322381

323-
# Extract the study name without the timestamp/suffix part
324382
parts = study_name.split("_")
325383
sname = "_".join(parts[:-1]) if len(parts) > 1 else study_name
326384

327385
savepath = os.path.join("tuned", sname, "models")
328386
os.makedirs(savepath, exist_ok=True)
329387
model_name = f"{surr_name.lower()}_{trial.number}"
330-
model.save(
331-
model_name=model_name,
332-
base_dir="",
333-
training_id=savepath,
334-
)
388+
model.save(model_name=model_name, base_dir="", training_id=savepath)
335389

336-
# Check if we're running multi-objective optimisation
337390
if config["multi_objective"]:
338-
# Measure inference time
339391
with _inference_time_lock:
340392
inference_times = measure_inference_time(model, test_loader)
341393
return p99_dex, np.mean(inference_times)
@@ -405,3 +457,54 @@ def is_bad(tr):
405457
f"\n[Study] Warmup complete. Runtime threshold set to {threshold:.1f}s "
406458
f"(mean = {mean_:.1f}s, std = {std_:.1f}s) over trials {used_trial_numbers}."
407459
)
460+
461+
462+
def _bounds_around(
463+
v: float, factor: float = 10.0, lo: float | None = None, hi: float | None = None
464+
) -> tuple[float, float]:
465+
low, high = float(v) / factor, float(v) * factor
466+
if lo is not None:
467+
low = max(low, lo)
468+
if hi is not None:
469+
high = min(high, hi)
470+
# avoid degenerate ranges
471+
if high <= low:
472+
eps = max(abs(v) * 1e-3, 1e-12)
473+
low, high = float(v) - eps, float(v) + eps
474+
return low, high
475+
476+
477+
def build_fine_optuna_params(model_config: dict) -> dict:
    """
    Build a narrow ("fine") Optuna search space around a known-good config.

    For each recognised numeric hyperparameter present in ``model_config``,
    a log-uniform float range spanning one order of magnitude on either side
    of the stored value is generated via ``_bounds_around``.

    Args:
        model_config (dict): Base (best) model configuration loaded from disk.

    Returns:
        dict: Mapping of parameter name to an Optuna-style space spec of the
            form ``{"type": "float", "low": ..., "high": ..., "log": True}``.
    """
    keys = (
        "learning_rate",
        "beta",
        "poly_power",
        "eta_min",
        "regularization_factor",
        "momentum",
    )
    space: dict[str, dict] = {}
    for k in keys:
        if k not in model_config:
            continue
        val = model_config[k]
        # bool is a subclass of int, and a log-scaled range needs a strictly
        # positive centre value, so skip flags and non-positive entries
        # (log=True with a non-positive bound would crash suggest_float).
        if isinstance(val, bool) or not isinstance(val, (int, float)) or val <= 0:
            continue
        lo, hi = _bounds_around(
            val,
            factor=10.0,
            lo=1e-12 if k != "momentum" else 0.0,
            hi=0.999 if k == "momentum" else None,
        )
        space[k] = {"type": "float", "low": lo, "high": hi, "log": True}
    return space
501+
502+
503+
def _is_valid_trial(t: optuna.trial.FrozenTrial) -> bool:
    """Return True when *t* finished (COMPLETE or PRUNED) without a recorded exception."""
    if t.state not in (TrialState.COMPLETE, TrialState.PRUNED):
        return False
    return "exception" not in t.user_attrs
507+
508+
509+
def _count_valid_trials(study: optuna.Study) -> int:
    """Count the trials in *study* that qualify per ``_is_valid_trial``."""
    total = 0
    for t in study.get_trials(deepcopy=False):
        if _is_valid_trial(t):
            total += 1
    return total

codes/utils/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def create_model_dir(
6363

6464
# Check if the directory exists, and create it if it doesn't
6565
if not os.path.exists(full_path):
66-
os.makedirs(full_path)
66+
os.makedirs(full_path, exist_ok=True)
6767

6868
return full_path
6969

0 commit comments

Comments
 (0)