Skip to content

Commit 428ff3b

Browse files
committed
directory changes, logging changes
1 parent 7bd68ba commit 428ff3b

8 files changed

Lines changed: 171 additions & 179 deletions

File tree

README.md

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,14 +158,27 @@ After installation, activate the virtual environment to run your code or noteboo
158158
The main entry point for running experiments is typically a script or notebook that defines the parameter space and iterates through it. Here is a conceptual example of how to run a single pipeline iteration:
159159

160160
```python
161+
import os
162+
from pathlib import Path
161163
from ml_grid.pipeline.data import pipe
162164
from ml_grid.util.param_space import parameter_space
163165
from ml_grid.util.global_params import global_parameters
166+
from ml_grid.util.create_experiment_directory import create_experiment_directory
164167
165168
# Define global settings
166169
global_parameters.verbose = 2
167170
global_parameters.error_raise = False
168171
172+
# Define project root and experiment directories robustly
173+
# Assumes the script/notebook is in a subdirectory like 'notebooks'
174+
project_root = Path().resolve().parent
175+
176+
# Define a base directory for all experiments within the project root
177+
experiments_base_dir = project_root / "experiments"
178+
179+
# Create a unique, timestamped directory for this specific experiment run
180+
experiment_dir = create_experiment_directory(base_dir=experiments_base_dir, additional_naming="MyExperiment")
181+
169182
# Load the parameter space
170183
param_space_df = parameter_space().get_parameter_space()
171184
@@ -174,10 +187,11 @@ local_param_dict = param_space_df.iloc[0].to_dict()
174187
175188
# Instantiate and run the pipeline
176189
ml_grid_object = pipe(
177-
file_name='path/to/your/data.csv',
190+
file_name=str(project_root / "data" / "your_data.csv"),
178191
drop_term_list=['id', 'unwanted_col'],
179192
local_param_dict=local_param_dict,
180-
base_project_dir='path/to/your/project/',
193+
base_project_dir=str(project_root),
194+
experiment_dir=experiment_dir,
181195
param_space_index=0
182196
)
183197

ml_grid/model_classes/lightgbm_class.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@ def __init__(
2121
num_leaves: int = 31,
2222
learning_rate: float = 0.05,
2323
n_estimators: int = 100,
24-
objective: str = "multiclass",
25-
num_class: int = 1,
26-
metric: str = "multi_logloss",
24+
objective: str = "binary",
25+
num_class: Optional[int] = None,
26+
metric: str = "logloss",
2727
feature_fraction: float = 0.9,
2828
early_stopping_rounds: Optional[int] = None,
2929
verbosity: int = -1,
@@ -36,8 +36,9 @@ def __init__(
3636
learning_rate (float): Boosting learning rate.
3737
n_estimators (int): Number of boosting rounds.
3838
objective (str): The learning objective.
39-
num_class (int): The number of classes for multiclass classification.
40-
metric (str): The metric to be used for evaluation.
39+
num_class (Optional[int]): The number of classes for multiclass
40+
classification. Not needed for binary. Defaults to None.
41+
metric (str): The metric to be used for evaluation. Defaults to 'logloss'.
4142
feature_fraction (float): Fraction of features to be considered for each
4243
tree.
4344
early_stopping_rounds (Optional[int]): Activates early stopping.

ml_grid/pipeline/data_correlation_matrix.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import sys
12
from typing import Any, Dict, List, Tuple
23

34
import pandas as pd
@@ -69,7 +70,7 @@ def handle_correlation_matrix(
6970
]
7071

7172
# Iterate through each column chunk
72-
for chunk in tqdm(column_chunks, desc="Calculating Correlations"):
73+
for chunk in tqdm(column_chunks, desc="Calculating Correlations", file=sys.stdout):
7374
# Calculate the correlation coefficients for the current chunk
7475
try:
7576
correlations = df_numeric[chunk].corr()
Lines changed: 0 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,56 +0,0 @@
1-
import logging
2-
import pathlib
3-
import os
4-
from typing import Any, Dict, Optional
5-
6-
7-
class log_folder:
    """Creates a unique log folder for each experimental run based on its parameters."""

    def __init__(
        self,
        local_param_dict: Dict[str, Any],
        additional_naming: Optional[str],
        base_project_dir: str,
    ) -> None:
        """Build the run-specific log directory and attach basic logging to it.

        The folder name is derived by concatenating the values of
        ``local_param_dict`` (the ``data`` sub-dict contributes its values as
        a run of 0/1 digits), optionally suffixed with ``additional_naming``.
        The directory ``<base_project_dir>/<name>/logs`` is created and the
        root logger is pointed at a ``log.log`` file inside it, with a
        mirrored stream handler.

        Note:
            This re-configures the root logger on each instantiation, which
            may have unintended side effects in a larger application.

        Args:
            local_param_dict (Dict[str, Any]): Parameters for the current
                pipeline run.
            additional_naming (Optional[str]): Extra suffix for the folder
                name; ``None`` adds nothing.
            base_project_dir (str): The root directory for the project.
        """
        name_parts = []
        for key, value in local_param_dict.items():
            if key == "data":
                data_section = local_param_dict.get("data", {})
                # Each data flag becomes a single digit (e.g. 1.0 -> "1").
                name_parts.append(
                    "".join(str(int(data_section.get(k))) for k in data_section)
                )
            else:
                name_parts.append("_" + str(value))

        global_param_str = "".join(name_parts)
        print(global_param_str)

        folder_name = f"{global_param_str}{additional_naming or ''}"
        logs_dir = pathlib.Path(base_project_dir) / folder_name / "logs"
        logs_dir.mkdir(parents=True, exist_ok=True)

        # Route the root logger to a file inside the run's log directory and
        # mirror records to a stream handler using the basic format.
        logging.basicConfig(filename=str(logs_dir / "log.log"))
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
        logging.getLogger().addHandler(stream_handler)

ml_grid/results_processing/core.py

Lines changed: 65 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -62,25 +62,68 @@ def load_feature_names(self, feature_names_csv: str) -> None:
6262
)
6363

6464
def get_available_runs(self) -> List[str]:
    """Gets a list of available run folders by recursively searching for log files.

    This method is robust to nested directory structures. It finds all
    `final_grid_score_log.csv` files and returns their parent directory
    names as the list of available runs.

    Special case: If a log file exists directly in the root folder, a
    `__ROOT__`-prefixed identifier based on the root folder's name is used.

    Note:
        Because results are collected into a set of bare folder names, two
        nested runs whose parent folders share the same name collapse into a
        single entry — presumably acceptable for timestamped folders, but
        worth confirming for arbitrary layouts.

    Returns:
        List[str]: A sorted list of valid run folder names.

    Raises:
        ValueError: If the root folder does not exist or is not a directory.
    """
    if not self.root_folder.is_dir():
        raise ValueError(f"Root folder {self.root_folder} is not a valid directory")

    # Check if log file exists directly in root
    root_log_file = self.root_folder / "final_grid_score_log.csv"
    run_folders = set()

    if root_log_file.exists():
        # Use a special identifier for root-level CSV
        run_folders.add(f"__ROOT__{self.root_folder.name}")

    # Recursively find all log files in subfolders
    for log_file in self.root_folder.rglob("final_grid_score_log.csv"):
        # Skip the root-level file (already handled)
        if log_file == root_log_file:
            continue
        # Add the immediate parent folder name
        run_folders.add(log_file.parent.name)

    # sorted() accepts any iterable; materializing an intermediate list
    # via list() is redundant.
    return sorted(run_folders)
100+
101+
def _resolve_run_path(self, run_name: str) -> Path:
102+
"""Resolves a run name to its full path.
103+
104+
Args:
105+
run_name: The run folder name or special root identifier
106+
107+
Returns:
108+
Path to the run folder
109+
110+
Raises:
111+
FileNotFoundError: If the run cannot be found
112+
"""
113+
# Check if this is the special root identifier
114+
if run_name.startswith("__ROOT__"):
115+
root_log = self.root_folder / "final_grid_score_log.csv"
116+
if root_log.exists():
117+
return self.root_folder
118+
raise FileNotFoundError(f"Root log file not found: {root_log}")
119+
120+
# Search for the folder name within the root directory
121+
try:
122+
return next(self.root_folder.rglob(f"**/{run_name}"))
123+
except StopIteration:
124+
raise FileNotFoundError(
125+
f"Run folder '{run_name}' not found anywhere under {self.root_folder}"
126+
)
84127

85128
def load_single_run(self, timestamp_folder: str) -> pd.DataFrame:
86129
"""Loads results from a specific timestamped run folder.
@@ -94,8 +137,10 @@ def load_single_run(self, timestamp_folder: str) -> pd.DataFrame:
94137
Raises:
95138
FileNotFoundError: If the log file does not exist in the folder.
96139
"""
97-
log_path = self.root_folder / timestamp_folder / "final_grid_score_log.csv"
98-
140+
# Resolve the run name to its full path. This handles nesting and the special root case.
141+
run_folder_path = self._resolve_run_path(timestamp_folder)
142+
143+
log_path = run_folder_path / "final_grid_score_log.csv"
99144
if not log_path.exists():
100145
raise FileNotFoundError(f"Log file not found: {log_path}")
101146

@@ -138,7 +183,12 @@ def aggregate_specific_runs(self, run_names: List[str]) -> pd.DataFrame:
138183

139184
for run in run_names:
140185
try:
186+
# Resolve the run name to its actual path. This handles
187+
# the special '__ROOT__' case and nested folders. The path is what we need.
188+
run_folder_path = self._resolve_run_path(run)
141189
df = self.load_single_run(run)
190+
# Add the relative path to the run for better context
191+
df['run_path'] = str(run_folder_path.relative_to(self.root_folder))
142192
all_dataframes.append(df)
143193
print(f"Loaded run: {run} ({len(df)} records)")
144194
except Exception as e:
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import os
2+
from typing import Optional
3+
from datetime import datetime
4+
from pathlib import Path
5+
6+
def create_experiment_directory(
    base_dir: str, additional_naming: Optional[str] = None
) -> str:
    """Creates a single, timestamped directory for a group of experiment runs.

    Call this once at the start of an experiment script to obtain a unique
    parent folder for all the runs in that batch. The folder name is the
    current timestamp, optionally followed by a descriptive suffix.

    Args:
        base_dir (str): The base directory where experiment folders will be
            stored (e.g., 'notebooks/HFE_ML_experiments').
        additional_naming (Optional[str], optional): A descriptive name to
            append to the timestamp. Defaults to None.

    Returns:
        str: The full path to the created experiment directory.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    if additional_naming:
        folder_name = f"{timestamp}_{additional_naming}"
    else:
        folder_name = timestamp
    experiment_dir = Path(base_dir) / folder_name
    # parents/exist_ok make repeated calls into the same base_dir safe.
    experiment_dir.mkdir(parents=True, exist_ok=True)
    print(f"Experiment directory created: {experiment_dir}")
    return str(experiment_dir)

0 commit comments

Comments
 (0)