Skip to content

Commit 428ff3b

Browse files
committed
directory changes, logging changes
1 parent 7bd68ba commit 428ff3b

8 files changed

Lines changed: 171 additions & 179 deletions

File tree

README.md

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,14 +158,27 @@ After installation, activate the virtual environment to run your code or noteboo
158158
The main entry point for running experiments is typically a script or notebook that defines the parameter space and iterates through it. Here is a conceptual example of how to run a single pipeline iteration:
159159

160160
```python
161+
import os
162+
from pathlib import Path
161163
from ml_grid.pipeline.data import pipe
162164
from ml_grid.util.param_space import parameter_space
163165
from ml_grid.util.global_params import global_parameters
166+
from ml_grid.util.create_experiment_directory import create_experiment_directory
164167
165168
# Define global settings
166169
global_parameters.verbose = 2
167170
global_parameters.error_raise = False
168171
172+
# Define project root and experiment directories robustly
173+
# Assumes the script/notebook is in a subdirectory like 'notebooks'
174+
project_root = Path().resolve().parent
175+
176+
# Define a base directory for all experiments within the project root
177+
experiments_base_dir = project_root / "experiments"
178+
179+
# Create a unique, timestamped directory for this specific experiment run
180+
experiment_dir = create_experiment_directory(base_dir=experiments_base_dir, additional_naming="MyExperiment")
181+
169182
# Load the parameter space
170183
param_space_df = parameter_space().get_parameter_space()
171184
@@ -174,10 +187,11 @@ local_param_dict = param_space_df.iloc[0].to_dict()
174187
175188
# Instantiate and run the pipeline
176189
ml_grid_object = pipe(
177-
file_name='path/to/your/data.csv',
190+
file_name=str(project_root / "data" / "your_data.csv"),
178191
drop_term_list=['id', 'unwanted_col'],
179192
local_param_dict=local_param_dict,
180-
base_project_dir='path/to/your/project/',
193+
base_project_dir=str(project_root),
194+
experiment_dir=experiment_dir,
181195
param_space_index=0
182196
)
183197

ml_grid/model_classes/lightgbm_class.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@ def __init__(
2121
num_leaves: int = 31,
2222
learning_rate: float = 0.05,
2323
n_estimators: int = 100,
24-
objective: str = "multiclass",
25-
num_class: int = 1,
26-
metric: str = "multi_logloss",
24+
objective: str = "binary",
25+
num_class: Optional[int] = None,
26+
metric: str = "logloss",
2727
feature_fraction: float = 0.9,
2828
early_stopping_rounds: Optional[int] = None,
2929
verbosity: int = -1,
@@ -36,8 +36,9 @@ def __init__(
3636
learning_rate (float): Boosting learning rate.
3737
n_estimators (int): Number of boosting rounds.
3838
objective (str): The learning objective.
39-
num_class (int): The number of classes for multiclass classification.
40-
metric (str): The metric to be used for evaluation.
39+
num_class (Optional[int]): The number of classes for multiclass
40+
classification. Not needed for binary. Defaults to None.
41+
metric (str): The metric to be used for evaluation. Defaults to 'logloss'.
4142
feature_fraction (float): Fraction of features to be considered for each
4243
tree.
4344
early_stopping_rounds (Optional[int]): Activates early stopping.

ml_grid/pipeline/data_correlation_matrix.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import sys
12
from typing import Any, Dict, List, Tuple
23

34
import pandas as pd
@@ -69,7 +70,7 @@ def handle_correlation_matrix(
6970
]
7071

7172
# Iterate through each column chunk
72-
for chunk in tqdm(column_chunks, desc="Calculating Correlations"):
73+
for chunk in tqdm(column_chunks, desc="Calculating Correlations", file=sys.stdout):
7374
# Calculate the correlation coefficients for the current chunk
7475
try:
7576
correlations = df_numeric[chunk].corr()
Lines changed: 0 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,56 +0,0 @@
1-
import logging
2-
import pathlib
3-
import os
4-
from typing import Any, Dict, Optional
5-
6-
7-
class log_folder:
    """Creates a unique log folder for each experimental run based on its parameters."""

    def __init__(
        self,
        local_param_dict: Dict[str, Any],
        additional_naming: Optional[str],
        base_project_dir: str,
    ) -> None:
        """Build the run-specific log directory and attach basic logging to it.

        The folder name is derived by concatenating the values of
        ``local_param_dict`` (the ``data`` sub-dict contributes its values as
        a run of 0/1 digits), optionally suffixed with ``additional_naming``.
        The directory ``<base_project_dir>/<name>/logs`` is created and the
        root logger is pointed at a ``log.log`` file inside it, with a
        mirrored stream handler.

        Note:
            This re-configures the root logger on each instantiation, which
            may have unintended side effects in a larger application.

        Args:
            local_param_dict (Dict[str, Any]): Parameters for the current
                pipeline run.
            additional_naming (Optional[str]): Extra suffix for the folder
                name; ``None`` adds nothing.
            base_project_dir (str): The root directory for the project.
        """
        name_parts = []
        for key, value in local_param_dict.items():
            if key == "data":
                data_section = local_param_dict.get("data", {})
                # Each data flag becomes a single digit (e.g. 1.0 -> "1").
                name_parts.append(
                    "".join(str(int(data_section.get(k))) for k in data_section)
                )
            else:
                name_parts.append("_" + str(value))

        global_param_str = "".join(name_parts)
        print(global_param_str)

        folder_name = f"{global_param_str}{additional_naming or ''}"
        logs_dir = pathlib.Path(base_project_dir) / folder_name / "logs"
        logs_dir.mkdir(parents=True, exist_ok=True)

        # Route the root logger to a file inside the run's log directory and
        # mirror records to a stream handler using the basic format.
        logging.basicConfig(filename=str(logs_dir / "log.log"))
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
        logging.getLogger().addHandler(stream_handler)

ml_grid/results_processing/core.py

Lines changed: 65 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -62,25 +62,68 @@ def load_feature_names(self, feature_names_csv: str) -> None:
6262
)
6363

6464
def get_available_runs(self) -> List[str]:
    """Gets a list of available run folders by recursively searching for log files.

    This method is robust to nested directory structures. It finds all
    `final_grid_score_log.csv` files and returns their parent directory
    names as the list of available runs.

    Special case: If a log file exists directly in the root folder, a
    `__ROOT__`-prefixed identifier based on the root folder's name is used.

    Note:
        Because results are collected into a set of bare folder names, two
        nested runs whose parent folders share the same name collapse into a
        single entry — presumably acceptable for timestamped folders, but
        worth confirming for arbitrary layouts.

    Returns:
        List[str]: A sorted list of valid run folder names.

    Raises:
        ValueError: If the root folder does not exist or is not a directory.
    """
    if not self.root_folder.is_dir():
        raise ValueError(f"Root folder {self.root_folder} is not a valid directory")

    # Check if log file exists directly in root
    root_log_file = self.root_folder / "final_grid_score_log.csv"
    run_folders = set()

    if root_log_file.exists():
        # Use a special identifier for root-level CSV
        run_folders.add(f"__ROOT__{self.root_folder.name}")

    # Recursively find all log files in subfolders
    for log_file in self.root_folder.rglob("final_grid_score_log.csv"):
        # Skip the root-level file (already handled)
        if log_file == root_log_file:
            continue
        # Add the immediate parent folder name
        run_folders.add(log_file.parent.name)

    # sorted() accepts any iterable; materializing an intermediate list
    # via list() is redundant.
    return sorted(run_folders)
100+
101+
def _resolve_run_path(self, run_name: str) -> Path:
102+
"""Resolves a run name to its full path.
103+
104+
Args:
105+
run_name: The run folder name or special root identifier
106+
107+
Returns:
108+
Path to the run folder
109+
110+
Raises:
111+
FileNotFoundError: If the run cannot be found
112+
"""
113+
# Check if this is the special root identifier
114+
if run_name.startswith("__ROOT__"):
115+
root_log = self.root_folder / "final_grid_score_log.csv"
116+
if root_log.exists():
117+
return self.root_folder
118+
raise FileNotFoundError(f"Root log file not found: {root_log}")
119+
120+
# Search for the folder name within the root directory
121+
try:
122+
return next(self.root_folder.rglob(f"**/{run_name}"))
123+
except StopIteration:
124+
raise FileNotFoundError(
125+
f"Run folder '{run_name}' not found anywhere under {self.root_folder}"
126+
)
84127

85128
def load_single_run(self, timestamp_folder: str) -> pd.DataFrame:
86129
"""Loads results from a specific timestamped run folder.
@@ -94,8 +137,10 @@ def load_single_run(self, timestamp_folder: str) -> pd.DataFrame:
94137
Raises:
95138
FileNotFoundError: If the log file does not exist in the folder.
96139
"""
97-
log_path = self.root_folder / timestamp_folder / "final_grid_score_log.csv"
98-
140+
# Resolve the run name to its full path. This handles nesting and the special root case.
141+
run_folder_path = self._resolve_run_path(timestamp_folder)
142+
143+
log_path = run_folder_path / "final_grid_score_log.csv"
99144
if not log_path.exists():
100145
raise FileNotFoundError(f"Log file not found: {log_path}")
101146

@@ -138,7 +183,12 @@ def aggregate_specific_runs(self, run_names: List[str]) -> pd.DataFrame:
138183

139184
for run in run_names:
140185
try:
186+
# Resolve the run name to its actual path. This handles
187+
# the special '__ROOT__' case and nested folders. The path is what we need.
188+
run_folder_path = self._resolve_run_path(run)
141189
df = self.load_single_run(run)
190+
# Add the relative path to the run for better context
191+
df['run_path'] = str(run_folder_path.relative_to(self.root_folder))
142192
all_dataframes.append(df)
143193
print(f"Loaded run: {run} ({len(df)} records)")
144194
except Exception as e:
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import os
2+
from typing import Optional
3+
from datetime import datetime
4+
from pathlib import Path
5+
6+
def create_experiment_directory(
    base_dir: str, additional_naming: Optional[str] = None
) -> str:
    """Creates a single, timestamped directory for a group of experiment runs.

    Call this once at the start of an experiment script to obtain a unique
    parent folder for all the runs in that batch. The folder name is the
    current timestamp, optionally followed by a descriptive suffix.

    Args:
        base_dir (str): The base directory where experiment folders will be
            stored (e.g., 'notebooks/HFE_ML_experiments').
        additional_naming (Optional[str], optional): A descriptive name to
            append to the timestamp. Defaults to None.

    Returns:
        str: The full path to the created experiment directory.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    if additional_naming:
        folder_name = f"{timestamp}_{additional_naming}"
    else:
        folder_name = timestamp
    experiment_dir = Path(base_dir) / folder_name
    # parents/exist_ok make repeated calls into the same base_dir safe.
    experiment_dir.mkdir(parents=True, exist_ok=True)
    print(f"Experiment directory created: {experiment_dir}")
    return str(experiment_dir)

0 commit comments

Comments
 (0)