Commit 7bd68ba

committed: added tests to hook. minor fixes.
1 parent 8c45750 commit 7bd68ba

6 files changed

Lines changed: 37 additions & 203 deletions


.github/workflows/notebook-test.yml

Lines changed: 4 additions & 1 deletion
@@ -83,4 +83,7 @@ jobs:
           set -e
           cd $GITHUB_WORKSPACE
           source "$VENV_PATH/bin/activate"
-          pytest --nbmake --nbmake-timeout=1200 notebooks/unit_test_synthetic.ipynb
+          pytest --nbmake --nbmake-timeout=1200 notebooks/unit_test_synthetic.ipynb
+
+          echo "Running Python unit tests..."
+          pytest tests/
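The CI step now runs the notebook suite first and then the Python unit tests. A minimal sketch of the kind of test file the new "pytest tests/" step would collect, assuming tests live under tests/; the file name and assertion are placeholders, not taken from this commit:

    # tests/test_smoke.py -- hypothetical placeholder test
    import ml_grid  # assumes the package is importable in the CI virtualenv

    def test_package_imports():
        # Smoke test: the package imports without raising.
        assert ml_grid is not None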

ml_grid/model_classes/knn_wrapper_class.py

Lines changed: 11 additions & 5 deletions
@@ -47,11 +47,17 @@ def __init__(
         self.p = p
         self.metric = metric
         self.metric_params = metric_params
-        self.device = (
-            device if device else ("gpu" if torch.cuda.is_available() else "cpu")
-        )
-        if self.device == "cpu":
-            print("warning using cpu KNNWrapper")
+
+        # Auto-detect device
+        gpu_available = torch.cuda.is_available()
+        if device == "gpu" and not gpu_available:
+            print("Warning: GPU requested for KNNWrapper, but torch.cuda.is_available() is False. Falling back to CPU.")
+            self.device = "cpu"
+        elif device:
+            self.device = device
+        else:
+            self.device = "gpu" if gpu_available else "cpu"

         self.model: Optional[KNeighborsClassifier] = None

     def fit(
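The new resolution order is: honour an explicit request, warn and fall back to CPU when a GPU is requested but unavailable, otherwise auto-detect. A standalone sketch of the same logic; the helper name is invented for illustration:

    import torch

    def resolve_device(requested=None):
        # Mirrors the KNNWrapper logic above.
        gpu_available = torch.cuda.is_available()
        if requested == "gpu" and not gpu_available:
            print("Warning: GPU requested but unavailable; falling back to CPU.")
            return "cpu"
        if requested:
            return requested
        return "gpu" if gpu_available else "cpu"

    # resolve_device("gpu") returns "cpu" on a machine without CUDA;
    # resolve_device() picks "gpu" only when torch.cuda.is_available() is True.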

ml_grid/model_classes/svc_class.py

Lines changed: 4 additions & 3 deletions
@@ -40,11 +40,12 @@ def __init__(
         if self.X is None:
             raise ValueError("Input data X is None - data not loaded properly")

+        # If the dataframe is empty, there's nothing to scale.
+        # The pipeline will likely fail later, but we avoid a scaling error here.
         if isinstance(self.X, pd.DataFrame) and self.X.empty:
-            #raise ValueError("Input data X is an empty DataFrame")
-            print("warn: SVC data scaling, X data is empty")
+            raise ValueError("SVC_class received an empty DataFrame. Halting execution.")

-        if( self.X.empty == False):
+        elif not self.X.empty:
             if not hasattr(self, 'scaler'):
                 self.scaler = StandardScaler()  # or whichever scaler you're using
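The guard now fails fast instead of printing a warning and continuing. A toy demonstration of the same check, assuming only pandas; the wrapper function is hypothetical:

    import pandas as pd

    def check_svc_input(X):
        # Same check as the diff: an empty DataFrame now raises.
        if isinstance(X, pd.DataFrame) and X.empty:
            raise ValueError("SVC_class received an empty DataFrame. Halting execution.")

    check_svc_input(pd.DataFrame({"a": [1, 2]}))  # passes
    # check_svc_input(pd.DataFrame())             # raises ValueError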

ml_grid/pipeline/column_names.py

Lines changed: 6 additions & 0 deletions
@@ -345,6 +345,12 @@ def get_pertubation_columns(
         if local_param_dict.get("data").get("appointments") == True:
             pertubation_columns.extend(appointments_list)

+        # Add any other columns explicitly set to True in the data dict
+        explicitly_selected_cols = {col for col, selected in local_param_dict.get("data", {}).items() if selected}
+        for col in explicitly_selected_cols:
+            if col not in pertubation_columns and col in all_df_columns:
+                pertubation_columns.append(col)
+
         print(f"local_param_dict data perturbation: \n {local_param_dict.get('data')}")

         if verbose >= 2:
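A toy run of the new selection rule, using the same comprehension as the diff with invented data values:

    local_param_dict = {"data": {"appointments": True, "bloods": True, "demo": False}}
    all_df_columns = ["appointments", "bloods", "age"]
    pertubation_columns = ["appointments"]  # already added by the flag-specific branches

    explicitly_selected_cols = {col for col, selected in local_param_dict.get("data", {}).items() if selected}
    for col in explicitly_selected_cols:
        if col not in pertubation_columns and col in all_df_columns:
            pertubation_columns.append(col)

    print(pertubation_columns)  # ['appointments', 'bloods'] -- 'demo' is False and excluded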

ml_grid/pipeline/data_correlation_matrix.py

Lines changed: 9 additions & 193 deletions
@@ -85,7 +85,7 @@ def handle_correlation_matrix(
             # Filter columns with correlation coefficient greater than the threshold
             try:
                 correlated_cols = correlations[col][
-                    (correlations[col] > threshold) & (correlations[col] != 1)
+                    correlations[col].abs() > threshold
                 ].index.tolist()
             except KeyError:
                 print(
@@ -95,201 +95,17 @@ def handle_correlation_matrix(
                 print("Continuing with an empty list of correlated columns")
                 correlated_cols = []

+            # Exclude the column itself from the list of correlated columns
+            if col in correlated_cols:
+                correlated_cols.remove(col)
+
             # Add the correlated columns to the list
-            drop_list.extend(
-                [(col, corr_col) for corr_col in correlated_cols]
-            )
+            for corr_col in correlated_cols:
+                # Add only the second column of the pair to avoid dropping both
+                if col != corr_col:
+                    drop_list.append(corr_col)

     # Remove duplicates from the list
     drop_list = list(set(drop_list))

     return drop_list
-
-
-# def handle_correlation_matrix(local_param_dict, drop_list, df, chunk_size=50):
-#     """
-#     Calculate correlated columns in chunks.
-
-#     Calculates the correlation coefficient between each column in the input DataFrame
-#     using chunks to avoid memory issues. The correlation threshold is defined by
-#     the 'corr' key in the local_param_dict dictionary.
-
-#     Args:
-#         local_param_dict (dict): Dictionary containing local parameters, including the correlation threshold.
-#         drop_list (list): List to which correlated columns will be appended.
-#         df (pandas.DataFrame): Input DataFrame.
-#         chunk_size (int, optional): Size of each chunk for correlation calculation. Default is 50.
-
-#     Returns:
-#         list: List of correlated columns.
-#     """
-
-#     if chunk_size >= len(df):
-#         chunk_size = len(df) - 1
-#     # Define the correlation threshold
-#     threshold = local_param_dict.get("corr", 0.25)
-
-#     # Remove non-numeric columns
-#     numeric_columns = df.select_dtypes(include=["number"]).columns
-#     df_numeric = df[numeric_columns]
-
-#     # Split columns into chunks
-#     column_chunks = [
-#         df_numeric.columns[i : i + chunk_size]
-#         for i in range(0, len(df_numeric.columns), chunk_size)
-#     ]
-
-#     # Iterate through each column chunk
-#     for chunk in tqdm(column_chunks, desc="Calculating Correlations"):
-#         # Calculate the correlation coefficients for the current chunk
-#         try:
-#             correlations = df_numeric[chunk].corr()
-#         except:
-#             print(
-#                 "Encountered exception while calculating correlations for chunk", chunk
-#             )
-#             print(traceback.format_exc())
-#             continue
-
-#         # Iterate through each column in the chunk
-#         for col in chunk:
-#             # Filter columns with correlation coefficient greater than the threshold
-#             try:
-#                 correlated_cols = correlations[col][
-#                     correlations[col].abs() > threshold
-#                 ].index.tolist()
-#             except KeyError:
-#                 print(
-#                     "Encountered KeyError while calculating correlations for column",
-#                     col,
-#                 )
-#                 print("Continuing with an empty list of correlated columns")
-#                 correlated_cols = []
-#             except AttributeError:
-#                 print(
-#                     "Encountered AttributeError while calculating correlations for column",
-#                     col,
-#                 )
-#                 print("Continuing with an empty list of correlated columns")
-#                 correlated_cols = []
-
-#             # Exclude the current column from the correlated columns list if it's in the list
-#             if col in correlated_cols:
-#                 correlated_cols.remove(col)
-
-#             # Add the correlated columns to the list
-#             drop_list.extend([(col, corr_col) for corr_col in correlated_cols])
-
-#     # Remove duplicates from the list
-#     drop_list = list(set(drop_list))
-
-#     return drop_list
-
-
-# Example usage:
-# input_csv_path = '../concatenated_data_concatenated_output_imputed_f_b_m_collapsed_mean.csv'
-# df = pd.read_csv(input_csv_path)
-# local_param_dict = {'corr': 0.25} # Example threshold value
-
-# correlated_columns = handle_correlation_matrix(df, local_param_dict)
-# print("Columns with correlation greater than", local_param_dict['corr'])
-# print(correlated_columns)
-
-
-# def handle_correlation_matrix(local_param_dict, drop_list, df):
-#     print("Handling correlation matrix")
-#     temp_col_list = list(df.select_dtypes(include=[float, int]).columns)
-
-#     # Calculate absolute correlation matrix
-#     corr_matrix = df.select_dtypes(include=[float, int]).corr().abs()
-
-#     # Create a True/False mask and apply it
-#     mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
-#     tri_df = corr_matrix.mask(mask)
-
-#     # List column names of highly correlated features (r > local_param_dict['corr'])
-#     corr_to_drop = [
-#         c for c in tri_df.columns if any(tri_df[c] > local_param_dict.get("corr"))
-#     ]
-
-#     print(
-#         f"Identified {len(corr_to_drop)} correlated features to drop at >{local_param_dict.get('corr')}"
-#     )
-#     drop_list.extend(corr_to_drop)
-
-#     return drop_list
-# import pandas as pd
-# import numpy as np
-
-
-# def correlation_coefficient(x, y):
-#     """
-#     Calculate the correlation coefficient between two lists of values.
-
-#     Parameters:
-#     x (list): First list of values.
-#     y (list): Second list of values.
-
-#     Returns:
-#     float: Correlation coefficient between x and y.
-#     """
-#     n = len(x)
-#     sum_x = sum(x)
-#     sum_y = sum(y)
-#     sum_x_sq = sum(xi**2 for xi in x)
-#     sum_y_sq = sum(yi**2 for yi in y)
-#     sum_xy = sum(xi * yi for xi, yi in zip(x, y))
-
-#     numerator = n * sum_xy - sum_x * sum_y
-#     denominator = ((n * sum_x_sq - sum_x**2) * (n * sum_y_sq - sum_y**2)) ** 0.5
-
-#     if denominator == 0:
-#         return 0
-#     else:
-#         return numerator / denominator
-
-
-# def handle_correlation_matrix(local_param_dict, drop_list, df):
-#     print("Handling correlation matrix")
-#     temp_col_list = list(df.select_dtypes(include=[float, int]).columns)
-
-#     # Initialize an empty DataFrame to store correlation coefficients
-#     corr_matrix = pd.DataFrame(index=temp_col_list, columns=temp_col_list)
-
-#     # Calculate correlation coefficients for each pair of columns
-#     for i, col1 in enumerate(temp_col_list):
-#         for j, col2 in enumerate(temp_col_list):
-#             if i != j:
-#                 corr_matrix.loc[col1, col2] = correlation_coefficient(
-#                     df[col1], df[col2]
-#                 )
-
-#     # Convert the DataFrame to absolute values
-#     corr_matrix = corr_matrix.abs()
-
-#     # Create a True/False mask and apply it
-#     mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
-#     tri_df = corr_matrix.mask(mask)
-
-#     # List column names of highly correlated features (r > local_param_dict['corr'])
-#     corr_to_drop = [
-#         c for c in tri_df.columns if any(tri_df[c] > local_param_dict.get("corr"))
-#     ]
-
-#     print(
-#         f"Identified {len(corr_to_drop)} correlated features to drop at >{local_param_dict.get('corr')}"
-#     )
-#     drop_list.extend(corr_to_drop)
-
-#     return drop_list
-
-
-# Example usage:
-# local_param_dict = {'corr': 0.25} # Example threshold value
-# drop_list = []
-
-# # Assuming df is your DataFrame
-# # Replace df with your actual DataFrame
-# # Call the function to update the drop_list
-# updated_drop_list = handle_correlation_matrix(local_param_dict, drop_list, df)
-# print("Updated drop list:", updated_drop_list)

ml_grid/pipeline/data_feature_methods.py

Lines changed: 3 additions & 1 deletion
@@ -46,6 +46,9 @@ def getNfeaturesANOVAF(
     else:
         raise ValueError("X_train must be a pandas DataFrame or numpy array")

+    # Ensure at least one feature is requested
+    n = max(1, n)
+
     # Calculate F-values for all features at once
     f_values, _ = f_classif(X_train, y_train)

@@ -153,7 +156,6 @@ def getNFeaturesMarkovBlanket(
     # Re-evaluate the first selected feature to ensure it's a valid name
     first_feature = selected_features[0]
     feature_names = [original_columns[first_feature] if isinstance(first_feature, int) else first_feature]
-
     return feature_names

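A minimal sketch of the ANOVA-F selection path with the new clamp, using scikit-learn's f_classif on random toy data; the surrounding getNfeaturesANOVAF function is abbreviated away:

    import numpy as np
    from sklearn.feature_selection import f_classif

    X_train = np.random.rand(20, 5)
    y_train = np.random.randint(0, 2, size=20)

    n = 0          # a caller could pass 0 or a negative value
    n = max(1, n)  # the new guard: always request at least one feature

    f_values, _ = f_classif(X_train, y_train)
    top_n_indices = np.argsort(f_values)[-n:]  # indices of the n largest F-values
    print(top_n_indices)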
