Add Python unit tests (pytest tests/) to the CI test hook; minor fixes.

SamoraHunter · SamoraHunter · commit 7bd68ba6695c · 2025-10-08T00:13:16.000+01:00
diff --git a/.github/workflows/notebook-test.yml b/.github/workflows/notebook-test.yml
@@ -83,4 +83,7 @@ jobs:
           set -e
           cd $GITHUB_WORKSPACE
           source "$VENV_PATH/bin/activate"
-          pytest --nbmake --nbmake-timeout=1200 notebooks/unit_test_synthetic.ipynb
+          pytest --nbmake --nbmake-timeout=1200 notebooks/unit_test_synthetic.ipynb
+          
+          echo "Running Python unit tests..."
+          pytest tests/
diff --git a/ml_grid/model_classes/knn_wrapper_class.py b/ml_grid/model_classes/knn_wrapper_class.py
@@ -47,11 +47,17 @@ def __init__(
         self.p = p
         self.metric = metric
         self.metric_params = metric_params
-        self.device = (
-            device if device else ("gpu" if torch.cuda.is_available() else "cpu")
-        )
-        if self.device == "cpu":
-            print("warning using cpu KNNWrapper")
+
+        # Auto-detect device
+        gpu_available = torch.cuda.is_available()
+        if device == "gpu" and not gpu_available:
+            print("Warning: GPU requested for KNNWrapper, but torch.cuda.is_available() is False. Falling back to CPU.")
+            self.device = "cpu"
+        elif device:
+            self.device = device
+        else:
+            self.device = "gpu" if gpu_available else "cpu"
+
         self.model: Optional[KNeighborsClassifier] = None
 
     def fit(
diff --git a/ml_grid/model_classes/svc_class.py b/ml_grid/model_classes/svc_class.py
@@ -40,11 +40,12 @@ def __init__(
                 if self.X is None:
                     raise ValueError("Input data X is None - data not loaded properly")
                     
+                # An empty DataFrame cannot be scaled or used for training,
+                # so fail fast with a clear error instead of failing later in the pipeline.
                 if isinstance(self.X, pd.DataFrame) and self.X.empty:
-                    #raise ValueError("Input data X is an empty DataFrame")
-                    print("warn: SVC data scaling, X data is empty")
+                    raise ValueError("SVC_class received an empty DataFrame. Halting execution.")
 
-                if( self.X.empty == False):
+                elif not self.X.empty:
                     if not hasattr(self, 'scaler'):
                         self.scaler = StandardScaler()  # or whichever scaler you're using
                         
diff --git a/ml_grid/pipeline/column_names.py b/ml_grid/pipeline/column_names.py
@@ -345,6 +345,12 @@ def get_pertubation_columns(
     if local_param_dict.get("data").get("appointments") == True:
         pertubation_columns.extend(appointments_list)
 
+    # Add any other columns explicitly set to True in the data dict
+    explicitly_selected_cols = {col for col, selected in local_param_dict.get("data", {}).items() if selected}
+    for col in explicitly_selected_cols:
+        if col not in pertubation_columns and col in all_df_columns:
+            pertubation_columns.append(col)
+
     print(f"local_param_dict data perturbation: \n {local_param_dict.get('data')}")
 
     if verbose >= 2:
diff --git a/ml_grid/pipeline/data_correlation_matrix.py b/ml_grid/pipeline/data_correlation_matrix.py
@@ -85,7 +85,7 @@ def handle_correlation_matrix(
             # Filter columns with correlation coefficient greater than the threshold
             try:
                 correlated_cols = correlations[col][
-                    (correlations[col] > threshold) & (correlations[col] != 1)
+                    correlations[col].abs() > threshold
                 ].index.tolist()
             except KeyError:
                 print(
@@ -95,201 +95,17 @@ def handle_correlation_matrix(
                 print("Continuing with an empty list of correlated columns")
                 correlated_cols = []
 
+            # Exclude the column itself from the list of correlated columns
+            if col in correlated_cols:
+                correlated_cols.remove(col)
+
             # Add the correlated columns to the list
-            drop_list.extend(
-                [(col, corr_col) for corr_col in correlated_cols]
-            )
+            for corr_col in correlated_cols:
+                # Record only the partner column, not a (col, corr_col) tuple. NOTE(review): both columns of a symmetric pair may still be added - confirm.
+                if col != corr_col:
+                    drop_list.append(corr_col)
 
     # Remove duplicates from the list
     drop_list = list(set(drop_list))
 
     return drop_list
-
-
-# def handle_correlation_matrix(local_param_dict, drop_list, df, chunk_size=50):
-#     """
-#     Calculate correlated columns in chunks.
-
-#     Calculates the correlation coefficient between each column in the input DataFrame
-#     using chunks to avoid memory issues. The correlation threshold is defined by
-#     the 'corr' key in the local_param_dict dictionary.
-
-#     Args:
-#         local_param_dict (dict): Dictionary containing local parameters, including the correlation threshold.
-#         drop_list (list): List to which correlated columns will be appended.
-#         df (pandas.DataFrame): Input DataFrame.
-#         chunk_size (int, optional): Size of each chunk for correlation calculation. Default is 50.
-
-#     Returns:
-#         list: List of correlated columns.
-#     """
-
-#     if chunk_size >= len(df):
-#         chunk_size = len(df) - 1
-#     # Define the correlation threshold
-#     threshold = local_param_dict.get("corr", 0.25)
-
-#     # Remove non-numeric columns
-#     numeric_columns = df.select_dtypes(include=["number"]).columns
-#     df_numeric = df[numeric_columns]
-
-#     # Split columns into chunks
-#     column_chunks = [
-#         df_numeric.columns[i : i + chunk_size]
-#         for i in range(0, len(df_numeric.columns), chunk_size)
-#     ]
-
-#     # Iterate through each column chunk
-#     for chunk in tqdm(column_chunks, desc="Calculating Correlations"):
-#         # Calculate the correlation coefficients for the current chunk
-#         try:
-#             correlations = df_numeric[chunk].corr()
-#         except:
-#             print(
-#                 "Encountered exception while calculating correlations for chunk", chunk
-#             )
-#             print(traceback.format_exc())
-#             continue
-
-#         # Iterate through each column in the chunk
-#         for col in chunk:
-#             # Filter columns with correlation coefficient greater than the threshold
-#             try:
-#                 correlated_cols = correlations[col][
-#                     correlations[col].abs() > threshold
-#                 ].index.tolist()
-#             except KeyError:
-#                 print(
-#                     "Encountered KeyError while calculating correlations for column",
-#                     col,
-#                 )
-#                 print("Continuing with an empty list of correlated columns")
-#                 correlated_cols = []
-#             except AttributeError:
-#                 print(
-#                     "Encountered AttributeError while calculating correlations for column",
-#                     col,
-#                 )
-#                 print("Continuing with an empty list of correlated columns")
-#                 correlated_cols = []
-
-#             # Exclude the current column from the correlated columns list if it's in the list
-#             if col in correlated_cols:
-#                 correlated_cols.remove(col)
-
-#             # Add the correlated columns to the list
-#             drop_list.extend([(col, corr_col) for corr_col in correlated_cols])
-
-#     # Remove duplicates from the list
-#     drop_list = list(set(drop_list))
-
-#     return drop_list
-
-
-# Example usage:
-# input_csv_path = '../concatenated_data_concatenated_output_imputed_f_b_m_collapsed_mean.csv'
-# df = pd.read_csv(input_csv_path)
-# local_param_dict = {'corr': 0.25}  # Example threshold value
-
-# correlated_columns = handle_correlation_matrix(df, local_param_dict)
-# print("Columns with correlation greater than", local_param_dict['corr'])
-# print(correlated_columns)
-
-
-# def handle_correlation_matrix(local_param_dict, drop_list, df):
-#     print("Handling correlation matrix")
-#     temp_col_list = list(df.select_dtypes(include=[float, int]).columns)
-
-#     # Calculate absolute correlation matrix
-#     corr_matrix = df.select_dtypes(include=[float, int]).corr().abs()
-
-#     # Create a True/False mask and apply it
-#     mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
-#     tri_df = corr_matrix.mask(mask)
-
-#     # List column names of highly correlated features (r > local_param_dict['corr'])
-#     corr_to_drop = [
-#         c for c in tri_df.columns if any(tri_df[c] > local_param_dict.get("corr"))
-#     ]
-
-#     print(
-#         f"Identified {len(corr_to_drop)} correlated features to drop at >{local_param_dict.get('corr')}"
-#     )
-#     drop_list.extend(corr_to_drop)
-
-#     return drop_list
-# import pandas as pd
-# import numpy as np
-
-
-# def correlation_coefficient(x, y):
-#     """
-#     Calculate the correlation coefficient between two lists of values.
-
-#     Parameters:
-#         x (list): First list of values.
-#         y (list): Second list of values.
-
-#     Returns:
-#         float: Correlation coefficient between x and y.
-#     """
-#     n = len(x)
-#     sum_x = sum(x)
-#     sum_y = sum(y)
-#     sum_x_sq = sum(xi**2 for xi in x)
-#     sum_y_sq = sum(yi**2 for yi in y)
-#     sum_xy = sum(xi * yi for xi, yi in zip(x, y))
-
-#     numerator = n * sum_xy - sum_x * sum_y
-#     denominator = ((n * sum_x_sq - sum_x**2) * (n * sum_y_sq - sum_y**2)) ** 0.5
-
-#     if denominator == 0:
-#         return 0
-#     else:
-#         return numerator / denominator
-
-
-# def handle_correlation_matrix(local_param_dict, drop_list, df):
-#     print("Handling correlation matrix")
-#     temp_col_list = list(df.select_dtypes(include=[float, int]).columns)
-
-#     # Initialize an empty DataFrame to store correlation coefficients
-#     corr_matrix = pd.DataFrame(index=temp_col_list, columns=temp_col_list)
-
-#     # Calculate correlation coefficients for each pair of columns
-#     for i, col1 in enumerate(temp_col_list):
-#         for j, col2 in enumerate(temp_col_list):
-#             if i != j:
-#                 corr_matrix.loc[col1, col2] = correlation_coefficient(
-#                     df[col1], df[col2]
-#                 )
-
-#     # Convert the DataFrame to absolute values
-#     corr_matrix = corr_matrix.abs()
-
-#     # Create a True/False mask and apply it
-#     mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
-#     tri_df = corr_matrix.mask(mask)
-
-#     # List column names of highly correlated features (r > local_param_dict['corr'])
-#     corr_to_drop = [
-#         c for c in tri_df.columns if any(tri_df[c] > local_param_dict.get("corr"))
-#     ]
-
-#     print(
-#         f"Identified {len(corr_to_drop)} correlated features to drop at >{local_param_dict.get('corr')}"
-#     )
-#     drop_list.extend(corr_to_drop)
-
-#     return drop_list
-
-
-# Example usage:
-# local_param_dict = {'corr': 0.25}  # Example threshold value
-# drop_list = []
-
-# # Assuming df is your DataFrame
-# # Replace df with your actual DataFrame
-# # Call the function to update the drop_list
-# updated_drop_list = handle_correlation_matrix(local_param_dict, drop_list, df)
-# print("Updated drop list:", updated_drop_list)
diff --git a/ml_grid/pipeline/data_feature_methods.py b/ml_grid/pipeline/data_feature_methods.py
@@ -46,6 +46,9 @@ def getNfeaturesANOVAF(
         else:
             raise ValueError("X_train must be a pandas DataFrame or numpy array")
 
+        # Ensure at least one feature is requested
+        n = max(1, n)
+
         # Calculate F-values for all features at once
         f_values, _ = f_classif(X_train, y_train)
 
@@ -153,7 +156,6 @@ def getNFeaturesMarkovBlanket(
             # Re-evaluate the first selected feature to ensure it's a valid name
             first_feature = selected_features[0]
             feature_names = [original_columns[first_feature] if isinstance(first_feature, int) else first_feature]
-        
         return feature_names