Refactor data pipeline for robustness and fix parameter validation

SamoraHunter · SamoraHunter · commit ae2a661a1a7b · 2026-02-28T19:07:38.000Z
ml_grid/pipeline/data.py:
Add checks for pd.DataFrame before applying DataFrame-specific operations (cleaning, scaling, feature selection, embeddings) to support non-DataFrame inputs (e.g., Time Series).
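Improve _assert_index_alignment to handle non-pandas objects: when either argument lacks an .index attribute, compare lengths instead of indices. Reuse this helper for the final alignment check in _finalize_pipeline.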
Safely call reset_index only if the method exists on the object.
ml_grid/pipeline/data_train_test_split.py:
Handle numpy array inputs: convert y to a pandas Series and 2-D X to a DataFrame (so column access works if resampling is used); X with more than two dimensions (e.g. time series) stays a numpy array.
Add check for DataFrame type before attempting to move samples in single-class fallback logic.
ml_grid/pipeline/grid_search_cross_validate.py:
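Add automatic configuration of XLA_FLAGS for CUDA to resolve libdevice errors: when XLA_FLAGS is unset and the pip-installed nvidia/cuda_nvcc directory exists under site-packages, set --xla_gpu_cuda_data_dir to that directory.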
Ensure FLAMLClassifierWrapper and AutoKerasClassifierWrapper receive DataFrames and run single-threaded during final CV.
Remove logic that skipped final CV in test mode.
ml_grid/util/validate_parameters.py:
Update validation functions (validate_knn_parameters, validate_XGB_parameters) to handle lists of parameter dictionaries (Grid Search format) via recursion. Make validate_XGB_parameters return the parameters unchanged when max_bin is absent.
Update type hints to support lists of dictionaries.
ml_grid/pipeline/test_data_pipeline.py:
Enable test_mode in test setup.
diff --git a/ml_grid/pipeline/data.py b/ml_grid/pipeline/data.py
@@ -23,13 +23,12 @@
 from ml_grid.pipeline.embeddings import create_embedding_pipeline
 from ml_grid.util.global_params import global_parameters
 from ml_grid.util.logger_setup import setup_logger
-
-warnings.filterwarnings("ignore", category=ConvergenceWarning)
-warnings.filterwarnings("ignore", category=UserWarning)
 from sklearn.preprocessing import (
     StandardScaler,
 )  # Added explicit import for StandardScaler
 
+warnings.filterwarnings("ignore", category=ConvergenceWarning)
+warnings.filterwarnings("ignore", category=UserWarning)
 warnings.filterwarnings("ignore", category=FutureWarning)
 
 
@@ -135,10 +134,20 @@ def _log_feature_transformation(
                 }
             )
 
-    def _assert_index_alignment(
-        self, df1: pd.DataFrame, df2: pd.Series, step_name: str
-    ):
+    def _assert_index_alignment(self, df1: Any, df2: Any, step_name: str):
         """Helper function to assert that DataFrame and Series indices are equal."""
+        # Handle objects without .index (e.g. numpy arrays in time series mode)
+        if not hasattr(df1, "index") or not hasattr(df2, "index"):
+            if len(df1) != len(df2):
+                self.logger.error(
+                    f"Length mismatch at {step_name}: {len(df1)} vs {len(df2)}"
+                )
+                raise AssertionError(f"Length mismatch at {step_name}")
+            self.logger.debug(
+                f"Length alignment PASSED at: {step_name} (non-pandas objects)"
+            )
+            return
+
         try:
             assert_index_equal(df1.index, df2.index)
             self.logger.debug(f"Index alignment PASSED at: {step_name}")
@@ -499,18 +508,28 @@ def _split_data(self):
         # --- CRITICAL FIX: Reset all indices immediately after splitting ---
         # This ensures all downstream processing (constant removal, feature selection, embedding)
         # operates on data with clean, aligned, 0-based integer indices.
-        self.X_train.reset_index(drop=True, inplace=True)
-        self.y_train.reset_index(drop=True, inplace=True)
-        self.X_test.reset_index(drop=True, inplace=True)
-        self.y_test.reset_index(drop=True, inplace=True)
-        self.X_test_orig.reset_index(drop=True, inplace=True)
-        self.y_test_orig.reset_index(drop=True, inplace=True)
+        if hasattr(self.X_train, "reset_index"):
+            self.X_train.reset_index(drop=True, inplace=True)
+            self.X_test.reset_index(drop=True, inplace=True)
+            self.X_test_orig.reset_index(drop=True, inplace=True)
+
+        if hasattr(self.y_train, "reset_index"):
+            self.y_train.reset_index(drop=True, inplace=True)
+            self.y_test.reset_index(drop=True, inplace=True)
+            self.y_test_orig.reset_index(drop=True, inplace=True)
+
         self._assert_index_alignment(
             self.X_train, self.y_train, "After master reset_index"
         )
 
     def _post_split_cleaning(self):
         """Applies cleaning steps post-split to prevent data leakage."""
+        if not isinstance(self.X_train, pd.DataFrame):
+            self.logger.info(
+                "Skipping post-split cleaning (not a DataFrame, likely Time Series mode)."
+            )
+            return
+
         # Clean column names *before* dropping operations to ensure stable column order.
         cleanup = clean_up_class()
         cleanup.screen_non_float_types(self.X_train)
@@ -608,6 +627,12 @@ def _post_split_cleaning(self):
 
     def _scale_features(self):
         """Applies standard scaling to the feature sets."""
+        if not isinstance(self.X_train, pd.DataFrame):
+            self.logger.info(
+                "Skipping scaling (not a DataFrame, likely Time Series mode)."
+            )
+            return
+
         features_before = self.X_train.shape[1]
         scale = self.local_param_dict.get("scale")
         if scale:
@@ -650,6 +675,12 @@ def _scale_features(self):
 
     def _select_features_by_importance(self):
         """Selects features based on importance scores if configured."""
+        if not isinstance(self.X_train, pd.DataFrame):
+            self.logger.info(
+                "Skipping feature selection (not a DataFrame, likely Time Series mode)."
+            )
+            return
+
         target_n_features = self.local_param_dict.get("feature_n")
 
         if target_n_features is not None and target_n_features < 100:
@@ -752,6 +783,12 @@ def _select_features_by_importance(self):
 
     def _apply_embeddings(self):
         """Applies feature embedding/dimensionality reduction if configured."""
+        if not isinstance(self.X_train, pd.DataFrame):
+            self.logger.info(
+                "Skipping embeddings (not a DataFrame, likely Time Series mode)."
+            )
+            return
+
         if self.local_param_dict.get("use_embedding", False):
             features_before = self.X_train.shape[1]
             self.logger.info("Applying embeddings...")
@@ -925,16 +962,9 @@ def _finalize_pipeline(self):
         # Final definitive assertion before exiting the data pipeline.
         # This ensures that the X_train and y_train that will be passed to the
         # model training steps are perfectly aligned.
-        try:
-            assert_index_equal(self.X_train.index, self.y_train.index)
-            self.logger.info(
-                "Final data alignment check PASSED. X_train and y_train indices are identical."
-            )
-        except AssertionError:
-            self.logger.error(
-                "CRITICAL: Final data alignment check FAILED. X_train and y_train indices are NOT identical."
-            )
-            raise
+        self._assert_index_alignment(
+            self.X_train, self.y_train, "Final data alignment check"
+        )
 
     def _compile_and_log_feature_transformations(self, error_occurred: bool = False):
         """Compiles the feature transformation log and displays it."""
diff --git a/ml_grid/pipeline/data_train_test_split.py b/ml_grid/pipeline/data_train_test_split.py
@@ -42,6 +42,15 @@ def get_data_split(
     random.seed(1234)
     np.random.seed(1234)
 
+    # --- Handle Numpy Inputs (e.g. from Time Series mode) ---
+    if isinstance(y, np.ndarray):
+        y = pd.Series(y)
+
+    # Ensure X is a pandas DataFrame if it's 2D, to support column access if resampling is used.
+    # If X is >2D (e.g. time series), it stays as numpy array.
+    if isinstance(X, np.ndarray) and X.ndim == 2:
+        X = pd.DataFrame(X)
+
     # Check if data is valid
     if not is_valid_shape(X):
         local_param_dict["resample"] = None
@@ -138,7 +147,11 @@ def get_data_split(
     # --- Fallback for single-class training set ---
     # If the random split resulted in a training set with only 1 class (but we had 2+ available),
     # we attempt to move a sample from the test set to the training set to prevent model failure.
-    if y_train.nunique() < 2 and y_train_processed.nunique() >= 2:
+    if (
+        y_train.nunique() < 2
+        and y_train_processed.nunique() >= 2
+        and isinstance(X_train, pd.DataFrame)
+    ):
         logger.warning(
             "y_train contains only 1 class after split. Attempting to move a sample from X_test to X_train to ensure class presence."
         )
diff --git a/ml_grid/pipeline/grid_search_cross_validate.py b/ml_grid/pipeline/grid_search_cross_validate.py
@@ -3,6 +3,8 @@
 import multiprocessing
 import joblib
 import warnings
+import os
+import sys
 from typing import Any, Dict, List, Optional, Union
 
 import numpy as np
@@ -15,7 +17,9 @@
 from sklearn import metrics
 from pandas.testing import assert_index_equal
 from xgboost.core import XGBoostError
+from ml_grid.model_classes.AutoKerasClassifierWrapper import AutoKerasClassifierWrapper
 from ml_grid.model_classes.H2OAutoMLClassifier import H2OAutoMLClassifier
+from ml_grid.model_classes.FLAMLClassifierWrapper import FLAMLClassifierWrapper
 from ml_grid.model_classes.H2OGBMClassifier import H2OGBMClassifier
 from ml_grid.model_classes.H2ODRFClassifier import H2ODRFClassifier
 from ml_grid.model_classes.H2OGAMClassifier import H2OGAMClassifier
@@ -158,6 +162,34 @@ def __init__(
             # One-time TF/GPU Setup
             if is_gpu_model and not _TF_INITIALIZED:
                 try:
+                    # --- FIX for libdevice error ---
+                    # Set XLA_FLAGS to point to the CUDA toolkit installed by pip.
+                    # This is crucial for XLA to find the libdevice library for GPU compilation.
+                    if "XLA_FLAGS" not in os.environ:
+                        # Find site-packages directory
+                        site_packages_path = next(
+                            (p for p in sys.path if "site-packages" in p), None
+                        )
+                        if site_packages_path:
+                            # The 'nvidia-cuda-nvcc-cu12' package installs the compiler toolkit here.
+                            # XLA needs this path to find the 'nvvm/libdevice' directory.
+                            cuda_path = os.path.join(
+                                site_packages_path, "nvidia", "cuda_nvcc"
+                            )
+
+                            if os.path.exists(cuda_path):
+                                self.logger.info(
+                                    f"Found CUDA compiler toolkit at {cuda_path}. Setting XLA_FLAGS."
+                                )
+                                os.environ["XLA_FLAGS"] = (
+                                    f"--xla_gpu_cuda_data_dir={cuda_path}"
+                                )
+                            else:
+                                self.logger.warning(
+                                    "Could not find 'nvidia/cuda_nvcc' directory. Falling back to site-packages root. "
+                                    "Install 'nvidia-cuda-nvcc-cu12' for a reliable setup."
+                                )
+
                     gpu_devices = tf.config.experimental.list_physical_devices("GPU")
                     if gpu_devices:
                         for device in gpu_devices:
@@ -523,7 +555,10 @@ def __init__(
                 # Convert y to numpy for ALL models
                 y_train_search = self._optimize_y(y_train_reset)
 
-                if not is_h2o_model:
+                # Pass DataFrame to H2O and FLAML, which need column info.
+                # Other models get a numpy array for performance.
+                is_flaml_model = isinstance(current_algorithm, FLAMLClassifierWrapper)
+                if not is_h2o_model and not is_flaml_model:
                     X_train_search = X_train_reset.values
                 else:
                     X_train_search = X_train_reset
@@ -579,16 +614,6 @@ def __init__(
             # Restore the original grid_n_jobs setting
             self.global_parameters.grid_n_jobs = original_grid_n_jobs
 
-        # Skip final CV in test mode
-        if not failed and getattr(self.global_parameters, "test_mode", False):
-            self.logger.info(
-                "Test mode enabled. Skipping final cross-validation for speed."
-            )
-            self.grid_search_cross_validate_score_result = 0.5  # Return a valid float
-            # Final cleanup for H2O models
-            self._shutdown_h2o_if_needed(current_algorithm)
-            return
-
         if not failed and self.global_parameters.verbose >= 3:
             self.logger.debug("Fitting final model")
 
@@ -612,19 +637,26 @@ def __init__(
 
         is_h2o_model = isinstance(current_algorithm, H2O_MODEL_TYPES)
         is_keras_model = isinstance(current_algorithm, keras_model_types)
+        is_flaml_model = isinstance(current_algorithm, FLAMLClassifierWrapper)
+        is_autokeras_model = isinstance(current_algorithm, AutoKerasClassifierWrapper)
 
         # H2O and Keras models require single-threaded execution for CV
-        final_cv_n_jobs = 1 if is_h2o_model or is_keras_model else grid_n_jobs
+        final_cv_n_jobs = (
+            1
+            if is_h2o_model or is_keras_model or is_flaml_model or is_autokeras_model
+            else grid_n_jobs
+        )
         if final_cv_n_jobs == 1:
             self.logger.debug(
-                "H2O or Keras model detected. Forcing n_jobs=1 for final cross-validation."
+                "H2O, Keras, FLAML, or AutoKeras model detected. Forcing n_jobs=1 for final cross-validation."
             )
 
         try:
             if failed:
                 raise TimeoutError
 
-            if isinstance(current_algorithm, H2O_MODEL_TYPES):
+            # H2O, FLAML and AutoKeras require pandas DataFrame to handle categorical features correctly.
+            if is_h2o_model or is_flaml_model or is_autokeras_model:
                 X_train_final = self.X_train  # Pass DataFrame directly
                 y_train_final = self._optimize_y(self.y_train)
             else:
diff --git a/ml_grid/pipeline/test_data_pipeline.py b/ml_grid/pipeline/test_data_pipeline.py
@@ -44,6 +44,7 @@ def setUp(self):
         global_parameters.verbose = 0  # Keep test output clean
         global_parameters.error_raise = True
         global_parameters.bayessearch = False  # Explicitly set search mode
+        global_parameters.test_mode = True  # Enable fast test mode
 
         # Define a base configuration for the pipeline
         self.base_local_param_dict = {
diff --git a/ml_grid/util/validate_parameters.py b/ml_grid/util/validate_parameters.py
@@ -1,7 +1,7 @@
 """Functions to validate model-specific hyperparameters before grid search."""
 
 import logging
-from typing import Any, Dict
+from typing import Any, Dict, List, Union
 
 from sklearn.neighbors import KNeighborsClassifier
 from xgboost import XGBClassifier
@@ -11,23 +11,28 @@
 
 
 def validate_knn_parameters(
-    parameters: Dict[str, Any], ml_grid_object: Any
-) -> Dict[str, Any]:
+    parameters: Union[Dict[str, Any], List[Dict[str, Any]]], ml_grid_object: Any
+) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
     """Validates the `n_neighbors` parameter for KNN classifiers.
 
     This function ensures that the values for `n_neighbors` do not exceed the
     number of samples in the training data. If a value is too large, it is
     capped at `n_samples - 1`.
 
     Args:
-        parameters (Dict[str, Any]): The dictionary of parameters to validate.
+        parameters (Union[Dict[str, Any], List[Dict[str, Any]]]): The dictionary or list of dictionaries of parameters to validate.
         ml_grid_object (Any): The main pipeline object containing the training
             data (`X_train`).
 
     Returns:
-        Dict[str, Any]: The validated parameters dictionary.
+        Union[Dict[str, Any], List[Dict[str, Any]]]: The validated parameters.
     """
 
+    if isinstance(parameters, list):
+        for i in range(len(parameters)):
+            parameters[i] = validate_knn_parameters(parameters[i], ml_grid_object)
+        return parameters
+
     logger = logging.getLogger("ml_grid")
     # Get the number of samples in the training data
     logger.debug("Validating KNN parameters")
@@ -58,23 +63,31 @@ def validate_knn_parameters(
 
 
 def validate_XGB_parameters(
-    parameters: Dict[str, Any], ml_grid_object: Any
-) -> Dict[str, Any]:
+    parameters: Union[Dict[str, Any], List[Dict[str, Any]]], ml_grid_object: Any
+) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
     """Validates the `max_bin` parameter for XGBoost.
 
     This function checks that the max_bin values are greater than or equal to 2,
     and if not, it sets them to 2.
 
     Args:
-        parameters (Dict[str, Any]): The dictionary of parameters to validate.
+        parameters (Union[Dict[str, Any], List[Dict[str, Any]]]): The dictionary or list of dictionaries of parameters to validate.
         ml_grid_object (Any): The main pipeline object (currently unused).
 
     Returns:
-        Dict[str, Any]: The validated parameters dictionary.
+        Union[Dict[str, Any], List[Dict[str, Any]]]: The validated parameters.
     """
 
+    if isinstance(parameters, list):
+        for i in range(len(parameters)):
+            parameters[i] = validate_XGB_parameters(parameters[i], ml_grid_object)
+        return parameters
+
     max_bin_array = parameters.get("max_bin")
 
+    if max_bin_array is None:
+        return parameters
+
     # Iterate over each value in the max_bin array
     for i in range(len(max_bin_array)):
         # Check if the value is less than 2
@@ -89,17 +102,19 @@ def validate_XGB_parameters(
 
 
 def validate_parameters_helper(
-    algorithm_implementation: Any, parameters: Dict[str, Any], ml_grid_object: Any
-) -> Dict[str, Any]:
+    algorithm_implementation: Any,
+    parameters: Union[Dict[str, Any], List[Dict[str, Any]]],
+    ml_grid_object: Any,
+) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
     """Dispatches to the correct parameter validation function based on algorithm type.
 
     Args:
         algorithm_implementation (Any): The scikit-learn estimator instance.
-        parameters (Dict[str, Any]): The dictionary of parameters to validate.
+        parameters (Union[Dict[str, Any], List[Dict[str, Any]]]): The parameters to validate.
         ml_grid_object (Any): The main pipeline object containing training data.
 
     Returns:
-        Dict[str, Any]: The validated parameters dictionary.
+        Union[Dict[str, Any], List[Dict[str, Any]]]: The validated parameters.
     """
 
     if isinstance(algorithm_implementation, KNeighborsClassifier):