Fix feature selection: validate selected columns against X_train before indexing

SamoraHunter · SamoraHunter · commit 3249996f9eef · 2025-12-17T12:23:57.000Z
diff --git a/ml_grid/pipeline/data_feature_importance_methods.py b/ml_grid/pipeline/data_feature_importance_methods.py
@@ -5,15 +5,12 @@
 
 from ml_grid.pipeline.data_feature_methods import feature_methods
 
-# rename this class
-
-
 class feature_importance_methods:
     """A class to handle feature selection using different importance methods."""
 
     def __init__(self) -> None:
         """Initializes the feature_importance_methods class."""
-        pass
+        self.feature_method = "None"
 
     def handle_feature_importance_methods(
         self,
@@ -50,6 +47,7 @@ def handle_feature_importance_methods(
         """
 
         logger = logging.getLogger("ml_grid")
+        
         # Work with copies to avoid modifying the original DataFrames in the calling scope
         X_train_copy = X_train.copy()
         X_test_copy = X_test.copy()
@@ -58,33 +56,51 @@ def handle_feature_importance_methods(
         self.feature_method = ml_grid_object.local_param_dict.get(
             "feature_selection_method"
         )
+        
+        # Default to all features initially
+        features = list(X_train_copy.columns)
 
         if self.feature_method == "anova" or self.feature_method is None:
             logger.info("feature_method ANOVA")
             fm = feature_methods()
-            # The data pipeline now guarantees a clean index, so no reset is needed here.
             features = fm.getNfeaturesANOVAF(
                 n=target_n_features, X_train=X_train_copy, y_train=y_train
             )
 
         elif self.feature_method == "markov_blanket":
             logger.info("feature method Markov")
             fm = feature_methods()
-            # The data pipeline now guarantees a clean index, so no reset is needed here.
             features = fm.getNFeaturesMarkovBlanket(
                 n=target_n_features, X_train=X_train_copy, y_train=y_train
             )
 
         logger.info(f"target_n_features: {target_n_features}")
-        logger.info(f"Selected features: {features}")
+        
+        # --- Column Validation ---
+        # Filter the requested 'features' to ensure they actually exist in the DataFrame.
+        # This handles cases where selectors return indices, 'ColumnX' names, or 
+        # names that were dropped/renamed in previous pipeline steps.
+        
+        valid_features = [f for f in features if f in X_train_copy.columns]
+        
+        if len(valid_features) == 0:
+            logger.warning(
+                f"Feature selection ({self.feature_method}) returned 0 valid features. "
+                f"Requested examples: {features[:5] if features else 'None'}. "
+                "Falling back to ALL original features to prevent crash."
+            )
+            valid_features = list(X_train_copy.columns)
+        elif len(valid_features) < len(features):
+             logger.warning(
+                 f"{len(features) - len(valid_features)} selected features were not found in X_train columns. Dropped invalid keys."
+             )
 
-        # CRITICAL FIX: Apply feature selection to the X_train that was passed in,
-        # which has already been cleaned of post-split constant columns.
-        X_train_out = X_train_copy[features]
+        logger.info(f"Final selected features ({len(valid_features)}): {valid_features}")
 
-        # Apply the same feature selection to the test sets
-        X_test_out = X_test.copy()[features]
-        X_test_orig_out = X_test_orig.copy()[features]
+        # Apply the validated selection 
+        X_train_out = X_train_copy[valid_features]
+        X_test_out = X_test_copy[valid_features]
+        X_test_orig_out = X_test_orig_copy[valid_features]
 
         # The y series do not need to be modified, as they are already aligned.
-        return X_train_out, y_train, X_test_out, y_test, X_test_orig_out
+        return X_train_out, y_train, X_test_out, y_test, X_test_orig_out