11import re
22import random
3+ import numpy as np
34from typing import Any , Dict , List , Optional
45import warnings
56
1213from ml_grid .pipeline .data_clean_up import clean_up_class
1314from ml_grid .pipeline .data_constant_columns import remove_constant_columns , remove_constant_columns_with_debug
1415from ml_grid .pipeline .data_correlation_matrix import handle_correlation_matrix
16+ from ml_grid .pipeline .embeddings import create_embedding_pipeline , apply_embedding
1517from ml_grid .pipeline .data_feature_importance_methods import feature_importance_methods
1618from ml_grid .pipeline .data_outcome_list import handle_outcome_list
1719from ml_grid .pipeline .data_percent_missing import handle_percent_missing
@@ -248,42 +250,45 @@ def __init__(
248250 for col in self .pertubation_columns
249251 if (col not in self .drop_list and col in self .df .columns )
250252 ]
251- # Add safety mechanism to retain minimum features
252- min_required_features = 5 # Set your minimum threshold
253- core_protected_columns = ['age' , 'male' , 'client_idcode' ] # Columns to protect
254253
254+ # Safety net: If all features are pruned, retain a minimum set
255255 if not self .final_column_list :
256- print ("WARNING : All features pruned! Activating safety retention.. ." )
256+ print ("Warning : All features were pruned. Activating safety retention mechanism ." )
257257
258- # Try to keep protected columns first
259- safety_columns = [col for col in core_protected_columns
260- if col in self . df . columns and col in self . pertubation_columns ]
258+ # Define core columns to try and protect
259+ core_protected_columns = ['age' , 'male' , 'client_idcode' ]
260+ min_features = 2
261261
262- # If no protected columns, use first available columns
263- if not safety_columns :
264- safety_columns = [col for col in self .pertubation_columns
265- if col in self .df .columns ][:min_required_features ]
262+ # 1. Try to retain core protected columns
263+ retained_cols = [
264+ col for col in core_protected_columns
265+ if col in self .pertubation_columns and col in self .df .columns
266+ ]
266267
267- # Update final columns and drop list
268- self .final_column_list = safety_columns
269- # Also update the main drop list to prevent re-pruning
270- self .drop_list = [col for col in self .drop_list if col not in self .final_column_list ]
268+ # 2. If no core columns, try to retain any of the original perturbed columns
269+ if not retained_cols :
270+ retained_cols = [
271+ col for col in self .pertubation_columns if col in self .df .columns
272+ ][:min_features ]
273+
274+ # 3. As a last resort, pick random columns from the original features
275+ if not retained_cols :
276+ print ("Last resort: Selecting random features." )
277+ available_features = [
278+ col for col in self .orignal_feature_names
279+ if col != self .outcome_variable and col in self .df .columns
280+ ]
281+ if len (available_features ) >= min_features :
282+ retained_cols = random .sample (available_features , min_features )
283+ elif available_features :
284+ retained_cols = available_features
271285
272- print (f"Retaining minimum features: { self .final_column_list } " )
273-
274- # Re-filter final_column_list to be absolutely sure
275- self .final_column_list = [col for col in self .pertubation_columns if col not in self .drop_list and col in self .df .columns ]
276-
286+ self .final_column_list = retained_cols
287+ print (f"Retained minimum features: { self .final_column_list } " )
277288
278- # Add two random features if list still empty
279- if not self .final_column_list :
280- print ("Warning no feature columns retained, selecting two at random" )
281- self .final_column_list .append (random .choice (self .orignal_feature_names ))
282- self .final_column_list .append (random .choice (self .orignal_feature_names ))
283-
284- # Ensure we still have at least 1 feature
289+ # Final check to ensure we have at least one feature
285290 if not self .final_column_list :
286- raise ValueError ("CRITICAL: Unable to retain any features despite safety measures" )
291+ raise ValueError ("CRITICAL: Unable to retain any features despite safety measures. Halting pipeline. " )
287292
288293 if not self .final_column_list :
289294 raise ValueError ("All features pruned. No columns remaining in final_column_list." )
@@ -362,7 +367,7 @@ def __init__(
362367 print (self .X .shape )
363368
364369 self .X , self .y = convert_Xy_to_time_series (self .X , self .y , max_seq_length )
365- if self .verbose >= 1 :
370+ if not self .final_column_list :
366371 print (self .X .shape )
367372
368373 (
@@ -381,19 +386,30 @@ def __init__(
381386 self .X_test_orig ,
382387 verbosity = self .verbose
383388 )
389+ if self .verbose >= 1 :
390+ print (f"Shape of X_train after removing constant columns post-split: { self .X_train .shape } " )
384391
385- target_n_features = self .local_param_dict .get ("feature_n" )
392+
393+ # Add a safeguard to ensure features remain after removing constant columns
394+ if self .X_train .shape [1 ] == 0 :
395+ raise ValueError ("All feature columns were removed after data splitting because they were constant in the training set. Consider adjusting feature selection or data cleaning parameters." )
386396
387- if target_n_features != 100 :
388397
398+ target_n_features = self .local_param_dict .get ("feature_n" )
399+
400+ if target_n_features is not None and target_n_features < 100 :
389401 target_n_features_eval = int (
390402 (target_n_features / 100 ) * self .X_train .shape [1 ]
391403 )
392-
393404 # Ensure at least one feature is selected. The previous logic here
394405 # was incorrect and disabled feature selection entirely.
395406 target_n_features_eval = max (1 , target_n_features_eval )
396407
408+ if target_n_features is not None and target_n_features < 100 and self .X_train .shape [1 ] > 1 and not self .local_param_dict .get ("use_embedding" , False ) and target_n_features_eval < self .X_train .shape [1 ]:
409+
410+ if self .verbose >= 1 :
411+ print (f"Shape of X_train before feature importance selection: { self .X_train .shape } " )
412+
397413 print (
398414 f"Pre target_n_features { target_n_features } % reduction { target_n_features_eval } /{ self .X_train .shape [1 ]} "
399415 )
@@ -410,12 +426,77 @@ def __init__(
410426 ml_grid_object = self
411427 )
412428 )
429+ if self .verbose >= 1 :
430+ print (f"Shape of X_train after feature importance selection: { self .X_train .shape } " )
431+
413432 if self .X_train .shape [1 ] == 0 :
414433 raise ValueError ("Feature importance selection removed all features." )
415434
435+ # Safeguard: Ensure X_train is not empty after feature selection
436+ if self .X_train .shape [1 ] == 0 :
437+ raise ValueError ("All features were removed by the feature importance selection method. X_train is empty." )
438+
416439 except Exception as e :
417440 print ("failed target_n_features" , e )
418441
442+ # Apply embeddings if configured (after feature selection)
443+ if self .local_param_dict .get ("use_embedding" , False ):
444+ if self .verbose >= 1 :
445+ print ("Applying embeddings..." )
446+
447+ embedding_method = self .local_param_dict .get ("embedding_method" , "pca" )
448+ embedding_dim = self .local_param_dict .get ("embedding_dim" , 64 )
449+ scale_before_embedding = self .local_param_dict .get ("scale_features_before_embedding" , True )
450+
451+ if self .verbose >= 2 :
452+ print (f" Embedding Method: { embedding_method } " )
453+ print (f" Original features: { self .X_train .shape [1 ]} " )
454+ print (f" Target embedding dimensions: { embedding_dim } " )
455+ print (f" Scale before embedding: { scale_before_embedding } " )
456+
457+ if self .X_train .shape [1 ] > 1 and embedding_dim >= self .X_train .shape [1 ]:
458+ embedding_dim = self .X_train .shape [1 ] - 1
459+ if self .verbose >= 1 :
460+ print (f" Warning: embedding_dim >= n_features. Adjusting to { embedding_dim } " )
461+
462+ embedding_pipeline = create_embedding_pipeline (
463+ method = embedding_method ,
464+ n_components = embedding_dim ,
465+ scale = scale_before_embedding ,
466+ )
467+
468+ # Fit on train and transform all splits
469+ # Check if the method is supervised to pass y_train
470+ from sklearn .discriminant_analysis import LinearDiscriminantAnalysis
471+ from sklearn .feature_selection import SelectKBest
472+ from ml_grid .pipeline .embeddings import get_explained_variance
473+
474+ embed_step = embedding_pipeline .named_steps ['embed' ]
475+ if isinstance (embed_step , (LinearDiscriminantAnalysis , SelectKBest )):
476+ if self .verbose >= 2 :
477+ print (" Supervised embedding method detected, passing y_train." )
478+ self .X_train = pd .DataFrame (embedding_pipeline .fit_transform (self .X_train , self .y_train ), index = self .X_train .index , columns = [f"embed_{ i } " for i in range (embedding_dim )])
479+ else :
480+ self .X_train = pd .DataFrame (embedding_pipeline .fit_transform (self .X_train ), index = self .X_train .index , columns = [f"embed_{ i } " for i in range (embedding_dim )])
481+
482+ self .X_test = pd .DataFrame (embedding_pipeline .transform (self .X_test ), index = self .X_test .index , columns = [f"embed_{ i } " for i in range (embedding_dim )])
483+ self .X_test_orig = pd .DataFrame (embedding_pipeline .transform (self .X_test_orig ), index = self .X_test_orig .index , columns = [f"embed_{ i } " for i in range (embedding_dim )])
484+
485+ # The main self.X should also be updated for consistency, using the training data's embedding
486+ self .X = self .X_train .copy ()
487+
488+ if self .verbose >= 1 :
489+ print (f"Shape of X_train after embedding: { self .X_train .shape } " )
490+
491+ if self .verbose >= 1 :
492+ print (f"Data transformed to { self .X_train .shape [1 ]} embedding dimensions." )
493+ explained_variance = get_explained_variance (embedding_pipeline )
494+ if explained_variance is not None :
495+ print (f" Total explained variance by { embedding_dim } components: { explained_variance .sum ():.2%} " )
496+
497+
498+
499+
419500 if self .verbose >= 2 :
420501 print (
421502 f"Data Split Information:\n "
@@ -462,4 +543,7 @@ def __init__(
462543 self .model_class_list = get_model_class_list (self )
463544
464545 if isinstance (self .X_train , pd .DataFrame ) and self .X_train .empty :
465- raise ValueError ("-- end data pipeline-- Input data X_train is an empty DataFrame -- end data pipeline--" )
546+ raise ValueError (
547+ "-- end data pipeline-- Input data X_train is an empty DataFrame. "
548+ "This is likely due to aggressive feature selection or data cleaning."
549+ )
0 commit comments