@@ -46,28 +46,47 @@ def getNfeaturesANOVAF(self, n, X_train, y_train):
4646 else :
4747 raise ValueError ("X_train must be a pandas DataFrame or numpy array" )
4848
49- # Calculate F-value for each column in X_train
50- res = []
51- for i , col in enumerate (X_train .T ):
52- # Get the F-values from f_classif
53- f_values = f_classif (col .reshape (- 1 , 1 ), y_train )[0 ]
49+ # Calculate F-values for all features at once
50+ f_values , _ = f_classif (X_train , y_train )
5451
55- # If the F-value is not NaN, add it to the results
56- if not np .isnan (f_values [0 ]):
57- res .append ((feature_names [i ], f_values [0 ]))
52+ # Create a list of (feature_name, f_value) tuples, ignoring NaNs
53+ res = [
54+ (feature_names [i ], f_values [i ])
55+ for i in range (len (feature_names ))
56+ if not np .isnan (f_values [i ])
57+ ]
5858
5959 # Sort the list based on F-value in descending order
6060 sortedList = sorted (res , key = lambda x : x [1 ], reverse = True )
6161
6262 # Return column names of top n features
6363 nFeatures = sortedList [:n ] # Get top n features
64- finalColNames = [elem [0 ] for elem in nFeatures ] # Get column names
64+ finalColNames = [elem [0 ] for elem in nFeatures ]
65+
66+ # Add a check to ensure that at least one feature is returned.
67+ # If not, it means all features were filtered out (e.g., all had NaN F-values),
68+ # which would lead to an empty X_train and cause pipeline failure.
69+ if not finalColNames :
70+ # Fallback: if all features were filtered, return the single best one that is not NaN.
71+ # This can happen if n is too small or all f-values are NaN.
72+ if sortedList :
73+ return [sortedList [0 ][0 ]]
74+ else :
75+ raise ValueError ("getNfeaturesANOVAF returned no features. All features might have NaN F-values." )
6576
6677 return finalColNames
6778
6879
6980
70- def getNFeaturesMarkovBlanket (self , n , X_train , y_train ):
81+ def getNFeaturesMarkovBlanket (
82+ self ,
83+ n ,
84+ X_train ,
85+ y_train ,
86+ num_simul : int = 30 ,
87+ cv : int = 5 ,
88+ svc_kernel : str = "rbf" ,
89+ ):
7190
7291 """
7392 Get the names of the top n features from the Markov Blanket (MB) using PyImpetus.
@@ -76,6 +95,9 @@ def getNFeaturesMarkovBlanket(self, n, X_train, y_train):
7695 - n (int): The number of top features to retrieve.
7796 - X_train (array-like): The training input samples.
7897 - y_train (array-like): The target values.
98+ - num_simul (int): Number of simulations for stability selection in PyImpetus.
99+ - cv (int): Number of cross-validation folds.
100+ - svc_kernel (str): The kernel to be used by the SVC model.
79101
80102 Returns:
81103 - list: A list containing the names of the top n features from the Markov Blanket.
@@ -96,28 +118,40 @@ def getNFeaturesMarkovBlanket(self, n, X_train, y_train):
96118 top_features = getNFeaturesMarkovBlanket(5, X_train, y_train)
97119 ```
98120 """
121+ # Ensure input is a pandas DataFrame to access column names
122+ if not isinstance (X_train , pd .DataFrame ):
123+ raise TypeError (
124+ "X_train must be a pandas DataFrame for getNFeaturesMarkovBlanket."
125+ )
126+ original_columns = X_train .columns
99127
100128 # Initialize the PyImpetus object with desired parameters
101- model = PPIMBC (model = SVC (random_state = 27 , class_weight = "balanced" ),
129+ model = PPIMBC (model = SVC (random_state = 27 , class_weight = "balanced" , kernel = svc_kernel ),
102130 p_val_thresh = 0.05 ,
103- num_simul = 30 ,
131+ num_simul = num_simul ,
104132 simul_size = 0.2 ,
105133 simul_type = 0 ,
106134 sig_test_type = "non-parametric" ,
107- cv = 5 ,
135+ cv = cv ,
108136 random_state = 27 ,
109137 n_jobs = - 1 ,
110138 verbose = 2 )
111139
112140 # Fit and transform the training data
113- df_train_transformed = model .fit_transform (X_train , y_train )
141+ # PyImpetus works with numpy arrays and returns feature indices in model.MB
142+ model .fit (X_train .values , y_train )
114143
115- # Get the feature names from the Markov blanket (MB) and truncate by n elements
116- feature_names = model .MB [:n ]
144+ # Get the feature indices from the Markov blanket (MB)
145+ feature_indices = model .MB
146+
147+ # Map indices back to original column names and truncate by n
148+ feature_names = [original_columns [i ] for i in feature_indices ][:n ]
149+
150+ # Fallback: If feature selection returns an empty list, but the model found features,
151+ # return the single most important one. This prevents pipeline failure.
152+ if not feature_names and feature_indices :
153+ feature_names = [original_columns [feature_indices [0 ]]]
117154
118155 return feature_names
119156
120157
121-
122-
123-