Skip to content

Commit 636ae05

Browse files
committed
efficiency: get all f values for features at once
1 parent 9fec6fe commit 636ae05

1 file changed

Lines changed: 53 additions & 19 deletions

File tree

ml_grid/pipeline/data_feature_methods.py

Lines changed: 53 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -46,28 +46,47 @@ def getNfeaturesANOVAF(self, n, X_train, y_train):
4646
else:
4747
raise ValueError("X_train must be a pandas DataFrame or numpy array")
4848

49-
# Calculate F-value for each column in X_train
50-
res = []
51-
for i, col in enumerate(X_train.T):
52-
# Get the F-values from f_classif
53-
f_values = f_classif(col.reshape(-1, 1), y_train)[0]
49+
# Calculate F-values for all features at once
50+
f_values, _ = f_classif(X_train, y_train)
5451

55-
# If the F-value is not NaN, add it to the results
56-
if not np.isnan(f_values[0]):
57-
res.append((feature_names[i], f_values[0]))
52+
# Create a list of (feature_name, f_value) tuples, ignoring NaNs
53+
res = [
54+
(feature_names[i], f_values[i])
55+
for i in range(len(feature_names))
56+
if not np.isnan(f_values[i])
57+
]
5858

5959
# Sort the list based on F-value in descending order
6060
sortedList = sorted(res, key=lambda x: x[1], reverse=True)
6161

6262
# Return column names of top n features
6363
nFeatures = sortedList[:n] # Get top n features
64-
finalColNames = [elem[0] for elem in nFeatures] # Get column names
64+
finalColNames = [elem[0] for elem in nFeatures]
65+
66+
# Add a check to ensure that at least one feature is returned.
67+
# If not, it means all features were filtered out (e.g., all had NaN F-values),
68+
# which would lead to an empty X_train and cause pipeline failure.
69+
if not finalColNames:
70+
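# Fallback: the top-n slice is empty but some features have valid (non-NaN) F-values, so return the single best one.
71+
# This happens when n selects nothing (e.g., n == 0); if every F-value is NaN, sortedList is empty and an error is raised instead.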
72+
if sortedList:
73+
return [sortedList[0][0]]
74+
else:
75+
raise ValueError("getNfeaturesANOVAF returned no features. All features might have NaN F-values.")
6576

6677
return finalColNames
6778

6879

6980

70-
def getNFeaturesMarkovBlanket(self, n, X_train, y_train):
81+
def getNFeaturesMarkovBlanket(
82+
self,
83+
n,
84+
X_train,
85+
y_train,
86+
num_simul: int = 30,
87+
cv: int = 5,
88+
svc_kernel: str = "rbf",
89+
):
7190

7291
"""
7392
Get the names of the top n features from the Markov Blanket (MB) using PyImpetus.
@@ -76,6 +95,9 @@ def getNFeaturesMarkovBlanket(self, n, X_train, y_train):
7695
- n (int): The number of top features to retrieve.
7796
- X_train (array-like): The training input samples.
7897
- y_train (array-like): The target values.
98+
- num_simul (int): Number of simulations for stability selection in PyImpetus.
99+
- cv (int): Number of cross-validation folds.
100+
- svc_kernel (str): The kernel to be used by the SVC model.
79101
80102
Returns:
81103
- list: A list containing the names of the top n features from the Markov Blanket.
@@ -96,28 +118,40 @@ def getNFeaturesMarkovBlanket(self, n, X_train, y_train):
96118
top_features = getNFeaturesMarkovBlanket(5, X_train, y_train)
97119
```
98120
"""
121+
# Ensure input is a pandas DataFrame to access column names
122+
if not isinstance(X_train, pd.DataFrame):
123+
raise TypeError(
124+
"X_train must be a pandas DataFrame for getNFeaturesMarkovBlanket."
125+
)
126+
original_columns = X_train.columns
99127

100128
# Initialize the PyImpetus object with desired parameters
101-
model = PPIMBC(model=SVC(random_state=27, class_weight="balanced"),
129+
model = PPIMBC(model=SVC(random_state=27, class_weight="balanced", kernel=svc_kernel),
102130
p_val_thresh=0.05,
103-
num_simul=30,
131+
num_simul=num_simul,
104132
simul_size=0.2,
105133
simul_type=0,
106134
sig_test_type="non-parametric",
107-
cv=5,
135+
cv=cv,
108136
random_state=27,
109137
n_jobs=-1,
110138
verbose=2)
111139

112140
# Fit and transform the training data
113-
df_train_transformed = model.fit_transform(X_train, y_train)
141+
# PyImpetus works with numpy arrays and returns feature indices in model.MB
142+
model.fit(X_train.values, y_train)
114143

115-
# Get the feature names from the Markov blanket (MB) and truncate by n elements
116-
feature_names = model.MB[:n]
144+
# Get the feature indices from the Markov blanket (MB)
145+
feature_indices = model.MB
146+
147+
# Map indices back to original column names and truncate by n
148+
feature_names = [original_columns[i] for i in feature_indices][:n]
149+
150+
# Fallback: If feature selection returns an empty list, but the model found features,
151+
# return the single most important one. This prevents pipeline failure.
152+
if not feature_names and feature_indices:
153+
feature_names = [original_columns[feature_indices[0]]]
117154

118155
return feature_names
119156

120157

121-
122-
123-

0 commit comments

Comments
 (0)