Skip to content

Commit 8c45750

Browse files
committed
Initial implementation of the embedding-based feature reduction method, with test data and tests.
1 parent d511656 commit 8c45750

9 files changed

Lines changed: 1064 additions & 35 deletions

ml_grid/pipeline/data.py

Lines changed: 117 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import re
22
import random
3+
import numpy as np
34
from typing import Any, Dict, List, Optional
45
import warnings
56

@@ -12,6 +13,7 @@
1213
from ml_grid.pipeline.data_clean_up import clean_up_class
1314
from ml_grid.pipeline.data_constant_columns import remove_constant_columns, remove_constant_columns_with_debug
1415
from ml_grid.pipeline.data_correlation_matrix import handle_correlation_matrix
16+
from ml_grid.pipeline.embeddings import create_embedding_pipeline, apply_embedding
1517
from ml_grid.pipeline.data_feature_importance_methods import feature_importance_methods
1618
from ml_grid.pipeline.data_outcome_list import handle_outcome_list
1719
from ml_grid.pipeline.data_percent_missing import handle_percent_missing
@@ -248,42 +250,45 @@ def __init__(
248250
for col in self.pertubation_columns
249251
if (col not in self.drop_list and col in self.df.columns)
250252
]
251-
# Add safety mechanism to retain minimum features
252-
min_required_features = 5 # Set your minimum threshold
253-
core_protected_columns = ['age', 'male', 'client_idcode'] # Columns to protect
254253

254+
# Safety net: If all features are pruned, retain a minimum set
255255
if not self.final_column_list:
256-
print("WARNING: All features pruned! Activating safety retention...")
256+
print("Warning: All features were pruned. Activating safety retention mechanism.")
257257

258-
# Try to keep protected columns first
259-
safety_columns = [col for col in core_protected_columns
260-
if col in self.df.columns and col in self.pertubation_columns]
258+
# Define core columns to try and protect
259+
core_protected_columns = ['age', 'male', 'client_idcode']
260+
min_features = 2
261261

262-
# If no protected columns, use first available columns
263-
if not safety_columns:
264-
safety_columns = [col for col in self.pertubation_columns
265-
if col in self.df.columns][:min_required_features]
262+
# 1. Try to retain core protected columns
263+
retained_cols = [
264+
col for col in core_protected_columns
265+
if col in self.pertubation_columns and col in self.df.columns
266+
]
266267

267-
# Update final columns and drop list
268-
self.final_column_list = safety_columns
269-
# Also update the main drop list to prevent re-pruning
270-
self.drop_list = [col for col in self.drop_list if col not in self.final_column_list]
268+
# 2. If no core columns, try to retain any of the original perturbed columns
269+
if not retained_cols:
270+
retained_cols = [
271+
col for col in self.pertubation_columns if col in self.df.columns
272+
][:min_features]
273+
274+
# 3. As a last resort, pick random columns from the original features
275+
if not retained_cols:
276+
print("Last resort: Selecting random features.")
277+
available_features = [
278+
col for col in self.orignal_feature_names
279+
if col != self.outcome_variable and col in self.df.columns
280+
]
281+
if len(available_features) >= min_features:
282+
retained_cols = random.sample(available_features, min_features)
283+
elif available_features:
284+
retained_cols = available_features
271285

272-
print(f"Retaining minimum features: {self.final_column_list}")
273-
274-
# Re-filter final_column_list to be absolutely sure
275-
self.final_column_list = [col for col in self.pertubation_columns if col not in self.drop_list and col in self.df.columns]
276-
286+
self.final_column_list = retained_cols
287+
print(f"Retained minimum features: {self.final_column_list}")
277288

278-
# Add two random features if list still empty
279-
if not self.final_column_list:
280-
print("Warning no feature columns retained, selecting two at random")
281-
self.final_column_list.append(random.choice(self.orignal_feature_names))
282-
self.final_column_list.append(random.choice(self.orignal_feature_names))
283-
284-
# Ensure we still have at least 1 feature
289+
# Final check to ensure we have at least one feature
285290
if not self.final_column_list:
286-
raise ValueError("CRITICAL: Unable to retain any features despite safety measures")
291+
raise ValueError("CRITICAL: Unable to retain any features despite safety measures. Halting pipeline.")
287292

288293
if not self.final_column_list:
289294
raise ValueError("All features pruned. No columns remaining in final_column_list.")
@@ -362,7 +367,7 @@ def __init__(
362367
print(self.X.shape)
363368

364369
self.X, self.y = convert_Xy_to_time_series(self.X, self.y, max_seq_length)
365-
if self.verbose >= 1:
370+
if not self.final_column_list:
366371
print(self.X.shape)
367372

368373
(
@@ -381,19 +386,30 @@ def __init__(
381386
self.X_test_orig,
382387
verbosity=self.verbose
383388
)
389+
if self.verbose >= 1:
390+
print(f"Shape of X_train after removing constant columns post-split: {self.X_train.shape}")
384391

385-
target_n_features = self.local_param_dict.get("feature_n")
392+
393+
# Add a safeguard to ensure features remain after removing constant columns
394+
if self.X_train.shape[1] == 0:
395+
raise ValueError("All feature columns were removed after data splitting because they were constant in the training set. Consider adjusting feature selection or data cleaning parameters.")
386396

387-
if target_n_features != 100:
388397

398+
target_n_features = self.local_param_dict.get("feature_n")
399+
400+
if target_n_features is not None and target_n_features < 100:
389401
target_n_features_eval = int(
390402
(target_n_features / 100) * self.X_train.shape[1]
391403
)
392-
393404
# Ensure at least one feature is selected. The previous logic here
394405
# was incorrect and disabled feature selection entirely.
395406
target_n_features_eval = max(1, target_n_features_eval)
396407

408+
if target_n_features is not None and target_n_features < 100 and self.X_train.shape[1] > 1 and not self.local_param_dict.get("use_embedding", False) and target_n_features_eval < self.X_train.shape[1]:
409+
410+
if self.verbose >= 1:
411+
print(f"Shape of X_train before feature importance selection: {self.X_train.shape}")
412+
397413
print(
398414
f"Pre target_n_features {target_n_features}% reduction {target_n_features_eval}/{self.X_train.shape[1]}"
399415
)
@@ -410,12 +426,77 @@ def __init__(
410426
ml_grid_object=self
411427
)
412428
)
429+
if self.verbose >= 1:
430+
print(f"Shape of X_train after feature importance selection: {self.X_train.shape}")
431+
413432
if self.X_train.shape[1] == 0:
414433
raise ValueError("Feature importance selection removed all features.")
415434

435+
# Safeguard: Ensure X_train is not empty after feature selection
436+
if self.X_train.shape[1] == 0:
437+
raise ValueError("All features were removed by the feature importance selection method. X_train is empty.")
438+
416439
except Exception as e:
417440
print("failed target_n_features", e)
418441

442+
# Apply embeddings if configured (after feature selection)
443+
if self.local_param_dict.get("use_embedding", False):
444+
if self.verbose >= 1:
445+
print("Applying embeddings...")
446+
447+
embedding_method = self.local_param_dict.get("embedding_method", "pca")
448+
embedding_dim = self.local_param_dict.get("embedding_dim", 64)
449+
scale_before_embedding = self.local_param_dict.get("scale_features_before_embedding", True)
450+
451+
if self.verbose >= 2:
452+
print(f" Embedding Method: {embedding_method}")
453+
print(f" Original features: {self.X_train.shape[1]}")
454+
print(f" Target embedding dimensions: {embedding_dim}")
455+
print(f" Scale before embedding: {scale_before_embedding}")
456+
457+
if self.X_train.shape[1] > 1 and embedding_dim >= self.X_train.shape[1]:
458+
embedding_dim = self.X_train.shape[1] - 1
459+
if self.verbose >= 1:
460+
print(f" Warning: embedding_dim >= n_features. Adjusting to {embedding_dim}")
461+
462+
embedding_pipeline = create_embedding_pipeline(
463+
method=embedding_method,
464+
n_components=embedding_dim,
465+
scale=scale_before_embedding,
466+
)
467+
468+
# Fit on train and transform all splits
469+
# Check if the method is supervised to pass y_train
470+
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
471+
from sklearn.feature_selection import SelectKBest
472+
from ml_grid.pipeline.embeddings import get_explained_variance
473+
474+
embed_step = embedding_pipeline.named_steps['embed']
475+
if isinstance(embed_step, (LinearDiscriminantAnalysis, SelectKBest)):
476+
if self.verbose >= 2:
477+
print(" Supervised embedding method detected, passing y_train.")
478+
self.X_train = pd.DataFrame(embedding_pipeline.fit_transform(self.X_train, self.y_train), index=self.X_train.index, columns=[f"embed_{i}" for i in range(embedding_dim)])
479+
else:
480+
self.X_train = pd.DataFrame(embedding_pipeline.fit_transform(self.X_train), index=self.X_train.index, columns=[f"embed_{i}" for i in range(embedding_dim)])
481+
482+
self.X_test = pd.DataFrame(embedding_pipeline.transform(self.X_test), index=self.X_test.index, columns=[f"embed_{i}" for i in range(embedding_dim)])
483+
self.X_test_orig = pd.DataFrame(embedding_pipeline.transform(self.X_test_orig), index=self.X_test_orig.index, columns=[f"embed_{i}" for i in range(embedding_dim)])
484+
485+
# The main self.X should also be updated for consistency, using the training data's embedding
486+
self.X = self.X_train.copy()
487+
488+
if self.verbose >= 1:
489+
print(f"Shape of X_train after embedding: {self.X_train.shape}")
490+
491+
if self.verbose >= 1:
492+
print(f"Data transformed to {self.X_train.shape[1]} embedding dimensions.")
493+
explained_variance = get_explained_variance(embedding_pipeline)
494+
if explained_variance is not None:
495+
print(f" Total explained variance by {embedding_dim} components: {explained_variance.sum():.2%}")
496+
497+
498+
499+
419500
if self.verbose >= 2:
420501
print(
421502
f"Data Split Information:\n"
@@ -462,4 +543,7 @@ def __init__(
462543
self.model_class_list = get_model_class_list(self)
463544

464545
if isinstance(self.X_train, pd.DataFrame) and self.X_train.empty:
465-
raise ValueError("-- end data pipeline-- Input data X_train is an empty DataFrame -- end data pipeline--")
546+
raise ValueError(
547+
"-- end data pipeline-- Input data X_train is an empty DataFrame. "
548+
"This is likely due to aggressive feature selection or data cleaning."
549+
)

0 commit comments

Comments
 (0)