|
23 | 23 | from ml_grid.pipeline.embeddings import create_embedding_pipeline |
24 | 24 | from ml_grid.util.global_params import global_parameters |
25 | 25 | from ml_grid.util.logger_setup import setup_logger |
26 | | - |
27 | | -warnings.filterwarnings("ignore", category=ConvergenceWarning) |
28 | | -warnings.filterwarnings("ignore", category=UserWarning) |
29 | 26 | from sklearn.preprocessing import ( |
30 | 27 | StandardScaler, |
31 | 28 | ) # Added explicit import for StandardScaler |
32 | 29 |
|
| 30 | +warnings.filterwarnings("ignore", category=ConvergenceWarning) |
| 31 | +warnings.filterwarnings("ignore", category=UserWarning) |
33 | 32 | warnings.filterwarnings("ignore", category=FutureWarning) |
34 | 33 |
|
35 | 34 |
|
@@ -135,10 +134,20 @@ def _log_feature_transformation( |
135 | 134 | } |
136 | 135 | ) |
137 | 136 |
|
138 | | - def _assert_index_alignment( |
139 | | - self, df1: pd.DataFrame, df2: pd.Series, step_name: str |
140 | | - ): |
| 137 | + def _assert_index_alignment(self, df1: Any, df2: Any, step_name: str): |
141 | 138 | """Assert that two objects are aligned: equal pandas indices, or equal lengths when either object has no ``.index``.""" |
| 139 | + # Handle objects without .index (e.g. numpy arrays in time series mode) |
| 140 | + if not hasattr(df1, "index") or not hasattr(df2, "index"): |
| 141 | + if len(df1) != len(df2): |
| 142 | + self.logger.error( |
| 143 | + f"Length mismatch at {step_name}: {len(df1)} vs {len(df2)}" |
| 144 | + ) |
| 145 | + raise AssertionError(f"Length mismatch at {step_name}") |
| 146 | + self.logger.debug( |
| 147 | + f"Length alignment PASSED at: {step_name} (non-pandas objects)" |
| 148 | + ) |
| 149 | + return |
| 150 | + |
142 | 151 | try: |
143 | 152 | assert_index_equal(df1.index, df2.index) |
144 | 153 | self.logger.debug(f"Index alignment PASSED at: {step_name}") |
@@ -499,18 +508,28 @@ def _split_data(self): |
499 | 508 | # --- CRITICAL FIX: Reset all indices immediately after splitting --- |
500 | 509 | # This ensures all downstream processing (constant removal, feature selection, embedding) |
501 | 510 | # operates on data with clean, aligned, 0-based integer indices. |
502 | | - self.X_train.reset_index(drop=True, inplace=True) |
503 | | - self.y_train.reset_index(drop=True, inplace=True) |
504 | | - self.X_test.reset_index(drop=True, inplace=True) |
505 | | - self.y_test.reset_index(drop=True, inplace=True) |
506 | | - self.X_test_orig.reset_index(drop=True, inplace=True) |
507 | | - self.y_test_orig.reset_index(drop=True, inplace=True) |
| 511 | + if hasattr(self.X_train, "reset_index"): |
| 512 | + self.X_train.reset_index(drop=True, inplace=True) |
| 513 | + self.X_test.reset_index(drop=True, inplace=True) |
| 514 | + self.X_test_orig.reset_index(drop=True, inplace=True) |
| 515 | + |
| 516 | + if hasattr(self.y_train, "reset_index"): |
| 517 | + self.y_train.reset_index(drop=True, inplace=True) |
| 518 | + self.y_test.reset_index(drop=True, inplace=True) |
| 519 | + self.y_test_orig.reset_index(drop=True, inplace=True) |
| 520 | + |
508 | 521 | self._assert_index_alignment( |
509 | 522 | self.X_train, self.y_train, "After master reset_index" |
510 | 523 | ) |
511 | 524 |
|
512 | 525 | def _post_split_cleaning(self): |
513 | 526 | """Applies cleaning steps post-split to prevent data leakage.""" |
| 527 | + if not isinstance(self.X_train, pd.DataFrame): |
| 528 | + self.logger.info( |
| 529 | + "Skipping post-split cleaning (not a DataFrame, likely Time Series mode)." |
| 530 | + ) |
| 531 | + return |
| 532 | + |
514 | 533 | # Clean column names *before* dropping operations to ensure stable column order. |
515 | 534 | cleanup = clean_up_class() |
516 | 535 | cleanup.screen_non_float_types(self.X_train) |
@@ -608,6 +627,12 @@ def _post_split_cleaning(self): |
608 | 627 |
|
609 | 628 | def _scale_features(self): |
610 | 629 | """Applies standard scaling to the feature sets.""" |
| 630 | + if not isinstance(self.X_train, pd.DataFrame): |
| 631 | + self.logger.info( |
| 632 | + "Skipping scaling (not a DataFrame, likely Time Series mode)." |
| 633 | + ) |
| 634 | + return |
| 635 | + |
611 | 636 | features_before = self.X_train.shape[1] |
612 | 637 | scale = self.local_param_dict.get("scale") |
613 | 638 | if scale: |
@@ -650,6 +675,12 @@ def _scale_features(self): |
650 | 675 |
|
651 | 676 | def _select_features_by_importance(self): |
652 | 677 | """Selects features based on importance scores if configured.""" |
| 678 | + if not isinstance(self.X_train, pd.DataFrame): |
| 679 | + self.logger.info( |
| 680 | + "Skipping feature selection (not a DataFrame, likely Time Series mode)." |
| 681 | + ) |
| 682 | + return |
| 683 | + |
653 | 684 | target_n_features = self.local_param_dict.get("feature_n") |
654 | 685 |
|
655 | 686 | if target_n_features is not None and target_n_features < 100: |
@@ -752,6 +783,12 @@ def _select_features_by_importance(self): |
752 | 783 |
|
753 | 784 | def _apply_embeddings(self): |
754 | 785 | """Applies feature embedding/dimensionality reduction if configured.""" |
| 786 | + if not isinstance(self.X_train, pd.DataFrame): |
| 787 | + self.logger.info( |
| 788 | + "Skipping embeddings (not a DataFrame, likely Time Series mode)." |
| 789 | + ) |
| 790 | + return |
| 791 | + |
755 | 792 | if self.local_param_dict.get("use_embedding", False): |
756 | 793 | features_before = self.X_train.shape[1] |
757 | 794 | self.logger.info("Applying embeddings...") |
@@ -925,16 +962,9 @@ def _finalize_pipeline(self): |
925 | 962 | # Final definitive assertion before exiting the data pipeline. |
926 | 963 | # This ensures that the X_train and y_train that will be passed to the |
927 | 964 | # model training steps are perfectly aligned. |
928 | | - try: |
929 | | - assert_index_equal(self.X_train.index, self.y_train.index) |
930 | | - self.logger.info( |
931 | | - "Final data alignment check PASSED. X_train and y_train indices are identical." |
932 | | - ) |
933 | | - except AssertionError: |
934 | | - self.logger.error( |
935 | | - "CRITICAL: Final data alignment check FAILED. X_train and y_train indices are NOT identical." |
936 | | - ) |
937 | | - raise |
| 965 | + self._assert_index_alignment( |
| 966 | + self.X_train, self.y_train, "Final data alignment check" |
| 967 | + ) |
938 | 968 |
|
939 | 969 | def _compile_and_log_feature_transformations(self, error_occurred: bool = False): |
940 | 970 | """Compiles the feature transformation log and displays it.""" |
|
0 commit comments