|
1 | 1 | import pandas as pd |
| 2 | +import numpy as np |
2 | 3 | from typing import List, Optional |
3 | 4 |
|
4 | 5 | def remove_constant_columns(X: pd.DataFrame, drop_list: Optional[List[str]] = None, verbose: int = 1) -> List[str]: |
@@ -55,43 +56,76 @@ def remove_constant_columns_with_debug(X_train, X_test, X_test_orig, verbosity=2 |
55 | 56 | print(f"Initial X_train shape: {X_train.shape}") |
56 | 57 | print(f"Initial X_test shape: {X_test.shape}") |
57 | 58 | print(f"Initial X_test_orig shape: {X_test_orig.shape}") |
58 | | - |
59 | | - # Calculate the variance for each column in X_train |
60 | | - train_variances = X_train.var(axis=0) |
61 | | - if verbosity > 1: |
62 | | - print(f"Variance of X_train columns:\n{train_variances}") |
63 | | - |
64 | | - # Identify and remove constant columns in X_train |
65 | | - constant_columns_train = train_variances[train_variances == 0].index |
66 | | - if verbosity > 0: |
67 | | - print(f"Constant columns in X_train: {list(constant_columns_train)}") |
68 | | - |
69 | | - # Calculate the variance for each column in X_test |
70 | | - test_variances = X_test.var(axis=0) |
71 | | - if verbosity > 1: |
72 | | - print(f"Variance of X_test columns:\n{test_variances}") |
73 | | - |
74 | | - # Identify constant columns in X_test |
75 | | - constant_columns_test = test_variances[test_variances == 0].index |
76 | | - if verbosity > 0: |
77 | | - print(f"Constant columns in X_test: {list(constant_columns_test)}") |
78 | | - |
79 | | - # Combine constant columns from both X_train and X_test |
80 | | - constant_columns = constant_columns_train.union(constant_columns_test) |
81 | | - |
82 | | - # Remove the constant columns from both X_train and X_test |
83 | | - X_train = X_train.loc[:, ~X_train.columns.isin(constant_columns)] |
84 | | - X_test = X_test.loc[:, ~X_test.columns.isin(constant_columns)] |
85 | | - |
86 | | - # Also remove the same constant columns from X_test_orig |
87 | | - X_test_orig = X_test_orig.loc[:, ~X_test_orig.columns.isin(constant_columns)] |
88 | | - |
| 59 | + |
| 60 | + is_pandas = isinstance(X_train, pd.DataFrame) |
| 61 | + |
| 62 | + if is_pandas: |
| 63 | + # Original logic for pandas DataFrames |
| 64 | + train_variances = X_train.var(axis=0) |
| 65 | + if verbosity > 1: |
| 66 | + print(f"Variance of X_train columns:\n{train_variances}") |
| 67 | + |
| 68 | + constant_columns_train = train_variances[train_variances == 0].index |
| 69 | + if verbosity > 0: |
| 70 | + print(f"Constant columns in X_train: {list(constant_columns_train)}") |
| 71 | + |
| 72 | + test_variances = X_test.var(axis=0) |
| 73 | + if verbosity > 1: |
| 74 | + print(f"Variance of X_test columns:\n{test_variances}") |
| 75 | + |
| 76 | + constant_columns_test = test_variances[test_variances == 0].index |
| 77 | + if verbosity > 0: |
| 78 | + print(f"Constant columns in X_test: {list(constant_columns_test)}") |
| 79 | + |
| 80 | + constant_columns = constant_columns_train.union(constant_columns_test) |
| 81 | + |
| 82 | + X_train = X_train.loc[:, ~X_train.columns.isin(constant_columns)] |
| 83 | + X_test = X_test.loc[:, ~X_test.columns.isin(constant_columns)] |
| 84 | + X_test_orig = X_test_orig.loc[:, ~X_test_orig.columns.isin(constant_columns)] |
| 85 | + else: # Handle numpy arrays |
| 86 | + # Determine variance calculation axis based on dimensions |
| 87 | + if X_train.ndim == 3: |
| 88 | + # For 3D time series data (e.g., from aeon: samples, features, timesteps), |
| 89 | + # calculate variance for each feature across samples and timesteps. |
| 90 | + var_axis = (0, 2) |
| 91 | + else: |
| 92 | + # For 2D data, calculate variance across samples (axis 0). |
| 93 | + var_axis = 0 |
| 94 | + |
| 95 | + train_variances = X_train.var(axis=var_axis) |
| 96 | + constant_indices_train = np.where(train_variances == 0)[0] |
| 97 | + if verbosity > 0: |
| 98 | + print(f"Constant feature indices in X_train: {list(constant_indices_train)}") |
| 99 | + |
| 100 | + test_variances = X_test.var(axis=var_axis) |
| 101 | + constant_indices_test = np.where(test_variances == 0)[0] |
| 102 | + if verbosity > 0: |
| 103 | + print(f"Constant feature indices in X_test: {list(constant_indices_test)}") |
| 104 | + |
| 105 | + # Combine indices of constant features from both train and test sets |
| 106 | + constant_indices = np.union1d(constant_indices_train, constant_indices_test) |
| 107 | + |
| 108 | + # Create a boolean mask for features to keep |
| 109 | + num_features = X_train.shape[1] |
| 110 | + keep_mask = np.ones(num_features, dtype=bool) |
| 111 | + keep_mask[constant_indices] = False |
| 112 | + |
| 113 | + # Apply the mask to remove constant features |
| 114 | + if X_train.ndim == 3: |
| 115 | + X_train = X_train[:, keep_mask, :] |
| 116 | + X_test = X_test[:, keep_mask, :] |
| 117 | + X_test_orig = X_test_orig[:, keep_mask, :] |
| 118 | + else: # 2D array |
| 119 | + X_train = X_train[:, keep_mask] |
| 120 | + X_test = X_test[:, keep_mask] |
| 121 | + X_test_orig = X_test_orig[:, keep_mask] |
| 122 | + |
89 | 123 | if verbosity > 0: |
90 | 124 | # Debug message: Shape after removing constant columns from X_train, X_test, X_test_orig |
91 | 125 | print(f"Shape of X_train after removing constant columns: {X_train.shape}") |
92 | 126 | print(f"Shape of X_test after removing constant columns: {X_test.shape}") |
93 | 127 | print(f"Shape of X_test_orig after removing constant columns: {X_test_orig.shape}") |
94 | | - |
| 128 | + |
95 | 129 | # Return the modified X_train, X_test, and X_test_orig, with y_test_orig unchanged |
96 | 130 | return X_train, X_test, X_test_orig |
97 | 131 |
|
|
0 commit comments