Skip to content

Commit 3384f77

Browse files
committed
handle constant columns in time series data structure
1 parent 636ae05 commit 3384f77

1 file changed

Lines changed: 66 additions & 32 deletions

File tree

ml_grid/pipeline/data_constant_columns.py

Lines changed: 66 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import pandas as pd
2+
import numpy as np
23
from typing import List, Optional
34

45
def remove_constant_columns(X: pd.DataFrame, drop_list: Optional[List[str]] = None, verbose: int = 1) -> List[str]:
@@ -55,43 +56,76 @@ def remove_constant_columns_with_debug(X_train, X_test, X_test_orig, verbosity=2
5556
print(f"Initial X_train shape: {X_train.shape}")
5657
print(f"Initial X_test shape: {X_test.shape}")
5758
print(f"Initial X_test_orig shape: {X_test_orig.shape}")
58-
59-
# Calculate the variance for each column in X_train
60-
train_variances = X_train.var(axis=0)
61-
if verbosity > 1:
62-
print(f"Variance of X_train columns:\n{train_variances}")
63-
64-
# Identify and remove constant columns in X_train
65-
constant_columns_train = train_variances[train_variances == 0].index
66-
if verbosity > 0:
67-
print(f"Constant columns in X_train: {list(constant_columns_train)}")
68-
69-
# Calculate the variance for each column in X_test
70-
test_variances = X_test.var(axis=0)
71-
if verbosity > 1:
72-
print(f"Variance of X_test columns:\n{test_variances}")
73-
74-
# Identify constant columns in X_test
75-
constant_columns_test = test_variances[test_variances == 0].index
76-
if verbosity > 0:
77-
print(f"Constant columns in X_test: {list(constant_columns_test)}")
78-
79-
# Combine constant columns from both X_train and X_test
80-
constant_columns = constant_columns_train.union(constant_columns_test)
81-
82-
# Remove the constant columns from both X_train and X_test
83-
X_train = X_train.loc[:, ~X_train.columns.isin(constant_columns)]
84-
X_test = X_test.loc[:, ~X_test.columns.isin(constant_columns)]
85-
86-
# Also remove the same constant columns from X_test_orig
87-
X_test_orig = X_test_orig.loc[:, ~X_test_orig.columns.isin(constant_columns)]
88-
59+
60+
is_pandas = isinstance(X_train, pd.DataFrame)
61+
62+
if is_pandas:
63+
# Original logic for pandas DataFrames
64+
train_variances = X_train.var(axis=0)
65+
if verbosity > 1:
66+
print(f"Variance of X_train columns:\n{train_variances}")
67+
68+
constant_columns_train = train_variances[train_variances == 0].index
69+
if verbosity > 0:
70+
print(f"Constant columns in X_train: {list(constant_columns_train)}")
71+
72+
test_variances = X_test.var(axis=0)
73+
if verbosity > 1:
74+
print(f"Variance of X_test columns:\n{test_variances}")
75+
76+
constant_columns_test = test_variances[test_variances == 0].index
77+
if verbosity > 0:
78+
print(f"Constant columns in X_test: {list(constant_columns_test)}")
79+
80+
constant_columns = constant_columns_train.union(constant_columns_test)
81+
82+
X_train = X_train.loc[:, ~X_train.columns.isin(constant_columns)]
83+
X_test = X_test.loc[:, ~X_test.columns.isin(constant_columns)]
84+
X_test_orig = X_test_orig.loc[:, ~X_test_orig.columns.isin(constant_columns)]
85+
else: # Handle numpy arrays
86+
# Determine variance calculation axis based on dimensions
87+
if X_train.ndim == 3:
88+
# For 3D time series data (e.g., from aeon: samples, features, timesteps),
89+
# calculate variance for each feature across samples and timesteps.
90+
var_axis = (0, 2)
91+
else:
92+
# For 2D data, calculate variance across samples (axis 0).
93+
var_axis = 0
94+
95+
train_variances = X_train.var(axis=var_axis)
96+
constant_indices_train = np.where(train_variances == 0)[0]
97+
if verbosity > 0:
98+
print(f"Constant feature indices in X_train: {list(constant_indices_train)}")
99+
100+
test_variances = X_test.var(axis=var_axis)
101+
constant_indices_test = np.where(test_variances == 0)[0]
102+
if verbosity > 0:
103+
print(f"Constant feature indices in X_test: {list(constant_indices_test)}")
104+
105+
# Combine indices of constant features from both train and test sets
106+
constant_indices = np.union1d(constant_indices_train, constant_indices_test)
107+
108+
# Create a boolean mask for features to keep
109+
num_features = X_train.shape[1]
110+
keep_mask = np.ones(num_features, dtype=bool)
111+
keep_mask[constant_indices] = False
112+
113+
# Apply the mask to remove constant features
114+
if X_train.ndim == 3:
115+
X_train = X_train[:, keep_mask, :]
116+
X_test = X_test[:, keep_mask, :]
117+
X_test_orig = X_test_orig[:, keep_mask, :]
118+
else: # 2D array
119+
X_train = X_train[:, keep_mask]
120+
X_test = X_test[:, keep_mask]
121+
X_test_orig = X_test_orig[:, keep_mask]
122+
89123
if verbosity > 0:
90124
# Debug message: Shape after removing constant columns from X_train, X_test, X_test_orig
91125
print(f"Shape of X_train after removing constant columns: {X_train.shape}")
92126
print(f"Shape of X_test after removing constant columns: {X_test.shape}")
93127
print(f"Shape of X_test_orig after removing constant columns: {X_test_orig.shape}")
94-
128+
95129
# Return the modified X_train, X_test, and X_test_orig (no y_test_orig is returned here)
96130
return X_train, X_test, X_test_orig
97131

0 commit comments

Comments
 (0)