Skip to content

Commit f87218b

Browse files
committed
minor fixes, additional tests
1 parent 927a7d5 commit f87218b

7 files changed

Lines changed: 279 additions & 39 deletions

ml_grid/pipeline/data_constant_columns.py

Lines changed: 8 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -99,15 +99,10 @@ def remove_constant_columns_with_debug(
9999
if verbosity > 0:
100100
print(f"Constant columns in X_train: {list(constant_columns_train)}")
101101

102-
test_variances = X_test.var(axis=0)
103-
if verbosity > 1:
104-
print(f"Variance of X_test columns:\n{test_variances}")
105-
106-
constant_columns_test = test_variances[test_variances == 0].index
107-
if verbosity > 0:
108-
print(f"Constant columns in X_test: {list(constant_columns_test)}")
109-
110-
constant_columns = constant_columns_train.union(constant_columns_test)
102+
# A column is constant if it has no variance in the training set.
103+
# We should not consider the test set variance, as a small test set
104+
# might misleadingly have constant columns.
105+
constant_columns = constant_columns_train
111106

112107
X_train = X_train.loc[:, ~X_train.columns.isin(constant_columns)]
113108
X_test = X_test.loc[:, ~X_test.columns.isin(constant_columns)]
@@ -127,14 +122,10 @@ def remove_constant_columns_with_debug(
127122
if verbosity > 0:
128123
print(f"Constant feature indices in X_train: {list(constant_indices_train)}")
129124

130-
test_variances = X_test.var(axis=var_axis)
131-
constant_indices_test = np.where(test_variances == 0)[0]
132-
if verbosity > 0:
133-
print(f"Constant feature indices in X_test: {list(constant_indices_test)}")
134-
135-
# Combine indices of constant features from both train and test sets
136-
constant_indices = np.union1d(constant_indices_train, constant_indices_test)
137-
125+
# A feature is constant if it has no variance in the training set.
126+
# We should not consider the test set variance, as a small test set
127+
# might misleadingly have constant features.
128+
constant_indices = constant_indices_train
138129
# Create a boolean mask for features to keep
139130
num_features = X_train.shape[1]
140131
keep_mask = np.ones(num_features, dtype=bool)

ml_grid/pipeline/data_feature_importance_methods.py

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -48,20 +48,17 @@ def handle_feature_importance_methods(
4848
feature_method = ml_grid_object.local_param_dict.get("feature_selection_method")
4949

5050
if feature_method == "anova" or feature_method is None:
51-
print("feature_method ANOVA")
52-
53-
features = feature_methods.getNfeaturesANOVAF(
54-
self, n=target_n_features, X_train=X_train, y_train=y_train
55-
)
51+
print("feature_method ANOVA")
52+
fm = feature_methods()
53+
features = fm.getNfeaturesANOVAF(n=target_n_features, X_train=X_train, y_train=y_train)
5654

5755
elif feature_method == "markov_blanket":
58-
print("feature method Markov")
59-
60-
features = feature_methods.getNFeaturesMarkovBlanket(
61-
self, n=target_n_features, X_train=X_train, y_train=y_train
62-
)
56+
print("feature method Markov")
57+
fm = feature_methods()
58+
features = fm.getNFeaturesMarkovBlanket(n=target_n_features, X_train=X_train, y_train=y_train)
6359

6460
print(f"target_n_features: {target_n_features}")
61+
print(f"Selected features: {features}")
6562

6663
X_train = X_train[features]
6764

ml_grid/pipeline/data_feature_methods.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,10 @@ def getNFeaturesMarkovBlanket(
114114
"X_train must be a pandas DataFrame for getNFeaturesMarkovBlanket."
115115
)
116116
original_columns = X_train.columns
117+
118+
# Ensure y_train is a pandas Series, as expected by PyImpetus internally
119+
if not isinstance(y_train, pd.Series):
120+
y_train = pd.Series(y_train)
117121

118122
# Initialize the PyImpetus object with desired parameters
119123
model = PPIMBC(model=SVC(random_state=27, class_weight="balanced", kernel=svc_kernel),
@@ -132,15 +136,23 @@ def getNFeaturesMarkovBlanket(
132136
model.fit(X_train.values, y_train)
133137

134138
# Get the feature indices from the Markov blanket (MB)
135-
feature_indices = model.MB
139+
selected_features = model.MB
136140

137-
# Map indices back to original column names and truncate by n
138-
feature_names = [original_columns[i] for i in feature_indices][:n]
141+
# PyImpetus can return column names (str) or indices (int).
142+
# We need to handle both cases to get the final list of feature names.
143+
if all(isinstance(f, int) for f in selected_features):
144+
# It returned indices, so map them to names
145+
feature_names = [original_columns[i] for i in selected_features][:n]
146+
else:
147+
# It returned names directly
148+
feature_names = list(selected_features)[:n]
139149

140150
# Fallback: If feature selection returns an empty list, but the model found features,
141151
# return the single most important one. This prevents pipeline failure.
142-
if not feature_names and feature_indices:
143-
feature_names = [original_columns[feature_indices[0]]]
152+
if not feature_names and selected_features:
153+
# Re-evaluate the first selected feature to ensure it's a valid name
154+
first_feature = selected_features[0]
155+
feature_names = [original_columns[first_feature] if isinstance(first_feature, int) else first_feature]
144156

145157
return feature_names
146158

tests/test_column_names.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
import unittest
2+
import pandas as pd
3+
from ml_grid.pipeline.column_names import get_pertubation_columns, filter_substring_list
4+
from ml_grid.util.global_params import global_parameters
5+
6+
class TestColumnNames(unittest.TestCase):
7+
8+
def setUp(self):
9+
"""Set up common variables for tests."""
10+
self.all_df_columns = [
11+
'age', 'male', 'bmi_val', 'census_A', 'blood_test_mean', # Corrected to bmi_ and census_
12+
'diag_order_num_diagnostic-order', 'drug_order_num_drug-order', 'annotation_1_count',
13+
'meta_sp_annotation_1_count_subject_present', 'annotation_mrc_1_count_mrc_cs',
14+
'meta_sp_annotation_mrc_1_count_subject_present_mrc_cs', 'core_02_feature',
15+
'bed_feature', 'vte_status_feature', 'hosp_site_A', # Corrected to hosp_site_
16+
'core_resus_feature', 'news_resus_feature', # Corrected to news_
17+
'date_time_stamp_2022', 'ConsultantCode_X', 'outcome_var_1',
18+
'some_col__index_level_0', 'Unnamed: 0'
19+
]
20+
self.drop_term_list = ['bad_term']
21+
# Mute verbose output for tests
22+
global_parameters.verbose = 0
23+
24+
def test_filter_substring_list(self):
25+
"""Test the filter_substring_list utility function."""
26+
string_list = ['test_mean', 'test_median', 'other_val', 'bmi_mean']
27+
substr_list = ['_mean', '_median']
28+
# 'bmi_mean' should be excluded by the function's logic
29+
expected = ['test_mean', 'test_median']
30+
result = filter_substring_list(string_list, substr_list)
31+
self.assertCountEqual(result, expected)
32+
33+
def test_get_pertubation_columns_selects_all(self):
34+
"""Test that all categories are selected when flags are True."""
35+
local_param_dict = {
36+
"outcome_var_n": 1,
37+
"data": {
38+
'age': True, 'sex': True, 'bmi': True, 'ethnicity': True,
39+
'bloods': True, 'diagnostic_order': True, 'drug_order': True,
40+
'annotation_n': True, 'meta_sp_annotation_n': True,
41+
'annotation_mrc_n': True, 'meta_sp_annotation_mrc_n': True,
42+
'core_02': True, 'bed': True, 'vte_status': True,
43+
'hosp_site': True, 'core_resus': True, 'news': True,
44+
'date_time_stamp': True, 'appointments': True
45+
}
46+
}
47+
pert_cols, _ = get_pertubation_columns(
48+
self.all_df_columns, local_param_dict, self.drop_term_list
49+
)
50+
# Expect all columns except outcome and special drop columns
51+
self.assertEqual(len(pert_cols), 19)
52+
53+
def test_get_pertubation_columns_selects_none(self):
54+
"""Test that no categories are selected when flags are False."""
55+
local_param_dict = {
56+
"outcome_var_n": 1,
57+
"data": {key: False for key in [
58+
'age', 'sex', 'bmi', 'ethnicity', 'bloods', 'diagnostic_order',
59+
'drug_order', 'annotation_n', 'meta_sp_annotation_n',
60+
'annotation_mrc_n', 'meta_sp_annotation_mrc_n', 'core_02',
61+
'bed', 'vte_status', 'hosp_site', 'core_resus', 'news',
62+
'date_time_stamp', 'appointments'
63+
]}
64+
}
65+
pert_cols, _ = get_pertubation_columns(
66+
self.all_df_columns, local_param_dict, self.drop_term_list
67+
)
68+
self.assertEqual(len(pert_cols), 0)
69+
70+
def test_drop_list_population(self):
71+
"""Test that the initial drop_list is populated correctly."""
72+
local_param_dict = {"outcome_var_n": 1, "data": {}}
73+
_, drop_list = get_pertubation_columns(
74+
self.all_df_columns, local_param_dict, self.drop_term_list
75+
)
76+
# Should contain '__index_level' and 'Unnamed:' columns
77+
self.assertIn('some_col__index_level_0', drop_list)
78+
self.assertIn('Unnamed: 0', drop_list)
79+
self.assertEqual(len(drop_list), 2)
80+
81+
if __name__ == '__main__':
82+
unittest.main()
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
import unittest
2+
import pandas as pd
3+
import numpy as np
4+
from ml_grid.pipeline.data_constant_columns import remove_constant_columns, remove_constant_columns_with_debug
5+
6+
class TestRemoveConstantColumns(unittest.TestCase):
7+
8+
def test_remove_constant_columns_with_constants(self):
9+
"""Test that constant columns are identified and added to the drop list."""
10+
df = pd.DataFrame({
11+
'a': [1, 2, 3],
12+
'b': [5, 5, 5],
13+
'c': ['x', 'y', 'z'],
14+
'd': [0, 0, 0]
15+
})
16+
initial_drop_list = ['e']
17+
updated_drop_list = remove_constant_columns(df, initial_drop_list.copy(), verbose=0)
18+
self.assertCountEqual(updated_drop_list, ['e', 'b', 'd'])
19+
20+
def test_remove_constant_columns_no_constants(self):
21+
"""Test that no columns are added when there are no constants."""
22+
df = pd.DataFrame({
23+
'a': [1, 2, 3],
24+
'b': [4, 5, 6]
25+
})
26+
updated_drop_list = remove_constant_columns(df, [], verbose=0)
27+
self.assertEqual(updated_drop_list, [])
28+
29+
def test_remove_constant_columns_empty_df(self):
30+
"""Test with an empty DataFrame."""
31+
df = pd.DataFrame()
32+
updated_drop_list = remove_constant_columns(df, [], verbose=0)
33+
self.assertEqual(updated_drop_list, [])
34+
35+
class TestRemoveConstantColumnsWithDebug(unittest.TestCase):
36+
37+
def test_pandas_2d_constant_in_train(self):
38+
"""Test with a constant column in the training DataFrame."""
39+
X_train = pd.DataFrame({'a': [1, 2, 3], 'b': [5, 5, 5]})
40+
X_test = pd.DataFrame({'a': [4, 5, 6], 'b': [7, 8, 9]})
41+
X_test_orig = X_test.copy()
42+
43+
train_out, test_out, orig_out = remove_constant_columns_with_debug(
44+
X_train, X_test, X_test_orig, verbosity=0
45+
)
46+
47+
self.assertNotIn('b', train_out.columns)
48+
self.assertNotIn('b', test_out.columns)
49+
self.assertNotIn('b', orig_out.columns)
50+
self.assertIn('a', train_out.columns)
51+
52+
def test_pandas_2d_constant_in_test(self):
53+
"""Test that a column constant only in the test set is NOT removed."""
54+
X_train = pd.DataFrame({'a': [1, 2, 3], 'b': [7, 8, 9]})
55+
X_test = pd.DataFrame({'a': [4, 5, 6], 'b': [5, 5, 5]})
56+
X_test_orig = X_test.copy()
57+
58+
train_out, test_out, orig_out = remove_constant_columns_with_debug(
59+
X_train, X_test, X_test_orig, verbosity=0
60+
)
61+
62+
# 'b' should NOT be removed as it has variance in the training set.
63+
self.assertIn('b', train_out.columns)
64+
self.assertIn('b', test_out.columns)
65+
self.assertIn('b', orig_out.columns)
66+
self.assertIn('a', train_out.columns)
67+
68+
def test_numpy_2d(self):
69+
"""Test with 2D numpy arrays."""
70+
X_train = np.array([[1, 5], [2, 5], [3, 5]])
71+
X_test = np.array([[4, 7], [5, 8], [6, 9]])
72+
X_test_orig = X_test.copy()
73+
74+
train_out, test_out, orig_out = remove_constant_columns_with_debug(
75+
X_train, X_test, X_test_orig, verbosity=0
76+
)
77+
78+
self.assertEqual(train_out.shape[1], 1)
79+
self.assertEqual(test_out.shape[1], 1)
80+
self.assertEqual(orig_out.shape[1], 1)
81+
self.assertTrue(np.array_equal(train_out, np.array([[1], [2], [3]])))
82+
83+
def test_numpy_3d_time_series(self):
84+
"""Test with 3D numpy arrays for time series data."""
85+
# Shape: (samples, features, timesteps)
86+
X_train = np.array([
87+
[[1, 1], [5, 5], [1, 1]], # Sample 1: Feature 1 varies, Feature 2 is constant
88+
[[2, 2], [5, 5], [2, 2]], # Sample 2
89+
])
90+
X_test = np.array([
91+
[[3, 3], [9, 9], [3, 3]],
92+
])
93+
X_test_orig = X_test.copy()
94+
95+
train_out, test_out, orig_out = remove_constant_columns_with_debug(
96+
X_train, X_test, X_test_orig, verbosity=0
97+
)
98+
99+
# Expecting feature 1 (index 0) and feature 3 (index 2) to be kept, feature 2 (index 1) to be dropped
100+
self.assertEqual(train_out.shape[1], 2)
101+
self.assertEqual(test_out.shape[1], 2)
102+
self.assertEqual(orig_out.shape[1], 2)
103+
104+
if __name__ == '__main__':
105+
unittest.main()

tests/test_markov_blanket_feature_reduction.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
import unittest
2-
from ml_grid.pipeline.data_feature_importance_methods import (
3-
feature_importance_methods,
4-
) # Replace 'your_module' with the actual module name where MyClass is defined
2+
import pandas as pd
3+
from ml_grid.pipeline.data_feature_methods import (
4+
feature_methods,
5+
)
56
from sklearn.datasets import make_classification
67
from sklearn.model_selection import train_test_split
78

@@ -15,14 +16,18 @@ def test_number_of_features(self):
1516

1617
# Split the data into training and testing sets
1718
X_train, _, y_train, _ = train_test_split(X, y, test_size=0.33, random_state=42)
19+
20+
# Convert numpy array to pandas DataFrame as the method expects it
21+
X_train = pd.DataFrame(X_train, columns=[f'feature_{i}' for i in range(X_train.shape[1])])
1822

19-
# Create an instance of MyClass
20-
my_instance = feature_importance_methods()
23+
# Create an instance of the correct class
24+
my_instance = feature_methods()
2125

22-
# Call the function to get the top 3 features
26+
# Call the function to get the top 5 features
2327
top_features = my_instance.getNFeaturesMarkovBlanket(5, X_train, y_train)
2428

25-
# Assert that the number of features returned is approximately 5
29+
# Assert that the number of features returned is reasonable.
30+
# The method may return fewer than n features.
2631
self.assertAlmostEqual(len(top_features), 3, delta=1)
2732

2833

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import unittest
2+
import pandas as pd
3+
from ml_grid.util.synthetic_data_generator import generate_time_series, columns
4+
5+
6+
class TestGenerateTimeSeries(unittest.TestCase):
7+
8+
def setUp(self):
9+
"""Set up a test DataFrame for all test methods."""
10+
self.num_clients = 5
11+
self.num_rows_per_client = 10
12+
self.df = generate_time_series(self.num_clients, self.num_rows_per_client)
13+
14+
def test_output_is_dataframe(self):
15+
"""Test that the output is a pandas DataFrame."""
16+
self.assertIsInstance(self.df, pd.DataFrame)
17+
18+
def test_dataframe_shape(self):
19+
"""Test the shape of the generated DataFrame."""
20+
expected_rows = self.num_clients * self.num_rows_per_client
21+
expected_cols = len(columns)
22+
self.assertEqual(self.df.shape, (expected_rows, expected_cols))
23+
24+
def test_number_of_unique_clients(self):
25+
"""Test that the number of unique clients is correct."""
26+
self.assertEqual(self.df['client_idcode'].nunique(), self.num_clients)
27+
28+
def test_timestamp_column_type(self):
29+
"""Test that the timestamp column has the correct data type."""
30+
self.assertTrue(pd.api.types.is_datetime64_any_dtype(self.df['timestamp']))
31+
32+
def test_sorting(self):
33+
"""Test that the DataFrame is sorted by client_idcode and timestamp."""
34+
# Check if each client's timestamps are sorted
35+
for client_id in self.df['client_idcode'].unique():
36+
client_df = self.df[self.df['client_idcode'] == client_id]
37+
self.assertTrue(client_df['timestamp'].is_monotonic_increasing)
38+
39+
def test_outcome_variable_is_binary(self):
40+
"""Test that the outcome variable is binary (0 or 1)."""
41+
outcome_col = 'outcome_var_1'
42+
self.assertIn(outcome_col, self.df.columns)
43+
unique_outcomes = self.df[outcome_col].unique()
44+
self.assertTrue(all(item in [0, 1] for item in unique_outcomes))
45+
46+
47+
if __name__ == '__main__':
48+
unittest.main()

0 commit comments

Comments
 (0)