Skip to content

Commit 37178d4

Browse files
committed
Additional data-pipeline integrity test.
1 parent b4d1e95 commit 37178d4

1 file changed

Lines changed: 134 additions & 0 deletions

File tree

tests/test_data_integrity.py

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
"""
2+
Dedicated tests for data integrity checks within the data pipeline.
3+
"""
4+
5+
import unittest
6+
import pandas as pd
7+
import numpy as np
8+
import os
9+
import tempfile
10+
import shutil
11+
from pathlib import Path
12+
13+
from ml_grid.pipeline.data import pipe
14+
from ml_grid.util.global_params import global_parameters
15+
16+
class TestDataIntegrity(unittest.TestCase):
    """Data-integrity tests for the ``data.pipe`` pipeline class.

    Each test builds a small synthetic CSV containing deliberately
    pathological columns — one that is constant in every row and a pair
    of near-perfectly correlated series — and asserts that the pipeline
    strips them from the train/test splits.
    """

    def setUp(self):
        """Create a temporary workspace and write the synthetic CSV."""
        self.project_root = Path(__file__).resolve().parents[1]
        self.test_dir = tempfile.mkdtemp()

        # Location of the synthetic dataset used by every test below.
        self.test_data_path = Path(self.test_dir) / "integrity_test_data.csv"

        # Strategy: make one column constant over *all* rows so it must be
        # removed regardless of how the pipeline samples or splits the data.
        np.random.seed(42)
        n_rows = 200

        # Columns are produced in a fixed order so the RNG stream — and
        # therefore the generated dataset — is reproducible run-to-run.
        feature_one = np.random.rand(n_rows)
        constant_col = [5] * n_rows  # every one of the 200 rows is identical
        base_series = [i * 2.0 for i in range(n_rows)]
        feature_two = np.random.rand(n_rows)
        noisy_twin = [i * 2.0 + np.random.normal(0, 0.001) for i in range(n_rows)]
        outcome = np.random.randint(0, 2, n_rows)

        frame = pd.DataFrame(
            {
                'feature1': feature_one,
                'constant_in_train': constant_col,
                'highly_correlated': base_series,
                'feature2': feature_two,
                'also_highly_correlated': noisy_twin,
                'outcome_var_1': outcome,
            }
        )
        frame.to_csv(self.test_data_path, index=False)

        # Keep the pipeline quiet but strict: raise on internal errors
        # rather than swallowing them, and use grid (not Bayesian) search.
        global_parameters.verbose = 0
        global_parameters.error_raise = True
        global_parameters.bayessearch = False

        self.base_local_param_dict = {
            'outcome_var_n': 1,
            'param_space_size': 'small',
            'scale': False,
            'feature_n': 100,
            'use_embedding': False,
            'percent_missing': 100,
            # High threshold so only the engineered near-duplicate pair
            # trips the correlation filter.
            'corr': 0.99,
            'test_size': 0.25,
            'resample': None,
            'random_state': 42,
            'data': {
                'feature1': True,
                'constant_in_train': True,
                'feature2': True,
                'highly_correlated': True,
                'also_highly_correlated': True,
            },
        }
        self.model_class_dict = {'LogisticRegression_class': True}

    def tearDown(self):
        """Remove the temporary workspace created in setUp."""
        shutil.rmtree(self.test_dir)

    def _build_pipeline(self):
        """Construct a ``pipe`` instance over the synthetic CSV.

        Shared by both tests so the pipeline is always configured the
        same way; returns the fully initialised pipeline object.
        """
        return pipe(
            file_name=str(self.test_data_path),
            drop_term_list=[],
            experiment_dir=self.test_dir,
            base_project_dir=str(self.project_root),
            local_param_dict=self.base_local_param_dict,
            param_space_index=0,
            model_class_dict=self.model_class_dict,
        )

    def test_constant_in_train_removed_from_all_splits(self):
        """A column constant across the whole dataset must vanish everywhere.

        If every single row holds the same value, the column is constant
        in any subset, so it must be dropped from the training set and
        consequently from every derived split.
        """
        pipeline = self._build_pipeline()

        # Build a diagnostic message only when the column survived, so a
        # failure reports what the pipeline actually kept.
        fail_msg = None
        if 'constant_in_train' in pipeline.X_train.columns:
            n_unique = pipeline.X_train['constant_in_train'].nunique()
            unique_vals = pipeline.X_train['constant_in_train'].unique()
            fail_msg = (f"constant_in_train was not removed from X_train. "
                        f"Nunique in X_train: {n_unique}, "
                        f"Unique values: {unique_vals}, "
                        f"All X_train columns: {pipeline.X_train.columns.tolist()}")

        # The constant column must be absent from every split.
        self.assertNotIn('constant_in_train', pipeline.X_train.columns, fail_msg)
        self.assertNotIn('constant_in_train', pipeline.X_test.columns)
        self.assertNotIn('constant_in_train', pipeline.X_test_orig.columns)

        # Informative columns must survive the cleaning untouched.
        self.assertIn('feature1', pipeline.X_train.columns)
        self.assertIn('feature2', pipeline.X_train.columns)

    def test_highly_correlated_features_removed(self):
        """At most one of a near-perfectly correlated column pair may remain.

        'highly_correlated' and 'also_highly_correlated' are constructed
        to correlate above the 0.99 threshold, so the filter should drop
        one of them (either one is acceptable).
        """
        pipeline = self._build_pipeline()

        survivors = {'highly_correlated', 'also_highly_correlated'}.intersection(
            pipeline.X_train.columns
        )
        self.assertLess(len(survivors), 2,
                        "Both highly correlated columns remained in the dataframe")
131+
132+
133+
# Allow the module to be executed directly as a script.
if __name__ == '__main__':
    unittest.main()

0 commit comments

Comments
 (0)