1+ """
2+ Dedicated tests for data integrity checks within the data pipeline.
3+ """
4+
5+ import unittest
6+ import pandas as pd
7+ import numpy as np
8+ import os
9+ import tempfile
10+ import shutil
11+ from pathlib import Path
12+
13+ from ml_grid .pipeline .data import pipe
14+ from ml_grid .util .global_params import global_parameters
15+
class TestDataIntegrity(unittest.TestCase):
    """Integrity-focused tests for the ``data.pipe`` preprocessing pipeline."""

    def setUp(self):
        """Write a synthetic CSV fixture and build parameter dicts in a temp dir."""
        self.project_root = Path(__file__).resolve().parents[1]
        self.test_dir = tempfile.mkdtemp()

        # Fixture CSV is recreated fresh for every test.
        self.test_data_path = Path(self.test_dir) / "integrity_test_data.csv"

        # One column is constant over *all* rows, so it must be dropped no
        # matter how the train/test split or any sampling strategy falls out.
        np.random.seed(42)
        n_samples = 200

        # NOTE: column order is deliberate — it fixes the order of the
        # np.random calls so the seeded stream matches run to run.
        frame = pd.DataFrame(
            {
                'feature1': np.random.rand(n_samples),
                # Every row holds the same value -> zero variance everywhere.
                'constant_in_train': [5] * n_samples,
                'highly_correlated': [x * 2.0 for x in range(n_samples)],
                'feature2': np.random.rand(n_samples),
                'also_highly_correlated': [
                    x * 2.0 + np.random.normal(0, 0.001) for x in range(n_samples)
                ],
                'outcome_var_1': np.random.randint(0, 2, n_samples),
            }
        )
        frame.to_csv(self.test_data_path, index=False)

        global_parameters.verbose = 0
        global_parameters.error_raise = True
        global_parameters.bayessearch = False

        self.base_local_param_dict = {
            'outcome_var_n': 1,
            'param_space_size': 'small',
            'scale': False,
            'feature_n': 100,
            'use_embedding': False,
            'percent_missing': 100,
            # High threshold so only the engineered near-duplicates trip it.
            'corr': 0.99,
            'test_size': 0.25,
            'resample': None,
            'random_state': 42,
            'data': {
                'feature1': True,
                'constant_in_train': True,
                'feature2': True,
                'highly_correlated': True,
                'also_highly_correlated': True,
            },
        }
        self.model_class_dict = {'LogisticRegression_class': True}

    def tearDown(self):
        """Remove the temporary working directory."""
        shutil.rmtree(self.test_dir)

    def _build_pipeline(self):
        """Construct a ``pipe`` instance over the fixture CSV with the base params."""
        return pipe(
            file_name=str(self.test_data_path),
            drop_term_list=[],
            experiment_dir=self.test_dir,
            base_project_dir=str(self.project_root),
            local_param_dict=self.base_local_param_dict,
            param_space_index=0,
            model_class_dict=self.model_class_dict,
        )

    def test_constant_in_train_removed_from_all_splits(self):
        """A column constant over the entire dataset must vanish from every split.

        Since all 200 rows share the same value, the column is constant in the
        training portion of any split and therefore has to be removed from
        X_train, X_test and X_test_orig alike.
        """
        result = self._build_pipeline()

        # Assemble diagnostics up front so a failure message explains itself.
        diagnostics = None
        if 'constant_in_train' in result.X_train.columns:
            diagnostics = (
                f"constant_in_train was not removed from X_train. "
                f"Nunique in X_train: {result.X_train['constant_in_train'].nunique()} , "
                f"Unique values: {result.X_train['constant_in_train'].unique()} , "
                f"All X_train columns: {result.X_train.columns.tolist()} "
            )

        # Constant everywhere (all values are 5) -> gone from every split.
        self.assertNotIn('constant_in_train', result.X_train.columns, diagnostics)
        self.assertNotIn('constant_in_train', result.X_test.columns)
        self.assertNotIn('constant_in_train', result.X_test_orig.columns)

        # Informative columns must survive the pruning.
        self.assertIn('feature1', result.X_train.columns)
        self.assertIn('feature2', result.X_train.columns)

    def test_highly_correlated_features_removed(self):
        """At most one of the two near-duplicate columns may survive.

        'highly_correlated' and 'also_highly_correlated' are constructed to
        correlate above the 0.99 threshold, so the pipeline should drop one.
        """
        result = self._build_pipeline()

        survivors = {'highly_correlated', 'also_highly_correlated'} & set(
            result.X_train.columns
        )

        self.assertLess(
            len(survivors),
            2,
            "Both highly correlated columns remained in the dataframe",
        )
131+
132+
# Allow running this module directly as a script.
if __name__ == "__main__":
    unittest.main()