@@ -15,6 +15,45 @@ def filter_substring_list(string, substr):
1515
1616
1717def get_pertubation_columns (all_df_columns , local_param_dict , drop_term_list ):
18+
19+ """Identifies and categorizes columns for perturbation and dropping based on predefined rules and
20+ local parameters.
21+
22+ This function processes a list of all DataFrame columns, categorizing them into various
23+ groups such as blood tests, diagnostic orders, drug orders, BMI, ethnicity, and
24+ different types of annotation counts. It also identifies columns to be dropped based
25+ on specific keywords and prefixes. The selection of columns for 'perturbation'
26+ is determined by flags within `local_param_dict`.
27+
28+ Args:
29+ all_df_columns (list): A list of all column names in the DataFrame.
30+ local_param_dict (dict): A dictionary containing local parameters, including
31+ 'outcome_var_n' and a 'data' sub-dictionary that specifies which column
32+ categories to include for perturbation (e.g., 'age', 'sex', 'bmi', 'bloods').
33+ drop_term_list (list): A list of strings. Any column name containing these
34+ strings (case-insensitive) will be added to the `drop_list`.
35+
36+ Returns:
37+ tuple: A tuple containing two lists:
38+ - pertubation_columns (list): A list of column names selected for perturbation
39+ based on the `local_param_dict` settings.
40+ - drop_list (list): A list of column names identified to be dropped from the
41+ DataFrame.
42+
43+ Raises:
44+ NameError: If `global_parameters` or `find_near_matches` or `filter_substring_list`
45+ or `plot_candidate_feature_category_lists` or `plot_dict_values` are not defined.
46+ These are assumed to be accessible in the global scope or imported.
47+
48+ Notes:
49+ - The function relies on several global or externally defined functions:
50+ `global_parameters`, `find_near_matches`, `filter_substring_list`,
51+ `plot_candidate_feature_category_lists`, and `plot_dict_values`.
52+ - Column categorization is based on hardcoded substrings and prefixes.
53+ - Overlapping columns are handled by removing elements from `bloods_list` if
54+ they appear in any other categorized list.
55+ - Verbose output and plotting are controlled by the `verbose` level from `global_parameters`.
56+ """
1857
1958 global_params = global_parameters
2059
@@ -170,6 +209,47 @@ def get_pertubation_columns(all_df_columns, local_param_dict, drop_term_list):
170209 date_time_stamp_list = list (
171210 filter (lambda k : "date_time_stamp" in k , all_df_columns )
172211 )
212+
213+ # Concatenate the four annotation count lists (meta_sp, not_meta_sp, meta_rp, not_meta_rp) into one list for the bloods overlap check below
214+ meta_sp_annotation_all_counts = (
215+ meta_sp_annotation_count_list +
216+ not_meta_sp_annotation_count_list +
217+ meta_rp_annotation_count_list +
218+ not_meta_rp_annotation_count_list
219+ )
220+ # Concatenate the four MRC annotation count lists into one list for the bloods overlap check below
221+ meta_sp_annotation_mrc_all_counts = (
222+ meta_sp_annotation_mrc_count_list +
223+ not_meta_sp_annotation_mrc_count_list +
224+ relative_meta_rp_annotation_mrc_count_list +
225+ not_relative_meta_rp_annotation_mrc_count_list
226+ )
227+
228+ # --- Post-Processing: Remove overlaps from bloods_list ---
229+ # Create a set of all columns in other categories
230+ all_other_categorized_cols = set ()
231+
232+ # Add all columns from other specific lists to this set
233+ all_other_categorized_cols .update (annotation_count_list )
234+ all_other_categorized_cols .update (meta_sp_annotation_all_counts ) # Use the combined list
235+ all_other_categorized_cols .update (diagnostic_order_list )
236+ all_other_categorized_cols .update (drug_order_list )
237+ all_other_categorized_cols .update (bmi_list )
238+ all_other_categorized_cols .update (ethnicity_list )
239+ all_other_categorized_cols .update (annotation_mrc_count_list )
240+ all_other_categorized_cols .update (meta_sp_annotation_mrc_all_counts ) # Use the combined list
241+ all_other_categorized_cols .update (core_02_list )
242+ all_other_categorized_cols .update (bed_list )
243+ all_other_categorized_cols .update (vte_status_list )
244+ all_other_categorized_cols .update (hosp_site_list )
245+ all_other_categorized_cols .update (core_resus_list )
246+ all_other_categorized_cols .update (news_list )
247+ all_other_categorized_cols .update (date_time_stamp_list )
248+ all_other_categorized_cols .update (appointments_list )
249+
250+ # Filter bloods_list: keep only the columns that do not appear in any other category, so that columns such as VTE status are not also treated as bloods.
251+ bloods_list = [col for col in bloods_list if col not in all_other_categorized_cols ]
252+
173253
174254 candidate_feature_category_lists = [
175255 meta_sp_annotation_count_list ,
0 commit comments