Skip to content

Commit b35727b

Browse files
committed
Patched blood features miscategorization bug. Added docstring.
1 parent 0fd1079 commit b35727b

1 file changed

Lines changed: 80 additions & 0 deletions

File tree

ml_grid/pipeline/column_names.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,45 @@ def filter_substring_list(string, substr):
1515

1616

1717
def get_pertubation_columns(all_df_columns, local_param_dict, drop_term_list):
18+
19+
"""Identifies and categorizes columns for perturbation and dropping based on predefined rules and
20+
local parameters.
21+
22+
This function processes a list of all DataFrame columns, categorizing them into various
23+
groups such as blood tests, diagnostic orders, drug orders, BMI, ethnicity, and
24+
different types of annotation counts. It also identifies columns to be dropped based
25+
on specific keywords and prefixes. The selection of columns for 'perturbation'
26+
is determined by flags within `local_param_dict`.
27+
28+
Args:
29+
all_df_columns (list): A list of all column names in the DataFrame.
30+
local_param_dict (dict): A dictionary containing local parameters, including
31+
'outcome_var_n' and a 'data' sub-dictionary that specifies which column
32+
categories to include for perturbation (e.g., 'age', 'sex', 'bmi', 'bloods').
33+
drop_term_list (list): A list of strings. Any column name containing these
34+
strings (case-insensitive) will be added to the `drop_list`.
35+
36+
Returns:
37+
tuple: A tuple containing two lists:
38+
- pertubation_columns (list): A list of column names selected for perturbation
39+
based on the `local_param_dict` settings.
40+
- drop_list (list): A list of column names identified to be dropped from the
41+
DataFrame.
42+
43+
Raises:
44+
NameError: If `global_parameters` or `find_near_matches` or `filter_substring_list`
45+
or `plot_candidate_feature_category_lists` or `plot_dict_values` are not defined.
46+
These are assumed to be accessible in the global scope or imported.
47+
48+
Notes:
49+
- The function relies on several global or externally defined functions:
50+
`global_parameters`, `find_near_matches`, `filter_substring_list`,
51+
`plot_candidate_feature_category_lists`, and `plot_dict_values`.
52+
- Column categorization is based on hardcoded substrings and prefixes.
53+
- Overlapping columns are handled by removing elements from `bloods_list` if
54+
they appear in any other categorized list.
55+
- Verbose output and plotting are controlled by the `verbose` level from `global_parameters`.
56+
"""
1857

1958
global_params = global_parameters
2059

@@ -170,6 +209,47 @@ def get_pertubation_columns(all_df_columns, local_param_dict, drop_term_list):
170209
date_time_stamp_list = list(
171210
filter(lambda k: "date_time_stamp" in k, all_df_columns)
172211
)
212+
213+
# Combine the meta/non-meta annotation count lists into one list for the bloods_list overlap check below
214+
meta_sp_annotation_all_counts = (
215+
meta_sp_annotation_count_list +
216+
not_meta_sp_annotation_count_list +
217+
meta_rp_annotation_count_list +
218+
not_meta_rp_annotation_count_list
219+
)
220+
# Combine the meta/non-meta MRC annotation count lists into one list for the bloods_list overlap check below
221+
meta_sp_annotation_mrc_all_counts = (
222+
meta_sp_annotation_mrc_count_list +
223+
not_meta_sp_annotation_mrc_count_list +
224+
relative_meta_rp_annotation_mrc_count_list +
225+
not_relative_meta_rp_annotation_mrc_count_list
226+
)
227+
228+
# --- Post-Processing: Remove overlaps from bloods_list ---
229+
# Create a set of all columns in other categories
230+
all_other_categorized_cols = set()
231+
232+
# Add all columns from other specific lists to this set
233+
all_other_categorized_cols.update(annotation_count_list)
234+
all_other_categorized_cols.update(meta_sp_annotation_all_counts) # Use the combined list
235+
all_other_categorized_cols.update(diagnostic_order_list)
236+
all_other_categorized_cols.update(drug_order_list)
237+
all_other_categorized_cols.update(bmi_list)
238+
all_other_categorized_cols.update(ethnicity_list)
239+
all_other_categorized_cols.update(annotation_mrc_count_list)
240+
all_other_categorized_cols.update(meta_sp_annotation_mrc_all_counts) # Use the combined list
241+
all_other_categorized_cols.update(core_02_list)
242+
all_other_categorized_cols.update(bed_list)
243+
all_other_categorized_cols.update(vte_status_list)
244+
all_other_categorized_cols.update(hosp_site_list)
245+
all_other_categorized_cols.update(core_resus_list)
246+
all_other_categorized_cols.update(news_list)
247+
all_other_categorized_cols.update(date_time_stamp_list)
248+
all_other_categorized_cols.update(appointments_list)
249+
250+
# Filter bloods_list: keep only columns NOT found in any other category, so that columns such as VTE status are not also categorized as bloods.
251+
bloods_list = [col for col in bloods_list if col not in all_other_categorized_cols]
252+
173253

174254
candidate_feature_category_lists = [
175255
meta_sp_annotation_count_list,

0 commit comments

Comments
 (0)