Skip to content

Commit 4e5adea

Browse files
committed
Initial implementation of prefetching data batches, with associated adjustments to the unit tests
1 parent bbcb5d8 commit 4e5adea

6 files changed

Lines changed: 63 additions & 26 deletions

File tree

notebooks/example_usage.ipynb

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,9 @@
4545
"\n",
4646
"if(clear_previous_outputs):\n",
4747
"\n",
48-
" shutil.rmtree('notebooks/new_project', ignore_errors=True)\n",
48+
" shutil.rmtree('new_project', ignore_errors=True)\n",
4949
"\n",
50-
" shutil.rmtree('notebooks/treatment_doc_extract', ignore_errors=True)"
50+
" shutil.rmtree('treatment_doc_extract', ignore_errors=True)"
5151
]
5252
},
5353
{
@@ -936,25 +936,34 @@
936936
"assert example_pat_res.shape == (5, 26)"
937937
]
938938
},
939+
{
940+
"cell_type": "code",
941+
"execution_count": null,
942+
"metadata": {},
943+
"outputs": [],
944+
"source": [
945+
"len(treatment_docs)"
946+
]
947+
},
939948
{
940949
"cell_type": "code",
941950
"execution_count": null,
942951
"metadata": {},
943952
"outputs": [],
944953
"source": [
945954
"treatment_docs = pd.read_csv('test_files/treatment_docs.csv')\n",
946-
"assert len(treatment_docs) == 6\n",
947-
"print(len(treatment_docs)==6)\n",
955+
"assert len(treatment_docs) == 29\n",
956+
"print(len(treatment_docs)==29)\n",
948957
"treatment_docs"
949958
]
950959
},
951960
{
952961
"cell_type": "code",
953-
"execution_count": 44,
962+
"execution_count": 45,
954963
"metadata": {},
955964
"outputs": [],
956965
"source": [
957-
"assert treatment_docs['basicobs_itemname_analysed'].iloc[5] == 'PFA-100'"
966+
"assert treatment_docs['basicobs_itemname_analysed'].iloc[21] == 'Luteinizing Hormone (LH)'"
958967
]
959968
},
960969
{
@@ -968,7 +977,7 @@
968977
},
969978
{
970979
"cell_type": "code",
971-
"execution_count": 46,
980+
"execution_count": 47,
972981
"metadata": {},
973982
"outputs": [],
974983
"source": [
@@ -988,7 +997,7 @@
988997
},
989998
{
990999
"cell_type": "code",
991-
"execution_count": 48,
1000+
"execution_count": 49,
9921001
"metadata": {},
9931002
"outputs": [],
9941003
"source": [

pat2vec/main_pat2vec.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@
77
from datetime import datetime
88
from multiprocessing import Pool
99
import pandas as pd
10+
from pat2vec.patvec_get_batch_methods.get_merged_batches import (
11+
get_merged_pat_batch_bloods,
12+
split_and_save_csv,
13+
)
1014
from pat2vec.util.cogstack_v8_lite import *
1115
from colorama import Back, Fore, Style
1216
from credentials import *
@@ -227,6 +231,25 @@ def __init__(
227231

228232
self.n_pat_lines = config_obj.n_pat_lines
229233

234+
if self.config_obj.prefetch_pat_batches:
235+
if self.config_obj.verbosity > 0:
236+
237+
if self.config_obj.main_options.get("bloods", True):
238+
239+
dfb = get_merged_pat_batch_bloods(
240+
client_idcode_list=self.all_patient_list,
241+
search_term=None,
242+
config_obj=self.config_obj,
243+
cohort_searcher_with_terms_and_search=self.cohort_searcher_with_terms_and_search,
244+
)
245+
246+
split_and_save_csv(
247+
df=dfb,
248+
client_idcode_column="client_idcode",
249+
save_folder=self.config_obj.pre_bloods_batch_path,
250+
num_processes=None,
251+
)
252+
230253
# ------------------------------------begin main----------------------------------
231254

232255
def pat_maker(self, i):

pat2vec/patvec_get_batch_methods/get_merged_batches.py

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import os
22
import pandas as pd
3+
from multiprocessing import Pool, cpu_count
4+
from functools import partial
35

46
from pat2vec.util.filter_methods import filter_dataframe_by_fuzzy_terms
57
from pat2vec.util.filter_methods import (
@@ -9,9 +11,6 @@
911
filter_dataframe_by_fuzzy_terms,
1012
)
1113

12-
import pandas as pd
13-
import os
14-
1514

1615
def verify_split_data_concatenated(original_df, client_idcode_column, save_folder):
1716
"""
@@ -79,12 +78,6 @@ def verify_split_data_individual(original_df, client_idcode_column, save_folder)
7978
print("Verification successful: All CSVs match the original DataFrame.")
8079

8180

82-
import pandas as pd
83-
import os
84-
from multiprocessing import Pool, cpu_count
85-
from functools import partial
86-
87-
8881
def save_group(client_idcode_group, save_folder):
8982
"""Helper function to save a single group to CSV."""
9083
client_idcode, group = client_idcode_group
@@ -129,7 +122,7 @@ def split_and_save_csv(df, client_idcode_column, save_folder, num_processes=None
129122
# split_and_save_csv(df, 'client_idcode', 'client_data', num_processes=4)
130123

131124

132-
def get_pat_batch_bloods(
125+
def get_merged_pat_batch_bloods(
133126
client_idcode_list,
134127
search_term,
135128
config_obj=None,
@@ -177,7 +170,10 @@ def get_pat_batch_bloods(
177170
bloods_time_field = config_obj.bloods_time_field
178171

179172
# Define the output directory using config_obj.proj_name
180-
input_directory = os.path.join(config_obj.proj_name, "merged_input_batches")
173+
input_directory = os.path.join(config_obj.proj_name, "merged_input_pat_batches")
174+
175+
input_directory = config_obj.pre_merged_input_batches_path
176+
181177
os.makedirs(input_directory, exist_ok=True) # Ensure the directory exists
182178

183179
# Define the path for the merged batches output
@@ -236,7 +232,8 @@ def get_pat_batch_bloods(
236232
# Save the merged DataFrame to the dynamically constructed directory
237233
if store_pat_batch_observations or overwrite_stored_pat_observations:
238234
batch_target.to_csv(merged_batches_path, index=False)
239-
print(f"Merged batches saved to {merged_batches_path}")
235+
if config_obj.verbosity >= 1:
236+
print(f"Merged batches saved to {merged_batches_path}")
240237

241238
return batch_target
242239

pat2vec/util/cogstack_v8_lite.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1188,7 +1188,7 @@ def iterative_multi_term_cohort_searcher_no_terms_fuzzy_mct(
11881188
fields_list=field_list,
11891189
term_name="client_idcode",
11901190
entered_list=generate_uuid_list(
1191-
random.randint(0, 10), random.choice(["P", "V"])
1191+
random.randint(2, 10), random.choice(["P", "V"])
11921192
),
11931193
search_string=search_string,
11941194
)
@@ -1456,7 +1456,7 @@ def iterative_multi_term_cohort_searcher_no_terms_fuzzy_textual_obs(
14561456
fields_list=field_list,
14571457
term_name="client_idcode",
14581458
entered_list=generate_uuid_list(
1459-
random.randint(0, 10), random.choice(["P", "V"])
1459+
random.randint(2, 10), random.choice(["P", "V"])
14601460
),
14611461
search_string=search_string,
14621462
)

pat2vec/util/config_pat2vec.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,7 @@ def __init__(
178178
client_idcode_term_name="client_idcode.keyword", # Used for elastic search index keyword search
179179
sanitize_pat_list=True,
180180
calculate_vectors=True,
181+
prefetch_pat_batches=True,
181182
):
182183

183184
# Configure logging
@@ -221,6 +222,8 @@ def __init__(
221222
# # Now you can use the logger to log messages within the class
222223
# self.logger.info("Initialized config_pat2vec")
223224

225+
self.prefetch_pat_batches = prefetch_pat_batches
226+
224227
self.calculate_vectors = calculate_vectors # Calculate vectors for each patient else just extract batches
225228

226229
self.sanitize_pat_list = (
@@ -554,6 +557,10 @@ def __init__(
554557
self.root_path, f"current_pat_appointments_batches{self.suffix}/"
555558
)
556559

560+
self.pre_merged_input_batches_path = os.path.join(
561+
self.root_path, f"merged_input_pat_batches{self.suffix}/"
562+
)
563+
557564
self.output_folder = "outputs"
558565

559566
self.PathsClass_instance = PathsClass(
@@ -723,10 +730,10 @@ def update_main_options(self):
723730
print("looking forward with ", self.time_window_interval_delta)
724731

725732
self.model_paths = {
726-
"aliencat": "/home/aliencat/samora/HFE/HFE/medcat_models/medcat_model_pack_316666b47dfaac07.zip",
727-
"dgx": "/data/AS/Samora/HFE/HFE/v18/medcat_models/20230328_trained_model_hfe_redone/medcat_model_pack_316666b47dfaac07",
728-
"dhcap": "/home/jovyan/work/medcat_models/medcat_model_pack_316666b47dfaac07.zip",
729-
"dhcap02": "/home/cogstack/samora/_data/medcat_models/medcat_model_pack_316666b47dfaac07.zip",
733+
"aliencat": "../medcat_model_pack_316666b47dfaac07.zip",
734+
"dgx": "../medcat_models/20230328_trained_model_hfe_redone/medcat_model_pack_316666b47dfaac07",
735+
"dhcap": "../medcat_model_pack_316666b47dfaac07.zip",
736+
"dhcap02": "../medcat_model_pack_316666b47dfaac07.zip",
730737
"override_medcat_model_path": None,
731738
}
732739

pat2vec/util/current_pat_batch_path_methods.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ def __init__(self, root_path, suffix, output_folder):
2828
f"current_pat_appointments_batches{self.suffix}/",
2929
f"current_pat_textual_obs_document_batches{self.suffix}/",
3030
f"current_pat_textual_obs_annotation_batches{self.suffix}/",
31+
f"merged_input_pat_batches{self.suffix}/",
3132
output_folder,
3233
]
3334

0 commit comments

Comments (0)