Skip to content

Commit 4e5adea

Browse files
committed
Initial implementation of prefetching data batches, with associated adjustments to the unit tests
1 parent bbcb5d8 commit 4e5adea

6 files changed

Lines changed: 63 additions & 26 deletions

File tree

notebooks/example_usage.ipynb

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,9 @@
4545
"\n",
4646
"if(clear_previous_outputs):\n",
4747
"\n",
48-
" shutil.rmtree('notebooks/new_project', ignore_errors=True)\n",
48+
" shutil.rmtree('new_project', ignore_errors=True)\n",
4949
"\n",
50-
" shutil.rmtree('notebooks/treatment_doc_extract', ignore_errors=True)"
50+
" shutil.rmtree('treatment_doc_extract', ignore_errors=True)"
5151
]
5252
},
5353
{
@@ -936,25 +936,34 @@
936936
"assert example_pat_res.shape == (5, 26)"
937937
]
938938
},
939+
{
940+
"cell_type": "code",
941+
"execution_count": null,
942+
"metadata": {},
943+
"outputs": [],
944+
"source": [
945+
"len(treatment_docs)"
946+
]
947+
},
939948
{
940949
"cell_type": "code",
941950
"execution_count": null,
942951
"metadata": {},
943952
"outputs": [],
944953
"source": [
945954
"treatment_docs = pd.read_csv('test_files/treatment_docs.csv')\n",
946-
"assert len(treatment_docs) == 6\n",
947-
"print(len(treatment_docs)==6)\n",
955+
"assert len(treatment_docs) == 29\n",
956+
"print(len(treatment_docs)==29)\n",
948957
"treatment_docs"
949958
]
950959
},
951960
{
952961
"cell_type": "code",
953-
"execution_count": 44,
962+
"execution_count": 45,
954963
"metadata": {},
955964
"outputs": [],
956965
"source": [
957-
"assert treatment_docs['basicobs_itemname_analysed'].iloc[5] == 'PFA-100'"
966+
"assert treatment_docs['basicobs_itemname_analysed'].iloc[21] == 'Luteinizing Hormone (LH)'"
958967
]
959968
},
960969
{
@@ -968,7 +977,7 @@
968977
},
969978
{
970979
"cell_type": "code",
971-
"execution_count": 46,
980+
"execution_count": 47,
972981
"metadata": {},
973982
"outputs": [],
974983
"source": [
@@ -988,7 +997,7 @@
988997
},
989998
{
990999
"cell_type": "code",
991-
"execution_count": 48,
1000+
"execution_count": 49,
9921001
"metadata": {},
9931002
"outputs": [],
9941003
"source": [

pat2vec/main_pat2vec.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@
77
from datetime import datetime
88
from multiprocessing import Pool
99
import pandas as pd
10+
from pat2vec.patvec_get_batch_methods.get_merged_batches import (
11+
get_merged_pat_batch_bloods,
12+
split_and_save_csv,
13+
)
1014
from pat2vec.util.cogstack_v8_lite import *
1115
from colorama import Back, Fore, Style
1216
from credentials import *
@@ -227,6 +231,25 @@ def __init__(
227231

228232
self.n_pat_lines = config_obj.n_pat_lines
229233

234+
if self.config_obj.prefetch_pat_batches:
235+
if self.config_obj.verbosity > 0:
236+
237+
if self.config_obj.main_options.get("bloods", True):
238+
239+
dfb = get_merged_pat_batch_bloods(
240+
client_idcode_list=self.all_patient_list,
241+
search_term=None,
242+
config_obj=self.config_obj,
243+
cohort_searcher_with_terms_and_search=self.cohort_searcher_with_terms_and_search,
244+
)
245+
246+
split_and_save_csv(
247+
df=dfb,
248+
client_idcode_column="client_idcode",
249+
save_folder=self.config_obj.pre_bloods_batch_path,
250+
num_processes=None,
251+
)
252+
230253
# ------------------------------------begin main----------------------------------
231254

232255
def pat_maker(self, i):

pat2vec/patvec_get_batch_methods/get_merged_batches.py

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import os
22
import pandas as pd
3+
from multiprocessing import Pool, cpu_count
4+
from functools import partial
35

46
from pat2vec.util.filter_methods import filter_dataframe_by_fuzzy_terms
57
from pat2vec.util.filter_methods import (
@@ -9,9 +11,6 @@
911
filter_dataframe_by_fuzzy_terms,
1012
)
1113

12-
import pandas as pd
13-
import os
14-
1514

1615
def verify_split_data_concatenated(original_df, client_idcode_column, save_folder):
1716
"""
@@ -79,12 +78,6 @@ def verify_split_data_individual(original_df, client_idcode_column, save_folder)
7978
print("Verification successful: All CSVs match the original DataFrame.")
8079

8180

82-
import pandas as pd
83-
import os
84-
from multiprocessing import Pool, cpu_count
85-
from functools import partial
86-
87-
8881
def save_group(client_idcode_group, save_folder):
8982
"""Helper function to save a single group to CSV."""
9083
client_idcode, group = client_idcode_group
@@ -129,7 +122,7 @@ def split_and_save_csv(df, client_idcode_column, save_folder, num_processes=None
129122
# split_and_save_csv(df, 'client_idcode', 'client_data', num_processes=4)
130123

131124

132-
def get_pat_batch_bloods(
125+
def get_merged_pat_batch_bloods(
133126
client_idcode_list,
134127
search_term,
135128
config_obj=None,
@@ -177,7 +170,10 @@ def get_pat_batch_bloods(
177170
bloods_time_field = config_obj.bloods_time_field
178171

179172
# Define the output directory using config_obj.proj_name
180-
input_directory = os.path.join(config_obj.proj_name, "merged_input_batches")
173+
input_directory = os.path.join(config_obj.proj_name, "merged_input_pat_batches")
174+
175+
input_directory = config_obj.pre_merged_input_batches_path
176+
181177
os.makedirs(input_directory, exist_ok=True) # Ensure the directory exists
182178

183179
# Define the path for the merged batches output
@@ -236,7 +232,8 @@ def get_pat_batch_bloods(
236232
# Save the merged DataFrame to the dynamically constructed directory
237233
if store_pat_batch_observations or overwrite_stored_pat_observations:
238234
batch_target.to_csv(merged_batches_path, index=False)
239-
print(f"Merged batches saved to {merged_batches_path}")
235+
if config_obj.verbosity >= 1:
236+
print(f"Merged batches saved to {merged_batches_path}")
240237

241238
return batch_target
242239

pat2vec/util/cogstack_v8_lite.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1188,7 +1188,7 @@ def iterative_multi_term_cohort_searcher_no_terms_fuzzy_mct(
11881188
fields_list=field_list,
11891189
term_name="client_idcode",
11901190
entered_list=generate_uuid_list(
1191-
random.randint(0, 10), random.choice(["P", "V"])
1191+
random.randint(2, 10), random.choice(["P", "V"])
11921192
),
11931193
search_string=search_string,
11941194
)
@@ -1456,7 +1456,7 @@ def iterative_multi_term_cohort_searcher_no_terms_fuzzy_textual_obs(
14561456
fields_list=field_list,
14571457
term_name="client_idcode",
14581458
entered_list=generate_uuid_list(
1459-
random.randint(0, 10), random.choice(["P", "V"])
1459+
random.randint(2, 10), random.choice(["P", "V"])
14601460
),
14611461
search_string=search_string,
14621462
)

pat2vec/util/config_pat2vec.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,7 @@ def __init__(
178178
client_idcode_term_name="client_idcode.keyword", # Used for elastic search index keyword search
179179
sanitize_pat_list=True,
180180
calculate_vectors=True,
181+
prefetch_pat_batches=True,
181182
):
182183

183184
# Configure logging
@@ -221,6 +222,8 @@ def __init__(
221222
# # Now you can use the logger to log messages within the class
222223
# self.logger.info("Initialized config_pat2vec")
223224

225+
self.prefetch_pat_batches = prefetch_pat_batches
226+
224227
self.calculate_vectors = calculate_vectors # Calculate vectors for each patient else just extract batches
225228

226229
self.sanitize_pat_list = (
@@ -554,6 +557,10 @@ def __init__(
554557
self.root_path, f"current_pat_appointments_batches{self.suffix}/"
555558
)
556559

560+
self.pre_merged_input_batches_path = os.path.join(
561+
self.root_path, f"merged_input_pat_batches{self.suffix}/"
562+
)
563+
557564
self.output_folder = "outputs"
558565

559566
self.PathsClass_instance = PathsClass(
@@ -723,10 +730,10 @@ def update_main_options(self):
723730
print("looking forward with ", self.time_window_interval_delta)
724731

725732
self.model_paths = {
726-
"aliencat": "/home/aliencat/samora/HFE/HFE/medcat_models/medcat_model_pack_316666b47dfaac07.zip",
727-
"dgx": "/data/AS/Samora/HFE/HFE/v18/medcat_models/20230328_trained_model_hfe_redone/medcat_model_pack_316666b47dfaac07",
728-
"dhcap": "/home/jovyan/work/medcat_models/medcat_model_pack_316666b47dfaac07.zip",
729-
"dhcap02": "/home/cogstack/samora/_data/medcat_models/medcat_model_pack_316666b47dfaac07.zip",
733+
"aliencat": "../medcat_model_pack_316666b47dfaac07.zip",
734+
"dgx": "../medcat_models/20230328_trained_model_hfe_redone/medcat_model_pack_316666b47dfaac07",
735+
"dhcap": "../medcat_model_pack_316666b47dfaac07.zip",
736+
"dhcap02": "../medcat_model_pack_316666b47dfaac07.zip",
730737
"override_medcat_model_path": None,
731738
}
732739

pat2vec/util/current_pat_batch_path_methods.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ def __init__(self, root_path, suffix, output_folder):
2828
f"current_pat_appointments_batches{self.suffix}/",
2929
f"current_pat_textual_obs_document_batches{self.suffix}/",
3030
f"current_pat_textual_obs_annotation_batches{self.suffix}/",
31+
f"merged_input_pat_batches{self.suffix}/",
3132
output_folder,
3233
]
3334

0 commit comments

Comments (0)