Skip to content

Commit d739811

Browse files
committed
prefetch batch > 10000 size fix
1 parent b640c59 commit d739811

2 files changed

Lines changed: 17 additions & 22 deletions

File tree

pat2vec/patvec_get_batch_methods/get_merged_batches.py

Lines changed: 0 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -368,10 +368,6 @@ def get_merged_pat_batch_drugs(
368368
return pd.DataFrame() # Return an empty DataFrame in case of error
369369

370370

371-
import os
372-
import pandas as pd
373-
374-
375371
def get_merged_pat_batch_diagnostics(
376372
client_idcode_list,
377373
config_obj=None,
@@ -571,11 +567,6 @@ def get_merged_pat_batch_mct_docs(
571567
# Apply data type filters for MCT documents
572568
batch_target = apply_data_type_mct_docs_filters(config_obj, batch_target)
573569

574-
# larger batches returned as lists...
575-
if isinstance(batch_target, list) and len(batch_target) >= 1:
576-
577-
batch_target = batch_target[0]
578-
579570
# Drop rows with NaN values in critical columns
580571
col_list_drop_nan = [
581572
"observation_valuetext_analysed",
@@ -717,10 +708,6 @@ def get_merged_pat_batch_epr_docs(
717708
text_column="body_analysed",
718709
debug=config_obj.verbosity > 5,
719710
)
720-
# larger batches returned as lists...
721-
if isinstance(batch_target, list) and len(batch_target) >= 1:
722-
723-
batch_target = batch_target[0]
724711

725712
# Drop rows with NaN values in critical columns
726713
col_list_drop_nan = ["body_analysed", "updatetime", "client_idcode"]
@@ -850,11 +837,6 @@ def get_merged_pat_batch_textual_obs_docs(
850837
search_string=f"{bloods_time_field}:[{global_start_year}-{global_start_month}-{global_start_day} TO {global_end_year}-{global_end_month}-{global_end_day}]",
851838
)
852839

853-
# larger batches returned as lists...
854-
if isinstance(batch_target, list) and len(batch_target) >= 1:
855-
856-
batch_target = batch_target[0]
857-
858840
# Drop rows with no textualObs
859841
batch_target = batch_target.dropna(subset=["textualObs"])
860842
# Drop rows with empty string in textualObs
@@ -1429,9 +1411,6 @@ def get_merged_pat_batch_reports(
14291411
f"updatetime:[{global_start_year}-{global_start_month}-{global_start_day} TO {global_end_year}-{global_end_month}-{global_end_day}]",
14301412
)
14311413

1432-
if isinstance(batch_target, list) and len(batch_target) >= 1:
1433-
1434-
batch_target = batch_target[0]
14351414
# Combine textualObs and basicobs_value_analysed into body_analysed
14361415
batch_target["body_analysed"] = (
14371416
batch_target["textualObs"].astype(str)

pat2vec/util/cogstack_v8_lite.py

Lines changed: 17 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -96,7 +96,7 @@ def create_credentials_file():
9696
hosts = ["https://your-actual-elasticsearch-host:9200"]
9797

9898

99-
print(f"Imported cogstack_v8_lite from pat2vec.util")
99+
print(f"Imported cogstack_v8_lite from pat2vec.util .")
100100
print(f"Username: %s" % username)
101101

102102

@@ -258,10 +258,17 @@ def list_chunker(entered_list):
258258
return chunks
259259

260260

261+
# Use a generator to yield DataFrames one by one
262+
def dataframe_generator(list_of_dfs):
263+
for df in list_of_dfs:
264+
yield df
265+
266+
261267
def cohort_searcher_with_terms_and_search(
262268
index_name, fields_list, term_name, entered_list, search_string
263269
):
264270
if len(entered_list) >= 10000:
271+
print("cohort_searcher_with_terms_and_search list chunking")
265272
results = []
266273
chunked_list = list_chunker(entered_list)
267274
for mini_list in chunked_list:
@@ -284,9 +291,18 @@ def cohort_searcher_with_terms_and_search(
284291
merged_df = [df.set_index("_id") for df in results]
285292

286293
except Exception as e:
294+
print(e)
295+
raise e
287296
print(e)
288297
return results
289298

299+
try:
300+
# Concatenate DataFrames using the generator
301+
merged_df = pd.concat(dataframe_generator(results), ignore_index=True)
302+
merged_df = merged_df.set_index("_id")
303+
except Exception as e:
304+
raise e
305+
290306
return merged_df
291307
else:
292308
query = {

0 commit comments

Comments
 (0)