Skip to content

Commit 8656cd4

Browse files
author
SamoraHunter
committed
Pass cached input df to avoid reloading each trial.
1 parent 2d73cf3 commit 8656cd4

2 files changed

Lines changed: 39 additions & 18 deletions

File tree

ml_grid/pipeline/data.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ def __init__(
193193
time_series_mode: bool = False,
194194
model_class_dict: Optional[Dict[str, bool]] = None,
195195
outcome_var_override: Optional[str] = None,
196+
input_df: Optional[pd.DataFrame] = None,
196197
):
197198
"""Initializes the data pipeline object.
198199
@@ -224,6 +225,8 @@ def __init__(
224225
outcome_var_override (Optional[str], optional): A specific outcome
225226
variable name to use, overriding the one from `local_param_dict`.
226227
Defaults to None.
228+
input_df (Optional[pd.DataFrame], optional): A pre-loaded DataFrame to use
229+
instead of reading from file_name. Defaults to None.
227230
"""
228231

229232
self.additional_naming = additional_naming
@@ -249,7 +252,18 @@ def __init__(
249252

250253
pipeline_error = None
251254
try:
252-
self._load_data(file_name, test_sample_n, column_sample_n)
255+
if input_df is not None:
256+
self.df = input_df.copy()
257+
self.all_df_columns = list(self.df.columns)
258+
self.orignal_feature_names = self.all_df_columns.copy()
259+
self._log_feature_transformation(
260+
"Initial Load",
261+
len(self.all_df_columns),
262+
len(self.all_df_columns),
263+
"Initial data loaded from passed DataFrame.",
264+
)
265+
else:
266+
self._load_data(file_name, test_sample_n, column_sample_n)
253267
self._initial_feature_selection(
254268
local_param_dict, drop_term_list, outcome_var_override
255269
)

notebooks/unit_test_synthetic.ipynb

Lines changed: 24 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -992,15 +992,18 @@
992992
" # Retrieve trial timeout, defaulting to None if not set\n",
993993
" trial_timeout = config['hyperopt_settings'].get('trial_timeout', None)\n",
994994
"\n",
995-
" # 5. --- Determine Outcome Variables ---\n",
995+
" # 5. --- Determine Outcome Variables & PRE-LOAD DATA ---\n",
996+
" data_file_path = config['data']['file_path']\n",
997+
" if not Path(data_file_path).is_absolute():\n",
998+
" data_file_path = project_root / data_file_path\n",
999+
"\n",
1000+
" # OPTIMIZATION: Load data ONCE here\n",
1001+
" print(f\"Pre-loading data from {data_file_path}...\", flush=True)\n",
1002+
" GLOBAL_DF = pd.read_csv(data_file_path)\n",
1003+
"\n",
9961004
" outcome_var_list = []\n",
9971005
" if config['data'].get('multiple_outcomes', False):\n",
998-
" data_file_path = config['data']['file_path']\n",
999-
" if not Path(data_file_path).is_absolute():\n",
1000-
" data_file_path = project_root / data_file_path\n",
1001-
"\n",
1002-
" df = pd.read_csv(data_file_path)\n",
1003-
" outcome_var_list = [col for col in df.columns if 'outcome_var_' in col]\n",
1006+
" outcome_var_list = [col for col in GLOBAL_DF.columns if 'outcome_var_' in col]\n",
10041007
"\n",
10051008
" if not outcome_var_list:\n",
10061009
" raise ValueError(f\"No outcome variables found with 'outcome_var_' prefix in {data_file_path}\")\n",
@@ -1009,26 +1012,27 @@
10091012
" outcome_var_list = config['hyperopt_search_space']['outcome_var_n']\n",
10101013
"\n",
10111014
" # 6. --- Define the Objective Function for Hyperopt ---\n",
1012-
" def objective(params, outcome_var):\n",
1015+
" def objective(params, outcome_var, loaded_df):\n",
10131016
" \"\"\"\n",
1014-
" Objective function for hyperopt. It receives sampled parameters\n",
1015-
" and the specific outcome variable for the current run.\n",
1017+
" Objective function for hyperopt. It receives sampled parameters,\n",
1018+
" the specific outcome variable, and the PRE-LOADED DataFrame.\n",
10161019
" \"\"\"\n",
10171020
" try:\n",
10181021
" # Wrap the entire trial execution in the time_limit context manager\n",
10191022
" with time_limit(trial_timeout):\n",
10201023
" local_param_dict = params\n",
10211024
"\n",
1022-
" # Initialize the data pipeline\n",
1025+
" # Initialize the data pipeline using the cached DataFrame\n",
10231026
" ml_grid_object = pipe(\n",
1024-
" file_name=config['data']['file_path'],\n",
1027+
" file_name=None, # Not needed when input_df is provided\n",
10251028
" drop_term_list=config['data']['drop_term_list'],\n",
10261029
" model_class_dict=config['models'],\n",
10271030
" local_param_dict=local_param_dict,\n",
10281031
" base_project_dir=project_root,\n",
10291032
" experiment_dir=experiment_dir,\n",
10301033
" param_space_index=0, \n",
1031-
" outcome_var_override=outcome_var\n",
1034+
" outcome_var_override=outcome_var,\n",
1035+
" input_df=loaded_df # <--- PASS CACHED DATA\n",
10321036
" )\n",
10331037
"\n",
10341038
" # Execute the models\n",
@@ -1052,7 +1056,9 @@
10521056
" start_time = datetime.now()\n",
10531057
" print(f\"[{start_time}] Starting optimization for outcome: {outcome_var}\", flush=True)\n",
10541058
"\n",
1055-
" fmin_objective = partial(objective, outcome_var=outcome_var)\n",
1059+
" # Pass the global dataframe to the objective function via partial\n",
1060+
" # Joblib will handle the serialization/shared memory of GLOBAL_DF efficiently\n",
1061+
" fmin_objective = partial(objective, outcome_var=outcome_var, loaded_df=GLOBAL_DF)\n",
10561062
"\n",
10571063
" trials = Trials()\n",
10581064
" best = fmin(\n",
@@ -1068,8 +1074,8 @@
10681074
" failed_trials = [t for t in trials.results if t['status'] == 'fail']\n",
10691075
"\n",
10701076
" print(f\"[{end_time}] Finished {outcome_var} (Duration: {end_time - start_time})\", flush=True)\n",
1071-
" print(f\" -> Best param set for this outcome: {best}\", flush=True)\n",
1072-
" print(f\" -> Trials summary: {len(failed_trials)}/{len(trials.results)} failed.\", flush=True)\n",
1077+
"        print(f\"   -> Best param set for this outcome: {best}\", flush=True)\n",
1078
"        print(f\"   -> Trials summary: {len(failed_trials)}/{len(trials.results)} failed.\", flush=True)\n",
10731079
"\n",
10741080
" return (outcome_var, best)\n",
10751081
"\n",
@@ -1098,7 +1104,8 @@
10981104
" print(f\" Best parameter combination found: {best_params}\")\n",
10991105
"\n",
11001106
" end_total = datetime.now()\n",
1101-
" print(f\"\\nCompleted all optimizations at {end_total} (Total duration: {end_total - start_total})\")\n"
1107+
" print(f\"\\nCompleted all optimizations at {end_total} (Total duration: {end_total - start_total})\")\n",
1108+
"\n"
11021109
]
11031110
},
11041111
{

0 commit comments

Comments
 (0)