|
992 | 992 | " # Retrieve trial timeout, defaulting to None if not set\n", |
993 | 993 | " trial_timeout = config['hyperopt_settings'].get('trial_timeout', None)\n", |
994 | 994 | "\n", |
995 | | - " # 5. --- Determine Outcome Variables ---\n", |
| 995 | + " # 5. --- Determine Outcome Variables & PRE-LOAD DATA ---\n", |
| 996 | + " data_file_path = config['data']['file_path']\n", |
| 997 | + " if not Path(data_file_path).is_absolute():\n", |
| 998 | + " data_file_path = project_root / data_file_path\n", |
| 999 | + "\n", |
| 1000 | + " # OPTIMIZATION: Load data ONCE here\n", |
| 1001 | + " print(f\"Pre-loading data from {data_file_path}...\", flush=True)\n", |
| 1002 | + " GLOBAL_DF = pd.read_csv(data_file_path)\n", |
| 1003 | + "\n", |
996 | 1004 | " outcome_var_list = []\n", |
997 | 1005 | " if config['data'].get('multiple_outcomes', False):\n", |
998 | | - " data_file_path = config['data']['file_path']\n", |
999 | | - " if not Path(data_file_path).is_absolute():\n", |
1000 | | - " data_file_path = project_root / data_file_path\n", |
1001 | | - "\n", |
1002 | | - " df = pd.read_csv(data_file_path)\n", |
1003 | | - " outcome_var_list = [col for col in df.columns if 'outcome_var_' in col]\n", |
| 1006 | + " outcome_var_list = [col for col in GLOBAL_DF.columns if 'outcome_var_' in col]\n", |
1004 | 1007 | "\n", |
1005 | 1008 | " if not outcome_var_list:\n", |
1006 | 1009 | " raise ValueError(f\"No outcome variables found with 'outcome_var_' prefix in {data_file_path}\")\n", |
|
1009 | 1012 | " outcome_var_list = config['hyperopt_search_space']['outcome_var_n']\n", |
1010 | 1013 | "\n", |
1011 | 1014 | " # 6. --- Define the Objective Function for Hyperopt ---\n", |
1012 | | - " def objective(params, outcome_var):\n", |
| 1015 | + " def objective(params, outcome_var, loaded_df):\n", |
1013 | 1016 | " \"\"\"\n", |
1014 | | - " Objective function for hyperopt. It receives sampled parameters\n", |
1015 | | - " and the specific outcome variable for the current run.\n", |
| 1017 | + " Objective function for hyperopt. It receives sampled parameters,\n", |
| 1018 | + " the specific outcome variable, and the PRE-LOADED DataFrame.\n", |
1016 | 1019 | " \"\"\"\n", |
1017 | 1020 | " try:\n", |
1018 | 1021 | " # Wrap the entire trial execution in the time_limit context manager\n", |
1019 | 1022 | " with time_limit(trial_timeout):\n", |
1020 | 1023 | " local_param_dict = params\n", |
1021 | 1024 | "\n", |
1022 | | - " # Initialize the data pipeline\n", |
| 1025 | + " # Initialize the data pipeline using the cached DataFrame\n", |
1023 | 1026 | " ml_grid_object = pipe(\n", |
1024 | | - " file_name=config['data']['file_path'],\n", |
| 1027 | + " file_name=None, # Not needed when input_df is provided\n", |
1025 | 1028 | " drop_term_list=config['data']['drop_term_list'],\n", |
1026 | 1029 | " model_class_dict=config['models'],\n", |
1027 | 1030 | " local_param_dict=local_param_dict,\n", |
1028 | 1031 | " base_project_dir=project_root,\n", |
1029 | 1032 | " experiment_dir=experiment_dir,\n", |
1030 | 1033 | " param_space_index=0, \n", |
1031 | | - " outcome_var_override=outcome_var\n", |
| 1034 | + " outcome_var_override=outcome_var,\n", |
| 1035 | + " input_df=loaded_df # <--- PASS CACHED DATA\n", |
1032 | 1036 | " )\n", |
1033 | 1037 | "\n", |
1034 | 1038 | " # Execute the models\n", |
|
1052 | 1056 | " start_time = datetime.now()\n", |
1053 | 1057 | " print(f\"[{start_time}] Starting optimization for outcome: {outcome_var}\", flush=True)\n", |
1054 | 1058 | "\n", |
1055 | | - " fmin_objective = partial(objective, outcome_var=outcome_var)\n", |
| 1059 | + " # Pass the global dataframe to the objective function via partial\n", |
| 1060 | + " # Joblib will handle the serialization/shared memory of GLOBAL_DF efficiently\n", |
| 1061 | + " fmin_objective = partial(objective, outcome_var=outcome_var, loaded_df=GLOBAL_DF)\n", |
1056 | 1062 | "\n", |
1057 | 1063 | " trials = Trials()\n", |
1058 | 1064 | " best = fmin(\n", |
|
1068 | 1074 | " failed_trials = [t for t in trials.results if t['status'] == 'fail']\n", |
1069 | 1075 | "\n", |
1070 | 1076 | " print(f\"[{end_time}] Finished {outcome_var} (Duration: {end_time - start_time})\", flush=True)\n", |
1071 | | - " print(f\" -> Best param set for this outcome: {best}\", flush=True)\n", |
1072 | | - " print(f\" -> Trials summary: {len(failed_trials)}/{len(trials.results)} failed.\", flush=True)\n", |
| 1077 | + " print(f\" -> Best param set for this outcome: {best}\", flush=True)\n", |
| 1078 | + " print(f\" -> Trials summary: {len(failed_trials)}/{len(trials.results)} failed.\", flush=True)\n", |
1073 | 1079 | "\n", |
1074 | 1080 | " return (outcome_var, best)\n", |
1075 | 1081 | "\n", |
|
1098 | 1104 | " print(f\" Best parameter combination found: {best_params}\")\n", |
1099 | 1105 | "\n", |
1100 | 1106 | " end_total = datetime.now()\n", |
1101 | | - " print(f\"\\nCompleted all optimizations at {end_total} (Total duration: {end_total - start_total})\")\n" |
| 1107 | + " print(f\"\\nCompleted all optimizations at {end_total} (Total duration: {end_total - start_total})\")\n", |
| 1108 | + "\n" |
1102 | 1109 | ] |
1103 | 1110 | }, |
1104 | 1111 | { |
|
0 commit comments