Skip to content

Commit 3d8f108

Browse files
committed
Merge branch 'dev' of https://github.com/UrbsLab/STREAMLINE into dev
2 parents c3c8f36 + 63eb45c commit 3d8f108

5 files changed

Lines changed: 23 additions & 23 deletions

File tree

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ numpy
33
optuna
44
sqlalchemy<2.0
55
plotly>=4.0.0
6-
pandas>=1.5.2
6+
pandas>=2.2.2
77
pip
88
pycodestyle
99
scikit-learn>=1.1.3,<1.3.0

run.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def len_datasets(output_path, experiment_name):
9191
return len(datasets)
9292

9393

94-
def run(params):
94+
def run(params): #second part
9595
start_g = time.time()
9696

9797
if params['do_eda']:
@@ -256,20 +256,20 @@ def run(params):
256256
if __name__ == '__main__':
257257

258258
# NOTE: All keys must be small
259-
config_dict = parser_function(sys.argv)
259+
config_dict = parser_function(sys.argv) #run first #returns dictionary of configuration parameters
260260

261-
if not os.path.exists(config_dict['output_path']):
261+
if not os.path.exists(config_dict['output_path']): #make the output path = everything stored here
262262
os.mkdir(str(config_dict['output_path']))
263263

264-
if config_dict['verbose']:
264+
if config_dict['verbose']: # do we want verbose output to command line.
265265
stdout_handler = logging.StreamHandler(sys.stdout)
266266
stdout_handler.setLevel(logging.INFO)
267267
stdout_handler.setFormatter(formatter)
268268
logger.addHandler(stdout_handler)
269269
else:
270-
file_handler = logging.FileHandler(str(config_dict['output_path']) + '/logs.log')
270+
file_handler = logging.FileHandler(str(config_dict['output_path']) + '/logs.log') #otherwise puts output into log.
271271
file_handler.setLevel(logging.INFO)
272272
file_handler.setFormatter(formatter)
273273
logger.addHandler(file_handler)
274274

275-
sys.exit(run(config_dict))
275+
sys.exit(run(config_dict))

streamline/runners/compare_runner.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -62,28 +62,28 @@ def __init__(self, output_path, experiment_name, experiment_path=None,
6262
if not os.path.exists(self.output_path + '/' + self.experiment_name):
6363
raise Exception("Experiment must exist (from phase 1) before phase 6 can begin")
6464

65-
def run(self, run_parallel=False):
65+
def run(self, run_parallel=False): #UPenn or Cedars - LSFOld - uses shell file to submit. Old = shell file to submit. without shell , just LSF, etc. 5 types of hpcs
6666
if self.run_cluster in ["SLURMOld", "LSFOld"]:
6767
if self.run_cluster == "SLURMOld":
6868
self.submit_slurm_cluster_job()
6969

7070
if self.run_cluster == "LSFOld":
7171
self.submit_lsf_cluster_job()
72-
else:
72+
else: # Not slurm or lsf OLD - Dask jobs submission. Run for local or HPC
7373
job_obj = CompareJob(self.output_path, self.experiment_name, None,
7474
self.outcome_label, self.outcome_type, self.instance_label, self.sig_cutoff,
7575
self.show_plots)
76-
if run_parallel in ["multiprocessing", "True", True]:
76+
if run_parallel in ["multiprocessing", "True", True]: #Multiprocessing but not on an HPC.
7777
# p = multiprocessing.Process(target=runner_fn, args=(job_obj, ))
7878
# p.start()
7979
# p.join()
8080
Parallel()(delayed(runner_fn)(job_obj) for job_obj in [job_obj, ])
81-
elif self.run_cluster and "Old" not in self.run_cluster:
81+
elif self.run_cluster and "Old" not in self.run_cluster: #run on hpc - run jobs sequentially up to 400 (cluster defined)
8282
get_cluster(self.run_cluster,
83-
self.output_path + '/' + self.experiment_name, self.queue, self.reserved_memory)
84-
dask.compute([dask.delayed(runner_fn)(job_obj) for job_obj in [job_obj, ]])
83+
self.output_path + '/' + self.experiment_name, self.queue, self.reserved_memory) #function in utils.
84+
dask.compute([dask.delayed(runner_fn)(job_obj) for job_obj in [job_obj, ]]) # job object is like a list of jobs - and runs it.
8585
else:
86-
job_obj.run()
86+
job_obj.run() #run locally without multiprocessing
8787

8888
def get_algorithms(self):
8989
pickle_in = open(self.output_path + '/' + self.experiment_name + '/' + "algInfo.pickle", 'rb')
@@ -95,13 +95,13 @@ def get_algorithms(self):
9595
self.algorithms = algorithms
9696
pickle_in.close()
9797

98-
def get_cluster_params(self):
98+
def get_cluster_params(self):
9999
cluster_params = [self.output_path, self.experiment_name, None,
100100
self.outcome_label, self.outcome_type, self.instance_label, self.sig_cutoff, self.show_plots]
101101
cluster_params = [str(i) for i in cluster_params]
102102
return cluster_params
103103

104-
def submit_slurm_cluster_job(self):
104+
def submit_slurm_cluster_job(self): #legacy mode just for cedars (no head node) note cedars has a different hpc - we'd need to write a method for (this is the more recent one)
105105
job_ref = str(time.time())
106106
job_name = self.output_path + '/' + self.experiment_name + '/jobs/P1_' + job_ref + '_run.sh'
107107
sh_file = open(job_name, 'w')
@@ -124,7 +124,7 @@ def submit_slurm_cluster_job(self):
124124
sh_file.close()
125125
os.system('sbatch ' + job_name)
126126

127-
def submit_lsf_cluster_job(self):
127+
def submit_lsf_cluster_job(self): #UPENN - Legacy mode (using shell file) - memory on head node
128128
job_ref = str(time.time())
129129
job_name = self.output_path + '/' + self.experiment_name + '/jobs/P7_' + job_ref + '_run.sh'
130130
sh_file = open(job_name, 'w')

streamline/runners/model_runner.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,7 @@ def run(self, run_parallel=False):
284284
if run_parallel and run_parallel != "False":
285285
# p = multiprocessing.Process(target=model_runner_fn, args=(job_obj, model))
286286
# job_list.append(p)
287-
job_list.append((job_obj, copy.deepcopy(model)))
287+
job_list.append((job_obj, copy.deepcopy(model))) #adds a job object
288288
else:
289289
job_obj.run(model)
290290
if run_parallel and run_parallel != "False" and not self.run_cluster:
@@ -296,7 +296,7 @@ def run(self, run_parallel=False):
296296
get_cluster(self.run_cluster,
297297
self.output_path + '/' + self.experiment_name, self.queue, self.reserved_memory)
298298
dask.compute([dask.delayed(model_runner_fn)(job_obj, model
299-
) for job_obj, model in job_list])
299+
) for job_obj, model in job_list]) # job list create all run objects and then run with dask compute
300300

301301
def save_metadata(self):
302302
# Load metadata
@@ -377,9 +377,9 @@ def submit_slurm_cluster_job(self, full_path, algorithm, cv_count):
377377
'#SBATCH -e ' + self.output_path + '/' + self.experiment_name + '/logs/P5_'
378378
+ str(algorithm) + '_' + str(cv_count) + '_' + job_ref + '.e\n')
379379

380-
file_path = str(Path(__file__).parent.parent.parent) + "/streamline/legacy" + '/ModelJobSubmit.py'
380+
file_path = str(Path(__file__).parent.parent.parent) + "/streamline/legacy" + '/ModelJobSubmit.py' #
381381
cluster_params = self.get_cluster_params(full_path, algorithm, cv_count)
382-
command = ' '.join(['srun', 'python', file_path] + cluster_params)
382+
command = ' '.join(['srun', 'python', file_path] + cluster_params) #
383383
sh_file.write(command + '\n')
384384
sh_file.close()
385385
os.system('sbatch ' + job_name)

streamline/utils/parser.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,15 +103,15 @@ def single_parse(mode_params, argv, config_dict=None):
103103
return config_dict
104104

105105

106-
def parser_function(argv):
106+
def parser_function(argv): #not an exhaustive list - just first parsing needed
107107
parser = argparse.ArgumentParser(description="STREAMLINE: \n"
108108
"Simple Transparent End-To-End Automated Machine "
109109
"Learning Pipeline for Supervised Learning in Tabular "
110110
"Binary Classification Data",
111111
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
112112
parser.add_argument('--config', '-c',
113113
dest='config', type=str, default="",
114-
help='flag to load config file')
114+
help='flag to load config file') #if this passed we don't need any of the items below
115115
parser.add_argument('--verbose', dest='verbose', type=str2bool, nargs='?', const=True, default=False,
116116
help='give output to command line')
117117
parser.add_argument('--do-till-report', '--dtr', dest='do_till_report', type=str2bool, nargs='?', const=True,

0 commit comments

Comments (0)