Skip to content

Commit 3d8f108

Browse files
committed
Merge branch 'dev' of https://github.com/UrbsLab/STREAMLINE into dev
2 parents c3c8f36 + 63eb45c commit 3d8f108

5 files changed

Lines changed: 23 additions & 23 deletions

File tree

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ numpy
33
optuna
44
sqlalchemy<2.0
55
plotly>=4.0.0
6-
pandas>=1.5.2
6+
pandas>=2.2.2
77
pip
88
pycodestyle
99
scikit-learn>=1.1.3,<1.3.0

run.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def len_datasets(output_path, experiment_name):
9191
return len(datasets)
9292

9393

94-
def run(params):
94+
def run(params): #second part
9595
start_g = time.time()
9696

9797
if params['do_eda']:
@@ -256,20 +256,20 @@ def run(params):
256256
if __name__ == '__main__':
257257

258258
# NOTE: All keys must be small
259-
config_dict = parser_function(sys.argv)
259+
config_dict = parser_function(sys.argv) #run first #returns dictionary of configuration parameters
260260

261-
if not os.path.exists(config_dict['output_path']):
261+
if not os.path.exists(config_dict['output_path']): #make the output path = everything stored here
262262
os.mkdir(str(config_dict['output_path']))
263263

264-
if config_dict['verbose']:
264+
if config_dict['verbose']: # do we want verbose output to command line.
265265
stdout_handler = logging.StreamHandler(sys.stdout)
266266
stdout_handler.setLevel(logging.INFO)
267267
stdout_handler.setFormatter(formatter)
268268
logger.addHandler(stdout_handler)
269269
else:
270-
file_handler = logging.FileHandler(str(config_dict['output_path']) + '/logs.log')
270+
file_handler = logging.FileHandler(str(config_dict['output_path']) + '/logs.log') #otherwise puts output into log.
271271
file_handler.setLevel(logging.INFO)
272272
file_handler.setFormatter(formatter)
273273
logger.addHandler(file_handler)
274274

275-
sys.exit(run(config_dict))
275+
sys.exit(run(config_dict))

streamline/runners/compare_runner.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -62,28 +62,28 @@ def __init__(self, output_path, experiment_name, experiment_path=None,
6262
if not os.path.exists(self.output_path + '/' + self.experiment_name):
6363
raise Exception("Experiment must exist (from phase 1) before phase 6 can begin")
6464

65-
def run(self, run_parallel=False):
65+
def run(self, run_parallel=False): #UPenn or Cedars - LSFOld - uses shell file to submit. Old = shell file to submit. without shell , just LSF, etc. 5 types of hpcs
6666
if self.run_cluster in ["SLURMOld", "LSFOld"]:
6767
if self.run_cluster == "SLURMOld":
6868
self.submit_slurm_cluster_job()
6969

7070
if self.run_cluster == "LSFOld":
7171
self.submit_lsf_cluster_job()
72-
else:
72+
else: # Not slurm or lsf OLD - Dask jobs submission. Run for local or HPC
7373
job_obj = CompareJob(self.output_path, self.experiment_name, None,
7474
self.outcome_label, self.outcome_type, self.instance_label, self.sig_cutoff,
7575
self.show_plots)
76-
if run_parallel in ["multiprocessing", "True", True]:
76+
if run_parallel in ["multiprocessing", "True", True]: #Multiprocessing but not on an HPC.
7777
# p = multiprocessing.Process(target=runner_fn, args=(job_obj, ))
7878
# p.start()
7979
# p.join()
8080
Parallel()(delayed(runner_fn)(job_obj) for job_obj in [job_obj, ])
81-
elif self.run_cluster and "Old" not in self.run_cluster:
81+
elif self.run_cluster and "Old" not in self.run_cluster: #run on hpc - run jobs sequentially up to 400 (cluster defined)
8282
get_cluster(self.run_cluster,
83-
self.output_path + '/' + self.experiment_name, self.queue, self.reserved_memory)
84-
dask.compute([dask.delayed(runner_fn)(job_obj) for job_obj in [job_obj, ]])
83+
self.output_path + '/' + self.experiment_name, self.queue, self.reserved_memory) #function in utils.
84+
dask.compute([dask.delayed(runner_fn)(job_obj) for job_obj in [job_obj, ]]) # job object is like a list of jobs - and runs it.
8585
else:
86-
job_obj.run()
86+
job_obj.run() #run locally without multiprocessing
8787

8888
def get_algorithms(self):
8989
pickle_in = open(self.output_path + '/' + self.experiment_name + '/' + "algInfo.pickle", 'rb')
@@ -95,13 +95,13 @@ def get_algorithms(self):
9595
self.algorithms = algorithms
9696
pickle_in.close()
9797

98-
def get_cluster_params(self):
98+
def get_cluster_params(self):
9999
cluster_params = [self.output_path, self.experiment_name, None,
100100
self.outcome_label, self.outcome_type, self.instance_label, self.sig_cutoff, self.show_plots]
101101
cluster_params = [str(i) for i in cluster_params]
102102
return cluster_params
103103

104-
def submit_slurm_cluster_job(self):
104+
def submit_slurm_cluster_job(self): #legacy mode just for cedars (no head node) note cedars has a different hpc - we'd need to write a method for (this is the more recent one)
105105
job_ref = str(time.time())
106106
job_name = self.output_path + '/' + self.experiment_name + '/jobs/P1_' + job_ref + '_run.sh'
107107
sh_file = open(job_name, 'w')
@@ -124,7 +124,7 @@ def submit_slurm_cluster_job(self):
124124
sh_file.close()
125125
os.system('sbatch ' + job_name)
126126

127-
def submit_lsf_cluster_job(self):
127+
def submit_lsf_cluster_job(self): #UPENN - Legacy mode (using shell file) - memory on head node
128128
job_ref = str(time.time())
129129
job_name = self.output_path + '/' + self.experiment_name + '/jobs/P7_' + job_ref + '_run.sh'
130130
sh_file = open(job_name, 'w')

streamline/runners/model_runner.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,7 @@ def run(self, run_parallel=False):
284284
if run_parallel and run_parallel != "False":
285285
# p = multiprocessing.Process(target=model_runner_fn, args=(job_obj, model))
286286
# job_list.append(p)
287-
job_list.append((job_obj, copy.deepcopy(model)))
287+
job_list.append((job_obj, copy.deepcopy(model))) #adds a job object
288288
else:
289289
job_obj.run(model)
290290
if run_parallel and run_parallel != "False" and not self.run_cluster:
@@ -296,7 +296,7 @@ def run(self, run_parallel=False):
296296
get_cluster(self.run_cluster,
297297
self.output_path + '/' + self.experiment_name, self.queue, self.reserved_memory)
298298
dask.compute([dask.delayed(model_runner_fn)(job_obj, model
299-
) for job_obj, model in job_list])
299+
) for job_obj, model in job_list]) # job list create all run objects and then run with dask compute
300300

301301
def save_metadata(self):
302302
# Load metadata
@@ -377,9 +377,9 @@ def submit_slurm_cluster_job(self, full_path, algorithm, cv_count):
377377
'#SBATCH -e ' + self.output_path + '/' + self.experiment_name + '/logs/P5_'
378378
+ str(algorithm) + '_' + str(cv_count) + '_' + job_ref + '.e\n')
379379

380-
file_path = str(Path(__file__).parent.parent.parent) + "/streamline/legacy" + '/ModelJobSubmit.py'
380+
file_path = str(Path(__file__).parent.parent.parent) + "/streamline/legacy" + '/ModelJobSubmit.py' #
381381
cluster_params = self.get_cluster_params(full_path, algorithm, cv_count)
382-
command = ' '.join(['srun', 'python', file_path] + cluster_params)
382+
command = ' '.join(['srun', 'python', file_path] + cluster_params) #
383383
sh_file.write(command + '\n')
384384
sh_file.close()
385385
os.system('sbatch ' + job_name)

streamline/utils/parser.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,15 +103,15 @@ def single_parse(mode_params, argv, config_dict=None):
103103
return config_dict
104104

105105

106-
def parser_function(argv):
106+
def parser_function(argv): #not an exhaustive list - just first parsing needed
107107
parser = argparse.ArgumentParser(description="STREAMLINE: \n"
108108
"Simple Transparent End-To-End Automated Machine "
109109
"Learning Pipeline for Supervised Learning in Tabular "
110110
"Binary Classification Data",
111111
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
112112
parser.add_argument('--config', '-c',
113113
dest='config', type=str, default="",
114-
help='flag to load config file')
114+
help='flag to load config file') #if this passed we don't need any of the items below
115115
parser.add_argument('--verbose', dest='verbose', type=str2bool, nargs='?', const=True, default=False,
116116
help='give output to command line')
117117
parser.add_argument('--do-till-report', '--dtr', dest='do_till_report', type=str2bool, nargs='?', const=True,

0 commit comments

Comments (0)