Skip to content

Commit fdb5a6b

Browse files
author
Obada Haddad
committed
rebased branch; added some more try...except and error handling; moved scoring and ingestion update to prepare() from start()
1 parent 6e19ba2 commit fdb5a6b

1 file changed

Lines changed: 19 additions & 15 deletions

File tree

compute_worker/compute_worker.py

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -452,6 +452,7 @@ async def send_detailed_results(self, file_path):
452452
logger.error(f"This error might result in a Execution Time Exceeded error: {e}")
453453
if os.environ.get("LOG_LEVEL", "info").lower() == "debug":
454454
logger.exception(e)
455+
raise SubmissionException("Could not connect to instance to update detailed result")
455456

456457
def _get_stdout_stderr_file_names(self, run_args):
457458
# run_args should be the run_args argument passed to __init__ from the run_wrapper.
@@ -628,10 +629,10 @@ def _get_bundle(self, url, destination, cache=True):
628629
except BadZipFile:
629630
retries += 1
630631
if retries >= max_retries:
631-
raise # Re-raise the last caught BadZipFile exception
632+
raise SubmissionException("Bad or empty zip file")
632633
else:
633-
logger.warning("Failed. Retrying in 60 seconds...")
634-
time.sleep(60) # Wait 60 seconds before retrying
634+
logger.warning("Failed. Retrying in 20 seconds...")
635+
time.sleep(20) # Wait 20 seconds before retrying
635636
# Return the zip file path for other uses, e.g. for creating a MD5 hash to identify it
636637
return bundle_file
637638

@@ -1064,6 +1065,15 @@ def _prep_cache_dir(self, max_size=MAX_CACHE_DIR_SIZE_GB):
10641065
logger.info("Cache directory does not need to be pruned!")
10651066

10661067
def prepare(self):
1068+
hostname = utils.nodenames.gethostname()
1069+
if self.is_scoring:
1070+
self._update_status(
1071+
STATUS_RUNNING, extra_information=f"scoring_hostname-{hostname}"
1072+
)
1073+
else:
1074+
self._update_status(
1075+
STATUS_RUNNING, extra_information=f"ingestion_hostname-{hostname}"
1076+
)
10671077
if not self.is_scoring:
10681078
# Only during prediction step do we want to announce "preparing"
10691079
self._update_status(STATUS_PREPARING)
@@ -1108,15 +1118,6 @@ def prepare(self):
11081118
self._get_container_image(self.container_image)
11091119

11101120
def start(self):
1111-
hostname = utils.nodenames.gethostname()
1112-
if self.is_scoring:
1113-
self._update_status(
1114-
STATUS_RUNNING, extra_information=f"scoring_hostname-{hostname}"
1115-
)
1116-
else:
1117-
self._update_status(
1118-
STATUS_RUNNING, extra_information=f"ingestion_hostname-{hostname}"
1119-
)
11201121
program_dir = os.path.join(self.root_dir, "program")
11211122
ingestion_program_dir = os.path.join(self.root_dir, "ingestion_program")
11221123

@@ -1290,9 +1291,12 @@ def push_output(self):
12901291
"Error, the output directory already contains a metadata file. This file is used "
12911292
"to store exitCode and other data, do not write to this file manually."
12921293
)
1293-
1294-
with open(metadata_path, "w") as f:
1295-
f.write(yaml.dump(prog_status, default_flow_style=False))
1294+
try:
1295+
with open(metadata_path, "w") as f:
1296+
f.write(yaml.dump(prog_status, default_flow_style=False))
1297+
except Exception as e:
1298+
logger.error(e)
1299+
raise SubmissionException("Metadata file not found")
12961300

12971301
if not self.is_scoring:
12981302
self._put_dir(self.prediction_result, self.output_dir)

0 commit comments

Comments
 (0)