Skip to content

Commit cfb8d7f

Browse files
authored
Merge pull request #43 from score-p/hotfix
Hotfix: when no GPUs are available, check the length of the GPU data lists before aggregating them
2 parents d350f21 + d368559 commit cfb8d7f

6 files changed

Lines changed: 149 additions & 113 deletions

File tree

src/jumper/kernel.py

Lines changed: 53 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ def __init__(self, **kwargs):
109109
self.scorep_python_available_ = False
110110

111111
logging.config.dictConfig(LOGGING)
112-
self.log = logging.getLogger('kernel')
112+
self.log = logging.getLogger("kernel")
113113

114114
def cell_output(self, string, stream="stdout"):
115115
"""
@@ -704,7 +704,9 @@ async def scorep_execute(
704704
os.open(scorep_script_name, os.O_WRONLY | os.O_CREAT), "w"
705705
) as file:
706706
file.write(self.pershelper.subprocess_wrapper(code))
707-
self.log.debug(f"Code written to temporary script: {scorep_script_name}")
707+
self.log.debug(
708+
f"Code written to temporary script: {scorep_script_name}"
709+
)
708710

709711
# For disk mode use implicit synchronization between kernel and
710712
# subprocess: await jupyter_dump, subprocess.wait(),
@@ -722,7 +724,10 @@ async def scorep_execute(
722724
)
723725

724726
if reply_status_dump["status"] != "ok":
725-
self.log_error(KernelErrorCode.PERSISTENCE_DUMP_FAIL, direction="Jupyter -> Score-P")
727+
self.log_error(
728+
KernelErrorCode.PERSISTENCE_DUMP_FAIL,
729+
direction="Jupyter -> Score-P",
730+
)
726731
self.pershelper.postprocess()
727732
return reply_status_dump
728733

@@ -775,7 +780,10 @@ async def scorep_execute(
775780
cell_id=cell_id,
776781
)
777782
if reply_status_dump["status"] != "ok":
778-
self.log_error(KernelErrorCode.PERSISTENCE_DUMP_FAIL, direction="Jupyter -> Score-P")
783+
self.log_error(
784+
KernelErrorCode.PERSISTENCE_DUMP_FAIL,
785+
direction="Jupyter -> Score-P",
786+
)
779787
self.pershelper.postprocess()
780788
return reply_status_dump
781789

@@ -785,16 +793,17 @@ async def scorep_execute(
785793

786794
stdout_lock = threading.Lock()
787795
process_busy_spinner = create_busy_spinner(stdout_lock)
788-
process_busy_spinner.start('Process is running...')
796+
process_busy_spinner.start("Process is running...")
789797

790798
multicellmode_timestamps = []
791799

792800
try:
793-
multicellmode_timestamps = self.read_scorep_process_pipe(proc, stdout_lock)
794-
process_busy_spinner.stop('Done.')
801+
multicellmode_timestamps = self.read_scorep_process_pipe(
802+
proc, stdout_lock
803+
)
804+
process_busy_spinner.stop("Done.")
795805
except KeyboardInterrupt:
796-
process_busy_spinner.stop('Kernel interrupted.')
797-
806+
process_busy_spinner.stop("Kernel interrupted.")
798807

799808
# for multiple nodes, we have to add more lists here, one list per node
800809
# this is required to be in line with the performance data aggregation
@@ -850,7 +859,8 @@ async def scorep_execute(
850859
)
851860

852861
# Check if the score-p process is running.
853-
# This prevents jupyter_update() from getting stuck while reading non-existent temporary files
862+
# This prevents jupyter_update() from getting stuck while reading
863+
# non-existent temporary files
854864
# if something goes wrong during process execution.
855865
if proc.poll():
856866
self.log_error(KernelErrorCode.SCOREP_SUBPROCESS_FAIL)
@@ -873,7 +883,10 @@ async def scorep_execute(
873883
cell_id=cell_id,
874884
)
875885
if reply_status_update["status"] != "ok":
876-
self.log_error(KernelErrorCode.PERSISTENCE_LOAD_FAIL, direction=f"Score-P -> Jupyter")
886+
self.log_error(
887+
KernelErrorCode.PERSISTENCE_LOAD_FAIL,
888+
direction="Score-P -> Jupyter",
889+
)
877890
self.pershelper.postprocess()
878891
return reply_status_update
879892

@@ -882,7 +895,10 @@ async def scorep_execute(
882895
if self.pershelper.mode == "memory":
883896
if proc.poll():
884897
self.pershelper.postprocess()
885-
self.log_error(KernelErrorCode.PERSISTENCE_LOAD_FAIL, direction="Score-P -> Jupyter")
898+
self.log_error(
899+
KernelErrorCode.PERSISTENCE_LOAD_FAIL,
900+
direction="Score-P -> Jupyter",
901+
)
886902
return self.standard_reply()
887903

888904
# Determine directory to which trace files were saved by Score-P
@@ -934,19 +950,24 @@ async def scorep_execute(
934950
)
935951
return self.standard_reply()
936952

937-
938-
def read_scorep_process_pipe(self, proc: subprocess.Popen[bytes], stdout_lock: threading.Lock) -> list:
953+
def read_scorep_process_pipe(
954+
self, proc: subprocess.Popen[bytes], stdout_lock: threading.Lock
955+
) -> list:
939956
"""
940-
This function reads stdout and stderr of the subprocess running with Score-P instrumentation independently.
957+
This function reads stdout and stderr of the subprocess running with
958+
Score-P instrumentation independently.
941959
It logs all stderr output, collects lines containing
942-
the marker "MCM_TS" (used to identify multi-cell mode timestamps) into a list, and sends the remaining
960+
the marker "MCM_TS" (used to identify multi-cell mode timestamps) into
961+
a list, and sends the remaining
943962
stdout lines to the Jupyter cell output.
944963
945-
Simultaneous access to stdout is synchronized via a lock to prevent overlapping with another thread performing
964+
Simultaneous access to stdout is synchronized via a lock to prevent
965+
overlapping with another thread performing
946966
long-running process animation.
947967
948968
Args:
949-
proc (subprocess.Popen[bytes]): The subprocess whose output is being read.
969+
proc (subprocess.Popen[bytes]): The subprocess whose output is
970+
being read.
950971
stdout_lock (threading.Lock): Lock to avoid output overlapping
951972
952973
Returns:
@@ -969,12 +990,14 @@ def read_scorep_process_pipe(self, proc: subprocess.Popen[bytes], stdout_lock: t
969990
sel.unregister(key.fileobj)
970991
continue
971992

972-
decoded_line = line.decode(sys.getdefaultencoding(), errors='ignore')
993+
decoded_line = line.decode(
994+
sys.getdefaultencoding(), errors="ignore"
995+
)
973996

974997
if key.fileobj is proc.stderr:
975998
with stdout_lock:
976-
self.log.warning(f'{decoded_line.strip()}')
977-
elif 'MCM_TS' in decoded_line:
999+
self.log.warning(f"{decoded_line.strip()}")
1000+
elif "MCM_TS" in decoded_line:
9781001
multicellmode_timestamps.append(decoded_line)
9791002
else:
9801003
with stdout_lock:
@@ -988,7 +1011,6 @@ def read_scorep_process_pipe(self, proc: subprocess.Popen[bytes], stdout_lock: t
9881011

9891012
return multicellmode_timestamps
9901013

991-
9921014
async def do_execute(
9931015
self,
9941016
code,
@@ -1323,11 +1345,14 @@ def do_shutdown(self, restart):
13231345

13241346
def log_error(self, code: KernelErrorCode, **kwargs):
13251347
"""
1326-
Logs a kernel error with predefined error code and adds an extensible message format.
1348+
Logs a kernel error with predefined error code and adds an extensible
1349+
message format.
13271350
13281351
Parameters:
1329-
code (KernelErrorCode): error code to select message template from `KERNEL_ERROR_MESSAGES`.
1330-
**kwargs: contextual fields for the error message template (e.g., active_kernel="jupyter").
1352+
code (KernelErrorCode): error code to select message template from
1353+
`KERNEL_ERROR_MESSAGES`.
1354+
**kwargs: contextual fields for the error message template
1355+
(e.g., active_kernel="jupyter").
13311356
13321357
In addition to the dynamic arguments, the formatter always injects:
13331358
- mode (str): PersHelper() mode (e.g. "memory")
@@ -1336,12 +1361,10 @@ def log_error(self, code: KernelErrorCode, **kwargs):
13361361
mode = self.pershelper.mode
13371362
marshaller = self.pershelper.marshaller
13381363

1339-
template = KERNEL_ERROR_MESSAGES.get(code, "Unknown error. Mode: {mode}, Marshaller: {marshaller}")
1340-
message = template.format(
1341-
mode=mode,
1342-
marshaller=marshaller,
1343-
**kwargs
1364+
template = KERNEL_ERROR_MESSAGES.get(
1365+
code, "Unknown error. Mode: {mode}, Marshaller: {marshaller}"
13441366
)
1367+
message = template.format(mode=mode, marshaller=marshaller, **kwargs)
13451368

13461369
self.log.error(message)
13471370
self.cell_output("KernelError: " + message, "stderr")

src/jumper/kernel_messages.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,15 @@ class KernelErrorCode(Enum):
2424
"Failed to set up persistence communication files/pipes "
2525
),
2626
KernelErrorCode.PERSISTENCE_DUMP_FAIL: (
27-
"[mode: {mode}] Failed to serialize notebook persistence ({direction}, marshaller: {marshaller})."
27+
"[mode: {mode}] Failed to serialize notebook persistence ({direction},"
28+
"marshaller: {marshaller})."
2829
),
2930
KernelErrorCode.PERSISTENCE_LOAD_FAIL: (
30-
"[mode: {mode}] Failed to load persistence ({direction}, marshaller: {marshaller})."
31+
"[mode: {mode}] Failed to load persistence ({direction}, "
32+
"marshaller: {marshaller})."
3133
),
3234
KernelErrorCode.SCOREP_SUBPROCESS_FAIL: (
33-
"[mode: {mode}] Subprocess terminated unexpectedly. Persistence not recorded (marshaller: {marshaller})."
35+
"[mode: {mode}] Subprocess terminated unexpectedly. "
36+
"Persistence not recorded (marshaller: {marshaller})."
3437
),
35-
}
38+
}

src/jumper/logging_config.py

Lines changed: 47 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import sys
44

55

6-
LOGGING_DIR = 'logging'
6+
LOGGING_DIR = "logging"
77
os.makedirs(LOGGING_DIR, exist_ok=True)
88

99

@@ -19,67 +19,62 @@ def filter(self, record):
1919

2020
class JumperKernelOnlyFilter(logging.Filter):
2121
def filter(self, record):
22-
return 'jumper' in record.pathname
22+
return "jumper" in record.pathname
23+
2324

2425
LOGGING = {
25-
'version': 1,
26-
'disable_existing_loggers': False,
27-
'formatters': {
28-
'verbose': {
29-
'format': '[{levelname[0]} {asctime} {name}] {message}',
30-
'style': '{',
26+
"version": 1,
27+
"disable_existing_loggers": False,
28+
"formatters": {
29+
"verbose": {
30+
"format": "[{levelname[0]} {asctime} {name}] {message}",
31+
"style": "{",
3132
},
3233
},
33-
'handlers': {
34-
'info_file': {
35-
'level': 'INFO',
36-
'class': 'logging.FileHandler',
37-
'filename': os.path.join(LOGGING_DIR, 'info.log'),
38-
'formatter': 'verbose'
34+
"handlers": {
35+
"info_file": {
36+
"level": "INFO",
37+
"class": "logging.FileHandler",
38+
"filename": os.path.join(LOGGING_DIR, "info.log"),
39+
"formatter": "verbose",
3940
},
40-
'debug_file': {
41-
'level': 'DEBUG',
42-
'class': 'logging.FileHandler',
43-
'filename': os.path.join(LOGGING_DIR, 'debug.log'),
44-
'formatter': 'verbose'
41+
"debug_file": {
42+
"level": "DEBUG",
43+
"class": "logging.FileHandler",
44+
"filename": os.path.join(LOGGING_DIR, "debug.log"),
45+
"formatter": "verbose",
4546
},
46-
'error_file': {
47-
'level': 'ERROR',
48-
'class': 'logging.FileHandler',
49-
'filename': os.path.join(LOGGING_DIR, 'error.log'),
50-
'formatter': 'verbose'
47+
"error_file": {
48+
"level": "ERROR",
49+
"class": "logging.FileHandler",
50+
"filename": os.path.join(LOGGING_DIR, "error.log"),
51+
"formatter": "verbose",
5152
},
52-
'console': {
53-
'level': 'DEBUG',
54-
'class': 'logging.StreamHandler',
55-
'stream': sys.stdout,
56-
'filters': [
57-
'ignore_error_filter', # prevents from writing to jupyter cell output twice
58-
'jumper_kernel_only_filter',
59-
]
53+
"console": {
54+
"level": "DEBUG",
55+
"class": "logging.StreamHandler",
56+
"stream": sys.stdout,
57+
"filters": [
58+
"ignore_error_filter", # prevents from writing to jupyter
59+
# cell output twice
60+
"jumper_kernel_only_filter",
61+
],
6062
},
6163
},
62-
'filters': {
63-
'jupyter_filter': {
64-
'()': JupyterLogFilter
65-
},
66-
'ignore_error_filter': {
67-
'()': IgnoreErrorFilter
68-
},
69-
'jumper_kernel_only_filter': {
70-
'()': JumperKernelOnlyFilter
71-
}
64+
"filters": {
65+
"jupyter_filter": {"()": JupyterLogFilter},
66+
"ignore_error_filter": {"()": IgnoreErrorFilter},
67+
"jumper_kernel_only_filter": {"()": JumperKernelOnlyFilter},
7268
},
73-
'root': {
74-
'handlers': [],
75-
'level': 'WARNING',
69+
"root": {
70+
"handlers": [],
71+
"level": "WARNING",
7672
},
77-
78-
'loggers': {
79-
'kernel': {
80-
'handlers': ['console', 'debug_file', 'info_file', 'error_file'],
81-
'level': 'WARNING',
82-
'propagate': False,
73+
"loggers": {
74+
"kernel": {
75+
"handlers": ["console", "debug_file", "info_file", "error_file"],
76+
"level": "WARNING",
77+
"propagate": False,
8378
},
84-
}
79+
},
8580
}

src/jumper/perfdatahandler.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -149,14 +149,16 @@ def get_perfdata_aggregated(self):
149149
perfdata_aggregated[node][3].extend(perfdata[node][3])
150150
perfdata_aggregated[node][4].extend(perfdata[node][4])
151151
perfdata_aggregated[node][5].extend(perfdata[node][5])
152-
for gpu_index in range(0, len(perfdata[node][6])):
153-
perfdata_aggregated[node][6][gpu_index].extend(
154-
perfdata[node][6][gpu_index]
155-
)
156-
for gpu_index in range(0, len(perfdata[node][7])):
157-
perfdata_aggregated[node][7][gpu_index].extend(
158-
perfdata[node][7][gpu_index]
159-
)
152+
if len(perfdata[node][6]) - 3 > 0:
153+
# GPUs available
154+
for gpu_index in range(0, len(perfdata[node][6])):
155+
perfdata_aggregated[node][6][gpu_index].extend(
156+
perfdata[node][6][gpu_index]
157+
)
158+
for gpu_index in range(0, len(perfdata[node][7])):
159+
perfdata_aggregated[node][7][gpu_index].extend(
160+
perfdata[node][7][gpu_index]
161+
)
160162

161163
# add cell index and the number of measurements
162164
# we will use that in the visualization to generate

0 commit comments

Comments
 (0)