Skip to content

Commit fc40f64

Browse files
authored
Merge pull request #1265 from Libensemble/feature/merge_platform_specs_detection
platform_specs is merged with detected platform
2 parents 3459023 + 786a83b commit fc40f64

3 files changed

Lines changed: 59 additions & 45 deletions

File tree

libensemble/executors/mpi_runner.py

Lines changed: 28 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,7 @@ def __init__(self, run_command="mpiexec", platform_info=None):
3737
self.arg_nnodes = ("--LIBE_NNODES_ARG_EMPTY",)
3838
self.arg_ppn = ("--LIBE_PPN_ARG_EMPTY",)
3939
self.default_mpi_options = None
40-
self.default_gpu_arg = None
40+
self.default_gpu_args = None
4141
self.default_gpu_arg_type = None
4242
self.platform_info = platform_info
4343

@@ -126,16 +126,32 @@ def _set_gpu_env_var(self, wresources, task, gpus_per_node, gpus_env):
126126

127127
def _local_runner_set_gpus(self, task, wresources, extra_args, gpus_per_node, ppn):
128128
"""Set default GPU setting for MPI runner"""
129-
if self.default_gpu_arg is not None:
130-
arg_type = self.default_gpu_arg_type
129+
130+
arg_type = self.default_gpu_arg_type
131+
if arg_type is not None:
131132
gpu_value = gpus_per_node // ppn if arg_type == "option_gpus_per_task" else gpus_per_node
132-
gpu_setting_name = self.default_gpu_arg
133+
gpu_setting_name = self.default_gpu_args[arg_type]
134+
jassert(gpu_setting_name is not None, f"No default gpu_setting_name for {arg_type}")
133135
extra_args = self._set_gpu_cli_option(wresources, extra_args, gpu_setting_name, gpu_value)
134136
else:
135137
gpus_env = "CUDA_VISIBLE_DEVICES"
136138
self._set_gpu_env_var(wresources, task, gpus_per_node, gpus_env)
137139
return extra_args
138140

141+
def _get_default_arg(self, gpu_setting_type):
142+
"""Return default setting for the given gpu_setting_type if it exists, else error"""
143+
jassert(
144+
gpu_setting_type in ["option_gpus_per_node", "option_gpus_per_task"],
145+
f"Unrecognized gpu_setting_type {gpu_setting_type}",
146+
)
147+
jassert(
148+
self.default_gpu_args is not None,
149+
"The current MPI runner has no default command line option for setting GPUs",
150+
)
151+
gpu_setting_name = self.default_gpu_args[gpu_setting_type]
152+
jassert(gpu_setting_name is not None, f"No default GPU setting for {gpu_setting_type}")
153+
return gpu_setting_name
154+
139155
def _assign_gpus(self, task, resources, nprocs, nnodes, ppn, ngpus, extra_args, match_procs_to_gpus):
140156
"""Assign GPU resources to slots, limited by ngpus if present.
141157
@@ -199,7 +215,7 @@ def _assign_gpus(self, task, resources, nprocs, nnodes, ppn, ngpus, extra_args,
199215

200216
elif gpu_setting_type in ["option_gpus_per_node", "option_gpus_per_task"]:
201217
gpu_value = gpus_per_node // ppn if gpu_setting_type == "option_gpus_per_task" else gpus_per_node
202-
gpu_setting_name = self.platform_info.get("gpu_setting_name", self.default_gpu_arg)
218+
gpu_setting_name = self.platform_info.get("gpu_setting_name", self._get_default_arg(gpu_setting_type))
203219
extra_args = self._set_gpu_cli_option(wresources, extra_args, gpu_setting_name, gpu_value)
204220

205221
elif gpu_setting_type == "env":
@@ -319,7 +335,7 @@ def __init__(self, run_command="mpirun", platform_info=None):
319335
self.arg_nnodes = ("--LIBE_NNODES_ARG_EMPTY",)
320336
self.arg_ppn = ("--ppn", "-ppn")
321337
self.default_mpi_options = None
322-
self.default_gpu_arg = None
338+
self.default_gpu_args = None
323339
self.default_gpu_arg_type = None
324340
self.platform_info = platform_info
325341

@@ -343,7 +359,7 @@ def __init__(self, run_command="mpirun", platform_info=None):
343359
self.arg_nnodes = ("--LIBE_NNODES_ARG_EMPTY",)
344360
self.arg_ppn = ("-npernode",)
345361
self.default_mpi_options = None
346-
self.default_gpu_arg = None
362+
self.default_gpu_args = None
347363
self.default_gpu_arg_type = None
348364
self.platform_info = platform_info
349365
self.mpi_command = [
@@ -388,7 +404,7 @@ def __init__(self, run_command="aprun", platform_info=None):
388404
self.arg_nnodes = ("--LIBE_NNODES_ARG_EMPTY",)
389405
self.arg_ppn = ("-N",)
390406
self.default_mpi_options = None
391-
self.default_gpu_arg = None
407+
self.default_gpu_args = None
392408
self.default_gpu_arg_type = None
393409
self.platform_info = platform_info
394410
self.mpi_command = [
@@ -410,7 +426,7 @@ def __init__(self, run_command="mpiexec", platform_info=None):
410426
self.arg_nnodes = ("--LIBE_NNODES_ARG_EMPTY",)
411427
self.arg_ppn = ("-cores",)
412428
self.default_mpi_options = None
413-
self.default_gpu_arg = None
429+
self.default_gpu_args = None
414430
self.default_gpu_arg_type = None
415431
self.platform_info = platform_info
416432
self.mpi_command = [
@@ -431,8 +447,9 @@ def __init__(self, run_command="srun", platform_info=None):
431447
self.arg_nnodes = ("-N", "--nodes")
432448
self.arg_ppn = ("--ntasks-per-node",)
433449
self.default_mpi_options = "--exact"
434-
self.default_gpu_arg = "--gpus-per-task"
435450
self.default_gpu_arg_type = "option_gpus_per_task"
451+
self.default_gpu_args = {"option_gpus_per_task": "--gpus-per-task", "option_gpus_per_node": "--gpus-per-node"}
452+
436453
self.platform_info = platform_info
437454
self.mpi_command = [
438455
self.run_command,
@@ -453,8 +470,8 @@ def __init__(self, run_command="jsrun", platform_info=None):
453470
self.arg_nnodes = ("--LIBE_NNODES_ARG_EMPTY",)
454471
self.arg_ppn = ("-r",)
455472
self.default_mpi_options = None
456-
self.default_gpu_arg = "-g"
457473
self.default_gpu_arg_type = "option_gpus_per_task"
474+
self.default_gpu_args = {"option_gpus_per_task": "-g", "option_gpus_per_node": None}
458475

459476
self.platform_info = platform_info
460477
self.mpi_command = [self.run_command, "-n {num_procs}", "-r {procs_per_node}", "{extra_args}"]

libensemble/resources/platforms.py

Lines changed: 23 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -279,53 +279,54 @@ class Known_platforms(BaseModel):
279279

280280
# Dictionary of known systems (or system partitions) detectable by domain name
281281
detect_systems = {
282-
"crusher.olcf.ornl.gov": Crusher,
283-
"frontier.olcf.ornl.gov": Frontier,
284-
"hostmgmt.cm.aurora.alcf.anl.gov": Aurora,
285-
"hsn.cm.polaris.alcf.anl.gov": Polaris,
286-
"spock.olcf.ornl.gov": Spock,
287-
"summit.olcf.ornl.gov": Summit, # Need to detect gpu count
282+
"crusher.olcf.ornl.gov": "crusher",
283+
"frontier.olcf.ornl.gov": "frontier",
284+
"hostmgmt.cm.aurora.alcf.anl.gov": "aurora",
285+
"hsn.cm.polaris.alcf.anl.gov": "polaris",
286+
"spock.olcf.ornl.gov": "spock",
287+
"summit.olcf.ornl.gov": "summit", # Need to detect gpu count
288288
}
289289

290290

291291
def known_envs():
292292
"""Detect system by environment variables"""
293-
platform_info = {}
293+
name = None
294294
if os.environ.get("NERSC_HOST") == "perlmutter":
295295
if "gpu_" in os.environ.get("SLURM_JOB_PARTITION"):
296-
platform_info = specs_dump(PerlmutterGPU(), by_alias=True)
296+
name = "perlmutter_g"
297297
else:
298-
platform_info = specs_dump(PerlmutterCPU(), by_alias=True)
299-
return platform_info
298+
name = "perlmutter_c"
299+
return name
300300

301301

302302
def known_system_detect(cmd="hostname -d"):
303303
"""Detect known systems
304304
305-
This function attempts to detect if on a known system, but users
306-
should specify systems to be sure.
305+
This function attempts to detect if on a known system, and
306+
returns the name of the system as a string.
307307
"""
308308
run_cmd = cmd.split()
309-
platform_info = {}
309+
name = None
310310
try:
311311
domain_name = subprocess.check_output(run_cmd).decode().rstrip()
312-
platform_info = specs_dump(detect_systems[domain_name](), by_alias=True)
312+
name = detect_systems[domain_name]
313313
except Exception:
314-
platform_info = known_envs()
315-
return platform_info
314+
name = known_envs()
315+
return name
316316

317317

318318
def get_platform(libE_specs):
319319
"""Return platform as a dictionary from relevant libE_specs option.
320320
321321
For internal use, return a platform as a dictionary from either
322-
platform name or platform_specs.
322+
platform name or platform_specs or auto-detection.
323323
324-
If both platform and platform_spec fields are present, any fields in
325-
platform_specs are added or overwrite fields in the known platform.
324+
If a platform is given or detected and platform_spec fields are present,
325+
any fields in platform_specs are added to or overwrite fields in the known
326+
platform.
326327
"""
327-
328-
name = libE_specs.get("platform") or os.environ.get("LIBE_PLATFORM")
328+
platform_info = {}
329+
name = libE_specs.get("platform") or os.environ.get("LIBE_PLATFORM") or known_system_detect()
329330
if name:
330331
try:
331332
known_platforms = specs_dump(Known_platforms(), exclude_none=True)
@@ -340,9 +341,5 @@ def get_platform(libE_specs):
340341
platform_info[k] = v
341342
elif libE_specs.get("platform_specs"):
342343
platform_info = libE_specs["platform_specs"]
343-
else:
344-
# See if in detection list
345-
platform_info = known_system_detect()
346-
347-
platform_info = {k: v for k, v in platform_info.items() if v is not None}
344+
platform_info = {k: v for k, v in platform_info.items() if v is not None}
348345
return platform_info

libensemble/tests/unit_tests/test_platform.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import pytest
22

3+
from libensemble.utils.misc import specs_dump
34
from libensemble.resources.platforms import PlatformException, get_platform, known_system_detect
5+
from libensemble.resources.platforms import Known_platforms
46

57
my_spec = {
68
"mpi_runner": "srun",
@@ -10,15 +12,12 @@
1012

1113
summit_spec = {
1214
"mpi_runner": "jsrun",
13-
"runner_name": None,
1415
"cores_per_node": 42,
1516
"logical_cores_per_node": 168,
1617
"gpus_per_node": 6,
17-
"tiles_per_gpu": None,
1818
"gpu_setting_type": "option_gpus_per_task",
1919
"gpu_setting_name": "-g",
2020
"scheduler_match_slots": False,
21-
"gpu_env_fallback": None,
2221
}
2322

2423

@@ -60,7 +59,6 @@ def test_platform_known():
6059
def test_platform_specs():
6160
"""Test known platform and platform_specs supplied"""
6261
from libensemble.specs import LibeSpecs
63-
from libensemble.utils.misc import specs_dump
6462

6563
exp = my_spec
6664
libE_specs = {"platform_specs": my_spec}
@@ -85,16 +83,18 @@ def test_platform_specs():
8583

8684

8785
def test_known_sys_detect():
86+
known_platforms = specs_dump(Known_platforms(), exclude_none=True)
8887
get_sys_cmd = "echo summit.olcf.ornl.gov" # Overrides default "hostname -d"
89-
platform_info = known_system_detect(cmd=get_sys_cmd)
88+
name = known_system_detect(cmd=get_sys_cmd)
89+
platform_info = known_platforms[name]
9090
assert platform_info == summit_spec, f"Summit spec does not match expected ({platform_info})"
9191

9292
# Try unknown system
9393
get_sys_cmd = "echo madeup.system" # Overrides default "hostname -d"
94-
platform_info = known_system_detect(cmd=get_sys_cmd)
94+
name = known_system_detect(cmd=get_sys_cmd)
9595
assert (
96-
platform_info == {}
97-
), f"Expected known_system_detect to return empty dict for unknown system ({platform_info})"
96+
name is None
97+
), f"Expected known_system_detect to return None ({name})"
9898

9999

100100
if __name__ == "__main__":

0 commit comments

Comments
 (0)