
Commit 36fb0e3

Fix under-utilized resource usage (#1398)
* Fixes case where setting num_gpus to zero was treated as None.
* Make under-resourced runs use minimal nodes needed.
* Ensure multi-node resource sets assigned correctly.
* Update unit tests for better coverage of resource configs.
1 parent 2d90eef commit 36fb0e3

9 files changed

Lines changed: 83 additions & 37 deletions


libensemble/executors/mpi_executor.py

Lines changed: 1 addition & 4 deletions
@@ -326,12 +326,9 @@ def submit(
         if not num_procs and not match_procs_to_gpus:
             num_procs = self.gen_nprocs

-        if not num_gpus:
+        if num_gpus is None:
             num_gpus = self.gen_ngpus

-        if not num_nodes and (self.gen_ngpus or self.gen_nprocs):
-            num_nodes = self.resources.worker_resources.local_node_count
-
         if mpi_runner_type is not None:
             if isinstance(mpi_runner_type, str):
                 mpi_config = {"mpi_runner": mpi_runner_type}
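The change above fixes the usual Python truthiness pitfall: `not 0` and `not None` are both true, so an explicit num_gpus=0 was indistinguishable from "unset" and fell back to the generator default. A minimal standalone sketch of the old versus new check (illustrative values, not the executor's real state):

# Sketch of the truthiness pitfall fixed above (illustrative values only).
gen_ngpus = 4  # stand-in for the generator-supplied default

for num_gpus in (None, 0, 2):
    old = gen_ngpus if not num_gpus else num_gpus      # 0 wrongly becomes 4
    new = gen_ngpus if num_gpus is None else num_gpus  # 0 stays 0
    print(f"num_gpus={num_gpus!r}: old={old}, new={new}")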

libensemble/executors/mpi_runner.py

Lines changed: 36 additions & 5 deletions
@@ -121,7 +121,7 @@ def _set_gpu_cli_option(self, wresources, extra_args, gpu_setting_name, gpu_valu
     def _set_gpu_env_var(self, wresources, task, gpus_per_node, gpus_env):
         """Add GPU environment variable setting to the task's environment"""
         jassert(wresources.matching_slots, f"Cannot assign CPUs/GPUs to non-matching slots per node {wresources.slots}")
-        slot_list = wresources.get_slots_as_string(multiplier=wresources.gpus_per_rset, limit=gpus_per_node)
+        slot_list = wresources.get_slots_as_string(multiplier=wresources.gpus_per_rset_per_node, limit=gpus_per_node)
         task._add_to_env(gpus_env, slot_list)

     def _local_runner_set_gpus(self, task, wresources, extra_args, gpus_per_node, ppn):
@@ -171,7 +171,7 @@ def _assign_gpus(self, task, resources, nprocs, nnodes, ppn, ngpus, extra_args,

         # gpus per node for this worker.
         if wresources.doihave_gpus():
-            gpus_avail_per_node = wresources.slot_count * wresources.gpus_per_rset
+            gpus_avail_per_node = wresources.slot_count * wresources.gpus_per_rset_per_node
         else:
             gpus_avail_per_node = 0

@@ -224,6 +224,35 @@ def _assign_gpus(self, task, resources, nprocs, nnodes, ppn, ngpus, extra_args,

         return nprocs, nnodes, ppn, extra_args

+    def _get_min_nodes(self, nprocs, ppn, nnodes, ngpus, resources):
+        """Get minimum nodes needed to match configuration"""
+        if nnodes is not None:
+            return nnodes
+        if ppn:
+            return None  # nnodes gets processed later.
+        if resources is not None:
+            wresources = resources.worker_resources
+            total_nodes = wresources.local_node_count
+            procs_on_node = wresources.slot_count * wresources.procs_per_rset_per_node
+
+            if not nprocs and ngpus is None:
+                # Delay node evaluation to GPU assignment code
+                return None
+            proc_min_nodes = 1
+            gpu_min_nodes = 1
+            if nprocs:
+                proc_min_nodes = (nprocs + procs_on_node - 1) // procs_on_node
+            if ngpus:
+                gpus_on_node = wresources.slot_count * wresources.gpus_per_rset_per_node
+                gpu_min_nodes = (ngpus + gpus_on_node - 1) // gpus_on_node
+
+            min_nodes = max(proc_min_nodes, gpu_min_nodes)
+            nnodes = min(min_nodes, total_nodes)
+            # Must have at least one processor per node to use GPUs
+            if nprocs:
+                nnodes = min(nnodes, nprocs)
+        return nnodes
+
     def _adjust_procs(self, nprocs, ppn, nnodes, ngpus, resources):
         """Adjust an invalid config"""

@@ -241,8 +270,8 @@ def adjust_resource(n_units, units_attr, units_name):

         if resources is not None:
             wresources = resources.worker_resources
-            ngpus = adjust_resource(ngpus, "gpus_per_rset", "ngpus")
-            nprocs = adjust_resource(nprocs, "procs_per_rset", "nprocs")
+            ngpus = adjust_resource(ngpus, "gpus_per_rset_per_node", "ngpus")
+            nprocs = adjust_resource(nprocs, "procs_per_rset_per_node", "nprocs")
         return nprocs, ngpus

     def get_mpi_specs(
@@ -284,6 +313,8 @@

         if match_procs_to_gpus:
             jassert(no_config_set, "match_procs_to_gpus is mutually exclusive with either of nprocs/ppn")
+
+        nnodes = self._get_min_nodes(nprocs, ppn, nnodes, ngpus, resources)
         nprocs, ngpus = self._adjust_procs(nprocs, ppn, nnodes, ngpus, resources)

         if auto_assign_gpus or ngpus is not None:
@@ -294,7 +325,7 @@
             task, resources, nprocs, nnodes, ppn, ngpus, extra_args, match_procs_to_gpus
         )

-        rm_rpn = True if self.rm_rpn and ppn is None and nnodes is None else False
+        rm_rpn = self.rm_rpn and ppn is None and nnodes is None

         hostlist = None
         if machinefile and not self.mfile_support:
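The new _get_min_nodes reduces to a ceiling division over per-node capacity: take the larger of the process-driven and GPU-driven node minimums, cap at the nodes available, and never use more nodes than processes. A standalone sketch with made-up capacities (the real method reads these from the worker's resource sets):

# Standalone sketch of the minimal-node calculation above (made-up capacities).
def min_nodes_needed(nprocs, ngpus, procs_on_node, gpus_on_node, total_nodes):
    proc_min = (nprocs + procs_on_node - 1) // procs_on_node if nprocs else 1
    gpu_min = (ngpus + gpus_on_node - 1) // gpus_on_node if ngpus else 1
    nnodes = min(max(proc_min, gpu_min), total_nodes)
    if nprocs:  # at least one process per node is needed to use each node
        nnodes = min(nnodes, nprocs)
    return nnodes

# 10 ranks on 4-rank nodes -> ceil(10/4) = 3 nodes
assert min_nodes_needed(10, None, 4, 4, 8) == 3
# 7 GPUs on 4-GPU nodes -> ceil(7/4) = 2 nodes
assert min_nodes_needed(None, 7, 4, 4, 8) == 2
# demand beyond the allocation is capped at the nodes available
assert min_nodes_needed(100, None, 4, 4, 8) == 8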

libensemble/resources/mpi_resources.py

Lines changed: 1 addition & 1 deletion
@@ -213,7 +213,7 @@ def get_resources(resources, num_procs=None, num_nodes=None, procs_per_node=None
         )

     if num_nodes < local_node_count:
-        logger.warning(
+        logger.debug(
             "User constraints mean fewer nodes being used "
             f"than available. {num_nodes} nodes used. {local_node_count} nodes available"
         )

libensemble/resources/rset_resources.py

Lines changed: 11 additions & 6 deletions
@@ -51,8 +51,9 @@ def __init__(self, num_workers, resources):
         self.num_workers = num_workers
         self.num_workers_2assign2 = RSetResources.get_workers2assign2(self.num_workers, resources)
         self.total_num_rsets = resources.num_resource_sets or self.num_workers_2assign2
-
+        self.num_nodes = len(resources.global_nodelist)
         self.split_list, self.local_rsets_list = RSetResources.get_partitioned_nodelist(self.total_num_rsets, resources)
+        self.nodes_in_rset = len(self.split_list[0])

         gpus_avail_per_node = resources.gpus_avail_per_node
         self.rsets_per_node = RSetResources.get_rsets_on_a_node(self.total_num_rsets, resources)
@@ -67,16 +68,20 @@ def __init__(self, num_workers, resources):
         self.total_num_gpu_rsets = np.count_nonzero(self.all_rsets["gpus"])
         self.total_num_nongpu_rsets = np.count_nonzero(~self.all_rsets["gpus"])

-        self.gpus_per_rset = gpus_avail_per_node // self.gpu_rsets_per_node if self.gpu_rsets_per_node else 0
-        self.cores_per_rset = resources.physical_cores_avail_per_node // self.rsets_per_node
+        self.gpus_per_rset_per_node = gpus_avail_per_node // self.gpu_rsets_per_node if self.gpu_rsets_per_node else 0
+        self.cores_per_rset_per_node = resources.physical_cores_avail_per_node // self.rsets_per_node

         # Oversubscribe
-        if self.cores_per_rset == 0:
+        if self.cores_per_rset_per_node == 0:
             cpn = resources.physical_cores_avail_per_node
             procs_per_core = self.rsets_per_node // cpn + (self.rsets_per_node % cpn > 0)
-            self.procs_per_rset = resources.physical_cores_avail_per_node * procs_per_core
+            self.procs_per_rset_per_node = resources.physical_cores_avail_per_node * procs_per_core
         else:
-            self.procs_per_rset = self.cores_per_rset
+            self.procs_per_rset_per_node = self.cores_per_rset_per_node
+
+        self.gpus_per_rset = self.gpus_per_rset_per_node * self.nodes_in_rset
+        self.cores_per_rset = self.cores_per_rset_per_node * self.nodes_in_rset
+        self.procs_per_rset = self.procs_per_rset_per_node * self.nodes_in_rset

     @staticmethod
     def get_group_list(split_list, gpus_per_node=0, gpus_per_group=None):
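The renaming keeps the per-node quantities under *_per_rset_per_node and makes the unqualified *_per_rset names whole-resource-set totals, so a resource set spanning several nodes reports its full capacity. A toy illustration with invented numbers:

# Toy illustration of per-node vs. whole-resource-set quantities (invented numbers).
nodes_in_rset = 2            # nodes spanned by one resource set
gpus_per_rset_per_node = 4   # GPUs available to the set on each node
cores_per_rset_per_node = 32

gpus_per_rset = gpus_per_rset_per_node * nodes_in_rset    # 8 GPUs in the whole set
cores_per_rset = cores_per_rset_per_node * nodes_in_rset  # 64 cores in the whole set
print(gpus_per_rset, cores_per_rset)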

libensemble/resources/worker_resources.py

Lines changed: 1 addition & 1 deletion
@@ -273,7 +273,7 @@ def set_env_to_gpus(self, env_var=None, delimiter=","):
         """
         assert self.matching_slots, f"Cannot assign GPUs to non-matching slots per node {self.slots}"
         if self.doihave_gpus():
-            env_value = self.get_slots_as_string(multiplier=self.gpus_per_rset, limit=self.gen_ngpus)
+            env_value = self.get_slots_as_string(multiplier=self.gpus_per_rset_per_node, limit=self.gen_ngpus)
             if env_var is None:
                 if self.platform_info is not None:
                     if self.platform_info.get("gpu_setting_type") == "env":

libensemble/sim_funcs/var_resources.py

Lines changed: 1 addition & 1 deletion
@@ -279,7 +279,7 @@ def CUDA_variable_resources(H, _, sim_specs, libE_info):
     cores_per_node = resources.slot_count

     # Set to detected GPUs
-    # gpus_per_slot = resources.gpus_per_rset
+    # gpus_per_slot = resources.gpus_per_rset_per_node
     # resources.set_env_to_slots("CUDA_VISIBLE_DEVICES", multiplier=gpus_per_slot)
     # cores_per_node = resources.slot_count * gpus_per_slot  # One CPU per GPU


libensemble/tests/functionality_tests/test_mpi_runners.py

Lines changed: 3 additions & 3 deletions
@@ -196,12 +196,12 @@
     "jsrun -n 32 /path/to/fakeapp.x --testid base2",
     "jsrun -n 32 --xarg 1 /path/to/fakeapp.x --testid base3",
     "jsrun -n 128 --xarg 1 /path/to/fakeapp.x --testid base4",
-    "jsrun -n 16 --xarg 1 /path/to/fakeapp.x --testid base5",
+    "jsrun -n 16 -r 16 --xarg 1 /path/to/fakeapp.x --testid base5",
     "jsrun -n 16 -r 8 --xarg 1 /path/to/fakeapp.x --testid base6",
     "jsrun -n 16 --xarg 1 -r 16 /path/to/fakeapp.x --testid jsr1",
     "jsrun -n 8 --xarg 1 -r 4 /path/to/fakeapp.x --testid jsr2",
-    'jsrun -n 3 -a 1 -c 1 -g 1 --bind=packed:1 --smpiargs="-gpu" /path/to/fakeapp.x --testid jsr3',
-    'jsrun -n 3 -a 1 -c 1 -g 1 --bind=packed:1 --smpiargs="-gpu" /path/to/fakeapp.x --testid jsr4',
+    'jsrun -n 3 -r 3 -a 1 -c 1 -g 1 --bind=packed:1 --smpiargs="-gpu" /path/to/fakeapp.x --testid jsr3',
+    'jsrun -r 3 -n 3 -a 1 -c 1 -g 1 --bind=packed:1 --smpiargs="-gpu" /path/to/fakeapp.x --testid jsr4',
 ]

 exp_custom = [

libensemble/tests/unit_tests/test_executor_gpus.py

Lines changed: 25 additions & 14 deletions
@@ -118,8 +118,10 @@ def run_check(exp_env, exp_cmd, **kwargs):
         args_for_sim = "sleep 0"
         exp_runline = exp_cmd + " simdir/my_simtask.x sleep 0"
         task = exctr.submit(calc_type="sim", app_args=args_for_sim, dry_run=True, **kwargs)
-        assert task.env == exp_env, f"task.env does not match expected: {task.env}"
-        assert task.runline == exp_runline, f"exp_runline does not match expected: {task.runline}"
+        assert task.env == exp_env, f"Task env does not match expected:\n Received: {task.env}\n Expected: {exp_env}"
+        assert (
+            task.runline == exp_runline
+        ), f"Run line does not match expected.\n Received: {task.runline}\n Expected: {exp_runline}"

     return run_check

@@ -307,37 +309,46 @@ def test_dry_run_ngpus_srun_plat3_2nodes():
     run_check(exp_env, exp_cmd, num_procs=2, num_nodes=2, auto_assign_gpus=True)
     run_check(exp_env, exp_cmd, procs_per_node=1, auto_assign_gpus=True)

+    # restrict with num_gpus - too many, restrict to those available
+    exp_env = {"TESTING_VISIBLE_DEVICES": "0,1,2,3,4"}
+    run_check(exp_env, exp_cmd, procs_per_node=1, auto_assign_gpus=True, num_gpus=10)
+
     # auto_assign_gpus
     exp_env = {"TESTING_VISIBLE_DEVICES": "0,1,2,3,4,5"}
     exp_cmd = "srun -w node-1 --ntasks 1 --nodes 1 --ntasks-per-node 1 --exact"
     run_check(exp_env, exp_cmd, num_procs=1, auto_assign_gpus=True)

-    # restrict with num_gpus - too many, restrict to those available
+    # restrict with num_gpus - too many, restrict to those available (now honor num_procs=1)
     run_check(exp_env, exp_cmd, num_procs=1, auto_assign_gpus=True, num_gpus=10)
     run_check(exp_env, exp_cmd, num_procs=1, num_gpus=10)

-    exp_env = {"TESTING_VISIBLE_DEVICES": "0,1,2,3,4,5"}
-    exp_cmd = "srun -w node-1,node-2 --ntasks 2 --nodes 2 --ntasks-per-node 1 --exact"
-    run_check(exp_env, exp_cmd, procs_per_node=1, auto_assign_gpus=True)
+    exp_env = {"TESTING_VISIBLE_DEVICES": "0,1"}
+    exp_cmd = "srun -w node-1 --ntasks 2 --nodes 1 --ntasks-per-node 2 --exact"
+    run_check(exp_env, exp_cmd, num_procs=2, auto_assign_gpus=True, num_gpus=2)
+    run_check(exp_env, exp_cmd, num_procs=2, num_gpus=2)

-    # restrict with num_gpus
     exp_env = {"TESTING_VISIBLE_DEVICES": "0"}
     exp_cmd = "srun -w node-1,node-2 --ntasks 2 --nodes 2 --ntasks-per-node 1 --exact"
-    run_check(exp_env, exp_cmd, num_procs=2, auto_assign_gpus=True, num_gpus=2)
-    run_check(exp_env, exp_cmd, num_procs=2, num_gpus=2)
+    run_check(exp_env, exp_cmd, num_procs=2, procs_per_node=1, auto_assign_gpus=True, num_gpus=2)
+    run_check(exp_env, exp_cmd, num_procs=2, num_nodes=2, num_gpus=2)

     # match_procs_to_gpus
     exp_env = {"TESTING_VISIBLE_DEVICES": "0,1,2,3,4,5"}
     exp_cmd = "srun -w node-1,node-2 --ntasks 12 --nodes 2 --ntasks-per-node 6 --exact"
     run_check(exp_env, exp_cmd, match_procs_to_gpus=True, auto_assign_gpus=True)

-    exp_env = {"TESTING_VISIBLE_DEVICES": "0,1"}
-    exp_cmd = "srun -w node-1,node-2 --ntasks 4 --nodes 2 --ntasks-per-node 2 --exact"
+    exp_env = {"TESTING_VISIBLE_DEVICES": "0,1,2,3"}
+    exp_cmd = "srun -w node-1 --ntasks 4 --nodes 1 --ntasks-per-node 4 --exact"
     run_check(exp_env, exp_cmd, match_procs_to_gpus=True, num_gpus=4)

-    exp_env = {"TESTING_VISIBLE_DEVICES": "0"}
-    exp_cmd = "srun -w node-1,node-2 --ntasks 2 --nodes 2 --ntasks-per-node 1 --exact"
-    run_check(exp_env, exp_cmd, match_procs_to_gpus=True, num_gpus=3)
+    exp_env = {"TESTING_VISIBLE_DEVICES": "0,1,2,3"}
+    exp_cmd = "srun -w node-1,node-2 --ntasks 8 --nodes 2 --ntasks-per-node 4 --exact"
+    run_check(exp_env, exp_cmd, match_procs_to_gpus=True, num_gpus=8)
+    run_check(exp_env, exp_cmd, match_procs_to_gpus=True, num_gpus=7)
+
+    exp_env = {"TESTING_VISIBLE_DEVICES": "0,1"}
+    exp_cmd = "srun -w node-1,node-2 --ntasks 4 --nodes 2 --ntasks-per-node 2 --exact"
+    run_check(exp_env, exp_cmd, procs_per_node=2, num_gpus=4)


 if __name__ == "__main__":
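The updated srun expectations follow from the minimal-node rule, assuming the test platform exposes two nodes with six GPUs each (as the 12-task match_procs_to_gpus case implies): round the node count up from the GPU request, spread the GPUs evenly across those nodes, and launch one rank per visible GPU. A quick arithmetic check of that reading:

# Quick check of the srun expectations above (assumes 6 GPUs per node, 2 nodes,
# as implied by the 12-task match_procs_to_gpus case).
from math import ceil

def expected(ngpus, gpus_per_node_avail=6, total_nodes=2):
    nnodes = min(ceil(ngpus / gpus_per_node_avail), total_nodes)
    gpus_per_node = ceil(ngpus / nnodes)   # GPUs made visible on each node
    ntasks = gpus_per_node * nnodes        # one rank per visible GPU
    return nnodes, gpus_per_node, ntasks

assert expected(4) == (1, 4, 4)    # "--ntasks 4 --nodes 1", devices 0-3
assert expected(8) == (2, 4, 8)    # "--ntasks 8 --nodes 2", devices 0-3
assert expected(7) == (2, 4, 8)    # rounds up to the same layout as 8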

libensemble/tools/test_support.py

Lines changed: 4 additions & 2 deletions
@@ -204,7 +204,7 @@ def check_gpu_setting(task, assert_setting=True, print_setting=False, resources=

     # Get expected numbers
     if cmd_line:
-        expected_nums = _safe_min(wresources.slot_count * wresources.gpus_per_rset, wresources.gen_ngpus)
+        expected_nums = _safe_min(wresources.slot_count * wresources.gpus_per_rset_per_node, wresources.gen_ngpus)
         if gpus_per_task:
             stype = "runline option: gpus per task"
             expected_nums //= int(ppn)
@@ -219,7 +219,9 @@ def check_gpu_setting(task, assert_setting=True, print_setting=False, resources=
         gpu_setting = _get_opt_value(expected_setting, task.runline)
     else:
         stype = "Env var"
-        expected_nums = wresources.get_slots_as_string(multiplier=wresources.gpus_per_rset, limit=wresources.gen_ngpus)
+        expected_nums = wresources.get_slots_as_string(
+            multiplier=wresources.gpus_per_rset_per_node, limit=wresources.gen_ngpus
+        )
     expected_nums = expected_nums if _set_gpus(task, wresources) else None
     if expected_nums is not None:
         expected = {expected_setting: expected_nums}
