Skip to content

Commit 4c103d9

Browse files
committed
Merge branch 'develop' of https://github.com/Libensemble/libensemble into develop
2 parents f1fd08f + 394638d commit 4c103d9

12 files changed

Lines changed: 221 additions & 150 deletions

File tree

.github/workflows/extra.yml

Lines changed: 32 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -11,43 +11,43 @@ jobs:
1111
matrix:
1212
os: [ubuntu-latest]
1313
mpi-version: [mpich]
14-
python-version: ["3.10", "3.11", "3.12"]
15-
pydantic-version: ["2.8.2"]
14+
python-version: ['3.10', '3.11', '3.12', '3.13']
15+
pydantic-version: ['2.8.2']
1616
comms-type: [m, l]
1717
include:
1818
- os: macos-latest
19-
python-version: 3.11
19+
python-version: '3.13'
2020
mpi-version: mpich
21-
pydantic-version: "2.8.2"
21+
pydantic-version: '2.8.2'
2222
comms-type: m
2323
- os: macos-latest
24-
python-version: 3.11
24+
python-version: '3.13'
2525
mpi-version: mpich
26-
pydantic-version: "2.8.2"
26+
pydantic-version: '2.8.2'
2727
comms-type: l
2828
- os: ubuntu-latest
29-
python-version: "3.10"
29+
python-version: '3.12'
3030
mpi-version: mpich
31-
pydantic-version: "2.8.2"
31+
pydantic-version: '2.8.2'
3232
comms-type: t
3333
- os: ubuntu-latest
34-
mpi-version: "openmpi"
35-
pydantic-version: "2.8.2"
36-
python-version: "3.12"
34+
mpi-version: 'openmpi'
35+
pydantic-version: '2.8.2'
36+
python-version: '3.12'
3737
comms-type: l
3838
- os: ubuntu-latest
3939
mpi-version: mpich
40-
python-version: "3.10"
41-
pydantic-version: "1.10.17"
40+
python-version: '3.12'
41+
pydantic-version: '1.10.17'
4242
comms-type: m
4343
- os: ubuntu-latest
4444
mpi-version: mpich
45-
python-version: "3.10"
46-
pydantic-version: "1.10.17"
45+
python-version: '3.12'
46+
pydantic-version: '1.10.17'
4747
comms-type: l
4848

4949
env:
50-
HYDRA_LAUNCHER: "fork"
50+
HYDRA_LAUNCHER: 'fork'
5151
TERM: xterm-256color
5252
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
5353

@@ -61,7 +61,7 @@ jobs:
6161
uses: conda-incubator/setup-miniconda@v3
6262
with:
6363
activate-environment: condaenv
64-
miniconda-version: "latest"
64+
miniconda-version: 'latest'
6565
python-version: ${{ matrix.python-version }}
6666
channels: conda-forge
6767
channel-priority: flexible
@@ -75,8 +75,8 @@ jobs:
7575
- name: Install Ubuntu compilers
7676
if: matrix.os == 'ubuntu-latest'
7777
run: |
78-
conda install gcc_linux-64
79-
pip install nlopt==2.9.0
78+
conda install gcc_linux-64
79+
pip install nlopt==2.9.0
8080
8181
# Roundabout solution on macos for proper linking with mpicc
8282
- name: Install macOS compilers
@@ -93,22 +93,22 @@ jobs:
9393
run: |
9494
conda env update --file install/gen_deps_environment.yml
9595
96-
- name: Install gpcam
97-
if: matrix.python-version <= '3.13'
96+
- name: Install gpcam and octave # Neither yet supports 3.13
97+
if: matrix.python-version <= '3.12'
9898
run: |
9999
pip install gpcam
100+
conda install octave
100101
101-
- name: Install surmise
102+
- name: Install surmise and Tasmanian
102103
if: matrix.os == 'ubuntu-latest'
103104
run: |
104105
pip install --upgrade git+https://github.com/bandframework/surmise.git
106+
pip install Tasmanian --user
105107
106108
- name: Install generator dependencies for Ubuntu tests
107-
if: matrix.os == 'ubuntu-latest' && matrix.python-version != '3.12'
109+
if: matrix.os == 'ubuntu-latest' && matrix.python-version <= '3.12'
108110
run: |
109-
sudo apt-get install bc
110-
pip install -r install/ubuntu_no312.txt
111-
pip install Tasmanian --user
111+
pip install scikit-build packaging
112112
113113
- name: Install Balsam on Pydantic 1
114114
if: matrix.pydantic-version == '1.10.17'
@@ -120,24 +120,23 @@ jobs:
120120
121121
- name: Install other testing dependencies
122122
run: |
123-
conda install octave
124123
pip install -r install/testing_requirements.txt
125124
pip install -r install/misc_feature_requirements.txt
126125
source install/install_ibcdfo.sh
126+
conda install numpy scipy
127127
128128
- name: Install libEnsemble, flake8, lock environment
129129
run: |
130130
pip install pydantic==${{ matrix.pydantic-version }}
131131
pip install -e .
132132
flake8 libensemble
133133
134-
- name: Remove test for persistent Tasmanian, Surmise on Python 3.12
135-
if: matrix.python-version >= '3.12'
134+
- name: Remove test using octave, gpcam on Python 3.13
135+
if: matrix.python-version >= '3.13'
136136
run: |
137-
rm ./libensemble/tests/regression_tests/test_persistent_tasmanian.py
138-
rm ./libensemble/tests/regression_tests/test_persistent_tasmanian_async.py
139-
rm ./libensemble/tests/regression_tests/test_persistent_surmise_calib.py
140-
rm ./libensemble/tests/regression_tests/test_persistent_surmise_killsims.py
137+
rm ./libensemble/tests/regression_tests/test_persistent_fd_param_finder.py # needs octave, which doesn't yet support 3.13
138+
rm ./libensemble/tests/regression_tests/test_persistent_aposmm_external_localopt.py # needs octave, which doesn't yet support 3.13
139+
rm ./libensemble/tests/regression_tests/test_gpCAM.py # needs gpcam, which doesn't build on 3.13
141140
142141
- name: Install redis/proxystore on Pydantic 2
143142
if: matrix.pydantic-version == '2.8.2'

docs/platforms/aurora.rst

Lines changed: 69 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,16 @@ nodes.
1212
Configuring Python and Installation
1313
-----------------------------------
1414

15-
To obtain Python use::
15+
To obtain Python and create a virtual environment:
16+
17+
.. code-block:: console
1618
17-
module use /soft/modulefiles
1819
module load frameworks
20+
python -m venv /path/to-venv --system-site-packages
21+
. /path/to-venv/bin/activate
22+
23+
where ``/path/to-venv`` can be anywhere you have write access. For future sessions,
24+
just load the frameworks module and run the activate line.
1925

2026
To obtain libEnsemble::
2127

@@ -31,7 +37,7 @@ To run the :doc:`forces_gpu<../tutorials/forces_gpu_tutorial>` tutorial on
3137
Aurora.
3238

3339
To obtain the example you can git clone libEnsemble - although only
34-
the forces sub-directory is needed::
40+
the ``forces`` sub-directory is strictly needed::
3541

3642
git clone https://github.com/Libensemble/libensemble
3743
cd libensemble/libensemble/tests/scaling_tests/forces/forces_app
@@ -44,40 +50,57 @@ Now go to forces_gpu directory::
4450

4551
cd ../forces_gpu
4652

47-
To make use of all available GPUs, open ``run_libe_forces.py`` and adjust
48-
the exit_criteria to do more simulations. The following will do two
49-
simulations for each worker::
53+
To make use of all available GPUs, open **run_libe_forces.py** and adjust
54+
the ``exit_criteria`` to perform more simulations. The following will run two
55+
simulations for each worker:
56+
57+
.. code-block:: python
5058
5159
# Instruct libEnsemble to exit after this many simulations
5260
ensemble.exit_criteria = ExitCriteria(sim_max=nsim_workers*2)
5361
5462
Now grab an interactive session on two nodes (or use the batch script at
5563
``../submission_scripts/submit_pbs_aurora.sh``)::
5664

57-
qsub -A <myproject> -l select=2 -l walltime=15:00 -lfilesystems=home -q EarlyAppAccess -I
65+
qsub -A <myproject> -l select=2 -l walltime=15:00 -lfilesystems=home:flare -q debug -I
5866

5967
Once in the interactive session, you may need to return to your working directory and reactivate the virtual environment::
6068

6169
cd $PBS_O_WORKDIR
62-
module use /soft/modulefiles
63-
module load frameworks
70+
. /path/to-venv/bin/activate
6471

6572
Then in the session run::
6673

67-
python run_libe_forces.py --comms local --nworkers 13
74+
python run_libe_forces.py -n 13
6875

6976
This provides twelve workers for running simulations (one for each GPU across
7077
two nodes). An extra worker is added to run the persistent generator. The
7178
GPU settings for each worker simulation are printed.
7279

7380
Looking at ``libE_stats.txt`` will provide a summary of the runs.
7481

82+
Now try running::
83+
84+
./cleanup.sh
85+
python run_libe_forces.py -n 7
86+
87+
And you will see it runs with two cores and two GPUs are used per
88+
worker. The **forces** example automatically uses the GPUs available to
89+
each worker.
90+
91+
Live viewing GPU usage
92+
----------------------
93+
94+
To see GPU usage, SSH into one of your job's compute nodes from another window and run::
95+
96+
module load xpu-smi
97+
watch -n 0.1 xpu-smi dump -d -1 -m 0 -n 1
98+
7599
Using tiles as GPUs
76100
-------------------
77101

78-
If you wish to treat each tile as its own GPU, then add the *libE_specs*
79-
option ``use_tiles_as_gpus=True``, so the *libE_specs* block of
80-
``run_libe_forces.py`` becomes:
102+
To treat each tile as its own GPU, add the ``use_tiles_as_gpus=True`` option
103+
to the ``libE_specs`` block in **run_libe_forces.py**:
81104

82105
.. code-block:: python
83106
@@ -90,19 +113,45 @@ option ``use_tiles_as_gpus=True``, so the *libE_specs* block of
90113
Now you can run again but with twice the workers for running simulations (each
91114
will use one GPU tile)::
92115

93-
python run_libe_forces.py --comms local --nworkers 25
116+
python run_libe_forces.py -n 25
117+
118+
119+
Running generator on the manager
120+
--------------------------------
121+
122+
An alternative is to run the generator on a thread on the manager. The
123+
number of workers can then be set to the number of simulation workers.
124+
125+
Change the ``libE_specs`` in **run_libe_forces.py** as follows:
126+
127+
.. code-block:: python
128+
129+
nsim_workers = ensemble.nworkers
130+
131+
# Persistent gen does not need resources
132+
ensemble.libE_specs = LibeSpecs(
133+
gen_on_manager=True,
134+
135+
136+
Then we can run with 12 (instead of 13) workers::
137+
138+
python run_libe_forces.py -n 12
139+
140+
Dynamic resource assignment
141+
---------------------------
94142
95-
Note that the *forces* example will automatically use the GPUs available to
96-
each worker (with one MPI rank per GPU), so if fewer workers are provided,
97-
more than one GPU will be used per simulation.
143+
In the **forces** directory you will also find:
98144
99-
Also see ``forces_gpu_var_resources`` and ``forces_multi_app`` examples for
100-
cases that use varying processor/GPU counts per simulation.
145+
* ``forces_gpu_var_resources`` uses varying processor/GPU counts per simulation.
146+
* ``forces_multi_app`` uses varying processor/GPU counts per simulation and also
147+
uses two different user executables, one which is CPU-only and one which
148+
uses GPUs. This allows highly efficient use of nodes for multi-application
149+
ensembles.
101150
102151
Demonstration
103152
-------------
104153
105-
Note that a video demonstration_ of the *forces_gpu* example on *Frontier*
154+
Note that a video demonstration_ of the *forces_gpu* example on **Frontier**
106155
is also available. The workflow is identical when running on Aurora, with the
107156
exception of different compiler options and numbers of workers (because the
108157
number of GPUs on a node differs).

install/ubuntu_no312.txt

Lines changed: 0 additions & 4 deletions
This file was deleted.

libensemble/executors/mpi_executor.py

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -47,29 +47,32 @@ class MPIExecutor(Executor):
4747
information using the ``custom_info`` argument. This takes
4848
a dictionary of values.
4949
50-
The allowable fields are::
50+
The allowable fields are:
5151
52-
'mpi_runner' [string]:
53-
Select runner: 'mpich', 'openmpi', 'aprun', 'srun', 'jsrun', 'custom'
54-
All except 'custom' relate to runner classes in libEnsemble.
52+
.. parsed-literal::
53+
54+
**'mpi_runner'** [string]:
55+
Select runner: `'mpich'`, `'openmpi'`, `'aprun'`, `'srun'`, `'jsrun'`, `'custom'`
56+
All except `'custom'` relate to runner classes in libEnsemble.
5557
Custom allows users to define their own run-lines but without parsing
5658
arguments or making use of auto-resources.
57-
'runner_name' [string]:
58-
Runner name: Replaces run command if present. All runners have a default
59-
except for 'custom'.
60-
'subgroup_launch' [bool]:
59+
**'runner_name'** [string]:
60+
The literal string that appears at the front of the run command.
61+
This is typically 'mpirun', 'srun', etc., and can be a full path.
62+
Defaults exist for all runners except 'custom'.
63+
**'subgroup_launch'** [bool]:
6164
Whether MPI runs should be initiated in a new process group. This needs
6265
to be correct for kills to work correctly. Use the standalone test at
63-
libensemble/tests/standalone_tests/kill_test to determine correct value
66+
`libensemble/tests/standalone_tests/kill_test` to determine the correct value
6467
for a system.
6568
66-
For example::
69+
For example::
6770
68-
customizer = {'mpi_runner': 'mpich',
69-
'runner_name': 'wrapper -x mpich'}
71+
customizer = {'mpi_runner': 'mpich',
72+
'runner_name': 'wrapper -x mpich'}
7073
71-
from libensemble.executors.mpi_executor import MPIExecutor
72-
exctr = MPIExecutor(custom_info=customizer)
74+
from libensemble.executors.mpi_executor import MPIExecutor
75+
exctr = MPIExecutor(custom_info=customizer)
7376
7477
7578
"""
@@ -336,6 +339,9 @@ def submit(
336339
else:
337340
mpi_runner_obj = self.mpi_runner_obj or self._create_mpi_runner_from_attr()
338341

342+
if env_script is None and mpi_runner_obj is None:
343+
raise ExecutorException("No valid MPI runner was found")
344+
339345
mpi_specs = mpi_runner_obj.get_mpi_specs(
340346
task,
341347
num_procs,

libensemble/executors/mpi_runner.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,13 @@ def get_runner(mpi_runner_type, runner_name=None, platform_info=None):
2121
"msmpi": MSMPI_MPIRunner,
2222
"custom": MPIRunner,
2323
}
24-
mpi_runner = mpi_runners[mpi_runner_type]
25-
if runner_name is not None:
26-
runner = mpi_runner(run_command=runner_name, platform_info=platform_info)
27-
else:
28-
runner = mpi_runner(platform_info=platform_info)
24+
runner = None
25+
if mpi_runner_type is not None:
26+
mpi_runner = mpi_runners[mpi_runner_type]
27+
if runner_name is not None:
28+
runner = mpi_runner(run_command=runner_name, platform_info=platform_info)
29+
else:
30+
runner = mpi_runner(platform_info=platform_info)
2931
return runner
3032

3133
def __init__(self, run_command="mpiexec", platform_info=None):

libensemble/gen_funcs/persistent_ax_multitask.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,7 @@ def persistent_gp_mt_ax_gen_f(H, persis_info, gen_specs, libE_info):
305305
# Increase iteration counter.
306306
model_iteration += 1
307307

308-
return [], persis_info, FINISHED_PERSISTENT_GEN_TAG
308+
return None, persis_info, FINISHED_PERSISTENT_GEN_TAG
309309

310310

311311
class AxRunner(Runner):

0 commit comments

Comments
 (0)