Skip to content

Commit 8c09098

Browse files
authored
Merge pull request #1539 from Libensemble/refactor/bebop_scripts
Refactor / update Bebop submission scripts
2 parents 53e29d6 + d946e90 commit 8c09098

4 files changed

Lines changed: 103 additions & 36 deletions

File tree

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,14 @@
1+
#!/bin/bash -l
#PBS -l select=4
#PBS -l walltime=00:15:00
#PBS -q bdwall
#PBS -A [project]
#PBS -N libE_example

# PBS submission script: runs run_libe_example.py from the directory the job
# was submitted from, passing "-n 16" to the script.
# Replace the "[project]" placeholder above with your allocation before
# submitting.
# NOTE(review): the "-l" (login shell) flag on the shebang is presumably
# needed so that the `module` command is available - confirm for your system.

# Move to the submission directory. Abort if the variable is missing or the
# directory is unreachable; otherwise the run would silently start in the
# wrong place and fail to find the calling script.
cd "${PBS_O_WORKDIR:?PBS_O_WORKDIR is not set; submit this script via qsub}" || exit 1

# Choose MPI backend. Note that the built mpi4py in your environment should match.
module load oneapi/mpi
# module load openmpi

python run_libe_example.py -n 16

examples/libE_submission_scripts/bebop_submit_pbs_distrib.sh

Lines changed: 0 additions & 12 deletions
This file was deleted.

examples/libE_submission_scripts/bebop_submit_slurm_central.sh

Lines changed: 0 additions & 24 deletions
This file was deleted.
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
#!/bin/bash
#SBATCH -J libE_test
#SBATCH -N 4
#SBATCH -p [queue]
#SBATCH -A <my_project>
#SBATCH -o tlib.%j.%N.out
#SBATCH -e tlib.%j.%N.error
#SBATCH -t 01:00:00

# Launch script that runs in distributed mode with mpi4py.
# Workers are evenly spread over nodes and the manager is added to the first node.
# Requires even distribution - either multiple workers per node or nodes per worker.
# Option for manager to have a dedicated node.
# Use of MPI Executor will ensure tasks are co-located with their workers.
# If node_list file is kept, this informs libE of resources. Else, libE auto-detects.
#
# Files written to the working directory:
#   node_list                - one allocated host per line
#                              (removed again if USE_NODE_LIST=false)
#   machinefile.$SLURM_JOBID - one line per MPI rank, manager first; handed to
#                              srun through SLURM_HOSTFILE

# User to edit these variables
export EXE=libE_calling_script.py
export NUM_WORKERS=4
export MANAGER_NODE=false # true = Manager has a dedicated node (assign one extra)
export USE_NODE_LIST=true # If false, allow libE to determine node_list from environment.

# As libE shares nodes with user applications allow fallback if contexts overrun.
unset I_MPI_FABRICS
export I_MPI_FABRICS_LIST=tmi,tcp
export I_MPI_FALLBACK=1

# If using in calling script (After N mins manager kills workers and exits cleanly)
export LIBE_WALLCLOCK=55

#-----------------------------------------------------------------------------
# Print an error message to stderr and abort the job script.
die() {
  printf 'Error: %s\n' "$*" >&2
  exit 1
}

# These are provided by Slurm inside an allocation; stop early if absent.
: "${SLURM_NNODES:?not set - this script must run inside a Slurm allocation}"
: "${SLURM_JOBID:?not set - this script must run inside a Slurm allocation}"

[[ $NUM_WORKERS =~ ^[1-9][0-9]*$ ]] \
  || die "NUM_WORKERS must be a positive integer (got '$NUM_WORKERS')"

#-----------------------------------------------------------------------------
# Work out distribution
if [[ $MANAGER_NODE == "true" ]]; then
  WORKER_NODES=$((SLURM_NNODES - 1))
else
  WORKER_NODES=$SLURM_NNODES
fi

# Guards the divisions below against a zero divisor.
((WORKER_NODES > 0)) \
  || die "no nodes left for workers (nodes=$SLURM_NNODES, MANAGER_NODE=$MANAGER_NODE)"

if ((NUM_WORKERS >= WORKER_NODES)); then
  SUB_NODE_WORKERS=true
  # An uneven split would give a machinefile with fewer entries than tasks,
  # which "srun -m arbitrary" cannot satisfy.
  ((NUM_WORKERS % WORKER_NODES == 0)) \
    || die "NUM_WORKERS ($NUM_WORKERS) is not a multiple of worker nodes ($WORKER_NODES)"
  WORKERS_PER_NODE=$((NUM_WORKERS / WORKER_NODES))
else
  SUB_NODE_WORKERS=false
  if ((WORKER_NODES % NUM_WORKERS != 0)); then
    printf 'Warning: %s worker nodes do not divide evenly among %s workers; some nodes will be left over.\n' \
      "$WORKER_NODES" "$NUM_WORKERS" >&2
  fi
  NODES_PER_WORKER=$((WORKER_NODES / NUM_WORKERS))
fi
#-----------------------------------------------------------------------------

# A little useful information
echo "Manager process running on: $HOSTNAME"
echo "Directory is: $PWD"

machinefile="machinefile.${SLURM_JOBID}"

# Generate a node list with 1 node per line:
srun hostname | sort -u > node_list
[[ -s node_list ]] || die "could not obtain a node list from 'srun hostname'"

# Add manager node to machinefile
head -n 1 node_list > "$machinefile"

# With a dedicated manager node, worker placement starts from the second
# node so that no worker shares the manager's node. node_list itself is left
# untouched.
if [[ $MANAGER_NODE == "true" ]]; then
  first_worker_line=2
else
  first_worker_line=1
fi

# Add worker nodes to machinefile
if [[ $SUB_NODE_WORKERS == "true" ]]; then
  # Several workers per node: repeat each node name WORKERS_PER_NODE times.
  tail -n +"$first_worker_line" node_list \
    | awk -v repeat="$WORKERS_PER_NODE" '{ for (i = 0; i < repeat; i++) print }' \
      >> "$machinefile"
else
  # Several nodes per worker: take the first node of each group. Written as
  # (NR - 1) % patt == 0 so that a group size of 1 still selects every node.
  tail -n +"$first_worker_line" node_list \
    | awk -v patt="$NODES_PER_WORKER" '(NR - 1) % patt == 0' \
      >> "$machinefile"
fi

if [[ $USE_NODE_LIST == "false" ]]; then
  rm -- node_list
fi

# Put in a timestamp
echo "Starting execution at: $(date)"

# To use srun
export SLURM_HOSTFILE="$machinefile"

# The "arbitrary" flag should ensure SLURM_HOSTFILE is picked up
# cmd=(srun --ntasks "$((NUM_WORKERS + 1))" -m arbitrary python "$EXE")
# Held as an array so each argument is passed to srun exactly as written.
cmd=(srun --ntasks "$((NUM_WORKERS + 1))" -m arbitrary python "$EXE" "$LIBE_WALLCLOCK")

echo "The command is: ${cmd[*]}"
echo "End Slurm script information."
printf 'All further output is from the process being run and not the script.\n\n'

"${cmd[@]}"

# Print the date again -- when finished
echo "Finished at: $(date)"

0 commit comments

Comments
 (0)