Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
4b0e884
First round of changes for Polaris. Needs debugging and testing.
rickybalin Oct 28, 2022
012ecf0
Fixed bugs. Clustered and co-located DB tests run on Polaris.
rickybalin Oct 28, 2022
b0ecaca
Added cpu binding options with mpiexec
rickybalin Nov 2, 2022
b91c550
Merge remote-tracking branch 'upstream/develop' into develop
rickybalin Dec 21, 2022
5d2f82d
Correction to flag setting number of tasks for PalsMpiexecSettings
rickybalin Dec 21, 2022
dd67114
Removed mpiexecStep.py, no longer needed after merge with upstream Sm…
rickybalin Dec 21, 2022
a11464b
Merged with SmartSim upstream branch
rickybalin May 3, 2023
52ff300
Added option to specify affinity script to PALS mpiexec settings. Nee…
rickybalin May 3, 2023
f024ec4
Modified affinity script setting to include optional arguments
rickybalin Aug 30, 2023
5f90163
Merge branch 'develop' into develop
rickybalin Oct 12, 2023
f0fcf5c
Updated affinity script changes to have type defs and hints
rickybalin Oct 16, 2023
315009d
Added test for Pals affinity script option
rickybalin Oct 18, 2023
9f81b95
Merged with SmartSim develop official
Feb 21, 2024
8df7ead
Modified buildenv.py to take my fork of RedisAI which updates to C++ …
Feb 21, 2024
11bfb3d
Merge branch 'CrayLabs:develop' into develop
rickybalin Apr 26, 2024
6e56a70
Synced with SmartSim develop branch
rickybalin Jun 10, 2024
896a805
Merge pull request #1 from rickybalin/develop_full_sync
rickybalin Jun 10, 2024
6217a26
Added feature to pals settings to add any mpiexec argument
Oct 21, 2024
69d2ef2
Add a minitor flag to experiment start so can select which jobs to mo…
Feb 28, 2025
d981269
Clean up
Jul 2, 2025
f477030
Update docstrings and add test for set_launcher_args() in PALS settings
Jul 2, 2025
6327338
Fix type
Jul 2, 2025
7426dbc
Fix typo
Jul 2, 2025
058d0aa
Fix line length error
Jul 2, 2025
93d79d9
Formatting changes from make style
Jul 2, 2025
683d733
Make style
Jul 4, 2025
930ca5a
Merge remote-tracking branch 'origin/develop' into pr/rickybalin/788
Jul 4, 2025
b01cc78
Merge branch 'CrayLabs:develop' into feature/monitor_model
rickybalin Oct 9, 2025
55a02c0
Update changelog.md
rickybalin Oct 9, 2025
04a78c6
Add the new monitor parameter to the docstring of experiment.start()
rickybalin Oct 9, 2025
292a529
Fix format
rickybalin Oct 9, 2025
9826d08
Add monitor argument to start_wo_job_manager()
rickybalin Oct 9, 2025
c2c645b
Add monitor argument to launch_step_nop
rickybalin Oct 9, 2025
f97e5a7
Merge with upstream develop
rickybalin Mar 20, 2026
89147e1
Fix old typing
rickybalin Mar 20, 2026
70a4d5b
Fix typo
rickybalin Mar 20, 2026
1cdcddd
Removed ref to LaunchedManifest
rickybalin Mar 20, 2026
db0eed5
Fix call to _launch
rickybalin Mar 20, 2026
85efde8
minor fix
rickybalin Mar 20, 2026
9ef9c1b
Add monitor arg to restart_job
rickybalin Mar 20, 2026
a62e971
Fix typo in smartsim/experiment.py
rickybalin Mar 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions doc/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@ Description

Detailed Notes

- Enable control over monitoring of Models launched with `experiment.start()` by
adding an optional boolean `monitor` argument that determines whether the Models
being started are monitored. The argument defaults to `True`, so the default
behavior of monitoring all launched Models is unchanged.
([SmartSim-PR788](https://github.com/CrayLabs/SmartSim/pull/788))
- Updated tests which would create experiment in root directory, patched
tests which would not work on some Slurm systems, added an environment variable
to control how long to wait for Redis server to be available.
Expand Down
18 changes: 12 additions & 6 deletions smartsim/_core/control/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ def start(
manifest: Manifest,
block: bool = True,
kill_on_interrupt: bool = True,
monitor: bool = True,
) -> None:
"""Start the passed SmartSim entities

Expand All @@ -121,7 +122,7 @@ def start(
SignalInterceptionStack.get(signal.SIGINT).push_unique(
self._jobs.signal_interrupt
)
self._launch(exp_name, exp_path, manifest)
self._launch(exp_name, exp_path, manifest, monitor)

# start the job manager thread if not already started
if not self._jobs.actively_monitoring:
Expand Down Expand Up @@ -155,7 +156,7 @@ def poll(
:param kill_on_interrupt: flag for killing jobs when SIGINT is received
"""
self._jobs.kill_on_interrupt = kill_on_interrupt
to_monitor = self._jobs.jobs
to_monitor = self._jobs.monitor_jobs
while len(to_monitor) > 0:
time.sleep(interval)

Expand Down Expand Up @@ -370,7 +371,9 @@ def symlink_output_files(
"Symlinking files failed."
)

def _launch(self, _exp_name: str, exp_path: str, manifest: Manifest) -> None:
def _launch(
self, _exp_name: str, exp_path: str, manifest: Manifest, monitor: bool = True
) -> None:
"""Main launching function of the controller

Orchestrators are always launched first so that the
Expand All @@ -379,6 +382,7 @@ def _launch(self, _exp_name: str, exp_path: str, manifest: Manifest) -> None:
:param exp_name: The name of the launching experiment
:param exp_path: path to location of ``Experiment`` directory if generated
:param manifest: Manifest of deployables to launch
:param monitor: boolean to signal whether to monitor deployables
"""

# Create a unique timestamp for this launch to ensure unique metadata
Expand Down Expand Up @@ -454,7 +458,7 @@ def _launch(self, _exp_name: str, exp_path: str, manifest: Manifest) -> None:

# launch and symlink steps
for step, entity in steps:
self._launch_step(step, entity)
self._launch_step(step, entity, monitor)
self.symlink_output_files(step, entity)

# symlink substeps to maintain directory structure
Expand Down Expand Up @@ -533,11 +537,13 @@ def _launch_step(
self,
job_step: Step,
entity: SmartSimEntity | EntitySequence[SmartSimEntity],
monitor: bool = True,
) -> None:
"""Use the launcher to launch a job step

:param job_step: a job step instance
:param entity: entity instance
:param monitor: boolean determining whether to monitor job
:raises SmartSimError: if launch fails
"""
# attempt to retrieve entity name in JobManager.completed
Expand Down Expand Up @@ -582,10 +588,10 @@ def _launch_step(

if self._jobs.query_restart(entity.name):
logger.debug(f"Restarting {entity.name}")
self._jobs.restart_job(job_step.name, job_id, entity.name, is_task)
self._jobs.restart_job(job_step.name, job_id, entity.name, is_task, monitor)
else:
logger.debug(f"Launching {entity.name}")
self._jobs.add_job(job_step.name, job_id, entity, is_task)
self._jobs.add_job(job_step.name, job_id, entity, is_task, monitor)

def _create_batch_job_step(
self,
Expand Down
11 changes: 11 additions & 0 deletions smartsim/_core/control/jobmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ def __init__(self, lock: RLock, launcher: Launcher | None = None) -> None:

# active jobs
self.jobs: dict[str, Job] = {}
self.monitor_jobs: dict[str, Job] = {}
self.db_jobs: dict[str, Job] = {}

# completed jobs
Expand Down Expand Up @@ -132,6 +133,8 @@ def move_to_completed(self, job: Job) -> None:
del self.db_jobs[job.ename]
elif job.ename in self.jobs:
del self.jobs[job.ename]
if job.ename in self.monitor_jobs:
del self.monitor_jobs[job.ename]

def __getitem__(self, entity_name: str) -> Job:
"""Return the job associated with the name of the entity
Expand Down Expand Up @@ -165,12 +168,14 @@ def add_job(
job_id: str | None,
entity: SmartSimEntity | EntitySequence[SmartSimEntity],
is_task: bool = True,
monitor: bool = True,
) -> None:
"""Add a job to the job manager which holds specific jobs by type.

:param job_name: name of the job step
:param job_id: job step id created by launcher
:param entity: entity that was launched on job step
:param monitor: boolean to monitor job
:param is_task: process monitored by TaskManager (True) or the WLM (False)
"""
launcher = str(self._launcher)
Expand All @@ -180,6 +185,8 @@ def add_job(
self.db_jobs[entity.name] = job
else:
self.jobs[entity.name] = job
if monitor:
self.monitor_jobs[entity.name] = job

def is_finished(self, entity: SmartSimEntity) -> bool:
"""Detect if a job has completed
Expand Down Expand Up @@ -264,6 +271,7 @@ def restart_job(
job_id: str | None,
entity_name: str,
is_task: bool = True,
monitor: bool = True,
) -> None:
"""Function to reset a job to record history and be
ready to launch again.
Expand All @@ -272,6 +280,7 @@ def restart_job(
:param job_id: new job id
:param entity_name: name of the entity of the job
:param is_task: process monitored by TaskManager (True) or the WLM (False)
:param monitor: boolean to monitor job

"""
with self._lock:
Expand All @@ -283,6 +292,8 @@ def restart_job(
self.db_jobs[entity_name] = job
else:
self.jobs[entity_name] = job
if monitor:
self.monitor_jobs[entity_name] = job

def get_db_host_addresses(self) -> dict[str, list[str]]:
"""Retrieve the list of hosts for the database
Expand Down
7 changes: 7 additions & 0 deletions smartsim/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ def start(
block: bool = True,
summary: bool = False,
kill_on_interrupt: bool = True,
monitor: bool = True,
) -> None:
"""Start passed instances using Experiment launcher

Expand Down Expand Up @@ -205,11 +206,16 @@ def start(
that all jobs launched by this experiment will be killed, and the
zombie processes will need to be manually killed.

If `monitor=True`, all the jobs being started will be monitored
by the Controller. If `monitor=False`, the jobs will not be
monitored, meaning that their status will not be reported.

:param block: block execution until all non-database
jobs are finished
:param summary: print a launch summary prior to launch
:param kill_on_interrupt: flag for killing jobs when ^C (SIGINT)
signal is received.
:param monitor: monitor the jobs being started
"""
start_manifest = Manifest(*args)
self._create_entity_dir(start_manifest)
Expand All @@ -222,6 +228,7 @@ def start(
manifest=start_manifest,
block=block,
kill_on_interrupt=kill_on_interrupt,
monitor=monitor,
)
except SmartSimError as e:
logger.error(e)
Expand Down
10 changes: 10 additions & 0 deletions smartsim/settings/palsSettings.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,16 @@ def set_broadcast(self, dest_path: str | None = None) -> None:
)
self.run_args["transfer"] = None

def set_launcher_args(
self, arguments: t.Dict[str, t.Union[int, str, float, None]]
) -> None:
"""Set any other task launcher argument

:param arguments: dictionary with string name and value
"""
for name, value in arguments.items():
self.run_args[name] = value

def set_walltime(self, walltime: str) -> None:
"""Set the maximum number of seconds that a job will run

Expand Down
10 changes: 8 additions & 2 deletions tests/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,18 @@ def _monkeypatch_exp_controller(exp):
entity_steps = []

def start_wo_job_manager(
self, exp_name, exp_path, manifest, block=True, kill_on_interrupt=True
self,
exp_name,
exp_path,
manifest,
block=True,
kill_on_interrupt=True,
monitor=True,
):
self._launch(exp_name, exp_path, manifest)
return None

def launch_step_nop(self, step, entity):
def launch_step_nop(self, step, entity, monitor):
entity_steps.append((step, entity))

monkeypatch.setattr(
Expand Down
6 changes: 6 additions & 0 deletions tests/test_pals_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,12 @@
# func(None)


def test_set_launcher_args():
    """Check that extra launcher arguments appear in the formatted run args.

    An argument with a value is expected as a flag followed by that value;
    an argument given an empty string is expected as a bare flag.
    """
    extra_args = {"mem-bind": "none", "line-buffer": ""}
    expected_run_args = ["--mem-bind", "none", "--line-buffer"]

    pals_settings = PalsMpiexecSettings(default_exe, **default_kwargs)
    pals_settings.set_launcher_args(extra_args)

    assert pals_settings.format_run_args() == expected_run_args


def test_affinity_script():
settings = PalsMpiexecSettings(default_exe, **default_kwargs)
settings.set_gpu_affinity_script("/path/to/set_affinity_gpu.sh", 1, 2)
Expand Down
Loading