-
Notifications
You must be signed in to change notification settings - Fork 41
Expand file tree
/
Copy pathexperiment.py
More file actions
945 lines (794 loc) · 35.5 KB
/
experiment.py
File metadata and controls
945 lines (794 loc) · 35.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
# BSD 2-Clause License
#
# Copyright (c) 2021-2025, Hewlett Packard Enterprise
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# pylint: disable=too-many-lines
import os
import os.path as osp
import typing as t
from os import environ, getcwd
from tabulate import tabulate
from smartsim._core.config import CONFIG
from smartsim.error.errors import SSUnsupportedError
from smartsim.status import SmartSimStatus
from ._core import Controller, Generator, Manifest, previewrenderer
from .database import Orchestrator
from .entity import Ensemble, EntitySequence, Model, SmartSimEntity
from .error import SmartSimError
from .log import ctx_exp_path, get_logger, method_contextualizer
from .settings import Container, base, settings
from .wlm import detect_launcher
# Module-level logger shared by everything in this file
logger = get_logger(__name__)


def _exp_path_map(exp: "Experiment") -> str:
    """Return the path of the given experiment.

    Used as the mapping function handed to ``method_contextualizer`` so the
    path of the currently-executing experiment can be placed into context
    for log enrichment.

    :param exp: the experiment whose path should be exposed
    :return: the value of ``exp.exp_path``
    """
    return exp.exp_path


# Decorator applied to ``Experiment`` methods; built from the experiment-path
# context variable and the mapping function above
_contextualize = method_contextualizer(ctx_exp_path, _exp_path_map)
# pylint: disable=no-self-use
class Experiment:
    """Factory that creates the stages of a workflow and manages their execution.

    The instances created by an Experiment represent executable code
    that is either user-specified, like the ``Model`` instance created
    by ``Experiment.create_model``, or pre-configured, like the ``Orchestrator``
    instance created by ``Experiment.create_database``.

    Experiment methods that accept a variable list of arguments, such as
    ``Experiment.start`` or ``Experiment.stop``, accept any number of the
    instances created by the Experiment.

    In general, the Experiment class is designed to be initialized once
    and utilized throughout runtime.
    """

    def __init__(
        self,
        name: str,
        exp_path: str | None = None,
        launcher: str = "local",
    ):
        """Initialize an Experiment instance.

        With the default settings, the Experiment will use the
        local launcher, which will start all Experiment created
        instances on the localhost.

        Example of initializing an Experiment with the local launcher

        .. highlight:: python
        .. code-block:: python

            exp = Experiment(name="my_exp", launcher="local")

        SmartSim supports multiple launchers which also can be specified
        based on the type of system you are running on.

        .. highlight:: python
        .. code-block:: python

            exp = Experiment(name="my_exp", launcher="slurm")

        If you want your Experiment driver script to be run across
        multiple systems with different schedulers (workload managers)
        you can also use the `auto` argument to have the Experiment detect
        which launcher to use based on system installed binaries and libraries.

        .. highlight:: python
        .. code-block:: python

            exp = Experiment(name="my_exp", launcher="auto")

        If no ``exp_path`` is given, the Experiment path defaults to a
        directory named after the Experiment inside the current working
        directory. A provided ``exp_path`` must already exist as a directory.

        :param name: name for the ``Experiment``
        :param exp_path: path to location of ``Experiment`` directory
        :param launcher: type of launcher being used, options are "slurm", "pbs",
                         "dragon", "sge", or "local". If set to "auto",
                         an attempt will be made to find an available launcher
                         on the system. The value is matched case-insensitively.
        :raises TypeError: if ``exp_path`` is provided but is not a ``str``
        :raises NotADirectoryError: if ``exp_path`` is not an existing directory
        :raises SSUnsupportedError: if the (possibly detected) launcher is "cobalt"
        """
        self.name = name

        if exp_path:
            if not isinstance(exp_path, str):
                raise TypeError("exp_path argument was not of type str")
            exp_path = osp.abspath(exp_path)
            if not osp.isdir(exp_path):
                raise NotADirectoryError("Experiment path provided does not exist")
        else:
            exp_path = osp.join(getcwd(), name)

        self.exp_path = exp_path

        self._launcher = launcher.lower()
        if self._launcher == "auto":
            self._launcher = detect_launcher()
        if self._launcher == "cobalt":
            raise SSUnsupportedError("Cobalt launcher is no longer supported.")

        # Test the normalized launcher (lower-cased, and resolved when "auto"
        # was requested) rather than the raw argument, so that e.g. "Dragon"
        # is handled the same way as "dragon".
        if self._launcher == "dragon":
            self._set_dragon_server_path()

        self._control = Controller(launcher=self._launcher)

        # Identifiers of every database requested through ``create_database``
        self.db_identifiers: set[str] = set()

    def _set_dragon_server_path(self) -> None:
        """Set the path for the dragon server through environment variables

        When ``SMARTSIM_DRAGON_SERVER_PATH`` is not already present in the
        environment, ``SMARTSIM_DRAGON_SERVER_PATH_EXP`` is set to the
        ``CONFIG.dragon_default_subdir`` directory inside the Experiment path.
        """
        if "SMARTSIM_DRAGON_SERVER_PATH" not in environ:
            environ["SMARTSIM_DRAGON_SERVER_PATH_EXP"] = osp.join(
                self.exp_path, CONFIG.dragon_default_subdir
            )

    @_contextualize
    def start(
        self,
        *args: SmartSimEntity | EntitySequence[SmartSimEntity],
        block: bool = True,
        summary: bool = False,
        kill_on_interrupt: bool = True,
        monitor: bool = True,
    ) -> None:
        """Start passed instances using Experiment launcher

        Any instance ``Model``, ``Ensemble`` or ``Orchestrator``
        instance created by the Experiment can be passed as
        an argument to the start method.

        .. highlight:: python
        .. code-block:: python

            exp = Experiment(name="my_exp", launcher="slurm")
            settings = exp.create_run_settings(exe="./path/to/binary")
            model = exp.create_model("my_model", settings)
            exp.start(model)

        Multiple entity instances can also be passed to the start method
        at once no matter which type of instance they are. These will
        all be launched together.

        .. highlight:: python
        .. code-block:: python

            exp.start(model_1, model_2, db, ensemble, block=True)
            # alternatively
            stage_1 = [model_1, model_2, db, ensemble]
            exp.start(*stage_1, block=True)

        If `block==True` the Experiment will poll the launched instances
        at runtime until all non-database jobs have completed. Database
        jobs *must* be killed by the user by passing them to
        ``Experiment.stop``. This allows for multiple stages of a workflow
        to produce to and consume from the same Orchestrator database.

        If `kill_on_interrupt=True`, then all jobs launched by this
        experiment are guaranteed to be killed when ^C (SIGINT) signal is
        received. If `kill_on_interrupt=False`, then it is not guaranteed
        that all jobs launched by this experiment will be killed, and the
        zombie processes will need to be manually killed.

        If `monitor=True`, all the jobs being started will be monitored
        by the Controller. If `monitor=False`, the jobs will not be
        monitored, meaning that their status will not be reported.

        The directory of every entity in ``args`` is created (if missing)
        before anything is launched.

        :param args: the entities to launch
        :param block: block execution until all non-database
                      jobs are finished
        :param summary: print a launch summary prior to launch
        :param kill_on_interrupt: flag for killing jobs when ^C (SIGINT)
                                  signal is received.
        :param monitor: monitor the jobs being started
        :raises SmartSimError: if the launch fails (the error is logged
                               and re-raised)
        """
        start_manifest = Manifest(*args)
        self._create_entity_dir(start_manifest)
        try:
            if summary:
                self._launch_summary(start_manifest)
            self._control.start(
                exp_name=self.name,
                exp_path=self.exp_path,
                manifest=start_manifest,
                block=block,
                kill_on_interrupt=kill_on_interrupt,
                monitor=monitor,
            )
        except SmartSimError as e:
            logger.error(e)
            raise

    @_contextualize
    def stop(self, *args: SmartSimEntity | EntitySequence[SmartSimEntity]) -> None:
        """Stop specific instances launched by this ``Experiment``

        Instances of ``Model``, ``Ensemble`` and ``Orchestrator``
        can all be passed as arguments to the stop method.

        Whichever launcher was specified at Experiment initialization
        will be used to stop the instance. For example, when using
        the slurm launcher, this equates to running `scancel` on the
        instance.

        Example

        .. highlight:: python
        .. code-block:: python

            exp.stop(model)
            # multiple
            exp.stop(model_1, model_2, db, ensemble)

        :param args: One or more SmartSimEntity or EntitySequence objects.
        :raises TypeError: if wrong type
        :raises SmartSimError: if stop request fails
        """
        stop_manifest = Manifest(*args)
        try:
            # Models first, then ensembles, then databases
            for entity in stop_manifest.models:
                self._control.stop_entity(entity)
            for entity_list in stop_manifest.ensembles:
                self._control.stop_entity_list(entity_list)
            for db in stop_manifest.dbs:
                self._control.stop_db(db)
        except SmartSimError as e:
            logger.error(e)
            raise

    @_contextualize
    def generate(
        self,
        *args: SmartSimEntity | EntitySequence[SmartSimEntity],
        tag: str | None = None,
        overwrite: bool = False,
        verbose: bool = False,
    ) -> None:
        """Generate the file structure for an ``Experiment``

        ``Experiment.generate`` creates directories for each entity
        passed to organize Experiments that launch many entities.

        If files or directories are attached to ``Model`` objects
        using ``Model.attach_generator_files()``, those files or
        directories will be symlinked, copied, or configured and
        written into the created directory for that instance.

        Instances of ``Model``, ``Ensemble`` and ``Orchestrator``
        can all be passed as arguments to the generate method.

        :param args: the entities to generate directories for
        :param tag: tag used in `to_configure` generator files
        :param overwrite: overwrite existing folders and contents
        :param verbose: log parameter settings to std out
        :raises SmartSimError: if generation fails (the error is logged
                               and re-raised)
        """
        try:
            generator = Generator(self.exp_path, overwrite=overwrite, verbose=verbose)
            if tag:
                generator.set_tag(tag)
            generator.generate_experiment(*args)
        except SmartSimError as e:
            logger.error(e)
            raise

    @_contextualize
    def poll(
        self, interval: int = 10, verbose: bool = True, kill_on_interrupt: bool = True
    ) -> None:
        """Monitor jobs through logging to stdout.

        This method should only be used if jobs were launched
        with ``Experiment.start(block=False)``

        The interval specified will control how often the
        logging is performed, not how often the polling occurs.
        By default, internal polling is set to every second for
        local launcher jobs and every 10 seconds for all other
        launchers.

        If internal polling needs to be slower or faster based on
        system or site standards, set the ``SMARTSIM_JM_INTERNAL``
        environment variable to control the internal polling interval
        for SmartSim.

        For more verbose logging output, the ``SMARTSIM_LOG_LEVEL``
        environment variable can be set to `debug`

        If `kill_on_interrupt=True`, then all jobs launched by this
        experiment are guaranteed to be killed when ^C (SIGINT) signal is
        received. If `kill_on_interrupt=False`, then it is not guaranteed
        that all jobs launched by this experiment will be killed, and the
        zombie processes will need to be manually killed.

        :param interval: frequency (in seconds) of logging to stdout
        :param verbose: set verbosity
        :param kill_on_interrupt: flag for killing jobs when SIGINT is received
        :raises SmartSimError: if poll request fails
        """
        try:
            self._control.poll(interval, verbose, kill_on_interrupt=kill_on_interrupt)
        except SmartSimError as e:
            logger.error(e)
            raise

    @_contextualize
    def finished(self, entity: SmartSimEntity) -> bool:
        """Query if a job has completed.

        An instance of ``Model`` or ``Ensemble`` can be passed
        as an argument.

        Passing ``Orchestrator`` will return an error as a
        database deployment is never finished until stopped
        by the user.

        :param entity: object launched by this ``Experiment``
        :returns: True if the job has finished, False otherwise
        :raises SmartSimError: if entity has not been launched
                               by this ``Experiment``
        """
        try:
            return self._control.finished(entity)
        except SmartSimError as e:
            logger.error(e)
            raise

    @_contextualize
    def get_status(
        self, *args: SmartSimEntity | EntitySequence[SmartSimEntity]
    ) -> list[SmartSimStatus]:
        """Query the status of launched entity instances

        Return the ``SmartSimStatus`` of each launched instance
        passed as an argument.

        .. highlight:: python
        .. code-block:: python

            exp.get_status(model)

        As with other Experiment methods, multiple instances of
        varying types can be passed and all statuses will
        be returned at once. Statuses of individual models come first,
        followed by the statuses of the members of each entity list.

        .. highlight:: python
        .. code-block:: python

            statuses = exp.get_status(model, ensemble, orchestrator)
            complete = [s == smartsim.status.STATUS_COMPLETED for s in statuses]
            assert all(complete)

        :param args: the entities whose status should be queried
        :returns: status of the instances passed as arguments
        :raises SmartSimError: if status retrieval fails
        """
        try:
            manifest = Manifest(*args)
            statuses: list[SmartSimStatus] = [
                self._control.get_entity_status(model) for model in manifest.models
            ]
            for entity_list in manifest.all_entity_lists:
                statuses.extend(self._control.get_entity_list_status(entity_list))
            return statuses
        except SmartSimError as e:
            logger.error(e)
            raise

    @_contextualize
    def create_ensemble(
        self,
        name: str,
        params: dict[str, t.Any] | None = None,
        batch_settings: base.BatchSettings | None = None,
        run_settings: base.RunSettings | None = None,
        replicas: int | None = None,
        perm_strategy: str = "all_perm",
        path: str | None = None,
        **kwargs: t.Any,
    ) -> Ensemble:
        """Create an ``Ensemble`` of ``Model`` instances

        Ensembles can be launched sequentially or as a batch
        if using a non-local launcher. e.g. slurm

        Ensembles require one of the following combinations
        of arguments:

            - ``run_settings`` and ``params``
            - ``run_settings`` and ``replicas``
            - ``batch_settings``
            - ``batch_settings``, ``run_settings``, and ``params``
            - ``batch_settings``, ``run_settings``, and ``replicas``

        If given solely batch settings, an empty ensemble
        will be created that Models can be added to manually
        through ``Ensemble.add_model()``.
        The entire Ensemble will launch as one batch.

        Provided batch and run settings, either ``params``
        or ``replicas`` must be passed and the entire ensemble
        will launch as a single batch.

        Provided solely run settings, either ``params``
        or ``replicas`` must be passed and the Ensemble members
        will each launch sequentially.

        The kwargs argument can be used to pass custom input
        parameters to the permutation strategy.

        :param name: name of the ``Ensemble``
        :param params: parameters to expand into ``Model`` members
        :param batch_settings: describes settings for ``Ensemble`` as batch workload
        :param run_settings: describes how each ``Model`` should be executed
        :param replicas: number of replicas to create
        :param perm_strategy: strategy for expanding ``params`` into
                              ``Model`` instances from params argument
                              options are "all_perm", "step", "random"
                              or a callable function.
        :param path: path to the ``Ensemble`` directory; defaults to a
                     directory named ``name`` inside the Experiment path
        :param kwargs: extra keyword arguments forwarded to ``Ensemble``
        :raises AttributeError: if ``name`` is None
        :raises SmartSimError: if initialization fails
        :return: ``Ensemble`` instance
        """
        if name is None:
            raise AttributeError("Entity has no name. Please set name attribute.")

        entity_path: str = osp.abspath(path or osp.join(self.exp_path, name))

        try:
            return Ensemble(
                name=name,
                params=params or {},
                path=entity_path,
                batch_settings=batch_settings,
                run_settings=run_settings,
                perm_strat=perm_strategy,
                replicas=replicas,
                **kwargs,
            )
        except SmartSimError as e:
            logger.error(e)
            raise

    @_contextualize
    def create_model(
        self,
        name: str,
        run_settings: base.RunSettings,
        params: dict[str, t.Any] | None = None,
        path: str | None = None,
        enable_key_prefixing: bool = False,
        batch_settings: base.BatchSettings | None = None,
    ) -> Model:
        """Create a general purpose ``Model``

        The ``Model`` class is the most general encapsulation of
        executable code in SmartSim. ``Model`` instances are named
        references to pieces of a workflow that can be parameterized,
        and executed.

        ``Model`` instances can be launched sequentially, as a batch job,
        or as a group by adding them into an ``Ensemble``.

        All ``Models`` require a reference to run settings to specify which
        executable to launch as well provide options for how to launch
        the executable with the underlying WLM. Furthermore, a
        reference to batch settings can be added to launch the ``Model``
        as a batch job through ``Experiment.start``. If a ``Model`` with
        a reference to a set of batch settings is added to a larger
        entity with its own set of batch settings (for e.g. an
        ``Ensemble``) the batch settings of the larger entity will take
        precedence and the batch setting of the ``Model`` will be
        strategically ignored.

        Parameters supplied in the `params` argument can be written into
        configuration files supplied at runtime to the ``Model`` through
        ``Model.attach_generator_files``. `params` can also be turned
        into executable arguments by calling ``Model.params_to_args``

        By default, ``Model`` instances will be executed in the
        exp_path/model_name directory if no `path` argument is supplied.
        If a ``Model`` instance is passed to ``Experiment.generate``,
        a directory within the ``Experiment`` directory will be created
        to house the input and output files from the ``Model``.

        Example initialization of a ``Model`` instance

        .. highlight:: python
        .. code-block:: python

            from smartsim import Experiment
            run_settings = exp.create_run_settings("python", "run_pytorch_model.py")
            model = exp.create_model("pytorch_model", run_settings)

            # adding parameters to a model
            run_settings = exp.create_run_settings("python", "run_pytorch_model.py")
            train_params = {
                "batch": 32,
                "epoch": 10,
                "lr": 0.001
            }
            model = exp.create_model("pytorch_model", run_settings, params=train_params)
            model.attach_generator_files(to_configure="./train.cfg")
            exp.generate(model)

        New in 0.4.0, ``Model`` instances can be colocated with an
        Orchestrator database shard through ``Model.colocate_db``. This
        will launch a single ``Orchestrator`` instance on each compute
        host used by the (possibly distributed) application. This is
        useful for performant online inference or processing
        at runtime.

        New in 0.4.2, ``Model`` instances can now be colocated with
        an Orchestrator database over either TCP or UDS using the
        ``Model.colocate_db_tcp`` or ``Model.colocate_db_uds`` method
        respectively. The original ``Model.colocate_db`` method is now
        deprecated, but remains as an alias for ``Model.colocate_db_tcp``
        for backward compatibility.

        :param name: name of the ``Model``
        :param run_settings: defines how ``Model`` should be run
        :param params: ``Model`` parameters for writing into configuration files
        :param path: path to where the ``Model`` should be executed at runtime
        :param enable_key_prefixing: If True, data sent to the ``Orchestrator``
                                     using SmartRedis from this ``Model`` will
                                     be prefixed with the ``Model`` name.
        :param batch_settings: Settings to run ``Model`` individually as a batch job.
        :raises AttributeError: if ``name`` is None
        :raises SmartSimError: if initialization fails
        :return: the created ``Model``
        """
        if name is None:
            raise AttributeError("Entity has no name. Please set name attribute.")

        entity_path: str = osp.abspath(path or osp.join(self.exp_path, name))

        if params is None:
            params = {}

        try:
            new_model = Model(
                name=name,
                params=params,
                path=entity_path,
                run_settings=run_settings,
                batch_settings=batch_settings,
            )
            if enable_key_prefixing:
                new_model.enable_key_prefixing()
            return new_model
        except SmartSimError as e:
            logger.error(e)
            raise

    @_contextualize
    def create_run_settings(
        self,
        exe: str,
        exe_args: list[str] | None = None,
        run_command: str = "auto",
        run_args: dict[str, int | str | float | None] | None = None,
        env_vars: dict[str, str | None] | None = None,
        container: Container | None = None,
        **kwargs: t.Any,
    ) -> settings.RunSettings:
        """Create a ``RunSettings`` instance.

        run_command="auto" will attempt to automatically
        match a run command on the system with a ``RunSettings``
        class in SmartSim. If found, the class corresponding
        to that run_command will be created and returned.

        If the local launcher is being used, auto detection will
        be turned off.

        If a recognized run command is passed, the ``RunSettings``
        instance will be a child class such as ``SrunSettings``

        If not supported by smartsim, the base ``RunSettings`` class
        will be created and returned with the specified run_command and run_args
        will be evaluated literally.

        Run Commands with implemented helper classes:
         - aprun (ALPS)
         - srun (SLURM)
         - mpirun (OpenMPI)

        :param run_command: command to run the executable
        :param exe: executable to run
        :param exe_args: arguments to pass to the executable
        :param run_args: arguments to pass to the ``run_command``
        :param env_vars: environment variables to pass to the executable
        :param container: if execution environment is containerized
        :param kwargs: extra keyword arguments forwarded to
                       ``settings.create_run_settings``
        :raises SmartSimError: if creation fails (the error is logged
                               and re-raised)
        :return: the created ``RunSettings``
        """
        try:
            return settings.create_run_settings(
                self._launcher,
                exe,
                exe_args=exe_args,
                run_command=run_command,
                run_args=run_args,
                env_vars=env_vars,
                container=container,
                **kwargs,
            )
        except SmartSimError as e:
            logger.error(e)
            raise

    @_contextualize
    def create_batch_settings(
        self,
        nodes: int = 1,
        time: str = "",
        queue: str = "",
        account: str = "",
        batch_args: dict[str, str] | None = None,
        **kwargs: t.Any,
    ) -> base.BatchSettings:
        """Create a ``BatchSettings`` instance

        Batch settings parameterize batch workloads. The result of this
        function can be passed to the ``Ensemble`` initialization.

        The `batch_args` parameter can be used to pass in a dictionary
        of additional batch command arguments that aren't supported through
        the smartsim interface

        .. highlight:: python
        .. code-block:: python

            # i.e. for Slurm
            batch_args = {
                "distribution": "block",
                "exclusive": None
            }
            bs = exp.create_batch_settings(nodes=3,
                                           time="10:00:00",
                                           batch_args=batch_args)
            bs.set_account("default")

        :param nodes: number of nodes for batch job
        :param time: length of batch job
        :param queue: queue or partition (if slurm)
        :param account: user account name for batch system
        :param batch_args: additional batch arguments
        :param kwargs: extra keyword arguments forwarded to
                       ``settings.create_batch_settings``
        :return: a newly created BatchSettings instance
        :raises SmartSimError: if batch creation fails
        """
        try:
            return settings.create_batch_settings(
                self._launcher,
                nodes=nodes,
                time=time,
                queue=queue,
                account=account,
                batch_args=batch_args,
                **kwargs,
            )
        except SmartSimError as e:
            logger.error(e)
            raise

    @_contextualize
    def create_database(
        self,
        port: int = 6379,
        path: str | None = None,
        db_nodes: int = 1,
        batch: bool = False,
        hosts: list[str] | str | None = None,
        run_command: str = "auto",
        interface: str | list[str] = "ipogif0",
        account: str | None = None,
        time: str | None = None,
        queue: str | None = None,
        single_cmd: bool = True,
        db_identifier: str = "orchestrator",
        **kwargs: t.Any,
    ) -> Orchestrator:
        """Initialize an ``Orchestrator`` database

        The ``Orchestrator`` database is a key-value store based
        on Redis that can be launched together with other ``Experiment``
        created instances for online data storage.

        When launched, ``Orchestrator`` can be used to communicate
        data between Fortran, Python, C, and C++ applications.

        Machine Learning models in Pytorch, Tensorflow, and ONNX (i.e. scikit-learn)
        can also be stored within the ``Orchestrator`` database where they
        can be called remotely and executed on CPU or GPU where
        the database is hosted.

        To enable a SmartSim ``Model`` to communicate with the database
        the workload must utilize the SmartRedis clients. For more
        information on the database, and SmartRedis clients see the
        documentation at https://www.craylabs.org/docs/smartredis.html

        :param port: TCP/IP port
        :param path: path to the database directory; defaults to a directory
                     named ``db_identifier`` inside the Experiment path
        :param db_nodes: number of database shards
        :param batch: run as a batch workload
        :param hosts: specify hosts to launch on
        :param run_command: specify launch binary or detect automatically
        :param interface: Network interface
        :param account: account to run batch on
        :param time: walltime for batch 'HH:MM:SS' format
        :param queue: queue to run the batch on
        :param single_cmd: run all shards with one (MPMD) command
        :param db_identifier: an identifier to distinguish this orchestrator in
            multiple-database experiments
        :param kwargs: extra keyword arguments forwarded to ``Orchestrator``
        :raises SmartSimError: if detection of launcher or of run command fails
        :raises SmartSimError: if user indicated an incompatible run command
            for the launcher
        :return: Orchestrator or derived class
        """
        # Record the identifier (warning on duplicates) before building the db
        self._append_to_db_identifier_list(db_identifier)

        entity_path: str = osp.abspath(path or osp.join(self.exp_path, db_identifier))

        return Orchestrator(
            port=port,
            path=entity_path,
            db_nodes=db_nodes,
            batch=batch,
            hosts=hosts,
            run_command=run_command,
            interface=interface,
            account=account,
            time=time,
            queue=queue,
            single_cmd=single_cmd,
            launcher=self._launcher,
            db_identifier=db_identifier,
            **kwargs,
        )

    @_contextualize
    def reconnect_orchestrator(self, checkpoint: str) -> Orchestrator:
        """Reconnect to a running ``Orchestrator``

        This method can be used to connect to a ``Orchestrator`` deployment
        that was launched by a previous ``Experiment``. This can be
        helpful in the case where separate runs of an ``Experiment``
        wish to use the same ``Orchestrator`` instance currently
        running on a system.

        :param checkpoint: the `smartsim_db.dat` file created
                           when an ``Orchestrator`` is launched
        :raises SmartSimError: if the reload fails (the error is logged
                               and re-raised)
        :return: the reloaded ``Orchestrator``
        """
        try:
            return self._control.reload_saved_db(checkpoint)
        except SmartSimError as e:
            logger.error(e)
            raise

    def preview(
        self,
        *args: t.Any,
        verbosity_level: previewrenderer.Verbosity = previewrenderer.Verbosity.INFO,
        output_format: previewrenderer.Format = previewrenderer.Format.PLAINTEXT,
        output_filename: str | None = None,
    ) -> None:
        """Preview entity information prior to launch. This method
        aggregates multiple pieces of information to give users insight
        into what and how entities will be launched.  Any instance of
        ``Model``, ``Ensemble``, or ``Orchestrator`` created by the
        Experiment can be passed as an argument to the preview method.

        Verbosity levels:
         - info: Display user-defined fields and entities.
         - debug: Display user-defined field and entities and auto-generated
           fields.
         - developer: Display user-defined field and entities, auto-generated
           fields, and run commands.

        :param args: the entities to preview
        :param verbosity_level: verbosity level specified by user, defaults to info.
        :param output_format: Set output format. The possible accepted
            output formats are ``plain_text``.
            Defaults to ``plain_text``.
        :param output_filename: Specify name of file and extension to write
            preview data to. If no output filename is set, the preview will be
            output to stdout. Defaults to None.
        """
        # Retrieve any active orchestrator jobs
        active_dbjobs = self._control.active_orchestrator_jobs

        preview_manifest = Manifest(*args)

        previewrenderer.render(
            self,
            preview_manifest,
            verbosity_level,
            output_format,
            output_filename,
            active_dbjobs,
        )

    @property
    def launcher(self) -> str:
        """The launcher used by this ``Experiment``

        This is the lower-cased launcher given at initialization, or the
        detected launcher when "auto" was requested.
        """
        return self._launcher

    @_contextualize
    def summary(self, style: str = "github") -> str:
        """Return a summary of the ``Experiment``

        The summary will show each instance that has been
        launched and completed in this ``Experiment``

        :param style: the style in which the summary table is formatted,
                      for a full list of styles see the table-format section of:
                      https://github.com/astanin/python-tabulate
        :return: tabulate string of ``Experiment`` history
        """
        headers = [
            "Name",
            "Entity-Type",
            "JobID",
            "RunID",
            "Time",
            "Status",
            "Returncode",
        ]

        # One row per recorded run of every job known to the controller
        values = []
        for job in self._control.get_jobs().values():
            history = job.history
            for run in range(history.runs + 1):
                values.append(
                    [
                        job.entity.name,
                        job.entity.type,
                        history.jids[run],
                        run,
                        f"{history.job_times[run]:.4f}",
                        history.statuses[run],
                        history.returns[run],
                    ]
                )

        return tabulate(
            values,
            headers,
            showindex=True,
            tablefmt=style,
            missingval="None",
            disable_numparse=True,
        )

    def _launch_summary(self, manifest: Manifest) -> None:
        """Experiment pre-launch summary of entities that will be launched

        The summary is emitted through the module logger at INFO level.

        :param manifest: Manifest of deployables.
        """
        parts = [
            "\n\n=== Launch Summary ===\n",
            f"Experiment: {self.name}\n",
            f"Experiment Path: {self.exp_path}\n",
            f"Launcher: {self._launcher}\n",
        ]
        if manifest.models:
            parts.append(f"Models: {len(manifest.models)}\n")

        # An already-running database takes precedence over one about to launch
        if self._control.orchestrator_active:
            parts.append("Database Status: active\n")
        elif manifest.dbs:
            parts.append("Database Status: launching\n")
        else:
            parts.append("Database Status: inactive\n")

        parts.append(f"\n{str(manifest)}")

        logger.info("".join(parts))

    def _create_entity_dir(self, start_manifest: Manifest) -> None:
        """Create the directory of every entity in the manifest if missing

        Directories are created for each model, each database, and each
        ensemble together with all of the ensemble's member models.

        :param start_manifest: Manifest of the entities about to be started
        """

        def create_entity_dir(entity: Orchestrator | Model | Ensemble) -> None:
            # ``exist_ok`` avoids a check-then-create race with other processes
            os.makedirs(entity.path, exist_ok=True)

        for model in start_manifest.models:
            create_entity_dir(model)

        for orch in start_manifest.dbs:
            create_entity_dir(orch)

        for ensemble in start_manifest.ensembles:
            create_entity_dir(ensemble)

            for member in ensemble.models:
                create_entity_dir(member)

    def __str__(self) -> str:
        """Return the name of the ``Experiment``"""
        return self.name

    def _append_to_db_identifier_list(self, db_identifier: str) -> None:
        """Record ``db_identifier``, warning if it was already registered

        Called by ``create_database``. A duplicate identifier only produces
        a warning here; the identifier is added to the set in either case.

        :param db_identifier: identifier of the database being created
        """
        if db_identifier in self.db_identifiers:
            logger.warning(
                f"A database with the identifier {db_identifier} has already been made. "
                "An error will be raised if multiple databases are started "
                "with the same identifier"
            )
        # Always record the identifier (a no-op for the set on duplicates)
        self.db_identifiers.add(db_identifier)