Skip to content

Commit cbb46d2

Browse files
committed
feat: introduce abstract RuntimeBackend base class
This commit introduces the _public interface_ of the new `RuntimeBackend` abstract base class, and generally introduces the notion of "Runtime Backends" as a whole. Runtime backends are intended to be replacements for the existing concepts of Launchers, where runtime backends expose async interfaces for asynchronously submitting and killing jobs, and being notified of completion. The new name is chosen because, whilst the Launchers do indeed "Launch" jobs, they also maintain, poll and kill them, which is not really represented in the old name. Submitting jobs to backends returns a handle to the job, which the async scheduler can then use to interact with the different runtime backends. Importantly, methods are provided for batch submitting and batch killing many jobs at once - this is because many launcher backends could offer optimized interfaces for batch actions by amortizing overheads. A single protected method `_emit_completion` is also introduced in this commit to show how the completion callback is intended to be used. No other launcher logic related to the output directories, status checks, log files, job environment, and job callbacks is included at this stage. Signed-off-by: Alex Jones <alex.jones@lowrisc.org>
1 parent c874331 commit cbb46d2

3 files changed

Lines changed: 178 additions & 0 deletions

File tree

src/dvsim/runtime/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Copyright lowRISC contributors (OpenTitan project).
2+
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
"""Job runtime backends."""

src/dvsim/runtime/backend.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
# Copyright lowRISC contributors (OpenTitan project).
2+
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
"""Runtime backend abstract base class."""
6+
7+
from abc import ABC, abstractmethod
8+
from collections.abc import Hashable, Iterable
9+
10+
from dvsim.job.data import JobSpec
11+
from dvsim.job.status import JobStatus
12+
from dvsim.logging import log
13+
from dvsim.runtime.data import (
14+
CompletionCallback,
15+
JobCompletionEvent,
16+
JobHandle,
17+
)
18+
19+
20+
class RuntimeBackend(ABC):
21+
"""Abstraction for a backend that launches, maintains, polls and kills a job.
22+
23+
Provides methods to prepare an environment for running a job, launching the job,
24+
polling for its completion, killing it, and doing some cleanup activities.
25+
"""
26+
27+
name: str
28+
"""The name of the backend."""
29+
30+
max_parallelism: int = 0
31+
"""The maximum number of jobs that can be run at any time via this backend. The scheduler
32+
should respect the parallelism limit defined here.
33+
"""
34+
35+
max_output_dirs: int = 5
36+
"""If a history of previous invocations is to be maintained, keep at most this many dirs."""
37+
38+
supports_interactive: bool = False
39+
"""Whether this backend supports jobs in interactive mode (transparent stdin/stdout)."""
40+
41+
def __init__(self, *, max_parallelism: int | None = None) -> None:
42+
"""Construct a runtime backend.
43+
44+
Args:
45+
max_parallelism: The maximum number of jobs that can be dispatched to this backend
46+
at once. `0` means no limit, `None` means no override is applied to the default.
47+
48+
"""
49+
if max_parallelism is not None:
50+
self.max_parallelism = max_parallelism
51+
52+
self._completion_callback: CompletionCallback | None = None
53+
54+
def attach_completion_callback(self, callback: CompletionCallback) -> None:
55+
"""Attach a callback for completed events, to notify the scheduler.
56+
57+
Args:
58+
callback: the callback to use for job completion events.
59+
60+
"""
61+
self._completion_callback = callback
62+
63+
async def _emit_completion(self, events: Iterable[JobCompletionEvent]) -> None:
64+
"""Mark a job as now being in some completed/terminal state by notifying the scheduler."""
65+
if self._completion_callback is None:
66+
raise RuntimeError("Backend not attached to the scheduler")
67+
68+
for event in events:
69+
log.debug(
70+
"Job %s completed execution: %s", event.spec.qual_name, event.status.shorthand
71+
)
72+
if event.status != JobStatus.PASSED and event.reason is not None:
73+
log.verbose(
74+
"Job %s has status '%s' instead of 'Passed'. Reason: %s",
75+
event.spec.qual_name,
76+
event.status.name.capitalize(),
77+
event.reason.message,
78+
)
79+
80+
await self._completion_callback(events)
81+
82+
@abstractmethod
83+
async def submit_many(self, jobs: Iterable[JobSpec]) -> dict[Hashable, JobHandle]:
84+
"""Submit & launch multiple jobs.
85+
86+
Returns:
87+
mapping from job.id -> JobHandle. Entries are only present for jobs that successfully
88+
launched; jobs that failed in a non-fatal way are missing, and should be retried.
89+
90+
"""
91+
92+
async def submit(self, job: JobSpec) -> JobHandle | None:
93+
"""Submit & launch a job, returning a handle for that job.
94+
95+
If the job failed to launch in a non-fatal way (e.g. backend is busy), None is returned
96+
instead, and the job should be re-submitted at some later time.
97+
"""
98+
result = await self.submit_many([job])
99+
return result.get(job.id)
100+
101+
@abstractmethod
102+
async def kill_many(self, handles: Iterable[JobHandle]) -> None:
103+
"""Cancel ongoing jobs via their handle. Killed jobs should still "complete"."""
104+
105+
async def kill(self, handle: JobHandle) -> None:
106+
"""Cancel an ongoing job via its handle. Killed jobs should still "complete"."""
107+
await self.kill_many([handle])
108+
109+
async def close(self) -> None: # noqa: B027
110+
"""Release any resources that the backend holds; called when the scheduler completes.
111+
112+
The default implementation just does nothing.
113+
114+
"""

src/dvsim/runtime/data.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# Copyright lowRISC contributors (OpenTitan project).
2+
# Licensed under the Apache License, Version 2.0, see LICENSE for details.
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
"""Various dataclasses used by the different runtime backends when executing jobs."""
6+
7+
from collections.abc import Awaitable, Callable, Hashable, Iterable
8+
from dataclasses import dataclass
9+
from typing import TypeAlias
10+
11+
from dvsim.job.data import JobSpec, JobStatusInfo
12+
from dvsim.job.status import JobStatus
13+
from dvsim.job.time import JobTime
14+
15+
__all__ = (
16+
"CompletionCallback",
17+
"JobCompletionEvent",
18+
"JobHandle",
19+
)
20+
21+
22+
@dataclass(kw_only=True)
23+
class JobHandle:
24+
"""A handle for a job that is actively executing on some backend."""
25+
26+
spec: JobSpec
27+
backend: str
28+
29+
# TODO: these are necessary for now because they are exposed in the CompletedJobStatus.
30+
# It would be nice to figure out a better mechanism for these.
31+
job_runtime: JobTime
32+
simulated_time: JobTime
33+
34+
@property
35+
def job_id(self) -> Hashable:
36+
"""Returns an object that uniquely identifies the job. Alias of self.spec.id."""
37+
return self.spec.id
38+
39+
40+
@dataclass(frozen=True)
41+
class JobCompletionEvent:
42+
"""Event emitted when a job finishes."""
43+
44+
# TODO: ideally we would rather store a `job_id: Hashable` here instead of the full spec, but
45+
# this is needed to access the `post_finish` callback in `RuntimeBackend._emit_completion`.
46+
# When these `pre_launch`/`post_finish` callbacks are refactored/removed, this can be changed.
47+
spec: JobSpec
48+
"""The specification for the job that has been completed."""
49+
status: JobStatus
50+
"""The terminal/final status of the completed job."""
51+
reason: JobStatusInfo | None
52+
"""The reason to report as to why the job is in the specified terminal state.
53+
Typically only reported for non-successful executions, as it is used for e.g. failure message
54+
buckets - success is implicit.
55+
"""
56+
57+
58+
# Callback (async) for (batches of) job completion events
59+
CompletionCallback: TypeAlias = Callable[[Iterable[JobCompletionEvent]], Awaitable[None]]

0 commit comments

Comments
 (0)