Skip to content

Commit 80c8dd7

Browse files
authored
Support cloud fleet in-place update (#3766)
* Support `nodes` in-place update for cloud fleets * Render fleet spec diff in CLI * Support `tags` and `reservation` in-place update * Drop services tests
1 parent da026f9 commit 80c8dd7

6 files changed

Lines changed: 454 additions & 39 deletions

File tree

src/dstack/_internal/cli/services/configurators/fleet.py

Lines changed: 62 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,10 @@
3434
InstanceGroupPlacement,
3535
)
3636
from dstack._internal.core.models.instances import InstanceStatus, SSHKey
37-
from dstack._internal.core.services.diff import diff_models
37+
from dstack._internal.core.services.diff import copy_model, diff_models
3838
from dstack._internal.utils.common import local_time
3939
from dstack._internal.utils.logging import get_logger
40+
from dstack._internal.utils.nested_list import NestedList, NestedListItem
4041
from dstack._internal.utils.ssh import convert_ssh_key_to_pem, generate_public_key, pkey_from_str
4142
from dstack.api.utils import load_profile
4243

@@ -85,14 +86,10 @@ def _apply_plan(self, plan: FleetPlan, command_args: argparse.Namespace):
8586
)
8687
confirm_message += "Create the fleet?"
8788
else:
89+
effective_spec = plan.get_effective_spec()
90+
diff = _render_fleet_spec_diff(plan.current_resource.spec, effective_spec)
8891
action_message += f"Found fleet [code]{plan.spec.configuration.name}[/]."
89-
if plan.action == ApplyAction.CREATE:
90-
delete_fleet_name = plan.current_resource.name
91-
action_message += (
92-
" Configuration changes detected. Cannot update the fleet in-place"
93-
)
94-
confirm_message += "Re-create the fleet?"
95-
elif plan.current_resource.spec == plan.effective_spec:
92+
if plan.current_resource.spec == effective_spec:
9693
if command_args.yes and not command_args.force:
9794
# --force is required only with --yes,
9895
# otherwise we may ask for force apply interactively.
@@ -103,8 +100,26 @@ def _apply_plan(self, plan: FleetPlan, command_args: argparse.Namespace):
103100
delete_fleet_name = plan.current_resource.name
104101
action_message += " No configuration changes detected."
105102
confirm_message += "Re-create the fleet?"
103+
elif plan.action == ApplyAction.CREATE:
104+
delete_fleet_name = plan.current_resource.name
105+
if diff is not None:
106+
# TODO: Highlight only the fields that block in-place update instead of
107+
# showing the full detected diff here.
108+
action_message += (
109+
f" Detected changes that [error]cannot[/] be updated in-place:\n{diff}"
110+
)
111+
else:
112+
action_message += (
113+
" Configuration changes detected. Cannot update the fleet in-place."
114+
)
115+
confirm_message += "Re-create the fleet?"
106116
else:
107-
action_message += " Configuration changes detected."
117+
if diff is not None:
118+
action_message += (
119+
f" Detected changes that [code]can[/] be updated in-place:\n{diff}"
120+
)
121+
else:
122+
action_message += " Configuration changes detected."
108123
confirm_message += "Update the fleet in-place?"
109124

110125
console.print(action_message)
@@ -357,6 +372,44 @@ def _resolve_ssh_key(ssh_key_path: Optional[str]) -> Optional[SSHKey]:
357372
exit()
358373

359374

375+
def _render_fleet_spec_diff(old_spec: FleetSpec, new_spec: FleetSpec) -> Optional[str]:
376+
old_spec = copy_model(old_spec)
377+
new_spec = copy_model(new_spec)
378+
changed_spec_fields = list(diff_models(old_spec, new_spec))
379+
if not changed_spec_fields:
380+
return None
381+
382+
nested_list = NestedList()
383+
for spec_field in changed_spec_fields:
384+
if spec_field == "merged_profile":
385+
continue
386+
if spec_field == "configuration":
387+
item = NestedListItem(
388+
"Configuration properties:",
389+
children=[
390+
NestedListItem(field)
391+
for field in diff_models(old_spec.configuration, new_spec.configuration)
392+
],
393+
)
394+
elif spec_field == "profile":
395+
item = NestedListItem(
396+
"Profile properties:",
397+
children=[
398+
NestedListItem(field)
399+
for field in diff_models(old_spec.profile, new_spec.profile)
400+
],
401+
)
402+
elif spec_field == "configuration_path":
403+
item = NestedListItem("Configuration path")
404+
else:
405+
item = NestedListItem(spec_field.replace("_", " ").capitalize())
406+
nested_list.children.append(item)
407+
408+
if not nested_list.children:
409+
return None
410+
return nested_list.render()
411+
412+
360413
def _print_plan_header(plan: FleetPlan):
361414
def th(s: str) -> str:
362415
return f"[bold]{s}[/bold]"

src/dstack/_internal/cli/services/configurators/run.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,8 @@ def apply_configuration(
152152
confirm_message = "Stop and override the run?"
153153
elif not run_plan.current_resource.status.is_finished():
154154
stop_run_name = run_plan.current_resource.run_spec.run_name
155+
# TODO: Highlight only the fields that block in-place update instead of
156+
# showing the full detected diff here.
155157
console.print(
156158
f"Active run [code]{conf.name}[/] already exists."
157159
f" Detected changes that [error]cannot[/] be updated in-place:\n{diff}"

src/dstack/_internal/server/background/pipeline_tasks/fleets.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def __init__(
6363
workers_num: int = 10,
6464
queue_lower_limit_factor: float = 0.5,
6565
queue_upper_limit_factor: float = 2.0,
66-
min_processing_interval: timedelta = timedelta(seconds=30),
66+
min_processing_interval: timedelta = timedelta(seconds=15),
6767
lock_timeout: timedelta = timedelta(seconds=20),
6868
heartbeat_trigger: timedelta = timedelta(seconds=10),
6969
*,

src/dstack/_internal/server/services/fleets.py

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1219,26 +1219,36 @@ def _check_can_update_inner(current: M, new: M, updatable_fields: tuple[str, ...
12191219
return diff
12201220

12211221

1222-
@_check_can_update("configuration", "configuration_path")
1222+
@_check_can_update("configuration", "configuration_path", "merged_profile")
12231223
def _check_can_update_fleet_spec(current: FleetSpec, new: FleetSpec, diff: ModelDiff):
1224+
# Allow `merged_profile` only to absorb derived changes from supported configuration updates
1225+
# such as `configuration.reservation` and `configuration.tags`.
1226+
# Direct `profile` updates are still not in-place updatable.
12241227
if "configuration" in diff:
12251228
_check_can_update_fleet_configuration(current.configuration, new.configuration)
12261229

12271230

1228-
@_check_can_update("ssh_config")
1229-
def _check_can_update_fleet_configuration(
1230-
current: FleetConfiguration, new: FleetConfiguration, diff: ModelDiff
1231-
):
1231+
def _check_can_update_fleet_configuration(current: FleetConfiguration, new: FleetConfiguration):
1232+
diff = diff_models(current, new)
1233+
current_ssh_config = current.ssh_config
1234+
new_ssh_config = new.ssh_config
1235+
if current_ssh_config is None:
1236+
if new_ssh_config is not None:
1237+
raise ServerClientError("Fleet type changed from Cloud to SSH, cannot update")
1238+
# TODO: Support best-effort `nodes.target` apply semantics:
1239+
# create missing instances and terminate extra idle instances.
1240+
# Current in-place update only persists `target`; FleetPipeline reconciles `min`/`max`.
1241+
#
1242+
# For `reservation` and `tags`, update affects only future provisioning.
1243+
_check_can_update_inner(current, new, ("nodes", "reservation", "tags"))
1244+
return
1245+
1246+
if new_ssh_config is None:
1247+
raise ServerClientError("Fleet type changed from SSH to Cloud, cannot update")
1248+
1249+
_check_can_update_inner(current, new, ("ssh_config",))
12321250
if "ssh_config" in diff:
1233-
current_ssh_config = current.ssh_config
1234-
new_ssh_config = new.ssh_config
1235-
if current_ssh_config is None:
1236-
if new_ssh_config is not None:
1237-
raise ServerClientError("Fleet type changed from Cloud to SSH, cannot update")
1238-
elif new_ssh_config is None:
1239-
raise ServerClientError("Fleet type changed from SSH to Cloud, cannot update")
1240-
else:
1241-
_check_can_update_ssh_config(current_ssh_config, new_ssh_config)
1251+
_check_can_update_ssh_config(current_ssh_config, new_ssh_config)
12421252

12431253

12441254
@_check_can_update("hosts")

0 commit comments

Comments
 (0)