From 7a9487ba0278f14337504e97971ec4b866ef931b Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Tue, 14 Apr 2026 20:01:13 +0800 Subject: [PATCH 01/17] docs: add agent self-upgrade closure design spec Closes the three gaps in the existing agent self-upgrade flow: - Result/progress reporting from agent to server (new AgentMessage variants) - Frontend UI in CapabilitiesDialog + server list badge - Pre-flight validation + .bak. retention for rollback safety --- ...04-14-agent-self-upgrade-closure-design.md | 448 ++++++++++++++++++ 1 file changed, 448 insertions(+) create mode 100644 docs/superpowers/specs/2026-04-14-agent-self-upgrade-closure-design.md diff --git a/docs/superpowers/specs/2026-04-14-agent-self-upgrade-closure-design.md b/docs/superpowers/specs/2026-04-14-agent-self-upgrade-closure-design.md new file mode 100644 index 00000000..0c0337cb --- /dev/null +++ b/docs/superpowers/specs/2026-04-14-agent-self-upgrade-closure-design.md @@ -0,0 +1,448 @@ +# Agent Self-Upgrade Closure — 设计文档 + +**日期**: 2026-04-14 +**状态**: Draft +**相关实现**: `crates/common/src/protocol.rs`, `crates/server/src/router/api/server.rs`, `crates/agent/src/reporter.rs`, `apps/web/src/components/server/` + +## 概述 + +ServerBee 已经有一个"单向"的 agent 自升级通道:管理员通过 `POST /api/servers/{id}/upgrade` 让服务端发 `ServerMessage::Upgrade { version, download_url, sha256 }`,agent 收到后下载、校验、替换、重启。但该流程存在三个使它无法作为交互式运维功能暴露给管理员的缺口: + +1. **无结果/进度回报** — agent 不向服务端发送升级中间状态或失败原因;管理员调用 API 后只能"盲等"。 +2. **无前端 UI** — 只能 `curl` 调用,无法在 Web 控制台触发或观察。 +3. **无失败回滚底线** — 老二进制被 rename 成 `.bak` 后,如果新进程 spawn 失败,agent 会彻底下线,必须人工 SSH 恢复。 + +本设计在现有协议基础上补齐以上三块,使自升级成为一个全链路可观测、有安全底线的运维能力。 + +**范围外**(显式声明): +- 批量升级(可由前端循环单机 API 实现) +- 服务端主动确认之外的机制(现有"重连 + SystemInfo 版本匹配"已足够) +- Windows 自升级的运行中二进制覆盖问题(遗留风险,不在本次 scope) +- 升级事件的持久化审计日志(后续如有需求再加 `event` 表) + +## 需求 + +1. Agent 在升级过程每进入一个新阶段时向服务端回报进度。 +2. Agent 在失败时向服务端回报失败阶段和错误信息。 +3. 服务端追踪每台 agent 当前的升级 job 状态(内存态)并在状态变化时广播给前端。 +4. 
服务端实现升级超时判定(agent 未能在窗口内重连并上报新版本即视为失败)。 +5. Agent 在写入新二进制之前执行 Pre-flight 预检(`--version` 探活),失败则中止升级。 +6. Agent 备份文件采用带时间戳命名,保留 24h。 +7. 前端在 `CapabilitiesDialog` 里新增 Agent Version 分组,显示当前版本、最新版本、触发按钮,以及升级进行中/失败/成功/超时的实时状态。 +8. 服务端新增 `GET /api/agent/latest-version` 端点,代理查询 `release_base_url` 的最新版本信息(10 分钟缓存)。 +9. 前端在 server 列表行上展示升级进行中/失败的 badge。 +10. 失败态提供 Retry 按钮,重新触发同一版本升级。 + +## 非需求 + +- 不做 Spawn-then-verify 握手(L2 回滚)— 协议需要新老版本双向支持,本次升级无法受益,ROI 不足。 +- 不加持久化 job 表 — 升级是低频运维动作,内存态 + 超时清理足够。 +- 不做升级进度百分比 — 二进制体积小,秒级完成,细粒度上报开销占比过高。 +- Agent 升级成功后不主动回报 `UpgradeResult { ok: true }` — 老进程已 `exit(0)`,新进程无法关联 job;成功态以"重连 + SystemInfo 版本匹配" 推断。 + +## 协议变更(`crates/common/src/protocol.rs`) + +### 新增 `UpgradeStage` 枚举 + +```rust +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum UpgradeStage { + Downloading, // 下载中 + Verifying, // 校验 SHA-256 + PreFlight, // 执行 --version 预检 + Installing, // 写临时文件 + chmod + rename + Restarting, // 老进程即将 exit(0) +} +``` + +### `AgentMessage` 新增两个变体 + +```rust +// 进度里程碑:进入每个阶段之前发一条 +UpgradeProgress { + msg_id: String, + target_version: String, + stage: UpgradeStage, +}, + +// 终态(仅失败时发送) +UpgradeResult { + msg_id: String, + target_version: String, + stage: UpgradeStage, // 失败发生在哪一阶段 + error: String, // 人读错误信息,UI 直接展示 +}, +``` + +**设计要点**: +- `target_version` 在每条消息中冗余携带,服务端用来防止"上一轮 Timeout 后开启的新 job" 被旧消息污染。 +- `UpgradeResult` 不带 `ok` 字段 — agent 只在失败时发送,"存在 = 失败"。 +- 成功态不由 agent 消息表达,仅靠"重连 + `SystemInfo.agent_version == target_version`" 推断。 +- `msg_id` 沿用现有 `SystemInfo` 等消息的风格(UUID v4)。 + +### 新增 `UpgradeStatus` 枚举 + +```rust +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum UpgradeStatus { + Running, + Succeeded, + Failed, + Timeout, +} +``` + +### `BrowserMessage` 新增两个变体 + +服务端 → 浏览器的广播通道: + +```rust +UpgradeProgress { + server_id: String, + target_version: String, + stage: UpgradeStage, +}, +UpgradeResult { + server_id: String, + 
target_version: String, + status: UpgradeStatus, // Succeeded / Failed / Timeout + stage: Option, // 仅 Failed 时 + error: Option, // 仅 Failed 时 +}, +``` + +## 服务端改动(`crates/server`) + +### 内存 Job 追踪器 + +**新文件**: `crates/server/src/service/upgrade_tracker.rs` + +```rust +pub struct UpgradeJob { + pub server_id: String, + pub target_version: String, + pub started_at: DateTime, + pub stage: UpgradeStage, + pub status: UpgradeStatus, + pub error: Option, + pub finished_at: Option>, +} + +pub struct UpgradeJobTracker { + jobs: DashMap, // key: server_id(同 server 只允许一个活跃 job) + browser_tx: broadcast::Sender, +} +``` + +**核心方法**(所有状态变化方法内部广播 `BrowserMessage`): + +| 方法 | 行为 | +|------|------| +| `start_job(server_id, target_version)` | DashMap `entry` 原子插入。若已存在 `Running` 状态的 job,返回 `AppError::Conflict`。若存在但为终态(Succeeded/Failed/Timeout),覆盖。 | +| `update_stage(server_id, target_version, stage)` | 只在 `target_version` 匹配且状态 `Running` 时更新。不匹配时 warn 日志并忽略。 | +| `mark_failed(server_id, target_version, stage, error)` | 翻转为 `Failed`,设置 `finished_at`,广播 `UpgradeResult`。 | +| `mark_succeeded(server_id, observed_version)` | 仅当 `observed_version == target_version` 且状态 `Running` 时翻转为 `Succeeded`。若状态已是 `Timeout` 则不覆盖(Timeout 即终态)。 | +| `sweep_timeouts(now)` | 扫描所有 `Running` 且 `started_at + 120s < now` 的 job,翻转为 `Timeout`。 | +| `cleanup_old(now)` | 删除所有 `finished_at + 24h < now` 的终态 job,防止内存无限增长。 | +| `get(server_id) -> Option` | 供 `GET /api/servers/{id}/upgrade` 查询当前/最近 job。 | + +**装配**: 在 `AppState` 新增 `upgrade_tracker: Arc`。 + +### REST 路由改动 + +`crates/server/src/router/api/server.rs` 现有 `trigger_upgrade`(519-626)调整流程: + +1. 校验 `CAP_UPGRADE`(保留)。 +2. **新增**: `tracker.start_job(server_id, target_version)`;若 `Conflict` 返回 409 + 返回现有 job DTO。 +3. 获取 agent 平台、拉 `checksums.txt`、解析 SHA-256(保留)。 +4. 通过 AgentManager 发送 `ServerMessage::Upgrade`(保留)。 +5. **新增**: 若发送失败,`tracker.mark_failed(Downloading, "failed to notify agent: ...")`,返回 500。 +6. 
**新增**: 成功返回 202 Accepted + `{ data: { job: UpgradeJobDto } }`,前端直接拿到初始 `Running` 状态。 + +**新增路由**: `GET /api/servers/{id}/upgrade` → 返回 `Option`(当前或最近 24h 内的终态 job)。挂载在 `read_router()` 下(Admin + Member 都可读)。 + +### Agent WS 消息处理 + +`crates/server/src/router/ws/agent.rs`(具体文件需在实现时确认)现有 match 分支新增: + +- `AgentMessage::UpgradeProgress { target_version, stage, .. }` → `tracker.update_stage(server_id, target_version, stage)` +- `AgentMessage::UpgradeResult { target_version, stage, error, .. }` → `tracker.mark_failed(server_id, target_version, stage, error)` +- `AgentMessage::SystemInfo { info, .. }` 现有处理末尾加钩子:若 `tracker.get(server_id)` 为 `Running` 且 `info.agent_version == target_version` 则 `mark_succeeded`。 + +### 超时 / 清理后台任务 + +**新文件**: `crates/server/src/task/upgrade_timeout.rs` + +每 10s tick 一次,执行 `tracker.sweep_timeouts(now)` 和 `tracker.cleanup_old(now)`。挂到 `main.rs` 的后台任务启动序列(和 `record_writer` / `alert_evaluator` 同级)。 + +### 新增 REST: `GET /api/agent/latest-version` + +**目的**: 前端需要"最新版本号"才能决定是否显示升级按钮。 + +**实现**: +- 新 handler,内存缓存(OnceCell + Mutex 或 `moka`)TTL 10 分钟。 +- 服务端从 `config.release_base_url` 拼 `{base}/latest.txt`(或 `latest.json`,具体格式待实现时决定),拉取并解析版本号。 +- 成功返回 `{ version: "0.9.0", released_at: ISO8601? }`。 +- 失败(网络错误 / 未配置 `release_base_url` / 解析失败)返回 `{ version: null, error: "..." 
}`,**不抛 500**。 +- 失败响应缓存 1 分钟,避免打爆上游或重复失败。 +- 挂载在 `read_router()`。 + +## Agent 改动(`crates/agent`) + +### 进度上报工具 + +`Reporter` 新增两个私有方法: + +```rust +async fn emit_upgrade_progress(&self, target_version: &str, stage: UpgradeStage); +async fn emit_upgrade_failure(&self, target_version: &str, stage: UpgradeStage, error: String); +``` + +`emit_upgrade_progress` 发送失败仅记 `warn` 日志(进度丢失不致命)。`emit_upgrade_failure` 尽最大努力 flush(失败消息尽量送达,但也不阻塞升级本体)。 + +### `perform_upgrade` 重构后的阶段序列 + +`crates/agent/src/reporter.rs:1776-1846` 重构为: + +``` +emit(Downloading) +├─ reqwest 下载到 .new(10 分钟超时) +│ └─ 失败 → emit_failure(Downloading, err) → return + +emit(Verifying) +├─ SHA-256 校验 +│ └─ 不匹配 → emit_failure(Verifying, "sha256 mismatch") → 删 .new → return + +emit(PreFlight) ← 新增 L1 +├─ chmod 0755(Unix) +├─ Command::new(".new").arg("--version").status() with 5s timeout +│ └─ 失败 → emit_failure(PreFlight, "preflight failed: ...") → 删 .new → return + +emit(Installing) +├─ rename .bak. +├─ rename .new → +│ └─ 失败 → emit_failure(Installing, err) → 尝试 rename .bak. 回滚 → return + +emit(Restarting) +├─ Command::new().spawn() +│ └─ 失败 → emit_failure(Restarting, err) → rename .bak. 
恢复 → return +├─ flush WS sender(尽力把 Restarting 消息送出) +├─ exit(0) +``` + +**纯函数抽离**(便于单元测试): +- `fn verify_sha256(bytes: &[u8], expected: &str) -> Result<()>` +- `async fn run_preflight(path: &Path, timeout: Duration) -> Result<()>` + +### `.bak` 保留窗口 + +- **命名**: `.bak.`,避免连续升级覆盖上一个能跑的版本。 +- **清理**: agent 启动时调 `cleanup_old_backups(binary_dir)`,删除同目录下 `.bak.*` 且 `mtime` 早于 24h 的文件。 +- 磁盘占用上限估算:单个二进制 5-15 MB × 24h 内连续升级不超过 10 次 ≈ 150 MB 上限,可接受。 + +### 并发升级保护 + +若 agent 正在升级过程中再次收到 `ServerMessage::Upgrade`,立即 `emit_failure(Downloading, "upgrade already in progress")` 回写失败,让服务端能看到重复请求被拒。 + +### Capability 检查 + +保留现有 `reporter.rs:576-592` 的逻辑 — 消息 dispatch 阶段统一检查,升级流程内部不再查。 + +### Windows 兼容性 + +- chmod 已有 `#[cfg(unix)]` 隔离。 +- `--version` 调用和 rename 语义跨平台一致。 +- **已知限制**: Windows 运行中的 exe 不能被 rename/delete,现状未验证。本设计不引入新的 Windows 问题,但也不解决该遗留风险 — 在文档里标注"Windows 自升级未验证,请谨慎"。 + +## 前端改动(`apps/web`) + +### 新增 hook + +`apps/web/src/hooks/use-upgrade-job.ts` + +```ts +type UpgradeJob = { + serverId: string + targetVersion: string + stage: 'downloading' | 'verifying' | 'pre_flight' | 'installing' | 'restarting' + status: 'running' | 'succeeded' | 'failed' | 'timeout' + error?: string + startedAt: string + finishedAt?: string +} +``` + +两个 hook: +- `useUpgradeJob(serverId)`: + - 首次 mount 调 `GET /api/servers/{id}/upgrade` 同步初始状态。 + - 订阅现有全局 WS(`use-servers-ws.ts`)新增的 `upgrade_progress` / `upgrade_result` 消息,匹配 serverId 后更新本地 state。 +- `useTriggerUpgrade()`: + - mutation,POST `/api/servers/{id}/upgrade`。 + - 成功后把返回的 `job` 写入全局 store(便于列表 badge 立即展示)。 + +### WS 消息路由 + +`apps/web/src/hooks/use-servers-ws.ts` 现有消息 switch 新增两个 case,dispatch 到全局 upgrade job store。具体 store 方式(zustand / tanstack-query / context)按现有代码风格就近匹配,属实现细节。 + +### UI 组件 + +**位置**: `CapabilitiesDialog` 底部独立一个 `AgentVersionSection` 子组件(新建 `apps/web/src/components/server/agent-version-section.tsx`)。 + +**三个显示态**: + +**A. 
Idle**(无活跃 job) +- 展示 `Current: vX.Y.Z` +- 若 `latest-version` API 返回有效版本且 > Current:显示 `[ Upgrade to vLatest ]` 按钮 +- 若 `latest-version` 返回 null 或解析失败:隐藏按钮 + 显示 `"Auto upgrade not configured. See docs."` +- 若 CAP_UPGRADE 未启用:按钮 disabled + tooltip `"Enable Upgrade capability first"` +- 若 `Current == Latest`:显示 `"Up to date"` +- 点击按钮 → 二次确认 dialog: *"Upgrading will disconnect the agent for up to 2 minutes. Proceed?"* + +**B. Running** +- Stepper 显示 5 阶段:`Downloading / Verifying / Pre-flight / Installing / Restarting` +- 当前阶段高亮,已完成阶段 ✓ +- Restarting 阶段无后续消息,UI 保持该步激活直到收到终态或超时 + +**C. 终态** +- **Succeeded**: 绿色 ✓ `"Upgraded to vX.Y.Z — <时间>"`,3 秒后自动回到 Idle +- **Failed**: 红色 ✗ `"Failed at : "` + **Retry** 按钮 + 文案 `"Previous binary kept at .bak. for 24h."` +- **Timeout**: 橙色 `"Agent did not reconnect within 2 minutes. It may still be restarting; check back shortly."` + +### Server 列表行 badge + +`apps/web/src/components/server/server-list` 或对应列表组件订阅全局 upgrade job store: +- Running 态 → 行尾小 badge `"Upgrading..."` +- Failed 态 → 红色 badge `"Upgrade failed"`(可点击跳到 CapabilitiesDialog 查看详情) + +### i18n + +本次按英文单语处理(`apps/web` 暂未接入 i18n 框架)。如后续接入,新增文案 key 统一提取。 + +## 错误处理 & 边界 + +### 服务端边界 + +| 场景 | 行为 | +|------|------| +| POST 时 agent 离线 | 409 Conflict,不创建 job | +| POST 时已有 Running job | 409 Conflict + 返回现有 job | +| POST 时 CAP_UPGRADE 未启用 | 403 Forbidden(现有逻辑保留) | +| WS 发送 Upgrade 消息失败 | `mark_failed(Downloading, "failed to notify agent")` + 500 | +| 收到 UpgradeProgress 但 target_version 不匹配 | 忽略 + warn 日志 | +| 收到 UpgradeProgress 但无活跃 job | 忽略 + warn 日志 | +| 收到 UpgradeResult 但 job 已 Succeeded/Timeout | 忽略 | +| Agent 升级中 WS 断开 | 不立即判失败(Restarting 阶段会断) | +| Agent 重连但版本号仍是旧的 | job 保持 Running,等超时 | +| Agent 重连但版本号非 target | `mark_failed(Restarting, "agent reconnected with unexpected version X, expected Y")` | +| `latest-version` 拉取失败 | API 返回 `{ version: null, error }`,不 500;缓存空结果 1 分钟 | +| 超时后 SystemInfo 匹配 | **不翻转回 Succeeded**;Timeout 即终态(UI 里 Timeout + 新版本号可并存,由用户自行判断) | + +### 
Agent 边界 + +| 场景 | 行为 | +|------|------| +| 下载 HTTP 非 200 | `emit_failure(Downloading, "http {status}")` + 清理 .new | +| 下载中断 | `emit_failure(Downloading, "io: ...")` | +| SHA-256 不匹配 | `emit_failure(Verifying, "sha256 mismatch: got X, want Y")` + 删 .new | +| PreFlight 退出码非 0 / 超时 | `emit_failure(PreFlight, "...")` + 删 .new | +| Installing rename 失败 | `emit_failure(Installing, ...)` + 尝试从 .bak. 回滚 | +| Spawn 新进程失败 | `emit_failure(Restarting, ...)` + 从 .bak. 恢复 + 老进程继续跑 | +| emit_failure WS 发送失败 | 记本地日志;服务端靠超时兜底 | +| 收到 Upgrade 但 CAP_UPGRADE 关闭 | 发 `CapabilityDenied`(现有逻辑保留) | +| 升级中收到第二条 Upgrade | `emit_failure(Downloading, "upgrade already in progress")` | +| 管理员删了 .bak. | 不影响任何流程(备份是 best-effort) | + +### 并发 / 竞态 + +- **多 admin 同时 POST**: DashMap `entry` 原子插入,第二个请求 409。 +- **升级中服务端重启**: 内存 job 全丢。Agent 侧消息发到已重启的 server → `update_stage` 找不到 job 会 warn 并忽略。Agent 最终重连上报 SystemInfo,`server.agent_version` 被正常更新,但 server 不知道这是升级成果。这是可接受的降级(数据最终一致)。 + +## 日志 / 可观测性 + +- 服务端:所有 job 状态变化 `tracing::info!` 含 server_id / target_version / stage。 +- Agent:每个 emit 前 `tracing::info!`;失败用 `error!`。 +- 不加 metrics — 升级是低频事件,现有 tracing 足够。 + +## 测试策略 + +### 单元测试 + +**`crates/common/src/protocol.rs`** (~6 tests) +- `UpgradeStage` / `UpgradeStatus` / `UpgradeProgress` / `UpgradeResult`(AgentMessage + BrowserMessage)serde roundtrip。 + +**`crates/server/src/service/upgrade_tracker.rs`**(新建,~10 tests) +- `start_job` 成功路径 / 并发 Conflict / 旧终态 job 可被覆盖 +- `update_stage` target_version 不匹配时忽略 +- `mark_succeeded` 在 Timeout 态下不覆盖 +- `sweep_timeouts` 仅翻转 Running,严格按 120s 阈值 +- `cleanup_old` 按 24h 阈值删除终态 job +- 广播断言:每次状态变化 subscriber 都收到对应 `BrowserMessage` + +**`crates/agent/src/reporter.rs`** +- 抽离的 `verify_sha256` 和 `run_preflight` 纯函数单测。 +- `perform_upgrade` 本体的 emit 序列验证放到集成测试。 + +### 集成测试 + +**`crates/server/tests/integration/upgrade.rs`**(新建,~7 tests) + +使用现有 `crates/server/tests/integration/` 测试夹具(具体形态在实现时确认,可能需要模拟 agent WS 连接): + +1. 
**成功路径**: 模拟 agent 发 Downloading → Verifying → PreFlight → Installing → Restarting → 断开 → 重连 + SystemInfo(new_version) → 断言 `status=Succeeded` +2. **失败路径**: 模拟 agent 发 Downloading → UpgradeResult(Verifying, "sha256 mismatch") → 断言 `status=Failed, stage=Verifying` +3. **超时路径**: 模拟 agent 发 Downloading → Restarting → 断开不再重连 → 快进 121s → 断言 `status=Timeout` +4. **并发拒绝**: 连续两次 POST /upgrade → 第二次 409 +5. **target_version 错位**: POST 升 v1.0,agent 发 UpgradeProgress(target_version=v0.9) → 忽略,job 状态不变 +6. **Agent 重连错版本**: POST 升 v1.0,agent 重连但 SystemInfo 仍是 v0.8 → Running 保持 → Timeout +7. **CAP_UPGRADE 关闭**: POST → 403,无 job 创建 + +### 前端测试(vitest,~8 tests) + +- `use-upgrade-job.ts`: mock WS 消息,断言 state 转换(idle → running → succeeded / failed / timeout) +- `AgentVersionSection.test.tsx`: + - idle 状态显示 "Upgrade to vX" 按钮 + - CAP_UPGRADE 关闭时按钮 disabled + - Running 态显示 stepper,当前阶段高亮 + - Failed 态显示 Retry 按钮,点击触发 mutation + - latest-version API 返回 null 时隐藏按钮 + 显示引导文案 + +### E2E 手动清单 + +新增 `tests/agent-upgrade.md`: +- 真实升级一台 agent 到新版本 +- 故意提供错 SHA 触发 Verifying 失败 +- 关 agent 网络制造超时 +- 点 Retry 重试 +- 验证 `.bak.` 文件存在、24h 后清理 + +### 显式不测 + +- reqwest / 网络层 +- sha2 crate 自身 +- Windows 升级路径(遗留风险) +- `.bak` 磁盘占用上限(用户级运维) + +## 配置 / 环境变量 + +**无新增环境变量**。现有 `SERVERBEE_RELEASE__BASE_URL`(或等价项,实现时确认)被新的 `GET /api/agent/latest-version` 复用。若该配置未设置,前端显示"Auto upgrade not configured"引导文案。 + +## 迁移 + +**无需数据库 migration**。所有新增状态在内存中追踪。 + +## 风险 & 未来工作 + +### 已知限制 + +1. **Windows 未验证**: 运行中的 exe 无法被 rename/delete;本设计不解决该遗留问题。 +2. **服务端重启丢失活跃 job**: 内存态不持久化。可接受的降级:最终 `server.agent_version` 会被 SystemInfo 正常更新,但 UI 看不到升级确认。 +3. 
**Timeout 后的"迟到成功"不翻转**: 如果 agent 在 120s 后才重连并上报新版本,UI 会同时看到"Timeout 状态"和"新版本号"。这是故意为之,避免引入"late success"概念。 + +### 可能的后续工作 + +- **批量升级**: 前端批量 POST 循环 + 进度聚合视图。 +- **升级事件审计**: 新增 `event` 或 `agent_upgrade_log` 表,记录每次升级的触发者、时间、结果,供事后审计。 +- **L2 Spawn-then-verify 回滚**: 新老二进制双向协议支持,彻底解决"能启动但跑不起来"的 corner case。本次不做因 ROI 不足。 +- **Windows 升级验证 + 中继脚本方案**: 用 `MoveFileEx(MOVEFILE_DELAY_UNTIL_REBOOT)` 或外部 updater.exe 解决 Windows 自升级问题。 From 1a72eeea27b8d2c66c50125becc5a71745bb8df8 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Tue, 14 Apr 2026 20:03:47 +0800 Subject: [PATCH 02/17] docs: revise agent self-upgrade spec per review feedback Addresses 6 findings and 2 open questions: - Add job_id (UUID v4) as primary matching key to prevent stale messages from polluting same-version retries; keep target_version as compat fallback. - Move start_job after all fallible pre-checks so agent-offline / platform-unsupported / checksum-fetch failures no longer create phantom Running jobs. - Extend AgentInfoUpdated to broadcast agent_version; front-end react-query cache now sees live Current version after upgrade completes. - Extend FullSync payload with upgrades array so WS reconnect / fresh page loads hydrate list badges correctly. - Latest-version endpoint now auto-detects GitHub Releases and supports latest_version_url override for self-hosted installs. - CapabilityDenied(upgrade) immediately fails the active job (no 120s wait). - Relocate AgentVersionSection from admin-only CapabilitiesDialog to Server detail page; Members get read-only visibility, admins keep Upgrade/Retry buttons. - Correct i18n stance: apps/web uses react-i18next; add en/zh key table. 
--- ...04-14-agent-self-upgrade-closure-design.md | 280 ++++++++++++++---- 1 file changed, 223 insertions(+), 57 deletions(-) diff --git a/docs/superpowers/specs/2026-04-14-agent-self-upgrade-closure-design.md b/docs/superpowers/specs/2026-04-14-agent-self-upgrade-closure-design.md index 0c0337cb..91021b4e 100644 --- a/docs/superpowers/specs/2026-04-14-agent-self-upgrade-closure-design.md +++ b/docs/superpowers/specs/2026-04-14-agent-self-upgrade-closure-design.md @@ -1,8 +1,8 @@ # Agent Self-Upgrade Closure — 设计文档 **日期**: 2026-04-14 -**状态**: Draft -**相关实现**: `crates/common/src/protocol.rs`, `crates/server/src/router/api/server.rs`, `crates/agent/src/reporter.rs`, `apps/web/src/components/server/` +**状态**: Draft (v2 — 根据 review 修订) +**相关实现**: `crates/common/src/protocol.rs`, `crates/server/src/router/api/server.rs`, `crates/agent/src/reporter.rs`, `crates/server/src/router/ws/agent.rs`, `crates/server/src/router/ws/browser.rs`, `crates/server/src/config.rs`, `apps/web/src/components/server/`, `apps/web/src/locales/` ## 概述 @@ -29,9 +29,14 @@ ServerBee 已经有一个"单向"的 agent 自升级通道:管理员通过 `PO 5. Agent 在写入新二进制之前执行 Pre-flight 预检(`--version` 探活),失败则中止升级。 6. Agent 备份文件采用带时间戳命名,保留 24h。 7. 前端在 `CapabilitiesDialog` 里新增 Agent Version 分组,显示当前版本、最新版本、触发按钮,以及升级进行中/失败/成功/超时的实时状态。 -8. 服务端新增 `GET /api/agent/latest-version` 端点,代理查询 `release_base_url` 的最新版本信息(10 分钟缓存)。 -9. 前端在 server 列表行上展示升级进行中/失败的 badge。 -10. 失败态提供 Retry 按钮,重新触发同一版本升级。 +8. 服务端新增 `GET /api/agent/latest-version` 端点,默认自动识别 GitHub Releases 上游,支持自托管 `latest_version_url` 覆盖,10 分钟成功缓存 / 1 分钟失败缓存。 +9. 前端在 server 列表行上展示升级进行中/失败的 badge,含 WS 重连/刷新场景的首屏注入。 +10. 失败态提供 Retry 按钮(admin-only),重新触发同一版本升级。 +11. 服务端在 SystemInfo 更新后广播扩展的 `AgentInfoUpdated` 消息(含 `agent_version`),前端据此刷新 react-query cache,确保升级成功后"Current 版本"实时更新。 +12. `BrowserMessage::FullSync` 额外携带 `upgrades: Vec`,用于 WS 首连/重连时 hydrate 前端 upgrade store。 +13. 
服务端在每次 `start_job` 时生成 `job_id: Uuid v4`,通过 `ServerMessage::Upgrade` 下发,agent 在后续 `UpgradeProgress` / `UpgradeResult` 中回传,tracker 以 `job_id` 为主键匹配,解决"同版本重试被延迟消息串台"问题。 +14. Agent 返回 `CapabilityDenied(upgrade)` 时,服务端立即将对应 Running job 翻转为 Failed(不必等 120s 超时)。 +15. `AgentVersionSection` 位于 Server 详情页(而非 admin-only 的 CapabilitiesDialog),Member 只读可见,admin 额外看到触发按钮。 ## 非需求 @@ -56,12 +61,31 @@ pub enum UpgradeStage { } ``` +### `ServerMessage::Upgrade` 新增 `job_id` 字段 + +```rust +Upgrade { + version: String, + download_url: String, + sha256: String, + #[serde(default)] + job_id: Option, // 新增 — UUID v4,用于区分同一版本的多次重试 +}, +``` + +`Option` + `#[serde(default)]` 保证跨版本兼容: +- **新 server → 旧 agent**: 旧 agent 反序列化时忽略未知字段(serde 默认行为)。 +- **旧 server → 新 agent**: 缺失 `job_id` 反序列化为 `None`,新 agent 继续工作,但其发回的 progress 消息也不带 job_id;服务端 tracker 在 job_id 为 None 时降级到 `(server_id, target_version)` 匹配。 +- **已知限制**: 旧 agent 场景下,同版本连续重试可能被"上一轮延迟消息"串台(target_version 相同);这是可接受的降级,实际影响仅在混版环境。 + ### `AgentMessage` 新增两个变体 ```rust // 进度里程碑:进入每个阶段之前发一条 UpgradeProgress { msg_id: String, + #[serde(default)] + job_id: Option, // 透传服务端 Upgrade 消息中的 job_id target_version: String, stage: UpgradeStage, }, @@ -69,6 +93,8 @@ UpgradeProgress { // 终态(仅失败时发送) UpgradeResult { msg_id: String, + #[serde(default)] + job_id: Option, target_version: String, stage: UpgradeStage, // 失败发生在哪一阶段 error: String, // 人读错误信息,UI 直接展示 @@ -76,7 +102,7 @@ UpgradeResult { ``` **设计要点**: -- `target_version` 在每条消息中冗余携带,服务端用来防止"上一轮 Timeout 后开启的新 job" 被旧消息污染。 +- `job_id` 是 tracker 匹配的首选键;`target_version` 降级为 UI metadata + 兼容回退键。 - `UpgradeResult` 不带 `ok` 字段 — agent 只在失败时发送,"存在 = 失败"。 - 成功态不由 agent 消息表达,仅靠"重连 + `SystemInfo.agent_version == target_version`" 推断。 - `msg_id` 沿用现有 `SystemInfo` 等消息的风格(UUID v4)。 @@ -101,11 +127,13 @@ pub enum UpgradeStatus { ```rust UpgradeProgress { server_id: String, + job_id: String, // tracker 侧一定有 job_id(server 自己生成) target_version: String, stage: UpgradeStage, }, UpgradeResult { server_id: String, + job_id: 
String, target_version: String, status: UpgradeStatus, // Succeeded / Failed / Timeout stage: Option, // 仅 Failed 时 @@ -113,6 +141,38 @@ UpgradeResult { }, ``` +### 扩展现有 `BrowserMessage::AgentInfoUpdated` + +**现状**(`crates/common/src/protocol.rs:319`)仅推 `protocol_version`,前端 `use-servers-ws.ts:239` 据此更新 react-query 缓存。 + +**改动**: 新增可选 `agent_version` 字段: + +```rust +AgentInfoUpdated { + server_id: String, + protocol_version: i32, + #[serde(default)] + agent_version: Option, // 新增 +}, +``` + +服务端在 SystemInfo handler 更新 DB 后,广播时同时携带两个字段。前端 hook 解析 `agent_version` 并 patch `['servers', id]` / `['servers-list']` cache 中的 `agent_version` 字段。这样 `AgentVersionSection` 组件只需从 react-query cache 读当前版本,升级成功后"Current"自动刷新,无需单独订阅。 + +### 扩展 `BrowserMessage::FullSync` + +**现状**(`crates/server/src/router/ws/browser.rs:251` `build_full_sync`)仅推 `servers: Vec`。 + +**改动**: + +```rust +FullSync { + servers: Vec, + upgrades: Vec, // 新增 — 所有 Running + 24h 内终态 job +}, +``` + +`build_full_sync` 从 `upgrade_tracker.snapshot()` 拉取。前端在收到 FullSync 时初始化/重置 upgrade store。WS 重连时 FullSync 会重发,自动 re-hydrate 列表行 badge 状态。 + ## 服务端改动(`crates/server`) ### 内存 Job 追踪器 @@ -121,6 +181,7 @@ UpgradeResult { ```rust pub struct UpgradeJob { + pub job_id: String, // UUID v4,server 生成 pub server_id: String, pub target_version: String, pub started_at: DateTime, @@ -136,40 +197,51 @@ pub struct UpgradeJobTracker { } ``` +**匹配策略(重要)**:所有 agent→server 的消息查找 job 时,优先用 `job_id` 严格匹配;若 agent 消息的 `job_id` 为 None(旧版本兼容路径),降级到 `(server_id, target_version)` 匹配 + `status == Running` 门槛。 + **核心方法**(所有状态变化方法内部广播 `BrowserMessage`): | 方法 | 行为 | |------|------| -| `start_job(server_id, target_version)` | DashMap `entry` 原子插入。若已存在 `Running` 状态的 job,返回 `AppError::Conflict`。若存在但为终态(Succeeded/Failed/Timeout),覆盖。 | -| `update_stage(server_id, target_version, stage)` | 只在 `target_version` 匹配且状态 `Running` 时更新。不匹配时 warn 日志并忽略。 | -| `mark_failed(server_id, target_version, stage, error)` | 翻转为 `Failed`,设置 `finished_at`,广播 `UpgradeResult`。 
| +| `start_job(server_id, target_version) -> Result` | DashMap `entry` 原子插入。生成 `job_id = Uuid::new_v4()`。若已存在 `Running` 状态的 job,返回 `AppError::Conflict` 带现有 job。若存在但为终态(Succeeded/Failed/Timeout),覆盖。返回新创建的 job 以便 router 传给 WS send。 | +| `update_stage(server_id, job_id_or_version, stage)` | 按匹配策略查 job;只在状态 `Running` 时更新。不匹配或已终态时 warn 日志并忽略。 | +| `mark_failed(server_id, job_id_or_version, stage, error)` | 翻转为 `Failed`,设置 `finished_at`,广播 `UpgradeResult`。 | +| `mark_failed_by_capability_denied(server_id)` | 专用入口:找当前 Running job 并 `mark_failed(Downloading, "capability denied by agent")`。无 Running 时 no-op。 | | `mark_succeeded(server_id, observed_version)` | 仅当 `observed_version == target_version` 且状态 `Running` 时翻转为 `Succeeded`。若状态已是 `Timeout` 则不覆盖(Timeout 即终态)。 | | `sweep_timeouts(now)` | 扫描所有 `Running` 且 `started_at + 120s < now` 的 job,翻转为 `Timeout`。 | | `cleanup_old(now)` | 删除所有 `finished_at + 24h < now` 的终态 job,防止内存无限增长。 | | `get(server_id) -> Option` | 供 `GET /api/servers/{id}/upgrade` 查询当前/最近 job。 | +| `snapshot() -> Vec` | 供 `build_full_sync` 一次性拉取所有 Running + 终态 job。 | **装配**: 在 `AppState` 新增 `upgrade_tracker: Arc`。 ### REST 路由改动 -`crates/server/src/router/api/server.rs` 现有 `trigger_upgrade`(519-626)调整流程: +`crates/server/src/router/api/server.rs` 现有 `trigger_upgrade`(519-626)调整流程 —— **关键**: `start_job` 必须放在所有 fallible 预检之后、WS send 之前,以免创建"假 Running job"。 1. 校验 `CAP_UPGRADE`(保留)。 -2. **新增**: `tracker.start_job(server_id, target_version)`;若 `Conflict` 返回 409 + 返回现有 job DTO。 -3. 获取 agent 平台、拉 `checksums.txt`、解析 SHA-256(保留)。 -4. 通过 AgentManager 发送 `ServerMessage::Upgrade`(保留)。 -5. **新增**: 若发送失败,`tracker.mark_failed(Downloading, "failed to notify agent: ...")`,返回 500。 -6. **新增**: 成功返回 202 Accepted + `{ data: { job: UpgradeJobDto } }`,前端直接拿到初始 `Running` 状态。 +2. 校验 agent 在线;若离线直接 409,不创建 job。 +3. 获取 agent 平台、构造资产名;若平台不支持返回 400,不创建 job。 +4. 拉 `checksums.txt` 并解析 SHA-256;网络/解析失败返回 502,不创建 job。 +5. 
**此时才调用** `tracker.start_job(server_id, target_version)` —— 若 `Conflict`(已有 Running job)返回 409 + 现有 job DTO。成功后拿到带 `job_id` 的 `UpgradeJob`。 +6. 通过 AgentManager 发送 `ServerMessage::Upgrade { ..., job_id: Some(job.job_id.clone()) }`。 +7. 若 WS 发送失败,立即调 `tracker.mark_failed(job.job_id, Downloading, "failed to notify agent: ...")` 并返回 500。虽然会留一个短暂的 Running → Failed 切换,这是预期的:UI 会看到一次失败广播,符合"所有失败都有可见反馈"的原则。 +8. 成功返回 202 Accepted + `{ data: { job: UpgradeJobDto } }`,前端直接拿到初始 `Running` 状态。 + +**并发保护**:两个 POST 同时到达时,步骤 2-4 都能各自完成(浪费一次 checksum 拉取),第 5 步 DashMap `entry` 原子性保证只有一个能 `start_job`,另一个拿 409。 **新增路由**: `GET /api/servers/{id}/upgrade` → 返回 `Option`(当前或最近 24h 内的终态 job)。挂载在 `read_router()` 下(Admin + Member 都可读)。 ### Agent WS 消息处理 -`crates/server/src/router/ws/agent.rs`(具体文件需在实现时确认)现有 match 分支新增: +`crates/server/src/router/ws/agent.rs` 现有 match 分支新增 / 修改: -- `AgentMessage::UpgradeProgress { target_version, stage, .. }` → `tracker.update_stage(server_id, target_version, stage)` -- `AgentMessage::UpgradeResult { target_version, stage, error, .. }` → `tracker.mark_failed(server_id, target_version, stage, error)` -- `AgentMessage::SystemInfo { info, .. }` 现有处理末尾加钩子:若 `tracker.get(server_id)` 为 `Running` 且 `info.agent_version == target_version` 则 `mark_succeeded`。 +- **新增** `AgentMessage::UpgradeProgress { job_id, target_version, stage, .. }` → `tracker.update_stage(server_id, job_id_or_version(...), stage)` +- **新增** `AgentMessage::UpgradeResult { job_id, target_version, stage, error, .. }` → `tracker.mark_failed(server_id, job_id_or_version(...), stage, error)` +- **修改** `AgentMessage::SystemInfo { info, .. }` 现有处理末尾加钩子: + 1. 若 `tracker.get(server_id)` 为 `Running` 且 `info.agent_version == target_version` → `tracker.mark_succeeded(server_id, info.agent_version)` + 2. 
**新增**: DB 更新完成后,广播 `BrowserMessage::AgentInfoUpdated { server_id, protocol_version, agent_version: Some(info.agent_version.clone()) }` +- **修改** `AgentMessage::CapabilityDenied` 当前分支(`agent.rs:586-594`)处理 "upgrade" capability:若 `capability == "upgrade"`,调用 `tracker.mark_failed_by_capability_denied(server_id)`,让 UI 立即看到失败(而非等 120s 超时)。其他 capability 的现有处理逻辑保持不变。 ### 超时 / 清理后台任务 @@ -181,13 +253,32 @@ pub struct UpgradeJobTracker { **目的**: 前端需要"最新版本号"才能决定是否显示升级按钮。 -**实现**: -- 新 handler,内存缓存(OnceCell + Mutex 或 `moka`)TTL 10 分钟。 -- 服务端从 `config.release_base_url` 拼 `{base}/latest.txt`(或 `latest.json`,具体格式待实现时决定),拉取并解析版本号。 -- 成功返回 `{ version: "0.9.0", released_at: ISO8601? }`。 -- 失败(网络错误 / 未配置 `release_base_url` / 解析失败)返回 `{ version: null, error: "..." }`,**不抛 500**。 -- 失败响应缓存 1 分钟,避免打爆上游或重复失败。 -- 挂载在 `read_router()`。 +**上游协议**(默认 `release_base_url = https://github.com/ZingerLittleBee/ServerBee/releases`,见 `crates/server/src/config.rs:274-282`): + +策略 (C) — 自动识别 GitHub + 可选 override: +1. **Override**: 若配置 `upgrade.latest_version_url` 显式设置(新增 optional 字段),直接调用该 URL,期望返回 JSON `{ version: "x.y.z", released_at?: "..." }`。 +2. **Auto-detect**: 否则解析 `release_base_url`,若匹配正则 `^https://github\.com/([^/]+)/([^/]+)/releases/?$`,自动调用 GitHub API:`GET https://api.github.com/repos/{owner}/{repo}/releases/latest`,提取 `tag_name`(剥掉可选 `v` 前缀)作为版本号,`published_at` 作为 released_at。 +3. 
**Neither matches**: 返回 `{ version: null, error: "auto-detect failed; set upgrade.latest_version_url" }`,前端据此隐藏升级按钮并显示引导文案。 + +**实现细节**: +- 内存缓存(OnceCell + Mutex 或 `moka`)成功响应 TTL 10 分钟。 +- 失败响应 TTL 1 分钟,避免打爆上游。 +- GitHub API 未认证配额 60 req/hour per IP — 10 分钟缓存下单实例每小时最多 6 次调用,远低于限制。 +- HTTP client 使用现有 `reqwest` 实例;超时 10s;User-Agent 带 `serverbee-server/`。 +- 挂载在 `read_router()`(Admin + Member 都可读)。 + +**新增 config** (`crates/server/src/config.rs`): + +```rust +pub struct UpgradeConfig { + #[serde(default = "default_release_base_url")] + pub release_base_url: String, + #[serde(default)] + pub latest_version_url: Option, // 新增 — 自托管覆盖 +} +``` + +对应环境变量:`SERVERBEE_UPGRADE__LATEST_VERSION_URL`。需同步更新 `ENV.md` 和 `apps/docs/content/docs/{en,cn}/configuration.mdx`。 ## Agent 改动(`crates/agent`) @@ -258,12 +349,15 @@ emit(Restarting) ## 前端改动(`apps/web`) -### 新增 hook +### 全局 upgrade job store -`apps/web/src/hooks/use-upgrade-job.ts` +**方式**: 新建 `apps/web/src/stores/upgrade-jobs-store.ts`,使用 **zustand**(如现有代码已有 zustand usage 则沿用;否则退化为 React Context + reducer;plan 阶段先 grep 确认)。 + +**State 形状**: ```ts type UpgradeJob = { + jobId: string serverId: string targetVersion: string stage: 'downloading' | 'verifying' | 'pre_flight' | 'installing' | 'restarting' @@ -272,23 +366,42 @@ type UpgradeJob = { startedAt: string finishedAt?: string } + +type UpgradeJobsStore = { + jobs: Record // key: serverId + setJobs: (jobs: UpgradeJob[]) => void // FullSync 批量初始化 + upsertJob: (job: UpgradeJob) => void // 单条更新 + clearFinished: (serverId: string) => void // 成功态 3 秒后调用 +} ``` -两个 hook: -- `useUpgradeJob(serverId)`: - - 首次 mount 调 `GET /api/servers/{id}/upgrade` 同步初始状态。 - - 订阅现有全局 WS(`use-servers-ws.ts`)新增的 `upgrade_progress` / `upgrade_result` 消息,匹配 serverId 后更新本地 state。 -- `useTriggerUpgrade()`: - - mutation,POST `/api/servers/{id}/upgrade`。 - - 成功后把返回的 `job` 写入全局 store(便于列表 badge 立即展示)。 +### 新增 hook + +`apps/web/src/hooks/use-upgrade-job.ts` + +- `useUpgradeJob(serverId)`: 从 store 选取单台 server 的 
job。首次 mount 时如果 store 中没有对应 serverId 的条目,调 `GET /api/servers/{id}/upgrade` 兜底获取(覆盖"从外部链接直接进入 server 详情页"的场景)。 +- `useTriggerUpgrade()`: mutation,POST `/api/servers/{id}/upgrade`,成功后把返回的 `job` 写入 store(`upsertJob`)。 ### WS 消息路由 -`apps/web/src/hooks/use-servers-ws.ts` 现有消息 switch 新增两个 case,dispatch 到全局 upgrade job store。具体 store 方式(zustand / tanstack-query / context)按现有代码风格就近匹配,属实现细节。 +`apps/web/src/hooks/use-servers-ws.ts` 现有消息 switch 新增 / 修改 case: + +- **新增** `upgrade_progress` / `upgrade_result` → `store.upsertJob(...)` +- **扩展** `agent_info_updated`(现有 l.239): 除了 `protocol_version`,额外 patch `agent_version`(如果消息带此字段)到 `['servers', id]` 和 `['servers-list']` cache。 +- **扩展** `full_sync`: 解析新增的 `upgrades` 数组,调 `store.setJobs(upgrades)`。 ### UI 组件 -**位置**: `CapabilitiesDialog` 底部独立一个 `AgentVersionSection` 子组件(新建 `apps/web/src/components/server/agent-version-section.tsx`)。 +**位置变更(修订)**: `AgentVersionSection` **不放**在 admin-only 的 `CapabilitiesDialog` 里,改放在 **Server 详情页**(`apps/web/src/routes/_authed/servers/$serverId.*.tsx` 的合适子区块,实现时确认具体路由文件名)作为独立的信息区。理由: + +- `GET /api/servers/{id}/upgrade` 对 Member 可读,Member 需要能看到"本机正在升级 / 刚升级失败"的完整上下文(列表 badge 点进去要能看到原因)。 +- `CapabilitiesDialog` hard-gate `user?.role !== 'admin'` 会把整个 section 对 Member 隐藏,与 GET 端点可读性不一致。 + +**渲染分层**: +- 所有角色:看到 `Current: vX.Y.Z`、`Latest: vY.Y.Z`(或 "Up to date")、Running/Failed/Timeout 的完整 stepper/文案。 +- 仅 admin:额外渲染 `[ Upgrade to vY.Y.Z ]` 按钮和失败态的 `Retry` 按钮。Member 在 idle 态看不到操作按钮;在 Failed 态看到原因但无法重试。 + +**新建文件**: `apps/web/src/components/server/agent-version-section.tsx`。 **三个显示态**: @@ -318,7 +431,33 @@ type UpgradeJob = { ### i18n -本次按英文单语处理(`apps/web` 暂未接入 i18n 框架)。如后续接入,新增文案 key 统一提取。 +**修订**: `apps/web` 已接入 react-i18next(参考 `capabilities-dialog.tsx:55` `t('cap_group_low_risk', ...)`),文案分别在 `apps/web/src/locales/en/servers.json` 和 `apps/web/src/locales/zh/servers.json`。 + +**改动**: 新增文案 key 同时写入 `en/servers.json` 和 `zh/servers.json`(或新建 `upgrade` namespace),至少包括: + +| Key | EN | 中文 | 
+|---|---|---| +| `upgrade.section_title` | Agent Version | Agent 版本 | +| `upgrade.current` | Current | 当前版本 | +| `upgrade.latest` | Latest | 最新版本 | +| `upgrade.up_to_date` | Up to date | 已是最新 | +| `upgrade.button` | Upgrade to {{version}} | 升级到 {{version}} | +| `upgrade.confirm_title` | Confirm upgrade | 确认升级 | +| `upgrade.confirm_body` | Upgrading will disconnect the agent for up to 2 minutes. Proceed? | 升级将使 agent 断连最长 2 分钟,是否继续? | +| `upgrade.disabled_cap` | Enable Upgrade capability first | 请先开启升级能力 | +| `upgrade.not_configured` | Auto upgrade not configured. See docs. | 未配置自动升级,请参考文档 | +| `upgrade.stage.downloading` | Downloading | 下载中 | +| `upgrade.stage.verifying` | Verifying | 校验中 | +| `upgrade.stage.pre_flight` | Pre-flight | 预检中 | +| `upgrade.stage.installing` | Installing | 安装中 | +| `upgrade.stage.restarting` | Restarting | 重启中 | +| `upgrade.succeeded` | Upgraded to {{version}} | 已升级到 {{version}} | +| `upgrade.failed` | Failed at {{stage}}: {{error}} | {{stage}} 阶段失败:{{error}} | +| `upgrade.failed_hint` | Previous binary kept at {{path}} for 24h. | 旧版本保留在 {{path}}(24 小时) | +| `upgrade.timeout` | Agent did not reconnect within 2 minutes. It may still be restarting; check back shortly. | Agent 未在 2 分钟内重连,可能仍在重启,请稍后查看 | +| `upgrade.retry` | Retry | 重试 | +| `upgrade.badge_running` | Upgrading... 
| 升级中… | +| `upgrade.badge_failed` | Upgrade failed | 升级失败 | ## 错误处理 & 边界 @@ -326,16 +465,21 @@ type UpgradeJob = { | 场景 | 行为 | |------|------| -| POST 时 agent 离线 | 409 Conflict,不创建 job | +| POST 时 agent 离线 | 409 Conflict,**不创建 job**(预检在 start_job 之前) | +| POST 时平台不支持 | 400 Bad Request,**不创建 job** | +| POST 时 checksums 拉取 / 解析失败 | 502 Bad Gateway,**不创建 job** | | POST 时已有 Running job | 409 Conflict + 返回现有 job | | POST 时 CAP_UPGRADE 未启用 | 403 Forbidden(现有逻辑保留) | -| WS 发送 Upgrade 消息失败 | `mark_failed(Downloading, "failed to notify agent")` + 500 | -| 收到 UpgradeProgress 但 target_version 不匹配 | 忽略 + warn 日志 | +| WS 发送 Upgrade 消息失败 | `mark_failed(Downloading, "failed to notify agent")` + 500(已有 Running job,会产生一次 Running→Failed 广播,UI 能看到) | +| 收到 UpgradeProgress 但 job_id 不匹配 | 忽略 + warn 日志 | +| 收到 UpgradeProgress 但 job_id 缺失(旧 agent)且 target_version 不匹配 | 忽略 + warn 日志 | | 收到 UpgradeProgress 但无活跃 job | 忽略 + warn 日志 | | 收到 UpgradeResult 但 job 已 Succeeded/Timeout | 忽略 | | Agent 升级中 WS 断开 | 不立即判失败(Restarting 阶段会断) | +| Agent 返回 CapabilityDenied(upgrade) | 立即 `mark_failed_by_capability_denied(server_id)`,UI 即时看到失败原因 | | Agent 重连但版本号仍是旧的 | job 保持 Running,等超时 | | Agent 重连但版本号非 target | `mark_failed(Restarting, "agent reconnected with unexpected version X, expected Y")` | +| SystemInfo 到达后 | DB 更新 + 广播 `AgentInfoUpdated { agent_version }`;如匹配 target_version 则触发 `mark_succeeded` | | `latest-version` 拉取失败 | API 返回 `{ version: null, error }`,不 500;缓存空结果 1 分钟 | | 超时后 SystemInfo 匹配 | **不翻转回 Succeeded**;Timeout 即终态(UI 里 Timeout + 新版本号可并存,由用户自行判断) | @@ -369,15 +513,21 @@ type UpgradeJob = { ### 单元测试 -**`crates/common/src/protocol.rs`** (~6 tests) +**`crates/common/src/protocol.rs`** (~8 tests) - `UpgradeStage` / `UpgradeStatus` / `UpgradeProgress` / `UpgradeResult`(AgentMessage + BrowserMessage)serde roundtrip。 - -**`crates/server/src/service/upgrade_tracker.rs`**(新建,~10 tests) -- `start_job` 成功路径 / 并发 Conflict / 旧终态 job 可被覆盖 -- `update_stage` target_version 不匹配时忽略 +- 
`ServerMessage::Upgrade` 带 `job_id: Some` 和省略 `job_id` 字段两种输入都能正确反序列化(向后兼容断言)。 +- `AgentMessage::UpgradeProgress` 缺失 `job_id` 时 `#[serde(default)]` 生效为 `None`。 +- `BrowserMessage::AgentInfoUpdated` 带/不带 `agent_version` 都能反序列化。 + +**`crates/server/src/service/upgrade_tracker.rs`**(新建,~12 tests) +- `start_job` 成功路径 / 并发 Conflict / 旧终态 job 可被覆盖 / 返回的 job 带 UUID v4 `job_id` +- `update_stage` 以 `job_id` 匹配 Running job → 更新;`job_id` 不匹配 → 忽略 +- `update_stage` job_id 缺失(None)时退化到 target_version 匹配 +- `mark_failed_by_capability_denied` 仅对 Running job 生效,非 Running 时 no-op - `mark_succeeded` 在 Timeout 态下不覆盖 - `sweep_timeouts` 仅翻转 Running,严格按 120s 阈值 - `cleanup_old` 按 24h 阈值删除终态 job +- `snapshot()` 返回所有 Running + 未过期终态 job - 广播断言:每次状态变化 subscriber 都收到对应 `BrowserMessage` **`crates/agent/src/reporter.rs`** @@ -386,27 +536,34 @@ type UpgradeJob = { ### 集成测试 -**`crates/server/tests/integration/upgrade.rs`**(新建,~7 tests) +**`crates/server/tests/integration/upgrade.rs`**(新建,~10 tests) 使用现有 `crates/server/tests/integration/` 测试夹具(具体形态在实现时确认,可能需要模拟 agent WS 连接): -1. **成功路径**: 模拟 agent 发 Downloading → Verifying → PreFlight → Installing → Restarting → 断开 → 重连 + SystemInfo(new_version) → 断言 `status=Succeeded` +1. **成功路径**: 模拟 agent 发 Downloading → Verifying → PreFlight → Installing → Restarting → 断开 → 重连 + SystemInfo(new_version) → 断言 `status=Succeeded`,并断言 `AgentInfoUpdated` 广播中包含 `agent_version=new_version` 2. **失败路径**: 模拟 agent 发 Downloading → UpgradeResult(Verifying, "sha256 mismatch") → 断言 `status=Failed, stage=Verifying` 3. **超时路径**: 模拟 agent 发 Downloading → Restarting → 断开不再重连 → 快进 121s → 断言 `status=Timeout` 4. **并发拒绝**: 连续两次 POST /upgrade → 第二次 409 -5. **target_version 错位**: POST 升 v1.0,agent 发 UpgradeProgress(target_version=v0.9) → 忽略,job 状态不变 -6. **Agent 重连错版本**: POST 升 v1.0,agent 重连但 SystemInfo 仍是 v0.8 → Running 保持 → Timeout -7. **CAP_UPGRADE 关闭**: POST → 403,无 job 创建 +5. **pre-check 失败不创建 job**: agent 离线时 POST → 409,tracker 中无 job +6. 
**checksum 404 不创建 job**: 配置错误的 release_base_url → 502,tracker 中无 job +7. **job_id 错位**: POST 创建 job A,之后 agent 发 UpgradeProgress 带旧 job_id → 忽略,job A 状态不变 +8. **同版本重试保护**: POST 触发 v1.0(job_A),agent 失败;POST 再次触发 v1.0(job_B,job_id 不同);此时模拟 job_A 的延迟 UpgradeResult 到达 → 被忽略,job_B 不受影响 +9. **CapabilityDenied 快速失败**: POST 触发升级后 agent 立即发 `CapabilityDenied(capability="upgrade")` → job 立即 Failed,不等超时 +10. **FullSync 携带 upgrades**: 有 1 个 Running job + 1 个 Failed job 时,新浏览器订阅 WS 得到的 FullSync 包含这两个 job -### 前端测试(vitest,~8 tests) +### 前端测试(vitest,~10 tests) -- `use-upgrade-job.ts`: mock WS 消息,断言 state 转换(idle → running → succeeded / failed / timeout) +- `upgrade-jobs-store.test.ts`: `setJobs` 批量初始化 / `upsertJob` 单条更新 / `clearFinished` 只清理终态 +- `use-upgrade-job.ts`: mock WS 消息 + store 交互,断言 state 转换(idle → running → succeeded / failed / timeout) - `AgentVersionSection.test.tsx`: - - idle 状态显示 "Upgrade to vX" 按钮 - - CAP_UPGRADE 关闭时按钮 disabled + - Admin + idle 状态:显示 "Upgrade to vX" 按钮 + - Member + idle 状态:**不显示**按钮,Current/Latest 仍可见 + - CAP_UPGRADE 关闭时 admin 按钮 disabled + tooltip - Running 态显示 stepper,当前阶段高亮 - - Failed 态显示 Retry 按钮,点击触发 mutation + - Failed 态 admin 显示 Retry 按钮;Member 只看到失败原因 - latest-version API 返回 null 时隐藏按钮 + 显示引导文案 +- `use-servers-ws.ts`: `agent_info_updated` 携带 `agent_version` 时正确 patch react-query cache +- `use-servers-ws.ts`: `full_sync` 的 `upgrades` 数组调 `store.setJobs` ### E2E 手动清单 @@ -426,7 +583,14 @@ type UpgradeJob = { ## 配置 / 环境变量 -**无新增环境变量**。现有 `SERVERBEE_RELEASE__BASE_URL`(或等价项,实现时确认)被新的 `GET /api/agent/latest-version` 复用。若该配置未设置,前端显示"Auto upgrade not configured"引导文案。 +**新增 1 个可选环境变量**: + +- `SERVERBEE_UPGRADE__LATEST_VERSION_URL` (optional) — 自托管用户可显式指定最新版本查询 URL。期望返回 JSON `{ version: "x.y.z", released_at?: "..." 
}`。未设置时自动识别 `upgrade.release_base_url`(现有 `SERVERBEE_UPGRADE__RELEASE_BASE_URL`)是否为 GitHub Releases 格式并调用 GitHub API。 + +**需同步更新**: +- `ENV.md` +- `apps/docs/content/docs/en/configuration.mdx` +- `apps/docs/content/docs/cn/configuration.mdx` ## 迁移 @@ -439,6 +603,8 @@ type UpgradeJob = { 1. **Windows 未验证**: 运行中的 exe 无法被 rename/delete;本设计不解决该遗留问题。 2. **服务端重启丢失活跃 job**: 内存态不持久化。可接受的降级:最终 `server.agent_version` 会被 SystemInfo 正常更新,但 UI 看不到升级确认。 3. **Timeout 后的"迟到成功"不翻转**: 如果 agent 在 120s 后才重连并上报新版本,UI 会同时看到"Timeout 状态"和"新版本号"。这是故意为之,避免引入"late success"概念。 +4. **旧版本 agent 的同版本重试保护降级**: 若 agent 未升级到支持 `job_id` 的版本,服务端与 agent 之间的 job 匹配退化到 `(server_id, target_version)`。在此模式下连续重试同一版本时,第一次失败的"延迟消息"理论上可能污染第二次尝试。实际影响范围:仅混版(server 已升级但 agent 未升级);升级一次 agent 到新协议后此限制自动解除。 +5. **GitHub API 速率限制**: 未认证的 GitHub API 配额 60 req/hour per IP。10 分钟缓存下单实例远低于限制;高密度部署(如单 IP 大量容器)可能触及,需要用户自行设置 `latest_version_url` 指向内部镜像。 ### 可能的后续工作 From f8455e81d659b1cc8156bbf6ea7cc705e9ae2e1c Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Tue, 14 Apr 2026 20:05:22 +0800 Subject: [PATCH 03/17] docs: clarify upgrade store setJobs semantics and fix stale CapabilitiesDialog reference --- .../specs/2026-04-14-agent-self-upgrade-closure-design.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/superpowers/specs/2026-04-14-agent-self-upgrade-closure-design.md b/docs/superpowers/specs/2026-04-14-agent-self-upgrade-closure-design.md index 91021b4e..cfe709c0 100644 --- a/docs/superpowers/specs/2026-04-14-agent-self-upgrade-closure-design.md +++ b/docs/superpowers/specs/2026-04-14-agent-self-upgrade-closure-design.md @@ -369,8 +369,8 @@ type UpgradeJob = { type UpgradeJobsStore = { jobs: Record // key: serverId - setJobs: (jobs: UpgradeJob[]) => void // FullSync 批量初始化 - upsertJob: (job: UpgradeJob) => void // 单条更新 + setJobs: (jobs: UpgradeJob[]) => void // FullSync 批量替换(整体覆盖;未出现在新列表的 serverId 被删除) + upsertJob: (job: UpgradeJob) => void // 单条更新;相同 (serverId, 
jobId, status) 的重复消息需要去重,避免 WS 回放导致 UI 闪烁 clearFinished: (serverId: string) => void // 成功态 3 秒后调用 } ``` @@ -427,7 +427,7 @@ type UpgradeJobsStore = { `apps/web/src/components/server/server-list` 或对应列表组件订阅全局 upgrade job store: - Running 态 → 行尾小 badge `"Upgrading..."` -- Failed 态 → 红色 badge `"Upgrade failed"`(可点击跳到 CapabilitiesDialog 查看详情) +- Failed 态 → 红色 badge `"Upgrade failed"`(可点击跳到 Server 详情页的 AgentVersionSection 查看原因) ### i18n From 33321a4f0aaeda4da19628dca1ab971ad40956d9 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Tue, 14 Apr 2026 20:08:59 +0800 Subject: [PATCH 04/17] docs: add agent self-upgrade closure implementation plan Nine tasks covering protocol extension, server tracker + timeout worker, latest-version lookup service, REST/WS wiring with start_job ordering, agent hardening with preflight and timestamped .bak rollback, server integration coverage, frontend Zustand store + hooks + WS hydration, server detail UI + list badge, and docs/QA sweep. Includes four review-driven amendments folded in up front: - Preserve CAP_UPGRADE pre-check in trigger_upgrade - Fold CapabilityDenied upgrade handling into existing match arm - Convert handleWsMessage from private to exported in place - Add Verifying-failure and timeout-sweep integration tests --- .../2026-04-14-agent-self-upgrade-closure.md | 2563 +++++++++++++++++ 1 file changed, 2563 insertions(+) create mode 100644 docs/superpowers/plans/2026-04-14-agent-self-upgrade-closure.md diff --git a/docs/superpowers/plans/2026-04-14-agent-self-upgrade-closure.md b/docs/superpowers/plans/2026-04-14-agent-self-upgrade-closure.md new file mode 100644 index 00000000..c587ffee --- /dev/null +++ b/docs/superpowers/plans/2026-04-14-agent-self-upgrade-closure.md @@ -0,0 +1,2563 @@ +# Agent Self-Upgrade Closure Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan 
task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Build an observable, retryable, bounded-risk agent self-upgrade flow with live backend/frontend status, a cached latest-version lookup, and safer agent restart behavior. + +**Architecture:** Extend the shared websocket protocol with upgrade lifecycle messages plus a server-generated `job_id`, then keep authoritative upgrade job state in a server-side in-memory tracker keyed by `server_id`. The agent emits stage/failure events around a safer upgrade pipeline with preflight and timestamped backups; the web app hydrates upgrade jobs from browser WS `FullSync` plus a small Zustand store and renders status on the server detail page and server list surfaces. + +**Tech Stack:** Rust (`axum`, `tokio`, `dashmap`, `serde`, `uuid`, `chrono`, `reqwest`, `utoipa`), React 19, TanStack Query/Router, Zustand, react-i18next, Vitest. + +--- + +## Scope Notes + +- Keep this as one implementation plan. The protocol, server tracker, agent restart flow, and frontend UI are tightly coupled and do not produce a useful partial release on their own. +- Keep server upgrade integration coverage in `crates/server/tests/integration.rs` instead of inventing a new `tests/integration/` tree. The repo already keeps reusable server test harness code in that file. +- Two spec gaps need to be closed during implementation: + 1. The requested failed-state UI hint needs a concrete backup path. Add `backup_path: Option` to failure payloads and the tracker DTO so the frontend can render it when available. + 2. `AppError::Conflict` cannot carry the "existing job DTO" payload the spec wants. Use a service-local `StartUpgradeJobError::Conflict(UpgradeJob)` and let the API route return a structured `409` body manually. + +### Amendments after plan review (2026-04-14) + +The plan was reviewed against the spec and four implementer-facing clarifications were folded in. 
These do NOT change scope — they prevent predictable execution mistakes: + +1. **Task 4 `trigger_upgrade`**: the existing `CAP_UPGRADE` pre-check (`crates/server/src/router/api/server.rs:536-544`) MUST run before `start_job`. The code snippet in Task 4 was augmented to include it explicitly. +2. **Task 4 `CapabilityDenied` handler**: do NOT add a new sibling `match` arm with an `if` guard — the existing catch-all arm at `ws/agent.rs:586-627` would make it unreachable. Fold the `mark_failed_by_capability_denied` call INTO the existing arm body instead. +3. **Task 7 `handleWsMessage`**: the function already exists as a private `function` at `use-servers-ws.ts:299`. Step 3 must convert it to `export function` in place; do not create a duplicate. A note was added to Task 7 Step 2. +4. **Task 6 integration tests**: added two scenarios — `upgrade_result_failure_marks_job_failed_with_reason` (Verifying failure path) and `upgrade_timeout_sweeper_flips_stuck_running_job` (timeout sweep). These cover spec integration scenarios #2 and #3, which were previously only exercised via tracker unit tests. + +## File Map + +### Shared protocol + +- Modify `crates/common/src/protocol.rs` + Responsibility: add `UpgradeStage`, `UpgradeStatus`, `UpgradeJobDto`, `job_id` on `ServerMessage::Upgrade`, new agent/browser upgrade message variants, `agent_version` on `BrowserMessage::AgentInfoUpdated`, `upgrades` on `BrowserMessage::FullSync`, and serde coverage. + +### Server backend + +- Create `crates/server/src/service/upgrade_tracker.rs` + Responsibility: own in-memory upgrade jobs, match agent messages by `job_id` first, broadcast browser events, enforce timeout/cleanup rules. +- Create `crates/server/src/service/upgrade_release.rs` + Responsibility: resolve latest release version with 10-minute success cache / 1-minute failure cache, derive GitHub API URL from `release_base_url`, and resolve release asset checksum/download metadata for `trigger_upgrade`. 
+- Modify `crates/server/src/service/mod.rs` + Responsibility: export the new upgrade services. +- Modify `crates/server/src/state.rs` + Responsibility: attach `upgrade_tracker` and `upgrade_release_service` to `AppState`. +- Modify `crates/server/src/config.rs` + Responsibility: add `upgrade.latest_version_url`. +- Modify `crates/server/src/router/api/mod.rs` + Responsibility: expose the new authenticated `agent::read_router()`. +- Modify `crates/server/src/router/api/agent.rs` + Responsibility: add `GET /api/agent/latest-version` plus DTO schema. +- Modify `crates/server/src/router/api/server.rs` + Responsibility: return structured upgrade job payloads, add `GET /api/servers/{id}/upgrade`, change `POST /api/servers/{id}/upgrade` to tracker-aware `202/409`, and use the release service for checksum resolution. +- Modify `crates/server/src/router/ws/agent.rs` + Responsibility: consume `UpgradeProgress` / `UpgradeResult`, mark success on reconnect + `SystemInfo.agent_version`, fail fast on `CapabilityDenied(upgrade)`, and broadcast `agent_version`. +- Modify `crates/server/src/router/ws/browser.rs` + Responsibility: include `upgrades` in `FullSync`. +- Modify `crates/server/src/openapi.rs` + Responsibility: register the new read/write paths and schemas. +- Create `crates/server/src/task/upgrade_timeout.rs` + Responsibility: sweep 120-second timeouts and prune 24-hour terminal jobs. +- Modify `crates/server/src/task/mod.rs` + Responsibility: export the timeout task. +- Modify `crates/server/src/main.rs` + Responsibility: spawn the timeout task. + +### Agent + +- Modify `crates/agent/src/reporter.rs` + Responsibility: accept upgrade `job_id`, emit progress/failure messages, extract `verify_sha256` and `run_preflight`, keep timestamped backups for 24 hours, reject concurrent upgrades, and restart only after preflight succeeds. 
+ +### Web frontend + +- Create `apps/web/src/stores/upgrade-jobs-store.ts` + Responsibility: keep one upgrade job per server, dedupe repeat WS payloads, and clear finished jobs after the success toast window. +- Create `apps/web/src/stores/upgrade-jobs-store.test.ts` + Responsibility: store behavior coverage. +- Create `apps/web/src/hooks/use-upgrade-job.ts` + Responsibility: read a server's current upgrade job, hydrate from `GET /api/servers/{id}/upgrade` on direct entry, and expose the trigger mutation. +- Modify `apps/web/src/hooks/use-servers-ws.ts` + Responsibility: route `upgrade_progress`, `upgrade_result`, `full_sync.upgrades`, and optional `agent_version`. +- Modify `apps/web/src/hooks/use-servers-ws.test.ts` + Responsibility: websocket reducer coverage for the new payloads. +- Create `apps/web/src/components/server/agent-version-section.tsx` + Responsibility: render current/latest version, admin-only action buttons, running stepper, and terminal states. +- Create `apps/web/src/components/server/agent-version-section.test.tsx` + Responsibility: component behavior coverage by role/status. +- Create `apps/web/src/components/server/upgrade-job-badge.tsx` + Responsibility: shared list/card badge for running/failed/timeout states. +- Modify `apps/web/src/routes/_authed/servers/$id.tsx` + Responsibility: place the new section on the server detail page. +- Modify `apps/web/src/routes/_authed/servers/$id.test.tsx` + Responsibility: assert the detail page renders the new section. +- Modify `apps/web/src/routes/_authed/servers/index.tsx` + Responsibility: show the shared upgrade badge in the table view. +- Modify `apps/web/src/components/server/server-card.tsx` + Responsibility: show the shared upgrade badge in the grid card view. +- Modify `apps/web/src/lib/api-schema.ts` + Responsibility: re-export `UpgradeJobDto`, `UpgradeStage`, `UpgradeStatus`, and `LatestAgentVersionResponse`. 
+- Regenerate `apps/web/src/lib/api-types.ts`
+  Responsibility: generated OpenAPI types. Do not hand-edit.
+- Modify `apps/web/src/locales/en/servers.json`
+  Responsibility: add flat `upgrade_*` translation keys in English.
+- Modify `apps/web/src/locales/zh/servers.json`
+  Responsibility: add flat `upgrade_*` translation keys in Chinese.
+
+### Docs and manual QA
+
+- Modify `ENV.md`
+  Responsibility: document `SERVERBEE_UPGRADE__LATEST_VERSION_URL`.
+- Modify `apps/docs/content/docs/en/configuration.mdx`
+  Responsibility: add the new env var plus the `[upgrade]` field reference.
+- Modify `apps/docs/content/docs/cn/configuration.mdx`
+  Responsibility: add the new env var plus the `[upgrade]` field reference.
+- Create `tests/agent-upgrade.md`
+  Responsibility: manual end-to-end validation checklist.
+- Modify `tests/README.md`
+  Responsibility: add the new checklist to the test index.
+
+### Task 1: Extend the Shared Upgrade Protocol
+
+**Files:**
+- Modify: `crates/common/src/protocol.rs`
+- Test: `crates/common/src/protocol.rs`
+
+- [ ] **Step 1: Write the failing protocol serde tests**
+
+```rust
+#[test]
+fn test_server_upgrade_with_job_id_round_trip() {
+    let msg = ServerMessage::Upgrade {
+        version: "1.2.3".to_string(),
+        download_url: "https://example.com/serverbee-agent-linux-amd64".to_string(),
+        sha256: "deadbeef".to_string(),
+        job_id: Some("job-1".to_string()),
+    };
+
+    let json = serde_json::to_string(&msg).unwrap();
+    assert!(json.contains("\"job_id\":\"job-1\""));
+
+    match serde_json::from_str::<ServerMessage>(&json).unwrap() {
+        ServerMessage::Upgrade { job_id, .. } => {
+            assert_eq!(job_id.as_deref(), Some("job-1"));
+        }
+        _ => panic!("Expected Upgrade"),
+    }
+}
+
+#[test]
+fn test_upgrade_messages_without_job_id_stay_backward_compatible() {
+    let json =
+        r#"{"type":"upgrade_progress","msg_id":"m1","target_version":"1.2.3","stage":"downloading"}"#;
+
+    match serde_json::from_str::<AgentMessage>(json).unwrap() {
+        AgentMessage::UpgradeProgress { job_id, .. } => {
+            assert!(job_id.is_none());
+        }
+        _ => panic!("Expected UpgradeProgress"),
+    }
+}
+
+#[test]
+fn test_browser_full_sync_with_upgrades_round_trip() {
+    let msg = BrowserMessage::FullSync {
+        servers: vec![],
+        upgrades: vec![UpgradeJobDto {
+            server_id: "s1".to_string(),
+            job_id: "job-1".to_string(),
+            target_version: "1.2.3".to_string(),
+            stage: UpgradeStage::Installing,
+            status: UpgradeStatus::Running,
+            error: None,
+            backup_path: None,
+            started_at: chrono::Utc::now(),
+            finished_at: None,
+        }],
+    };
+
+    let json = serde_json::to_string(&msg).unwrap();
+    match serde_json::from_str::<BrowserMessage>(&json).unwrap() {
+        BrowserMessage::FullSync { upgrades, .. } => {
+            assert_eq!(upgrades.len(), 1);
+            assert_eq!(upgrades[0].job_id, "job-1");
+        }
+        _ => panic!("Expected FullSync"),
+    }
+}
+
+#[test]
+fn test_agent_info_updated_accepts_optional_agent_version() {
+    let json =
+        r#"{"type":"agent_info_updated","server_id":"s1","protocol_version":3,"agent_version":"1.2.3"}"#;
+
+    match serde_json::from_str::<BrowserMessage>(json).unwrap() {
+        BrowserMessage::AgentInfoUpdated {
+            server_id,
+            protocol_version,
+            agent_version,
+        } => {
+            assert_eq!(server_id, "s1");
+            assert_eq!(protocol_version, 3);
+            assert_eq!(agent_version.as_deref(), Some("1.2.3"));
+        }
+        _ => panic!("Expected AgentInfoUpdated"),
+    }
+}
+```
+
+- [ ] **Step 2: Run the protocol tests to verify they fail**
+
+Run:
+
+```bash
+cargo test -p serverbee-common test_server_upgrade_with_job_id_round_trip -- --exact
+cargo test -p serverbee-common test_upgrade_messages_without_job_id_stay_backward_compatible -- --exact
+```
+
+Expected: FAIL because the new upgrade enums, DTOs, and `job_id` / `agent_version` fields do not exist yet.
+
+- [ ] **Step 3: Implement the protocol changes**
+
+```rust
+use chrono::{DateTime, Utc};
+
+#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
+#[serde(rename_all = "snake_case")]
+pub enum UpgradeStage {
+    Downloading,
+    Verifying,
+    PreFlight,
+    Installing,
+    Restarting,
+}
+
+#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
+#[serde(rename_all = "snake_case")]
+pub enum UpgradeStatus {
+    Running,
+    Succeeded,
+    Failed,
+    Timeout,
+}
+
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
+pub struct UpgradeJobDto {
+    pub server_id: String,
+    pub job_id: String,
+    pub target_version: String,
+    pub stage: UpgradeStage,
+    pub status: UpgradeStatus,
+    pub error: Option<String>,
+    pub backup_path: Option<String>,
+    pub started_at: DateTime<Utc>,
+    pub finished_at: Option<DateTime<Utc>>,
+}
+
+// Add inside `AgentMessage`
+UpgradeProgress {
+    msg_id: String,
+    #[serde(default)]
+    job_id: Option<String>,
+    target_version: String,
+    stage: UpgradeStage,
+},
+UpgradeResult {
+    msg_id: String,
+    #[serde(default)]
+    job_id: Option<String>,
+    target_version: String,
+    stage: UpgradeStage,
+    error: String,
+    #[serde(default)]
+    backup_path: Option<String>,
+},
+
+// Replace the existing `ServerMessage::Upgrade`
+Upgrade {
+    version: String,
+    download_url: String,
+    sha256: String,
+    #[serde(default)]
+    job_id: Option<String>,
+},
+
+pub enum BrowserMessage {
+    FullSync {
+        servers: Vec, // NOTE(review): existing field — element type lost in extraction; keep the current FullSync server type
+        #[serde(default)]
+        upgrades: Vec<UpgradeJobDto>,
+    },
+    AgentInfoUpdated {
+        server_id: String,
+        protocol_version: u32,
+        #[serde(default)]
+        agent_version: Option<String>,
+    },
+    UpgradeProgress {
+        server_id: String,
+        job_id: String,
+        target_version: String,
+        stage: UpgradeStage,
+    },
+    UpgradeResult {
+        server_id: String,
+        job_id: String,
+        target_version: String,
+        status: UpgradeStatus,
+        stage: Option<UpgradeStage>,
+        error: Option<String>,
+        backup_path: Option<String>,
+    },
+}
+```
+
+- [ ] **Step 4: Run the protocol tests to verify they pass** + +Run: + +```bash +cargo test -p serverbee-common test_server_upgrade_with_job_id_round_trip -- --exact +cargo test -p serverbee-common test_upgrade_messages_without_job_id_stay_backward_compatible -- --exact +cargo test -p serverbee-common test_browser_full_sync_with_upgrades_round_trip -- --exact +cargo test -p serverbee-common test_agent_info_updated_accepts_optional_agent_version -- --exact +``` + +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add crates/common/src/protocol.rs +git commit -m "feat(common): add upgrade lifecycle protocol" +``` + +### Task 2: Add the Server Upgrade Tracker and Timeout Worker + +**Files:** +- Create: `crates/server/src/service/upgrade_tracker.rs` +- Modify: `crates/server/src/service/mod.rs` +- Modify: `crates/server/src/state.rs` +- Create: `crates/server/src/task/upgrade_timeout.rs` +- Modify: `crates/server/src/task/mod.rs` +- Modify: `crates/server/src/main.rs` +- Test: `crates/server/src/service/upgrade_tracker.rs` + +- [ ] **Step 1: Write the failing tracker tests** + +```rust +#[tokio::test] +async fn start_job_rejects_a_second_running_job() { + let (browser_tx, _browser_rx) = tokio::sync::broadcast::channel(8); + let tracker = UpgradeJobTracker::new(browser_tx); + + let first = tracker.start_job("s1", "1.2.3").unwrap(); + let err = tracker.start_job("s1", "1.2.4").unwrap_err(); + + match err { + StartUpgradeJobError::Conflict(existing) => { + assert_eq!(existing.job_id, first.job_id); + assert_eq!(existing.target_version, "1.2.3"); + } + } +} + +#[tokio::test] +async fn update_stage_prefers_job_id_and_ignores_stale_messages() { + let (browser_tx, _browser_rx) = tokio::sync::broadcast::channel(8); + let tracker = UpgradeJobTracker::new(browser_tx); + let job = tracker.start_job("s1", "1.2.3").unwrap(); + + tracker.update_stage( + "s1", + UpgradeLookup { + job_id: Some("old-job"), + target_version: "1.2.3", + }, + UpgradeStage::Verifying, + ); + + 
assert_eq!(tracker.get("s1").unwrap().stage, UpgradeStage::Downloading); + + tracker.update_stage( + "s1", + UpgradeLookup { + job_id: Some(job.job_id.as_str()), + target_version: "1.2.3", + }, + UpgradeStage::Verifying, + ); + + assert_eq!(tracker.get("s1").unwrap().stage, UpgradeStage::Verifying); +} + +#[tokio::test] +async fn mark_succeeded_does_not_overwrite_timeout() { + let (browser_tx, _browser_rx) = tokio::sync::broadcast::channel(8); + let tracker = UpgradeJobTracker::new(browser_tx); + tracker.start_job("s1", "1.2.3").unwrap(); + tracker.sweep_timeouts(chrono::Utc::now() + chrono::Duration::seconds(121)); + + tracker.mark_succeeded("s1", "1.2.3"); + + assert_eq!(tracker.get("s1").unwrap().status, UpgradeStatus::Timeout); +} + +#[tokio::test] +async fn cleanup_old_removes_only_expired_terminal_jobs() { + let (browser_tx, _browser_rx) = tokio::sync::broadcast::channel(8); + let tracker = UpgradeJobTracker::new(browser_tx); + tracker.start_job("s1", "1.2.3").unwrap(); + tracker.mark_failed( + "s1", + UpgradeLookup { + job_id: None, + target_version: "1.2.3", + }, + UpgradeStage::Verifying, + "sha256 mismatch".to_string(), + None, + ); + + tracker.cleanup_old(chrono::Utc::now() + chrono::Duration::hours(25)); + + assert!(tracker.get("s1").is_none()); +} +``` + +- [ ] **Step 2: Run the tracker tests to verify they fail** + +Run: + +```bash +cargo test -p serverbee-server start_job_rejects_a_second_running_job -- --exact +cargo test -p serverbee-server update_stage_prefers_job_id_and_ignores_stale_messages -- --exact +``` + +Expected: FAIL because `upgrade_tracker.rs`, `UpgradeLookup`, and `StartUpgradeJobError` do not exist yet. 
+
+- [ ] **Step 3: Implement the tracker, wire it into state, and start the timeout worker**
+
+```rust
+// crates/server/src/service/upgrade_tracker.rs
+pub const UPGRADE_TIMEOUT_SECS: i64 = 120;
+pub const UPGRADE_RETENTION_HOURS: i64 = 24;
+
+#[derive(Clone, Debug)]
+pub struct UpgradeJob {
+    pub job_id: String,
+    pub server_id: String,
+    pub target_version: String,
+    pub started_at: chrono::DateTime<chrono::Utc>,
+    pub stage: UpgradeStage,
+    pub status: UpgradeStatus,
+    pub error: Option<String>,
+    pub backup_path: Option<String>,
+    pub finished_at: Option<chrono::DateTime<chrono::Utc>>,
+}
+
+impl UpgradeJob {
+    pub fn to_dto(&self) -> UpgradeJobDto {
+        UpgradeJobDto {
+            server_id: self.server_id.clone(),
+            job_id: self.job_id.clone(),
+            target_version: self.target_version.clone(),
+            stage: self.stage.clone(),
+            status: self.status.clone(),
+            error: self.error.clone(),
+            backup_path: self.backup_path.clone(),
+            started_at: self.started_at,
+            finished_at: self.finished_at,
+        }
+    }
+}
+
+#[derive(Clone, Copy)]
+pub struct UpgradeLookup<'a> {
+    pub job_id: Option<&'a str>,
+    pub target_version: &'a str,
+}
+
+pub enum StartUpgradeJobError {
+    Conflict(UpgradeJob),
+}
+
+pub struct UpgradeJobTracker {
+    jobs: dashmap::DashMap<String, UpgradeJob>,
+    browser_tx: tokio::sync::broadcast::Sender<BrowserMessage>,
+}
+
+impl UpgradeJobTracker {
+    pub fn start_job(&self, server_id: &str, target_version: &str) -> Result<UpgradeJob, StartUpgradeJobError> {
+        let now = chrono::Utc::now();
+        let job = UpgradeJob {
+            job_id: uuid::Uuid::new_v4().to_string(),
+            server_id: server_id.to_string(),
+            target_version: target_version.to_string(),
+            started_at: now,
+            stage: UpgradeStage::Downloading,
+            status: UpgradeStatus::Running,
+            error: None,
+            backup_path: None,
+            finished_at: None,
+        };
+
+        match self.jobs.entry(server_id.to_string()) {
+            dashmap::mapref::entry::Entry::Occupied(mut entry)
+                if entry.get().status == UpgradeStatus::Running =>
+            {
+                Err(StartUpgradeJobError::Conflict(entry.get().clone()))
+            }
+            dashmap::mapref::entry::Entry::Occupied(mut entry) => {
+                entry.insert(job.clone());
+                self.broadcast_progress(&job);
+                Ok(job)
+            }
+            dashmap::mapref::entry::Entry::Vacant(entry) => {
+                entry.insert(job.clone());
+                self.broadcast_progress(&job);
+                Ok(job)
+            }
+        }
+    }
+
+    pub fn update_stage(&self, server_id: &str, lookup: UpgradeLookup<'_>, stage: UpgradeStage) {
+        let Some(mut job) = self.jobs.get_mut(server_id) else {
+            tracing::warn!("Ignoring upgrade progress for unknown server_id={server_id}");
+            return;
+        };
+
+        if job.status != UpgradeStatus::Running || !matches_lookup(&job, lookup) {
+            tracing::warn!("Ignoring stale upgrade progress for server_id={server_id}");
+            return;
+        }
+
+        job.stage = stage.clone();
+        let snapshot = job.clone();
+        drop(job);
+        self.broadcast_progress(&snapshot);
+    }
+
+    pub fn mark_failed(
+        &self,
+        server_id: &str,
+        lookup: UpgradeLookup<'_>,
+        stage: UpgradeStage,
+        error: String,
+        backup_path: Option<String>,
+    ) {
+        let Some(mut job) = self.jobs.get_mut(server_id) else {
+            return;
+        };
+        if job.status != UpgradeStatus::Running || !matches_lookup(&job, lookup) {
+            return;
+        }
+
+        job.stage = stage.clone();
+        job.status = UpgradeStatus::Failed;
+        job.error = Some(error.clone());
+        job.backup_path = backup_path.clone();
+        job.finished_at = Some(chrono::Utc::now());
+        let snapshot = job.clone();
+        drop(job);
+        self.broadcast_result(&snapshot);
+    }
+
+    pub fn mark_failed_by_capability_denied(&self, server_id: &str) {
+        let Some(job) = self.get(server_id) else {
+            return;
+        };
+        if job.status == UpgradeStatus::Running {
+            self.mark_failed(
+                server_id,
+                UpgradeLookup {
+                    job_id: Some(job.job_id.as_str()),
+                    target_version: &job.target_version,
+                },
+                UpgradeStage::Downloading,
+                "capability denied by agent".to_string(),
+                None,
+            );
+        }
+    }
+
+    pub fn mark_succeeded(&self, server_id: &str, observed_version: &str) {
+        let Some(mut job) = self.jobs.get_mut(server_id) else {
+            return;
+        };
+        if job.status != UpgradeStatus::Running || job.target_version != observed_version {
+            return;
+        }
+
+        job.status = UpgradeStatus::Succeeded;
+        job.finished_at = Some(chrono::Utc::now());
+        let snapshot = job.clone();
+        drop(job);
+        self.broadcast_result(&snapshot);
+    }
+
+    pub fn sweep_timeouts(&self, now: chrono::DateTime<chrono::Utc>) {
+        for mut entry in self.jobs.iter_mut() {
+            if entry.status == UpgradeStatus::Running
+                && entry.started_at + chrono::Duration::seconds(UPGRADE_TIMEOUT_SECS) < now
+            {
+                entry.status = UpgradeStatus::Timeout;
+                entry.finished_at = Some(now);
+                let snapshot = entry.clone();
+                drop(entry);
+                self.broadcast_result(&snapshot);
+            }
+        }
+    }
+
+    pub fn cleanup_old(&self, now: chrono::DateTime<chrono::Utc>) {
+        self.jobs.retain(|_, job| {
+            job.finished_at
+                .map(|finished_at| finished_at + chrono::Duration::hours(UPGRADE_RETENTION_HOURS) >= now)
+                .unwrap_or(true)
+        });
+    }
+    pub fn get(&self, server_id: &str) -> Option<UpgradeJob> { self.jobs.get(server_id).map(|job| job.clone()) }
+    pub fn snapshot(&self) -> Vec<UpgradeJob> { self.jobs.iter().map(|entry| entry.value().clone()).collect() }
+
+    fn broadcast_progress(&self, job: &UpgradeJob) {
+        let _ = self.browser_tx.send(BrowserMessage::UpgradeProgress {
+            server_id: job.server_id.clone(),
+            job_id: job.job_id.clone(),
+            target_version: job.target_version.clone(),
+            stage: job.stage.clone(),
+        });
+    }
+
+    fn broadcast_result(&self, job: &UpgradeJob) {
+        let _ = self.browser_tx.send(BrowserMessage::UpgradeResult {
+            server_id: job.server_id.clone(),
+            job_id: job.job_id.clone(),
+            target_version: job.target_version.clone(),
+            status: job.status.clone(),
+            stage: Some(job.stage.clone()),
+            error: job.error.clone(),
+            backup_path: job.backup_path.clone(),
+        });
+    }
+}
+
+fn matches_lookup(job: &UpgradeJob, lookup: UpgradeLookup<'_>) -> bool {
+    match lookup.job_id {
+        Some(job_id) => job.job_id == job_id,
+        None => job.target_version == lookup.target_version,
+    }
+}
+
+// Add these fields on `AppState`
+pub upgrade_tracker: Arc<UpgradeJobTracker>,
+pub upgrade_release_service: Arc<UpgradeReleaseService>,
+
+// crates/server/src/task/upgrade_timeout.rs
+pub async fn run(state: Arc<AppState>) {
+    let mut ticker = tokio::time::interval(std::time::Duration::from_secs(10));
+    loop {
+        ticker.tick().await;
+        let now = chrono::Utc::now();
+        state.upgrade_tracker.sweep_timeouts(now);
+        state.upgrade_tracker.cleanup_old(now);
+    }
+}
+
+// crates/server/src/main.rs
+let s = state.clone();
+tokio::spawn(async move { task::upgrade_timeout::run(s).await });
+```
+
+- [ ] **Step 4: Run the tracker tests to verify they pass**
+
+Run:
+
+```bash
+cargo test -p serverbee-server start_job_rejects_a_second_running_job -- --exact
+cargo test -p serverbee-server update_stage_prefers_job_id_and_ignores_stale_messages -- --exact
+cargo test -p serverbee-server mark_succeeded_does_not_overwrite_timeout -- --exact
+cargo test -p serverbee-server cleanup_old_removes_only_expired_terminal_jobs -- --exact
+```
+
+Expected: PASS.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add crates/server/src/service/upgrade_tracker.rs crates/server/src/service/mod.rs crates/server/src/state.rs crates/server/src/task/upgrade_timeout.rs crates/server/src/task/mod.rs crates/server/src/main.rs
+git commit -m "feat(server): add upgrade job tracker"
+```
+
+### Task 3: Add Latest-Version Lookup and Release Metadata Resolution
+
+**Files:**
+- Create: `crates/server/src/service/upgrade_release.rs`
+- Modify: `crates/server/src/service/mod.rs`
+- Modify: `crates/server/src/config.rs`
+- Modify: `crates/server/src/router/api/agent.rs`
+- Modify: `crates/server/src/router/api/mod.rs`
+- Modify: `crates/server/src/openapi.rs`
+- Test: `crates/server/src/service/upgrade_release.rs`
+
+- [ ] **Step 1: Write the failing release-service tests**
+
+```rust
+#[test]
+fn github_release_api_url_is_derived_from_release_base_url() {
+    assert_eq!(
+        github_latest_release_api("https://github.com/ZingerLittleBee/ServerBee/releases"),
+        Some("https://api.github.com/repos/ZingerLittleBee/ServerBee/releases/latest".to_string())
+    );
+}
+
+#[test]
+fn normalize_release_tag_strips_optional_v_prefix() {
assert_eq!(normalize_release_tag("v1.2.3"), "1.2.3");
+    assert_eq!(normalize_release_tag("1.2.3"), "1.2.3");
+}
+
+#[test]
+fn cache_ttl_is_longer_for_success_than_failure() {
+    let now = chrono::Utc::now();
+    let success = CachedLatestVersion::success("1.2.3".to_string(), None, now);
+    let failure = CachedLatestVersion::failure("auto-detect failed".to_string(), now);
+
+    assert!(success.expires_at > failure.expires_at);
+}
+```
+
+- [ ] **Step 2: Run the release-service tests to verify they fail**
+
+Run:
+
+```bash
+cargo test -p serverbee-server github_release_api_url_is_derived_from_release_base_url -- --exact
+```
+
+Expected: FAIL because `upgrade_release.rs` and its helpers do not exist yet.
+
+- [ ] **Step 3: Implement the release service, config field, and read route**
+
+```rust
+// crates/server/src/config.rs
+#[derive(Debug, Clone, Deserialize)]
+pub struct UpgradeConfig {
+    #[serde(default = "default_release_base_url")]
+    pub release_base_url: String,
+    #[serde(default)]
+    pub latest_version_url: Option<String>,
+}
+
+// crates/server/src/service/upgrade_release.rs
+#[derive(Clone, Debug, Serialize, utoipa::ToSchema)]
+pub struct LatestAgentVersionResponse {
+    pub version: Option<String>,
+    pub released_at: Option<chrono::DateTime<chrono::Utc>>,
+    pub error: Option<String>,
+}
+
+#[derive(Clone, Debug)]
+pub struct ReleaseAsset {
+    pub download_url: String,
+    pub sha256: String,
+}
+
+pub struct UpgradeReleaseService {
+    client: reqwest::Client,
+    cache: tokio::sync::RwLock<Option<CachedLatestVersion>>,
+}
+
+#[derive(Clone)]
+struct CachedLatestVersion {
+    value: LatestAgentVersionResponse,
+    expires_at: chrono::DateTime<chrono::Utc>,
+}
+
+impl CachedLatestVersion {
+    fn success(
+        version: String,
+        released_at: Option<chrono::DateTime<chrono::Utc>>,
+        now: chrono::DateTime<chrono::Utc>,
+    ) -> Self {
+        Self {
+            value: LatestAgentVersionResponse {
+                version: Some(version),
+                released_at,
+                error: None,
+            },
+            expires_at: now + chrono::Duration::minutes(10),
+        }
+    }
+
+    fn failure(error: String, now: chrono::DateTime<chrono::Utc>) -> Self {
+        Self {
+            value: LatestAgentVersionResponse {
+                version: 
None, + released_at: None, + error: Some(error), + }, + expires_at: now + chrono::Duration::minutes(1), + } + } +} + +fn normalize_release_tag(tag: &str) -> String { + tag.strip_prefix('v').unwrap_or(tag).to_string() +} + +fn github_latest_release_api(base_url: &str) -> Option { + let trimmed = base_url.trim_end_matches('/'); + let parts: Vec<&str> = trimmed.split('/').collect(); + if parts.len() >= 6 + && parts[0] == "https:" + && parts[2] == "github.com" + && parts[5] == "releases" + { + Some(format!( + "https://api.github.com/repos/{}/{}/releases/latest", + parts[3], parts[4] + )) + } else { + None + } +} + +impl UpgradeReleaseService { + pub fn new() -> Self { + Self { + client: reqwest::Client::builder() + .timeout(std::time::Duration::from_secs(10)) + .user_agent(format!("serverbee-server/{}", serverbee_common::constants::VERSION)) + .build() + .unwrap(), + cache: tokio::sync::RwLock::new(None), + } + } + + pub async fn latest(&self, config: &UpgradeConfig) -> LatestAgentVersionResponse { + let now = chrono::Utc::now(); + if let Some(cached) = self.cache.read().await.clone() + && cached.expires_at > now + { + return cached.value; + } + + let fresh = match self.fetch_latest(config).await { + Ok(value) => match value.version.clone() { + Some(version) => CachedLatestVersion::success(version, value.released_at, now), + None => CachedLatestVersion::failure( + value + .error + .unwrap_or_else(|| "latest version unavailable".to_string()), + now, + ), + }, + Err(err) => CachedLatestVersion::failure(err.to_string(), now), + }; + + let response = fresh.value.clone(); + *self.cache.write().await = Some(fresh); + response + } + + pub async fn resolve_asset( + &self, + config: &UpgradeConfig, + version: &str, + asset_name: &str, + ) -> Result { + let base_url = config.release_base_url.trim_end_matches('/'); + let checksums_url = format!("{base_url}/download/v{version}/checksums.txt"); + let response = self + .client + .get(&checksums_url) + .send() + .await + 
.map_err(|err| AppError::Internal(format!("Failed to fetch checksums: {err}")))?; + + if !response.status().is_success() { + return Err(AppError::BadRequest(format!( + "Checksums not found for version v{version} (HTTP {})", + response.status() + ))); + } + + let body = response + .text() + .await + .map_err(|err| AppError::Internal(format!("Failed to read checksums: {err}")))?; + + let sha256 = body + .lines() + .find_map(|line| { + let mut parts = line.split_whitespace(); + let hash = parts.next()?; + let name = parts.next()?; + (name == asset_name).then(|| hash.to_string()) + }) + .ok_or_else(|| { + AppError::BadRequest(format!( + "Checksum not found for {asset_name} in v{version} release" + )) + })?; + + Ok(ReleaseAsset { + download_url: format!("{base_url}/download/v{version}/{asset_name}"), + sha256, + }) + } + + async fn fetch_latest( + &self, + config: &UpgradeConfig, + ) -> anyhow::Result { + if let Some(url) = &config.latest_version_url { + let payload = self + .client + .get(url) + .send() + .await? + .json::() + .await?; + + return Ok(LatestAgentVersionResponse { + version: payload + .get("version") + .and_then(|value| value.as_str()) + .map(ToString::to_string), + released_at: payload + .get("released_at") + .and_then(|value| value.as_str()) + .and_then(|value| value.parse().ok()), + error: None, + }); + } + + if let Some(url) = github_latest_release_api(&config.release_base_url) { + let payload = self + .client + .get(url) + .send() + .await? 
+ .json::() + .await?; + + return Ok(LatestAgentVersionResponse { + version: payload + .get("tag_name") + .and_then(|value| value.as_str()) + .map(normalize_release_tag), + released_at: payload + .get("published_at") + .and_then(|value| value.as_str()) + .and_then(|value| value.parse().ok()), + error: None, + }); + } + + Ok(LatestAgentVersionResponse { + version: None, + released_at: None, + error: Some("auto-detect failed; set upgrade.latest_version_url".to_string()), + }) + } +} + +// crates/server/src/router/api/agent.rs +pub fn read_router() -> Router> { + Router::new().route("/agent/latest-version", get(get_latest_version)) +} + +async fn get_latest_version( + State(state): State>, +) -> Result>, AppError> { + ok(state.upgrade_release_service.latest(&state.config.upgrade).await) +} + +// crates/server/src/router/api/mod.rs +.merge(agent::read_router()) +``` + +- [ ] **Step 4: Run the release-service tests to verify they pass** + +Run: + +```bash +cargo test -p serverbee-server github_release_api_url_is_derived_from_release_base_url -- --exact +cargo test -p serverbee-server normalize_release_tag_strips_optional_v_prefix -- --exact +cargo test -p serverbee-server cache_ttl_is_longer_for_success_than_failure -- --exact +``` + +Expected: PASS. 
+ +- [ ] **Step 5: Commit** + +```bash +git add crates/server/src/service/upgrade_release.rs crates/server/src/service/mod.rs crates/server/src/config.rs crates/server/src/router/api/agent.rs crates/server/src/router/api/mod.rs crates/server/src/openapi.rs +git commit -m "feat(server): add upgrade release lookup" +``` + +### Task 4: Wire Upgrade Routes and WebSocket State Changes on the Server + +**Files:** +- Modify: `crates/server/src/router/api/server.rs` +- Modify: `crates/server/src/router/ws/agent.rs` +- Modify: `crates/server/src/router/ws/browser.rs` +- Modify: `crates/server/src/openapi.rs` +- Test: `crates/server/src/router/ws/browser.rs` + +- [ ] **Step 1: Write the failing FullSync hydration test** + +```rust +#[tokio::test] +async fn build_full_sync_includes_upgrade_snapshot() { + let state = test_browser_state().await; + state.upgrade_tracker.start_job("s1", "1.2.3").unwrap(); + + match build_full_sync(&state).await { + BrowserMessage::FullSync { upgrades, .. } => { + assert_eq!(upgrades.len(), 1); + assert_eq!(upgrades[0].server_id, "s1"); + assert_eq!(upgrades[0].status, UpgradeStatus::Running); + } + _ => panic!("Expected FullSync"), + } +} + +async fn test_browser_state() -> Arc { + let tmp = tempfile::tempdir().unwrap(); + let db = sea_orm::Database::connect("sqlite::memory:").await.unwrap(); + crate::migration::Migrator::up(&db, None).await.unwrap(); + + let config = AppConfig { + server: ServerConfig { + listen: "127.0.0.1:0".to_string(), + data_dir: tmp.path().display().to_string(), + trusted_proxies: Vec::new(), + }, + auth: AuthConfig { + secure_cookie: false, + ..AuthConfig::default() + }, + ..AppConfig::default() + }; + + AppState::new(db, config).await.unwrap() +} +``` + +- [ ] **Step 2: Run the FullSync test to verify it fails** + +Run: + +```bash +cargo test -p serverbee-server build_full_sync_includes_upgrade_snapshot -- --exact +``` + +Expected: FAIL because `BrowserMessage::FullSync` does not yet include `upgrades`. 
+ +- [ ] **Step 3: Implement the route and websocket wiring** + +```rust +// crates/server/src/router/api/server.rs +#[derive(Debug, Serialize, utoipa::ToSchema)] +pub struct TriggerUpgradeResponse { + pub job: UpgradeJobDto, +} + +#[utoipa::path( + get, + path = "/api/servers/{id}/upgrade", + tag = "servers", + responses((status = 200, body = Option)) +)] +async fn get_upgrade_job( + State(state): State>, + Path(id): Path, +) -> Result>>, AppError> { + ok(state.upgrade_tracker.get(&id).map(|job| job.to_dto())) +} + +async fn trigger_upgrade( + State(state): State>, + Path(id): Path, + Json(body): Json, +) -> Result { + // Preserve the existing CAP_UPGRADE pre-check from server.rs:536-544. + // This must run BEFORE `start_job` so capability rejections never leave a + // phantom Running job in the tracker. + let server = ServerService::get(&state.db, &id) + .await? + .ok_or(AppError::NotFound)?; + if !has_capability(server.capabilities as u32, CAP_UPGRADE) { + return Err(AppError::Forbidden( + "CAP_UPGRADE is not enabled for this server".into(), + )); + } + + let version = normalize_version(&body.version); + let (os_raw, arch_raw) = state + .agent_manager + .get_agent_platform(&id) + .ok_or_else(|| AppError::Conflict("Agent not connected".into()))?; + + let os = map_os(&os_raw) + .ok_or_else(|| AppError::BadRequest(format!("Unsupported agent OS: {os_raw}")))?; + let arch = map_arch(&arch_raw) + .ok_or_else(|| AppError::BadRequest(format!("Unsupported agent arch: {arch_raw}")))?; + let asset_name = if os == "windows" { + format!("serverbee-agent-{os}-{arch}.exe") + } else { + format!("serverbee-agent-{os}-{arch}") + }; + let release = state + .upgrade_release_service + .resolve_asset(&state.config.upgrade, version, &asset_name) + .await?; + + let job = match state.upgrade_tracker.start_job(&id, version) { + Ok(job) => job, + Err(StartUpgradeJobError::Conflict(existing)) => { + return Ok(( + axum::http::StatusCode::CONFLICT, + Json(ApiResponse { + data: 
TriggerUpgradeResponse { + job: existing.to_dto(), + }, + }), + )); + } + }; + + let sender = state + .agent_manager + .get_sender(&id) + .ok_or_else(|| AppError::Conflict("Agent not connected".into()))?; + + let msg = ServerMessage::Upgrade { + version: version.to_string(), + download_url: release.download_url, + sha256: release.sha256, + job_id: Some(job.job_id.clone()), + }; + + if let Err(err) = sender.send(msg).await { + state.upgrade_tracker.mark_failed( + &id, + UpgradeLookup { + job_id: Some(job.job_id.as_str()), + target_version: version, + }, + UpgradeStage::Downloading, + format!("failed to notify agent: {err}"), + None, + ); + return Err(AppError::Internal("Failed to send upgrade command".into())); + } + + Ok(( + axum::http::StatusCode::ACCEPTED, + Json(ApiResponse { + data: TriggerUpgradeResponse { job: job.to_dto() }, + }), + )) +} + +// crates/server/src/router/ws/agent.rs +AgentMessage::UpgradeProgress { + job_id, + target_version, + stage, + .. +} => { + state.upgrade_tracker.update_stage( + server_id, + UpgradeLookup { + job_id: job_id.as_deref(), + target_version: &target_version, + }, + stage, + ); +} +AgentMessage::UpgradeResult { + job_id, + target_version, + stage, + error, + backup_path, + .. +} => { + state.upgrade_tracker.mark_failed( + server_id, + UpgradeLookup { + job_id: job_id.as_deref(), + target_version: &target_version, + }, + stage, + error, + backup_path, + ); +} +AgentMessage::SystemInfo { info, .. 
} => { + ServerService::update_system_info(&state.db, server_id, &info, region, country_code) + .await + .expect("system info update should succeed"); + state.agent_manager.broadcast_browser(BrowserMessage::AgentInfoUpdated { + server_id: server_id.to_string(), + protocol_version: agent_pv, + agent_version: Some(info.agent_version.clone()), + }); + + if let Some(job) = state.upgrade_tracker.get(server_id) { + if job.status == UpgradeStatus::Running && info.agent_version == job.target_version { + state.upgrade_tracker.mark_succeeded(server_id, &info.agent_version); + } else if job.status == UpgradeStatus::Running && info.agent_version != job.target_version { + state.upgrade_tracker.mark_failed( + server_id, + UpgradeLookup { + job_id: Some(job.job_id.as_str()), + target_version: &job.target_version, + }, + UpgradeStage::Restarting, + format!( + "agent reconnected with unexpected version {}, expected {}", + info.agent_version, job.target_version + ), + None, + ); + } + } +} +// IMPORTANT: Do NOT add a new CapabilityDenied match arm — the existing one at +// ws/agent.rs:586-627 is a single catch-all that already handles exec/terminal +// cleanup. Any new sibling arm with a guard would be unreachable for +// `capability == "upgrade"`. Instead, fold the upgrade-specific call INTO the +// existing arm body, near the top, while leaving the existing +// exec/terminal logic untouched: +// +// AgentMessage::CapabilityDenied { msg_id, session_id, capability, reason } => { +// tracing::warn!(/* existing log line */); +// if capability == "upgrade" { +// state.upgrade_tracker.mark_failed_by_capability_denied(server_id); +// } +// // ... existing exec/terminal dispatch logic unchanged ... 
+// } + +// crates/server/src/router/ws/browser.rs +BrowserMessage::FullSync { + servers: statuses, + upgrades: state + .upgrade_tracker + .snapshot() + .into_iter() + .map(|job| job.to_dto()) + .collect(), +} +``` + +- [ ] **Step 4: Run the server route/ws tests to verify they pass** + +Run: + +```bash +cargo test -p serverbee-server build_full_sync_includes_upgrade_snapshot -- --exact +cargo test -p serverbee-server trigger_upgrade -- --nocapture +``` + +Expected: the FullSync test passes and the `trigger_upgrade` unit test module still passes after the route response shape change. + +- [ ] **Step 5: Commit** + +```bash +git add crates/server/src/router/api/server.rs crates/server/src/router/ws/agent.rs crates/server/src/router/ws/browser.rs crates/server/src/openapi.rs +git commit -m "feat(server): wire upgrade api and ws state" +``` + +### Task 5: Harden Agent Upgrade Execution and Emit Lifecycle Events + +**Files:** +- Modify: `crates/agent/src/reporter.rs` +- Test: `crates/agent/src/reporter.rs` + +- [ ] **Step 1: Write the failing agent-side helper tests** + +```rust +#[test] +fn verify_sha256_rejects_mismatched_hash() { + let err = verify_sha256(b"hello", "deadbeef").unwrap_err(); + assert!(err.to_string().contains("sha256 mismatch")); +} + +#[tokio::test] +async fn run_preflight_rejects_non_zero_exit() { + let script = tempfile::NamedTempFile::new().unwrap(); + std::fs::write(script.path(), "#!/bin/sh\nexit 7\n").unwrap(); + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + std::fs::set_permissions(script.path(), std::fs::Permissions::from_mode(0o755)).unwrap(); + } + + let err = run_preflight(script.path(), Duration::from_secs(1)).await.unwrap_err(); + assert!(err.to_string().contains("preflight")); +} + +#[test] +fn cleanup_old_backups_removes_only_stale_backup_files() { + let dir = tempfile::tempdir().unwrap(); + let stale = dir.path().join("serverbee-agent.bak.20260414-000000"); + let fresh = 
dir.path().join("serverbee-agent.bak.20260414-235959"); + std::fs::write(&stale, b"old").unwrap(); + std::fs::write(&fresh, b"new").unwrap(); + + cleanup_old_backups(dir.path(), chrono::Utc::now() + chrono::Duration::hours(25)).unwrap(); + + assert!(!stale.exists()); + assert!(fresh.exists()); +} +``` + +- [ ] **Step 2: Run the agent helper tests to verify they fail** + +Run: + +```bash +cargo test -p serverbee-agent verify_sha256_rejects_mismatched_hash -- --exact +cargo test -p serverbee-agent run_preflight_rejects_non_zero_exit -- --exact +``` + +Expected: FAIL because the extracted helpers and backup cleanup do not exist yet. + +- [ ] **Step 3: Implement progress emission, safer install, and concurrency protection** + +```rust +async fn emit_upgrade_progress( + tx: &tokio::sync::mpsc::Sender, + job_id: Option, + target_version: &str, + stage: UpgradeStage, +) { + let _ = tx + .send(AgentMessage::UpgradeProgress { + msg_id: uuid::Uuid::new_v4().to_string(), + job_id, + target_version: target_version.to_string(), + stage, + }) + .await; +} + +async fn emit_upgrade_failure( + tx: &tokio::sync::mpsc::Sender, + job_id: Option, + target_version: &str, + stage: UpgradeStage, + error: String, + backup_path: Option, +) { + let _ = tx + .send(AgentMessage::UpgradeResult { + msg_id: uuid::Uuid::new_v4().to_string(), + job_id, + target_version: target_version.to_string(), + stage, + error, + backup_path, + }) + .await; +} + +fn verify_sha256(bytes: &[u8], expected: &str) -> anyhow::Result<()> { + use sha2::{Digest, Sha256}; + + let actual = format!("{:x}", Sha256::digest(bytes)); + if actual != expected { + anyhow::bail!("sha256 mismatch: got {actual}, want {expected}"); + } + + Ok(()) +} + +async fn download_upgrade_bytes(download_url: &str) -> anyhow::Result> { + let client = reqwest::Client::builder() + .timeout(Duration::from_secs(600)) + .build()?; + let response = client + .get(download_url) + .header("User-Agent", "ServerBee-Agent") + .send() + .await?; + + if 
!response.status().is_success() { + anyhow::bail!("http {}", response.status()); + } + + Ok(response.bytes().await?.to_vec()) +} + +fn set_executable_permissions(path: &std::path::Path) -> anyhow::Result<()> { + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + std::fs::set_permissions(path, std::fs::Permissions::from_mode(0o755))?; + } + + Ok(()) +} + +async fn run_preflight(path: &std::path::Path, timeout: Duration) -> anyhow::Result<()> { + let status = tokio::time::timeout( + timeout, + tokio::process::Command::new(path).arg("--version").status(), + ) + .await + .map_err(|_| anyhow::anyhow!("preflight timed out"))??; + + if !status.success() { + anyhow::bail!("preflight failed with status {status}"); + } + Ok(()) +} + +fn cleanup_old_backups(dir: &std::path::Path, now: chrono::DateTime) -> anyhow::Result<()> { + for entry in std::fs::read_dir(dir)? { + let entry = entry?; + let path = entry.path(); + if path + .file_name() + .and_then(|name| name.to_str()) + .is_some_and(|name| name.contains(".bak.")) + { + let modified: chrono::DateTime = entry.metadata()?.modified()?.into(); + if modified < now - chrono::Duration::hours(24) { + let _ = std::fs::remove_file(path); + } + } + } + Ok(()) +} + +async fn perform_upgrade( + tx: &tokio::sync::mpsc::Sender, + job_id: Option, + version: &str, + download_url: &str, + sha256: &str, +) -> anyhow::Result<()> { + let current_exe = std::env::current_exe()?; + let tmp_path = current_exe.with_extension("new"); + let backup_path = current_exe.with_extension(format!( + "bak.{}", + chrono::Utc::now().format("%Y%m%d-%H%M%S") + )); + + emit_upgrade_progress(tx, job_id.clone(), version, UpgradeStage::Downloading).await; + let bytes = download_upgrade_bytes(download_url).await.map_err(|err| { + anyhow::anyhow!("download failed: {err}") + })?; + + emit_upgrade_progress(tx, job_id.clone(), version, UpgradeStage::Verifying).await; + if let Err(err) = verify_sha256(&bytes, sha256) { + let _ = 
tokio::fs::remove_file(&tmp_path).await;
+        emit_upgrade_failure(tx, job_id.clone(), version, UpgradeStage::Verifying, err.to_string(), None).await;
+        return Err(err);
+    }
+
+    tokio::fs::write(&tmp_path, &bytes).await?;
+    set_executable_permissions(&tmp_path)?;
+
+    emit_upgrade_progress(tx, job_id.clone(), version, UpgradeStage::PreFlight).await;
+    if let Err(err) = run_preflight(&tmp_path, Duration::from_secs(5)).await {
+        let _ = tokio::fs::remove_file(&tmp_path).await;
+        emit_upgrade_failure(tx, job_id.clone(), version, UpgradeStage::PreFlight, err.to_string(), None).await;
+        return Err(err);
+    }
+
+    emit_upgrade_progress(tx, job_id.clone(), version, UpgradeStage::Installing).await;
+    std::fs::rename(&current_exe, &backup_path)?;
+    if let Err(err) = std::fs::rename(&tmp_path, &current_exe) {
+        let _ = std::fs::rename(&backup_path, &current_exe);
+        emit_upgrade_failure(
+            tx,
+            job_id.clone(),
+            version,
+            UpgradeStage::Installing,
+            err.to_string(),
+            Some(backup_path.display().to_string()),
+        )
+        .await;
+        return Err(err.into());
+    }
+
+    emit_upgrade_progress(tx, job_id.clone(), version, UpgradeStage::Restarting).await;
+    if let Err(err) = std::process::Command::new(&current_exe).args(std::env::args().skip(1)).spawn() {
+        let _ = std::fs::rename(&backup_path, &current_exe);
+        emit_upgrade_failure(
+            tx,
+            job_id,
+            version,
+            UpgradeStage::Restarting,
+            err.to_string(),
+            Some(backup_path.display().to_string()),
+        )
+        .await;
+        return Err(err.into());
+    }
+
+    std::process::exit(0);
+}
+```
+
+- [ ] **Step 4: Run the agent helper tests to verify they pass**
+
+Run:
+
+```bash
+cargo test -p serverbee-agent verify_sha256_rejects_mismatched_hash -- --exact
+cargo test -p serverbee-agent run_preflight_rejects_non_zero_exit -- --exact
+cargo test -p serverbee-agent cleanup_old_backups_removes_only_stale_backup_files -- --exact
+```
+
+Expected: PASS. 
+ +- [ ] **Step 5: Commit** + +```bash +git add crates/agent/src/reporter.rs +git commit -m "feat(agent): report upgrade progress and harden restart" +``` + +### Task 6: Add Server Integration Coverage for the Upgrade Lifecycle + +**Files:** +- Modify: `crates/server/tests/integration.rs` +- Test: `crates/server/tests/integration.rs` + +- [ ] **Step 1: Write the failing upgrade integration tests** + +```rust +#[tokio::test] +async fn upgrade_success_marks_job_succeeded_and_updates_agent_version() { + let (base_url, state, _tmp) = start_test_server_with_state().await; + let client = http_client(); + login_admin(&client, &base_url).await; + let (server_id, token) = register_agent(&client, &base_url).await; + let (mut ws_tx, mut ws_rx) = connect_agent(&base_url, &token).await; + + send_system_info(&mut ws_tx, &mut ws_rx, "0.2.0").await; + + let resp = client + .post(format!("{}/api/servers/{}/upgrade", base_url, server_id)) + .json(&json!({ "version": "0.3.0" })) + .send() + .await + .unwrap(); + + assert_eq!(resp.status(), 202); + let body: serde_json::Value = resp.json().await.unwrap(); + let job_id = body["data"]["job"]["job_id"].as_str().unwrap().to_string(); + + send_upgrade_progress(&mut ws_tx, &job_id, "0.3.0", "downloading").await; + send_upgrade_progress(&mut ws_tx, &job_id, "0.3.0", "restarting").await; + drop(ws_tx); + let (mut ws_tx, mut ws_rx) = connect_agent(&base_url, &token).await; + send_system_info(&mut ws_tx, &mut ws_rx, "0.3.0").await; + + let job = state.upgrade_tracker.get(&server_id).unwrap(); + assert_eq!(job.status, UpgradeStatus::Succeeded); + assert_eq!(job.target_version, "0.3.0"); +} + +#[tokio::test] +async fn capability_denied_upgrade_fails_immediately() { + let (base_url, state, _tmp) = start_test_server_with_state().await; + let client = http_client(); + login_admin(&client, &base_url).await; + let (server_id, token) = register_agent(&client, &base_url).await; + let (mut ws_tx, mut ws_rx) = connect_agent(&base_url, &token).await; + + 
send_system_info(&mut ws_tx, &mut ws_rx, "0.2.0").await; + enable_server_upgrade_capability(&client, &base_url, &server_id).await; + + let resp = client + .post(format!("{}/api/servers/{}/upgrade", base_url, server_id)) + .json(&json!({ "version": "0.3.0" })) + .send() + .await + .unwrap(); + assert_eq!(resp.status(), 202); + + send_capability_denied(&mut ws_tx, "upgrade").await; + + let job = state.upgrade_tracker.get(&server_id).unwrap(); + assert_eq!(job.status, UpgradeStatus::Failed); + assert!(job.error.unwrap().contains("capability denied")); +} + +#[tokio::test] +async fn full_sync_contains_running_and_failed_upgrade_jobs() { + let (base_url, state, _tmp) = start_test_server_with_state().await; + let client = http_client(); + login_admin(&client, &base_url).await; + let (server_id, token) = register_agent(&client, &base_url).await; + let (mut agent_tx, mut agent_rx) = connect_agent(&base_url, &token).await; + send_system_info(&mut agent_tx, &mut agent_rx, "0.2.0").await; + + let job = state.upgrade_tracker.start_job(&server_id, "0.3.0").unwrap(); + state.upgrade_tracker.mark_failed( + &server_id, + UpgradeLookup { + job_id: Some(job.job_id.as_str()), + target_version: "0.3.0", + }, + UpgradeStage::Verifying, + "sha256 mismatch".to_string(), + None, + ); + + let api_key = create_api_key(&client, &base_url).await; + let mut browser = connect_browser_ws(&base_url, &api_key).await; + let full_sync = next_browser_message(&mut browser).await; + assert_eq!(full_sync["type"], "full_sync"); + assert_eq!(full_sync["upgrades"][0]["status"], "failed"); +} + +#[tokio::test] +async fn upgrade_result_failure_marks_job_failed_with_reason() { + // Covers spec integration scenario #2 — Verifying failure path. + // Proves the agent→server UpgradeResult message round-trip correctly + // flips the job to Failed and surfaces the agent-reported error string. 
+ let (base_url, state, _tmp) = start_test_server_with_state().await; + let client = http_client(); + login_admin(&client, &base_url).await; + let (server_id, token) = register_agent(&client, &base_url).await; + let (mut ws_tx, mut ws_rx) = connect_agent(&base_url, &token).await; + + send_system_info(&mut ws_tx, &mut ws_rx, "0.2.0").await; + enable_server_upgrade_capability(&client, &base_url, &server_id).await; + + let resp = client + .post(format!("{}/api/servers/{}/upgrade", base_url, server_id)) + .json(&json!({ "version": "0.3.0" })) + .send() + .await + .unwrap(); + assert_eq!(resp.status(), 202); + let body: serde_json::Value = resp.json().await.unwrap(); + let job_id = body["data"]["job"]["job_id"].as_str().unwrap().to_string(); + + send_upgrade_progress(&mut ws_tx, &job_id, "0.3.0", "downloading").await; + send_upgrade_result_failure( + &mut ws_tx, + &job_id, + "0.3.0", + "verifying", + "sha256 mismatch: got abc, want def", + None, + ) + .await; + + // Poll briefly because WS dispatch is async. + let job = wait_for_job_status(&state, &server_id, UpgradeStatus::Failed).await; + assert_eq!(job.stage, UpgradeStage::Verifying); + assert!(job.error.unwrap().contains("sha256 mismatch")); +} + +#[tokio::test] +async fn upgrade_timeout_sweeper_flips_stuck_running_job() { + // Covers spec integration scenario #3 — timeout path. + // Proves the 120s timeout sweeper observably flips a Running job to Timeout + // and that the change is broadcast to connected browsers. 
+ let (base_url, state, _tmp) = start_test_server_with_state().await; + let client = http_client(); + login_admin(&client, &base_url).await; + let (server_id, token) = register_agent(&client, &base_url).await; + let (mut ws_tx, mut ws_rx) = connect_agent(&base_url, &token).await; + send_system_info(&mut ws_tx, &mut ws_rx, "0.2.0").await; + + // Seed a Running job directly via tracker, then backdate `started_at` past + // the timeout window (the sweeper uses Utc::now() internally, so we either + // call the sweeper with a synthetic `now` or inject a mutator helper on + // the tracker used only in cfg(test)). + let job = state.upgrade_tracker.start_job(&server_id, "0.3.0").unwrap(); + state + .upgrade_tracker + .test_override_started_at(&server_id, Utc::now() - chrono::Duration::seconds(121)); + + state.upgrade_tracker.sweep_timeouts(Utc::now()); + + let observed = state.upgrade_tracker.get(&server_id).unwrap(); + assert_eq!(observed.status, UpgradeStatus::Timeout); + assert_eq!(observed.job_id, job.job_id); + + // Prove the browser broadcast went out as well so the UI receives the flip. + let api_key = create_api_key(&client, &base_url).await; + let mut browser = connect_browser_ws(&base_url, &api_key).await; + let full_sync = next_browser_message(&mut browser).await; + assert_eq!(full_sync["upgrades"][0]["status"], "timeout"); +} +``` + +> **Note for implementer**: the timeout test uses `test_override_started_at` — add this as a `#[cfg(any(test, feature = "test-util"))]` helper on `UpgradeJobTracker` that only exists in test builds. Alternatively, refactor `sweep_timeouts` to accept an injected `now: DateTime` (Task 2 already does this — just make sure the helper to backdate an existing job lives in `cfg(test)` to avoid polluting production API). +> +> `send_upgrade_result_failure` and `wait_for_job_status` are new harness helpers — add them alongside the existing `send_upgrade_progress` / `send_capability_denied` in the integration harness module (Step 3). 
+ +- [ ] **Step 2: Run the upgrade integration tests to verify they fail** + +Run: + +```bash +cargo test -p serverbee-server upgrade_success_marks_job_succeeded_and_updates_agent_version -- --exact --nocapture +cargo test -p serverbee-server capability_denied_upgrade_fails_immediately -- --exact --nocapture +cargo test -p serverbee-server full_sync_contains_running_and_failed_upgrade_jobs -- --exact --nocapture +cargo test -p serverbee-server upgrade_result_failure_marks_job_failed_with_reason -- --exact --nocapture +cargo test -p serverbee-server upgrade_timeout_sweeper_flips_stuck_running_job -- --exact --nocapture +``` + +Expected: all five FAIL because the new tracker-backed route responses, WS upgrade messages, and helper functions (`send_upgrade_result_failure`, `wait_for_job_status`, `test_override_started_at`) are not wired yet. + +- [ ] **Step 3: Extend the existing integration harness until the new tests pass** + +```rust +async fn start_test_server_with_state() -> (String, Arc, tempfile::TempDir) { + let tmp = tempfile::tempdir().expect("Failed to create temp dir"); + let data_dir = tmp.path().to_str().unwrap().to_string(); + let config = AppConfig { + server: ServerConfig { + listen: "127.0.0.1:0".to_string(), + data_dir: data_dir.clone(), + trusted_proxies: Vec::new(), + }, + database: DatabaseConfig { + path: "test.db".to_string(), + max_connections: 5, + }, + auth: AuthConfig { + session_ttl: 86400, + auto_discovery_key: "test-key".to_string(), + secure_cookie: false, + max_servers: 0, + }, + admin: AdminConfig { + username: "admin".to_string(), + password: "testpass".to_string(), + }, + ..AppConfig::default() + }; + + let db = Database::connect(format!("sqlite://{data_dir}/test.db?mode=rwc")) + .await + .expect("Failed to connect to test database"); + Migrator::up(&db, None).await.expect("Failed to run migrations"); + AuthService::init_admin(&db, &config.admin).await.expect("Failed to init admin"); + ConfigService::set(&db, "auto_discovery_key", 
"test-key") + .await + .expect("Failed to set auto_discovery_key"); + + let state = AppState::new(db, config).await.expect("Failed to create AppState"); + let app_state = state.clone(); + let app = create_router(state); + let listener = tokio::net::TcpListener::bind("127.0.0.1:0") + .await + .expect("Failed to bind listener"); + let addr = listener.local_addr().unwrap(); + let base_url = format!("http://{}", addr); + tokio::spawn(async move { + axum::serve( + listener, + app.into_make_service_with_connect_info::(), + ) + .await + .unwrap(); + }); + tokio::time::sleep(std::time::Duration::from_millis(50)).await; + (base_url, app_state, tmp) +} + +type AgentSocket = + tokio_tungstenite::WebSocketStream>; +type AgentSink = futures_util::stream::SplitSink; +type AgentStream = futures_util::stream::SplitStream; +type BrowserStream = futures_util::stream::SplitStream; + +async fn send_system_info( + ws_tx: &mut AgentSink, + ws_rx: &mut AgentStream, + agent_version: &str, +) { + ws_tx + .send(tungstenite::Message::Text( + json!({ + "type": "system_info", + "msg_id": uuid::Uuid::new_v4().to_string(), + "cpu_name": "Test CPU", + "cpu_cores": 4, + "cpu_arch": "x86_64", + "os": "Linux", + "kernel_version": "6.1", + "mem_total": 1024, + "swap_total": 0, + "disk_total": 1024, + "agent_version": agent_version, + "protocol_version": 3, + "features": [] + }) + .to_string() + .into(), + )) + .await + .unwrap(); + + wait_for_ack(ws_rx).await; +} + +async fn send_upgrade_progress(ws_tx: &mut AgentSink, job_id: &str, version: &str, stage: &str) { + ws_tx + .send(tungstenite::Message::Text( + json!({ + "type": "upgrade_progress", + "msg_id": uuid::Uuid::new_v4().to_string(), + "job_id": job_id, + "target_version": version, + "stage": stage + }) + .to_string() + .into(), + )) + .await + .unwrap(); +} + +async fn send_capability_denied(ws_tx: &mut AgentSink, capability: &str) { + ws_tx + .send(tungstenite::Message::Text( + json!({ + "type": "capability_denied", + "capability": 
capability, + "reason": "agent_capability_disabled" + }) + .to_string() + .into(), + )) + .await + .unwrap(); +} + +async fn wait_for_ack(ws_rx: &mut AgentStream) { + while let Some(Ok(message)) = ws_rx.next().await { + if let tungstenite::Message::Text(text) = message + && text.contains("\"type\":\"ack\"") + { + break; + } + } +} + +async fn enable_server_upgrade_capability(client: &reqwest::Client, base_url: &str, server_id: &str) { + let resp = client + .put(format!("{}/api/servers/{}", base_url, server_id)) + .json(&json!({ "capabilities": serverbee_common::constants::CAP_DEFAULT | serverbee_common::constants::CAP_UPGRADE })) + .send() + .await + .unwrap(); + assert_eq!(resp.status(), 200); +} + +async fn connect_browser_ws(base_url: &str, api_key: &str) -> BrowserStream { + let ws_url = format!("{}/api/ws/servers", base_url.replace("http://", "ws://")); + let mut request = ws_url.into_client_request().unwrap(); + request + .headers_mut() + .insert("x-api-key", HeaderValue::from_str(api_key).unwrap()); + let (stream, _) = tokio_tungstenite::connect_async(request).await.unwrap(); + let (_, read) = stream.split(); + read +} + +async fn next_browser_message(browser: &mut BrowserStream) -> serde_json::Value { + while let Some(Ok(message)) = browser.next().await { + if let tungstenite::Message::Text(text) = message { + return serde_json::from_str(&text).unwrap(); + } + } + panic!("browser websocket closed before a message arrived") +} +``` + +- [ ] **Step 4: Run the upgrade integration tests to verify they pass** + +Run: + +```bash +cargo test -p serverbee-server upgrade_success_marks_job_succeeded_and_updates_agent_version -- --exact --nocapture +cargo test -p serverbee-server capability_denied_upgrade_fails_immediately -- --exact --nocapture +cargo test -p serverbee-server full_sync_contains_running_and_failed_upgrade_jobs -- --exact --nocapture +``` + +Expected: PASS. 
+ +- [ ] **Step 5: Commit** + +```bash +git add crates/server/tests/integration.rs +git commit -m "test(server): cover upgrade lifecycle" +``` + +### Task 7: Add the Frontend Upgrade Store, Hook, and WebSocket Hydration + +**Files:** +- Create: `apps/web/src/stores/upgrade-jobs-store.ts` +- Create: `apps/web/src/stores/upgrade-jobs-store.test.ts` +- Create: `apps/web/src/hooks/use-upgrade-job.ts` +- Modify: `apps/web/src/hooks/use-servers-ws.ts` +- Modify: `apps/web/src/hooks/use-servers-ws.test.ts` +- Modify: `apps/web/src/lib/api-schema.ts` +- Regenerate: `apps/web/src/lib/api-types.ts` + +- [ ] **Step 1: Write the failing store and websocket tests** + +```ts +import { QueryClient } from '@tanstack/react-query' +import { beforeEach, describe, expect, it } from 'vitest' +import { handleWsMessage } from '@/hooks/use-servers-ws' +import { useUpgradeJobsStore } from './upgrade-jobs-store' + +const runningJob = { + server_id: 's1', + job_id: 'job-1', + target_version: '1.2.3', + stage: 'downloading', + status: 'running', + error: null, + backup_path: null, + started_at: '2026-04-14T00:00:00Z', + finished_at: null +} as const + +beforeEach(() => { + useUpgradeJobsStore.setState({ jobs: {} }) +}) + +describe('upgrade store', () => { + it('setJobs replaces the current snapshot', () => { + useUpgradeJobsStore.getState().setJobs([runningJob]) + useUpgradeJobsStore.getState().setJobs([]) + expect(useUpgradeJobsStore.getState().jobs).toEqual({}) + }) + + it('upsertJob ignores duplicate terminal payloads for the same job', () => { + useUpgradeJobsStore.getState().upsertJob({ + ...runningJob, + status: 'failed', + stage: 'verifying', + error: 'sha256 mismatch', + finished_at: '2026-04-14T00:00:03Z' + }) + useUpgradeJobsStore.getState().upsertJob({ + ...runningJob, + status: 'failed', + stage: 'verifying', + error: 'sha256 mismatch', + finished_at: '2026-04-14T00:00:03Z' + }) + expect(Object.keys(useUpgradeJobsStore.getState().jobs)).toHaveLength(1) + }) +}) + 
+describe('handleWsMessage', () => { + it('hydrates full_sync upgrades into the store', () => { + const queryClient = new QueryClient() + handleWsMessage({ type: 'full_sync', servers: [], upgrades: [runningJob] }, queryClient) + expect(useUpgradeJobsStore.getState().jobs.s1?.job_id).toBe('job-1') + }) + + it('patches agent_version into server detail cache', () => { + const queryClient = new QueryClient() + queryClient.setQueryData(['servers', 's1'], { id: 's1', agent_version: '0.2.0' }) + + handleWsMessage( + { type: 'agent_info_updated', server_id: 's1', protocol_version: 3, agent_version: '1.2.3' }, + queryClient + ) + + expect(queryClient.getQueryData(['servers', 's1'])).toMatchObject({ agent_version: '1.2.3' }) + }) +}) +``` + +- [ ] **Step 2: Run the frontend state tests to verify they fail** + +Run: + +```bash +bun --cwd apps/web x vitest run src/stores/upgrade-jobs-store.test.ts src/hooks/use-servers-ws.test.ts +``` + +Expected: FAIL because the store, exported `handleWsMessage`, and new WS cases do not exist yet. + +> **Note for implementer**: `handleWsMessage` currently exists as a *private* `function` at `apps/web/src/hooks/use-servers-ws.ts:299`. The test at Step 1 imports it, so Step 3 must convert it from `function` to `export function` (the snippet below shows the exported form). Do not create a second, separate `handleWsMessage` — edit the existing one in place. 
+ +- [ ] **Step 3: Implement the store, hook, schema exports, and websocket cases** + +```ts +// apps/web/src/stores/upgrade-jobs-store.ts +import { create } from 'zustand' +import type { UpgradeJobDto } from '@/lib/api-schema' + +interface UpgradeJobsState { + jobs: Record + setJobs: (jobs: UpgradeJobDto[]) => void + upsertJob: (job: UpgradeJobDto) => void + clearFinished: (serverId: string) => void +} + +export const useUpgradeJobsStore = create()((set) => ({ + jobs: {}, + setJobs: (jobs) => + set({ + jobs: Object.fromEntries(jobs.map((job) => [job.server_id, job])), + }), + upsertJob: (job) => + set((state) => { + const prev = state.jobs[job.server_id] + if ( + prev && + prev.job_id === job.job_id && + prev.status === job.status && + prev.stage === job.stage && + prev.error === job.error && + prev.finished_at === job.finished_at + ) { + return state + } + + return { + jobs: { + ...state.jobs, + [job.server_id]: { ...prev, ...job }, + }, + } + }), + clearFinished: (serverId) => + set((state) => { + const next = { ...state.jobs } + delete next[serverId] + return { jobs: next } + }), +})) + +// apps/web/src/hooks/use-upgrade-job.ts +export function useUpgradeJob(serverId: string) { + const job = useUpgradeJobsStore((state) => state.jobs[serverId] ?? null) + const upsertJob = useUpgradeJobsStore((state) => state.upsertJob) + + const query = useQuery({ + queryKey: ['servers', serverId, 'upgrade'], + queryFn: () => api.get(`/api/servers/${serverId}/upgrade`), + enabled: !!serverId && !job, + staleTime: 0, + }) + + useEffect(() => { + if (query.data) { + upsertJob(query.data) + } + }, [query.data, upsertJob]) + + return job ?? query.data ?? 
null +} + +export function useTriggerUpgrade() { + const upsertJob = useUpgradeJobsStore((state) => state.upsertJob) + return useMutation({ + mutationFn: ({ serverId, version }: { serverId: string; version: string }) => + api.post(`/api/servers/${serverId}/upgrade`, { version }), + onSuccess: ({ job }) => upsertJob(job), + }) +} + +// apps/web/src/hooks/use-servers-ws.ts +type WsMessage = + | { type: 'full_sync'; servers: ServerMetrics[]; upgrades?: UpgradeJobDto[] } + | { type: 'upgrade_progress'; server_id: string; job_id: string; target_version: string; stage: UpgradeStage } + | { + type: 'upgrade_result' + server_id: string + job_id: string + target_version: string + status: UpgradeStatus + stage?: UpgradeStage | null + error?: string | null + backup_path?: string | null + } + | { type: 'agent_info_updated'; server_id: string; protocol_version: number; agent_version?: string | null } + +export function handleWsMessage(raw: unknown, queryClient: QueryClient): void { + if ( + isWsMessageLike(raw) && + (raw.type === 'full_sync' || raw.type === 'update' || raw.type === 'server_online' || raw.type === 'server_offline') + ) { + handleServerMetricsMessage(raw, queryClient) + } + + if (isWsMessageLike(raw) && (raw.type === 'capabilities_changed' || raw.type === 'agent_info_updated')) { + handleCapabilityMessage(raw, queryClient) + } + + if (isWsMessageLike(raw) && raw.type === 'full_sync') { + useUpgradeJobsStore.getState().setJobs(Array.isArray(raw.upgrades) ? (raw.upgrades as UpgradeJobDto[]) : []) + } + + if (isWsMessageLike(raw) && raw.type === 'upgrade_progress') { + const prev = useUpgradeJobsStore.getState().jobs[raw.server_id as string] + useUpgradeJobsStore.getState().upsertJob({ + server_id: raw.server_id as string, + job_id: raw.job_id as string, + target_version: raw.target_version as string, + stage: raw.stage as UpgradeStage, + status: 'running', + error: null, + backup_path: prev?.backup_path ?? null, + started_at: prev?.started_at ?? 
new Date().toISOString(), + finished_at: null, + }) + return + } + + if (isWsMessageLike(raw) && raw.type === 'upgrade_result') { + const prev = useUpgradeJobsStore.getState().jobs[raw.server_id as string] + useUpgradeJobsStore.getState().upsertJob({ + server_id: raw.server_id as string, + job_id: raw.job_id as string, + target_version: raw.target_version as string, + stage: (raw.stage as UpgradeStage | null | undefined) ?? prev?.stage ?? 'restarting', + status: raw.status as UpgradeStatus, + error: (raw.error as string | null | undefined) ?? null, + backup_path: (raw.backup_path as string | null | undefined) ?? null, + started_at: prev?.started_at ?? new Date().toISOString(), + finished_at: new Date().toISOString(), + }) + return + } +} + +// apps/web/src/lib/api-schema.ts +export type UpgradeJobDto = S['UpgradeJobDto'] +export type UpgradeStage = S['UpgradeStage'] +export type UpgradeStatus = S['UpgradeStatus'] +export type LatestAgentVersionResponse = S['LatestAgentVersionResponse'] +export type TriggerUpgradeResponse = S['TriggerUpgradeResponse'] +``` + +- [ ] **Step 4: Regenerate API types and run the frontend state tests to verify they pass** + +Run: + +```bash +bun --cwd apps/web run generate:api-types +bun --cwd apps/web x vitest run src/stores/upgrade-jobs-store.test.ts src/hooks/use-servers-ws.test.ts +``` + +Expected: PASS, and `apps/web/src/lib/api-types.ts` changes to include the new upgrade schemas. 
+ +- [ ] **Step 5: Commit** + +```bash +git add apps/web/src/stores/upgrade-jobs-store.ts apps/web/src/stores/upgrade-jobs-store.test.ts apps/web/src/hooks/use-upgrade-job.ts apps/web/src/hooks/use-servers-ws.ts apps/web/src/hooks/use-servers-ws.test.ts apps/web/src/lib/api-schema.ts apps/web/src/lib/api-types.ts +git commit -m "feat(web): add upgrade job state and ws hydration" +``` + +### Task 8: Add the Detail-Page Upgrade UI and List Badges + +**Files:** +- Create: `apps/web/src/components/server/agent-version-section.tsx` +- Create: `apps/web/src/components/server/agent-version-section.test.tsx` +- Create: `apps/web/src/components/server/upgrade-job-badge.tsx` +- Modify: `apps/web/src/routes/_authed/servers/$id.tsx` +- Modify: `apps/web/src/routes/_authed/servers/$id.test.tsx` +- Modify: `apps/web/src/routes/_authed/servers/index.tsx` +- Modify: `apps/web/src/components/server/server-card.tsx` +- Modify: `apps/web/src/locales/en/servers.json` +- Modify: `apps/web/src/locales/zh/servers.json` + +- [ ] **Step 1: Write the failing component and placement tests** + +```tsx +import { render, screen } from '@testing-library/react' +import { describe, expect, it, vi } from 'vitest' +import { AgentVersionSection } from './agent-version-section' + +const mockUseUpgradeJob = vi.fn(() => null) + +vi.mock('@/hooks/use-auth', () => ({ + useAuth: () => ({ user: { role: 'admin' } }), +})) + +vi.mock('react-i18next', () => ({ + useTranslation: () => ({ + t: (key: string, vars?: Record) => { + if (key === 'upgrade_button') { + return `Upgrade to ${vars?.version}` + } + if (key === 'upgrade_retry') { + return 'Retry' + } + if (key === 'upgrade_failed') { + return `Failed at ${vars?.stage}: ${vars?.error}` + } + if (key.startsWith('upgrade_stage_')) { + return key.replace('upgrade_stage_', '') + } + return key + }, + }), +})) + +vi.mock('@/hooks/use-upgrade-job', () => ({ + useTriggerUpgrade: () => ({ mutate: vi.fn(), isPending: false }), + useUpgradeJob: mockUseUpgradeJob, +})) 
+ +vi.mock('@tanstack/react-query', () => ({ + useQuery: () => ({ data: { version: '1.2.3', released_at: null, error: null } }), +})) + +describe('AgentVersionSection', () => { + it('shows the admin upgrade button when latest is newer', () => { + render( + + ) + + expect(screen.getByRole('button', { name: /upgrade to 1.2.3/i })).toBeInTheDocument() + }) + + it('shows failure details and retry for admin users', () => { + mockUseUpgradeJob.mockReturnValue({ + server_id: 's1', + job_id: 'job-1', + target_version: '1.2.3', + stage: 'verifying', + status: 'failed', + error: 'sha256 mismatch', + backup_path: '/opt/serverbee/serverbee-agent.bak.20260414-120000', + started_at: '2026-04-14T12:00:00Z', + finished_at: '2026-04-14T12:00:03Z', + }) + + render() + + expect(screen.getByText(/sha256 mismatch/i)).toBeInTheDocument() + expect(screen.getByRole('button', { name: /retry/i })).toBeInTheDocument() + }) +}) + +// apps/web/src/routes/_authed/servers/$id.test.tsx +vi.mock('@/components/server/agent-version-section', () => ({ + AgentVersionSection: () =>
<div>agent-version-section</div>
, +})) + +it('renders the agent version section on the detail page', () => { + render() + expect(screen.getByText('agent-version-section')).toBeInTheDocument() +}) +``` + +- [ ] **Step 2: Run the UI tests to verify they fail** + +Run: + +```bash +bun --cwd apps/web x vitest run src/components/server/agent-version-section.test.tsx 'src/routes/_authed/servers/$id.test.tsx' +``` + +Expected: FAIL because the new section, badge component, and translation keys do not exist yet. + +- [ ] **Step 3: Implement the detail section, shared badge, and translations** + +```tsx +// apps/web/src/components/server/upgrade-job-badge.tsx +export function UpgradeJobBadge({ serverId }: { serverId: string }) { + const { t } = useTranslation('servers') + const job = useUpgradeJobsStore((state) => state.jobs[serverId]) + + if (!job || job.status === 'succeeded') { + return null + } + + const variant = job.status === 'running' ? 'secondary' : 'destructive' + const label = job.status === 'running' ? t('upgrade_badge_running') : t('upgrade_badge_failed') + + return ( + + {label} + + ) +} + +// apps/web/src/components/server/agent-version-section.tsx +const STAGES: UpgradeStage[] = ['downloading', 'verifying', 'pre_flight', 'installing', 'restarting'] + +interface ServerWithCaps { + id: string + agent_version?: string | null + capabilities?: number | null + effective_capabilities?: number | null +} + +function compareVersion(left: string, right: string): number { + return left.localeCompare(right, undefined, { numeric: true, sensitivity: 'base' }) +} + +export function AgentVersionSection({ server }: { server: ServerResponse & ServerWithCaps }) { + const { t } = useTranslation(['servers', 'common']) + const { user } = useAuth() + const job = useUpgradeJob(server.id) + const clearFinished = useUpgradeJobsStore((state) => state.clearFinished) + const triggerUpgrade = useTriggerUpgrade() + const { data: latest } = useQuery({ + queryKey: ['agent', 'latest-version'], + queryFn: () => 
api.get('/api/agent/latest-version'), + staleTime: 60_000, + }) + + const canUpgrade = user?.role === 'admin' + const upgradeEnabled = getEffectiveCapabilityEnabled( + server.effective_capabilities, + server.capabilities, + CAP_UPGRADE + ) + const currentVersion = server.agent_version ?? '-' + const latestVersion = latest?.version ?? null + const showUpgradeButton = + canUpgrade && + latestVersion && + server.agent_version && + compareVersion(latestVersion, server.agent_version) > 0 + + useEffect(() => { + if (job?.status === 'succeeded') { + const timer = window.setTimeout(() => clearFinished(server.id), 3000) + return () => window.clearTimeout(timer) + } + }, [job?.status, server.id, clearFinished]) + + return ( +
+
+
+

+      <h3 className="font-medium">{t('upgrade_section_title')}</h3>

+
+      <div className="text-muted-foreground text-sm">
+        <div>{t('upgrade_current')} {currentVersion}</div>
+        <div>{t('upgrade_latest')} {latestVersion ?? '-'}</div>
+      </div>
+
+      {showUpgradeButton && (
+        <AlertDialog>
+          <AlertDialogTrigger asChild>
+            <Button size="sm" disabled={!upgradeEnabled || job?.status === 'running'}>
+              {t('upgrade_button', { version: latestVersion })}
+            </Button>
+          </AlertDialogTrigger>
+          <AlertDialogContent>
+            <AlertDialogHeader>
+              <AlertDialogTitle>{t('upgrade_confirm_title')}</AlertDialogTitle>
+              <AlertDialogDescription>{t('upgrade_confirm_body')}</AlertDialogDescription>
+            </AlertDialogHeader>
+            <AlertDialogFooter>
+              <AlertDialogCancel>{t('common:cancel')}</AlertDialogCancel>
+              <AlertDialogAction onClick={() => triggerUpgrade.mutate({ serverId: server.id, version: latestVersion! })}>
+                {t('upgrade_button', { version: latestVersion })}
+              </AlertDialogAction>
+            </AlertDialogFooter>
+          </AlertDialogContent>
+        </AlertDialog>
+      )}
+ + {job?.status === 'running' && ( +
+        <div className="flex flex-wrap items-center gap-2 text-sm">
+          {STAGES.map((stage, index) => {
+            const currentIndex = STAGES.indexOf(job.stage)
+            const done = index < currentIndex
+            const active = index === currentIndex
+            return (
+              <span
+                key={stage}
+                className={active ? 'font-medium' : done ? 'text-muted-foreground' : 'text-muted-foreground/60'}
+              >
+                {done ? '✓ ' : ''}
+                {t(`upgrade_stage_${stage}`)}
+              </span>
+            )
+          })}
+        </div>
+      )}
+
+      {job?.status === 'failed' && (
+        <div className="space-y-2 text-destructive text-sm">
+          <p>{t('upgrade_failed', { stage: t(`upgrade_stage_${job.stage}`), error: job.error })}</p>
+          {job.backup_path && (
+            <p className="text-muted-foreground">{t('upgrade_failed_hint', { path: job.backup_path })}</p>
+          )}
+          {canUpgrade && (
+            <Button
+              size="sm"
+              variant="outline"
+              onClick={() => triggerUpgrade.mutate({ serverId: server.id, version: job.target_version })}
+            >
+              {t('upgrade_retry')}
+            </Button>
+          )}
+        </div>
+      )}
+ ) +} + +// apps/web/src/routes/_authed/servers/$id.tsx + + + +// apps/web/src/routes/_authed/servers/index.tsx +
+ + +
+ +// apps/web/src/components/server/server-card.tsx +
+ + {flag && ( + + {flag} + + )} + {osEmoji && ( + + {osEmoji} + + )} +

+        <span className="truncate">{server.name}</span>

+ +
+ + +
+
+``` + +- [ ] **Step 4: Run the UI tests to verify they pass** + +Run: + +```bash +bun --cwd apps/web x vitest run src/components/server/agent-version-section.test.tsx 'src/routes/_authed/servers/$id.test.tsx' +``` + +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add apps/web/src/components/server/agent-version-section.tsx apps/web/src/components/server/agent-version-section.test.tsx apps/web/src/components/server/upgrade-job-badge.tsx apps/web/src/routes/_authed/servers/$id.tsx apps/web/src/routes/_authed/servers/$id.test.tsx apps/web/src/routes/_authed/servers/index.tsx apps/web/src/components/server/server-card.tsx apps/web/src/locales/en/servers.json apps/web/src/locales/zh/servers.json +git commit -m "feat(web): add agent upgrade ui" +``` + +### Task 9: Update Docs, Manual QA, and Run the Full Verification Sweep + +**Files:** +- Modify: `ENV.md` +- Modify: `apps/docs/content/docs/en/configuration.mdx` +- Modify: `apps/docs/content/docs/cn/configuration.mdx` +- Create: `tests/agent-upgrade.md` +- Modify: `tests/README.md` + +- [ ] **Step 1: Update the config docs and manual checklist** + +```md + +| `SERVERBEE_UPGRADE__LATEST_VERSION_URL` | `upgrade.latest_version_url` | string | `""` | Optional override for the latest agent version endpoint. Expected JSON: `{ "version": "x.y.z", "released_at": "..." }` | + + +| `SERVERBEE_UPGRADE__LATEST_VERSION_URL` | `""` | Optional override URL for the latest agent version lookup. When unset, ServerBee auto-detects GitHub Releases from `release_base_url` | + +| `latest_version_url` | string? | `None` | Optional JSON endpoint returning `{ "version": "x.y.z", "released_at"?: "..." }` for self-hosted release mirrors | + + +| `SERVERBEE_UPGRADE__LATEST_VERSION_URL` | `""` | 可选的最新版本查询 URL。未设置时,ServerBee 会根据 `release_base_url` 自动识别 GitHub Releases | + +| `latest_version_url` | string? | `None` | 可选的 JSON 端点,返回 `{ "version": "x.y.z", "released_at"?: "..." }`,适用于自托管镜像源 | + + +# Agent Upgrade + +1. 
Trigger an upgrade from the server detail page and confirm the stepper advances through Downloading -> Restarting. +2. Force a checksum mismatch and confirm the detail page shows a failed status with the verifying-stage error. +3. Block agent reconnect for more than 120 seconds and confirm the job becomes Timeout. +4. Click Retry from a failed or timeout state and confirm a new `job_id` is created. +5. Verify a timestamped `.bak.` file exists beside the agent binary and that stale backups are removed after 24 hours. +``` + +- [ ] **Step 2: Add the new checklist to the test index** + +```md +| [agent-upgrade.md](agent-upgrade.md) | Agent upgrade lifecycle | `/servers`, `/servers/:id` | +``` + +- [ ] **Step 3: Run the full verification sweep** + +Run: + +```bash +cargo test --workspace +cargo clippy --workspace -- -D warnings +bun --cwd apps/web test +bun --cwd apps/web run typecheck +bun --cwd apps/web x ultracite check +``` + +Expected: every command passes cleanly. + +- [ ] **Step 4: Commit** + +```bash +git add ENV.md apps/docs/content/docs/en/configuration.mdx apps/docs/content/docs/cn/configuration.mdx tests/agent-upgrade.md tests/README.md +git commit -m "docs: document agent upgrade workflow" +``` + +## Self-Review + +### Spec Coverage + +- Requirements 1, 2, 11, 12, 13 are covered by Task 1 and Task 4. +- Requirements 3, 4, 9, 10, 14 are covered by Task 2, Task 4, Task 7, and Task 8. +- Requirements 5 and 6 are covered by Task 5. +- Requirements 7, 8, and 15 are covered by Task 3, Task 7, and Task 8. +- Manual verification and docs updates are covered by Task 9. +- Spec integration scenarios 1-10 coverage: + - Scenarios 1, 9 (success, capability-denied fast-fail) and 2, 3 (Verifying failure, timeout sweep): Task 6 integration tests. + - Scenario 10 (FullSync contains upgrades): Task 6 integration test. + - Scenarios 4 (concurrent 409), 7 (job_id mismatch), 8 (same-version retry): Task 2 tracker unit tests. 
+ - Scenarios 5 (agent offline pre-check), 6 (checksum 404 pre-check): covered transitively by `trigger_upgrade` returning early before `start_job` is reached (verified by the success-path integration test confirming `start_job` is reachable, combined with tracker unit tests that assert no job is created on early-return paths). +- No spec gaps remain after adding `backup_path` to failure payloads and using a service-local conflict error instead of `AppError::Conflict`. + +### Placeholder Scan + +- No `TODO`, `TBD`, or "similar to above" placeholders remain. +- Every code-changing step includes concrete code snippets or exact command sequences. +- Generated files are explicitly called out as generated instead of hand-edited. + +### Type Consistency + +- `UpgradeStage`, `UpgradeStatus`, `UpgradeJobDto`, `LatestAgentVersionResponse`, `useUpgradeJob`, and `useTriggerUpgrade` are named consistently across backend, OpenAPI, and frontend tasks. +- The frontend translation keys use the repo's existing flat `upgrade_*` naming style instead of introducing nested i18n structures mid-feature. 
From 44d5366466be4e6d5b1523aac1f4a882f56ff266 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Tue, 14 Apr 2026 20:12:41 +0800 Subject: [PATCH 05/17] feat(common): add upgrade lifecycle protocol --- crates/common/src/protocol.rs | 201 ++++++++++++++++++++++++++++++++++ 1 file changed, 201 insertions(+) diff --git a/crates/common/src/protocol.rs b/crates/common/src/protocol.rs index f9fe4203..fd1f02c5 100644 --- a/crates/common/src/protocol.rs +++ b/crates/common/src/protocol.rs @@ -1,3 +1,4 @@ +use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use crate::constants::CapabilityDeniedReason; @@ -7,6 +8,44 @@ use crate::types::{ PingTaskConfig, SystemInfo, SystemReport, TaskResult, TracerouteHop, }; +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))] +pub enum UpgradeStage { + Downloading, + Verifying, + PreFlight, + Installing, + Restarting, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))] +pub enum UpgradeStatus { + Running, + Succeeded, + Failed, + Timeout, +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))] +pub struct UpgradeJobDto { + pub server_id: String, + pub job_id: String, + pub target_version: String, + pub stage: UpgradeStage, + pub status: UpgradeStatus, + #[serde(default)] + pub error: Option, + #[serde(default)] + pub backup_path: Option, + pub started_at: DateTime, + #[serde(default)] + pub finished_at: Option>, +} + /// Agent -> Server messages #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "type", rename_all = "snake_case")] @@ -145,6 +184,23 @@ pub enum AgentMessage { completed: bool, error: Option, }, + UpgradeProgress { + msg_id: String, + #[serde(default)] + job_id: Option, + 
target_version: String, + stage: UpgradeStage, + }, + UpgradeResult { + msg_id: String, + #[serde(default)] + job_id: Option, + target_version: String, + stage: UpgradeStage, + error: Option, + #[serde(default)] + backup_path: Option, + }, Pong, } @@ -288,6 +344,8 @@ pub enum ServerMessage { version: String, download_url: String, sha256: String, + #[serde(default)] + job_id: Option, }, CapabilitiesSync { capabilities: u32, @@ -300,6 +358,8 @@ pub enum ServerMessage { pub enum BrowserMessage { FullSync { servers: Vec, + #[serde(default)] + upgrades: Vec, }, Update { servers: Vec, @@ -319,6 +379,23 @@ pub enum BrowserMessage { AgentInfoUpdated { server_id: String, protocol_version: u32, + #[serde(default)] + agent_version: Option, + }, + UpgradeProgress { + server_id: String, + job_id: String, + target_version: String, + stage: UpgradeStage, + }, + UpgradeResult { + server_id: String, + job_id: String, + target_version: String, + status: UpgradeStatus, + stage: Option, + error: Option, + backup_path: Option, }, NetworkProbeUpdate { server_id: String, @@ -1100,4 +1177,128 @@ mod tests { _ => panic!("Expected ServerIpChanged"), } } + + #[test] + fn test_server_upgrade_with_job_id_round_trip() { + let msg = ServerMessage::Upgrade { + version: "2.0.0".to_string(), + download_url: "https://example.com/serverbee.tar.gz".to_string(), + sha256: "abc123".to_string(), + job_id: Some("job-1".to_string()), + }; + + let json = serde_json::to_string(&msg).unwrap(); + let parsed: ServerMessage = serde_json::from_str(&json).unwrap(); + + match parsed { + ServerMessage::Upgrade { + version, + download_url, + sha256, + job_id, + } => { + assert_eq!(version, "2.0.0"); + assert_eq!(download_url, "https://example.com/serverbee.tar.gz"); + assert_eq!(sha256, "abc123"); + assert_eq!(job_id, Some("job-1".to_string())); + } + _ => panic!("Expected Upgrade"), + } + } + + #[test] + fn test_upgrade_messages_without_job_id_stay_backward_compatible() { + let server_json = + 
r#"{"type":"upgrade","version":"2.0.0","download_url":"https://example.com/serverbee.tar.gz","sha256":"abc123"}"#; + let server_msg: ServerMessage = serde_json::from_str(server_json).unwrap(); + match server_msg { + ServerMessage::Upgrade { + job_id, + version, + download_url, + sha256, + } => { + assert_eq!(job_id, None); + assert_eq!(version, "2.0.0"); + assert_eq!(download_url, "https://example.com/serverbee.tar.gz"); + assert_eq!(sha256, "abc123"); + } + _ => panic!("Expected Upgrade"), + } + + let agent_json = + r#"{"type":"upgrade_progress","msg_id":"m1","target_version":"2.0.0","stage":"downloading"}"#; + let agent_msg: AgentMessage = serde_json::from_str(agent_json).unwrap(); + match agent_msg { + AgentMessage::UpgradeProgress { + msg_id, + job_id, + target_version, + stage, + } => { + assert_eq!(msg_id, "m1"); + assert_eq!(job_id, None); + assert_eq!(target_version, "2.0.0"); + assert_eq!(stage, UpgradeStage::Downloading); + } + _ => panic!("Expected UpgradeProgress"), + } + } + + #[test] + fn test_browser_full_sync_with_upgrades_round_trip() { + let msg = BrowserMessage::FullSync { + servers: vec![], + upgrades: vec![UpgradeJobDto { + server_id: "server-1".to_string(), + job_id: "job-1".to_string(), + target_version: "2.0.0".to_string(), + stage: UpgradeStage::Installing, + status: UpgradeStatus::Running, + error: None, + backup_path: Some("/backups/server-1.tar.gz".to_string()), + started_at: chrono::Utc::now(), + finished_at: None, + }], + }; + + let json = serde_json::to_string(&msg).unwrap(); + let parsed: BrowserMessage = serde_json::from_str(&json).unwrap(); + + match parsed { + BrowserMessage::FullSync { servers, upgrades } => { + assert!(servers.is_empty()); + assert_eq!(upgrades.len(), 1); + assert_eq!(upgrades[0].server_id, "server-1"); + assert_eq!(upgrades[0].job_id, "job-1"); + assert_eq!(upgrades[0].target_version, "2.0.0"); + assert_eq!(upgrades[0].stage, UpgradeStage::Installing); + assert_eq!(upgrades[0].status, UpgradeStatus::Running); + 
assert_eq!(upgrades[0].error, None); + assert_eq!(upgrades[0].backup_path, Some("/backups/server-1.tar.gz".to_string())); + assert!(upgrades[0].finished_at.is_none()); + } + _ => panic!("Expected FullSync"), + } + } + + #[test] + fn test_agent_info_updated_accepts_optional_agent_version() { + let json = + r#"{"type":"agent_info_updated","server_id":"server-1","protocol_version":3}"#; + let msg: BrowserMessage = serde_json::from_str(json).unwrap(); + + match msg { + BrowserMessage::AgentInfoUpdated { + server_id, + protocol_version, + agent_version, + } => { + assert_eq!(server_id, "server-1"); + assert_eq!(protocol_version, 3); + assert_eq!(agent_version, None); + } + _ => panic!("Expected AgentInfoUpdated"), + } + } } From 6cfe08bd24dcf1adde5f028402f523106bc0ada1 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Tue, 14 Apr 2026 20:15:18 +0800 Subject: [PATCH 06/17] feat(server): add upgrade job tracker --- crates/common/src/protocol.rs | 51 +- crates/server/src/main.rs | 2 + crates/server/src/router/api/server.rs | 25 +- crates/server/src/router/ws/agent.rs | 42 ++ crates/server/src/router/ws/browser.rs | 6 +- crates/server/src/service/mod.rs | 1 + crates/server/src/service/upgrade_tracker.rs | 585 +++++++++++++++++++ crates/server/src/state.rs | 6 +- crates/server/src/task/mod.rs | 1 + crates/server/src/task/upgrade_timeout.rs | 26 + 10 files changed, 713 insertions(+), 32 deletions(-) create mode 100644 crates/server/src/service/upgrade_tracker.rs create mode 100644 crates/server/src/task/upgrade_timeout.rs diff --git a/crates/common/src/protocol.rs b/crates/common/src/protocol.rs index fd1f02c5..edd71eb9 100644 --- a/crates/common/src/protocol.rs +++ b/crates/common/src/protocol.rs @@ -197,7 +197,7 @@ pub enum AgentMessage { job_id: Option, target_version: String, stage: UpgradeStage, - error: Option, + error: String, #[serde(default)] backup_path: Option, }, @@ -439,6 +439,24 @@ mod tests { use super::*; use crate::types::DiskIo; + 
#[test] + fn test_agent_info_updated_accepts_optional_agent_version() { + let json = r#"{"type":"agent_info_updated","server_id":"s1","protocol_version":3,"agent_version":"1.2.3"}"#; + + match serde_json::from_str::(json).unwrap() { + BrowserMessage::AgentInfoUpdated { + server_id, + protocol_version, + agent_version, + } => { + assert_eq!(server_id, "s1"); + assert_eq!(protocol_version, 3); + assert_eq!(agent_version.as_deref(), Some("1.2.3")); + } + _ => panic!("Expected AgentInfoUpdated"), + } + } + #[test] fn test_welcome_without_capabilities_deserializes() { let json = @@ -1208,8 +1226,7 @@ mod tests { #[test] fn test_upgrade_messages_without_job_id_stay_backward_compatible() { - let server_json = - r#"{"type":"upgrade","version":"2.0.0","download_url":"https://example.com/serverbee.tar.gz","sha256":"abc123"}"#; + let server_json = r#"{"type":"upgrade","version":"2.0.0","download_url":"https://example.com/serverbee.tar.gz","sha256":"abc123"}"#; let server_msg: ServerMessage = serde_json::from_str(server_json).unwrap(); match server_msg { ServerMessage::Upgrade { @@ -1226,8 +1243,7 @@ mod tests { _ => panic!("Expected Upgrade"), } - let agent_json = - r#"{"type":"upgrade_progress","msg_id":"m1","target_version":"2.0.0","stage":"downloading"}"#; + let agent_json = r#"{"type":"upgrade_progress","msg_id":"m1","target_version":"2.0.0","stage":"downloading"}"#; let agent_msg: AgentMessage = serde_json::from_str(agent_json).unwrap(); match agent_msg { AgentMessage::UpgradeProgress { @@ -1275,30 +1291,13 @@ mod tests { assert_eq!(upgrades[0].stage, UpgradeStage::Installing); assert_eq!(upgrades[0].status, UpgradeStatus::Running); assert_eq!(upgrades[0].error, None); - assert_eq!(upgrades[0].backup_path, Some("/backups/server-1.tar.gz".to_string())); + assert_eq!( + upgrades[0].backup_path, + Some("/backups/server-1.tar.gz".to_string()) + ); assert!(upgrades[0].finished_at.is_none()); } _ => panic!("Expected FullSync"), } } - - #[test] - fn 
test_agent_info_updated_accepts_optional_agent_version() { - let json = - r#"{"type":"agent_info_updated","server_id":"server-1","protocol_version":3}"#; - let msg: BrowserMessage = serde_json::from_str(json).unwrap(); - - match msg { - BrowserMessage::AgentInfoUpdated { - server_id, - protocol_version, - agent_version, - } => { - assert_eq!(server_id, "server-1"); - assert_eq!(protocol_version, 3); - assert_eq!(agent_version, None); - } - _ => panic!("Expected AgentInfoUpdated"), - } - } } diff --git a/crates/server/src/main.rs b/crates/server/src/main.rs index ef880eaf..83409014 100644 --- a/crates/server/src/main.rs +++ b/crates/server/src/main.rs @@ -88,6 +88,8 @@ async fn main() -> anyhow::Result<()> { tokio::spawn(async move { task::task_scheduler::run(s).await }); let s = state.clone(); tokio::spawn(async move { task::service_monitor_checker::run(s).await }); + let s = state.clone(); + tokio::spawn(async move { task::upgrade_timeout::run(s).await }); // Build router let app = create_router(state); diff --git a/crates/server/src/router/api/server.rs b/crates/server/src/router/api/server.rs index a5723116..83d71573 100644 --- a/crates/server/src/router/api/server.rs +++ b/crates/server/src/router/api/server.rs @@ -26,6 +26,7 @@ use crate::service::network_probe::NetworkProbeService; use crate::service::ping::PingService; use crate::service::record::{QueryHistoryResult, RecordService}; use crate::service::server::{ServerService, UpdateServerInput}; +use crate::service::upgrade_tracker::{StartUpgradeJobError, UpgradeLookup}; use crate::state::AppState; use serverbee_common::constants::effective_capabilities; use serverbee_common::protocol::{BrowserMessage, ServerMessage}; @@ -612,15 +613,31 @@ async fn trigger_upgrade( .get_sender(&id) .ok_or_else(|| AppError::NotFound("Agent not connected".into()))?; + let job = state + .upgrade_tracker + .start_job(&id, version.to_string()) + .map_err(|error| match error { + StartUpgradeJobError::Conflict(existing) => 
AppError::Conflict(format!( + "Upgrade already running for server {} (job_id={}, target_version={})", + existing.server_id, existing.job_id, existing.target_version + )), + })?; + let msg = ServerMessage::Upgrade { version: version.to_string(), download_url, sha256, + job_id: Some(job.job_id.clone()), }; - sender - .send(msg) - .await - .map_err(|_| AppError::Internal("Failed to send upgrade command".into()))?; + if let Err(_send_error) = sender.send(msg).await { + state.upgrade_tracker.mark_failed( + UpgradeLookup::from_job(&job), + job.stage, + "Failed to send upgrade command".into(), + None, + ); + return Err(AppError::Internal("Failed to send upgrade command".into())); + } ok("ok") } diff --git a/crates/server/src/router/ws/agent.rs b/crates/server/src/router/ws/agent.rs index 11962abc..02765889 100644 --- a/crates/server/src/router/ws/agent.rs +++ b/crates/server/src/router/ws/agent.rs @@ -19,6 +19,7 @@ use crate::service::network_probe::NetworkProbeService; use crate::service::ping::PingService; use crate::service::record::RecordService; use crate::service::server::ServerService; +use crate::service::upgrade_tracker::UpgradeLookup; use crate::state::AppState; use serverbee_common::constants::{MAX_WS_MESSAGE_SIZE, effective_capabilities}; use serverbee_common::protocol::{AgentMessage, BrowserMessage, ServerMessage}; @@ -505,6 +506,7 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent .broadcast_browser(BrowserMessage::AgentInfoUpdated { server_id: server_id.to_string(), protocol_version: agent_pv, + agent_version: Some(info.agent_version.clone()), }); // Send Ack @@ -553,6 +555,39 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent let _ = tx.send(ServerMessage::Ack { msg_id }).await; } } + AgentMessage::UpgradeProgress { + msg_id, + job_id, + target_version, + stage, + } => { + state + .upgrade_tracker + .update_stage(UpgradeLookup::new(server_id, job_id, target_version), stage); + + if let Some(tx) = 
state.agent_manager.get_sender(server_id) { + let _ = tx.send(ServerMessage::Ack { msg_id }).await; + } + } + AgentMessage::UpgradeResult { + msg_id, + job_id, + target_version, + stage, + error, + backup_path, + } => { + state.upgrade_tracker.mark_failed( + UpgradeLookup::new(server_id, job_id, target_version), + stage, + error, + backup_path, + ); + + if let Some(tx) = state.agent_manager.get_sender(server_id) { + let _ = tx.send(ServerMessage::Ack { msg_id }).await; + } + } AgentMessage::PingResult(result) => { if let Err(e) = save_ping_result(&state.db, server_id, &result).await { tracing::error!("Failed to save ping result for {server_id}: {e}"); @@ -625,6 +660,13 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent } } } + if capability == "upgrade" + && let Some(job) = state.upgrade_tracker.get(server_id) + { + state + .upgrade_tracker + .mark_failed_by_capability_denied(UpgradeLookup::from_job(&job), reason); + } // For terminal: unregister session so browser gets notified if let Some(sid) = &session_id { state.agent_manager.unregister_terminal_session(sid); diff --git a/crates/server/src/router/ws/browser.rs b/crates/server/src/router/ws/browser.rs index bca1e365..270fbba7 100644 --- a/crates/server/src/router/ws/browser.rs +++ b/crates/server/src/router/ws/browser.rs @@ -255,6 +255,7 @@ async fn build_full_sync(state: &Arc) -> BrowserMessage { tracing::error!("Failed to list servers for FullSync: {e}"); return BrowserMessage::FullSync { servers: Vec::new(), + upgrades: state.upgrade_tracker.snapshot(), }; } }; @@ -351,7 +352,10 @@ async fn build_full_sync(state: &Arc) -> BrowserMessage { }) .collect(); - BrowserMessage::FullSync { servers: statuses } + BrowserMessage::FullSync { + servers: statuses, + upgrades: state.upgrade_tracker.snapshot(), + } } async fn send_browser_message( diff --git a/crates/server/src/service/mod.rs b/crates/server/src/service/mod.rs index 6128cb3d..f7395340 100644 --- a/crates/server/src/service/mod.rs +++ 
b/crates/server/src/service/mod.rs @@ -24,5 +24,6 @@ pub mod service_monitor; pub mod status_page; pub mod task_scheduler; pub mod traffic; +pub mod upgrade_tracker; pub mod uptime; pub mod user; diff --git a/crates/server/src/service/upgrade_tracker.rs b/crates/server/src/service/upgrade_tracker.rs new file mode 100644 index 00000000..036bf5a1 --- /dev/null +++ b/crates/server/src/service/upgrade_tracker.rs @@ -0,0 +1,585 @@ +use chrono::{DateTime, Duration, Utc}; +use dashmap::DashMap; +use serverbee_common::constants::CapabilityDeniedReason; +use serverbee_common::protocol::{BrowserMessage, UpgradeJobDto, UpgradeStage, UpgradeStatus}; +use tokio::sync::broadcast; +use uuid::Uuid; + +pub const UPGRADE_TIMEOUT_SECS: i64 = 120; +pub const UPGRADE_RETENTION_HOURS: i64 = 24; + +#[derive(Debug, Clone, PartialEq)] +pub struct UpgradeJob { + pub server_id: String, + pub job_id: String, + pub target_version: String, + pub stage: UpgradeStage, + pub status: UpgradeStatus, + pub error: Option, + pub backup_path: Option, + pub started_at: DateTime, + pub finished_at: Option>, +} + +impl UpgradeJob { + fn to_dto(&self) -> UpgradeJobDto { + UpgradeJobDto { + server_id: self.server_id.clone(), + job_id: self.job_id.clone(), + target_version: self.target_version.clone(), + stage: self.stage, + status: self.status, + error: self.error.clone(), + backup_path: self.backup_path.clone(), + started_at: self.started_at, + finished_at: self.finished_at, + } + } + + fn is_terminal(&self) -> bool { + self.status != UpgradeStatus::Running + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct UpgradeLookup { + pub server_id: String, + pub job_id: Option, + pub target_version: String, +} + +impl UpgradeLookup { + pub fn new( + server_id: impl Into, + job_id: Option, + target_version: impl Into, + ) -> Self { + Self { + server_id: server_id.into(), + job_id, + target_version: target_version.into(), + } + } + + pub fn from_job(job: &UpgradeJob) -> Self { + Self { + server_id: 
job.server_id.clone(), + job_id: Some(job.job_id.clone()), + target_version: job.target_version.clone(), + } + } + + fn matches(&self, job: &UpgradeJob) -> bool { + if job.server_id != self.server_id { + return false; + } + + if let Some(job_id) = &self.job_id { + job.job_id == *job_id + } else { + job.target_version == self.target_version + } + } +} + +#[derive(Debug, Clone, PartialEq)] +pub enum StartUpgradeJobError { + Conflict(UpgradeJob), +} + +pub struct UpgradeJobTracker { + pub(crate) jobs: DashMap, + browser_tx: broadcast::Sender, +} + +impl UpgradeJobTracker { + pub fn new(browser_tx: broadcast::Sender) -> Self { + Self { + jobs: DashMap::new(), + browser_tx, + } + } + + pub fn start_job( + &self, + server_id: impl Into, + target_version: impl Into, + ) -> Result { + let server_id = server_id.into(); + let target_version = target_version.into(); + + if let Some(existing) = self.jobs.get(&server_id) + && existing.status == UpgradeStatus::Running + { + return Err(StartUpgradeJobError::Conflict(existing.clone())); + } + + let job = UpgradeJob { + server_id: server_id.clone(), + job_id: Uuid::new_v4().to_string(), + target_version, + stage: UpgradeStage::Downloading, + status: UpgradeStatus::Running, + error: None, + backup_path: None, + started_at: Utc::now(), + finished_at: None, + }; + + self.jobs.insert(server_id, job.clone()); + self.broadcast_progress(&job); + + Ok(job) + } + + pub fn update_stage(&self, lookup: UpgradeLookup, stage: UpgradeStage) -> Option { + let mut job = self.jobs.get_mut(&lookup.server_id)?; + if job.status != UpgradeStatus::Running || !lookup.matches(&job) { + return None; + } + if job.stage == stage { + return Some(job.clone()); + } + + job.stage = stage; + let updated = job.clone(); + drop(job); + self.broadcast_progress(&updated); + + Some(updated) + } + + pub fn mark_failed( + &self, + lookup: UpgradeLookup, + stage: UpgradeStage, + error: String, + backup_path: Option, + ) -> Option { + self.finish_job( + lookup, + 
UpgradeStatus::Failed, + Some(stage), + Some(error), + backup_path, + ) + } + + pub fn mark_failed_by_capability_denied( + &self, + lookup: UpgradeLookup, + reason: CapabilityDeniedReason, + ) -> Option { + self.finish_job( + lookup, + UpgradeStatus::Failed, + None, + Some(format!("Upgrade capability denied: {reason:?}")), + None, + ) + } + + pub fn mark_succeeded( + &self, + lookup: UpgradeLookup, + backup_path: Option, + ) -> Option { + self.finish_job(lookup, UpgradeStatus::Succeeded, None, None, backup_path) + } + + pub fn sweep_timeouts(&self, now: DateTime) -> Vec { + let mut timed_out = Vec::new(); + let timeout_cutoff = now - Duration::seconds(UPGRADE_TIMEOUT_SECS); + + for mut entry in self.jobs.iter_mut() { + if entry.status != UpgradeStatus::Running || entry.started_at > timeout_cutoff { + continue; + } + + entry.status = UpgradeStatus::Timeout; + entry.error = Some(format!("Upgrade timed out after {UPGRADE_TIMEOUT_SECS}s")); + entry.finished_at = Some(now); + + let job = entry.clone(); + timed_out.push(job.clone()); + drop(entry); + self.broadcast_result(&job); + } + + timed_out + } + + pub fn cleanup_old(&self, now: DateTime) -> usize { + let retention_cutoff = now - Duration::hours(UPGRADE_RETENTION_HOURS); + let before = self.jobs.len(); + + self.jobs.retain(|_, job| { + !(job.is_terminal() + && job + .finished_at + .is_some_and(|finished_at| finished_at <= retention_cutoff)) + }); + + before.saturating_sub(self.jobs.len()) + } + + pub fn get(&self, server_id: &str) -> Option { + self.jobs.get(server_id).map(|job| job.clone()) + } + + pub fn snapshot(&self) -> Vec { + self.jobs.iter().map(|job| job.to_dto()).collect::>() + } + + fn finish_job( + &self, + lookup: UpgradeLookup, + status: UpgradeStatus, + stage: Option, + error: Option, + backup_path: Option, + ) -> Option { + let mut job = self.jobs.get_mut(&lookup.server_id)?; + if job.status != UpgradeStatus::Running || !lookup.matches(&job) { + return None; + } + + if let Some(stage) = stage { + 
job.stage = stage; + } + job.status = status; + job.error = error; + job.backup_path = backup_path; + job.finished_at = Some(Utc::now()); + + let updated = job.clone(); + drop(job); + self.broadcast_result(&updated); + + Some(updated) + } + + fn broadcast_progress(&self, job: &UpgradeJob) { + let _ = self.browser_tx.send(BrowserMessage::UpgradeProgress { + server_id: job.server_id.clone(), + job_id: job.job_id.clone(), + target_version: job.target_version.clone(), + stage: job.stage, + }); + } + + fn broadcast_result(&self, job: &UpgradeJob) { + let _ = self.browser_tx.send(BrowserMessage::UpgradeResult { + server_id: job.server_id.clone(), + job_id: job.job_id.clone(), + target_version: job.target_version.clone(), + status: job.status, + stage: Some(job.stage), + error: job.error.clone(), + backup_path: job.backup_path.clone(), + }); + } +} + +#[cfg(test)] +mod tests { + use chrono::{Duration, Utc}; + use serverbee_common::constants::CapabilityDeniedReason; + use serverbee_common::protocol::{BrowserMessage, UpgradeJobDto, UpgradeStage, UpgradeStatus}; + use tokio::sync::broadcast; + + use super::*; + + fn make_tracker() -> (UpgradeJobTracker, broadcast::Receiver) { + let (tx, rx) = broadcast::channel(16); + (UpgradeJobTracker::new(tx), rx) + } + + fn assert_progress( + msg: BrowserMessage, + server_id: &str, + job_id: &str, + target_version: &str, + stage: UpgradeStage, + ) { + assert!(matches!( + msg, + BrowserMessage::UpgradeProgress { + server_id: ref actual_server_id, + job_id: ref actual_job_id, + target_version: ref actual_target_version, + stage: actual_stage, + } if actual_server_id == server_id + && actual_job_id == job_id + && actual_target_version == target_version + && actual_stage == stage + )); + } + + fn assert_result( + msg: BrowserMessage, + server_id: &str, + job_id: &str, + target_version: &str, + status: UpgradeStatus, + stage: Option, + ) { + assert!(matches!( + msg, + BrowserMessage::UpgradeResult { + server_id: ref actual_server_id, + 
job_id: ref actual_job_id, + target_version: ref actual_target_version, + status: actual_status, + stage: actual_stage, + .. + } if actual_server_id == server_id + && actual_job_id == job_id + && actual_target_version == target_version + && actual_status == status + && actual_stage == stage + )); + } + + #[test] + fn start_job_rejects_a_second_running_job() { + let (tracker, mut rx) = make_tracker(); + + let first = tracker + .start_job("server-1", "1.2.3") + .expect("first job should start"); + assert_progress( + rx.try_recv().expect("start should broadcast progress"), + "server-1", + &first.job_id, + "1.2.3", + UpgradeStage::Downloading, + ); + + let conflict = tracker + .start_job("server-1", "1.2.4") + .expect_err("second running job should be rejected"); + + match conflict { + StartUpgradeJobError::Conflict(existing) => { + assert_eq!(existing.job_id, first.job_id); + assert_eq!(existing.target_version, "1.2.3"); + assert_eq!(existing.status, UpgradeStatus::Running); + } + } + + assert!(rx.try_recv().is_err(), "conflict should not broadcast"); + } + + #[test] + fn update_stage_prefers_job_id_and_ignores_stale_messages() { + let (tracker, mut rx) = make_tracker(); + + let stale = tracker + .start_job("server-1", "1.2.3") + .expect("stale job should start"); + rx.try_recv().expect("start broadcast"); + tracker.mark_failed( + UpgradeLookup::from_job(&stale), + UpgradeStage::Installing, + "boom".into(), + None, + ); + rx.try_recv().expect("failure broadcast"); + + let active = tracker + .start_job("server-1", "1.2.3") + .expect("replacement job should start"); + rx.try_recv().expect("replacement start broadcast"); + + tracker.update_stage( + UpgradeLookup { + server_id: "server-1".into(), + job_id: Some(stale.job_id.clone()), + target_version: "1.2.3".into(), + }, + UpgradeStage::Verifying, + ); + + assert!(rx.try_recv().is_err(), "stale update should be ignored"); + + let current = tracker.get("server-1").expect("active job should remain"); + 
assert_eq!(current.job_id, active.job_id); + assert_eq!(current.stage, UpgradeStage::Downloading); + + tracker.update_stage(UpgradeLookup::from_job(&active), UpgradeStage::Verifying); + + assert_progress( + rx.try_recv().expect("active update should broadcast"), + "server-1", + &active.job_id, + "1.2.3", + UpgradeStage::Verifying, + ); + } + + #[test] + fn mark_succeeded_does_not_overwrite_timeout() { + let (tracker, mut rx) = make_tracker(); + + let job = tracker + .start_job("server-1", "1.2.3") + .expect("job should start"); + rx.try_recv().expect("start broadcast"); + + let timed_out = + tracker.sweep_timeouts(Utc::now() + Duration::seconds(UPGRADE_TIMEOUT_SECS + 1)); + assert_eq!(timed_out.len(), 1); + + let timeout_msg = rx.try_recv().expect("timeout broadcast"); + assert_result( + timeout_msg, + "server-1", + &job.job_id, + "1.2.3", + UpgradeStatus::Timeout, + Some(UpgradeStage::Downloading), + ); + + tracker.mark_succeeded(UpgradeLookup::from_job(&job), Some("/tmp/backup".into())); + + assert!(rx.try_recv().is_err(), "late success should not broadcast"); + + let current = tracker.get("server-1").expect("job should still exist"); + assert_eq!(current.status, UpgradeStatus::Timeout); + assert_eq!(current.backup_path.as_deref(), None); + assert!(current.finished_at.is_some()); + } + + #[test] + fn cleanup_old_removes_only_expired_terminal_jobs() { + let (tracker, _rx) = make_tracker(); + let now = Utc::now(); + let expired = now - Duration::hours(UPGRADE_RETENTION_HOURS + 1); + let fresh = now - Duration::hours(1); + + tracker.jobs.insert( + "expired-succeeded".into(), + UpgradeJob { + server_id: "expired-succeeded".into(), + job_id: "job-expired-succeeded".into(), + target_version: "1.0.0".into(), + stage: UpgradeStage::Restarting, + status: UpgradeStatus::Succeeded, + error: None, + backup_path: None, + started_at: expired, + finished_at: Some(expired), + }, + ); + tracker.jobs.insert( + "expired-timeout".into(), + UpgradeJob { + server_id: 
"expired-timeout".into(), + job_id: "job-expired-timeout".into(), + target_version: "1.0.0".into(), + stage: UpgradeStage::Installing, + status: UpgradeStatus::Timeout, + error: Some("timed out".into()), + backup_path: None, + started_at: expired, + finished_at: Some(expired), + }, + ); + tracker.jobs.insert( + "fresh-failed".into(), + UpgradeJob { + server_id: "fresh-failed".into(), + job_id: "job-fresh-failed".into(), + target_version: "1.0.0".into(), + stage: UpgradeStage::Installing, + status: UpgradeStatus::Failed, + error: Some("failed".into()), + backup_path: None, + started_at: fresh, + finished_at: Some(fresh), + }, + ); + tracker.jobs.insert( + "still-running".into(), + UpgradeJob { + server_id: "still-running".into(), + job_id: "job-still-running".into(), + target_version: "1.0.0".into(), + stage: UpgradeStage::Installing, + status: UpgradeStatus::Running, + error: None, + backup_path: None, + started_at: expired, + finished_at: None, + }, + ); + + let removed = tracker.cleanup_old(now); + assert_eq!(removed, 2); + + assert!(tracker.get("expired-succeeded").is_none()); + assert!(tracker.get("expired-timeout").is_none()); + assert!(tracker.get("fresh-failed").is_some()); + assert!(tracker.get("still-running").is_some()); + } + + #[test] + fn mark_failed_by_capability_denied_sets_failed_result() { + let (tracker, mut rx) = make_tracker(); + + let job = tracker + .start_job("server-1", "1.2.3") + .expect("job should start"); + rx.try_recv().expect("start broadcast"); + + tracker.mark_failed_by_capability_denied( + UpgradeLookup::from_job(&job), + CapabilityDeniedReason::AgentCapabilityDisabled, + ); + + let current = tracker.get("server-1").expect("job should remain"); + assert_eq!(current.status, UpgradeStatus::Failed); + assert!( + current + .error + .expect("failure error should be recorded") + .contains("AgentCapabilityDisabled") + ); + assert_result( + rx.try_recv() + .expect("capability denied should broadcast result"), + "server-1", + &job.job_id, + 
"1.2.3", + UpgradeStatus::Failed, + Some(UpgradeStage::Downloading), + ); + } + + #[test] + fn snapshot_returns_upgrade_job_dtos() { + let (tracker, _rx) = make_tracker(); + + let job = tracker + .start_job("server-1", "1.2.3") + .expect("job should start"); + + let snapshot = tracker.snapshot(); + assert_eq!(snapshot.len(), 1); + assert_eq!( + snapshot[0], + UpgradeJobDto { + server_id: "server-1".into(), + job_id: job.job_id, + target_version: "1.2.3".into(), + stage: UpgradeStage::Downloading, + status: UpgradeStatus::Running, + error: None, + backup_path: None, + started_at: snapshot[0].started_at, + finished_at: None, + } + ); + } +} diff --git a/crates/server/src/state.rs b/crates/server/src/state.rs index c6327773..ee56f3e8 100644 --- a/crates/server/src/state.rs +++ b/crates/server/src/state.rs @@ -17,6 +17,7 @@ use crate::service::high_risk_audit::{ DockerLogsAuditContext, ExecAuditContext, TerminalAuditContext, }; use crate::service::task_scheduler::TaskScheduler; +use crate::service::upgrade_tracker::UpgradeJobTracker; /// Pending TOTP setup data, keyed by user_id. pub struct PendingTotp { @@ -59,6 +60,8 @@ pub struct AppState { pub task_scheduler: Arc, /// Shared alert state manager for dedup across poll-based and event-driven evaluation. pub alert_state_manager: AlertStateManager, + /// Tracks in-flight and recent agent upgrade jobs. + pub upgrade_tracker: UpgradeJobTracker, /// Pending mobile pairing codes for QR login, keyed by code. pub pending_pairs: DashMap, /// Terminal session audit contexts keyed by session_id. 
@@ -160,7 +163,7 @@ impl AppState { Ok(Arc::new(Self { db, agent_manager, - browser_tx, + browser_tx: browser_tx.clone(), config, geoip: Arc::new(std::sync::RwLock::new(geoip)), geoip_downloading: AtomicBool::new(false), @@ -172,6 +175,7 @@ impl AppState { docker_viewers: DockerViewerTracker::new(), task_scheduler, alert_state_manager, + upgrade_tracker: UpgradeJobTracker::new(browser_tx.clone()), pending_pairs: DashMap::new(), terminal_audit_contexts: DashMap::new(), docker_logs_audit_contexts: DashMap::new(), diff --git a/crates/server/src/task/mod.rs b/crates/server/src/task/mod.rs index b3a852ff..c0ada32d 100644 --- a/crates/server/src/task/mod.rs +++ b/crates/server/src/task/mod.rs @@ -6,3 +6,4 @@ pub mod record_writer; pub mod service_monitor_checker; pub mod session_cleaner; pub mod task_scheduler; +pub mod upgrade_timeout; diff --git a/crates/server/src/task/upgrade_timeout.rs b/crates/server/src/task/upgrade_timeout.rs new file mode 100644 index 00000000..691cda90 --- /dev/null +++ b/crates/server/src/task/upgrade_timeout.rs @@ -0,0 +1,26 @@ +use std::sync::Arc; +use std::time::Duration; + +use chrono::Utc; + +use crate::state::AppState; + +/// Periodically marks stuck upgrade jobs as timed out and removes expired history. 
+pub async fn run(state: Arc) { + let mut interval = tokio::time::interval(Duration::from_secs(10)); + + loop { + interval.tick().await; + + let now = Utc::now(); + let timed_out = state.upgrade_tracker.sweep_timeouts(now); + let removed = state.upgrade_tracker.cleanup_old(now); + + if !timed_out.is_empty() { + tracing::warn!("Timed out {} upgrade job(s)", timed_out.len()); + } + if removed > 0 { + tracing::debug!("Removed {removed} expired upgrade job(s)"); + } + } +} From 425990aec6b9bbd21a2fb2cce3128396e292fb67 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Tue, 14 Apr 2026 20:18:56 +0800 Subject: [PATCH 07/17] feat(server): add upgrade release lookup --- crates/server/src/config.rs | 5 +- crates/server/src/openapi.rs | 2 + crates/server/src/router/api/agent.rs | 22 +- crates/server/src/router/api/mod.rs | 1 + crates/server/src/router/api/server.rs | 47 +-- crates/server/src/service/mod.rs | 1 + crates/server/src/service/upgrade_release.rs | 298 +++++++++++++++++++ crates/server/src/state.rs | 5 + 8 files changed, 338 insertions(+), 43 deletions(-) create mode 100644 crates/server/src/service/upgrade_release.rs diff --git a/crates/server/src/config.rs b/crates/server/src/config.rs index 21f57d90..500288fe 100644 --- a/crates/server/src/config.rs +++ b/crates/server/src/config.rs @@ -1,6 +1,6 @@ use figment::{ - Figment, providers::{Env, Format, Toml}, + Figment, }; use ipnet::IpNet; use serde::Deserialize; @@ -275,6 +275,8 @@ impl Default for SchedulerConfig { pub struct UpgradeConfig { #[serde(default = "default_release_base_url")] pub release_base_url: String, + #[serde(default)] + pub latest_version_url: String, } fn default_release_base_url() -> String { @@ -285,6 +287,7 @@ impl Default for UpgradeConfig { fn default() -> Self { Self { release_base_url: default_release_base_url(), + latest_version_url: String::new(), } } } diff --git a/crates/server/src/openapi.rs b/crates/server/src/openapi.rs index fb529c2c..feb1cd57 100644 --- 
a/crates/server/src/openapi.rs +++ b/crates/server/src/openapi.rs @@ -59,6 +59,7 @@ impl Modify for SecurityAddon { crate::router::api::status::public_status, // agent crate::router::api::agent::register, + crate::router::api::agent::latest_version, // servers crate::router::api::server::list_servers, crate::router::api::server::get_server, @@ -221,6 +222,7 @@ impl Modify for SecurityAddon { crate::router::api::oauth::OAuthProvidersResponse, // agent crate::router::api::agent::RegisterResponse, + crate::service::upgrade_release::LatestAgentVersionResponse, // servers crate::router::api::server::ServerResponse, crate::router::api::server::BatchDeleteRequest, diff --git a/crates/server/src/router/api/agent.rs b/crates/server/src/router/api/agent.rs index 3f143c6e..1682a241 100644 --- a/crates/server/src/router/api/agent.rs +++ b/crates/server/src/router/api/agent.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use axum::extract::{ConnectInfo, State}; use axum::http::HeaderMap; -use axum::routing::post; +use axum::routing::{get, post}; use axum::{Json, Router}; use chrono::Utc; use sea_orm::{ @@ -19,6 +19,7 @@ use crate::router::utils::extract_client_ip; use crate::service::auth::AuthService; use crate::service::config::ConfigService; use crate::service::network_probe::NetworkProbeService; +use crate::service::upgrade_release::LatestAgentVersionResponse; use crate::state::AppState; const CONFIG_KEY_AUTO_DISCOVERY: &str = "auto_discovery_key"; @@ -41,6 +42,25 @@ pub fn public_router() -> Router> { Router::new().route("/agent/register", post(register)) } +pub fn read_router() -> Router> { + Router::new().route("/agent/latest-version", get(latest_version)) +} + +#[utoipa::path( + get, + path = "/api/agent/latest-version", + tag = "agent", + responses( + (status = 200, description = "Latest agent release metadata", body = LatestAgentVersionResponse), + ), + security(("session_cookie" = []), ("api_key" = []), ("bearer_token" = [])) +)] +pub async fn latest_version( + State(state): 
State>, +) -> Result>, AppError> { + ok(state.upgrade_release_service.latest().await) +} + #[utoipa::path( post, path = "/api/agent/register", diff --git a/crates/server/src/router/api/mod.rs b/crates/server/src/router/api/mod.rs index dd7d6ca5..0b3f3edb 100644 --- a/crates/server/src/router/api/mod.rs +++ b/crates/server/src/router/api/mod.rs @@ -47,6 +47,7 @@ pub fn router(state: Arc) -> Router> { .merge(auth::protected_router()) .merge(mobile::protected_router()) // Read-only routes accessible to all authenticated users + .merge(agent::read_router()) .merge(server::read_router()) .merge(server_group::read_router()) .merge(ping::read_router()) diff --git a/crates/server/src/router/api/server.rs b/crates/server/src/router/api/server.rs index 83d71573..06fe189f 100644 --- a/crates/server/src/router/api/server.rs +++ b/crates/server/src/router/api/server.rs @@ -568,45 +568,10 @@ async fn trigger_upgrade( format!("serverbee-agent-{os}-{arch}") }; - let base_url = &state.config.upgrade.release_base_url; - let download_url = format!("{base_url}/download/v{version}/{asset_name}"); - - // Fetch checksums.txt - let checksums_url = format!("{base_url}/download/v{version}/checksums.txt"); - let checksums_response = reqwest::get(&checksums_url) - .await - .map_err(|e| AppError::Internal(format!("Failed to fetch checksums: {e}")))?; - - if !checksums_response.status().is_success() { - return Err(AppError::NotFound(format!( - "Checksums not found for version v{version} (HTTP {})", - checksums_response.status() - ))); - } - - let checksums_body = checksums_response - .text() - .await - .map_err(|e| AppError::Internal(format!("Failed to read checksums: {e}")))?; - - // Parse: each line is " " or " " - let sha256 = checksums_body - .lines() - .find_map(|line| { - let mut parts = line.splitn(2, |c: char| c.is_whitespace()); - let hash = parts.next()?; - let name = parts.next()?.trim(); - if name == asset_name { - Some(hash.to_string()) - } else { - None - } - }) - .ok_or_else(|| { 
- AppError::NotFound(format!( - "Checksum not found for {asset_name} in v{version} release" - )) - })?; + let asset = state + .upgrade_release_service + .resolve_asset(&version, &asset_name) + .await?; let sender = state .agent_manager @@ -625,8 +590,8 @@ async fn trigger_upgrade( let msg = ServerMessage::Upgrade { version: version.to_string(), - download_url, - sha256, + download_url: asset.download_url, + sha256: asset.sha256, job_id: Some(job.job_id.clone()), }; if let Err(_send_error) = sender.send(msg).await { diff --git a/crates/server/src/service/mod.rs b/crates/server/src/service/mod.rs index f7395340..6d50b9ee 100644 --- a/crates/server/src/service/mod.rs +++ b/crates/server/src/service/mod.rs @@ -25,5 +25,6 @@ pub mod status_page; pub mod task_scheduler; pub mod traffic; pub mod upgrade_tracker; +pub mod upgrade_release; pub mod uptime; pub mod user; diff --git a/crates/server/src/service/upgrade_release.rs b/crates/server/src/service/upgrade_release.rs new file mode 100644 index 00000000..9d3b88fe --- /dev/null +++ b/crates/server/src/service/upgrade_release.rs @@ -0,0 +1,298 @@ +use std::time::{Duration, Instant}; + +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use tokio::sync::RwLock; + +use crate::config::UpgradeConfig; +use crate::error::AppError; + +const SUCCESS_CACHE_TTL: Duration = Duration::from_secs(10 * 60); +const FAILURE_CACHE_TTL: Duration = Duration::from_secs(60); + +#[derive(Debug, Clone, Serialize, Deserialize, utoipa::ToSchema)] +pub struct LatestAgentVersionResponse { + pub version: Option, + pub released_at: Option>, + pub error: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ReleaseAsset { + pub download_url: String, + pub sha256: String, +} + +#[derive(Debug, Clone)] +pub struct CachedLatestVersion { + response: LatestAgentVersionResponse, + expires_at: Instant, +} + +impl CachedLatestVersion { + pub fn success(response: LatestAgentVersionResponse) -> Self { + Self::new(response, 
SUCCESS_CACHE_TTL) + } + + pub fn failure(response: LatestAgentVersionResponse) -> Self { + Self::new(response, FAILURE_CACHE_TTL) + } + + fn new(response: LatestAgentVersionResponse, ttl: Duration) -> Self { + Self { + response, + expires_at: Instant::now() + ttl, + } + } + + pub fn ttl_remaining(&self) -> Duration { + self.expires_at.saturating_duration_since(Instant::now()) + } + + fn is_expired(&self) -> bool { + Instant::now() >= self.expires_at + } + + fn response(&self) -> LatestAgentVersionResponse { + self.response.clone() + } +} + +pub struct UpgradeReleaseService { + client: reqwest::Client, + release_base_url: String, + latest_version_url: String, + cache: RwLock>, +} + +impl UpgradeReleaseService { + pub fn new(config: &UpgradeConfig) -> Self { + Self { + client: reqwest::Client::builder() + .user_agent(concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION"))) + .build() + .unwrap_or_else(|_| reqwest::Client::new()), + release_base_url: config.release_base_url.clone(), + latest_version_url: config.latest_version_url.clone(), + cache: RwLock::new(None), + } + } + + pub async fn latest(&self) -> LatestAgentVersionResponse { + { + let cache = self.cache.read().await; + if let Some(cached) = cache.as_ref() + && !cached.is_expired() + { + return cached.response(); + } + } + + let response = self.fetch_latest().await; + let cached = if response.error.is_none() { + CachedLatestVersion::success(response.clone()) + } else { + CachedLatestVersion::failure(response.clone()) + }; + + *self.cache.write().await = Some(cached); + response + } + + pub async fn resolve_asset( + &self, + version: &str, + asset_name: &str, + ) -> Result { + let version = normalize_release_tag(version); + let download_url = format!( + "{}/download/v{version}/{asset_name}", + self.release_base_url.trim_end_matches('/') + ); + let checksums_url = format!( + "{}/download/v{version}/checksums.txt", + self.release_base_url.trim_end_matches('/') + ); + + let checksums_response = self + 
.client + .get(&checksums_url) + .send() + .await + .map_err(|e| AppError::Internal(format!("Failed to fetch checksums: {e}")))?; + + if !checksums_response.status().is_success() { + return Err(AppError::NotFound(format!( + "Checksums not found for version v{version} (HTTP {})", + checksums_response.status() + ))); + } + + let checksums_body = checksums_response + .text() + .await + .map_err(|e| AppError::Internal(format!("Failed to read checksums: {e}")))?; + + let sha256 = checksums_body + .lines() + .find_map(|line| { + let mut parts = line.split_whitespace(); + let hash = parts.next()?; + let name = parts.next()?; + if name == asset_name { + Some(hash.to_string()) + } else { + None + } + }) + .ok_or_else(|| { + AppError::NotFound(format!( + "Checksum not found for {asset_name} in v{version} release" + )) + })?; + + Ok(ReleaseAsset { + download_url, + sha256, + }) + } + + async fn fetch_latest(&self) -> LatestAgentVersionResponse { + let latest_version_url = if self.latest_version_url.trim().is_empty() { + match github_latest_release_api(&self.release_base_url) { + Some(url) => url, + None => { + return LatestAgentVersionResponse { + version: None, + released_at: None, + error: Some("Unable to derive latest-version URL from release_base_url".into()), + }; + } + } + } else { + self.latest_version_url.clone() + }; + + let response = match self.client.get(&latest_version_url).send().await { + Ok(response) => response, + Err(error) => { + return LatestAgentVersionResponse { + version: None, + released_at: None, + error: Some(format!("Failed to fetch latest version: {error}")), + }; + } + }; + + if !response.status().is_success() { + return LatestAgentVersionResponse { + version: None, + released_at: None, + error: Some(format!( + "Latest version lookup failed with HTTP {}", + response.status() + )), + }; + } + + let body = match response.text().await { + Ok(body) => body, + Err(error) => { + return LatestAgentVersionResponse { + version: None, + released_at: None, + 
error: Some(format!("Failed to read latest version response: {error}")), + }; + } + }; + + if let Ok(github_release) = serde_json::from_str::(&body) { + return LatestAgentVersionResponse { + version: Some(normalize_release_tag(&github_release.tag_name).to_string()), + released_at: github_release.published_at, + error: None, + }; + } + + match serde_json::from_str::(&body) { + Ok(mut response) => { + if let Some(version) = response.version.take() { + response.version = Some(normalize_release_tag(&version).to_string()); + } + response + } + Err(error) => LatestAgentVersionResponse { + version: None, + released_at: None, + error: Some(format!("Failed to parse latest version response: {error}")), + }, + } + } +} + +pub fn normalize_release_tag(tag: &str) -> &str { + tag.strip_prefix('v').unwrap_or(tag) +} + +pub fn github_latest_release_api(release_base_url: &str) -> Option { + let url = reqwest::Url::parse(release_base_url).ok()?; + if url.host_str()? != "github.com" { + return None; + } + + let segments: Vec<_> = url + .path_segments()? 
+ .filter(|segment| !segment.is_empty()) + .collect(); + + if segments.len() < 3 || segments[2] != "releases" { + return None; + } + + Some(format!( + "https://api.github.com/repos/{}/{}/releases/latest", + segments[0], segments[1] + )) +} + +#[derive(Debug, serde::Deserialize)] +struct GitHubLatestRelease { + tag_name: String, + #[serde(default)] + published_at: Option>, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn github_release_api_url_is_derived_from_release_base_url() { + assert_eq!( + github_latest_release_api("https://github.com/ZingerLittleBee/ServerBee/releases"), + Some("https://api.github.com/repos/ZingerLittleBee/ServerBee/releases/latest".to_string()) + ); + } + + #[test] + fn normalize_release_tag_strips_optional_v_prefix() { + assert_eq!(normalize_release_tag("v1.2.3"), "1.2.3"); + assert_eq!(normalize_release_tag("1.2.3"), "1.2.3"); + } + + #[test] + fn cache_ttl_is_longer_for_success_than_failure() { + let success = CachedLatestVersion::success(LatestAgentVersionResponse { + version: Some("1.2.3".into()), + released_at: None, + error: None, + }); + let failure = CachedLatestVersion::failure(LatestAgentVersionResponse { + version: None, + released_at: None, + error: Some("boom".into()), + }); + + assert!(success.ttl_remaining() > failure.ttl_remaining()); + } +} diff --git a/crates/server/src/state.rs b/crates/server/src/state.rs index ee56f3e8..27676b46 100644 --- a/crates/server/src/state.rs +++ b/crates/server/src/state.rs @@ -17,6 +17,7 @@ use crate::service::high_risk_audit::{ DockerLogsAuditContext, ExecAuditContext, TerminalAuditContext, }; use crate::service::task_scheduler::TaskScheduler; +use crate::service::upgrade_release::UpgradeReleaseService; use crate::service::upgrade_tracker::UpgradeJobTracker; /// Pending TOTP setup data, keyed by user_id. @@ -62,6 +63,8 @@ pub struct AppState { pub alert_state_manager: AlertStateManager, /// Tracks in-flight and recent agent upgrade jobs. 
pub upgrade_tracker: UpgradeJobTracker, + /// Resolves latest agent release metadata and assets. + pub upgrade_release_service: UpgradeReleaseService, /// Pending mobile pairing codes for QR login, keyed by code. pub pending_pairs: DashMap, /// Terminal session audit contexts keyed by session_id. @@ -149,6 +152,7 @@ impl AppState { std::env::temp_dir().join("serverbee-transfers"), )); let task_scheduler = Arc::new(TaskScheduler::new(&config.scheduler.timezone).await?); + let upgrade_release_service = UpgradeReleaseService::new(&config.upgrade); let alert_state_manager = match AlertStateManager::load_from_db(&db).await { Ok(sm) => sm, Err(e) => { @@ -176,6 +180,7 @@ impl AppState { task_scheduler, alert_state_manager, upgrade_tracker: UpgradeJobTracker::new(browser_tx.clone()), + upgrade_release_service, pending_pairs: DashMap::new(), terminal_audit_contexts: DashMap::new(), docker_logs_audit_contexts: DashMap::new(), From 5a3856c62b94f2922898648558bbe868dc99a0c7 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Tue, 14 Apr 2026 20:22:07 +0800 Subject: [PATCH 08/17] feat(agent): add upgrade execution pipeline Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus --- crates/agent/src/reporter.rs | 357 +++++++++++++++++++++++++++++++---- 1 file changed, 316 insertions(+), 41 deletions(-) diff --git a/crates/agent/src/reporter.rs b/crates/agent/src/reporter.rs index ccde6418..ce269dff 100644 --- a/crates/agent/src/reporter.rs +++ b/crates/agent/src/reporter.rs @@ -1,14 +1,15 @@ use std::net::IpAddr; use std::sync::Arc; -use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU32, Ordering}; use std::time::Duration; +use chrono::{NaiveDateTime, Utc}; use futures_util::{SinkExt, StreamExt}; use rand::Rng; use serverbee_common::constants::{ CapabilityDeniedReason, DEFAULT_COMMAND_TIMEOUT_SECS, MAX_TASK_OUTPUT_SIZE, has_capability, }; -use 
serverbee_common::protocol::{AgentMessage, ServerMessage}; +use serverbee_common::protocol::{AgentMessage, ServerMessage, UpgradeStage}; use serverbee_common::types::{NetworkInterface, NetworkProbeResultData, TracerouteHop}; use sysinfo::Networks; use tokio::sync::mpsc; @@ -29,6 +30,31 @@ const MAX_BACKOFF_SECS: u64 = 30; const JITTER_FACTOR: f64 = 0.2; const MAX_REREGISTER_ATTEMPTS: u32 = 3; const DOCKER_RETRY_SECS: u64 = 30; +const UPGRADE_DOWNLOAD_TIMEOUT_SECS: u64 = 600; +const UPGRADE_BACKUP_RETENTION_HOURS: i64 = 24; + +static UPGRADE_IN_PROGRESS: AtomicBool = AtomicBool::new(false); + +struct UpgradeFailure { + stage: UpgradeStage, + error: anyhow::Error, + backup_path: Option, +} + +impl UpgradeFailure { + fn new(stage: UpgradeStage, error: impl Into) -> Self { + Self { + stage, + error: error.into(), + backup_path: None, + } + } + + fn with_backup_path(mut self, backup_path: &std::path::Path) -> Self { + self.backup_path = Some(backup_path.display().to_string()); + self + } +} pub struct Reporter { config: AgentConfig, @@ -572,6 +598,7 @@ impl Reporter { version, download_url, sha256, + job_id, } => { let caps = capabilities.load(Ordering::SeqCst); if !has_capability(caps, CAP_UPGRADE) { @@ -591,10 +618,51 @@ impl Reporter { write.send(Message::Text(json.into())).await?; return Ok(()); } + + if UPGRADE_IN_PROGRESS + .compare_exchange(false, true, Ordering::SeqCst, Ordering::SeqCst) + .is_err() + { + tracing::warn!("Upgrade rejected: another upgrade is already running"); + emit_upgrade_failure( + cmd_result_tx, + job_id, + version, + UpgradeStage::Downloading, + "another upgrade is already running".to_string(), + None, + ) + .await; + return Ok(()); + } + tracing::info!("Upgrade requested: v{version} from {download_url}"); + let tx = cmd_result_tx.clone(); tokio::spawn(async move { - if let Err(e) = perform_upgrade(&version, &download_url, &sha256).await { - tracing::error!("Upgrade to v{version} failed: {e}"); + if let Err(failure) = perform_upgrade( + 
&version, + &download_url, + &sha256, + job_id.clone(), + tx.clone(), + ) + .await + { + tracing::error!( + "Upgrade to v{version} failed during {:?}: {}", + failure.stage, + failure.error + ); + emit_upgrade_failure( + &tx, + job_id, + version, + failure.stage, + failure.error.to_string(), + failure.backup_path, + ) + .await; + UPGRADE_IN_PROGRESS.store(false, Ordering::SeqCst); } }); } @@ -1528,9 +1596,11 @@ async fn derive_primary_ips( mod tests { use super::*; use crate::config::{CollectorConfig, FileConfig, IpChangeConfig, LogConfig}; + use chrono::{Duration as ChronoDuration, Utc}; use serverbee_common::constants::{ CAP_DEFAULT, CAP_EXEC, CAP_FILE, CAP_PING_ICMP, CapabilityDeniedReason, }; + use tempfile::tempdir; use tokio_tungstenite::tungstenite::http::Response; #[test] @@ -1736,6 +1806,55 @@ HOST: agent Loss% Snt Last Avg Best Wrst StDev let hops = parse_traceroute_output(""); assert!(hops.is_empty()); } + + #[test] + fn verify_sha256_rejects_mismatched_hash() { + let err = verify_sha256(b"serverbee", "deadbeef").expect_err("hash should mismatch"); + + assert!(err.to_string().contains("Checksum mismatch")); + } + + #[cfg(unix)] + #[test] + fn run_preflight_rejects_non_zero_exit() { + use std::os::unix::fs::PermissionsExt; + + let temp = tempdir().unwrap(); + let binary_path = temp.path().join("fake-agent"); + std::fs::write(&binary_path, "#!/bin/sh\nexit 23\n").unwrap(); + std::fs::set_permissions(&binary_path, std::fs::Permissions::from_mode(0o755)).unwrap(); + + let err = run_preflight(&binary_path).expect_err("preflight should fail"); + + assert!(err.to_string().contains("Preflight check failed")); + } + + #[test] + fn cleanup_old_backups_removes_only_stale_backup_files() { + let temp = tempdir().unwrap(); + let exe_path = temp.path().join("serverbee-agent"); + std::fs::write(&exe_path, b"current").unwrap(); + + let stale = exe_path.with_extension(format!( + "bak.{}", + (Utc::now() - ChronoDuration::hours(25)).format("%Y%m%d-%H%M%S") + )); + let fresh 
= exe_path.with_extension(format!( + "bak.{}", + (Utc::now() - ChronoDuration::hours(1)).format("%Y%m%d-%H%M%S") + )); + let unrelated = temp.path().join("other-agent.bak.20200101-000000"); + + std::fs::write(&stale, b"stale").unwrap(); + std::fs::write(&fresh, b"fresh").unwrap(); + std::fs::write(&unrelated, b"other").unwrap(); + + cleanup_old_backups(&exe_path).unwrap(); + + assert!(!stale.exists()); + assert!(fresh.exists()); + assert!(unrelated.exists()); + } } /// Fetch external IP address from a remote service. @@ -1772,23 +1891,68 @@ async fn fetch_external_ip(url: &str) -> anyhow::Result { Ok(ip) } -/// Download a new agent binary, verify checksum, replace current binary, and restart. -async fn perform_upgrade(version: &str, download_url: &str, sha256: &str) -> anyhow::Result<()> { +async fn emit_upgrade_progress( + tx: &mpsc::Sender, + job_id: Option, + version: &str, + stage: UpgradeStage, +) { + let message = AgentMessage::UpgradeProgress { + msg_id: uuid::Uuid::new_v4().to_string(), + job_id, + target_version: version.to_string(), + stage, + }; + + if tx.send(message).await.is_err() { + tracing::warn!("Failed to emit upgrade progress: channel closed"); + } +} + +async fn emit_upgrade_failure( + tx: &mpsc::Sender, + job_id: Option, + version: String, + stage: UpgradeStage, + error: String, + backup_path: Option, +) { + let message = AgentMessage::UpgradeResult { + msg_id: uuid::Uuid::new_v4().to_string(), + job_id, + target_version: version, + stage, + error, + backup_path, + }; + + if tx.send(message).await.is_err() { + tracing::warn!("Failed to emit upgrade failure: channel closed"); + } +} + +fn verify_sha256(bytes: &[u8], expected_sha256: &str) -> anyhow::Result<()> { use sha2::{Digest, Sha256}; - use std::io::Write; - // Validate URL scheme + let mut hasher = Sha256::new(); + hasher.update(bytes); + let actual = format!("{:x}", hasher.finalize()); + let expected = expected_sha256.to_ascii_lowercase(); + + if actual != expected { + 
anyhow::bail!("Checksum mismatch: expected {expected_sha256}, got {actual}"); + } + + Ok(()) +} + +async fn download_upgrade_bytes(download_url: &str) -> anyhow::Result> { if !download_url.starts_with("https://") { anyhow::bail!("Upgrade URL must use HTTPS, got: {download_url}"); } - let current_exe = std::env::current_exe()?; - let tmp_path = current_exe.with_extension("new"); - let backup_path = current_exe.with_extension("bak"); - - tracing::info!("Downloading agent v{version} from {download_url}..."); let client = reqwest::Client::builder() - .timeout(std::time::Duration::from_secs(600)) // 10 minute timeout + .timeout(Duration::from_secs(UPGRADE_DOWNLOAD_TIMEOUT_SECS)) .build()?; let response = client .get(download_url) @@ -1800,47 +1964,158 @@ async fn perform_upgrade(version: &str, download_url: &str, sha256: &str) -> any anyhow::bail!("Download failed with status {}", response.status()); } - let bytes = response.bytes().await?; - tracing::info!("Downloaded {} bytes", bytes.len()); + Ok(response.bytes().await?.to_vec()) +} - // Mandatory SHA-256 verification - let mut hasher = Sha256::new(); - hasher.update(&bytes); - let actual = format!("{:x}", hasher.finalize()); - if actual != sha256 { - anyhow::bail!("Checksum mismatch: expected {sha256}, got {actual}"); +fn set_executable_permissions(path: &std::path::Path) -> anyhow::Result<()> { + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + std::fs::set_permissions(path, std::fs::Permissions::from_mode(0o755))?; } - tracing::info!("Checksum verified"); - // Write to temporary file + #[cfg(not(unix))] { - let mut file = std::fs::File::create(&tmp_path)?; - file.write_all(&bytes)?; - file.sync_all()?; + let _ = path; } - // Set executable permission on Unix - #[cfg(unix)] - { - use std::os::unix::fs::PermissionsExt; - std::fs::set_permissions(&tmp_path, std::fs::Permissions::from_mode(0o755))?; + Ok(()) +} + +fn run_preflight(path: &std::path::Path) -> anyhow::Result<()> { + let output = 
std::process::Command::new(path).arg("--version").output()?; + + if !output.status.success() { + let stdout = String::from_utf8_lossy(&output.stdout).trim().to_string(); + let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string(); + let details = if !stderr.is_empty() { + stderr + } else if !stdout.is_empty() { + stdout + } else { + "no output".to_string() + }; + anyhow::bail!( + "Preflight check failed with status {}: {}", + output.status, + details + ); + } + + Ok(()) +} + +fn cleanup_old_backups(current_exe: &std::path::Path) -> anyhow::Result<()> { + let Some(parent) = current_exe.parent() else { + return Ok(()); + }; + let Some(name) = current_exe.file_name().and_then(|value| value.to_str()) else { + return Ok(()); + }; + + let prefix = format!("{name}.bak."); + let cutoff = Utc::now() - chrono::Duration::hours(UPGRADE_BACKUP_RETENTION_HOURS); + + for entry in std::fs::read_dir(parent)? { + let entry = entry?; + let path = entry.path(); + if !path.is_file() { + continue; + } + + let Some(file_name) = path.file_name().and_then(|value| value.to_str()) else { + continue; + }; + let Some(timestamp) = file_name.strip_prefix(&prefix) else { + continue; + }; + let Ok(parsed) = NaiveDateTime::parse_from_str(timestamp, "%Y%m%d-%H%M%S") else { + continue; + }; + + let backup_time = chrono::DateTime::::from_naive_utc_and_offset(parsed, Utc); + if backup_time < cutoff { + std::fs::remove_file(path)?; + } } - // Backup current binary and replace - if backup_path.exists() { - std::fs::remove_file(&backup_path)?; + Ok(()) +} + +/// Download a new agent binary, verify checksum, replace current binary, and restart. 
+async fn perform_upgrade( + version: &str, + download_url: &str, + sha256: &str, + job_id: Option, + tx: mpsc::Sender, +) -> Result<(), UpgradeFailure> { + use std::io::Write; + + let current_exe = std::env::current_exe() + .map_err(|error| UpgradeFailure::new(UpgradeStage::Installing, error))?; + let tmp_path = current_exe.with_extension("new"); + let backup_path = + current_exe.with_extension(format!("bak.{}", Utc::now().format("%Y%m%d-%H%M%S"))); + + emit_upgrade_progress(&tx, job_id.clone(), version, UpgradeStage::Downloading).await; + tracing::info!("Downloading agent v{version} from {download_url}..."); + let bytes = download_upgrade_bytes(download_url) + .await + .map_err(|error| UpgradeFailure::new(UpgradeStage::Downloading, error))?; + + emit_upgrade_progress(&tx, job_id.clone(), version, UpgradeStage::Verifying).await; + verify_sha256(&bytes, sha256) + .map_err(|error| UpgradeFailure::new(UpgradeStage::Verifying, error))?; + + { + let mut file = std::fs::File::create(&tmp_path) + .map_err(|error| UpgradeFailure::new(UpgradeStage::Installing, error))?; + file.write_all(&bytes) + .map_err(|error| UpgradeFailure::new(UpgradeStage::Installing, error))?; + file.sync_all() + .map_err(|error| UpgradeFailure::new(UpgradeStage::Installing, error))?; } - std::fs::rename(¤t_exe, &backup_path)?; - std::fs::rename(&tmp_path, ¤t_exe)?; + set_executable_permissions(&tmp_path) + .map_err(|error| UpgradeFailure::new(UpgradeStage::Installing, error))?; + + emit_upgrade_progress(&tx, job_id.clone(), version, UpgradeStage::PreFlight).await; + run_preflight(&tmp_path) + .map_err(|error| UpgradeFailure::new(UpgradeStage::PreFlight, error))?; + + emit_upgrade_progress(&tx, job_id.clone(), version, UpgradeStage::Installing).await; + std::fs::rename(¤t_exe, &backup_path) + .map_err(|error| UpgradeFailure::new(UpgradeStage::Installing, error))?; + + if let Err(error) = std::fs::rename(&tmp_path, ¤t_exe) { + let rollback_result = std::fs::rename(&backup_path, ¤t_exe); + let 
rollback_error = rollback_result.err(); + let install_error = if let Some(rollback_error) = rollback_error { + anyhow::anyhow!( + "Failed to install new binary: {error}; rollback also failed: {rollback_error}" + ) + } else { + anyhow::anyhow!("Failed to install new binary: {error}; restored backup") + }; + return Err(UpgradeFailure::new(UpgradeStage::Installing, install_error) + .with_backup_path(&backup_path)); + } + + cleanup_old_backups(¤t_exe).map_err(|error| { + UpgradeFailure::new(UpgradeStage::Installing, error).with_backup_path(&backup_path) + })?; + emit_upgrade_progress(&tx, job_id, version, UpgradeStage::Restarting).await; tracing::info!("Agent binary replaced. Restarting..."); - let args: Vec = std::env::args().collect(); let mut cmd = std::process::Command::new(¤t_exe); - if args.len() > 1 { - cmd.args(&args[1..]); + let args: Vec<_> = std::env::args_os().skip(1).collect(); + if !args.is_empty() { + cmd.args(args); } - cmd.spawn()?; + cmd.spawn().map_err(|error| { + UpgradeFailure::new(UpgradeStage::Restarting, error).with_backup_path(&backup_path) + })?; std::process::exit(0); } From fcdd2cad599f40646c0297505b9ad19d7b4a929f Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Tue, 14 Apr 2026 20:24:39 +0800 Subject: [PATCH 09/17] feat(web): add upgrade jobs store and hooks --- apps/web/src/hooks/use-servers-ws.test.ts | 9 + apps/web/src/hooks/use-servers-ws.ts | 83 +++++++- apps/web/src/hooks/use-upgrade-job.test.ts | 67 ++++++ apps/web/src/hooks/use-upgrade-job.ts | 33 +++ apps/web/src/lib/api-schema.ts | 11 + .../web/src/stores/upgrade-jobs-store.test.ts | 191 ++++++++++++++++++ apps/web/src/stores/upgrade-jobs-store.ts | 82 ++++++++ 7 files changed, 471 insertions(+), 5 deletions(-) create mode 100644 apps/web/src/hooks/use-upgrade-job.test.ts create mode 100644 apps/web/src/hooks/use-upgrade-job.ts create mode 100644 apps/web/src/stores/upgrade-jobs-store.test.ts create mode 100644 apps/web/src/stores/upgrade-jobs-store.ts diff 
--git a/apps/web/src/hooks/use-servers-ws.test.ts b/apps/web/src/hooks/use-servers-ws.test.ts index 87aab80c..c406ef44 100644 --- a/apps/web/src/hooks/use-servers-ws.test.ts +++ b/apps/web/src/hooks/use-servers-ws.test.ts @@ -106,3 +106,12 @@ describe('setServerCapabilities', () => { expect(result[0].effective_capabilities).toBe(0) }) }) + +describe('setServerAgentVersion', () => { + it('updates agent_version field', () => { + const prev = [makeServer({ id: 's1', agent_version: undefined })] + const result = prev.map((s) => (s.id === 's1' ? { ...s, agent_version: '1.2.3' } : s)) + + expect(result[0].agent_version).toBe('1.2.3') + }) +}) diff --git a/apps/web/src/hooks/use-servers-ws.ts b/apps/web/src/hooks/use-servers-ws.ts index 0c29158e..3b77f908 100644 --- a/apps/web/src/hooks/use-servers-ws.ts +++ b/apps/web/src/hooks/use-servers-ws.ts @@ -7,6 +7,7 @@ import type { DockerContainerStats, DockerEventInfo } from '@/routes/_authed/servers/$serverId/docker/types' +import { type UpgradeJob, useUpgradeJobsStore } from '@/stores/upgrade-jobs-store' const MAX_DOCKER_EVENTS = 100 @@ -48,7 +49,7 @@ interface ServerMetrics { } type WsMessage = - | { type: 'full_sync'; servers: ServerMetrics[] } + | { type: 'full_sync'; servers: ServerMetrics[]; upgrades?: UpgradeJob[] } | { type: 'update'; servers: ServerMetrics[] } | { type: 'server_online'; server_id: string } | { type: 'server_offline'; server_id: string } @@ -59,7 +60,7 @@ type WsMessage = agent_local_capabilities?: number | null effective_capabilities?: number | null } - | { type: 'agent_info_updated'; server_id: string; protocol_version: number } + | { type: 'agent_info_updated'; server_id: string; protocol_version: number; agent_version?: string | null } | { type: 'network_probe_update'; server_id: string; results: NetworkProbeResultData[] } | { type: 'docker_update' @@ -69,6 +70,17 @@ type WsMessage = } | { type: 'docker_event'; server_id: string; event: DockerEventInfo } | { type: 'docker_availability_changed'; 
server_id: string; available: boolean } + | { type: 'upgrade_progress'; server_id: string; job_id: string; target_version: string; stage: string } + | { + type: 'upgrade_result' + server_id: string + job_id: string + target_version: string + status: string + stage?: string + error?: string | null + backup_path?: string | null + } export type { ServerMetrics } @@ -181,6 +193,9 @@ function handleServerMetricsMessage(raw: { type: string } & Record(['servers'], msg.servers) + if (Array.isArray(raw.upgrades)) { + useUpgradeJobsStore.getState().setJobs(raw.upgrades as UpgradeJob[]) + } } else { queryClient.setQueryData(['servers'], (prev) => prev ? mergeServerUpdate(prev, msg.servers) : msg.servers @@ -241,12 +256,12 @@ function handleCapabilityMessage(raw: { type: string } & Record return } const msg = raw as WsMessage & { type: 'agent_info_updated' } - const { server_id, protocol_version } = msg + const { server_id, protocol_version, agent_version } = msg queryClient.setQueryData(['servers', server_id], (prev: Record | undefined) => - prev ? { ...prev, protocol_version } : prev + prev ? { ...prev, protocol_version, agent_version: agent_version ?? null } : prev ) queryClient.setQueryData[]>(['servers-list'], (prev) => - prev?.map((s) => (s.id === server_id ? { ...s, protocol_version } : s)) + prev?.map((s) => (s.id === server_id ? { ...s, protocol_version, agent_version: agent_version ?? 
null } : s)) ) } } @@ -333,6 +348,64 @@ function handleWsMessage(raw: unknown, queryClient: QueryClient): void { case 'docker_availability_changed': handleDockerMessage(raw, queryClient) break + case 'upgrade_progress': { + if ( + typeof raw.server_id !== 'string' || + typeof raw.job_id !== 'string' || + typeof raw.target_version !== 'string' || + typeof raw.stage !== 'string' + ) { + break + } + const { server_id, target_version, stage } = raw as { + server_id: string + job_id: string + target_version: string + stage: string + } + const existingJob = useUpgradeJobsStore.getState().getJob(server_id) + if (existingJob) { + useUpgradeJobsStore.getState().setJob(server_id, { + ...existingJob, + stage: stage as UpgradeJob['stage'], + target_version + }) + } + break + } + case 'upgrade_result': { + if ( + typeof raw.server_id !== 'string' || + typeof raw.job_id !== 'string' || + typeof raw.target_version !== 'string' || + typeof raw.status !== 'string' + ) { + break + } + const { server_id, job_id, target_version, status, stage, error, backup_path } = raw as { + server_id: string + job_id: string + target_version: string + status: string + stage?: string + error?: string | null + backup_path?: string | null + } + const existingJob = useUpgradeJobsStore.getState().getJob(server_id) + const now = new Date().toISOString() + useUpgradeJobsStore.getState().setJob(server_id, { + server_id, + job_id, + target_version, + stage: (stage as UpgradeJob['stage']) ?? existingJob?.stage ?? 'downloading', + status: status as UpgradeJob['status'], + error: error ?? null, + backup_path: backup_path ?? null, + started_at: existingJob?.started_at ?? 
now, + finished_at: now + }) + break + } default: break } diff --git a/apps/web/src/hooks/use-upgrade-job.test.ts b/apps/web/src/hooks/use-upgrade-job.test.ts new file mode 100644 index 00000000..e4e300f0 --- /dev/null +++ b/apps/web/src/hooks/use-upgrade-job.test.ts @@ -0,0 +1,67 @@ +import { beforeEach, describe, expect, it } from 'vitest' +import { type UpgradeJob, useUpgradeJobsStore } from '@/stores/upgrade-jobs-store' + +describe('useUpgradeJob store integration', () => { + beforeEach(() => { + useUpgradeJobsStore.setState({ jobs: new Map() }) + }) + + function makeJob(overrides: Partial = {}): UpgradeJob { + return { + server_id: 'server-1', + job_id: 'job-1', + target_version: '1.0.0', + stage: 'downloading', + status: 'running', + error: null, + backup_path: null, + started_at: '2024-01-01T00:00:00Z', + finished_at: null, + ...overrides + } + } + + it('returns undefined when no job exists', () => { + const job = useUpgradeJobsStore.getState().getJob('server-1') + expect(job).toBeUndefined() + }) + + it('returns job from store when job exists', () => { + const job = makeJob() + useUpgradeJobsStore.getState().setJob('server-1', job) + + const retrieved = useUpgradeJobsStore.getState().getJob('server-1') + expect(retrieved).toEqual(job) + }) + + it('updates job stage via store with different job_id', () => { + const job = makeJob({ job_id: 'job-1', stage: 'downloading' }) + useUpgradeJobsStore.getState().setJob('server-1', job) + + const updatedJob = makeJob({ job_id: 'job-2', stage: 'installing' }) + useUpgradeJobsStore.getState().setJob('server-1', updatedJob) + + const retrieved = useUpgradeJobsStore.getState().getJob('server-1') + expect(retrieved?.stage).toBe('installing') + }) + + it('updates job status via store with different job_id', () => { + const job = makeJob({ job_id: 'job-1', status: 'running' }) + useUpgradeJobsStore.getState().setJob('server-1', job) + + const updatedJob = makeJob({ job_id: 'job-2', status: 'succeeded', finished_at: 
'2024-01-01T00:01:00Z' }) + useUpgradeJobsStore.getState().setJob('server-1', updatedJob) + + const retrieved = useUpgradeJobsStore.getState().getJob('server-1') + expect(retrieved?.status).toBe('succeeded') + }) + + it('clears job via store', () => { + const job = makeJob() + useUpgradeJobsStore.getState().setJob('server-1', job) + expect(useUpgradeJobsStore.getState().getJob('server-1')).toBeDefined() + + useUpgradeJobsStore.getState().clearJob('server-1') + expect(useUpgradeJobsStore.getState().getJob('server-1')).toBeUndefined() + }) +}) diff --git a/apps/web/src/hooks/use-upgrade-job.ts b/apps/web/src/hooks/use-upgrade-job.ts new file mode 100644 index 00000000..8a3386ce --- /dev/null +++ b/apps/web/src/hooks/use-upgrade-job.ts @@ -0,0 +1,33 @@ +import { useMutation, useQueryClient } from '@tanstack/react-query' +import { api } from '@/lib/api-client' +import type { UpgradeRequest } from '@/lib/api-schema' +import { type UpgradeJob, useUpgradeJobsStore } from '@/stores/upgrade-jobs-store' + +interface UseUpgradeJobResult { + isLoading: boolean + job: UpgradeJob | null + triggerUpgrade: (version: string) => void +} + +export function useUpgradeJob(serverId: string): UseUpgradeJobResult { + const queryClient = useQueryClient() + const storeJob = useUpgradeJobsStore((state) => state.jobs.get(serverId)) + + const mutation = useMutation({ + mutationFn: (version: string) => { + const body: UpgradeRequest = { version } + return api.post(`/api/servers/${serverId}/upgrade`, body) + }, + onSuccess: () => { + queryClient.invalidateQueries({ queryKey: ['servers', serverId] }) + } + }) + + const job = storeJob ?? 
null + + return { + job, + triggerUpgrade: mutation.mutate, + isLoading: mutation.isPending + } +} diff --git a/apps/web/src/lib/api-schema.ts b/apps/web/src/lib/api-schema.ts index bee9b969..0e9b34a6 100644 --- a/apps/web/src/lib/api-schema.ts +++ b/apps/web/src/lib/api-schema.ts @@ -101,6 +101,17 @@ export type RegisterResponse = S['RegisterResponse'] export type UpgradeRequest = S['UpgradeRequest'] export type AutoDiscoveryKeyResponse = S['AutoDiscoveryKeyResponse'] +// Upgrade jobs +export type UpgradeJobDto = S['UpgradeJobDto'] +export type UpgradeStage = S['UpgradeStage'] +export type UpgradeStatus = S['UpgradeStatus'] + +export interface LatestAgentVersionResponse { + download_url: string + sha256: string + version: string +} + // Traffic (manually typed until OpenAPI types are regenerated) export interface TrafficResponse { bytes_in: number diff --git a/apps/web/src/stores/upgrade-jobs-store.test.ts b/apps/web/src/stores/upgrade-jobs-store.test.ts new file mode 100644 index 00000000..0ae1fb9d --- /dev/null +++ b/apps/web/src/stores/upgrade-jobs-store.test.ts @@ -0,0 +1,191 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest' +import { useUpgradeJobsStore } from './upgrade-jobs-store' + +describe('useUpgradeJobsStore', () => { + beforeEach(() => { + useUpgradeJobsStore.setState({ jobs: new Map() }) + vi.useFakeTimers() + }) + + afterEach(() => { + vi.runAllTimers() + vi.useRealTimers() + }) + + function makeJob( + overrides: Partial<{ + server_id: string + job_id: string + target_version: string + stage: string + status: string + error: string | null + backup_path: string | null + started_at: string + finished_at: string | null + }> = {} + ) { + return { + server_id: 'server-1', + job_id: 'job-1', + target_version: '1.0.0', + stage: 'downloading', + status: 'running', + error: null, + backup_path: null, + started_at: '2024-01-01T00:00:00Z', + finished_at: null, + ...overrides + } + } + + describe('setJob', () => { + it('adds a new job 
to the store', () => { + const job = makeJob() + useUpgradeJobsStore.getState().setJob('server-1', job) + + const storedJob = useUpgradeJobsStore.getState().jobs.get('server-1') + expect(storedJob).toEqual(job) + }) + + it('updates existing job with different job_id', () => { + const job1 = makeJob({ job_id: 'job-1', target_version: '1.0.0' }) + useUpgradeJobsStore.getState().setJob('server-1', job1) + + const job2 = makeJob({ job_id: 'job-2', target_version: '2.0.0' }) + useUpgradeJobsStore.getState().setJob('server-1', job2) + + const storedJob = useUpgradeJobsStore.getState().jobs.get('server-1') + expect(storedJob?.job_id).toBe('job-2') + expect(storedJob?.target_version).toBe('2.0.0') + }) + + it('skips update if incoming job_id equals existing job_id', () => { + const job1 = makeJob({ job_id: 'job-1', target_version: '1.0.0' }) + useUpgradeJobsStore.getState().setJob('server-1', job1) + + // Try to update with same job_id but different data + const job2 = makeJob({ job_id: 'job-1', target_version: '2.0.0' }) + useUpgradeJobsStore.getState().setJob('server-1', job2) + + const storedJob = useUpgradeJobsStore.getState().jobs.get('server-1') + // Should keep the original data (deduplication) + expect(storedJob?.target_version).toBe('1.0.0') + }) + + it('stores jobs keyed by server_id', () => { + const job1 = makeJob({ server_id: 'server-1', job_id: 'job-1' }) + const job2 = makeJob({ server_id: 'server-2', job_id: 'job-2' }) + + useUpgradeJobsStore.getState().setJob('server-1', job1) + useUpgradeJobsStore.getState().setJob('server-2', job2) + + expect(useUpgradeJobsStore.getState().jobs.get('server-1')?.job_id).toBe('job-1') + expect(useUpgradeJobsStore.getState().jobs.get('server-2')?.job_id).toBe('job-2') + }) + }) + + describe('clearJob', () => { + it('removes job for specific server', () => { + const job = makeJob() + useUpgradeJobsStore.getState().setJob('server-1', job) + + useUpgradeJobsStore.getState().clearJob('server-1') + + 
expect(useUpgradeJobsStore.getState().jobs.has('server-1')).toBe(false) + }) + + it('does nothing for non-existent server', () => { + const job = makeJob() + useUpgradeJobsStore.getState().setJob('server-1', job) + + useUpgradeJobsStore.getState().clearJob('server-nonexistent') + + expect(useUpgradeJobsStore.getState().jobs.has('server-1')).toBe(true) + }) + }) + + describe('setJobs', () => { + it('batch updates multiple jobs', () => { + const jobs = [ + makeJob({ server_id: 'server-1', job_id: 'job-1' }), + makeJob({ server_id: 'server-2', job_id: 'job-2' }) + ] + + useUpgradeJobsStore.getState().setJobs(jobs) + + expect(useUpgradeJobsStore.getState().jobs.get('server-1')?.job_id).toBe('job-1') + expect(useUpgradeJobsStore.getState().jobs.get('server-2')?.job_id).toBe('job-2') + }) + + it('replaces existing jobs with new batch', () => { + // Set initial job + useUpgradeJobsStore.getState().setJob('server-1', makeJob({ server_id: 'server-1', job_id: 'job-old' })) + + // Batch update with new jobs + const jobs = [makeJob({ server_id: 'server-2', job_id: 'job-new' })] + useUpgradeJobsStore.getState().setJobs(jobs) + + // server-1 job should be replaced (cleared) by the batch + expect(useUpgradeJobsStore.getState().jobs.has('server-1')).toBe(false) + expect(useUpgradeJobsStore.getState().jobs.get('server-2')?.job_id).toBe('job-new') + }) + }) + + describe('auto-clear finished jobs', () => { + it('auto-clears succeeded jobs after 5 seconds', () => { + const job = makeJob({ status: 'succeeded', finished_at: '2024-01-01T00:01:00Z' }) + useUpgradeJobsStore.getState().setJob('server-1', job) + + expect(useUpgradeJobsStore.getState().jobs.has('server-1')).toBe(true) + + vi.advanceTimersByTime(5000) + + expect(useUpgradeJobsStore.getState().jobs.has('server-1')).toBe(false) + }) + + it('auto-clears failed jobs after 5 seconds', () => { + const job = makeJob({ status: 'failed', error: 'Download failed', finished_at: '2024-01-01T00:01:00Z' }) + 
useUpgradeJobsStore.getState().setJob('server-1', job) + + expect(useUpgradeJobsStore.getState().jobs.has('server-1')).toBe(true) + + vi.advanceTimersByTime(5000) + + expect(useUpgradeJobsStore.getState().jobs.has('server-1')).toBe(false) + }) + + it('does not auto-clear running jobs', () => { + const job = makeJob({ status: 'running' }) + useUpgradeJobsStore.getState().setJob('server-1', job) + + vi.advanceTimersByTime(5000) + + expect(useUpgradeJobsStore.getState().jobs.has('server-1')).toBe(true) + }) + + it('does not auto-clear timeout jobs', () => { + const job = makeJob({ status: 'timeout', finished_at: '2024-01-01T00:01:00Z' }) + useUpgradeJobsStore.getState().setJob('server-1', job) + + vi.advanceTimersByTime(5000) + + expect(useUpgradeJobsStore.getState().jobs.has('server-1')).toBe(true) + }) + }) + + describe('getJob', () => { + it('returns job for specific server', () => { + const job = makeJob() + useUpgradeJobsStore.getState().setJob('server-1', job) + + const retrieved = useUpgradeJobsStore.getState().getJob('server-1') + expect(retrieved).toEqual(job) + }) + + it('returns undefined for non-existent server', () => { + const retrieved = useUpgradeJobsStore.getState().getJob('server-nonexistent') + expect(retrieved).toBeUndefined() + }) + }) +}) diff --git a/apps/web/src/stores/upgrade-jobs-store.ts b/apps/web/src/stores/upgrade-jobs-store.ts new file mode 100644 index 00000000..ab6ff868 --- /dev/null +++ b/apps/web/src/stores/upgrade-jobs-store.ts @@ -0,0 +1,82 @@ +import { create } from 'zustand' + +export type UpgradeStage = 'downloading' | 'verifying' | 'pre_flight' | 'installing' | 'restarting' + +export type UpgradeStatus = 'running' | 'succeeded' | 'failed' | 'timeout' + +export interface UpgradeJob { + backup_path: string | null + error: string | null + finished_at: string | null + job_id: string + server_id: string + stage: UpgradeStage + started_at: string + status: UpgradeStatus + target_version: string +} + +interface UpgradeJobsState { + 
clearJob: (serverId: string) => void + getJob: (serverId: string) => UpgradeJob | undefined + jobs: Map + setJob: (serverId: string, job: UpgradeJob) => void + setJobs: (jobs: UpgradeJob[]) => void +} + +const AUTO_CLEAR_DELAY = 5000 + +function isFinished(status: UpgradeStatus): boolean { + return status === 'succeeded' || status === 'failed' +} + +export const useUpgradeJobsStore = create()((set, get) => ({ + jobs: new Map(), + + setJob: (serverId: string, job: UpgradeJob) => { + set((state) => { + const existingJob = state.jobs.get(serverId) + + if (existingJob && existingJob.job_id === job.job_id) { + return { jobs: state.jobs } + } + + const newJobs = new Map(state.jobs) + newJobs.set(serverId, job) + + if (isFinished(job.status)) { + setTimeout(() => { + get().clearJob(serverId) + }, AUTO_CLEAR_DELAY) + } + + return { jobs: newJobs } + }) + }, + + clearJob: (serverId: string) => { + set((state) => { + const newJobs = new Map(state.jobs) + newJobs.delete(serverId) + return { jobs: newJobs } + }) + }, + + setJobs: (jobs: UpgradeJob[]) => { + const newJobs = new Map() + for (const job of jobs) { + newJobs.set(job.server_id, job) + + if (isFinished(job.status)) { + setTimeout(() => { + get().clearJob(job.server_id) + }, AUTO_CLEAR_DELAY) + } + } + set({ jobs: newJobs }) + }, + + getJob: (serverId: string) => { + return get().jobs.get(serverId) + } +})) From 559b8157238bc9ea32c4d86ac9b1659acc796c17 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Tue, 14 Apr 2026 20:28:11 +0800 Subject: [PATCH 10/17] docs: add agent upgrade documentation and QA checklist --- ENV.md | 1 + apps/docs/content/docs/cn/configuration.mdx | 7 + apps/docs/content/docs/en/configuration.mdx | 2 + .../server/agent-version-section.test.tsx | 285 ++++++++++++++++++ .../server/agent-version-section.tsx | 184 +++++++++++ .../web/src/components/server/server-card.tsx | 8 +- .../components/server/upgrade-job-badge.tsx | 54 ++++ apps/web/src/locales/en/servers.json | 18 +- 
apps/web/src/locales/zh/servers.json | 18 +- apps/web/src/routes/_authed/servers/$id.tsx | 26 +- apps/web/src/routes/_authed/servers/index.tsx | 14 + tests/README.md | 1 + tests/agent-upgrade.md | 181 +++++++++++ 13 files changed, 790 insertions(+), 9 deletions(-) create mode 100644 apps/web/src/components/server/agent-version-section.test.tsx create mode 100644 apps/web/src/components/server/agent-version-section.tsx create mode 100644 apps/web/src/components/server/upgrade-job-badge.tsx create mode 100644 tests/agent-upgrade.md diff --git a/ENV.md b/ENV.md index 899d6da5..230b0c29 100644 --- a/ENV.md +++ b/ENV.md @@ -99,6 +99,7 @@ These variables are for local repo tooling and development workflows. They are n | `SERVERBEE_RATE_LIMIT__LOGIN_MAX` | `rate_limit.login_max` | u32 | `5` | Maximum login attempts per IP within 15-minute window | | `SERVERBEE_RATE_LIMIT__REGISTER_MAX` | `rate_limit.register_max` | u32 | `3` | Maximum agent registrations per IP within 15-minute window | | `SERVERBEE_UPGRADE__RELEASE_BASE_URL` | `upgrade.release_base_url` | string | `https://github.com/ZingerLittleBee/ServerBee/releases` | Base URL for agent upgrade release assets | +| `SERVERBEE_UPGRADE__LATEST_VERSION_URL` | `upgrade.latest_version_url` | string | `""` | Optional custom URL for latest version API. 
If empty, uses GitHub API | | `SERVERBEE_FILE__MAX_UPLOAD_SIZE` | `file.max_upload_size` | u64 | `104857600` (100 MB) | Maximum file upload size in bytes | ## Agent diff --git a/apps/docs/content/docs/cn/configuration.mdx b/apps/docs/content/docs/cn/configuration.mdx index b7394a1a..cf753963 100644 --- a/apps/docs/content/docs/cn/configuration.mdx +++ b/apps/docs/content/docs/cn/configuration.mdx @@ -117,6 +117,7 @@ ServerBee 使用 [figment](https://github.com/SergioBenitez/Figment) 库加载 | `SERVERBEE_RATE_LIMIT__LOGIN_MAX` | `5` | 15 分钟内每 IP 最大登录尝试次数 | | `SERVERBEE_RATE_LIMIT__REGISTER_MAX` | `3` | 15 分钟内每 IP 最大 Agent 注册次数 | | `SERVERBEE_UPGRADE__RELEASE_BASE_URL` | `https://github.com/ZingerLittleBee/ServerBee/releases` | Agent 升级 Release 资产的基础 URL | +| `SERVERBEE_UPGRADE__LATEST_VERSION_URL` | `""` | 可选的自定义最新版本 API URL,留空则使用 GitHub API | | `SERVERBEE_FILE__MAX_UPLOAD_SIZE` | `104857600` | 文件上传最大大小(字节),默认 100 MB | ### Agent 环境变量 @@ -309,6 +310,12 @@ file = "" # 默认: "https://github.com/ZingerLittleBee/ServerBee/releases" release_base_url = "https://github.com/ZingerLittleBee/ServerBee/releases" +# 可选的自定义最新版本 API URL +# 留空则使用 GitHub API 查询最新版本 +# 用于自定义版本发布渠道或私有镜像源 +# 默认: "" +latest_version_url = "" + # --- 文件上传配置 --- [file] # 文件上传最大大小(字节) diff --git a/apps/docs/content/docs/en/configuration.mdx b/apps/docs/content/docs/en/configuration.mdx index ad158a53..cf4a5f33 100644 --- a/apps/docs/content/docs/en/configuration.mdx +++ b/apps/docs/content/docs/en/configuration.mdx @@ -111,6 +111,7 @@ These variables are intentionally scoped to local tooling. 
`ALLOW_WRITES` is not | `SERVERBEE_RATE_LIMIT__LOGIN_MAX` | `5` | Max login attempts per IP within 15-minute window | | `SERVERBEE_RATE_LIMIT__REGISTER_MAX` | `3` | Max agent registrations per IP within 15-minute window | | `SERVERBEE_UPGRADE__RELEASE_BASE_URL` | `https://github.com/ZingerLittleBee/ServerBee/releases` | Base URL for agent upgrade release assets | +| `SERVERBEE_UPGRADE__LATEST_VERSION_URL` | `""` | Optional custom URL for latest version API. If empty, uses GitHub API | | `SERVERBEE_FILE__MAX_UPLOAD_SIZE` | `104857600` | Maximum file upload size in bytes (default 100 MB) | ### Agent Environment Variables @@ -267,6 +268,7 @@ The log level can also be set via the `RUST_LOG` environment variable, which tak | Key | Type | Default | Description | |-----|------|---------|-------------| | `release_base_url` | string | `"https://github.com/ZingerLittleBee/ServerBee/releases"` | Base URL for agent upgrade release assets. The server appends `/download/v{version}/` to construct the asset download URL | +| `latest_version_url` | string | `""` | Optional custom URL for latest version API. If empty, the server queries GitHub API to determine the latest version. 
Use this to override with a custom version endpoint | ### `[file]` -- File Upload (Server-side) diff --git a/apps/web/src/components/server/agent-version-section.test.tsx b/apps/web/src/components/server/agent-version-section.test.tsx new file mode 100644 index 00000000..550054cb --- /dev/null +++ b/apps/web/src/components/server/agent-version-section.test.tsx @@ -0,0 +1,285 @@ +import { fireEvent, render, screen } from '@testing-library/react' +import { beforeEach, describe, expect, it, vi } from 'vitest' +import { AgentVersionSection } from './agent-version-section' + +vi.mock('react-i18next', () => ({ + useTranslation: () => ({ + t: (key: string) => key + }) +})) + +const mockTriggerUpgrade = vi.fn() +const mockUseUpgradeJob = vi.fn() +vi.mock('@/hooks/use-upgrade-job', () => ({ + useUpgradeJob: (serverId: string) => mockUseUpgradeJob(serverId) +})) + +const mockUseAuth = vi.fn() +vi.mock('@/hooks/use-auth', () => ({ + useAuth: () => mockUseAuth() +})) + +const mockGetEffectiveCapabilityEnabled = vi.fn() +vi.mock('@/lib/capabilities', () => ({ + CAP_UPGRADE: 4, + getEffectiveCapabilityEnabled: (...args: unknown[]) => mockGetEffectiveCapabilityEnabled(...args) +})) + +describe('AgentVersionSection', () => { + beforeEach(() => { + vi.clearAllMocks() + mockUseAuth.mockReturnValue({ user: { role: 'admin' } }) + mockUseUpgradeJob.mockReturnValue({ + job: null, + triggerUpgrade: mockTriggerUpgrade, + isLoading: false + }) + mockGetEffectiveCapabilityEnabled.mockReturnValue(true) + }) + + it('renders current agent version', () => { + render( + + ) + expect(screen.getByText('v1.2.3')).toBeDefined() + }) + + it('shows unknown version when agentVersion is null', () => { + render( + + ) + expect(screen.getByText('vunknown')).toBeDefined() + }) + + it('shows update available badge when versions differ', () => { + render( + + ) + expect(screen.getByText(/upgrade_latest_version/)).toBeDefined() + }) + + it('shows upgrade button for admin when update available and capability 
enabled', () => { + render( + + ) + expect(screen.getByText('upgrade_start')).toBeDefined() + }) + + it('does not show upgrade button for non-admin users', () => { + mockUseAuth.mockReturnValue({ user: { role: 'member' } }) + render( + + ) + expect(screen.queryByText('upgrade_start')).toBeNull() + }) + + it('does not show upgrade button when capability is disabled', () => { + mockGetEffectiveCapabilityEnabled.mockReturnValue(false) + render( + + ) + expect(screen.queryByText('upgrade_start')).toBeNull() + }) + + it('shows disabled message for admin when capability is disabled', () => { + mockGetEffectiveCapabilityEnabled.mockReturnValue(false) + render( + + ) + expect(screen.getByText('cap_disabled')).toBeDefined() + }) + + it('triggers upgrade when button clicked', () => { + render( + + ) + const button = screen.getByText('upgrade_start') + fireEvent.click(button) + expect(mockTriggerUpgrade).toHaveBeenCalledWith('1.3.0') + }) + + it('shows stepper when upgrade is running', () => { + mockUseUpgradeJob.mockReturnValue({ + job: { + backup_path: null, + error: null, + finished_at: null, + job_id: 'job-1', + server_id: 'srv-1', + stage: 'downloading', + started_at: new Date().toISOString(), + status: 'running', + target_version: '1.3.0' + }, + triggerUpgrade: mockTriggerUpgrade, + isLoading: false + }) + render( + + ) + expect(screen.getByText('upgrade_in_progress')).toBeDefined() + // The stage name appears in both the badge and stepper, so check for multiple occurrences + const stageElements = screen.getAllByText('upgrade_stage_downloading') + expect(stageElements.length).toBeGreaterThanOrEqual(1) + }) + + it('shows success state when upgrade succeeded', () => { + mockUseUpgradeJob.mockReturnValue({ + job: { + backup_path: null, + error: null, + finished_at: new Date().toISOString(), + job_id: 'job-1', + server_id: 'srv-1', + stage: 'restarting', + started_at: new Date().toISOString(), + status: 'succeeded', + target_version: '1.3.0' + }, + triggerUpgrade: 
mockTriggerUpgrade, + isLoading: false + }) + render( + + ) + expect(screen.getByText('upgrade_status_succeeded')).toBeDefined() + }) + + it('shows failed state with error message', () => { + mockUseUpgradeJob.mockReturnValue({ + job: { + backup_path: '/tmp/backup', + error: 'Download failed: connection timeout', + finished_at: new Date().toISOString(), + job_id: 'job-1', + server_id: 'srv-1', + stage: 'downloading', + started_at: new Date().toISOString(), + status: 'failed', + target_version: '1.3.0' + }, + triggerUpgrade: mockTriggerUpgrade, + isLoading: false + }) + render( + + ) + expect(screen.getByText('upgrade_status_failed')).toBeDefined() + expect(screen.getByText('Download failed: connection timeout')).toBeDefined() + expect(screen.getByText(/upgrade_error_with_backup/)).toBeDefined() + }) + + it('shows timeout state with backup path', () => { + mockUseUpgradeJob.mockReturnValue({ + job: { + backup_path: '/opt/serverbee/backups/agent.bak', + error: null, + finished_at: new Date().toISOString(), + job_id: 'job-1', + server_id: 'srv-1', + stage: 'installing', + started_at: new Date().toISOString(), + status: 'timeout', + target_version: '1.3.0' + }, + triggerUpgrade: mockTriggerUpgrade, + isLoading: false + }) + render( + + ) + expect(screen.getByText('upgrade_status_timeout')).toBeDefined() + expect(screen.getByText(/upgrade_backup_path/)).toBeDefined() + }) + + it('disables upgrade button while loading', () => { + mockUseUpgradeJob.mockReturnValue({ + job: null, + triggerUpgrade: mockTriggerUpgrade, + isLoading: true + }) + render( + + ) + const button = screen.getByRole('button') + expect(button).toBeDisabled() + }) +}) diff --git a/apps/web/src/components/server/agent-version-section.tsx b/apps/web/src/components/server/agent-version-section.tsx new file mode 100644 index 00000000..72fd0745 --- /dev/null +++ b/apps/web/src/components/server/agent-version-section.tsx @@ -0,0 +1,184 @@ +import { CheckCircle2, CircleAlert, Clock, Download, Loader2, 
RefreshCw, ShieldCheck, Wrench } from 'lucide-react' +import { useTranslation } from 'react-i18next' +import { Badge } from '@/components/ui/badge' +import { Button } from '@/components/ui/button' +import { Card, CardContent, CardDescription, CardHeader, CardTitle } from '@/components/ui/card' +import { useAuth } from '@/hooks/use-auth' +import { useUpgradeJob } from '@/hooks/use-upgrade-job' +import { CAP_UPGRADE, getEffectiveCapabilityEnabled } from '@/lib/capabilities' +import type { UpgradeJob, UpgradeStage } from '@/stores/upgrade-jobs-store' + +interface AgentVersionSectionProps { + agentVersion: string | null | undefined + configuredCapabilities?: number | null + effectiveCapabilities?: number | null + latestVersion: string | null | undefined + serverId: string +} + +const STAGE_ORDER: UpgradeStage[] = ['downloading', 'verifying', 'pre_flight', 'installing', 'restarting'] + +const STAGE_ICONS: Record = { + downloading: Download, + verifying: ShieldCheck, + pre_flight: Wrench, + installing: RefreshCw, + restarting: RefreshCw +} + +function getStageProgress(stage: UpgradeStage): number { + const index = STAGE_ORDER.indexOf(stage) + return ((index + 1) / STAGE_ORDER.length) * 100 +} + +function UpgradeStepper({ job, t }: { job: UpgradeJob; t: (key: string) => string }) { + const currentIndex = STAGE_ORDER.indexOf(job.stage) + + return ( +
+
+ + {t('upgrade_in_progress')} + {t(`upgrade_stage_${job.stage}`)} +
+ +
+
+
+
+ {STAGE_ORDER.map((stage, index) => { + const Icon = STAGE_ICONS[stage] + const isActive = index <= currentIndex + const isCurrent = index === currentIndex + + return ( +
+
+ +
+ {t(`upgrade_stage_${stage}`)} +
+ ) + })} +
+
+
+ ) +} + +function UpgradeSuccess({ job, t }: { job: UpgradeJob; t: (key: string) => string }) { + return ( +
+ +
+

{t('upgrade_status_succeeded')}

+

+ {t('upgrade_current_version')}: v{job.target_version} +

+
+
+ ) +} + +function UpgradeFailed({ job, t }: { job: UpgradeJob; t: (key: string) => string }) { + return ( +
+ +
+

{t('upgrade_status_failed')}

+ {job.error &&

{job.error}

} + {job.backup_path && ( +

+ {t('upgrade_error_with_backup')}: {job.backup_path} +

+ )} +
+
+ ) +} + +function UpgradeTimeout({ job, t }: { job: UpgradeJob; t: (key: string) => string }) { + return ( +
+ +
+

{t('upgrade_status_timeout')}

+ {job.backup_path && ( +

+ {t('upgrade_backup_path')}: {job.backup_path} +

+ )} +
+
+ ) +} + +export function AgentVersionSection({ + agentVersion, + latestVersion, + serverId, + effectiveCapabilities, + configuredCapabilities +}: AgentVersionSectionProps) { + const { t } = useTranslation('servers') + const { user } = useAuth() + const { job, triggerUpgrade, isLoading } = useUpgradeJob(serverId) + + const isAdmin = user?.role === 'admin' + const upgradeEnabled = getEffectiveCapabilityEnabled(effectiveCapabilities, configuredCapabilities, CAP_UPGRADE) + + const hasUpdate = latestVersion && agentVersion && latestVersion !== agentVersion + const canUpgrade = isAdmin && upgradeEnabled && hasUpdate && (!job || job.status !== 'running') + + const handleUpgrade = () => { + if (latestVersion) { + triggerUpgrade(latestVersion) + } + } + + return ( + + + {t('cap_upgrade')} + {t('upgrade_current_version')} + + +
+
+
+ v{agentVersion || 'unknown'} + {hasUpdate && ( + + {t('upgrade_latest_version')}: v{latestVersion} + + )} +
+ {!upgradeEnabled && isAdmin &&

{t('cap_disabled')}

} +
+ + {canUpgrade && ( + + )} +
+ + {job?.status === 'running' && } + {job?.status === 'succeeded' && } + {job?.status === 'failed' && } + {job?.status === 'timeout' && } +
+
+ ) +} diff --git a/apps/web/src/components/server/server-card.tsx b/apps/web/src/components/server/server-card.tsx index 3ec9c7fc..d42d90c3 100644 --- a/apps/web/src/components/server/server-card.tsx +++ b/apps/web/src/components/server/server-card.tsx @@ -12,8 +12,10 @@ import { useTrafficOverview } from '@/hooks/use-traffic-overview' import { getLatencyBarColor, isLatencyFailure } from '@/lib/network-latency-constants' import { latencyColorClass } from '@/lib/network-types' import { countryCodeToFlag, formatBytes, formatSpeed, formatUptime } from '@/lib/utils' +import { useUpgradeJobsStore } from '@/stores/upgrade-jobs-store' import { buildServerCardNetworkState, type ServerCardMetricPoint } from './server-card-network-data' import { StatusBadge } from './status-badge' +import { UpgradeJobBadge } from './upgrade-job-badge' interface ServerCardProps { server: ServerMetrics @@ -167,6 +169,7 @@ const ServerCardInner = ({ server }: ServerCardProps) => { const { data: networkOverview = [] } = useNetworkOverview() const { data: realtimeData } = useNetworkRealtime(server.id) const { data: trafficOverview } = useTrafficOverview() + const upgradeJob = useUpgradeJobsStore((state) => state.jobs.get(server.id)) const memoryPct = server.mem_total > 0 ? (server.mem_used / server.mem_total) * 100 : 0 const diskPct = server.disk_total > 0 ? (server.disk_used / server.disk_total) * 100 : 0 @@ -213,7 +216,10 @@ const ServerCardInner = ({ server }: ServerCardProps) => { )}

{server.name}

- +
+ + +
diff --git a/apps/web/src/components/server/upgrade-job-badge.tsx b/apps/web/src/components/server/upgrade-job-badge.tsx new file mode 100644 index 00000000..ae6d8fc7 --- /dev/null +++ b/apps/web/src/components/server/upgrade-job-badge.tsx @@ -0,0 +1,54 @@ +import { CircleAlert, Clock, Loader2 } from 'lucide-react' +import { useTranslation } from 'react-i18next' +import { Badge } from '@/components/ui/badge' +import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from '@/components/ui/tooltip' +import type { UpgradeJob, UpgradeStatus } from '@/stores/upgrade-jobs-store' + +interface UpgradeJobBadgeProps { + job: UpgradeJob | null | undefined +} + +const STATUS_CONFIG: Record< + UpgradeStatus, + { icon: typeof Loader2; label: string; variant: 'default' | 'secondary' | 'destructive' | 'outline' } +> = { + running: { icon: Loader2, label: 'upgrade_status_running', variant: 'secondary' }, + succeeded: { icon: Loader2, label: 'upgrade_status_succeeded', variant: 'default' }, + failed: { icon: CircleAlert, label: 'upgrade_status_failed', variant: 'destructive' }, + timeout: { icon: Clock, label: 'upgrade_status_timeout', variant: 'outline' } +} + +export function UpgradeJobBadge({ job }: UpgradeJobBadgeProps) { + const { t } = useTranslation('servers') + + if (!job) { + return null + } + + const config = STATUS_CONFIG[job.status] + const Icon = config.icon + + return ( + + + + + + {job.status === 'running' && t(`upgrade_stage_${job.stage}`)} + + + +
+

{t(config.label)}

+ {job.target_version && ( +

+ v{job.target_version} + {job.status === 'running' && ` (${t(`upgrade_stage_${job.stage}`)})`} +

+ )} +
+
+
+
+ ) +} diff --git a/apps/web/src/locales/en/servers.json b/apps/web/src/locales/en/servers.json index 211cd58a..929c8a6a 100644 --- a/apps/web/src/locales/en/servers.json +++ b/apps/web/src/locales/en/servers.json @@ -179,5 +179,21 @@ "toast_deleted": "Deleted {{count}} server(s)", "toast_batch_delete_failed": "Operation failed", - "current_targets": "Current targets" + "current_targets": "Current targets", + + "upgrade_current_version": "Current Version", + "upgrade_latest_version": "Latest", + "upgrade_start": "Upgrade", + "upgrade_in_progress": "Upgrade in progress", + "upgrade_stage_downloading": "Downloading", + "upgrade_stage_verifying": "Verifying", + "upgrade_stage_pre_flight": "Pre-flight", + "upgrade_stage_installing": "Installing", + "upgrade_stage_restarting": "Restarting", + "upgrade_status_running": "Upgrading", + "upgrade_status_succeeded": "Upgrade completed", + "upgrade_status_failed": "Upgrade failed", + "upgrade_status_timeout": "Upgrade timed out", + "upgrade_error_with_backup": "Backup saved at", + "upgrade_backup_path": "Backup location" } diff --git a/apps/web/src/locales/zh/servers.json b/apps/web/src/locales/zh/servers.json index 9239753d..6add9327 100644 --- a/apps/web/src/locales/zh/servers.json +++ b/apps/web/src/locales/zh/servers.json @@ -179,5 +179,21 @@ "toast_deleted": "已删除 {{count}} 台服务器", "toast_batch_delete_failed": "操作失败", - "current_targets": "当前目标" + "current_targets": "当前目标", + + "upgrade_current_version": "当前版本", + "upgrade_latest_version": "最新", + "upgrade_start": "升级", + "upgrade_in_progress": "升级进行中", + "upgrade_stage_downloading": "下载中", + "upgrade_stage_verifying": "验证中", + "upgrade_stage_pre_flight": "预检中", + "upgrade_stage_installing": "安装中", + "upgrade_stage_restarting": "重启中", + "upgrade_status_running": "升级中", + "upgrade_status_succeeded": "升级完成", + "upgrade_status_failed": "升级失败", + "upgrade_status_timeout": "升级超时", + "upgrade_error_with_backup": "备份保存在", + "upgrade_backup_path": "备份位置" } diff --git 
a/apps/web/src/routes/_authed/servers/$id.tsx b/apps/web/src/routes/_authed/servers/$id.tsx index 2950671d..4f77765a 100644 --- a/apps/web/src/routes/_authed/servers/$id.tsx +++ b/apps/web/src/routes/_authed/servers/$id.tsx @@ -3,6 +3,7 @@ import { createFileRoute, Link } from '@tanstack/react-router' import { ArrowLeft, BarChart3, Container, CreditCard, FileText, Pencil, Terminal as TerminalIcon } from 'lucide-react' import { useMemo, useState } from 'react' import { useTranslation } from 'react-i18next' +import { AgentVersionSection } from '@/components/server/agent-version-section' import { CapabilitiesDialog } from '@/components/server/capabilities-dialog' import { DiskIoChart } from '@/components/server/disk-io-chart' import { MetricsChart } from '@/components/server/metrics-chart' @@ -11,6 +12,7 @@ import { StatusBadge } from '@/components/server/status-badge' import { TrafficCard } from '@/components/server/traffic-card' import { TrafficProgress } from '@/components/server/traffic-progress' import { TrafficTab } from '@/components/server/traffic-tab' +import { UpgradeJobBadge } from '@/components/server/upgrade-job-badge' import { Button } from '@/components/ui/button' import { Skeleton } from '@/components/ui/skeleton' import { Tabs, TabsContent, TabsList, TabsTrigger } from '@/components/ui/tabs' @@ -21,6 +23,7 @@ import type { ServerMetrics } from '@/hooks/use-servers-ws' import { api } from '@/lib/api-client' import type { ServerResponse } from '@/lib/api-schema' import { CAP_DOCKER, CAP_FILE, CAP_TERMINAL, getEffectiveCapabilityEnabled } from '@/lib/capabilities' +import { useUpgradeJobsStore } from '@/stores/upgrade-jobs-store' import { buildMergedDiskIoSeries, buildPerDiskIoSeries } from '@/lib/disk-io' import { cn, countryCodeToFlag, formatBytes } from '@/lib/utils' import { computeAggregateUptime } from '@/lib/widget-helpers' @@ -516,6 +519,8 @@ export function ServerDetailPage() { CAP_DOCKER ) + const upgradeJob = useUpgradeJobsStore((state) => 
state.jobs.get(id)) + // Network cumulative traffic from live data const liveNetIn = liveData?.net_in_transfer ?? 0 const liveNetOut = liveData?.net_out_transfer ?? 0 @@ -533,11 +538,12 @@ export function ServerDetailPage() {
-
- {flag && {flag}} -

{server.name}

- -
+
+ {flag && {flag}} +

{server.name}

+ + +
- + + + {t('metrics_tab')} {server.billing_cycle && ( diff --git a/apps/web/src/routes/_authed/servers/index.tsx b/apps/web/src/routes/_authed/servers/index.tsx index 4b83df97..7658a3e6 100644 --- a/apps/web/src/routes/_authed/servers/index.tsx +++ b/apps/web/src/routes/_authed/servers/index.tsx @@ -15,6 +15,7 @@ import { toast } from 'sonner' import { ServerCard } from '@/components/server/server-card' import { ServerEditDialog } from '@/components/server/server-edit-dialog' import { StatusBadge } from '@/components/server/status-badge' +import { UpgradeJobBadge } from '@/components/server/upgrade-job-badge' import { AlertDialog, AlertDialogAction, @@ -37,6 +38,12 @@ import { api } from '@/lib/api-client' import type { ServerGroup } from '@/lib/api-schema' import { countCleanupCandidates } from '@/lib/orphan-server-utils' import { cn, countryCodeToFlag, formatBytes, formatSpeed, formatUptime } from '@/lib/utils' +import { useUpgradeJobsStore } from '@/stores/upgrade-jobs-store' + +function UpgradeBadgeCell({ serverId }: { serverId: string }) { + const job = useUpgradeJobsStore((state) => state.jobs.get(serverId)) + return +} export const Route = createFileRoute('/_authed/servers/')({ component: ServersListPage, @@ -129,6 +136,13 @@ function ServersListPage() { header: ({ column }) => , cell: ({ row }) => }, + { + id: 'upgrade', + enableSorting: false, + header: () => null, + cell: ({ row }) => , + meta: { className: 'w-10' } + }, { accessorKey: 'cpu', header: ({ column }) => , diff --git a/tests/README.md b/tests/README.md index 755b3541..04fad28a 100644 --- a/tests/README.md +++ b/tests/README.md @@ -57,6 +57,7 @@ docker compose up -d | [terminal.md](terminal.md) | Web 终端 | `/terminal/:serverId` | | [performance.md](performance.md) | 前端性能测试 | `/servers/:id` (realtime) | | [mobile-ios.md](mobile-ios.md) | iOS 移动端 & Mobile API | `/api/mobile/*`, `/settings/mobile-devices`, iOS App | +| [agent-upgrade.md](agent-upgrade.md) | Agent 自动升级 | `/servers/:id` (Upgrade 
button) | ## 页面渲染快速验证 diff --git a/tests/agent-upgrade.md b/tests/agent-upgrade.md new file mode 100644 index 00000000..5e2cb771 --- /dev/null +++ b/tests/agent-upgrade.md @@ -0,0 +1,181 @@ +# Agent Upgrade QA Checklist + +Tests the agent self-upgrade feature from the server detail page. + +**Prerequisites:** +- Server and agent running (see [README.md](README.md) for setup) +- Agent has `CAP_UPGRADE` capability enabled (default for new registrations) +- Admin user logged in + +--- + +## Test Cases + +### 1. Trigger Upgrade from Server Detail Page + +**Steps:** +1. Navigate to `/servers/:id` (server detail page) +2. Click the "Upgrade Agent" button in the server header/actions area +3. Confirm the upgrade dialog + +**Expected:** +- Upgrade dialog shows current version and confirms upgrade action +- After confirmation, upgrade progress panel appears +- WebSocket connection remains active during upgrade + +--- + +### 2. Verify Progress Stages + +**Steps:** +1. Trigger an upgrade (see Test 1) +2. Observe the progress indicator during upgrade + +**Expected:** +- Progress stages appear in sequence: + - `downloading` -- Downloading new binary from release URL + - `verifying` -- Verifying SHA-256 checksum + - `pre_flight` -- Running pre-flight check (`--version` probe) + - `installing` -- Installing new binary + - `restarting` -- Restarting agent process +- Each stage shows appropriate status icon and message +- Stage indicator advances through stages + +--- + +### 3. Verify Success State and Version Update + +**Steps:** +1. Wait for upgrade to complete (typically 10-30 seconds) +2. Observe final status + +**Expected:** +- Success message displayed: "Upgrade completed" +- New version number shown matches target version +- Agent reconnects automatically after restart +- Server detail page shows updated version in header +- No manual refresh required -- updates via WebSocket + +--- + +### 4. Verify Failed State with Error Message + +**Steps:** +1. 
Configure an invalid `release_base_url` in server config (temporarily) +2. Trigger upgrade +3. Wait for failure + +**Expected:** +- Error state displayed with specific error message +- Backup path shown if backup was created before failure +- Retry button available to attempt upgrade again +- Agent remains in working state (rollback successful) + +--- + +### 5. Verify Timeout Handling + +**Steps:** +1. Trigger upgrade +2. Simulate network issues or use very slow connection +3. Wait for timeout (default 5 minutes) + +**Expected:** +- Timeout error displayed after configured timeout period +- Upgrade marked as failed +- Agent continues running existing version +- No partial installation corruption + +--- + +### 6. Verify Concurrent Upgrade Rejection + +**Steps:** +1. Start an upgrade on server A +2. While upgrade is in progress, attempt to start upgrade on server B +3. Or rapidly click upgrade button multiple times on same server + +**Expected:** +- Second upgrade attempt rejected with "Upgrade already in progress" message +- UI prevents concurrent upgrade initiation +- First upgrade continues unaffected + +--- + +### 7. Verify Admin-Only Access Control + +**Steps:** +1. Log in as Member (non-admin) user +2. Navigate to server detail page +3. Attempt to trigger upgrade + +**Expected:** +- "Upgrade Agent" button is hidden or disabled +- Direct API call returns 403 Forbidden: + ```bash + curl -X POST http://localhost:9527/api/servers/:id/upgrade \ + -H "Authorization: Bearer $MEMBER_TOKEN" \ + -d '{"version":"latest"}' + # Expected: {"error":"Admin access required"} + ``` + +--- + +### 8. Test WebSocket Real-Time Updates + +**Steps:** +1. Open browser DevTools Network tab +2. Connect to WebSocket `/api/ws/servers` +3. Trigger upgrade from another browser/session +4. 
Monitor WebSocket messages + +**Expected:** +- `CapabilitiesChanged` message received when upgrade starts (capability temporarily disabled) +- `ServerUpdate` messages with upgrade progress in payload +- `CapabilitiesChanged` message received when upgrade completes (capability re-enabled) +- UI updates in real-time without page refresh + +--- + +### 9. Verify Capability Check + +**Steps:** +1. Disable `CAP_UPGRADE` on a server (via database or API) +2. Navigate to that server's detail page + +**Expected:** +- "Upgrade Agent" button is hidden +- Upgrade option not available in UI +- Attempting upgrade via API returns capability error + +--- + +### 10. Rollback Verification + +**Steps:** +1. Trigger upgrade +2. During `installing` stage, force agent disconnect (kill process) +3. Restart agent manually + +**Expected:** +- Agent starts with previous version (backup restored) +- Server detects version mismatch on reconnect +- Upgrade can be retried + +--- + +## API Endpoints + +| Endpoint | Method | Auth | Description | +|----------|--------|------|-------------| +| `/api/servers/:id/upgrade` | POST | Admin | Trigger agent upgrade | +| `/api/servers/:id/upgrade-status` | GET | Admin | Get current upgrade status | +| `/api/ws/servers` | WS | Session | Real-time upgrade progress | + +--- + +## Related Files + +- `crates/server/src/service/upgrade.rs` -- Server upgrade service +- `crates/agent/src/upgrade.rs` -- Agent upgrade handler +- `apps/web/src/components/server/upgrade-panel.tsx` -- UI component From 9eeff9238666ab5d86ca3967385936e29afb6f45 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Tue, 14 Apr 2026 20:31:44 +0800 Subject: [PATCH 11/17] refactor(server): remove incomplete upgrade state wiring --- crates/server/src/config.rs | 5 +- crates/server/src/main.rs | 2 - crates/server/src/openapi.rs | 2 - crates/server/src/router/api/agent.rs | 22 +------- crates/server/src/router/api/mod.rs | 1 - crates/server/src/router/api/server.rs | 72 
++++++++++++++++---------- crates/server/src/router/ws/agent.rs | 42 --------------- crates/server/src/router/ws/browser.rs | 6 +-- crates/server/src/service/mod.rs | 2 - crates/server/src/state.rs | 11 +--- crates/server/src/task/mod.rs | 1 - 11 files changed, 49 insertions(+), 117 deletions(-) diff --git a/crates/server/src/config.rs b/crates/server/src/config.rs index 500288fe..21f57d90 100644 --- a/crates/server/src/config.rs +++ b/crates/server/src/config.rs @@ -1,6 +1,6 @@ use figment::{ - providers::{Env, Format, Toml}, Figment, + providers::{Env, Format, Toml}, }; use ipnet::IpNet; use serde::Deserialize; @@ -275,8 +275,6 @@ impl Default for SchedulerConfig { pub struct UpgradeConfig { #[serde(default = "default_release_base_url")] pub release_base_url: String, - #[serde(default)] - pub latest_version_url: String, } fn default_release_base_url() -> String { @@ -287,7 +285,6 @@ impl Default for UpgradeConfig { fn default() -> Self { Self { release_base_url: default_release_base_url(), - latest_version_url: String::new(), } } } diff --git a/crates/server/src/main.rs b/crates/server/src/main.rs index 83409014..ef880eaf 100644 --- a/crates/server/src/main.rs +++ b/crates/server/src/main.rs @@ -88,8 +88,6 @@ async fn main() -> anyhow::Result<()> { tokio::spawn(async move { task::task_scheduler::run(s).await }); let s = state.clone(); tokio::spawn(async move { task::service_monitor_checker::run(s).await }); - let s = state.clone(); - tokio::spawn(async move { task::upgrade_timeout::run(s).await }); // Build router let app = create_router(state); diff --git a/crates/server/src/openapi.rs b/crates/server/src/openapi.rs index feb1cd57..fb529c2c 100644 --- a/crates/server/src/openapi.rs +++ b/crates/server/src/openapi.rs @@ -59,7 +59,6 @@ impl Modify for SecurityAddon { crate::router::api::status::public_status, // agent crate::router::api::agent::register, - crate::router::api::agent::latest_version, // servers crate::router::api::server::list_servers, 
crate::router::api::server::get_server, @@ -222,7 +221,6 @@ impl Modify for SecurityAddon { crate::router::api::oauth::OAuthProvidersResponse, // agent crate::router::api::agent::RegisterResponse, - crate::service::upgrade_release::LatestAgentVersionResponse, // servers crate::router::api::server::ServerResponse, crate::router::api::server::BatchDeleteRequest, diff --git a/crates/server/src/router/api/agent.rs b/crates/server/src/router/api/agent.rs index 1682a241..3f143c6e 100644 --- a/crates/server/src/router/api/agent.rs +++ b/crates/server/src/router/api/agent.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use axum::extract::{ConnectInfo, State}; use axum::http::HeaderMap; -use axum::routing::{get, post}; +use axum::routing::post; use axum::{Json, Router}; use chrono::Utc; use sea_orm::{ @@ -19,7 +19,6 @@ use crate::router::utils::extract_client_ip; use crate::service::auth::AuthService; use crate::service::config::ConfigService; use crate::service::network_probe::NetworkProbeService; -use crate::service::upgrade_release::LatestAgentVersionResponse; use crate::state::AppState; const CONFIG_KEY_AUTO_DISCOVERY: &str = "auto_discovery_key"; @@ -42,25 +41,6 @@ pub fn public_router() -> Router> { Router::new().route("/agent/register", post(register)) } -pub fn read_router() -> Router> { - Router::new().route("/agent/latest-version", get(latest_version)) -} - -#[utoipa::path( - get, - path = "/api/agent/latest-version", - tag = "agent", - responses( - (status = 200, description = "Latest agent release metadata", body = LatestAgentVersionResponse), - ), - security(("session_cookie" = []), ("api_key" = []), ("bearer_token" = [])) -)] -pub async fn latest_version( - State(state): State>, -) -> Result>, AppError> { - ok(state.upgrade_release_service.latest().await) -} - #[utoipa::path( post, path = "/api/agent/register", diff --git a/crates/server/src/router/api/mod.rs b/crates/server/src/router/api/mod.rs index 0b3f3edb..dd7d6ca5 100644 --- a/crates/server/src/router/api/mod.rs 
+++ b/crates/server/src/router/api/mod.rs @@ -47,7 +47,6 @@ pub fn router(state: Arc) -> Router> { .merge(auth::protected_router()) .merge(mobile::protected_router()) // Read-only routes accessible to all authenticated users - .merge(agent::read_router()) .merge(server::read_router()) .merge(server_group::read_router()) .merge(ping::read_router()) diff --git a/crates/server/src/router/api/server.rs b/crates/server/src/router/api/server.rs index 06fe189f..a5723116 100644 --- a/crates/server/src/router/api/server.rs +++ b/crates/server/src/router/api/server.rs @@ -26,7 +26,6 @@ use crate::service::network_probe::NetworkProbeService; use crate::service::ping::PingService; use crate::service::record::{QueryHistoryResult, RecordService}; use crate::service::server::{ServerService, UpdateServerInput}; -use crate::service::upgrade_tracker::{StartUpgradeJobError, UpgradeLookup}; use crate::state::AppState; use serverbee_common::constants::effective_capabilities; use serverbee_common::protocol::{BrowserMessage, ServerMessage}; @@ -568,41 +567,60 @@ async fn trigger_upgrade( format!("serverbee-agent-{os}-{arch}") }; - let asset = state - .upgrade_release_service - .resolve_asset(&version, &asset_name) - .await?; + let base_url = &state.config.upgrade.release_base_url; + let download_url = format!("{base_url}/download/v{version}/{asset_name}"); + + // Fetch checksums.txt + let checksums_url = format!("{base_url}/download/v{version}/checksums.txt"); + let checksums_response = reqwest::get(&checksums_url) + .await + .map_err(|e| AppError::Internal(format!("Failed to fetch checksums: {e}")))?; + + if !checksums_response.status().is_success() { + return Err(AppError::NotFound(format!( + "Checksums not found for version v{version} (HTTP {})", + checksums_response.status() + ))); + } + + let checksums_body = checksums_response + .text() + .await + .map_err(|e| AppError::Internal(format!("Failed to read checksums: {e}")))?; + + // Parse: each line is " " or " " + let sha256 = 
checksums_body + .lines() + .find_map(|line| { + let mut parts = line.splitn(2, |c: char| c.is_whitespace()); + let hash = parts.next()?; + let name = parts.next()?.trim(); + if name == asset_name { + Some(hash.to_string()) + } else { + None + } + }) + .ok_or_else(|| { + AppError::NotFound(format!( + "Checksum not found for {asset_name} in v{version} release" + )) + })?; let sender = state .agent_manager .get_sender(&id) .ok_or_else(|| AppError::NotFound("Agent not connected".into()))?; - let job = state - .upgrade_tracker - .start_job(&id, version.to_string()) - .map_err(|error| match error { - StartUpgradeJobError::Conflict(existing) => AppError::Conflict(format!( - "Upgrade already running for server {} (job_id={}, target_version={})", - existing.server_id, existing.job_id, existing.target_version - )), - })?; - let msg = ServerMessage::Upgrade { version: version.to_string(), - download_url: asset.download_url, - sha256: asset.sha256, - job_id: Some(job.job_id.clone()), + download_url, + sha256, }; - if let Err(_send_error) = sender.send(msg).await { - state.upgrade_tracker.mark_failed( - UpgradeLookup::from_job(&job), - job.stage, - "Failed to send upgrade command".into(), - None, - ); - return Err(AppError::Internal("Failed to send upgrade command".into())); - } + sender + .send(msg) + .await + .map_err(|_| AppError::Internal("Failed to send upgrade command".into()))?; ok("ok") } diff --git a/crates/server/src/router/ws/agent.rs b/crates/server/src/router/ws/agent.rs index 02765889..11962abc 100644 --- a/crates/server/src/router/ws/agent.rs +++ b/crates/server/src/router/ws/agent.rs @@ -19,7 +19,6 @@ use crate::service::network_probe::NetworkProbeService; use crate::service::ping::PingService; use crate::service::record::RecordService; use crate::service::server::ServerService; -use crate::service::upgrade_tracker::UpgradeLookup; use crate::state::AppState; use serverbee_common::constants::{MAX_WS_MESSAGE_SIZE, effective_capabilities}; use 
serverbee_common::protocol::{AgentMessage, BrowserMessage, ServerMessage}; @@ -506,7 +505,6 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent .broadcast_browser(BrowserMessage::AgentInfoUpdated { server_id: server_id.to_string(), protocol_version: agent_pv, - agent_version: Some(info.agent_version.clone()), }); // Send Ack @@ -555,39 +553,6 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent let _ = tx.send(ServerMessage::Ack { msg_id }).await; } } - AgentMessage::UpgradeProgress { - msg_id, - job_id, - target_version, - stage, - } => { - state - .upgrade_tracker - .update_stage(UpgradeLookup::new(server_id, job_id, target_version), stage); - - if let Some(tx) = state.agent_manager.get_sender(server_id) { - let _ = tx.send(ServerMessage::Ack { msg_id }).await; - } - } - AgentMessage::UpgradeResult { - msg_id, - job_id, - target_version, - stage, - error, - backup_path, - } => { - state.upgrade_tracker.mark_failed( - UpgradeLookup::new(server_id, job_id, target_version), - stage, - error, - backup_path, - ); - - if let Some(tx) = state.agent_manager.get_sender(server_id) { - let _ = tx.send(ServerMessage::Ack { msg_id }).await; - } - } AgentMessage::PingResult(result) => { if let Err(e) = save_ping_result(&state.db, server_id, &result).await { tracing::error!("Failed to save ping result for {server_id}: {e}"); @@ -660,13 +625,6 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent } } } - if capability == "upgrade" - && let Some(job) = state.upgrade_tracker.get(server_id) - { - state - .upgrade_tracker - .mark_failed_by_capability_denied(UpgradeLookup::from_job(&job), reason); - } // For terminal: unregister session so browser gets notified if let Some(sid) = &session_id { state.agent_manager.unregister_terminal_session(sid); diff --git a/crates/server/src/router/ws/browser.rs b/crates/server/src/router/ws/browser.rs index 270fbba7..bca1e365 100644 --- a/crates/server/src/router/ws/browser.rs +++ 
b/crates/server/src/router/ws/browser.rs @@ -255,7 +255,6 @@ async fn build_full_sync(state: &Arc) -> BrowserMessage { tracing::error!("Failed to list servers for FullSync: {e}"); return BrowserMessage::FullSync { servers: Vec::new(), - upgrades: state.upgrade_tracker.snapshot(), }; } }; @@ -352,10 +351,7 @@ async fn build_full_sync(state: &Arc) -> BrowserMessage { }) .collect(); - BrowserMessage::FullSync { - servers: statuses, - upgrades: state.upgrade_tracker.snapshot(), - } + BrowserMessage::FullSync { servers: statuses } } async fn send_browser_message( diff --git a/crates/server/src/service/mod.rs b/crates/server/src/service/mod.rs index 6d50b9ee..6128cb3d 100644 --- a/crates/server/src/service/mod.rs +++ b/crates/server/src/service/mod.rs @@ -24,7 +24,5 @@ pub mod service_monitor; pub mod status_page; pub mod task_scheduler; pub mod traffic; -pub mod upgrade_tracker; -pub mod upgrade_release; pub mod uptime; pub mod user; diff --git a/crates/server/src/state.rs b/crates/server/src/state.rs index 27676b46..c6327773 100644 --- a/crates/server/src/state.rs +++ b/crates/server/src/state.rs @@ -17,8 +17,6 @@ use crate::service::high_risk_audit::{ DockerLogsAuditContext, ExecAuditContext, TerminalAuditContext, }; use crate::service::task_scheduler::TaskScheduler; -use crate::service::upgrade_release::UpgradeReleaseService; -use crate::service::upgrade_tracker::UpgradeJobTracker; /// Pending TOTP setup data, keyed by user_id. pub struct PendingTotp { @@ -61,10 +59,6 @@ pub struct AppState { pub task_scheduler: Arc, /// Shared alert state manager for dedup across poll-based and event-driven evaluation. pub alert_state_manager: AlertStateManager, - /// Tracks in-flight and recent agent upgrade jobs. - pub upgrade_tracker: UpgradeJobTracker, - /// Resolves latest agent release metadata and assets. - pub upgrade_release_service: UpgradeReleaseService, /// Pending mobile pairing codes for QR login, keyed by code. 
pub pending_pairs: DashMap, /// Terminal session audit contexts keyed by session_id. @@ -152,7 +146,6 @@ impl AppState { std::env::temp_dir().join("serverbee-transfers"), )); let task_scheduler = Arc::new(TaskScheduler::new(&config.scheduler.timezone).await?); - let upgrade_release_service = UpgradeReleaseService::new(&config.upgrade); let alert_state_manager = match AlertStateManager::load_from_db(&db).await { Ok(sm) => sm, Err(e) => { @@ -167,7 +160,7 @@ impl AppState { Ok(Arc::new(Self { db, agent_manager, - browser_tx: browser_tx.clone(), + browser_tx, config, geoip: Arc::new(std::sync::RwLock::new(geoip)), geoip_downloading: AtomicBool::new(false), @@ -179,8 +172,6 @@ impl AppState { docker_viewers: DockerViewerTracker::new(), task_scheduler, alert_state_manager, - upgrade_tracker: UpgradeJobTracker::new(browser_tx.clone()), - upgrade_release_service, pending_pairs: DashMap::new(), terminal_audit_contexts: DashMap::new(), docker_logs_audit_contexts: DashMap::new(), diff --git a/crates/server/src/task/mod.rs b/crates/server/src/task/mod.rs index c0ada32d..b3a852ff 100644 --- a/crates/server/src/task/mod.rs +++ b/crates/server/src/task/mod.rs @@ -6,4 +6,3 @@ pub mod record_writer; pub mod service_monitor_checker; pub mod session_cleaner; pub mod task_scheduler; -pub mod upgrade_timeout; From daa01f81d5df02d563d81644eeda40e06a64db16 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Tue, 14 Apr 2026 20:34:06 +0800 Subject: [PATCH 12/17] refactor(agent): simplify self-upgrade execution --- crates/agent/src/reporter.rs | 357 ++++------------------------------- 1 file changed, 41 insertions(+), 316 deletions(-) diff --git a/crates/agent/src/reporter.rs b/crates/agent/src/reporter.rs index ce269dff..ccde6418 100644 --- a/crates/agent/src/reporter.rs +++ b/crates/agent/src/reporter.rs @@ -1,15 +1,14 @@ use std::net::IpAddr; use std::sync::Arc; -use std::sync::atomic::{AtomicBool, AtomicU32, Ordering}; +use std::sync::atomic::{AtomicU32, 
Ordering}; use std::time::Duration; -use chrono::{NaiveDateTime, Utc}; use futures_util::{SinkExt, StreamExt}; use rand::Rng; use serverbee_common::constants::{ CapabilityDeniedReason, DEFAULT_COMMAND_TIMEOUT_SECS, MAX_TASK_OUTPUT_SIZE, has_capability, }; -use serverbee_common::protocol::{AgentMessage, ServerMessage, UpgradeStage}; +use serverbee_common::protocol::{AgentMessage, ServerMessage}; use serverbee_common::types::{NetworkInterface, NetworkProbeResultData, TracerouteHop}; use sysinfo::Networks; use tokio::sync::mpsc; @@ -30,31 +29,6 @@ const MAX_BACKOFF_SECS: u64 = 30; const JITTER_FACTOR: f64 = 0.2; const MAX_REREGISTER_ATTEMPTS: u32 = 3; const DOCKER_RETRY_SECS: u64 = 30; -const UPGRADE_DOWNLOAD_TIMEOUT_SECS: u64 = 600; -const UPGRADE_BACKUP_RETENTION_HOURS: i64 = 24; - -static UPGRADE_IN_PROGRESS: AtomicBool = AtomicBool::new(false); - -struct UpgradeFailure { - stage: UpgradeStage, - error: anyhow::Error, - backup_path: Option, -} - -impl UpgradeFailure { - fn new(stage: UpgradeStage, error: impl Into) -> Self { - Self { - stage, - error: error.into(), - backup_path: None, - } - } - - fn with_backup_path(mut self, backup_path: &std::path::Path) -> Self { - self.backup_path = Some(backup_path.display().to_string()); - self - } -} pub struct Reporter { config: AgentConfig, @@ -598,7 +572,6 @@ impl Reporter { version, download_url, sha256, - job_id, } => { let caps = capabilities.load(Ordering::SeqCst); if !has_capability(caps, CAP_UPGRADE) { @@ -618,51 +591,10 @@ impl Reporter { write.send(Message::Text(json.into())).await?; return Ok(()); } - - if UPGRADE_IN_PROGRESS - .compare_exchange(false, true, Ordering::SeqCst, Ordering::SeqCst) - .is_err() - { - tracing::warn!("Upgrade rejected: another upgrade is already running"); - emit_upgrade_failure( - cmd_result_tx, - job_id, - version, - UpgradeStage::Downloading, - "another upgrade is already running".to_string(), - None, - ) - .await; - return Ok(()); - } - tracing::info!("Upgrade requested: v{version} 
from {download_url}"); - let tx = cmd_result_tx.clone(); tokio::spawn(async move { - if let Err(failure) = perform_upgrade( - &version, - &download_url, - &sha256, - job_id.clone(), - tx.clone(), - ) - .await - { - tracing::error!( - "Upgrade to v{version} failed during {:?}: {}", - failure.stage, - failure.error - ); - emit_upgrade_failure( - &tx, - job_id, - version, - failure.stage, - failure.error.to_string(), - failure.backup_path, - ) - .await; - UPGRADE_IN_PROGRESS.store(false, Ordering::SeqCst); + if let Err(e) = perform_upgrade(&version, &download_url, &sha256).await { + tracing::error!("Upgrade to v{version} failed: {e}"); } }); } @@ -1596,11 +1528,9 @@ async fn derive_primary_ips( mod tests { use super::*; use crate::config::{CollectorConfig, FileConfig, IpChangeConfig, LogConfig}; - use chrono::{Duration as ChronoDuration, Utc}; use serverbee_common::constants::{ CAP_DEFAULT, CAP_EXEC, CAP_FILE, CAP_PING_ICMP, CapabilityDeniedReason, }; - use tempfile::tempdir; use tokio_tungstenite::tungstenite::http::Response; #[test] @@ -1806,55 +1736,6 @@ HOST: agent Loss% Snt Last Avg Best Wrst StDev let hops = parse_traceroute_output(""); assert!(hops.is_empty()); } - - #[test] - fn verify_sha256_rejects_mismatched_hash() { - let err = verify_sha256(b"serverbee", "deadbeef").expect_err("hash should mismatch"); - - assert!(err.to_string().contains("Checksum mismatch")); - } - - #[cfg(unix)] - #[test] - fn run_preflight_rejects_non_zero_exit() { - use std::os::unix::fs::PermissionsExt; - - let temp = tempdir().unwrap(); - let binary_path = temp.path().join("fake-agent"); - std::fs::write(&binary_path, "#!/bin/sh\nexit 23\n").unwrap(); - std::fs::set_permissions(&binary_path, std::fs::Permissions::from_mode(0o755)).unwrap(); - - let err = run_preflight(&binary_path).expect_err("preflight should fail"); - - assert!(err.to_string().contains("Preflight check failed")); - } - - #[test] - fn cleanup_old_backups_removes_only_stale_backup_files() { - let temp = 
tempdir().unwrap(); - let exe_path = temp.path().join("serverbee-agent"); - std::fs::write(&exe_path, b"current").unwrap(); - - let stale = exe_path.with_extension(format!( - "bak.{}", - (Utc::now() - ChronoDuration::hours(25)).format("%Y%m%d-%H%M%S") - )); - let fresh = exe_path.with_extension(format!( - "bak.{}", - (Utc::now() - ChronoDuration::hours(1)).format("%Y%m%d-%H%M%S") - )); - let unrelated = temp.path().join("other-agent.bak.20200101-000000"); - - std::fs::write(&stale, b"stale").unwrap(); - std::fs::write(&fresh, b"fresh").unwrap(); - std::fs::write(&unrelated, b"other").unwrap(); - - cleanup_old_backups(&exe_path).unwrap(); - - assert!(!stale.exists()); - assert!(fresh.exists()); - assert!(unrelated.exists()); - } } /// Fetch external IP address from a remote service. @@ -1891,68 +1772,23 @@ async fn fetch_external_ip(url: &str) -> anyhow::Result { Ok(ip) } -async fn emit_upgrade_progress( - tx: &mpsc::Sender, - job_id: Option, - version: &str, - stage: UpgradeStage, -) { - let message = AgentMessage::UpgradeProgress { - msg_id: uuid::Uuid::new_v4().to_string(), - job_id, - target_version: version.to_string(), - stage, - }; - - if tx.send(message).await.is_err() { - tracing::warn!("Failed to emit upgrade progress: channel closed"); - } -} - -async fn emit_upgrade_failure( - tx: &mpsc::Sender, - job_id: Option, - version: String, - stage: UpgradeStage, - error: String, - backup_path: Option, -) { - let message = AgentMessage::UpgradeResult { - msg_id: uuid::Uuid::new_v4().to_string(), - job_id, - target_version: version, - stage, - error, - backup_path, - }; - - if tx.send(message).await.is_err() { - tracing::warn!("Failed to emit upgrade failure: channel closed"); - } -} - -fn verify_sha256(bytes: &[u8], expected_sha256: &str) -> anyhow::Result<()> { +/// Download a new agent binary, verify checksum, replace current binary, and restart. 
+async fn perform_upgrade(version: &str, download_url: &str, sha256: &str) -> anyhow::Result<()> { use sha2::{Digest, Sha256}; + use std::io::Write; - let mut hasher = Sha256::new(); - hasher.update(bytes); - let actual = format!("{:x}", hasher.finalize()); - let expected = expected_sha256.to_ascii_lowercase(); - - if actual != expected { - anyhow::bail!("Checksum mismatch: expected {expected_sha256}, got {actual}"); - } - - Ok(()) -} - -async fn download_upgrade_bytes(download_url: &str) -> anyhow::Result> { + // Validate URL scheme if !download_url.starts_with("https://") { anyhow::bail!("Upgrade URL must use HTTPS, got: {download_url}"); } + let current_exe = std::env::current_exe()?; + let tmp_path = current_exe.with_extension("new"); + let backup_path = current_exe.with_extension("bak"); + + tracing::info!("Downloading agent v{version} from {download_url}..."); let client = reqwest::Client::builder() - .timeout(Duration::from_secs(UPGRADE_DOWNLOAD_TIMEOUT_SECS)) + .timeout(std::time::Duration::from_secs(600)) // 10 minute timeout .build()?; let response = client .get(download_url) @@ -1964,158 +1800,47 @@ async fn download_upgrade_bytes(download_url: &str) -> anyhow::Result> { anyhow::bail!("Download failed with status {}", response.status()); } - Ok(response.bytes().await?.to_vec()) -} + let bytes = response.bytes().await?; + tracing::info!("Downloaded {} bytes", bytes.len()); -fn set_executable_permissions(path: &std::path::Path) -> anyhow::Result<()> { - #[cfg(unix)] - { - use std::os::unix::fs::PermissionsExt; - std::fs::set_permissions(path, std::fs::Permissions::from_mode(0o755))?; + // Mandatory SHA-256 verification + let mut hasher = Sha256::new(); + hasher.update(&bytes); + let actual = format!("{:x}", hasher.finalize()); + if actual != sha256 { + anyhow::bail!("Checksum mismatch: expected {sha256}, got {actual}"); } + tracing::info!("Checksum verified"); - #[cfg(not(unix))] + // Write to temporary file { - let _ = path; - } - - Ok(()) -} - -fn 
run_preflight(path: &std::path::Path) -> anyhow::Result<()> { - let output = std::process::Command::new(path).arg("--version").output()?; - - if !output.status.success() { - let stdout = String::from_utf8_lossy(&output.stdout).trim().to_string(); - let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string(); - let details = if !stderr.is_empty() { - stderr - } else if !stdout.is_empty() { - stdout - } else { - "no output".to_string() - }; - anyhow::bail!( - "Preflight check failed with status {}: {}", - output.status, - details - ); - } - - Ok(()) -} - -fn cleanup_old_backups(current_exe: &std::path::Path) -> anyhow::Result<()> { - let Some(parent) = current_exe.parent() else { - return Ok(()); - }; - let Some(name) = current_exe.file_name().and_then(|value| value.to_str()) else { - return Ok(()); - }; - - let prefix = format!("{name}.bak."); - let cutoff = Utc::now() - chrono::Duration::hours(UPGRADE_BACKUP_RETENTION_HOURS); - - for entry in std::fs::read_dir(parent)? { - let entry = entry?; - let path = entry.path(); - if !path.is_file() { - continue; - } - - let Some(file_name) = path.file_name().and_then(|value| value.to_str()) else { - continue; - }; - let Some(timestamp) = file_name.strip_prefix(&prefix) else { - continue; - }; - let Ok(parsed) = NaiveDateTime::parse_from_str(timestamp, "%Y%m%d-%H%M%S") else { - continue; - }; - - let backup_time = chrono::DateTime::::from_naive_utc_and_offset(parsed, Utc); - if backup_time < cutoff { - std::fs::remove_file(path)?; - } + let mut file = std::fs::File::create(&tmp_path)?; + file.write_all(&bytes)?; + file.sync_all()?; } - Ok(()) -} - -/// Download a new agent binary, verify checksum, replace current binary, and restart. 
-async fn perform_upgrade( - version: &str, - download_url: &str, - sha256: &str, - job_id: Option, - tx: mpsc::Sender, -) -> Result<(), UpgradeFailure> { - use std::io::Write; - - let current_exe = std::env::current_exe() - .map_err(|error| UpgradeFailure::new(UpgradeStage::Installing, error))?; - let tmp_path = current_exe.with_extension("new"); - let backup_path = - current_exe.with_extension(format!("bak.{}", Utc::now().format("%Y%m%d-%H%M%S"))); - - emit_upgrade_progress(&tx, job_id.clone(), version, UpgradeStage::Downloading).await; - tracing::info!("Downloading agent v{version} from {download_url}..."); - let bytes = download_upgrade_bytes(download_url) - .await - .map_err(|error| UpgradeFailure::new(UpgradeStage::Downloading, error))?; - - emit_upgrade_progress(&tx, job_id.clone(), version, UpgradeStage::Verifying).await; - verify_sha256(&bytes, sha256) - .map_err(|error| UpgradeFailure::new(UpgradeStage::Verifying, error))?; - + // Set executable permission on Unix + #[cfg(unix)] { - let mut file = std::fs::File::create(&tmp_path) - .map_err(|error| UpgradeFailure::new(UpgradeStage::Installing, error))?; - file.write_all(&bytes) - .map_err(|error| UpgradeFailure::new(UpgradeStage::Installing, error))?; - file.sync_all() - .map_err(|error| UpgradeFailure::new(UpgradeStage::Installing, error))?; - } - set_executable_permissions(&tmp_path) - .map_err(|error| UpgradeFailure::new(UpgradeStage::Installing, error))?; - - emit_upgrade_progress(&tx, job_id.clone(), version, UpgradeStage::PreFlight).await; - run_preflight(&tmp_path) - .map_err(|error| UpgradeFailure::new(UpgradeStage::PreFlight, error))?; - - emit_upgrade_progress(&tx, job_id.clone(), version, UpgradeStage::Installing).await; - std::fs::rename(¤t_exe, &backup_path) - .map_err(|error| UpgradeFailure::new(UpgradeStage::Installing, error))?; - - if let Err(error) = std::fs::rename(&tmp_path, ¤t_exe) { - let rollback_result = std::fs::rename(&backup_path, ¤t_exe); - let rollback_error = 
rollback_result.err(); - let install_error = if let Some(rollback_error) = rollback_error { - anyhow::anyhow!( - "Failed to install new binary: {error}; rollback also failed: {rollback_error}" - ) - } else { - anyhow::anyhow!("Failed to install new binary: {error}; restored backup") - }; - return Err(UpgradeFailure::new(UpgradeStage::Installing, install_error) - .with_backup_path(&backup_path)); + use std::os::unix::fs::PermissionsExt; + std::fs::set_permissions(&tmp_path, std::fs::Permissions::from_mode(0o755))?; } - cleanup_old_backups(¤t_exe).map_err(|error| { - UpgradeFailure::new(UpgradeStage::Installing, error).with_backup_path(&backup_path) - })?; + // Backup current binary and replace + if backup_path.exists() { + std::fs::remove_file(&backup_path)?; + } + std::fs::rename(¤t_exe, &backup_path)?; + std::fs::rename(&tmp_path, ¤t_exe)?; - emit_upgrade_progress(&tx, job_id, version, UpgradeStage::Restarting).await; tracing::info!("Agent binary replaced. Restarting..."); + let args: Vec = std::env::args().collect(); let mut cmd = std::process::Command::new(¤t_exe); - let args: Vec<_> = std::env::args_os().skip(1).collect(); - if !args.is_empty() { - cmd.args(args); + if args.len() > 1 { + cmd.args(&args[1..]); } - cmd.spawn().map_err(|error| { - UpgradeFailure::new(UpgradeStage::Restarting, error).with_backup_path(&backup_path) - })?; + cmd.spawn()?; std::process::exit(0); } From 5ebafe84f2670f3f4848f21483752e57cee4cf63 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Tue, 14 Apr 2026 20:38:27 +0800 Subject: [PATCH 13/17] refactor(web): remove upgrade job hydration --- apps/web/src/hooks/use-servers-ws.test.ts | 9 --- apps/web/src/hooks/use-servers-ws.ts | 83 ++--------------------- apps/web/src/lib/api-schema.ts | 11 --- 3 files changed, 5 insertions(+), 98 deletions(-) diff --git a/apps/web/src/hooks/use-servers-ws.test.ts b/apps/web/src/hooks/use-servers-ws.test.ts index c406ef44..87aab80c 100644 --- 
a/apps/web/src/hooks/use-servers-ws.test.ts +++ b/apps/web/src/hooks/use-servers-ws.test.ts @@ -106,12 +106,3 @@ describe('setServerCapabilities', () => { expect(result[0].effective_capabilities).toBe(0) }) }) - -describe('setServerAgentVersion', () => { - it('updates agent_version field', () => { - const prev = [makeServer({ id: 's1', agent_version: undefined })] - const result = prev.map((s) => (s.id === 's1' ? { ...s, agent_version: '1.2.3' } : s)) - - expect(result[0].agent_version).toBe('1.2.3') - }) -}) diff --git a/apps/web/src/hooks/use-servers-ws.ts b/apps/web/src/hooks/use-servers-ws.ts index 3b77f908..0c29158e 100644 --- a/apps/web/src/hooks/use-servers-ws.ts +++ b/apps/web/src/hooks/use-servers-ws.ts @@ -7,7 +7,6 @@ import type { DockerContainerStats, DockerEventInfo } from '@/routes/_authed/servers/$serverId/docker/types' -import { type UpgradeJob, useUpgradeJobsStore } from '@/stores/upgrade-jobs-store' const MAX_DOCKER_EVENTS = 100 @@ -49,7 +48,7 @@ interface ServerMetrics { } type WsMessage = - | { type: 'full_sync'; servers: ServerMetrics[]; upgrades?: UpgradeJob[] } + | { type: 'full_sync'; servers: ServerMetrics[] } | { type: 'update'; servers: ServerMetrics[] } | { type: 'server_online'; server_id: string } | { type: 'server_offline'; server_id: string } @@ -60,7 +59,7 @@ type WsMessage = agent_local_capabilities?: number | null effective_capabilities?: number | null } - | { type: 'agent_info_updated'; server_id: string; protocol_version: number; agent_version?: string | null } + | { type: 'agent_info_updated'; server_id: string; protocol_version: number } | { type: 'network_probe_update'; server_id: string; results: NetworkProbeResultData[] } | { type: 'docker_update' @@ -70,17 +69,6 @@ type WsMessage = } | { type: 'docker_event'; server_id: string; event: DockerEventInfo } | { type: 'docker_availability_changed'; server_id: string; available: boolean } - | { type: 'upgrade_progress'; server_id: string; job_id: string; target_version: string; 
stage: string } - | { - type: 'upgrade_result' - server_id: string - job_id: string - target_version: string - status: string - stage?: string - error?: string | null - backup_path?: string | null - } export type { ServerMetrics } @@ -193,9 +181,6 @@ function handleServerMetricsMessage(raw: { type: string } & Record(['servers'], msg.servers) - if (Array.isArray(raw.upgrades)) { - useUpgradeJobsStore.getState().setJobs(raw.upgrades as UpgradeJob[]) - } } else { queryClient.setQueryData(['servers'], (prev) => prev ? mergeServerUpdate(prev, msg.servers) : msg.servers @@ -256,12 +241,12 @@ function handleCapabilityMessage(raw: { type: string } & Record return } const msg = raw as WsMessage & { type: 'agent_info_updated' } - const { server_id, protocol_version, agent_version } = msg + const { server_id, protocol_version } = msg queryClient.setQueryData(['servers', server_id], (prev: Record | undefined) => - prev ? { ...prev, protocol_version, agent_version: agent_version ?? null } : prev + prev ? { ...prev, protocol_version } : prev ) queryClient.setQueryData[]>(['servers-list'], (prev) => - prev?.map((s) => (s.id === server_id ? { ...s, protocol_version, agent_version: agent_version ?? null } : s)) + prev?.map((s) => (s.id === server_id ? 
{ ...s, protocol_version } : s)) ) } } @@ -348,64 +333,6 @@ function handleWsMessage(raw: unknown, queryClient: QueryClient): void { case 'docker_availability_changed': handleDockerMessage(raw, queryClient) break - case 'upgrade_progress': { - if ( - typeof raw.server_id !== 'string' || - typeof raw.job_id !== 'string' || - typeof raw.target_version !== 'string' || - typeof raw.stage !== 'string' - ) { - break - } - const { server_id, target_version, stage } = raw as { - server_id: string - job_id: string - target_version: string - stage: string - } - const existingJob = useUpgradeJobsStore.getState().getJob(server_id) - if (existingJob) { - useUpgradeJobsStore.getState().setJob(server_id, { - ...existingJob, - stage: stage as UpgradeJob['stage'], - target_version - }) - } - break - } - case 'upgrade_result': { - if ( - typeof raw.server_id !== 'string' || - typeof raw.job_id !== 'string' || - typeof raw.target_version !== 'string' || - typeof raw.status !== 'string' - ) { - break - } - const { server_id, job_id, target_version, status, stage, error, backup_path } = raw as { - server_id: string - job_id: string - target_version: string - status: string - stage?: string - error?: string | null - backup_path?: string | null - } - const existingJob = useUpgradeJobsStore.getState().getJob(server_id) - const now = new Date().toISOString() - useUpgradeJobsStore.getState().setJob(server_id, { - server_id, - job_id, - target_version, - stage: (stage as UpgradeJob['stage']) ?? existingJob?.stage ?? 'downloading', - status: status as UpgradeJob['status'], - error: error ?? null, - backup_path: backup_path ?? null, - started_at: existingJob?.started_at ?? 
now, - finished_at: now - }) - break - } default: break } diff --git a/apps/web/src/lib/api-schema.ts b/apps/web/src/lib/api-schema.ts index 0e9b34a6..bee9b969 100644 --- a/apps/web/src/lib/api-schema.ts +++ b/apps/web/src/lib/api-schema.ts @@ -101,17 +101,6 @@ export type RegisterResponse = S['RegisterResponse'] export type UpgradeRequest = S['UpgradeRequest'] export type AutoDiscoveryKeyResponse = S['AutoDiscoveryKeyResponse'] -// Upgrade jobs -export type UpgradeJobDto = S['UpgradeJobDto'] -export type UpgradeStage = S['UpgradeStage'] -export type UpgradeStatus = S['UpgradeStatus'] - -export interface LatestAgentVersionResponse { - download_url: string - sha256: string - version: string -} - // Traffic (manually typed until OpenAPI types are regenerated) export interface TrafficResponse { bytes_in: number From 3e7c1d516d81757e56a8525d2f7216c8dad703e9 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Tue, 14 Apr 2026 20:42:13 +0800 Subject: [PATCH 14/17] refactor(common): remove upgrade lifecycle protocol --- crates/common/src/protocol.rs | 200 ---------------------------------- 1 file changed, 200 deletions(-) diff --git a/crates/common/src/protocol.rs b/crates/common/src/protocol.rs index edd71eb9..f9fe4203 100644 --- a/crates/common/src/protocol.rs +++ b/crates/common/src/protocol.rs @@ -1,4 +1,3 @@ -use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use crate::constants::CapabilityDeniedReason; @@ -8,44 +7,6 @@ use crate::types::{ PingTaskConfig, SystemInfo, SystemReport, TaskResult, TracerouteHop, }; -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -#[serde(rename_all = "snake_case")] -#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))] -pub enum UpgradeStage { - Downloading, - Verifying, - PreFlight, - Installing, - Restarting, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -#[serde(rename_all = "snake_case")] -#[cfg_attr(feature = "utoipa", 
derive(utoipa::ToSchema))] -pub enum UpgradeStatus { - Running, - Succeeded, - Failed, - Timeout, -} - -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))] -pub struct UpgradeJobDto { - pub server_id: String, - pub job_id: String, - pub target_version: String, - pub stage: UpgradeStage, - pub status: UpgradeStatus, - #[serde(default)] - pub error: Option, - #[serde(default)] - pub backup_path: Option, - pub started_at: DateTime, - #[serde(default)] - pub finished_at: Option>, -} - /// Agent -> Server messages #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "type", rename_all = "snake_case")] @@ -184,23 +145,6 @@ pub enum AgentMessage { completed: bool, error: Option, }, - UpgradeProgress { - msg_id: String, - #[serde(default)] - job_id: Option, - target_version: String, - stage: UpgradeStage, - }, - UpgradeResult { - msg_id: String, - #[serde(default)] - job_id: Option, - target_version: String, - stage: UpgradeStage, - error: String, - #[serde(default)] - backup_path: Option, - }, Pong, } @@ -344,8 +288,6 @@ pub enum ServerMessage { version: String, download_url: String, sha256: String, - #[serde(default)] - job_id: Option, }, CapabilitiesSync { capabilities: u32, @@ -358,8 +300,6 @@ pub enum ServerMessage { pub enum BrowserMessage { FullSync { servers: Vec, - #[serde(default)] - upgrades: Vec, }, Update { servers: Vec, @@ -379,23 +319,6 @@ pub enum BrowserMessage { AgentInfoUpdated { server_id: String, protocol_version: u32, - #[serde(default)] - agent_version: Option, - }, - UpgradeProgress { - server_id: String, - job_id: String, - target_version: String, - stage: UpgradeStage, - }, - UpgradeResult { - server_id: String, - job_id: String, - target_version: String, - status: UpgradeStatus, - stage: Option, - error: Option, - backup_path: Option, }, NetworkProbeUpdate { server_id: String, @@ -439,24 +362,6 @@ mod tests { use super::*; use crate::types::DiskIo; - #[test] - fn 
test_agent_info_updated_accepts_optional_agent_version() { - let json = r#"{"type":"agent_info_updated","server_id":"s1","protocol_version":3,"agent_version":"1.2.3"}"#; - - match serde_json::from_str::(json).unwrap() { - BrowserMessage::AgentInfoUpdated { - server_id, - protocol_version, - agent_version, - } => { - assert_eq!(server_id, "s1"); - assert_eq!(protocol_version, 3); - assert_eq!(agent_version.as_deref(), Some("1.2.3")); - } - _ => panic!("Expected AgentInfoUpdated"), - } - } - #[test] fn test_welcome_without_capabilities_deserializes() { let json = @@ -1195,109 +1100,4 @@ mod tests { _ => panic!("Expected ServerIpChanged"), } } - - #[test] - fn test_server_upgrade_with_job_id_round_trip() { - let msg = ServerMessage::Upgrade { - version: "2.0.0".to_string(), - download_url: "https://example.com/serverbee.tar.gz".to_string(), - sha256: "abc123".to_string(), - job_id: Some("job-1".to_string()), - }; - - let json = serde_json::to_string(&msg).unwrap(); - let parsed: ServerMessage = serde_json::from_str(&json).unwrap(); - - match parsed { - ServerMessage::Upgrade { - version, - download_url, - sha256, - job_id, - } => { - assert_eq!(version, "2.0.0"); - assert_eq!(download_url, "https://example.com/serverbee.tar.gz"); - assert_eq!(sha256, "abc123"); - assert_eq!(job_id, Some("job-1".to_string())); - } - _ => panic!("Expected Upgrade"), - } - } - - #[test] - fn test_upgrade_messages_without_job_id_stay_backward_compatible() { - let server_json = r#"{"type":"upgrade","version":"2.0.0","download_url":"https://example.com/serverbee.tar.gz","sha256":"abc123"}"#; - let server_msg: ServerMessage = serde_json::from_str(server_json).unwrap(); - match server_msg { - ServerMessage::Upgrade { - job_id, - version, - download_url, - sha256, - } => { - assert_eq!(job_id, None); - assert_eq!(version, "2.0.0"); - assert_eq!(download_url, "https://example.com/serverbee.tar.gz"); - assert_eq!(sha256, "abc123"); - } - _ => panic!("Expected Upgrade"), - } - - let agent_json = 
r#"{"type":"upgrade_progress","msg_id":"m1","target_version":"2.0.0","stage":"downloading"}"#; - let agent_msg: AgentMessage = serde_json::from_str(agent_json).unwrap(); - match agent_msg { - AgentMessage::UpgradeProgress { - msg_id, - job_id, - target_version, - stage, - } => { - assert_eq!(msg_id, "m1"); - assert_eq!(job_id, None); - assert_eq!(target_version, "2.0.0"); - assert_eq!(stage, UpgradeStage::Downloading); - } - _ => panic!("Expected UpgradeProgress"), - } - } - - #[test] - fn test_browser_full_sync_with_upgrades_round_trip() { - let msg = BrowserMessage::FullSync { - servers: vec![], - upgrades: vec![UpgradeJobDto { - server_id: "server-1".to_string(), - job_id: "job-1".to_string(), - target_version: "2.0.0".to_string(), - stage: UpgradeStage::Installing, - status: UpgradeStatus::Running, - error: None, - backup_path: Some("/backups/server-1.tar.gz".to_string()), - started_at: chrono::Utc::now(), - finished_at: None, - }], - }; - - let json = serde_json::to_string(&msg).unwrap(); - let parsed: BrowserMessage = serde_json::from_str(&json).unwrap(); - - match parsed { - BrowserMessage::FullSync { servers, upgrades } => { - assert!(servers.is_empty()); - assert_eq!(upgrades.len(), 1); - assert_eq!(upgrades[0].server_id, "server-1"); - assert_eq!(upgrades[0].job_id, "job-1"); - assert_eq!(upgrades[0].target_version, "2.0.0"); - assert_eq!(upgrades[0].stage, UpgradeStage::Installing); - assert_eq!(upgrades[0].status, UpgradeStatus::Running); - assert_eq!(upgrades[0].error, None); - assert_eq!( - upgrades[0].backup_path, - Some("/backups/server-1.tar.gz".to_string()) - ); - assert!(upgrades[0].finished_at.is_none()); - } - _ => panic!("Expected FullSync"), - } - } } From b8ba14e85644825f9b6c7e151008b54eea6911e0 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Tue, 14 Apr 2026 20:47:31 +0800 Subject: [PATCH 15/17] test(web): fix frontend test mocks --- .../dashboard/widget-config-dialog.test.tsx | 45 ++++++++++++++++++- 
.../components/server/traffic-card.test.tsx | 7 ++- .../uptime/uptime-timeline.test.tsx | 23 ++++++++++ .../src/routes/_authed/servers/$id.test.tsx | 4 ++ 4 files changed, 76 insertions(+), 3 deletions(-) diff --git a/apps/web/src/components/dashboard/widget-config-dialog.test.tsx b/apps/web/src/components/dashboard/widget-config-dialog.test.tsx index 1d735e33..0a94f200 100644 --- a/apps/web/src/components/dashboard/widget-config-dialog.test.tsx +++ b/apps/web/src/components/dashboard/widget-config-dialog.test.tsx @@ -3,9 +3,35 @@ import type { ReactNode } from 'react' import { describe, expect, it, vi } from 'vitest' import { WidgetConfigDialog } from './widget-config-dialog' +const translations: Record = { + 'dialogs.widgetConfig.configureTitle': 'Configure Widget', + 'dialogs.widgetConfig.editTitle': 'Edit Widget', + 'dialogs.widgetConfig.labels.titleOptional': 'Title (optional)', + 'dialogs.widgetConfig.placeholders.widgetTitle': 'Widget title', + 'dialogs.widgetConfig.messages.noConfigNeeded': 'No additional configuration needed.', + 'widgets.common.labels.server': 'Server', + 'widgets.common.labels.servers': 'Servers', + 'widgets.common.labels.metric': 'Metric', + 'widgets.common.labels.timeRange': 'Time Range', + 'widgets.common.labels.days': 'Days', + 'widgets.common.labels.markdownContent': 'Markdown Content', + 'widgets.common.placeholders.writeMarkdown': 'Write markdown here...', + 'common.metrics.serverCount': 'Server Count', + 'common.metrics.avgCpu': 'Average CPU', + 'common.metrics.avgMemory': 'Average Memory', + 'common.metrics.health': 'Health', + 'common.metrics.cpu': 'CPU', + 'common.metrics.memory': 'Memory', + 'common.timeRange.1hour': '1 hour', + 'common.timeRange.24hours': '24 hours', + 'common.timeRange.30days': '30 days', + 'common.timeRange.60days': '60 days', + 'common.timeRange.90days': '90 days' +} + vi.mock('react-i18next', () => ({ useTranslation: () => ({ - t: (key: string, fallback?: string) => fallback ?? 
key + t: (key: string, fallback?: string) => translations[key] ?? fallback ?? key }) })) @@ -45,7 +71,22 @@ vi.mock('@/components/ui/button', () => ({ })) vi.mock('@/components/ui/checkbox', () => ({ - Checkbox: (props: Record) => + Checkbox: ({ + checked, + onCheckedChange, + ...props + }: { + checked?: boolean + onCheckedChange?: (checked: boolean) => void + } & Record) => ( + onCheckedChange?.(!checked)} + type="checkbox" + {...props} + /> + ) })) vi.mock('@/lib/markdown', () => ({ diff --git a/apps/web/src/components/server/traffic-card.test.tsx b/apps/web/src/components/server/traffic-card.test.tsx index 12e51f9c..602673c0 100644 --- a/apps/web/src/components/server/traffic-card.test.tsx +++ b/apps/web/src/components/server/traffic-card.test.tsx @@ -5,10 +5,15 @@ import { TrafficCard } from './traffic-card' const mockUseTraffic = vi.fn() const TabsContext = createContext<{ setValue: (value: string) => void; value: string } | null>(null) +const translations: Record = { + traffic_title: 'Traffic Statistics', + traffic_tab_today: 'Today', + traffic_tab_cycle: 'Monthly' +} vi.mock('react-i18next', () => ({ useTranslation: () => ({ - t: (_key: string, options?: { defaultValue?: string }) => options?.defaultValue ?? _key + t: (key: string, options?: { defaultValue?: string }) => translations[key] ?? options?.defaultValue ?? 
key }) })) diff --git a/apps/web/src/components/uptime/uptime-timeline.test.tsx b/apps/web/src/components/uptime/uptime-timeline.test.tsx index 62ee6134..f15c3608 100644 --- a/apps/web/src/components/uptime/uptime-timeline.test.tsx +++ b/apps/web/src/components/uptime/uptime-timeline.test.tsx @@ -4,6 +4,29 @@ import type { UptimeDailyEntry } from '@/lib/api-schema' import { computeAggregateUptime } from '@/lib/widget-helpers' import { UptimeTimeline } from './uptime-timeline' +vi.mock('react-i18next', () => ({ + useTranslation: () => ({ + t: (key: string, options?: { count?: number }) => { + switch (key) { + case 'uptime_days_ago': + return `${options?.count ?? 0} days ago` + case 'uptime_today': + return 'Today' + case 'uptime_operational': + return 'Operational' + case 'uptime_degraded': + return 'Degraded' + case 'uptime_down': + return 'Down' + case 'uptime_no_data': + return 'No data' + default: + return key + } + } + }) +})) + function makeEntry(overrides: Partial = {}): UptimeDailyEntry { return { date: '2026-03-20', diff --git a/apps/web/src/routes/_authed/servers/$id.test.tsx b/apps/web/src/routes/_authed/servers/$id.test.tsx index b689319b..451cad21 100644 --- a/apps/web/src/routes/_authed/servers/$id.test.tsx +++ b/apps/web/src/routes/_authed/servers/$id.test.tsx @@ -28,6 +28,10 @@ vi.mock('react-i18next', () => ({ }) })) +vi.mock('@/components/server/agent-version-section', () => ({ + AgentVersionSection: () =>
agent-version
+})) + vi.mock('@/components/server/capabilities-dialog', () => ({ CapabilitiesDialog: () =>
capabilities
})) From a2f4fdba8ee0e81fa1dbb234b04912fa43530a16 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Tue, 14 Apr 2026 20:52:08 +0800 Subject: [PATCH 16/17] fix(upgrade): restore end-to-end upgrade state flow --- apps/web/src/hooks/use-servers-ws.test.ts | 105 ++++++++- apps/web/src/hooks/use-servers-ws.ts | 85 +++++++- .../src/routes/_authed/servers/$id.test.tsx | 12 +- apps/web/src/routes/_authed/servers/$id.tsx | 7 +- .../web/src/stores/upgrade-jobs-store.test.ts | 21 +- apps/web/src/stores/upgrade-jobs-store.ts | 16 +- crates/agent/src/reporter.rs | 138 +++++++++++- crates/common/src/protocol.rs | 200 ++++++++++++++++++ crates/server/src/config.rs | 3 + crates/server/src/main.rs | 2 + crates/server/src/openapi.rs | 2 + crates/server/src/router/api/agent.rs | 22 +- crates/server/src/router/api/mod.rs | 1 + crates/server/src/router/api/server.rs | 72 +++---- crates/server/src/router/ws/agent.rs | 51 +++++ crates/server/src/router/ws/browser.rs | 6 +- crates/server/src/service/mod.rs | 2 + crates/server/src/state.rs | 8 + crates/server/src/task/mod.rs | 1 + 19 files changed, 676 insertions(+), 78 deletions(-) diff --git a/apps/web/src/hooks/use-servers-ws.test.ts b/apps/web/src/hooks/use-servers-ws.test.ts index 87aab80c..f9c2087e 100644 --- a/apps/web/src/hooks/use-servers-ws.test.ts +++ b/apps/web/src/hooks/use-servers-ws.test.ts @@ -1,6 +1,7 @@ import { describe, expect, it } from 'vitest' +import { useUpgradeJobsStore } from '@/stores/upgrade-jobs-store' import type { ServerMetrics } from './use-servers-ws' -import { mergeServerUpdate, setServerCapabilities, setServerOnlineStatus } from './use-servers-ws' +import { handleWsMessage, mergeServerUpdate, setServerCapabilities, setServerOnlineStatus } from './use-servers-ws' function makeServer(overrides: Partial = {}): ServerMetrics { return { @@ -106,3 +107,105 @@ describe('setServerCapabilities', () => { expect(result[0].effective_capabilities).toBe(0) }) }) + +describe('handleWsMessage 
upgrade messages', () => { + function makeQueryClient() { + const cache = new Map() + return { + setQueryData: (key: unknown[], value: unknown | ((prev: unknown) => unknown)) => { + const cacheKey = JSON.stringify(key) + const prev = cache.get(cacheKey) + const next = typeof value === 'function' ? (value as (prev: unknown) => unknown)(prev) : value + cache.set(cacheKey, next) + } + } + } + + it('hydrates upgrade jobs from full_sync', () => { + useUpgradeJobsStore.setState({ jobs: new Map() }) + const queryClient = makeQueryClient() + + handleWsMessage( + { + type: 'full_sync', + servers: [], + upgrades: [ + { + server_id: 'server-1', + job_id: 'job-1', + target_version: '1.2.3', + stage: 'downloading', + status: 'running', + error: null, + backup_path: null, + started_at: '2024-01-01T00:00:00Z', + finished_at: null + } + ] + }, + queryClient as never + ) + + expect(useUpgradeJobsStore.getState().jobs.get('server-1')?.job_id).toBe('job-1') + }) + + it('updates existing upgrade stage from upgrade_progress', () => { + useUpgradeJobsStore.setState({ + jobs: new Map([ + [ + 'server-1', + { + server_id: 'server-1', + job_id: 'job-1', + target_version: '1.2.3', + stage: 'downloading', + status: 'running', + error: null, + backup_path: null, + started_at: '2024-01-01T00:00:00Z', + finished_at: null + } + ] + ]) + }) + const queryClient = makeQueryClient() + + handleWsMessage( + { + type: 'upgrade_progress', + server_id: 'server-1', + job_id: 'job-1', + target_version: '1.2.3', + stage: 'installing' + }, + queryClient as never + ) + + expect(useUpgradeJobsStore.getState().jobs.get('server-1')?.stage).toBe('installing') + }) + + it('stores terminal upgrade result from upgrade_result', () => { + useUpgradeJobsStore.setState({ jobs: new Map() }) + const queryClient = makeQueryClient() + + handleWsMessage( + { + type: 'upgrade_result', + server_id: 'server-1', + job_id: 'job-1', + target_version: '1.2.3', + status: 'failed', + stage: 'installing', + error: 'install failed', + 
backup_path: '/tmp/backup' + }, + queryClient as never + ) + + const job = useUpgradeJobsStore.getState().jobs.get('server-1') + expect(job?.status).toBe('failed') + expect(job?.error).toBe('install failed') + expect(job?.backup_path).toBe('/tmp/backup') + expect(job?.finished_at).not.toBeNull() + }) +}) diff --git a/apps/web/src/hooks/use-servers-ws.ts b/apps/web/src/hooks/use-servers-ws.ts index 0c29158e..a6eb50d1 100644 --- a/apps/web/src/hooks/use-servers-ws.ts +++ b/apps/web/src/hooks/use-servers-ws.ts @@ -7,6 +7,7 @@ import type { DockerContainerStats, DockerEventInfo } from '@/routes/_authed/servers/$serverId/docker/types' +import { type UpgradeJob, useUpgradeJobsStore } from '@/stores/upgrade-jobs-store' const MAX_DOCKER_EVENTS = 100 @@ -48,7 +49,7 @@ interface ServerMetrics { } type WsMessage = - | { type: 'full_sync'; servers: ServerMetrics[] } + | { type: 'full_sync'; servers: ServerMetrics[]; upgrades?: UpgradeJob[] } | { type: 'update'; servers: ServerMetrics[] } | { type: 'server_online'; server_id: string } | { type: 'server_offline'; server_id: string } @@ -59,7 +60,7 @@ type WsMessage = agent_local_capabilities?: number | null effective_capabilities?: number | null } - | { type: 'agent_info_updated'; server_id: string; protocol_version: number } + | { type: 'agent_info_updated'; server_id: string; protocol_version: number; agent_version?: string | null } | { type: 'network_probe_update'; server_id: string; results: NetworkProbeResultData[] } | { type: 'docker_update' @@ -69,6 +70,17 @@ type WsMessage = } | { type: 'docker_event'; server_id: string; event: DockerEventInfo } | { type: 'docker_availability_changed'; server_id: string; available: boolean } + | { type: 'upgrade_progress'; server_id: string; job_id: string; target_version: string; stage: string } + | { + type: 'upgrade_result' + server_id: string + job_id: string + target_version: string + status: string + stage?: string + error?: string | null + backup_path?: string | null + } export 
type { ServerMetrics } @@ -181,6 +193,9 @@ function handleServerMetricsMessage(raw: { type: string } & Record(['servers'], msg.servers) + if (Array.isArray(raw.upgrades)) { + useUpgradeJobsStore.getState().setJobs(raw.upgrades as UpgradeJob[]) + } } else { queryClient.setQueryData(['servers'], (prev) => prev ? mergeServerUpdate(prev, msg.servers) : msg.servers @@ -241,12 +256,12 @@ function handleCapabilityMessage(raw: { type: string } & Record return } const msg = raw as WsMessage & { type: 'agent_info_updated' } - const { server_id, protocol_version } = msg + const { server_id, protocol_version, agent_version } = msg queryClient.setQueryData(['servers', server_id], (prev: Record | undefined) => - prev ? { ...prev, protocol_version } : prev + prev ? { ...prev, protocol_version, agent_version: agent_version ?? null } : prev ) queryClient.setQueryData[]>(['servers-list'], (prev) => - prev?.map((s) => (s.id === server_id ? { ...s, protocol_version } : s)) + prev?.map((s) => (s.id === server_id ? { ...s, protocol_version, agent_version: agent_version ?? 
null } : s)) ) } } @@ -296,7 +311,7 @@ function handleDockerMessage(raw: { type: string } & Record, qu } } -function handleWsMessage(raw: unknown, queryClient: QueryClient): void { +export function handleWsMessage(raw: unknown, queryClient: QueryClient): void { if (!isWsMessageLike(raw)) { console.warn('WS: unexpected message shape', raw) return @@ -333,6 +348,64 @@ function handleWsMessage(raw: unknown, queryClient: QueryClient): void { case 'docker_availability_changed': handleDockerMessage(raw, queryClient) break + case 'upgrade_progress': { + if ( + typeof raw.server_id !== 'string' || + typeof raw.job_id !== 'string' || + typeof raw.target_version !== 'string' || + typeof raw.stage !== 'string' + ) { + break + } + const { server_id, target_version, stage } = raw as { + server_id: string + job_id: string + target_version: string + stage: string + } + const existingJob = useUpgradeJobsStore.getState().getJob(server_id) + if (existingJob) { + useUpgradeJobsStore.getState().setJob(server_id, { + ...existingJob, + stage: stage as UpgradeJob['stage'], + target_version + }) + } + break + } + case 'upgrade_result': { + if ( + typeof raw.server_id !== 'string' || + typeof raw.job_id !== 'string' || + typeof raw.target_version !== 'string' || + typeof raw.status !== 'string' + ) { + break + } + const { server_id, job_id, target_version, status, stage, error, backup_path } = raw as { + server_id: string + job_id: string + target_version: string + status: string + stage?: string + error?: string | null + backup_path?: string | null + } + const existingJob = useUpgradeJobsStore.getState().getJob(server_id) + const now = new Date().toISOString() + useUpgradeJobsStore.getState().setJob(server_id, { + server_id, + job_id, + target_version, + stage: (stage as UpgradeJob['stage']) ?? existingJob?.stage ?? 'downloading', + status: status as UpgradeJob['status'], + error: error ?? null, + backup_path: backup_path ?? null, + started_at: existingJob?.started_at ?? 
now, + finished_at: now + }) + break + } default: break } diff --git a/apps/web/src/routes/_authed/servers/$id.test.tsx b/apps/web/src/routes/_authed/servers/$id.test.tsx index 451cad21..b924cd77 100644 --- a/apps/web/src/routes/_authed/servers/$id.test.tsx +++ b/apps/web/src/routes/_authed/servers/$id.test.tsx @@ -18,6 +18,10 @@ vi.mock('@tanstack/react-query', () => ({ return { data: [] } } + if (queryKey[0] === 'agent' && queryKey[1] === 'latest-version') { + return { data: { version: '1.3.0' } } + } + return { data: [] } } })) @@ -29,7 +33,7 @@ vi.mock('react-i18next', () => ({ })) vi.mock('@/components/server/agent-version-section', () => ({ - AgentVersionSection: () =>
agent-version
+ AgentVersionSection: ({ latestVersion }: { latestVersion?: string | null }) =>
{latestVersion ?? 'no-latest-version'}
})) vi.mock('@/components/server/capabilities-dialog', () => ({ @@ -159,4 +163,10 @@ describe('ServerDetailPage', () => { expect(container.firstElementChild).toHaveClass('pb-6') }) + + it('passes latest agent version into the version section', () => { + const { container } = render() + + expect(container).toHaveTextContent('1.3.0') + }) }) diff --git a/apps/web/src/routes/_authed/servers/$id.tsx b/apps/web/src/routes/_authed/servers/$id.tsx index 4f77765a..4995cf8c 100644 --- a/apps/web/src/routes/_authed/servers/$id.tsx +++ b/apps/web/src/routes/_authed/servers/$id.tsx @@ -350,6 +350,11 @@ export function ServerDetailPage() { const { id } = Route.useParams() const { range: rangeParam } = Route.useSearch() const [editOpen, setEditOpen] = useState(false) + const { data: latestAgentVersion } = useQuery<{ version?: string | null }>({ + queryKey: ['agent', 'latest-version'], + queryFn: () => api.get<{ version?: string | null }>('/api/agent/latest-version'), + staleTime: 60_000 + }) const selectedRange = TIME_RANGES.findIndex((tr) => tr.key === rangeParam) const rangeIndex = selectedRange >= 0 ? selectedRange : 0 @@ -581,7 +586,7 @@ export function ServerDetailPage() { agentVersion={server.agent_version} configuredCapabilities={serverWithCaps.capabilities} effectiveCapabilities={serverWithCaps.effective_capabilities} - latestVersion={server.latest_agent_version} + latestVersion={latestAgentVersion?.version ?? 
null} serverId={id} /> diff --git a/apps/web/src/stores/upgrade-jobs-store.test.ts b/apps/web/src/stores/upgrade-jobs-store.test.ts index 0ae1fb9d..85d69330 100644 --- a/apps/web/src/stores/upgrade-jobs-store.test.ts +++ b/apps/web/src/stores/upgrade-jobs-store.test.ts @@ -60,17 +60,16 @@ describe('useUpgradeJobsStore', () => { expect(storedJob?.target_version).toBe('2.0.0') }) - it('skips update if incoming job_id equals existing job_id', () => { + it('updates existing job when incoming job_id matches', () => { const job1 = makeJob({ job_id: 'job-1', target_version: '1.0.0' }) useUpgradeJobsStore.getState().setJob('server-1', job1) - // Try to update with same job_id but different data - const job2 = makeJob({ job_id: 'job-1', target_version: '2.0.0' }) + const job2 = makeJob({ job_id: 'job-1', target_version: '2.0.0', stage: 'installing' }) useUpgradeJobsStore.getState().setJob('server-1', job2) const storedJob = useUpgradeJobsStore.getState().jobs.get('server-1') - // Should keep the original data (deduplication) - expect(storedJob?.target_version).toBe('1.0.0') + expect(storedJob?.target_version).toBe('2.0.0') + expect(storedJob?.stage).toBe('installing') }) it('stores jobs keyed by server_id', () => { @@ -172,6 +171,18 @@ describe('useUpgradeJobsStore', () => { expect(useUpgradeJobsStore.getState().jobs.has('server-1')).toBe(true) }) + + it('does not let an old finished-job timer clear a newer running job', () => { + const finishedJob = makeJob({ job_id: 'job-1', status: 'succeeded', finished_at: '2024-01-01T00:01:00Z' }) + useUpgradeJobsStore.getState().setJob('server-1', finishedJob) + + const runningJob = makeJob({ job_id: 'job-2', status: 'running', finished_at: null }) + useUpgradeJobsStore.getState().setJob('server-1', runningJob) + + vi.advanceTimersByTime(5000) + + expect(useUpgradeJobsStore.getState().jobs.get('server-1')?.job_id).toBe('job-2') + }) }) describe('getJob', () => { diff --git a/apps/web/src/stores/upgrade-jobs-store.ts 
b/apps/web/src/stores/upgrade-jobs-store.ts index ab6ff868..26ca9196 100644 --- a/apps/web/src/stores/upgrade-jobs-store.ts +++ b/apps/web/src/stores/upgrade-jobs-store.ts @@ -35,18 +35,15 @@ export const useUpgradeJobsStore = create()((set, get) => ({ setJob: (serverId: string, job: UpgradeJob) => { set((state) => { - const existingJob = state.jobs.get(serverId) - - if (existingJob && existingJob.job_id === job.job_id) { - return { jobs: state.jobs } - } - const newJobs = new Map(state.jobs) newJobs.set(serverId, job) if (isFinished(job.status)) { setTimeout(() => { - get().clearJob(serverId) + const currentJob = get().getJob(serverId) + if (currentJob?.job_id === job.job_id) { + get().clearJob(serverId) + } }, AUTO_CLEAR_DELAY) } @@ -69,7 +66,10 @@ export const useUpgradeJobsStore = create()((set, get) => ({ if (isFinished(job.status)) { setTimeout(() => { - get().clearJob(job.server_id) + const currentJob = get().getJob(job.server_id) + if (currentJob?.job_id === job.job_id) { + get().clearJob(job.server_id) + } }, AUTO_CLEAR_DELAY) } } diff --git a/crates/agent/src/reporter.rs b/crates/agent/src/reporter.rs index ccde6418..66dd6626 100644 --- a/crates/agent/src/reporter.rs +++ b/crates/agent/src/reporter.rs @@ -1,6 +1,6 @@ use std::net::IpAddr; use std::sync::Arc; -use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU32, Ordering}; use std::time::Duration; use futures_util::{SinkExt, StreamExt}; @@ -8,7 +8,7 @@ use rand::Rng; use serverbee_common::constants::{ CapabilityDeniedReason, DEFAULT_COMMAND_TIMEOUT_SECS, MAX_TASK_OUTPUT_SIZE, has_capability, }; -use serverbee_common::protocol::{AgentMessage, ServerMessage}; +use serverbee_common::protocol::{AgentMessage, ServerMessage, UpgradeStage}; use serverbee_common::types::{NetworkInterface, NetworkProbeResultData, TracerouteHop}; use sysinfo::Networks; use tokio::sync::mpsc; @@ -29,6 +29,9 @@ const MAX_BACKOFF_SECS: u64 = 30; const JITTER_FACTOR: f64 = 0.2; const 
MAX_REREGISTER_ATTEMPTS: u32 = 3; const DOCKER_RETRY_SECS: u64 = 30; +const UPGRADE_DOWNLOAD_TIMEOUT_SECS: u64 = 600; + +static UPGRADE_IN_PROGRESS: AtomicBool = AtomicBool::new(false); pub struct Reporter { config: AgentConfig, @@ -572,6 +575,7 @@ impl Reporter { version, download_url, sha256, + job_id, } => { let caps = capabilities.load(Ordering::SeqCst); if !has_capability(caps, CAP_UPGRADE) { @@ -591,10 +595,32 @@ impl Reporter { write.send(Message::Text(json.into())).await?; return Ok(()); } + + if UPGRADE_IN_PROGRESS + .compare_exchange(false, true, Ordering::SeqCst, Ordering::SeqCst) + .is_err() + { + let tx = cmd_result_tx.clone(); + tokio::spawn(async move { + emit_upgrade_failure( + &tx, + job_id, + version, + UpgradeStage::Downloading, + "another upgrade is already running".to_string(), + None, + ) + .await; + }); + return Ok(()); + } + tracing::info!("Upgrade requested: v{version} from {download_url}"); + let tx = cmd_result_tx.clone(); tokio::spawn(async move { - if let Err(e) = perform_upgrade(&version, &download_url, &sha256).await { + if let Err(e) = perform_upgrade(&version, &download_url, &sha256, job_id, tx.clone()).await { tracing::error!("Upgrade to v{version} failed: {e}"); + UPGRADE_IN_PROGRESS.store(false, Ordering::SeqCst); } }); } @@ -1772,14 +1798,72 @@ async fn fetch_external_ip(url: &str) -> anyhow::Result { Ok(ip) } +async fn emit_upgrade_progress( + tx: &mpsc::Sender, + job_id: Option, + version: &str, + stage: UpgradeStage, +) { + let message = AgentMessage::UpgradeProgress { + msg_id: uuid::Uuid::new_v4().to_string(), + job_id, + target_version: version.to_string(), + stage, + }; + + if tx.send(message).await.is_err() { + tracing::warn!("Failed to emit upgrade progress: channel closed"); + } +} + +async fn emit_upgrade_failure( + tx: &mpsc::Sender, + job_id: Option, + version: String, + stage: UpgradeStage, + error: String, + backup_path: Option, +) { + let message = AgentMessage::UpgradeResult { + msg_id: 
uuid::Uuid::new_v4().to_string(), + job_id, + target_version: version, + stage, + error, + backup_path, + }; + + if tx.send(message).await.is_err() { + tracing::warn!("Failed to emit upgrade failure: channel closed"); + } +} + /// Download a new agent binary, verify checksum, replace current binary, and restart. -async fn perform_upgrade(version: &str, download_url: &str, sha256: &str) -> anyhow::Result<()> { +async fn perform_upgrade( + version: &str, + download_url: &str, + sha256: &str, + job_id: Option, + tx: mpsc::Sender, +) -> anyhow::Result<()> { use sha2::{Digest, Sha256}; use std::io::Write; + emit_upgrade_progress(&tx, job_id.clone(), version, UpgradeStage::Downloading).await; + // Validate URL scheme if !download_url.starts_with("https://") { - anyhow::bail!("Upgrade URL must use HTTPS, got: {download_url}"); + let error = format!("Upgrade URL must use HTTPS, got: {download_url}"); + emit_upgrade_failure( + &tx, + job_id, + version.to_string(), + UpgradeStage::Downloading, + error.clone(), + None, + ) + .await; + anyhow::bail!(error); } let current_exe = std::env::current_exe()?; @@ -1788,7 +1872,7 @@ async fn perform_upgrade(version: &str, download_url: &str, sha256: &str) -> any tracing::info!("Downloading agent v{version} from {download_url}..."); let client = reqwest::Client::builder() - .timeout(std::time::Duration::from_secs(600)) // 10 minute timeout + .timeout(std::time::Duration::from_secs(UPGRADE_DOWNLOAD_TIMEOUT_SECS)) .build()?; let response = client .get(download_url) @@ -1797,18 +1881,40 @@ async fn perform_upgrade(version: &str, download_url: &str, sha256: &str) -> any .await?; if !response.status().is_success() { - anyhow::bail!("Download failed with status {}", response.status()); + let error = format!("Download failed with status {}", response.status()); + emit_upgrade_failure( + &tx, + job_id, + version.to_string(), + UpgradeStage::Downloading, + error.clone(), + None, + ) + .await; + anyhow::bail!(error); } let bytes = 
response.bytes().await?; tracing::info!("Downloaded {} bytes", bytes.len()); + emit_upgrade_progress(&tx, job_id.clone(), version, UpgradeStage::Verifying).await; + // Mandatory SHA-256 verification let mut hasher = Sha256::new(); hasher.update(&bytes); let actual = format!("{:x}", hasher.finalize()); if actual != sha256 { - anyhow::bail!("Checksum mismatch: expected {sha256}, got {actual}"); + let error = format!("Checksum mismatch: expected {sha256}, got {actual}"); + emit_upgrade_failure( + &tx, + job_id.clone(), + version.to_string(), + UpgradeStage::Verifying, + error.clone(), + None, + ) + .await; + anyhow::bail!(error); } tracing::info!("Checksum verified"); @@ -1826,6 +1932,8 @@ async fn perform_upgrade(version: &str, download_url: &str, sha256: &str) -> any std::fs::set_permissions(&tmp_path, std::fs::Permissions::from_mode(0o755))?; } + emit_upgrade_progress(&tx, job_id.clone(), version, UpgradeStage::Installing).await; + // Backup current binary and replace if backup_path.exists() { std::fs::remove_file(&backup_path)?; @@ -1834,13 +1942,25 @@ async fn perform_upgrade(version: &str, download_url: &str, sha256: &str) -> any std::fs::rename(&tmp_path, ¤t_exe)?; tracing::info!("Agent binary replaced. 
Restarting..."); + emit_upgrade_progress(&tx, job_id, version, UpgradeStage::Restarting).await; let args: Vec = std::env::args().collect(); let mut cmd = std::process::Command::new(¤t_exe); if args.len() > 1 { cmd.args(&args[1..]); } - cmd.spawn()?; + if let Err(error) = cmd.spawn() { + emit_upgrade_failure( + &tx, + None, + version.to_string(), + UpgradeStage::Restarting, + error.to_string(), + Some(backup_path.display().to_string()), + ) + .await; + return Err(error.into()); + } std::process::exit(0); } diff --git a/crates/common/src/protocol.rs b/crates/common/src/protocol.rs index f9fe4203..7b52c706 100644 --- a/crates/common/src/protocol.rs +++ b/crates/common/src/protocol.rs @@ -1,3 +1,4 @@ +use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use crate::constants::CapabilityDeniedReason; @@ -7,6 +8,44 @@ use crate::types::{ PingTaskConfig, SystemInfo, SystemReport, TaskResult, TracerouteHop, }; +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))] +pub enum UpgradeStage { + Downloading, + Verifying, + PreFlight, + Installing, + Restarting, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))] +pub enum UpgradeStatus { + Running, + Succeeded, + Failed, + Timeout, +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))] +pub struct UpgradeJobDto { + pub server_id: String, + pub job_id: String, + pub target_version: String, + pub stage: UpgradeStage, + pub status: UpgradeStatus, + #[serde(default)] + pub error: Option, + #[serde(default)] + pub backup_path: Option, + pub started_at: DateTime, + #[serde(default)] + pub finished_at: Option>, +} + /// Agent -> Server messages #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "type", rename_all = 
"snake_case")] @@ -145,6 +184,23 @@ pub enum AgentMessage { completed: bool, error: Option, }, + UpgradeProgress { + msg_id: String, + #[serde(default)] + job_id: Option, + target_version: String, + stage: UpgradeStage, + }, + UpgradeResult { + msg_id: String, + #[serde(default)] + job_id: Option, + target_version: String, + stage: UpgradeStage, + error: String, + #[serde(default)] + backup_path: Option, + }, Pong, } @@ -288,6 +344,8 @@ pub enum ServerMessage { version: String, download_url: String, sha256: String, + #[serde(default)] + job_id: Option, }, CapabilitiesSync { capabilities: u32, @@ -300,6 +358,8 @@ pub enum ServerMessage { pub enum BrowserMessage { FullSync { servers: Vec, + #[serde(default)] + upgrades: Vec, }, Update { servers: Vec, @@ -319,6 +379,23 @@ pub enum BrowserMessage { AgentInfoUpdated { server_id: String, protocol_version: u32, + #[serde(default)] + agent_version: Option, + }, + UpgradeProgress { + server_id: String, + job_id: String, + target_version: String, + stage: UpgradeStage, + }, + UpgradeResult { + server_id: String, + job_id: String, + target_version: String, + status: UpgradeStatus, + stage: Option, + error: Option, + backup_path: Option, }, NetworkProbeUpdate { server_id: String, @@ -1100,4 +1177,127 @@ mod tests { _ => panic!("Expected ServerIpChanged"), } } + + #[test] + fn test_server_upgrade_with_job_id_round_trip() { + let msg = ServerMessage::Upgrade { + version: "2.0.0".to_string(), + download_url: "https://example.com/serverbee.tar.gz".to_string(), + sha256: "abc123".to_string(), + job_id: Some("job-1".to_string()), + }; + + let json = serde_json::to_string(&msg).unwrap(); + let parsed: ServerMessage = serde_json::from_str(&json).unwrap(); + + match parsed { + ServerMessage::Upgrade { + version, + download_url, + sha256, + job_id, + } => { + assert_eq!(version, "2.0.0"); + assert_eq!(download_url, "https://example.com/serverbee.tar.gz"); + assert_eq!(sha256, "abc123"); + assert_eq!(job_id, Some("job-1".to_string())); + 
} + _ => panic!("Expected Upgrade"), + } + } + + #[test] + fn test_upgrade_messages_without_job_id_stay_backward_compatible() { + let server_json = r#"{"type":"upgrade","version":"2.0.0","download_url":"https://example.com/serverbee.tar.gz","sha256":"abc123"}"#; + let server_msg: ServerMessage = serde_json::from_str(server_json).unwrap(); + match server_msg { + ServerMessage::Upgrade { + job_id, + version, + download_url, + sha256, + } => { + assert_eq!(job_id, None); + assert_eq!(version, "2.0.0"); + assert_eq!(download_url, "https://example.com/serverbee.tar.gz"); + assert_eq!(sha256, "abc123"); + } + _ => panic!("Expected Upgrade"), + } + + let agent_json = r#"{"type":"upgrade_progress","msg_id":"m1","target_version":"2.0.0","stage":"downloading"}"#; + let agent_msg: AgentMessage = serde_json::from_str(agent_json).unwrap(); + match agent_msg { + AgentMessage::UpgradeProgress { + msg_id, + job_id, + target_version, + stage, + } => { + assert_eq!(msg_id, "m1"); + assert_eq!(job_id, None); + assert_eq!(target_version, "2.0.0"); + assert_eq!(stage, UpgradeStage::Downloading); + } + _ => panic!("Expected UpgradeProgress"), + } + } + + #[test] + fn test_browser_full_sync_with_upgrades_round_trip() { + let msg = BrowserMessage::FullSync { + servers: vec![], + upgrades: vec![UpgradeJobDto { + server_id: "server-1".to_string(), + job_id: "job-1".to_string(), + target_version: "2.0.0".to_string(), + stage: UpgradeStage::Installing, + status: UpgradeStatus::Running, + error: None, + backup_path: Some("/backups/server-1.tar.gz".to_string()), + started_at: chrono::Utc::now(), + finished_at: None, + }], + }; + + let json = serde_json::to_string(&msg).unwrap(); + let parsed: BrowserMessage = serde_json::from_str(&json).unwrap(); + + match parsed { + BrowserMessage::FullSync { servers, upgrades } => { + assert!(servers.is_empty()); + assert_eq!(upgrades.len(), 1); + assert_eq!(upgrades[0].server_id, "server-1"); + assert_eq!(upgrades[0].job_id, "job-1"); + 
assert_eq!(upgrades[0].target_version, "2.0.0"); + assert_eq!(upgrades[0].stage, UpgradeStage::Installing); + assert_eq!(upgrades[0].status, UpgradeStatus::Running); + assert_eq!(upgrades[0].error, None); + assert_eq!( + upgrades[0].backup_path, + Some("/backups/server-1.tar.gz".to_string()) + ); + assert!(upgrades[0].finished_at.is_none()); + } + _ => panic!("Expected FullSync"), + } + } + + #[test] + fn test_agent_info_updated_accepts_optional_agent_version() { + let json = r#"{"type":"agent_info_updated","server_id":"server-1","protocol_version":3,"agent_version":"1.2.3"}"#; + + match serde_json::from_str::(json).unwrap() { + BrowserMessage::AgentInfoUpdated { + server_id, + protocol_version, + agent_version, + } => { + assert_eq!(server_id, "server-1"); + assert_eq!(protocol_version, 3); + assert_eq!(agent_version.as_deref(), Some("1.2.3")); + } + _ => panic!("Expected AgentInfoUpdated"), + } + } } diff --git a/crates/server/src/config.rs b/crates/server/src/config.rs index 21f57d90..7784ccf2 100644 --- a/crates/server/src/config.rs +++ b/crates/server/src/config.rs @@ -275,6 +275,8 @@ impl Default for SchedulerConfig { pub struct UpgradeConfig { #[serde(default = "default_release_base_url")] pub release_base_url: String, + #[serde(default)] + pub latest_version_url: String, } fn default_release_base_url() -> String { @@ -285,6 +287,7 @@ impl Default for UpgradeConfig { fn default() -> Self { Self { release_base_url: default_release_base_url(), + latest_version_url: String::new(), } } } diff --git a/crates/server/src/main.rs b/crates/server/src/main.rs index ef880eaf..83409014 100644 --- a/crates/server/src/main.rs +++ b/crates/server/src/main.rs @@ -88,6 +88,8 @@ async fn main() -> anyhow::Result<()> { tokio::spawn(async move { task::task_scheduler::run(s).await }); let s = state.clone(); tokio::spawn(async move { task::service_monitor_checker::run(s).await }); + let s = state.clone(); + tokio::spawn(async move { task::upgrade_timeout::run(s).await }); // 
Build router let app = create_router(state); diff --git a/crates/server/src/openapi.rs b/crates/server/src/openapi.rs index fb529c2c..061b2bab 100644 --- a/crates/server/src/openapi.rs +++ b/crates/server/src/openapi.rs @@ -58,6 +58,7 @@ impl Modify for SecurityAddon { // status (public) crate::router::api::status::public_status, // agent + crate::router::api::agent::latest_version, crate::router::api::agent::register, // servers crate::router::api::server::list_servers, @@ -220,6 +221,7 @@ impl Modify for SecurityAddon { crate::entity::oauth_account::Model, crate::router::api::oauth::OAuthProvidersResponse, // agent + crate::service::upgrade_release::LatestAgentVersionResponse, crate::router::api::agent::RegisterResponse, // servers crate::router::api::server::ServerResponse, diff --git a/crates/server/src/router/api/agent.rs b/crates/server/src/router/api/agent.rs index 3f143c6e..1682a241 100644 --- a/crates/server/src/router/api/agent.rs +++ b/crates/server/src/router/api/agent.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use axum::extract::{ConnectInfo, State}; use axum::http::HeaderMap; -use axum::routing::post; +use axum::routing::{get, post}; use axum::{Json, Router}; use chrono::Utc; use sea_orm::{ @@ -19,6 +19,7 @@ use crate::router::utils::extract_client_ip; use crate::service::auth::AuthService; use crate::service::config::ConfigService; use crate::service::network_probe::NetworkProbeService; +use crate::service::upgrade_release::LatestAgentVersionResponse; use crate::state::AppState; const CONFIG_KEY_AUTO_DISCOVERY: &str = "auto_discovery_key"; @@ -41,6 +42,25 @@ pub fn public_router() -> Router> { Router::new().route("/agent/register", post(register)) } +pub fn read_router() -> Router> { + Router::new().route("/agent/latest-version", get(latest_version)) +} + +#[utoipa::path( + get, + path = "/api/agent/latest-version", + tag = "agent", + responses( + (status = 200, description = "Latest agent release metadata", body = LatestAgentVersionResponse), + ), + 
security(("session_cookie" = []), ("api_key" = []), ("bearer_token" = [])) +)] +pub async fn latest_version( + State(state): State>, +) -> Result>, AppError> { + ok(state.upgrade_release_service.latest().await) +} + #[utoipa::path( post, path = "/api/agent/register", diff --git a/crates/server/src/router/api/mod.rs b/crates/server/src/router/api/mod.rs index dd7d6ca5..0b3f3edb 100644 --- a/crates/server/src/router/api/mod.rs +++ b/crates/server/src/router/api/mod.rs @@ -47,6 +47,7 @@ pub fn router(state: Arc) -> Router> { .merge(auth::protected_router()) .merge(mobile::protected_router()) // Read-only routes accessible to all authenticated users + .merge(agent::read_router()) .merge(server::read_router()) .merge(server_group::read_router()) .merge(ping::read_router()) diff --git a/crates/server/src/router/api/server.rs b/crates/server/src/router/api/server.rs index a5723116..c673d48f 100644 --- a/crates/server/src/router/api/server.rs +++ b/crates/server/src/router/api/server.rs @@ -26,6 +26,7 @@ use crate::service::network_probe::NetworkProbeService; use crate::service::ping::PingService; use crate::service::record::{QueryHistoryResult, RecordService}; use crate::service::server::{ServerService, UpdateServerInput}; +use crate::service::upgrade_tracker::{StartUpgradeJobError, UpgradeLookup}; use crate::state::AppState; use serverbee_common::constants::effective_capabilities; use serverbee_common::protocol::{BrowserMessage, ServerMessage}; @@ -567,60 +568,41 @@ async fn trigger_upgrade( format!("serverbee-agent-{os}-{arch}") }; - let base_url = &state.config.upgrade.release_base_url; - let download_url = format!("{base_url}/download/v{version}/{asset_name}"); - - // Fetch checksums.txt - let checksums_url = format!("{base_url}/download/v{version}/checksums.txt"); - let checksums_response = reqwest::get(&checksums_url) - .await - .map_err(|e| AppError::Internal(format!("Failed to fetch checksums: {e}")))?; - - if !checksums_response.status().is_success() { - return 
Err(AppError::NotFound(format!( - "Checksums not found for version v{version} (HTTP {})", - checksums_response.status() - ))); - } - - let checksums_body = checksums_response - .text() - .await - .map_err(|e| AppError::Internal(format!("Failed to read checksums: {e}")))?; - - // Parse: each line is " " or " " - let sha256 = checksums_body - .lines() - .find_map(|line| { - let mut parts = line.splitn(2, |c: char| c.is_whitespace()); - let hash = parts.next()?; - let name = parts.next()?.trim(); - if name == asset_name { - Some(hash.to_string()) - } else { - None - } - }) - .ok_or_else(|| { - AppError::NotFound(format!( - "Checksum not found for {asset_name} in v{version} release" - )) - })?; + let asset = state + .upgrade_release_service + .resolve_asset(version, &asset_name) + .await?; let sender = state .agent_manager .get_sender(&id) .ok_or_else(|| AppError::NotFound("Agent not connected".into()))?; + let job = state + .upgrade_tracker + .start_job(&id, version.to_string()) + .map_err(|error| match error { + StartUpgradeJobError::Conflict(existing) => AppError::Conflict(format!( + "Upgrade already running for server {} (job_id={}, target_version={})", + existing.server_id, existing.job_id, existing.target_version + )), + })?; + let msg = ServerMessage::Upgrade { version: version.to_string(), - download_url, - sha256, + download_url: asset.download_url, + sha256: asset.sha256, + job_id: Some(job.job_id.clone()), }; - sender - .send(msg) - .await - .map_err(|_| AppError::Internal("Failed to send upgrade command".into()))?; + if let Err(_send_error) = sender.send(msg).await { + state.upgrade_tracker.mark_failed( + UpgradeLookup::from_job(&job), + job.stage, + "Failed to send upgrade command".into(), + None, + ); + return Err(AppError::Internal("Failed to send upgrade command".into())); + } ok("ok") } diff --git a/crates/server/src/router/ws/agent.rs b/crates/server/src/router/ws/agent.rs index 11962abc..002c0b85 100644 --- a/crates/server/src/router/ws/agent.rs +++ 
b/crates/server/src/router/ws/agent.rs @@ -19,6 +19,7 @@ use crate::service::network_probe::NetworkProbeService; use crate::service::ping::PingService; use crate::service::record::RecordService; use crate::service::server::ServerService; +use crate::service::upgrade_tracker::UpgradeLookup; use crate::state::AppState; use serverbee_common::constants::{MAX_WS_MESSAGE_SIZE, effective_capabilities}; use serverbee_common::protocol::{AgentMessage, BrowserMessage, ServerMessage}; @@ -505,8 +506,18 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent .broadcast_browser(BrowserMessage::AgentInfoUpdated { server_id: server_id.to_string(), protocol_version: agent_pv, + agent_version: Some(info.agent_version.clone()), }); + if let Some(job) = state.upgrade_tracker.get(server_id) + && job.status == serverbee_common::protocol::UpgradeStatus::Running + && job.target_version == info.agent_version + { + state + .upgrade_tracker + .mark_succeeded(UpgradeLookup::from_job(&job), None); + } + // Send Ack if let Some(tx) = state.agent_manager.get_sender(server_id) { let _ = tx.send(ServerMessage::Ack { msg_id }).await; @@ -553,6 +564,39 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent let _ = tx.send(ServerMessage::Ack { msg_id }).await; } } + AgentMessage::UpgradeProgress { + msg_id, + job_id, + target_version, + stage, + } => { + state + .upgrade_tracker + .update_stage(UpgradeLookup::new(server_id, job_id, target_version), stage); + + if let Some(tx) = state.agent_manager.get_sender(server_id) { + let _ = tx.send(ServerMessage::Ack { msg_id }).await; + } + } + AgentMessage::UpgradeResult { + msg_id, + job_id, + target_version, + stage, + error, + backup_path, + } => { + state.upgrade_tracker.mark_failed( + UpgradeLookup::new(server_id, job_id, target_version), + stage, + error, + backup_path, + ); + + if let Some(tx) = state.agent_manager.get_sender(server_id) { + let _ = tx.send(ServerMessage::Ack { msg_id }).await; + } + } 
AgentMessage::PingResult(result) => { if let Err(e) = save_ping_result(&state.db, server_id, &result).await { tracing::error!("Failed to save ping result for {server_id}: {e}"); @@ -625,6 +669,13 @@ async fn handle_agent_message(state: &Arc, server_id: &str, msg: Agent } } } + if capability == "upgrade" + && let Some(job) = state.upgrade_tracker.get(server_id) + { + state + .upgrade_tracker + .mark_failed_by_capability_denied(UpgradeLookup::from_job(&job), reason); + } // For terminal: unregister session so browser gets notified if let Some(sid) = &session_id { state.agent_manager.unregister_terminal_session(sid); diff --git a/crates/server/src/router/ws/browser.rs b/crates/server/src/router/ws/browser.rs index bca1e365..270fbba7 100644 --- a/crates/server/src/router/ws/browser.rs +++ b/crates/server/src/router/ws/browser.rs @@ -255,6 +255,7 @@ async fn build_full_sync(state: &Arc) -> BrowserMessage { tracing::error!("Failed to list servers for FullSync: {e}"); return BrowserMessage::FullSync { servers: Vec::new(), + upgrades: state.upgrade_tracker.snapshot(), }; } }; @@ -351,7 +352,10 @@ async fn build_full_sync(state: &Arc) -> BrowserMessage { }) .collect(); - BrowserMessage::FullSync { servers: statuses } + BrowserMessage::FullSync { + servers: statuses, + upgrades: state.upgrade_tracker.snapshot(), + } } async fn send_browser_message( diff --git a/crates/server/src/service/mod.rs b/crates/server/src/service/mod.rs index 6128cb3d..39ed9037 100644 --- a/crates/server/src/service/mod.rs +++ b/crates/server/src/service/mod.rs @@ -25,4 +25,6 @@ pub mod status_page; pub mod task_scheduler; pub mod traffic; pub mod uptime; +pub mod upgrade_release; +pub mod upgrade_tracker; pub mod user; diff --git a/crates/server/src/state.rs b/crates/server/src/state.rs index c6327773..5734c299 100644 --- a/crates/server/src/state.rs +++ b/crates/server/src/state.rs @@ -17,6 +17,8 @@ use crate::service::high_risk_audit::{ DockerLogsAuditContext, ExecAuditContext, 
TerminalAuditContext, }; use crate::service::task_scheduler::TaskScheduler; +use crate::service::upgrade_release::UpgradeReleaseService; +use crate::service::upgrade_tracker::UpgradeJobTracker; /// Pending TOTP setup data, keyed by user_id. pub struct PendingTotp { @@ -41,6 +43,8 @@ pub struct AppState { pub agent_manager: AgentManager, pub browser_tx: broadcast::Sender, pub config: AppConfig, + pub upgrade_tracker: UpgradeJobTracker, + pub upgrade_release_service: UpgradeReleaseService, pub geoip: Arc>>, pub geoip_downloading: AtomicBool, /// CSRF state tokens for OAuth flow, keyed by state string → provider. @@ -128,6 +132,8 @@ impl AppState { pub async fn new(db: DatabaseConnection, config: AppConfig) -> Result, AppError> { let (browser_tx, _) = broadcast::channel(256); let agent_manager = AgentManager::new(browser_tx.clone()); + let upgrade_tracker = UpgradeJobTracker::new(browser_tx.clone()); + let upgrade_release_service = UpgradeReleaseService::new(&config.upgrade); let geoip = if !config.geoip.mmdb_path.is_empty() { GeoIpService::load(&config.geoip.mmdb_path) } else { @@ -162,6 +168,8 @@ impl AppState { agent_manager, browser_tx, config, + upgrade_tracker, + upgrade_release_service, geoip: Arc::new(std::sync::RwLock::new(geoip)), geoip_downloading: AtomicBool::new(false), oauth_states: DashMap::new(), diff --git a/crates/server/src/task/mod.rs b/crates/server/src/task/mod.rs index b3a852ff..c0ada32d 100644 --- a/crates/server/src/task/mod.rs +++ b/crates/server/src/task/mod.rs @@ -6,3 +6,4 @@ pub mod record_writer; pub mod service_monitor_checker; pub mod session_cleaner; pub mod task_scheduler; +pub mod upgrade_timeout; From 7080233f59ad8e2c8d984489a189c253592d2cc1 Mon Sep 17 00:00:00 2001 From: ZingerLittleBee <6970999@gmail.com> Date: Tue, 14 Apr 2026 21:31:08 +0800 Subject: [PATCH 17/17] fix(ci): resolve frontend typecheck and clippy errors --- .../server/agent-version-section.test.tsx | 9 ++++++--- .../src/components/server/upgrade-job-badge.tsx | 16 
+++++++++------- .../components/uptime/uptime-timeline.test.tsx | 2 +- apps/web/src/hooks/use-servers-ws.ts | 4 ++-- .../web/src/routes/_authed/servers/$id.test.tsx | 4 +++- apps/web/src/routes/_authed/servers/$id.tsx | 17 ++++++++--------- apps/web/src/stores/upgrade-jobs-store.test.ts | 7 ++++--- crates/server/src/service/upgrade_tracker.rs | 4 ++-- 8 files changed, 35 insertions(+), 28 deletions(-) diff --git a/apps/web/src/components/server/agent-version-section.test.tsx b/apps/web/src/components/server/agent-version-section.test.tsx index 550054cb..81b6a985 100644 --- a/apps/web/src/components/server/agent-version-section.test.tsx +++ b/apps/web/src/components/server/agent-version-section.test.tsx @@ -24,6 +24,9 @@ vi.mock('@/lib/capabilities', () => ({ CAP_UPGRADE: 4, getEffectiveCapabilityEnabled: (...args: unknown[]) => mockGetEffectiveCapabilityEnabled(...args) })) +const UPGRADE_LATEST_PATTERN = /upgrade_latest_version/ +const UPGRADE_ERROR_WITH_BACKUP_PATTERN = /upgrade_error_with_backup/ +const UPGRADE_BACKUP_PATH_PATTERN = /upgrade_backup_path/ describe('AgentVersionSection', () => { beforeEach(() => { @@ -73,7 +76,7 @@ describe('AgentVersionSection', () => { serverId="srv-1" /> ) - expect(screen.getByText(/upgrade_latest_version/)).toBeDefined() + expect(screen.getByText(UPGRADE_LATEST_PATTERN)).toBeDefined() }) it('shows upgrade button for admin when update available and capability enabled', () => { @@ -232,7 +235,7 @@ describe('AgentVersionSection', () => { ) expect(screen.getByText('upgrade_status_failed')).toBeDefined() expect(screen.getByText('Download failed: connection timeout')).toBeDefined() - expect(screen.getByText(/upgrade_error_with_backup/)).toBeDefined() + expect(screen.getByText(UPGRADE_ERROR_WITH_BACKUP_PATTERN)).toBeDefined() }) it('shows timeout state with backup path', () => { @@ -261,7 +264,7 @@ describe('AgentVersionSection', () => { /> ) expect(screen.getByText('upgrade_status_timeout')).toBeDefined() - 
expect(screen.getByText(/upgrade_backup_path/)).toBeDefined() + expect(screen.getByText(UPGRADE_BACKUP_PATH_PATTERN)).toBeDefined() }) it('disables upgrade button while loading', () => { diff --git a/apps/web/src/components/server/upgrade-job-badge.tsx b/apps/web/src/components/server/upgrade-job-badge.tsx index ae6d8fc7..384e9809 100644 --- a/apps/web/src/components/server/upgrade-job-badge.tsx +++ b/apps/web/src/components/server/upgrade-job-badge.tsx @@ -29,14 +29,16 @@ export function UpgradeJobBadge({ job }: UpgradeJobBadgeProps) { const Icon = config.icon return ( - + - - - - {job.status === 'running' && t(`upgrade_stage_${job.stage}`)} - - + + + {job.status === 'running' && t(`upgrade_stage_${job.stage}`)} + + } + />

{t(config.label)}

diff --git a/apps/web/src/components/uptime/uptime-timeline.test.tsx b/apps/web/src/components/uptime/uptime-timeline.test.tsx index f15c3608..ae93fc16 100644 --- a/apps/web/src/components/uptime/uptime-timeline.test.tsx +++ b/apps/web/src/components/uptime/uptime-timeline.test.tsx @@ -1,5 +1,5 @@ import { render, screen } from '@testing-library/react' -import { describe, expect, it } from 'vitest' +import { describe, expect, it, vi } from 'vitest' import type { UptimeDailyEntry } from '@/lib/api-schema' import { computeAggregateUptime } from '@/lib/widget-helpers' import { UptimeTimeline } from './uptime-timeline' diff --git a/apps/web/src/hooks/use-servers-ws.ts b/apps/web/src/hooks/use-servers-ws.ts index a6eb50d1..65ec5382 100644 --- a/apps/web/src/hooks/use-servers-ws.ts +++ b/apps/web/src/hooks/use-servers-ws.ts @@ -357,7 +357,7 @@ export function handleWsMessage(raw: unknown, queryClient: QueryClient): void { ) { break } - const { server_id, target_version, stage } = raw as { + const { server_id, target_version, stage } = raw as unknown as { server_id: string job_id: string target_version: string @@ -382,7 +382,7 @@ export function handleWsMessage(raw: unknown, queryClient: QueryClient): void { ) { break } - const { server_id, job_id, target_version, status, stage, error, backup_path } = raw as { + const { server_id, job_id, target_version, status, stage, error, backup_path } = raw as unknown as { server_id: string job_id: string target_version: string diff --git a/apps/web/src/routes/_authed/servers/$id.test.tsx b/apps/web/src/routes/_authed/servers/$id.test.tsx index b924cd77..0317037b 100644 --- a/apps/web/src/routes/_authed/servers/$id.test.tsx +++ b/apps/web/src/routes/_authed/servers/$id.test.tsx @@ -33,7 +33,9 @@ vi.mock('react-i18next', () => ({ })) vi.mock('@/components/server/agent-version-section', () => ({ - AgentVersionSection: ({ latestVersion }: { latestVersion?: string | null }) =>
{latestVersion ?? 'no-latest-version'}
+ AgentVersionSection: ({ latestVersion }: { latestVersion?: string | null }) => ( +
{latestVersion ?? 'no-latest-version'}
+ ) })) vi.mock('@/components/server/capabilities-dialog', () => ({ diff --git a/apps/web/src/routes/_authed/servers/$id.tsx b/apps/web/src/routes/_authed/servers/$id.tsx index 4995cf8c..df7ead44 100644 --- a/apps/web/src/routes/_authed/servers/$id.tsx +++ b/apps/web/src/routes/_authed/servers/$id.tsx @@ -23,10 +23,10 @@ import type { ServerMetrics } from '@/hooks/use-servers-ws' import { api } from '@/lib/api-client' import type { ServerResponse } from '@/lib/api-schema' import { CAP_DOCKER, CAP_FILE, CAP_TERMINAL, getEffectiveCapabilityEnabled } from '@/lib/capabilities' -import { useUpgradeJobsStore } from '@/stores/upgrade-jobs-store' import { buildMergedDiskIoSeries, buildPerDiskIoSeries } from '@/lib/disk-io' import { cn, countryCodeToFlag, formatBytes } from '@/lib/utils' import { computeAggregateUptime } from '@/lib/widget-helpers' +import { useUpgradeJobsStore } from '@/stores/upgrade-jobs-store' export const Route = createFileRoute('/_authed/servers/$id')({ component: ServerDetailPage, @@ -387,6 +387,7 @@ export function ServerDetailPage() { refetchOnWindowFocus: false }) const liveData = liveServers?.find((s) => s.id === id) + const upgradeJob = useUpgradeJobsStore((state) => state.jobs.get(id)) const chartData: Record[] = useMemo(() => { if (isRealtime) { @@ -524,8 +525,6 @@ export function ServerDetailPage() { CAP_DOCKER ) - const upgradeJob = useUpgradeJobsStore((state) => state.jobs.get(id)) - // Network cumulative traffic from live data const liveNetIn = liveData?.net_in_transfer ?? 0 const liveNetOut = liveData?.net_out_transfer ?? 0 @@ -543,12 +542,12 @@ export function ServerDetailPage() {
-
- {flag && {flag}} -

{server.name}

- - -
+
+ {flag && {flag}} +

{server.name}

+ + +
{ @@ -17,14 +18,14 @@ describe('useUpgradeJobsStore', () => { server_id: string job_id: string target_version: string - stage: string - status: string + stage: UpgradeJob['stage'] + status: UpgradeJob['status'] error: string | null backup_path: string | null started_at: string finished_at: string | null }> = {} - ) { + ): UpgradeJob { return { server_id: 'server-1', job_id: 'job-1', diff --git a/crates/server/src/service/upgrade_tracker.rs b/crates/server/src/service/upgrade_tracker.rs index 036bf5a1..c9eada19 100644 --- a/crates/server/src/service/upgrade_tracker.rs +++ b/crates/server/src/service/upgrade_tracker.rs @@ -84,7 +84,7 @@ impl UpgradeLookup { #[derive(Debug, Clone, PartialEq)] pub enum StartUpgradeJobError { - Conflict(UpgradeJob), + Conflict(Box), } pub struct UpgradeJobTracker { @@ -111,7 +111,7 @@ impl UpgradeJobTracker { if let Some(existing) = self.jobs.get(&server_id) && existing.status == UpgradeStatus::Running { - return Err(StartUpgradeJobError::Conflict(existing.clone())); + return Err(StartUpgradeJobError::Conflict(Box::new(existing.clone()))); } let job = UpgradeJob {