|
| 1 | +//! Graceful Shutdown with Drain Mode — FR-010 |
| 2 | +//! |
| 3 | +//! Coordinated shutdown with message preservation: |
| 4 | +//! - Drain mode: stop accepting new messages, process all in-flight |
| 5 | +//! - Grace period: configurable timeout, then hard kill |
| 6 | +//! - Dependency ordering: leaf actors first, parents last |
| 7 | +//! - Checkpoint on drain: automatically snapshot state before termination |
| 8 | +
|
| 9 | +use std::time::{Duration, Instant}; |
| 10 | + |
| 11 | +use crate::actor::{ActorId, ActorState, ActorSupervisor}; |
| 12 | + |
| 13 | +/// Shutdown phase. |
| 14 | +#[derive(Debug, Clone, Copy, PartialEq, Eq)] |
| 15 | +pub enum ShutdownPhase { |
| 16 | + /// Normal operation. |
| 17 | + Running, |
| 18 | + /// Draining: no new messages accepted, processing in-flight. |
| 19 | + Draining, |
| 20 | + /// Grace period expired, hard-killing remaining actors. |
| 21 | + ForceKilling, |
| 22 | + /// Shutdown complete. |
| 23 | + Terminated, |
| 24 | +} |
| 25 | + |
| 26 | +/// Configuration for graceful shutdown. |
| 27 | +#[derive(Debug, Clone)] |
| 28 | +pub struct ShutdownConfig { |
| 29 | + /// Maximum time to wait for actors to drain. |
| 30 | + pub drain_timeout: Duration, |
| 31 | + /// Whether to checkpoint actor state before termination. |
| 32 | + pub checkpoint_on_drain: bool, |
| 33 | + /// Whether to process remaining queue messages before terminating. |
| 34 | + pub process_in_flight: bool, |
| 35 | + /// Shutdown ordering strategy. |
| 36 | + pub ordering: ShutdownOrdering, |
| 37 | +} |
| 38 | + |
| 39 | +impl Default for ShutdownConfig { |
| 40 | + fn default() -> Self { |
| 41 | + Self { |
| 42 | + drain_timeout: Duration::from_secs(30), |
| 43 | + checkpoint_on_drain: true, |
| 44 | + process_in_flight: true, |
| 45 | + ordering: ShutdownOrdering::LeafFirst, |
| 46 | + } |
| 47 | + } |
| 48 | +} |
| 49 | + |
| 50 | +/// Strategy for ordering actor shutdown. |
| 51 | +#[derive(Debug, Clone, Copy, PartialEq, Eq)] |
| 52 | +pub enum ShutdownOrdering { |
| 53 | + /// Shut down leaf actors first, then parents (bottom-up). |
| 54 | + /// Ensures children finish before parents. |
| 55 | + LeafFirst, |
| 56 | + /// Shut down parents first (top-down, cascading). |
| 57 | + ParentFirst, |
| 58 | + /// Shut down all actors simultaneously. |
| 59 | + Parallel, |
| 60 | +} |
| 61 | + |
| 62 | +/// Manages the graceful shutdown process. |
| 63 | +pub struct ShutdownCoordinator { |
| 64 | + /// Current phase. |
| 65 | + phase: ShutdownPhase, |
| 66 | + /// Configuration. |
| 67 | + config: ShutdownConfig, |
| 68 | + /// When shutdown was initiated. |
| 69 | + started_at: Option<Instant>, |
| 70 | + /// Actors that have been told to drain. |
| 71 | + draining_actors: Vec<ActorId>, |
| 72 | + /// Actors that have confirmed drain complete. |
| 73 | + drained_actors: Vec<ActorId>, |
| 74 | + /// Actors that were force-killed. |
| 75 | + force_killed: Vec<ActorId>, |
| 76 | +} |
| 77 | + |
| 78 | +impl ShutdownCoordinator { |
| 79 | + /// Create a new shutdown coordinator. |
| 80 | + pub fn new(config: ShutdownConfig) -> Self { |
| 81 | + Self { |
| 82 | + phase: ShutdownPhase::Running, |
| 83 | + config, |
| 84 | + started_at: None, |
| 85 | + draining_actors: Vec::new(), |
| 86 | + drained_actors: Vec::new(), |
| 87 | + force_killed: Vec::new(), |
| 88 | + } |
| 89 | + } |
| 90 | + |
| 91 | + /// Initiate graceful shutdown. |
| 92 | + /// |
| 93 | + /// Returns the ordered list of actors to drain. |
| 94 | + pub fn initiate(&mut self, supervisor: &ActorSupervisor) -> Vec<ActorId> { |
| 95 | + self.phase = ShutdownPhase::Draining; |
| 96 | + self.started_at = Some(Instant::now()); |
| 97 | + |
| 98 | + // Determine shutdown order |
| 99 | + let actors = self.compute_shutdown_order(supervisor); |
| 100 | + self.draining_actors = actors.clone(); |
| 101 | + |
| 102 | + tracing::info!( |
| 103 | + phase = "draining", |
| 104 | + actors = actors.len(), |
| 105 | + timeout_secs = self.config.drain_timeout.as_secs(), |
| 106 | + "Initiating graceful shutdown" |
| 107 | + ); |
| 108 | + |
| 109 | + actors |
| 110 | + } |
| 111 | + |
| 112 | + /// Mark an actor as drained (finished processing in-flight messages). |
| 113 | + pub fn mark_drained(&mut self, actor: ActorId) { |
| 114 | + self.drained_actors.push(actor); |
| 115 | + } |
| 116 | + |
| 117 | + /// Check if the drain timeout has expired. |
| 118 | + pub fn is_timeout_expired(&self) -> bool { |
| 119 | + self.started_at |
| 120 | + .map(|start| start.elapsed() >= self.config.drain_timeout) |
| 121 | + .unwrap_or(false) |
| 122 | + } |
| 123 | + |
| 124 | + /// Advance the shutdown state machine. |
| 125 | + /// |
| 126 | + /// Returns the current phase and any actors that need force-killing. |
| 127 | + pub fn tick(&mut self) -> (ShutdownPhase, Vec<ActorId>) { |
| 128 | + match self.phase { |
| 129 | + ShutdownPhase::Running => (self.phase, Vec::new()), |
| 130 | + |
| 131 | + ShutdownPhase::Draining => { |
| 132 | + // Check if all actors have drained |
| 133 | + let all_drained = self.draining_actors.iter().all(|a| { |
| 134 | + self.drained_actors.contains(a) |
| 135 | + }); |
| 136 | + |
| 137 | + if all_drained { |
| 138 | + self.phase = ShutdownPhase::Terminated; |
| 139 | + tracing::info!("All actors drained, shutdown complete"); |
| 140 | + (self.phase, Vec::new()) |
| 141 | + } else if self.is_timeout_expired() { |
| 142 | + // Force kill remaining |
| 143 | + self.phase = ShutdownPhase::ForceKilling; |
| 144 | + let remaining: Vec<ActorId> = self.draining_actors |
| 145 | + .iter() |
| 146 | + .filter(|a| !self.drained_actors.contains(a)) |
| 147 | + .copied() |
| 148 | + .collect(); |
| 149 | + |
| 150 | + tracing::warn!( |
| 151 | + remaining = remaining.len(), |
| 152 | + "Drain timeout expired, force-killing remaining actors" |
| 153 | + ); |
| 154 | + |
| 155 | + self.force_killed = remaining.clone(); |
| 156 | + self.phase = ShutdownPhase::Terminated; |
| 157 | + (ShutdownPhase::ForceKilling, remaining) |
| 158 | + } else { |
| 159 | + (self.phase, Vec::new()) |
| 160 | + } |
| 161 | + } |
| 162 | + |
| 163 | + ShutdownPhase::ForceKilling => { |
| 164 | + self.phase = ShutdownPhase::Terminated; |
| 165 | + (self.phase, Vec::new()) |
| 166 | + } |
| 167 | + |
| 168 | + ShutdownPhase::Terminated => (self.phase, Vec::new()), |
| 169 | + } |
| 170 | + } |
| 171 | + |
| 172 | + /// Current shutdown phase. |
| 173 | + pub fn phase(&self) -> ShutdownPhase { |
| 174 | + self.phase |
| 175 | + } |
| 176 | + |
| 177 | + /// Elapsed time since shutdown was initiated. |
| 178 | + pub fn elapsed(&self) -> Option<Duration> { |
| 179 | + self.started_at.map(|s| s.elapsed()) |
| 180 | + } |
| 181 | + |
| 182 | + /// Get shutdown report. |
| 183 | + pub fn report(&self) -> ShutdownReport { |
| 184 | + ShutdownReport { |
| 185 | + phase: self.phase, |
| 186 | + total_actors: self.draining_actors.len(), |
| 187 | + drained: self.drained_actors.len(), |
| 188 | + force_killed: self.force_killed.len(), |
| 189 | + elapsed: self.elapsed(), |
| 190 | + checkpoint_enabled: self.config.checkpoint_on_drain, |
| 191 | + } |
| 192 | + } |
| 193 | + |
| 194 | + /// Compute the shutdown order based on the configured strategy. |
| 195 | + fn compute_shutdown_order(&self, supervisor: &ActorSupervisor) -> Vec<ActorId> { |
| 196 | + let mut order: Vec<ActorId> = supervisor |
| 197 | + .entries() |
| 198 | + .iter() |
| 199 | + .filter(|e| e.actor_state().is_alive()) |
| 200 | + .map(|e| ActorId(e.actor_id)) |
| 201 | + .collect(); |
| 202 | + |
| 203 | + match self.config.ordering { |
| 204 | + ShutdownOrdering::LeafFirst => { |
| 205 | + // Sort by depth (deepest first = leaves first) |
| 206 | + order.sort_by(|a, b| { |
| 207 | + let da = supervisor.depth(*a); |
| 208 | + let db = supervisor.depth(*b); |
| 209 | + db.cmp(&da) // Descending depth = leaves first |
| 210 | + }); |
| 211 | + } |
| 212 | + ShutdownOrdering::ParentFirst => { |
| 213 | + // Sort by depth (shallowest first = parents first) |
| 214 | + order.sort_by(|a, b| { |
| 215 | + let da = supervisor.depth(*a); |
| 216 | + let db = supervisor.depth(*b); |
| 217 | + da.cmp(&db) |
| 218 | + }); |
| 219 | + } |
| 220 | + ShutdownOrdering::Parallel => { |
| 221 | + // No sorting needed |
| 222 | + } |
| 223 | + } |
| 224 | + |
| 225 | + order |
| 226 | + } |
| 227 | +} |
| 228 | + |
| 229 | +/// Report of shutdown results. |
| 230 | +#[derive(Debug, Clone)] |
| 231 | +pub struct ShutdownReport { |
| 232 | + /// Final phase. |
| 233 | + pub phase: ShutdownPhase, |
| 234 | + /// Total actors that were draining. |
| 235 | + pub total_actors: usize, |
| 236 | + /// Successfully drained. |
| 237 | + pub drained: usize, |
| 238 | + /// Force-killed after timeout. |
| 239 | + pub force_killed: usize, |
| 240 | + /// Time taken. |
| 241 | + pub elapsed: Option<Duration>, |
| 242 | + /// Whether checkpointing was enabled. |
| 243 | + pub checkpoint_enabled: bool, |
| 244 | +} |
| 245 | + |
| 246 | +impl std::fmt::Display for ShutdownReport { |
| 247 | + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { |
| 248 | + write!( |
| 249 | + f, |
| 250 | + "Shutdown: {} actors, {} drained, {} force-killed, {:?} elapsed", |
| 251 | + self.total_actors, |
| 252 | + self.drained, |
| 253 | + self.force_killed, |
| 254 | + self.elapsed.unwrap_or_default() |
| 255 | + ) |
| 256 | + } |
| 257 | +} |
| 258 | + |
| 259 | +#[cfg(test)] |
| 260 | +mod tests { |
| 261 | + use super::*; |
| 262 | + use crate::actor::ActorConfig; |
| 263 | + |
| 264 | + #[test] |
| 265 | + fn test_graceful_shutdown_all_drain() { |
| 266 | + let mut supervisor = ActorSupervisor::new(8); |
| 267 | + let config = ActorConfig::named("worker"); |
| 268 | + |
| 269 | + let a1 = supervisor.create_actor(&config, None).unwrap(); |
| 270 | + supervisor.activate_actor(a1).unwrap(); |
| 271 | + let a2 = supervisor.create_actor(&config, None).unwrap(); |
| 272 | + supervisor.activate_actor(a2).unwrap(); |
| 273 | + |
| 274 | + let mut coord = ShutdownCoordinator::new(ShutdownConfig::default()); |
| 275 | + let actors = coord.initiate(&supervisor); |
| 276 | + assert_eq!(actors.len(), 2); |
| 277 | + assert_eq!(coord.phase(), ShutdownPhase::Draining); |
| 278 | + |
| 279 | + // Simulate actors draining |
| 280 | + coord.mark_drained(a1); |
| 281 | + coord.mark_drained(a2); |
| 282 | + |
| 283 | + let (phase, force) = coord.tick(); |
| 284 | + assert_eq!(phase, ShutdownPhase::Terminated); |
| 285 | + assert!(force.is_empty()); |
| 286 | + |
| 287 | + let report = coord.report(); |
| 288 | + assert_eq!(report.drained, 2); |
| 289 | + assert_eq!(report.force_killed, 0); |
| 290 | + } |
| 291 | + |
| 292 | + #[test] |
| 293 | + fn test_shutdown_timeout_force_kill() { |
| 294 | + let mut supervisor = ActorSupervisor::new(8); |
| 295 | + let config = ActorConfig::named("worker"); |
| 296 | + |
| 297 | + let a1 = supervisor.create_actor(&config, None).unwrap(); |
| 298 | + supervisor.activate_actor(a1).unwrap(); |
| 299 | + |
| 300 | + let mut coord = ShutdownCoordinator::new(ShutdownConfig { |
| 301 | + drain_timeout: Duration::from_millis(1), // Very short |
| 302 | + ..Default::default() |
| 303 | + }); |
| 304 | + |
| 305 | + coord.initiate(&supervisor); |
| 306 | + // Don't mark_drained — simulate timeout |
| 307 | + |
| 308 | + std::thread::sleep(Duration::from_millis(5)); |
| 309 | + |
| 310 | + let (phase, force_killed) = coord.tick(); |
| 311 | + assert_eq!(phase, ShutdownPhase::ForceKilling); |
| 312 | + assert_eq!(force_killed.len(), 1); |
| 313 | + assert_eq!(force_killed[0], a1); |
| 314 | + } |
| 315 | + |
| 316 | + #[test] |
| 317 | + fn test_leaf_first_ordering() { |
| 318 | + let mut supervisor = ActorSupervisor::new(8); |
| 319 | + let config = ActorConfig::named("node"); |
| 320 | + |
| 321 | + let root = supervisor.create_actor(&config, None).unwrap(); |
| 322 | + supervisor.activate_actor(root).unwrap(); |
| 323 | + let child = supervisor.create_actor(&config, Some(root)).unwrap(); |
| 324 | + supervisor.activate_actor(child).unwrap(); |
| 325 | + let grandchild = supervisor.create_actor(&config, Some(child)).unwrap(); |
| 326 | + supervisor.activate_actor(grandchild).unwrap(); |
| 327 | + |
| 328 | + let coord = ShutdownCoordinator::new(ShutdownConfig { |
| 329 | + ordering: ShutdownOrdering::LeafFirst, |
| 330 | + ..Default::default() |
| 331 | + }); |
| 332 | + |
| 333 | + let order = coord.compute_shutdown_order(&supervisor); |
| 334 | + // Grandchild should be first (deepest) |
| 335 | + assert_eq!(order[0], grandchild); |
| 336 | + // Root should be last (shallowest) |
| 337 | + assert_eq!(*order.last().unwrap(), root); |
| 338 | + } |
| 339 | + |
| 340 | + #[test] |
| 341 | + fn test_shutdown_report_display() { |
| 342 | + let report = ShutdownReport { |
| 343 | + phase: ShutdownPhase::Terminated, |
| 344 | + total_actors: 5, |
| 345 | + drained: 4, |
| 346 | + force_killed: 1, |
| 347 | + elapsed: Some(Duration::from_secs(2)), |
| 348 | + checkpoint_enabled: true, |
| 349 | + }; |
| 350 | + let s = format!("{}", report); |
| 351 | + assert!(s.contains("5 actors")); |
| 352 | + assert!(s.contains("4 drained")); |
| 353 | + assert!(s.contains("1 force-killed")); |
| 354 | + } |
| 355 | +} |
0 commit comments