Skip to content

Commit d2b3f44

Browse files
author
lif
committed
phd-runner option to defer guest cleanup on failure
When `--manual-stop-on-failure` is passed, the propolis-server for each failed test case is left running (unless the test case had already shut it down explicitly before failing), and its address is echoed to the operator so that they can, for example, connect to its serial console to investigate or debug whatever may have caused the test failure. The test suite pauses until the instances left in this state are shut down manually, then continues running further tests (unless interrupted). Beyond convenience, this is preferable to reproducing a failure by hand from a transcription of a phd-test's instance spec and steps: a manually-reconstructed scenario may behave differently due to the human-scale timing of guest shell command invocations.
1 parent 368a222 commit d2b3f44

4 files changed

Lines changed: 106 additions & 4 deletions

File tree

phd-tests/framework/src/lib.rs

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,10 @@ use test_vm::{
4343
environment::EnvironmentSpec, spec::VmSpec, VmConfig, VmLocation,
4444
};
4545
use tokio::{
46-
sync::mpsc::{UnboundedReceiver, UnboundedSender},
46+
sync::{
47+
mpsc::{UnboundedReceiver, UnboundedSender},
48+
watch,
49+
},
4750
task::JoinHandle,
4851
};
4952

@@ -63,6 +66,8 @@ pub(crate) mod zfs;
6366
pub struct TestCtx {
6467
pub(crate) framework: Arc<Framework>,
6568
pub(crate) output_dir: Utf8PathBuf,
69+
pub(crate) success_sigint_rx:
70+
Option<(watch::Receiver<Option<bool>>, watch::Receiver<bool>)>,
6671
}
6772

6873
/// An instance of the PHD test framework.
@@ -239,6 +244,17 @@ impl TestCtx {
239244
)
240245
.await
241246
}
247+
248+
/// When phd-runner is configured to leave instances running on failed
249+
/// tests, the watch channel whose Receiver is passed to this function is
250+
/// used to indicate to the instance cleanup task that a test *has* failed.
251+
pub fn set_cleanup_task_outcome_receiver(
252+
&mut self,
253+
success_rx: watch::Receiver<Option<bool>>,
254+
sigint_rx: watch::Receiver<bool>,
255+
) {
256+
self.success_sigint_rx = Some((success_rx, sigint_rx));
257+
}
242258
}
243259

244260
// The framework implementation includes some "runner-only" functions
@@ -330,7 +346,7 @@ impl Framework {
330346
pub fn test_ctx(self: &Arc<Self>, fully_qualified_name: String) -> TestCtx {
331347
let output_dir =
332348
self.tmp_directory.as_path().join(&fully_qualified_name);
333-
TestCtx { framework: self.clone(), output_dir }
349+
TestCtx { framework: self.clone(), output_dir, success_sigint_rx: None }
334350
}
335351

336352
/// Resets the state of any stateful objects in the framework to prepare it

phd-tests/framework/src/test_vm/mod.rs

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ use propolis_client::{
4343
use propolis_client::{Client, ResponseValue};
4444
use thiserror::Error;
4545
use tokio::{
46-
sync::{mpsc::UnboundedSender, oneshot},
46+
sync::{mpsc::UnboundedSender, oneshot, watch},
4747
task::JoinHandle,
4848
time::timeout,
4949
};
@@ -272,6 +272,13 @@ pub struct TestVm {
272272

273273
state: VmState,
274274

275+
/// If we should wait for operator intervention before terminating this
276+
/// instance, a `Some((success_rx, _))` here will be sent a `Some(false)`.
277+
/// While waiting, a `Some((_, sigint_rx))` will be sent a `true` if the
278+
/// user sends a keyboard interrupt to indicate we should stop waiting.
279+
success_sigint_rx:
280+
Option<(watch::Receiver<Option<bool>>, watch::Receiver<bool>)>,
281+
275282
/// Sending a task handle to this channel will ensure that the task runs to
276283
/// completion as part of the post-test cleanup fixture (i.e. before any
277284
/// other tests run).
@@ -328,6 +335,7 @@ impl TestVm {
328335
environment.clone(),
329336
params,
330337
ctx.framework.cleanup_task_channel(),
338+
ctx.success_sigint_rx.clone(),
331339
),
332340
}
333341
}
@@ -338,6 +346,10 @@ impl TestVm {
338346
environment_spec: EnvironmentSpec,
339347
mut params: ServerProcessParameters,
340348
cleanup_task_tx: UnboundedSender<JoinHandle<()>>,
349+
success_sigint_rx: Option<(
350+
watch::Receiver<Option<bool>>,
351+
watch::Receiver<bool>,
352+
)>,
341353
) -> Result<Self> {
342354
let metrics = environment_spec.metrics.as_ref().map(|m| match m {
343355
MetricsLocation::Local => {
@@ -372,6 +384,7 @@ impl TestVm {
372384
guest_os,
373385
state: VmState::New,
374386
cleanup_task_tx,
387+
success_sigint_rx,
375388
})
376389
}
377390

@@ -1134,6 +1147,9 @@ impl Drop for TestVm {
11341147

11351148
let disks: Vec<_> = self.vm_spec().disk_handles.drain(..).collect();
11361149

1150+
let success_sigint_rx_opt = self.success_sigint_rx.take();
1151+
let name = self.vm_spec().vm_name.to_owned();
1152+
11371153
// The order in which the task destroys objects is important: the server
11381154
// can't be killed until the client has gotten a chance to shut down
11391155
// the VM, and the disks can't be destroyed until the server process has
@@ -1144,6 +1160,49 @@ impl Drop for TestVm {
11441160
// kept alive until the server process is gone.
11451161
let _disks = disks;
11461162

1163+
// Check if we should let the user access the instance of a
1164+
// failed testcase before ensuring its demolition
1165+
if let Some((mut success_rx, sigint_rx)) = success_sigint_rx_opt {
1166+
while success_rx.changed().await.is_ok() {
1167+
match *success_rx.borrow() {
1168+
None => continue,
1169+
Some(true) => break,
1170+
Some(false) => {}
1171+
}
1172+
let sock = server.server_addr();
1173+
let ip = sock.ip();
1174+
let port = sock.port();
1175+
let mut uninformed = true;
1176+
// States that might be worth inspecting out-of-band
1177+
while let Ok(InstanceState::Running
1178+
| InstanceState::Migrating
1179+
| InstanceState::Rebooting
1180+
| InstanceState::Repairing
1181+
) = client
1182+
.instance_get()
1183+
.send()
1184+
.await
1185+
.map(|inst| inst.instance.state)
1186+
{
1187+
if *sigint_rx.borrow() { break }
1188+
if uninformed {
1189+
// write to stderr irrespective of log level
1190+
eprintln!(r#"
1191+
propolis-server {name:?} left running at address {sock}
1192+
phd-runner will resume when this instance is shut down; e.g. by one of:
1193+
1194+
$ propolis-cli -s {ip} -p {port} serial
1195+
localhost:~# poweroff
1196+
1197+
$ propolis-cli -s {ip} -p {port} state stop
1198+
"#);
1199+
uninformed = false;
1200+
}
1201+
tokio::time::sleep(Duration::from_secs(1)).await;
1202+
}
1203+
}
1204+
}
1205+
11471206
// Try to make sure the server's kernel VMM is cleaned up before
11481207
// killing the server process. This is best-effort; if it fails,
11491208
// the kernel VMM is leaked. This generally indicates a bug in

phd-tests/runner/src/config.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,16 @@ pub struct RunOptions {
202202
// number of seconds, but i'm lazy...
203203
#[clap(long, value_parser, default_value_t = 60 * 20)]
204204
pub max_buildomat_wait_secs: u64,
205+
206+
/// When a testcase fails while this is enabled, any instances started by
207+
/// the failed test are left running and phd-runner waits until they are
208+
/// manually shut down out-of-band by the operator.
209+
///
210+
/// This feature is intended to give the operator a chance to inspect the
211+
/// state of the guest(s) easily without necessarily having to reconstruct
212+
/// the scenario by hand.
213+
#[clap(long, value_parser)]
214+
pub manual_stop_on_failure: bool,
205215
}
206216

207217
#[derive(Args, Debug)]

phd-tests/runner/src/execute.rs

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,8 +101,18 @@ pub async fn run_tests_with_ctx(
101101
let framework = ctx.clone();
102102
let tc = execution.tc;
103103
let mut sigint_rx_task = sigint_rx.clone();
104+
let mut test_ctx = framework.test_ctx(tc.fully_qualified_name());
105+
let mut success_tx = None;
106+
if run_opts.manual_stop_on_failure {
107+
let sigint_rx_cleanup = sigint_rx.clone();
108+
let (tx, success_rx) = watch::channel(None);
109+
test_ctx.set_cleanup_task_outcome_receiver(
110+
success_rx,
111+
sigint_rx_cleanup,
112+
);
113+
success_tx = Some(tx);
114+
}
104115
let test_outcome = tokio::spawn(async move {
105-
let test_ctx = framework.test_ctx(tc.fully_qualified_name());
106116
tokio::select! {
107117
// Ensure interrupt signals are always handled instead of
108118
// continuing to run the test.
@@ -144,6 +154,13 @@ pub async fn run_tests_with_ctx(
144154
}
145155
);
146156

157+
if let Some(tx) = success_tx {
158+
let succeeded = !matches!(&test_outcome, TestOutcome::Failed(_));
159+
if let Err(e) = tx.send(Some(succeeded)) {
160+
error!("Error sending outcome to instance cleanup tasks: {e}");
161+
}
162+
}
163+
147164
match test_outcome {
148165
TestOutcome::Passed => stats.tests_passed += 1,
149166
TestOutcome::Failed(_) => {

0 commit comments

Comments
 (0)