Skip to content

Commit d2b3f44

Browse files
author
lif
committed
phd-runner option to defer guest cleanup on failure
When `--manual-stop-on-failure` is passed, the propolis-server for each failed test case is left running (unless the test case had already shut it down explicitly before failing), and its address is echoed to the operator so that they can, for example, connect to its serial console to investigate or debug whatever may have caused the test failure. The test suite pauses until the instances left in this state are shut down manually, then continues running further tests (unless interrupted). Beyond convenience, this is preferable to reproducing a failure by hand from a transcription of a phd-test's instance spec and steps: a manually-reconstructed scenario may behave differently due to the human-scale timing of guest shell command invocations.
1 parent 368a222 commit d2b3f44

4 files changed

Lines changed: 106 additions & 4 deletions

File tree

phd-tests/framework/src/lib.rs

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,10 @@ use test_vm::{
4343
environment::EnvironmentSpec, spec::VmSpec, VmConfig, VmLocation,
4444
};
4545
use tokio::{
46-
sync::mpsc::{UnboundedReceiver, UnboundedSender},
46+
sync::{
47+
mpsc::{UnboundedReceiver, UnboundedSender},
48+
watch,
49+
},
4750
task::JoinHandle,
4851
};
4952

@@ -63,6 +66,8 @@ pub(crate) mod zfs;
6366
pub struct TestCtx {
6467
pub(crate) framework: Arc<Framework>,
6568
pub(crate) output_dir: Utf8PathBuf,
69+
pub(crate) success_sigint_rx:
70+
Option<(watch::Receiver<Option<bool>>, watch::Receiver<bool>)>,
6671
}
6772

6873
/// An instance of the PHD test framework.
@@ -239,6 +244,17 @@ impl TestCtx {
239244
)
240245
.await
241246
}
247+
248+
/// When phd-runner is configured to leave instances running on failed
249+
/// tests, the watch channel whose Receiver is passed to this function is
250+
/// used to indicate to the instance cleanup task that a test *has* failed.
251+
pub fn set_cleanup_task_outcome_receiver(
252+
&mut self,
253+
success_rx: watch::Receiver<Option<bool>>,
254+
sigint_rx: watch::Receiver<bool>,
255+
) {
256+
self.success_sigint_rx = Some((success_rx, sigint_rx));
257+
}
242258
}
243259

244260
// The framework implementation includes some "runner-only" functions
@@ -330,7 +346,7 @@ impl Framework {
330346
pub fn test_ctx(self: &Arc<Self>, fully_qualified_name: String) -> TestCtx {
331347
let output_dir =
332348
self.tmp_directory.as_path().join(&fully_qualified_name);
333-
TestCtx { framework: self.clone(), output_dir }
349+
TestCtx { framework: self.clone(), output_dir, success_sigint_rx: None }
334350
}
335351

336352
/// Resets the state of any stateful objects in the framework to prepare it

phd-tests/framework/src/test_vm/mod.rs

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ use propolis_client::{
4343
use propolis_client::{Client, ResponseValue};
4444
use thiserror::Error;
4545
use tokio::{
46-
sync::{mpsc::UnboundedSender, oneshot},
46+
sync::{mpsc::UnboundedSender, oneshot, watch},
4747
task::JoinHandle,
4848
time::timeout,
4949
};
@@ -272,6 +272,13 @@ pub struct TestVm {
272272

273273
state: VmState,
274274

275+
/// If we should wait for operator intervention before terminating this
276+
/// instance, a `Some((success_rx, _))` here will be sent a `Some(false)`.
277+
/// While waiting, a `Some((_, sigint_rx))` will be sent a `true` if the
278+
/// user sends a keyboard interrupt to indicate we should stop waiting.
279+
success_sigint_rx:
280+
Option<(watch::Receiver<Option<bool>>, watch::Receiver<bool>)>,
281+
275282
/// Sending a task handle to this channel will ensure that the task runs to
276283
/// completion as part of the post-test cleanup fixture (i.e. before any
277284
/// other tests run).
@@ -328,6 +335,7 @@ impl TestVm {
328335
environment.clone(),
329336
params,
330337
ctx.framework.cleanup_task_channel(),
338+
ctx.success_sigint_rx.clone(),
331339
),
332340
}
333341
}
@@ -338,6 +346,10 @@ impl TestVm {
338346
environment_spec: EnvironmentSpec,
339347
mut params: ServerProcessParameters,
340348
cleanup_task_tx: UnboundedSender<JoinHandle<()>>,
349+
success_sigint_rx: Option<(
350+
watch::Receiver<Option<bool>>,
351+
watch::Receiver<bool>,
352+
)>,
341353
) -> Result<Self> {
342354
let metrics = environment_spec.metrics.as_ref().map(|m| match m {
343355
MetricsLocation::Local => {
@@ -372,6 +384,7 @@ impl TestVm {
372384
guest_os,
373385
state: VmState::New,
374386
cleanup_task_tx,
387+
success_sigint_rx,
375388
})
376389
}
377390

@@ -1134,6 +1147,9 @@ impl Drop for TestVm {
11341147

11351148
let disks: Vec<_> = self.vm_spec().disk_handles.drain(..).collect();
11361149

1150+
let success_sigint_rx_opt = self.success_sigint_rx.take();
1151+
let name = self.vm_spec().vm_name.to_owned();
1152+
11371153
// The order in which the task destroys objects is important: the server
11381154
// can't be killed until the client has gotten a chance to shut down
11391155
// the VM, and the disks can't be destroyed until the server process has
@@ -1144,6 +1160,49 @@ impl Drop for TestVm {
11441160
// kept alive until the server process is gone.
11451161
let _disks = disks;
11461162

1163+
// Check if we should let the user access the instance of a
1164+
// failed testcase before ensuring its demolition
1165+
if let Some((mut success_rx, sigint_rx)) = success_sigint_rx_opt {
1166+
while success_rx.changed().await.is_ok() {
1167+
match *success_rx.borrow() {
1168+
None => continue,
1169+
Some(true) => break,
1170+
Some(false) => {}
1171+
}
1172+
let sock = server.server_addr();
1173+
let ip = sock.ip();
1174+
let port = sock.port();
1175+
let mut uninformed = true;
1176+
// States that might be worth inspecting out-of-band
1177+
while let Ok(InstanceState::Running
1178+
| InstanceState::Migrating
1179+
| InstanceState::Rebooting
1180+
| InstanceState::Repairing
1181+
) = client
1182+
.instance_get()
1183+
.send()
1184+
.await
1185+
.map(|inst| inst.instance.state)
1186+
{
1187+
if *sigint_rx.borrow() { break }
1188+
if uninformed {
1189+
// write to stderr irrespective of log level
1190+
eprintln!(r#"
1191+
propolis-server {name:?} left running at address {sock}
1192+
phd-runner will resume when this instance is shut down; e.g. by one of:
1193+
1194+
$ propolis-cli -s {ip} -p {port} serial
1195+
localhost:~# poweroff
1196+
1197+
$ propolis-cli -s {ip} -p {port} state stop
1198+
"#);
1199+
uninformed = false;
1200+
}
1201+
tokio::time::sleep(Duration::from_secs(1)).await;
1202+
}
1203+
}
1204+
}
1205+
11471206
// Try to make sure the server's kernel VMM is cleaned up before
11481207
// killing the server process. This is best-effort; if it fails,
11491208
// the kernel VMM is leaked. This generally indicates a bug in

phd-tests/runner/src/config.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,16 @@ pub struct RunOptions {
202202
// number of seconds, but i'm lazy...
203203
#[clap(long, value_parser, default_value_t = 60 * 20)]
204204
pub max_buildomat_wait_secs: u64,
205+
206+
/// When a testcase fails while this is enabled, any instances started by
207+
/// the failed test are left running and phd-runner waits until they are
208+
/// manually shut down out-of-band by the operator.
209+
///
210+
/// This feature is intended to give the operator a chance to inspect the
211+
/// state of the guest(s) easily without necessarily having to reconstruct
212+
/// the scenario by hand.
213+
#[clap(long, value_parser)]
214+
pub manual_stop_on_failure: bool,
205215
}
206216

207217
#[derive(Args, Debug)]

phd-tests/runner/src/execute.rs

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,8 +101,18 @@ pub async fn run_tests_with_ctx(
101101
let framework = ctx.clone();
102102
let tc = execution.tc;
103103
let mut sigint_rx_task = sigint_rx.clone();
104+
let mut test_ctx = framework.test_ctx(tc.fully_qualified_name());
105+
let mut success_tx = None;
106+
if run_opts.manual_stop_on_failure {
107+
let sigint_rx_cleanup = sigint_rx.clone();
108+
let (tx, success_rx) = watch::channel(None);
109+
test_ctx.set_cleanup_task_outcome_receiver(
110+
success_rx,
111+
sigint_rx_cleanup,
112+
);
113+
success_tx = Some(tx);
114+
}
104115
let test_outcome = tokio::spawn(async move {
105-
let test_ctx = framework.test_ctx(tc.fully_qualified_name());
106116
tokio::select! {
107117
// Ensure interrupt signals are always handled instead of
108118
// continuing to run the test.
@@ -144,6 +154,13 @@ pub async fn run_tests_with_ctx(
144154
}
145155
);
146156

157+
if let Some(tx) = success_tx {
158+
let succeeded = !matches!(&test_outcome, TestOutcome::Failed(_));
159+
if let Err(e) = tx.send(Some(succeeded)) {
160+
error!("Error sending outcome to instance cleanup tasks: {e}");
161+
}
162+
}
163+
147164
match test_outcome {
148165
TestOutcome::Passed => stats.tests_passed += 1,
149166
TestOutcome::Failed(_) => {

0 commit comments

Comments
 (0)