Skip to content

Commit 46deadc

Browse files
committed
ephemeral: Fix run-ssh error handling
If we fail to launch, because we used `--rm` the container would exit and we couldn't get its logs. Change to removing the container image via a drop handler, so if the system fails to launch (e.g. it's missing a kernel or bwrap) we can get the logs from that. Signed-off-by: Colin Walters <walters@verbum.org>
1 parent 48f2cf0 commit 46deadc

4 files changed

Lines changed: 285 additions & 31 deletions

File tree

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Test fixture: Bootc image with kernel removed
2+
# This simulates a broken/malformed bootc image that will fail during ephemeral VM startup
3+
# Used to test error handling and cleanup in ephemeral run-ssh command
4+
5+
ARG BASE_IMAGE=quay.io/centos-bootc/centos-bootc:stream10
6+
7+
FROM ${BASE_IMAGE}
8+
9+
# Remove kernel and modules to simulate a broken bootc image
10+
RUN rm -rf /usr/lib/modules

crates/integration-tests/src/tests/run_ephemeral_ssh.rs

Lines changed: 125 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,71 @@ use color_eyre::Result;
1818
use integration_tests::{integration_test, parameterized_integration_test};
1919

2020
use std::process::Command;
21-
use std::thread;
22-
use std::time::Duration;
21+
use std::time::{Duration, Instant};
2322

2423
use crate::{get_test_image, run_bcvk, INTEGRATION_TEST_LABEL};
2524

25+
/// Poll until a container is removed or timeout is reached
26+
///
27+
/// Returns Ok(()) if container is removed within timeout, Err otherwise.
28+
/// Timeout is set to 60 seconds to account for slow CI runners.
29+
fn wait_for_container_removal(container_name: &str) -> Result<()> {
30+
let timeout = Duration::from_secs(60);
31+
let start = Instant::now();
32+
let poll_interval = Duration::from_millis(100);
33+
34+
loop {
35+
let output = Command::new("podman")
36+
.args(["ps", "-a", "--format", "{{.Names}}"])
37+
.output()
38+
.expect("Failed to list containers");
39+
40+
let containers = String::from_utf8_lossy(&output.stdout);
41+
if !containers.lines().any(|line| line == container_name) {
42+
return Ok(());
43+
}
44+
45+
if start.elapsed() >= timeout {
46+
return Err(color_eyre::eyre::eyre!(
47+
"Timeout waiting for container {} to be removed. Active containers: {}",
48+
container_name,
49+
containers
50+
));
51+
}
52+
53+
std::thread::sleep(poll_interval);
54+
}
55+
}
56+
57+
/// Build a test fixture image with the kernel removed
58+
fn build_broken_image() -> Result<String> {
59+
let fixture_path = concat!(env!("CARGO_MANIFEST_DIR"), "/fixtures/Dockerfile.no-kernel");
60+
let image_name = format!("localhost/bcvk-test-no-kernel:{}", std::process::id());
61+
62+
let output = Command::new("podman")
63+
.args([
64+
"build",
65+
"-f",
66+
fixture_path,
67+
"-t",
68+
&image_name,
69+
"--build-arg",
70+
&format!("BASE_IMAGE={}", get_test_image()),
71+
".",
72+
])
73+
.output()?;
74+
75+
if !output.status.success() {
76+
let stderr = String::from_utf8_lossy(&output.stderr);
77+
return Err(color_eyre::eyre::eyre!(
78+
"Failed to build broken test image: {}",
79+
stderr
80+
));
81+
}
82+
83+
Ok(image_name)
84+
}
85+
2686
/// Test running a non-interactive command via SSH
2787
fn test_run_ephemeral_ssh_command() -> Result<()> {
2888
let output = run_bcvk(&[
@@ -66,20 +126,9 @@ fn test_run_ephemeral_ssh_cleanup() -> Result<()> {
66126

67127
output.assert_success("ephemeral run-ssh");
68128

69-
thread::sleep(Duration::from_secs(1));
70-
71-
let check_output = Command::new("podman")
72-
.args(["ps", "-a", "--format", "{{.Names}}"])
73-
.output()
74-
.expect("Failed to list containers");
129+
// Poll for container removal with timeout
130+
wait_for_container_removal(&container_name)?;
75131

76-
let containers = String::from_utf8_lossy(&check_output.stdout);
77-
assert!(
78-
!containers.contains(&container_name),
79-
"Container {} was not cleaned up after SSH exit. Active containers: {}",
80-
container_name,
81-
containers
82-
);
83132
Ok(())
84133
}
85134
integration_test!(test_run_ephemeral_ssh_cleanup);
@@ -248,3 +297,64 @@ echo "All checks passed!"
248297
Ok(())
249298
}
250299
integration_test!(test_run_tmpfs);
300+
301+
/// Test that containers are properly cleaned up even when the image is broken
302+
///
303+
/// This test verifies that the drop handler for ContainerCleanup works correctly
304+
/// when ephemeral run-ssh fails early due to a broken image (missing kernel).
305+
/// Previously this would fail with "setns `mnt`: Bad file descriptor" when using
306+
/// podman's --rm flag. Now it should fail cleanly and remove the container.
307+
fn test_run_ephemeral_ssh_broken_image_cleanup() -> Result<()> {
308+
// Build a broken test image (bootc image with kernel removed)
309+
eprintln!("Building broken test image...");
310+
let broken_image = build_broken_image()?;
311+
eprintln!("Built broken image: {}", broken_image);
312+
313+
let container_name = format!("test-broken-cleanup-{}", std::process::id());
314+
315+
// Try to run ephemeral SSH with the broken image - this should fail
316+
let output = run_bcvk(&[
317+
"ephemeral",
318+
"run-ssh",
319+
"--name",
320+
&container_name,
321+
"--label",
322+
INTEGRATION_TEST_LABEL,
323+
&broken_image,
324+
"--",
325+
"echo",
326+
"should not reach here",
327+
])?;
328+
329+
// The command should fail (no kernel found)
330+
assert!(
331+
!output.success(),
332+
"Expected ephemeral run-ssh to fail with broken image, but it succeeded"
333+
);
334+
335+
// Verify the error message indicates the problem
336+
assert!(
337+
output
338+
.stderr
339+
.contains("Failed to read kernel modules directory")
340+
|| output
341+
.stderr
342+
.contains("Container exited before SSH became available")
343+
|| output
344+
.stderr
345+
.contains("Monitor process exited unexpectedly"),
346+
"Expected error about missing kernel or container failure, got: {}",
347+
output.stderr
348+
);
349+
350+
// Poll for container removal with timeout
351+
wait_for_container_removal(&container_name)?;
352+
353+
// Clean up the test image
354+
let _ = Command::new("podman")
355+
.args(["rmi", "-f", &broken_image])
356+
.output();
357+
358+
Ok(())
359+
}
360+
integration_test!(test_run_ephemeral_ssh_broken_image_cleanup);

crates/kit/src/run_ephemeral.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -777,7 +777,10 @@ pub(crate) async fn run_impl(opts: RunEphemeralOpts) -> Result<()> {
777777
let mut vmlinuz_path: Option<Utf8PathBuf> = None;
778778
let mut initramfs_path: Option<Utf8PathBuf> = None;
779779

780-
for entry in fs::read_dir(modules_dir)? {
780+
let entries = fs::read_dir(modules_dir)
781+
.with_context(|| format!("Failed to read kernel modules directory at {}. This container image may not be a valid bootc image.", modules_dir))?;
782+
783+
for entry in entries {
781784
let entry = entry?;
782785
let path = Utf8PathBuf::from_path_buf(entry.path())
783786
.map_err(|p| eyre!("Path is not valid UTF-8: {}", p.display()))?;

0 commit comments

Comments
 (0)