From 1d364a4c1d491d0eaa4dd6a3b5a5bece0e31ed5a Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 5 May 2026 12:56:11 -0700 Subject: [PATCH 1/6] Decouple support bundle inner layer from Nexus management Replaces `BundleCollection.bundle: SupportBundle` with a slim `BundleInfo { id, reason_for_creation }`. Moves the sled-storage chunked transfer (`store_bundle_on_sled`), zip helpers (`bundle_to_zipfile`, `recursively_add_directory_to_zipfile`, `sha2_hash`), the `CHUNK_SIZE` and `TEMPDIR` constants, and the DB-polling cancellation (`check_for_cancellation`) out of the inner `support_bundle/` module and into `support_bundle_collector.rs`. After this change the inner layer is a pure mechanism: it never reads the `support_bundle` DB row, never talks to a sled-agent's bundle storage endpoints, and treats CRDB only as a source of facts about sleds, ereports, and blueprints. The outer collector remains the manager of the bundle lifecycle. This is the first step toward a future shared crate that omdb can use to collect bundles when Nexus is down. --- .../tasks/support_bundle/collection.rs | 420 ++---------------- .../tasks/support_bundle_collector.rs | 350 ++++++++++++++- 2 files changed, 391 insertions(+), 379 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle/collection.rs b/nexus/src/app/background/tasks/support_bundle/collection.rs index 08c9df8dbde..2326f05e677 100644 --- a/nexus/src/app/background/tasks/support_bundle/collection.rs +++ b/nexus/src/app/background/tasks/support_bundle/collection.rs @@ -2,9 +2,16 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! The entrypoint to all support bundle collection. +//! The mechanism layer of support bundle collection. //! -//! These are the primitives used to look up everything else within the bundle. +//! Given a datastore handle, internal-DNS resolver, [`OpContext`], a +//! [`BundleDataSelection`], a synthesized [`BundleInfo`] (id + +//! reason_for_creation), and an output directory, gather data into the +//! directory and produce a [`SupportBundleCollectionReport`]. +//! +//! This layer never reads the `support_bundle` table, never transfers +//! data to a sled-agent's bundle storage endpoints, and never polls +//! bundle state. Those responsibilities belong to the caller. use crate::app::background::tasks::support_bundle::cache::Cache; use crate::app::background::tasks::support_bundle::perfetto; @@ -13,44 +20,26 @@ use crate::app::background::tasks::support_bundle::steps; use nexus_types::support_bundle::BundleDataSelection; use anyhow::Context; -use camino::Utf8DirEntry; -use camino::Utf8Path; use camino_tempfile::Utf8TempDir; -use camino_tempfile::tempdir_in; -use camino_tempfile::tempfile_in; use internal_dns_resolver::Resolver; -use nexus_db_model::SupportBundle; -use nexus_db_model::SupportBundleState; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::internal_api::background::SupportBundleCollectionReport; -use omicron_common::api::external::Error; -use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::SupportBundleUuid; -use omicron_uuid_kinds::ZpoolUuid; use parallel_task_set::ParallelTaskSet; use serde_json::json; -use sha2::Digest; -use sha2::Sha256; -use slog_error_chain::InlineErrorChain; -use std::io::Write; -use std::num::NonZeroU64; use std::sync::Arc; -use tokio::io::AsyncReadExt; -use tokio::io::AsyncSeekExt; -use tokio::io::SeekFrom; use tokio_util::sync::CancellationToken; -use tufaceous_artifact::ArtifactHash; -use zip::ZipWriter; -use zip::write::FullFileOptions; - -/// We use "/var/tmp" to use Nexus' filesystem for temporary storage, -/// rather than "/tmp", which would keep this collected data in-memory. -pub const TEMPDIR: &str = "/var/tmp"; -/// The size of piece of a support bundle to transfer to the sled agent -/// within a single streaming request. -pub const CHUNK_SIZE: NonZeroU64 = NonZeroU64::new(1024 * 1024 * 1024).unwrap(); +/// Minimal bundle metadata needed by the mechanism layer. +/// +/// Built by the caller — either drawn from an existing `support_bundle` +/// DB row or synthesized fresh. +#[derive(Clone, Debug)] +pub struct BundleInfo { + pub id: SupportBundleUuid, + pub reason_for_creation: String, +} /// Wraps up all arguments to perform a single support bundle collection pub struct BundleCollection { @@ -59,8 +48,7 @@ pub struct BundleCollection { log: slog::Logger, opctx: OpContext, data_selection: BundleDataSelection, - bundle: SupportBundle, - transfer_chunk_size: NonZeroU64, + bundle: BundleInfo, cancellation_token: CancellationToken, } @@ -71,8 +59,7 @@ impl BundleCollection { log: slog::Logger, opctx: OpContext, data_selection: BundleDataSelection, - bundle: SupportBundle, - transfer_chunk_size: NonZeroU64, + bundle: BundleInfo, ) -> Self { Self { datastore, @@ -81,7 +68,6 @@ impl BundleCollection { opctx, data_selection, bundle, - transfer_chunk_size, cancellation_token: CancellationToken::new(), } } @@ -106,7 +92,7 @@ impl BundleCollection { &self.data_selection } - pub fn bundle(&self) -> &SupportBundle { + pub fn bundle(&self) -> &BundleInfo { &self.bundle } @@ -120,8 +106,8 @@ impl BundleCollection { /// Returns a reference to the cancellation token. /// - /// Pass to helper functions that need to `select!` on cancellation - /// independently (e.g., futures inside `FuturesUnordered`). + /// Cancellation is driven by the caller; this type only observes + /// the token. pub fn cancellation_token(&self) -> &CancellationToken { &self.cancellation_token } @@ -134,79 +120,24 @@ impl BundleCollection { self.cancellation_token.cancelled().await } - /// Collect the bundle within Nexus, and store it on a target sled. - pub async fn collect_bundle_and_store_on_sled( - self: &Arc, - ) -> anyhow::Result { - // Create a temporary directory where we'll store the support bundle - // as it's being collected. - let dir = tempdir_in(TEMPDIR)?; - - let report = self.collect_bundle_locally(&dir).await?; - self.store_bundle_on_sled(dir).await?; - Ok(report) - } - - // Create the support bundle, placing the contents into a user-specified - // directory. - // - // Does not attempt to convert the contents into a zipfile, nor send them - // to any durable storage. - async fn collect_bundle_locally( + /// Collect the bundle into the supplied temporary directory. + /// + /// The caller drives cancellation via `cancellation_token()`; this + /// function drains in-flight work before returning. The caller is + /// responsible for whatever happens to the directory afterward + /// (zipping, transferring to durable storage, writing to stdout, + /// etc.). + pub async fn collect_bundle_locally( self: &Arc, dir: &Utf8TempDir, ) -> anyhow::Result { - // Spawn a background task that periodically checks whether this - // bundle should still be collected. If not, it cancels the - // `CancellationToken`. - // - // Cancellation is hybrid: - // - // - Cancel-safe operations (HTTP requests, DB queries) use - // `tokio::select!` with the token for immediate cancellation. - // Dropping these futures is safe — no local side effects. - // - // - Cancel-unsafe operations (filesystem writes via `tokio::fs`, - // `spawn_blocking`) use cooperative `is_cancelled()` checks. - // These are never dropped mid-flight, ensuring all - // `spawn_blocking` work completes before the TempDir is dropped. - // - // `run_collect_bundle_steps` checks the token before spawning new - // steps and drains all in-flight tasks before returning. - // - // Previous iterations used a top-level `tokio::select!` to race - // cancellation against the entire collection. This had two problems: - // - // 1. Dropping the collection future left `spawn_blocking` file - // writes in-flight, racing with TempDir cleanup. - // See: https://github.com/oxidecomputer/omicron/issues/10198 - // - // 2. Earlier versions of the `select!` did async DB work in the - // cancellation branch body while the collection branch held a - // connection pool claim, causing deadlocks. - // See: https://github.com/oxidecomputer/omicron/issues/9259 - // - // The current design avoids both: cancel-unsafe futures are never - // dropped, and the DB check runs in a separate spawned task. - let cancel_task = tokio::spawn({ - let this = Arc::clone(self); - async move { this.check_for_cancellation().await } - }); - - // Run the collection. It checks self.cancelled and drains all - // in-flight work before returning. let report = self.collect_bundle_as_file(dir).await; - // Collection is done — stop the cancellation checker. - cancel_task.abort(); - let _ = cancel_task.await; - if self.is_cancelled() { warn!( &self.log, "Support Bundle cancelled - stopping collection"; "bundle" => %self.bundle.id, - "state" => ?self.bundle.state ); return Err(anyhow::anyhow!("Support Bundle Cancelled")); } @@ -219,180 +150,28 @@ impl BundleCollection { report } - async fn store_bundle_on_sled( - &self, - dir: Utf8TempDir, - ) -> anyhow::Result<()> { - // Create the zipfile as a temporary file - let mut zipfile = tokio::fs::File::from_std(bundle_to_zipfile(&dir)?); - let total_len = zipfile.metadata().await?.len(); - - // Collect the hash locally before we send it over the network - // - // We'll use this later during finalization to confirm the bundle - // has been stored successfully. - zipfile.seek(SeekFrom::Start(0)).await?; - let hash = sha2_hash(&mut zipfile).await?; - - // Find the sled where we're storing this bundle. - let sled_id = self - .datastore - .zpool_get_sled_if_in_service( - &self.opctx, - self.bundle.zpool_id.into(), - ) - .await?; - let sled_client = nexus_networking::sled_client( - &self.datastore, - &self.opctx, - sled_id, - &self.log, - ) - .await?; - - let zpool = ZpoolUuid::from(self.bundle.zpool_id); - let dataset = DatasetUuid::from(self.bundle.dataset_id); - let support_bundle = SupportBundleUuid::from(self.bundle.id); - - // Tell this sled to create the bundle. - let creation_result = sled_client - .support_bundle_start_creation(&zpool, &dataset, &support_bundle) - .await - .with_context(|| "Support bundle failed to start creation")?; - - if matches!( - creation_result.state, - sled_agent_client::types::SupportBundleState::Complete - ) { - // Early exit case: the bundle was already created -- we must have either - // crashed or failed between "finalizing" and "writing to the database that we - // finished". - info!(&self.log, "Support bundle was already collected"; "bundle" => %self.bundle.id); - return Ok(()); - } - info!(&self.log, "Support bundle creation started"; "bundle" => %self.bundle.id); - - let mut offset = 0; - while offset < total_len { - // Stream the zipfile to the sled where it should be kept - let mut file = zipfile - .try_clone() - .await - .with_context(|| "Failed to clone zipfile")?; - file.seek(SeekFrom::Start(offset)).await.with_context(|| { - format!("Failed to seek to offset {offset} / {total_len} within zipfile") - })?; - - // Only stream at most "transfer_chunk_size" bytes at once - let chunk_size = std::cmp::min( - self.transfer_chunk_size.get(), - total_len - offset, - ); - - let limited_file = file.take(chunk_size); - let stream = tokio_util::io::ReaderStream::new(limited_file); - let body = reqwest::Body::wrap_stream(stream); - - info!( - &self.log, - "Streaming bundle chunk"; - "bundle" => %self.bundle.id, - "offset" => offset, - "length" => chunk_size, - ); - - sled_client.support_bundle_transfer( - &zpool, &dataset, &support_bundle, offset, body - ).await.with_context(|| { - format!("Failed to transfer bundle: {chunk_size}@{offset} of {total_len} to sled") - })?; - - offset += chunk_size; - } - - sled_client - .support_bundle_finalize( - &zpool, - &dataset, - &support_bundle, - &hash.to_string(), - ) - .await - .with_context(|| "Failed to finalize bundle")?; - - // Returning from this method should drop all temporary storage - // allocated locally for this support bundle. - Ok(()) - } - - // Indefinitely perform periodic checks about whether or not we should - // cancel the bundle. + // Cancellation is hybrid: // - // Cancels `self.cancellation_token` and returns if: - // - The bundle state is no longer SupportBundleState::Collecting - // (which happens if the bundle has been explicitly cancelled, or - // if the backing storage has been expunged). - // - The bundle has been deleted + // - Cancel-safe operations (HTTP requests, DB queries) use + // `tokio::select!` with the token for immediate cancellation. + // Dropping these futures is safe — no local side effects. // - // Otherwise, keeps checking indefinitely while polled. - async fn check_for_cancellation(&self) { - let work_duration = tokio::time::Duration::from_secs(5); - let mut yield_interval = tokio::time::interval_at( - tokio::time::Instant::now() + work_duration, - work_duration, - ); - - loop { - // Timer fired mid-collection - check if we should stop. - yield_interval.tick().await; - trace!( - self.log, - "Checking if Bundle Collection cancelled"; - "bundle" => %self.bundle.id - ); - - match self - .datastore - .support_bundle_get(&self.opctx, self.bundle.id.into()) - .await - { - Ok(SupportBundle { - state: SupportBundleState::Collecting, - .. - }) => { - // Bundle still collecting; continue... - continue; - } - Ok(_) => { - // Not collecting, for any reason: Time to exit - self.cancellation_token.cancel(); - return; - } - Err(Error::ObjectNotFound { .. } | Error::NotFound { .. }) => { - self.cancellation_token.cancel(); - return; - } - Err(err) => { - warn!( - self.log, - "Database error checking bundle cancellation"; - InlineErrorChain::new(&err) - ); - - // If we cannot contact the database, retry later - continue; - } - } - } - } - + // - Cancel-unsafe operations (filesystem writes via `tokio::fs`, + // `spawn_blocking`) use cooperative `is_cancelled()` checks. + // These are never dropped mid-flight, ensuring all + // `spawn_blocking` work completes before the output directory + // is dropped. + // + // This loop checks the token before spawning new steps and drains + // all in-flight tasks before returning. Failing to do so previously + // raced `spawn_blocking` writes against `TempDir` cleanup; see + // https://github.com/oxidecomputer/omicron/issues/10198. async fn run_collect_bundle_steps( self: &Arc, output: &Utf8TempDir, mut steps: Vec, ) -> SupportBundleCollectionReport { - let mut report = - SupportBundleCollectionReport::new(self.bundle.id.into()); + let mut report = SupportBundleCollectionReport::new(self.bundle.id); const MAX_CONCURRENT_STEPS: usize = 16; let mut tasks = @@ -569,110 +348,3 @@ impl BundleCollection { Ok(self.run_collect_bundle_steps(dir, steps).await) } } - -// Takes a directory "dir", and zips the contents into a single zipfile. -fn bundle_to_zipfile(dir: &Utf8TempDir) -> anyhow::Result { - let tempfile = tempfile_in(TEMPDIR)?; - let mut zip = ZipWriter::new(tempfile); - - recursively_add_directory_to_zipfile(&mut zip, dir.path(), dir.path())?; - - Ok(zip.finish()?) -} - -fn recursively_add_directory_to_zipfile( - zip: &mut ZipWriter, - root_path: &Utf8Path, - dir_path: &Utf8Path, -) -> anyhow::Result<()> { - // Readdir might return entries in a non-deterministic order. - // Let's sort it for the zipfile, to be nice. - let mut entries = dir_path - .read_dir_utf8()? - .filter_map(Result::ok) - .collect::>(); - entries.sort_by(|a, b| a.file_name().cmp(&b.file_name())); - - for entry in &entries { - // Remove the "/tmp/..." prefix from the path when we're storing it in the - // zipfile. - let dst = entry.path().strip_prefix(root_path)?; - - let file_type = entry.file_type()?; - if file_type.is_file() { - let src = entry.path(); - - let zip_time = entry - .path() - .metadata() - .and_then(|m| m.modified()) - .ok() - .and_then(|sys_time| jiff::Zoned::try_from(sys_time).ok()) - .and_then(|zoned| { - zip::DateTime::try_from(zoned.datetime()).ok() - }) - .unwrap_or_else(zip::DateTime::default); - - let opts = FullFileOptions::default() - .last_modified_time(zip_time) - .compression_method(zip::CompressionMethod::Deflated) - .large_file(true); - - zip.start_file_from_path(dst, opts)?; - let mut file = std::fs::File::open(&src)?; - std::io::copy(&mut file, zip)?; - } - if file_type.is_dir() { - let opts = FullFileOptions::default(); - zip.add_directory_from_path(dst, opts)?; - recursively_add_directory_to_zipfile(zip, root_path, entry.path())?; - } - } - Ok(()) -} - -async fn sha2_hash(file: &mut tokio::fs::File) -> anyhow::Result { - let mut buf = vec![0u8; 65536]; - let mut ctx = Sha256::new(); - loop { - let n = file.read(&mut buf).await?; - if n == 0 { - break; - } - ctx.write_all(&buf[0..n])?; - } - - let digest = ctx.finalize(); - Ok(ArtifactHash(digest.as_slice().try_into()?)) -} - -#[cfg(test)] -mod test { - use super::*; - - use camino_tempfile::tempdir; - - // Ensure that we can convert a temporary directory into a zipfile - #[test] - fn test_zipfile_creation() { - let dir = tempdir().unwrap(); - - std::fs::create_dir_all(dir.path().join("dir-a")).unwrap(); - std::fs::create_dir_all(dir.path().join("dir-b")).unwrap(); - std::fs::write(dir.path().join("dir-a").join("file-a"), "some data") - .unwrap(); - std::fs::write(dir.path().join("file-b"), "more data").unwrap(); - - let zipfile = bundle_to_zipfile(&dir) - .expect("Should have been able to bundle zipfile"); - let archive = zip::read::ZipArchive::new(zipfile).unwrap(); - - // We expect the order to be deterministically alphabetical - let mut names = archive.file_names(); - assert_eq!(names.next(), Some("dir-a/")); - assert_eq!(names.next(), Some("dir-a/file-a")); - assert_eq!(names.next(), Some("dir-b/")); - assert_eq!(names.next(), Some("file-b")); - assert_eq!(names.next(), None); - } -} diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index d9091f1cc98..2f3aeadbb31 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -6,6 +6,11 @@ use crate::app::background::BackgroundTask; use anyhow::Context; +use camino::Utf8DirEntry; +use camino::Utf8Path; +use camino_tempfile::Utf8TempDir; +use camino_tempfile::tempdir_in; +use camino_tempfile::tempfile_in; use futures::FutureExt; use futures::future::BoxFuture; use internal_dns_resolver::Resolver; @@ -26,13 +31,31 @@ use omicron_uuid_kinds::SledUuid; use omicron_uuid_kinds::SupportBundleUuid; use omicron_uuid_kinds::ZpoolUuid; use serde_json::json; +use sha2::Digest; +use sha2::Sha256; use sled_agent_types::support_bundle::NESTED_DATASET_NOT_FOUND; use slog_error_chain::InlineErrorChain; +use std::io::Write; use std::num::NonZeroU64; use std::sync::Arc; +use tokio::io::AsyncReadExt; +use tokio::io::AsyncSeekExt; +use tokio::io::SeekFrom; +use tokio_util::sync::CancellationToken; +use tufaceous_artifact::ArtifactHash; +use zip::ZipWriter; +use zip::write::FullFileOptions; use super::support_bundle::collection::BundleCollection; -use super::support_bundle::collection::CHUNK_SIZE; +use super::support_bundle::collection::BundleInfo; + +/// We use "/var/tmp" to use Nexus' filesystem for temporary storage, +/// rather than "/tmp", which would keep this collected data in-memory. +pub const TEMPDIR: &str = "/var/tmp"; + +/// The size of piece of a support bundle to transfer to the sled agent +/// within a single streaming request. +pub const CHUNK_SIZE: NonZeroU64 = NonZeroU64::new(1024 * 1024 * 1024).unwrap(); fn authz_support_bundle_from_id(id: SupportBundleUuid) -> authz::SupportBundle { authz::SupportBundle::new(authz::FLEET, id, LookupType::by_id(id)) @@ -370,17 +393,59 @@ impl SupportBundleCollector { .await .context("failed to query bundle data selection")?; + let bundle_log = + opctx.log.new(slog::o!("bundle" => bundle.id.to_string())); let collection = Arc::new(BundleCollection::new( self.datastore.clone(), self.resolver.clone(), - opctx.log.new(slog::o!("bundle" => bundle.id.to_string())), + bundle_log.clone(), opctx.child(std::collections::BTreeMap::new()), data_selection, - bundle.clone(), - self.transfer_chunk_size, + BundleInfo { + id: bundle.id.into(), + reason_for_creation: bundle.reason_for_creation.clone(), + }, )); - let mut report = collection.collect_bundle_and_store_on_sled().await?; + // Spawn a task that periodically polls the `support_bundle` row's + // state. If the bundle has been cancelled (or removed), it cancels + // the inner `CancellationToken`, prompting `collect_bundle_locally` + // to drain in-flight work and return. + // + // This is a separate task rather than a top-level + // `tokio::select!`: previous designs raced collection against an + // async DB check in the cancellation branch body, deadlocking + // when the collection branch held a connection pool claim. See + // https://github.com/oxidecomputer/omicron/issues/9259. + let cancel_task = tokio::spawn({ + let datastore = self.datastore.clone(); + let cancel_opctx = opctx.child(std::collections::BTreeMap::new()); + let token = collection.cancellation_token().clone(); + let log = bundle_log.clone(); + let bundle_id = bundle.id; + async move { + check_for_cancellation( + datastore, + cancel_opctx, + token, + log, + bundle_id.into(), + ) + .await + } + }); + + // Create a temporary directory where we'll store the support bundle + // as it's being collected. + let dir = tempdir_in(TEMPDIR)?; + let collect_result = collection.collect_bundle_locally(&dir).await; + + // Collection is done — stop the cancellation checker. + cancel_task.abort(); + let _ = cancel_task.await; + + let mut report = collect_result?; + self.store_bundle_on_sled(&bundle_log, opctx, &bundle, dir).await?; if let Err(err) = self .datastore .support_bundle_update( @@ -411,6 +476,256 @@ impl SupportBundleCollector { report.activated_in_db_ok = true; Ok(Some(report)) } + + // Zip the collected bundle directory and stream it to the sled-agent + // that owns the target zpool/dataset for durable storage. + async fn store_bundle_on_sled( + &self, + log: &slog::Logger, + opctx: &OpContext, + bundle: &SupportBundle, + dir: Utf8TempDir, + ) -> anyhow::Result<()> { + // Create the zipfile as a temporary file + let mut zipfile = tokio::fs::File::from_std(bundle_to_zipfile( + &dir, + Utf8Path::new(TEMPDIR), + )?); + let total_len = zipfile.metadata().await?.len(); + + // Collect the hash locally before we send it over the network + // + // We'll use this later during finalization to confirm the bundle + // has been stored successfully. + zipfile.seek(SeekFrom::Start(0)).await?; + let hash = sha2_hash(&mut zipfile).await?; + + // Find the sled where we're storing this bundle. + let sled_id = self + .datastore + .zpool_get_sled_if_in_service(opctx, bundle.zpool_id.into()) + .await?; + let sled_client = + nexus_networking::sled_client(&self.datastore, opctx, sled_id, log) + .await?; + + let zpool = ZpoolUuid::from(bundle.zpool_id); + let dataset = DatasetUuid::from(bundle.dataset_id); + let support_bundle = SupportBundleUuid::from(bundle.id); + + // Tell this sled to create the bundle. + let creation_result = sled_client + .support_bundle_start_creation(&zpool, &dataset, &support_bundle) + .await + .with_context(|| "Support bundle failed to start creation")?; + + if matches!( + creation_result.state, + sled_agent_client::types::SupportBundleState::Complete + ) { + // Early exit case: the bundle was already created -- we must have either + // crashed or failed between "finalizing" and "writing to the database that we + // finished". + info!(log, "Support bundle was already collected"; "bundle" => %bundle.id); + return Ok(()); + } + info!(log, "Support bundle creation started"; "bundle" => %bundle.id); + + let mut offset = 0; + while offset < total_len { + // Stream the zipfile to the sled where it should be kept + let mut file = zipfile + .try_clone() + .await + .with_context(|| "Failed to clone zipfile")?; + file.seek(SeekFrom::Start(offset)).await.with_context(|| { + format!("Failed to seek to offset {offset} / {total_len} within zipfile") + })?; + + // Only stream at most "transfer_chunk_size" bytes at once + let chunk_size = std::cmp::min( + self.transfer_chunk_size.get(), + total_len - offset, + ); + + let limited_file = file.take(chunk_size); + let stream = tokio_util::io::ReaderStream::new(limited_file); + let body = reqwest::Body::wrap_stream(stream); + + info!( + log, + "Streaming bundle chunk"; + "bundle" => %bundle.id, + "offset" => offset, + "length" => chunk_size, + ); + + sled_client.support_bundle_transfer( + &zpool, &dataset, &support_bundle, offset, body + ).await.with_context(|| { + format!("Failed to transfer bundle: {chunk_size}@{offset} of {total_len} to sled") + })?; + + offset += chunk_size; + } + + sled_client + .support_bundle_finalize( + &zpool, + &dataset, + &support_bundle, + &hash.to_string(), + ) + .await + .with_context(|| "Failed to finalize bundle")?; + + // Returning from this method should drop all temporary storage + // allocated locally for this support bundle. + Ok(()) + } +} + +// Indefinitely poll the database for the bundle's state. Cancels the +// supplied token when: +// +// - The bundle's state is no longer `Collecting` (typically because +// the bundle was explicitly cancelled by a user, or because its +// backing storage was expunged). +// - The bundle row no longer exists. +// +// The mechanism layer is otherwise oblivious to the bundle lifecycle — +// this is the management bridge that turns DB state changes into +// cancellation. +async fn check_for_cancellation( + datastore: Arc, + opctx: OpContext, + cancellation_token: CancellationToken, + log: slog::Logger, + bundle_id: SupportBundleUuid, +) { + let work_duration = tokio::time::Duration::from_secs(5); + let mut yield_interval = tokio::time::interval_at( + tokio::time::Instant::now() + work_duration, + work_duration, + ); + + loop { + // Timer fired mid-collection - check if we should stop. + yield_interval.tick().await; + trace!( + log, + "Checking if Bundle Collection cancelled"; + "bundle" => %bundle_id + ); + + match datastore.support_bundle_get(&opctx, bundle_id).await { + Ok(SupportBundle { + state: SupportBundleState::Collecting, .. + }) => { + // Bundle still collecting; continue... + continue; + } + Ok(_) => { + // Not collecting, for any reason: Time to exit + cancellation_token.cancel(); + return; + } + Err(Error::ObjectNotFound { .. } | Error::NotFound { .. }) => { + cancellation_token.cancel(); + return; + } + Err(err) => { + warn!( + log, + "Database error checking bundle cancellation"; + InlineErrorChain::new(&err) + ); + + // If we cannot contact the database, retry later + continue; + } + } + } +} + +// Takes a directory "dir", and zips the contents into a single zipfile +// stored as a tempfile under `tempdir`. +fn bundle_to_zipfile( + dir: &Utf8TempDir, + tempdir: &Utf8Path, +) -> anyhow::Result { + let tempfile = tempfile_in(tempdir)?; + let mut zip = ZipWriter::new(tempfile); + + recursively_add_directory_to_zipfile(&mut zip, dir.path(), dir.path())?; + + Ok(zip.finish()?) +} + +fn recursively_add_directory_to_zipfile( + zip: &mut ZipWriter, + root_path: &Utf8Path, + dir_path: &Utf8Path, +) -> anyhow::Result<()> { + // Readdir might return entries in a non-deterministic order. + // Let's sort it for the zipfile, to be nice. + let mut entries = dir_path + .read_dir_utf8()? + .filter_map(Result::ok) + .collect::>(); + entries.sort_by(|a, b| a.file_name().cmp(&b.file_name())); + + for entry in &entries { + // Remove the "/tmp/..." prefix from the path when we're storing it in the + // zipfile. + let dst = entry.path().strip_prefix(root_path)?; + + let file_type = entry.file_type()?; + if file_type.is_file() { + let src = entry.path(); + + let zip_time = entry + .path() + .metadata() + .and_then(|m| m.modified()) + .ok() + .and_then(|sys_time| jiff::Zoned::try_from(sys_time).ok()) + .and_then(|zoned| { + zip::DateTime::try_from(zoned.datetime()).ok() + }) + .unwrap_or_else(zip::DateTime::default); + + let opts = FullFileOptions::default() + .last_modified_time(zip_time) + .compression_method(zip::CompressionMethod::Deflated) + .large_file(true); + + zip.start_file_from_path(dst, opts)?; + let mut file = std::fs::File::open(&src)?; + std::io::copy(&mut file, zip)?; + } + if file_type.is_dir() { + let opts = FullFileOptions::default(); + zip.add_directory_from_path(dst, opts)?; + recursively_add_directory_to_zipfile(zip, root_path, entry.path())?; + } + } + Ok(()) +} + +async fn sha2_hash(file: &mut tokio::fs::File) -> anyhow::Result { + let mut buf = vec![0u8; 65536]; + let mut ctx = Sha256::new(); + loop { + let n = file.read(&mut buf).await?; + if n == 0 { + break; + } + ctx.write_all(&buf[0..n])?; + } + + let digest = ctx.finalize(); + Ok(ArtifactHash(digest.as_slice().try_into()?)) } impl BackgroundTask for SupportBundleCollector { @@ -1839,4 +2154,29 @@ mod test { SupportBundleCollectionStepStatus::Skipped ); } + + // Ensure that we can convert a temporary directory into a zipfile + #[test] + fn test_zipfile_creation() { + let dir = camino_tempfile::tempdir().unwrap(); + let tempdir_for_zip = camino_tempfile::tempdir().unwrap(); + + std::fs::create_dir_all(dir.path().join("dir-a")).unwrap(); + std::fs::create_dir_all(dir.path().join("dir-b")).unwrap(); + std::fs::write(dir.path().join("dir-a").join("file-a"), "some data") + .unwrap(); + std::fs::write(dir.path().join("file-b"), "more data").unwrap(); + + let zipfile = bundle_to_zipfile(&dir, tempdir_for_zip.path()) + .expect("Should have been able to bundle zipfile"); + let archive = zip::read::ZipArchive::new(zipfile).unwrap(); + + // We expect the order to be deterministically alphabetical + let mut names = archive.file_names(); + assert_eq!(names.next(), Some("dir-a/")); + assert_eq!(names.next(), Some("dir-a/file-a")); + assert_eq!(names.next(), Some("dir-b/")); + assert_eq!(names.next(), Some("file-b")); + assert_eq!(names.next(), None); + } } From 863c24b73f2de285ca8c9a6081ceb1d7a3c86a3f Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 5 May 2026 13:08:49 -0700 Subject: [PATCH 2/6] Move support bundle collection into a shared crate Lifts `nexus/src/app/background/tasks/support_bundle/` (the mechanism layer) into a new top-level crate `support-bundle-collection` so that both Nexus and omdb can call it. No logic changes; pure relocation plus import rewriting. --- Cargo.lock | 41 +++++++ Cargo.toml | 3 + dev-tools/ls-apis/tests/api_dependencies.out | 4 +- nexus/Cargo.toml | 1 + nexus/src/app/background/tasks/mod.rs | 1 - .../tasks/support_bundle_collector.rs | 102 +--------------- support-bundle-collection/Cargo.toml | 47 ++++++++ .../README.md | 0 .../build.rs | 13 +- .../src}/cache.rs | 2 +- .../src}/collection.rs | 8 +- support-bundle-collection/src/lib.rs | 26 ++++ .../src}/perfetto.rs | 0 .../src}/step.rs | 2 +- .../src}/steps/ereports.rs | 11 +- .../src}/steps/host_info.rs | 8 +- .../src}/steps/metadata.rs | 4 +- .../src}/steps/mod.rs | 4 +- .../src}/steps/reconfigurator.rs | 4 +- .../src}/steps/sled_cubby.rs | 6 +- .../src}/steps/sp_dumps.rs | 10 +- support-bundle-collection/src/zip.rs | 113 ++++++++++++++++++ 22 files changed, 272 insertions(+), 138 deletions(-) create mode 100644 support-bundle-collection/Cargo.toml rename {nexus/src/app/background/tasks/support_bundle => support-bundle-collection}/README.md (100%) rename nexus/src/app/background/tasks/support_bundle/mod.rs => support-bundle-collection/build.rs (50%) rename {nexus/src/app/background/tasks/support_bundle => support-bundle-collection/src}/cache.rs (97%) rename {nexus/src/app/background/tasks/support_bundle => support-bundle-collection/src}/collection.rs (97%) create mode 100644 support-bundle-collection/src/lib.rs rename {nexus/src/app/background/tasks/support_bundle => support-bundle-collection/src}/perfetto.rs (100%) rename {nexus/src/app/background/tasks/support_bundle => support-bundle-collection/src}/step.rs (98%) rename {nexus/src/app/background/tasks/support_bundle => support-bundle-collection/src}/steps/ereports.rs (96%) rename {nexus/src/app/background/tasks/support_bundle => support-bundle-collection/src}/steps/host_info.rs (97%) rename {nexus/src/app/background/tasks/support_bundle => support-bundle-collection/src}/steps/metadata.rs (86%) rename {nexus/src/app/background/tasks/support_bundle => support-bundle-collection/src}/steps/mod.rs (95%) rename {nexus/src/app/background/tasks/support_bundle => support-bundle-collection/src}/steps/reconfigurator.rs (93%) rename {nexus/src/app/background/tasks/support_bundle => support-bundle-collection/src}/steps/sled_cubby.rs (95%) rename {nexus/src/app/background/tasks/support_bundle => support-bundle-collection/src}/steps/sp_dumps.rs (92%) create mode 100644 support-bundle-collection/src/zip.rs diff --git a/Cargo.lock b/Cargo.lock index f61caae2532..7a133d5af03 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8765,6 +8765,7 @@ dependencies = [ "steno", "strum 0.27.2", "subprocess", + "support-bundle-collection", "swrite", "tempfile", "term 0.7.0", @@ -14496,6 +14497,46 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" +[[package]] +name = "support-bundle-collection" +version = "0.1.0" +dependencies = [ + "anyhow", + "base64 0.22.1", + "camino", + "camino-tempfile", + "chrono", + "dropshot 0.17.0", + "futures", + "gateway-client", + "gateway-types", + "internal-dns-resolver", + "internal-dns-types", + "jiff", + "nexus-db-model", + "nexus-db-queries", + "nexus-networking", + "nexus-reconfigurator-preparation", + "nexus-types", + "omicron-common", + "omicron-rpaths", + "omicron-uuid-kinds", + "omicron-workspace-hack", + "parallel-task-set", + "pq-sys", + "serde", + "serde_json", + "sha2", + "sled-agent-client", + "slog", + "slog-error-chain", + "tokio", + "tokio-util", + "tufaceous-artifact", + "uuid", + "zip 4.6.1", +] + [[package]] name = "support-bundle-viewer" version = "0.1.2" diff --git a/Cargo.toml b/Cargo.toml index 3e5adc43e60..daa23af3dd2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -161,6 +161,7 @@ members = [ "sled-storage", "sled-storage/zfs-test-harness", "sp-sim", + "support-bundle-collection", "test-utils", "trust-quorum", "trust-quorum/gfss", @@ -347,6 +348,7 @@ default-members = [ "sled-storage", "sled-storage/zfs-test-harness", "sp-sim", + "support-bundle-collection", "trust-quorum", "trust-quorum/gfss", "trust-quorum/protocol", @@ -823,6 +825,7 @@ strum = { version = "0.27.2", features = [ "derive" ] } subprocess = "0.2.9" subtle = "2.6.1" supports-color = "3.0.2" +support-bundle-collection = { path = "support-bundle-collection" } support-bundle-viewer = "0.1.2" swrite = "0.1.0" sync-ptr = "0.1.4" diff --git a/dev-tools/ls-apis/tests/api_dependencies.out b/dev-tools/ls-apis/tests/api_dependencies.out index a6cd556f371..a876ebff042 100644 --- a/dev-tools/ls-apis/tests/api_dependencies.out +++ b/dev-tools/ls-apis/tests/api_dependencies.out @@ -49,7 +49,7 @@ Management Gateway Service (client: gateway-client) consumed by: dpd (dendrite/dpd) via 1 path consumed by: lldpd (lldp/lldpd) via 1 path consumed by: mgd (maghemite/mgd) via 1 path - consumed by: omicron-nexus (omicron/nexus) via 5 paths + consumed by: omicron-nexus (omicron/nexus) via 6 paths consumed by: omicron-sled-agent (omicron/sled-agent) via 1 path consumed by: wicketd (omicron/wicketd) via 3 paths @@ -96,7 +96,7 @@ Repo Depot API (client: repo-depot-client) consumed by: omicron-sled-agent (omicron/sled-agent) via 1 path Sled Agent (client: sled-agent-client) - consumed by: omicron-nexus (omicron/nexus) via 8 paths + consumed by: omicron-nexus (omicron/nexus) via 9 paths Wicketd (client: wicketd-client) diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 0a24e5eb3e2..18ffa16c8e4 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -159,6 +159,7 @@ raw-cpuid = { workspace = true, features = ["std"] } rustls = { workspace = true } rustls-pemfile = { workspace = true } scim2-rs.workspace = true +support-bundle-collection.workspace = true update-common.workspace = true update-engine.workspace = true omicron-workspace-hack.workspace = true diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs index fdcb45ef8d0..e4438208813 100644 --- a/nexus/src/app/background/tasks/mod.rs +++ b/nexus/src/app/background/tasks/mod.rs @@ -49,7 +49,6 @@ pub mod region_snapshot_replacement_step; pub mod saga_recovery; pub mod service_firewall_rules; pub mod session_cleanup; -pub mod support_bundle; pub mod support_bundle_collector; pub mod sync_service_zone_nat; pub mod sync_switch_configuration; diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 2f3aeadbb31..6f173d1b4a0 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -6,11 +6,9 @@ use crate::app::background::BackgroundTask; use anyhow::Context; -use camino::Utf8DirEntry; use camino::Utf8Path; use camino_tempfile::Utf8TempDir; use camino_tempfile::tempdir_in; -use camino_tempfile::tempfile_in; use futures::FutureExt; use futures::future::BoxFuture; use internal_dns_resolver::Resolver; @@ -38,16 +36,14 @@ use slog_error_chain::InlineErrorChain; use std::io::Write; use std::num::NonZeroU64; use std::sync::Arc; +use support_bundle_collection::BundleCollection; +use support_bundle_collection::BundleInfo; +use support_bundle_collection::zip::bundle_to_zipfile; use tokio::io::AsyncReadExt; use tokio::io::AsyncSeekExt; use tokio::io::SeekFrom; use tokio_util::sync::CancellationToken; use tufaceous_artifact::ArtifactHash; -use zip::ZipWriter; -use zip::write::FullFileOptions; - -use super::support_bundle::collection::BundleCollection; -use super::support_bundle::collection::BundleInfo; /// We use "/var/tmp" to use Nexus' filesystem for temporary storage, /// rather than "/tmp", which would keep this collected data in-memory. @@ -648,71 +644,6 @@ async fn check_for_cancellation( } } -// Takes a directory "dir", and zips the contents into a single zipfile -// stored as a tempfile under `tempdir`. -fn bundle_to_zipfile( - dir: &Utf8TempDir, - tempdir: &Utf8Path, -) -> anyhow::Result { - let tempfile = tempfile_in(tempdir)?; - let mut zip = ZipWriter::new(tempfile); - - recursively_add_directory_to_zipfile(&mut zip, dir.path(), dir.path())?; - - Ok(zip.finish()?) -} - -fn recursively_add_directory_to_zipfile( - zip: &mut ZipWriter, - root_path: &Utf8Path, - dir_path: &Utf8Path, -) -> anyhow::Result<()> { - // Readdir might return entries in a non-deterministic order. - // Let's sort it for the zipfile, to be nice. - let mut entries = dir_path - .read_dir_utf8()? - .filter_map(Result::ok) - .collect::>(); - entries.sort_by(|a, b| a.file_name().cmp(&b.file_name())); - - for entry in &entries { - // Remove the "/tmp/..." prefix from the path when we're storing it in the - // zipfile. - let dst = entry.path().strip_prefix(root_path)?; - - let file_type = entry.file_type()?; - if file_type.is_file() { - let src = entry.path(); - - let zip_time = entry - .path() - .metadata() - .and_then(|m| m.modified()) - .ok() - .and_then(|sys_time| jiff::Zoned::try_from(sys_time).ok()) - .and_then(|zoned| { - zip::DateTime::try_from(zoned.datetime()).ok() - }) - .unwrap_or_else(zip::DateTime::default); - - let opts = FullFileOptions::default() - .last_modified_time(zip_time) - .compression_method(zip::CompressionMethod::Deflated) - .large_file(true); - - zip.start_file_from_path(dst, opts)?; - let mut file = std::fs::File::open(&src)?; - std::io::copy(&mut file, zip)?; - } - if file_type.is_dir() { - let opts = FullFileOptions::default(); - zip.add_directory_from_path(dst, opts)?; - recursively_add_directory_to_zipfile(zip, root_path, entry.path())?; - } - } - Ok(()) -} - async fn sha2_hash(file: &mut tokio::fs::File) -> anyhow::Result { let mut buf = vec![0u8; 65536]; let mut ctx = Sha256::new(); @@ -774,7 +705,6 @@ impl BackgroundTask for SupportBundleCollector { mod test { use super::*; - use crate::app::background::tasks::support_bundle::perfetto; use crate::app::support_bundles::SupportBundleQueryType; use http_body_util::BodyExt; use nexus_db_model::PhysicalDisk; @@ -805,6 +735,7 @@ mod test { }; use sled_agent_types::inventory::ZpoolHealth; use std::num::NonZeroU64; + use support_bundle_collection::perfetto; use uuid::Uuid; type ControlPlaneTestContext = @@ -2154,29 +2085,4 @@ mod test { SupportBundleCollectionStepStatus::Skipped ); } - - // Ensure that we can convert a temporary directory into a zipfile - #[test] - fn test_zipfile_creation() { - let dir = camino_tempfile::tempdir().unwrap(); - let tempdir_for_zip = camino_tempfile::tempdir().unwrap(); - - std::fs::create_dir_all(dir.path().join("dir-a")).unwrap(); - std::fs::create_dir_all(dir.path().join("dir-b")).unwrap(); - std::fs::write(dir.path().join("dir-a").join("file-a"), "some data") - .unwrap(); - std::fs::write(dir.path().join("file-b"), "more data").unwrap(); - - let zipfile = bundle_to_zipfile(&dir, tempdir_for_zip.path()) - .expect("Should have been able to bundle zipfile"); - let archive = zip::read::ZipArchive::new(zipfile).unwrap(); - - // We expect the order to be deterministically alphabetical - let mut names = archive.file_names(); - assert_eq!(names.next(), Some("dir-a/")); - assert_eq!(names.next(), Some("dir-a/file-a")); - assert_eq!(names.next(), Some("dir-b/")); - assert_eq!(names.next(), Some("file-b")); - assert_eq!(names.next(), None); - } } diff --git a/support-bundle-collection/Cargo.toml b/support-bundle-collection/Cargo.toml new file mode 100644 index 00000000000..6b892921db7 --- /dev/null +++ b/support-bundle-collection/Cargo.toml @@ -0,0 +1,47 @@ +[package] +name = "support-bundle-collection" +version = "0.1.0" +edition.workspace = true + +[lints] +workspace = true + +[build-dependencies] +omicron-rpaths.workspace = true + +[dependencies] +anyhow.workspace = true +base64.workspace = true +camino.workspace = true +camino-tempfile.workspace = true +chrono.workspace = true +dropshot.workspace = true +futures.workspace = true +gateway-client.workspace = true +gateway-types.workspace = true +internal-dns-resolver.workspace = true +internal-dns-types.workspace = true +jiff.workspace = true +nexus-db-model.workspace = true +nexus-db-queries.workspace = true +nexus-networking.workspace = true +nexus-reconfigurator-preparation.workspace = true +nexus-types.workspace = true +omicron-common.workspace = true +omicron-uuid-kinds.workspace = true +parallel-task-set.workspace = true +# See omicron-rpaths for more about the "pq-sys" dependency. +pq-sys = "*" +serde.workspace = true +serde_json.workspace = true +sha2.workspace = true +sled-agent-client.workspace = true +slog.workspace = true +slog-error-chain.workspace = true +tokio.workspace = true +tokio-util.workspace = true +tufaceous-artifact.workspace = true +uuid.workspace = true +zip.workspace = true + +omicron-workspace-hack.workspace = true diff --git a/nexus/src/app/background/tasks/support_bundle/README.md b/support-bundle-collection/README.md similarity index 100% rename from nexus/src/app/background/tasks/support_bundle/README.md rename to support-bundle-collection/README.md diff --git a/nexus/src/app/background/tasks/support_bundle/mod.rs b/support-bundle-collection/build.rs similarity index 50% rename from nexus/src/app/background/tasks/support_bundle/mod.rs rename to support-bundle-collection/build.rs index ba8fb9e4cab..1ba9acd41c9 100644 --- a/nexus/src/app/background/tasks/support_bundle/mod.rs +++ b/support-bundle-collection/build.rs @@ -2,10 +2,9 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Support bundle related types and utilities - -mod cache; -pub mod collection; -pub mod perfetto; -mod step; -mod steps; +// See omicron-rpaths for documentation. +// NOTE: This file MUST be kept in sync with the other build.rs files in this +// repository. +fn main() { + omicron_rpaths::configure_default_omicron_rpaths(); +} diff --git a/nexus/src/app/background/tasks/support_bundle/cache.rs b/support-bundle-collection/src/cache.rs similarity index 97% rename from nexus/src/app/background/tasks/support_bundle/cache.rs rename to support-bundle-collection/src/cache.rs index 314345c64b7..60f87919a5d 100644 --- a/nexus/src/app/background/tasks/support_bundle/cache.rs +++ b/support-bundle-collection/src/cache.rs @@ -7,7 +7,7 @@ //! This is used to share data which may be used by multiple //! otherwise independent steps. -use crate::app::background::tasks::support_bundle::collection::BundleCollection; +use crate::collection::BundleCollection; use gateway_client::Client as MgsClient; use internal_dns_types::names::ServiceName; diff --git a/nexus/src/app/background/tasks/support_bundle/collection.rs b/support-bundle-collection/src/collection.rs similarity index 97% rename from nexus/src/app/background/tasks/support_bundle/collection.rs rename to support-bundle-collection/src/collection.rs index 2326f05e677..f3a3fb9e8a7 100644 --- a/nexus/src/app/background/tasks/support_bundle/collection.rs +++ b/support-bundle-collection/src/collection.rs @@ -13,10 +13,10 @@ //! data to a sled-agent's bundle storage endpoints, and never polls //! bundle state. Those responsibilities belong to the caller. -use crate::app::background::tasks::support_bundle::cache::Cache; -use crate::app::background::tasks::support_bundle::perfetto; -use crate::app::background::tasks::support_bundle::step::CollectionStep; -use crate::app::background::tasks::support_bundle::steps; +use crate::cache::Cache; +use crate::perfetto; +use crate::step::CollectionStep; +use crate::steps; use nexus_types::support_bundle::BundleDataSelection; use anyhow::Context; diff --git a/support-bundle-collection/src/lib.rs b/support-bundle-collection/src/lib.rs new file mode 100644 index 00000000000..6a604b5fb21 --- /dev/null +++ b/support-bundle-collection/src/lib.rs @@ -0,0 +1,26 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! The mechanism layer of support bundle collection. +//! +//! This crate provides the data-gathering primitives used to assemble a +//! support bundle. It is consumed by both the Nexus background task +//! (which manages bundle state in the `support_bundle` table) and by +//! `omdb` (which collects bundles ad-hoc, including when Nexus is down). +//! +//! See `README.md` in this crate for a developer-oriented overview of +//! the step framework. + +#[macro_use] +extern crate slog; + +mod cache; +pub mod collection; +pub mod perfetto; +mod step; +mod steps; +pub mod zip; + +pub use collection::BundleCollection; +pub use collection::BundleInfo; diff --git a/nexus/src/app/background/tasks/support_bundle/perfetto.rs b/support-bundle-collection/src/perfetto.rs similarity index 100% rename from nexus/src/app/background/tasks/support_bundle/perfetto.rs rename to support-bundle-collection/src/perfetto.rs diff --git a/nexus/src/app/background/tasks/support_bundle/step.rs b/support-bundle-collection/src/step.rs similarity index 98% rename from nexus/src/app/background/tasks/support_bundle/step.rs rename to support-bundle-collection/src/step.rs index 5909265b976..df222127403 100644 --- a/nexus/src/app/background/tasks/support_bundle/step.rs +++ b/support-bundle-collection/src/step.rs @@ -4,7 +4,7 @@ //! Support bundle collection step execution framework -use crate::app::background::tasks::support_bundle::collection::BundleCollection; +use crate::collection::BundleCollection; use camino::Utf8Path; use chrono::DateTime; diff --git a/nexus/src/app/background/tasks/support_bundle/steps/ereports.rs b/support-bundle-collection/src/steps/ereports.rs similarity index 96% rename from nexus/src/app/background/tasks/support_bundle/steps/ereports.rs rename to support-bundle-collection/src/steps/ereports.rs index ff4a58cbbad..d8f4aa69b4d 100644 --- a/nexus/src/app/background/tasks/support_bundle/steps/ereports.rs +++ b/support-bundle-collection/src/steps/ereports.rs @@ -4,13 +4,14 @@ //! Collect ereports for support bundles -use crate::app::background::tasks::support_bundle::collection::BundleCollection; -use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; +use crate::collection::BundleCollection; +use crate::step::CollectionStepOutput; use nexus_types::fm::ereport::EreportFilters; use anyhow::Context; use camino::Utf8Path; use camino::Utf8PathBuf; +use dropshot::PaginationOrder; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_db_queries::db::datastore; @@ -75,10 +76,8 @@ async fn save_ereports( dir: Utf8PathBuf, status: &mut SupportBundleEreportStatus, ) -> anyhow::Result<()> { - let mut paginator = Paginator::new( - datastore::SQL_BATCH_SIZE, - dropshot::PaginationOrder::Ascending, - ); + let mut paginator = + Paginator::new(datastore::SQL_BATCH_SIZE, PaginationOrder::Ascending); while let Some(p) = paginator.next() { let pagparams = p.current_pagparams(); let ereports = tokio::select! { diff --git a/nexus/src/app/background/tasks/support_bundle/steps/host_info.rs b/support-bundle-collection/src/steps/host_info.rs similarity index 97% rename from nexus/src/app/background/tasks/support_bundle/steps/host_info.rs rename to support-bundle-collection/src/steps/host_info.rs index 4aa0dc7fdb4..41cfadc58ad 100644 --- a/nexus/src/app/background/tasks/support_bundle/steps/host_info.rs +++ b/support-bundle-collection/src/steps/host_info.rs @@ -4,10 +4,10 @@ //! Collect host information from sleds for support bundles -use crate::app::background::tasks::support_bundle::cache::Cache; -use crate::app::background::tasks::support_bundle::collection::BundleCollection; -use crate::app::background::tasks::support_bundle::step::CollectionStep; -use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; +use crate::cache::Cache; +use crate::collection::BundleCollection; +use crate::step::CollectionStep; +use crate::step::CollectionStepOutput; use anyhow::Context; use anyhow::bail; diff --git a/nexus/src/app/background/tasks/support_bundle/steps/metadata.rs b/support-bundle-collection/src/steps/metadata.rs similarity index 86% rename from nexus/src/app/background/tasks/support_bundle/steps/metadata.rs rename to support-bundle-collection/src/steps/metadata.rs index 726e3387d43..fcf7946b1f4 100644 --- a/nexus/src/app/background/tasks/support_bundle/steps/metadata.rs +++ b/support-bundle-collection/src/steps/metadata.rs @@ -4,8 +4,8 @@ //! Collects metadata about the bundle itself -use crate::app::background::tasks::support_bundle::collection::BundleCollection; -use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; +use crate::collection::BundleCollection; +use crate::step::CollectionStepOutput; use camino::Utf8Path; /// Writes the bundle ID to a file diff --git a/nexus/src/app/background/tasks/support_bundle/steps/mod.rs b/support-bundle-collection/src/steps/mod.rs similarity index 95% rename from nexus/src/app/background/tasks/support_bundle/steps/mod.rs rename to support-bundle-collection/src/steps/mod.rs index 93952861b2c..57ddda584ed 100644 --- a/nexus/src/app/background/tasks/support_bundle/steps/mod.rs +++ b/support-bundle-collection/src/steps/mod.rs @@ -4,8 +4,8 @@ //! Individual support bundle collection steps -use crate::app::background::tasks::support_bundle::cache::Cache; -use crate::app::background::tasks::support_bundle::step::CollectionStep; +use crate::cache::Cache; +use crate::step::CollectionStep; use futures::FutureExt; use nexus_types::internal_api::background::SupportBundleCollectionStep; diff --git a/nexus/src/app/background/tasks/support_bundle/steps/reconfigurator.rs b/support-bundle-collection/src/steps/reconfigurator.rs similarity index 93% rename from nexus/src/app/background/tasks/support_bundle/steps/reconfigurator.rs rename to support-bundle-collection/src/steps/reconfigurator.rs index d1c8d79c4f8..d8ad62ee6ed 100644 --- a/nexus/src/app/background/tasks/support_bundle/steps/reconfigurator.rs +++ b/support-bundle-collection/src/steps/reconfigurator.rs @@ -4,8 +4,8 @@ //! Collect reconfigurator state for support bundles -use crate::app::background::tasks::support_bundle::collection::BundleCollection; -use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; +use crate::collection::BundleCollection; +use crate::step::CollectionStepOutput; use anyhow::Context; use camino::Utf8Path; diff --git a/nexus/src/app/background/tasks/support_bundle/steps/sled_cubby.rs b/support-bundle-collection/src/steps/sled_cubby.rs similarity index 95% rename from nexus/src/app/background/tasks/support_bundle/steps/sled_cubby.rs rename to support-bundle-collection/src/steps/sled_cubby.rs index 1bd6dc59a36..91ce615e101 100644 --- a/nexus/src/app/background/tasks/support_bundle/steps/sled_cubby.rs +++ b/support-bundle-collection/src/steps/sled_cubby.rs @@ -4,9 +4,9 @@ //! Collect sled cubby information for support bundles -use crate::app::background::tasks::support_bundle::cache::Cache; -use crate::app::background::tasks::support_bundle::collection::BundleCollection; -use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; +use crate::cache::Cache; +use crate::collection::BundleCollection; +use crate::step::CollectionStepOutput; use anyhow::Context; use anyhow::bail; diff --git a/nexus/src/app/background/tasks/support_bundle/steps/sp_dumps.rs b/support-bundle-collection/src/steps/sp_dumps.rs similarity index 92% rename from nexus/src/app/background/tasks/support_bundle/steps/sp_dumps.rs rename to support-bundle-collection/src/steps/sp_dumps.rs index 07027abcc5a..496be1eb065 100644 --- a/nexus/src/app/background/tasks/support_bundle/steps/sp_dumps.rs +++ b/support-bundle-collection/src/steps/sp_dumps.rs @@ -4,11 +4,11 @@ //! Collect SP task dumps for support bundles -use crate::app::background::tasks::support_bundle::cache::Cache; -use crate::app::background::tasks::support_bundle::collection::BundleCollection; -use crate::app::background::tasks::support_bundle::step::CollectionStep; -use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; -use crate::app::background::tasks::support_bundle::steps; +use crate::cache::Cache; +use crate::collection::BundleCollection; +use crate::step::CollectionStep; +use crate::step::CollectionStepOutput; +use crate::steps; use anyhow::Context; use anyhow::bail; diff --git a/support-bundle-collection/src/zip.rs b/support-bundle-collection/src/zip.rs new file mode 100644 index 00000000000..b0065e8b326 --- /dev/null +++ b/support-bundle-collection/src/zip.rs @@ -0,0 +1,113 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Helpers for converting a collected bundle directory into a zipfile. +//! +//! These are used by callers that need to produce a single archive from +//! the directory of collected data — both Nexus (for storing on a sled +//! agent) and omdb (for writing to local storage). + +use ::zip::ZipWriter; +use ::zip::write::FullFileOptions; +use anyhow::Result; +use camino::Utf8DirEntry; +use camino::Utf8Path; +use camino_tempfile::Utf8TempDir; +use camino_tempfile::tempfile_in; + +/// Takes the contents of `dir`, and zips them into a single zipfile +/// stored as a tempfile under `tempdir`. +pub fn bundle_to_zipfile( + dir: &Utf8TempDir, + tempdir: &Utf8Path, +) -> Result { + let tempfile = tempfile_in(tempdir)?; + let mut zip = ZipWriter::new(tempfile); + + recursively_add_directory_to_zipfile(&mut zip, dir.path(), dir.path())?; + + Ok(zip.finish()?) +} + +fn recursively_add_directory_to_zipfile( + zip: &mut ZipWriter, + root_path: &Utf8Path, + dir_path: &Utf8Path, +) -> Result<()> { + // Readdir might return entries in a non-deterministic order. + // Let's sort it for the zipfile, to be nice. + let mut entries = dir_path + .read_dir_utf8()? + .filter_map(Result::ok) + .collect::>(); + entries.sort_by(|a, b| a.file_name().cmp(&b.file_name())); + + for entry in &entries { + // Strip the tempdir prefix when storing the path in the zipfile. + let dst = entry.path().strip_prefix(root_path)?; + + let file_type = entry.file_type()?; + if file_type.is_file() { + let src = entry.path(); + + let zip_time = entry + .path() + .metadata() + .and_then(|m| m.modified()) + .ok() + .and_then(|sys_time| jiff::Zoned::try_from(sys_time).ok()) + .and_then(|zoned| { + ::zip::DateTime::try_from(zoned.datetime()).ok() + }) + .unwrap_or_else(::zip::DateTime::default); + + let opts = FullFileOptions::default() + .last_modified_time(zip_time) + .compression_method(::zip::CompressionMethod::Deflated) + .large_file(true); + + zip.start_file_from_path(dst, opts)?; + let mut file = std::fs::File::open(&src)?; + std::io::copy(&mut file, zip)?; + } + if file_type.is_dir() { + let opts = FullFileOptions::default(); + zip.add_directory_from_path(dst, opts)?; + recursively_add_directory_to_zipfile(zip, root_path, entry.path())?; + } + } + Ok(()) +} + +#[cfg(test)] +mod test { + use super::*; + + use camino_tempfile::tempdir; + + // Ensure that we can convert a temporary directory into a zipfile + #[test] + fn test_zipfile_creation() { + let dir = tempdir().unwrap(); + let tempdir_for_zip = tempdir().unwrap(); + + std::fs::create_dir_all(dir.path().join("dir-a")).unwrap(); + std::fs::create_dir_all(dir.path().join("dir-b")).unwrap(); + std::fs::write(dir.path().join("dir-a").join("file-a"), "some data") + .unwrap(); + std::fs::write(dir.path().join("file-b"), "more data").unwrap(); + + let zipfile = bundle_to_zipfile(&dir, tempdir_for_zip.path()) + .expect("Should have been able to bundle zipfile"); + let archive = ::zip::read::ZipArchive::new(zipfile).unwrap(); + + // We expect the order to be deterministically alphabetical + let mut names = archive.file_names(); + assert_eq!(names.next(), Some("dir-a/")); + assert_eq!(names.next(), Some("dir-a/file-a")); + assert_eq!(names.next(), Some("dir-b/")); + assert_eq!(names.next(), Some("file-b")); + assert_eq!(names.next(), None); + } +} From d180d48a80c79c4f57eb15ef671492602df0282e Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 5 May 2026 13:21:24 -0700 Subject: [PATCH 3/6] Add `omdb support-bundle collect` subcommand MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires a new subcommand on omdb that calls into the `support-bundle-collection` crate to gather a bundle locally. Unlike the Nexus background task, this path does not register a row in the `support_bundle` table, does not transfer the bundle to a sled agent, and does not require Nexus to be up — it only needs CRDB, internal DNS, MGS, and sled-agents reachable on the underlay. This is intended for incident response: when Nexus is down (the most important time to gather a bundle), an operator can still produce one locally. --- Cargo.lock | 3 + dev-tools/omdb/Cargo.toml | 4 + dev-tools/omdb/src/bin/omdb/main.rs | 4 + .../src/bin/omdb/support_bundle_collect.rs | 221 ++++++++++++++++++ dev-tools/omdb/tests/test_all_output.rs | 45 ++++ dev-tools/omdb/tests/usage_errors.out | 2 + 6 files changed, 279 insertions(+) create mode 100644 dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs diff --git a/Cargo.lock b/Cargo.lock index 7a133d5af03..47ac15b35b9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8875,6 +8875,7 @@ dependencies = [ "nexus-db-schema", "nexus-inventory", "nexus-lockstep-client", + "nexus-networking", "nexus-reconfigurator-preparation", "nexus-saga-recovery", "nexus-test-utils", @@ -8907,6 +8908,7 @@ dependencies = [ "steno", "strum 0.27.2", "subprocess", + "support-bundle-collection", "support-bundle-viewer", "supports-color 3.0.2", "tabled 0.15.0", @@ -8919,6 +8921,7 @@ dependencies = [ "url", "uuid", "vergen-gitcl", + "zip 4.6.1", ] [[package]] diff --git a/dev-tools/omdb/Cargo.toml b/dev-tools/omdb/Cargo.toml index 27de5c27917..3ecacb60934 100644 --- a/dev-tools/omdb/Cargo.toml +++ b/dev-tools/omdb/Cargo.toml @@ -19,6 +19,7 @@ base64.workspace = true bootstrap-agent-lockstep-client.workspace = true bytes.workspace = true camino.workspace = true +camino-tempfile.workspace = true chrono.workspace = true clap.workspace = true clickhouse-admin-single-client.workspace = true @@ -54,6 +55,7 @@ nexus-db-queries.workspace = true nexus-db-schema.workspace = true nexus-inventory.workspace = true nexus-lockstep-client.workspace = true +nexus-networking.workspace = true nexus-reconfigurator-preparation.workspace = true nexus-saga-recovery.workspace = true nexus-types.workspace = true @@ -83,6 +85,7 @@ slog.workspace = true slog-error-chain.workspace = true steno.workspace = true strum.workspace = true +support-bundle-collection.workspace = true support-bundle-viewer.workspace = true supports-color.workspace = true tabled.workspace = true @@ -104,6 +107,7 @@ nexus-test-utils-macros.workspace = true omicron-nexus.workspace = true omicron-test-utils.workspace = true subprocess.workspace = true +zip.workspace = true # Disable doc builds by default for our binaries to work around issue # rust-lang/cargo#8373. These docs would not be very useful anyway. diff --git a/dev-tools/omdb/src/bin/omdb/main.rs b/dev-tools/omdb/src/bin/omdb/main.rs index 8d3524e96a7..ca8df0783ee 100644 --- a/dev-tools/omdb/src/bin/omdb/main.rs +++ b/dev-tools/omdb/src/bin/omdb/main.rs @@ -58,6 +58,7 @@ mod oxql; mod reconfigurator; mod sled_agent; mod support_bundle; +mod support_bundle_collect; fn main() -> Result<(), anyhow::Error> { sigpipe::reset(); @@ -83,6 +84,7 @@ async fn main_impl() -> Result<(), anyhow::Error> { reconfig.run_cmd(&args, &log).await } OmdbCommands::SledAgent(sled) => sled.run_cmd(&args, &log).await, + OmdbCommands::SupportBundle(sb) => sb.run_cmd(&args, &log).await, OmdbCommands::CrucibleAgent(crucible) => crucible.run_cmd(&args).await, OmdbCommands::CruciblePantry(crucible) => crucible.run_cmd(&args).await, OmdbCommands::ClickhouseAdmin(ch) => ch.run_cmd(&args, &log).await, @@ -297,6 +299,8 @@ enum OmdbCommands { Reconfigurator(reconfigurator::ReconfiguratorArgs), /// Debug a specific Sled SledAgent(sled_agent::SledAgentArgs), + /// Collect or inspect a support bundle + SupportBundle(support_bundle_collect::SupportBundleArgs), } fn parse_dropshot_log_level( diff --git a/dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs b/dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs new file mode 100644 index 00000000000..0a7a0af2d9a --- /dev/null +++ b/dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs @@ -0,0 +1,221 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! `omdb support-bundle collect` — collect a support bundle locally, +//! without going through Nexus. +//! +//! Unlike the Nexus background task, this path: +//! +//! - Does not register a row in the `support_bundle` table. +//! - Does not transfer the resulting bundle to a sled-agent for durable +//! storage. The zip is written to a local file path. +//! - Does not require Nexus to be up. It only needs CRDB, internal +//! DNS, MGS, and the rack's sled-agents reachable on the underlay. +//! +//! This is intended for incident response, where the operator may need +//! to collect a bundle precisely because Nexus is unhealthy. + +use crate::Omdb; +use crate::db::DbUrlOptions; +use anyhow::Context; +use camino::Utf8PathBuf; +use camino_tempfile::tempdir_in; +use clap::Args; +use clap::Subcommand; +use clap::ValueEnum; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_types::fm::ereport::EreportFilters; +use nexus_types::support_bundle::BundleDataSelection; +use omicron_uuid_kinds::SupportBundleUuid; +use std::io::Seek; +use std::io::SeekFrom; +use std::sync::Arc; +use support_bundle_collection::BundleCollection; +use support_bundle_collection::BundleInfo; +use support_bundle_collection::zip::bundle_to_zipfile; + +/// Categories of data the bundle collector knows how to gather. +/// +/// Mirrors `nexus_types::support_bundle::BundleDataCategory`, but is +/// declared here so it can derive `clap::ValueEnum` without making +/// `nexus-types` depend on clap. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, ValueEnum)] +enum BundleCategory { + Reconfigurator, + HostInfo, + SledCubbyInfo, + SpDumps, + Ereports, +} + +/// Arguments to the "omdb support-bundle" subcommand +#[derive(Debug, Args)] +pub struct SupportBundleArgs { + #[command(subcommand)] + command: SupportBundleCommands, +} + +#[derive(Debug, Subcommand)] +enum SupportBundleCommands { + /// Collect a support bundle without involving Nexus. + /// + /// Connects directly to CockroachDB, internal DNS, MGS, and the + /// rack's sled-agents — none of which depend on Nexus being up. + /// The bundle is written to a local zip file. No row is created + /// in the `support_bundle` table. + Collect(CollectArgs), +} + +#[derive(Debug, Args)] +struct CollectArgs { + #[command(flatten)] + db_url_opts: DbUrlOptions, + + /// Path where the resulting bundle zip will be written. + #[clap(long, short = 'o')] + output: Utf8PathBuf, + + /// Reason recorded inside the bundle's metadata. + #[clap(long, default_value = "collected via omdb")] + reason: String, + + /// Directory to use for staging the bundle contents before zipping. + #[clap(long, default_value = "/var/tmp")] + tempdir: Utf8PathBuf, + + /// Categories of data to collect. May be supplied multiple times. + /// Defaults to all categories. + #[clap(long, value_enum)] + include: Vec, +} + +impl CollectArgs { + fn data_selection(&self) -> BundleDataSelection { + let categories: &[BundleCategory] = if self.include.is_empty() { + BundleCategory::value_variants() + } else { + self.include.as_slice() + }; + + let mut sel = BundleDataSelection::new(); + for category in categories { + sel = match category { + BundleCategory::Reconfigurator => sel.with_reconfigurator(), + BundleCategory::HostInfo => sel.with_all_sleds(), + BundleCategory::SledCubbyInfo => sel.with_sled_cubby_info(), + BundleCategory::SpDumps => sel.with_sp_dumps(), + BundleCategory::Ereports => sel.with_ereports( + EreportFilters::new() + .with_start_time( + omicron_common::now_db_precision() + - chrono::Days::new(7), + ) + .expect("no end time set, cannot fail"), + ), + }; + } + sel + } +} + +impl SupportBundleArgs { + pub async fn run_cmd( + &self, + omdb: &Omdb, + log: &slog::Logger, + ) -> anyhow::Result<()> { + match &self.command { + SupportBundleCommands::Collect(args) => args.run(omdb, log).await, + } + } +} + +impl CollectArgs { + async fn run(&self, omdb: &Omdb, log: &slog::Logger) -> anyhow::Result<()> { + self.db_url_opts + .with_datastore(omdb, log, async |opctx, datastore| { + self.collect(omdb, log, opctx, datastore).await + }) + .await + } + + async fn collect( + &self, + omdb: &Omdb, + log: &slog::Logger, + opctx: OpContext, + datastore: Arc, + ) -> anyhow::Result<()> { + let resolver = omdb.dns_resolver(log.clone()).await?; + + let bundle = BundleInfo { + id: SupportBundleUuid::new_v4(), + reason_for_creation: self.reason.clone(), + }; + let bundle_log = log.new(slog::o!("bundle" => bundle.id.to_string())); + eprintln!("Collecting support bundle {}", bundle.id); + + let collection = Arc::new(BundleCollection::new( + datastore, + resolver, + bundle_log, + opctx, + self.data_selection(), + bundle, + )); + + // Wire Ctrl-C to cancel the in-flight collection. + let cancel_handle = tokio::spawn({ + let token = collection.cancellation_token().clone(); + async move { + let _ = tokio::signal::ctrl_c().await; + eprintln!("\nCtrl-C received — cancelling bundle collection."); + token.cancel(); + } + }); + + let dir = tempdir_in(&self.tempdir).with_context(|| { + format!("creating temp dir under {}", self.tempdir) + })?; + let collect_result = collection.collect_bundle_locally(&dir).await; + cancel_handle.abort(); + let _ = cancel_handle.await; + let report = collect_result?; + + let zip_tempdir = self.tempdir.clone(); + let output = self.output.clone(); + tokio::task::spawn_blocking(move || -> anyhow::Result<()> { + let mut tempfile = bundle_to_zipfile(&dir, &zip_tempdir)?; + tempfile.seek(SeekFrom::Start(0))?; + let mut out = std::fs::File::create(&output) + .with_context(|| format!("creating {output}"))?; + std::io::copy(&mut tempfile, &mut out)?; + Ok(()) + }) + .await + .context("zip task panicked")??; + + eprintln!("Wrote bundle to {}", self.output); + eprintln!("{} steps executed:", report.steps.len()); + for step in &report.steps { + let dur = step.end - step.start; + eprintln!( + " {:>9}ms {:?} {}", + dur.num_milliseconds(), + step.status, + step.name, + ); + } + if let Some(ereports) = &report.ereports { + eprintln!( + "ereports: {} found, {} collected, {} errors", + ereports.n_found, + ereports.n_collected, + ereports.errors.len(), + ); + } + Ok(()) + } +} diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index b787dceefd9..0dd1b55843f 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -436,6 +436,51 @@ async fn test_omdb_success_cases() { ); assert!(!parsed.collections.is_empty()); + // Exercise `omdb support-bundle collect` end-to-end. We don't add this + // to the `successes.out` snapshot because the output includes a + // randomly-generated bundle UUID, timing-dependent step durations, + // and per-sled step names that would all need redaction. Instead we + // run the command and verify the resulting zip is well-formed and + // contains the expected metadata files. + let bundle_path = tmpdir.path().join("bundle.zip"); + let bundle_args: &[&str] = &[ + "support-bundle", + "collect", + "--output", + bundle_path.as_str(), + "--tempdir", + tmpdir.path().as_str(), + "--reason", + "integration test", + ]; + let mut bundle_output = String::new(); + let p = postgres_url.clone(); + let dns = cptestctx.internal_dns.dns_server.local_address().to_string(); + do_run_no_redactions( + &mut bundle_output, + move |exec| exec.env("OMDB_DB_URL", &p).env("OMDB_DNS_SERVER", &dns), + &cmd_path, + bundle_args, + ) + .await; + let zip_file = std::fs::File::open(&bundle_path).unwrap_or_else(|err| { + panic!( + "bundle zip not produced at {bundle_path}: {}\n\ + omdb output was:\n{bundle_output}", + InlineErrorChain::new(&err), + ) + }); + let mut archive = + zip::ZipArchive::new(zip_file).expect("bundle is a valid zip archive"); + for required in + ["bundle_id.txt", "meta/reason_for_creation.txt", "meta/trace.json"] + { + assert!( + archive.by_name(required).is_ok(), + "bundle zip is missing expected entry {required}", + ); + } + let ox_invocation = &["oximeter", "list-producers"]; let mut ox_output = String::new(); let ox = ox_url.clone(); diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index b940a1f991f..db57bc2686c 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -19,6 +19,7 @@ Commands: oxql Enter the Oximeter Query Language shell for interactive querying reconfigurator Interact with the Reconfigurator system sled-agent Debug a specific Sled + support-bundle Collect or inspect a support bundle help Print this message or the help of the given subcommand(s) Options: @@ -54,6 +55,7 @@ Commands: oxql Enter the Oximeter Query Language shell for interactive querying reconfigurator Interact with the Reconfigurator system sled-agent Debug a specific Sled + support-bundle Collect or inspect a support bundle help Print this message or the help of the given subcommand(s) Options: From e3e45e488e0d65d5641e7642ac7d194e3e19b2a3 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 22 May 2026 10:56:00 -0700 Subject: [PATCH 4/6] Re-use BundleDataCategory enum --- .../src/bin/omdb/support_bundle_collect.rs | 31 ++++++------------- nexus/types/src/support_bundle.rs | 12 ++++++- 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs b/dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs index 0a7a0af2d9a..d26a1cc3e40 100644 --- a/dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs +++ b/dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs @@ -27,6 +27,7 @@ use clap::ValueEnum; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::fm::ereport::EreportFilters; +use nexus_types::support_bundle::BundleDataCategory; use nexus_types::support_bundle::BundleDataSelection; use omicron_uuid_kinds::SupportBundleUuid; use std::io::Seek; @@ -36,20 +37,6 @@ use support_bundle_collection::BundleCollection; use support_bundle_collection::BundleInfo; use support_bundle_collection::zip::bundle_to_zipfile; -/// Categories of data the bundle collector knows how to gather. -/// -/// Mirrors `nexus_types::support_bundle::BundleDataCategory`, but is -/// declared here so it can derive `clap::ValueEnum` without making -/// `nexus-types` depend on clap. -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, ValueEnum)] -enum BundleCategory { - Reconfigurator, - HostInfo, - SledCubbyInfo, - SpDumps, - Ereports, -} - /// Arguments to the "omdb support-bundle" subcommand #[derive(Debug, Args)] pub struct SupportBundleArgs { @@ -88,13 +75,13 @@ struct CollectArgs { /// Categories of data to collect. May be supplied multiple times. /// Defaults to all categories. #[clap(long, value_enum)] - include: Vec, + include: Vec, } impl CollectArgs { fn data_selection(&self) -> BundleDataSelection { - let categories: &[BundleCategory] = if self.include.is_empty() { - BundleCategory::value_variants() + let categories: &[BundleDataCategory] = if self.include.is_empty() { + BundleDataCategory::value_variants() } else { self.include.as_slice() }; @@ -102,11 +89,11 @@ impl CollectArgs { let mut sel = BundleDataSelection::new(); for category in categories { sel = match category { - BundleCategory::Reconfigurator => sel.with_reconfigurator(), - BundleCategory::HostInfo => sel.with_all_sleds(), - BundleCategory::SledCubbyInfo => sel.with_sled_cubby_info(), - BundleCategory::SpDumps => sel.with_sp_dumps(), - BundleCategory::Ereports => sel.with_ereports( + BundleDataCategory::Reconfigurator => sel.with_reconfigurator(), + BundleDataCategory::HostInfo => sel.with_all_sleds(), + BundleDataCategory::SledCubbyInfo => sel.with_sled_cubby_info(), + BundleDataCategory::SpDumps => sel.with_sp_dumps(), + BundleDataCategory::Ereports => sel.with_ereports( EreportFilters::new() .with_start_time( omicron_common::now_db_precision() diff --git a/nexus/types/src/support_bundle.rs b/nexus/types/src/support_bundle.rs index 4c95dcb3233..c35b3032c24 100644 --- a/nexus/types/src/support_bundle.rs +++ b/nexus/types/src/support_bundle.rs @@ -16,7 +16,17 @@ use std::collections::HashSet; use std::fmt; /// Describes the category of support bundle data. -#[derive(Debug, Clone, Copy, Hash, Eq, PartialEq, Serialize, Deserialize)] +#[derive( + Debug, + Clone, + Copy, + Hash, + Eq, + PartialEq, + Serialize, + Deserialize, + clap::ValueEnum, +)] #[cfg_attr(test, derive(test_strategy::Arbitrary))] pub enum BundleDataCategory { /// Collects reconfigurator state (some of the latest blueprints, From c1a72bb237b3c0cc48a0818aff0d145df0a07618 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 22 May 2026 12:52:33 -0700 Subject: [PATCH 5/6] Allow streaming to stdout, avoid intermediate file --- .../src/bin/omdb/support_bundle_collect.rs | 35 ++++-- dev-tools/omdb/tests/test_all_output.rs | 45 ++++++++ support-bundle-collection/src/zip.rs | 105 ++++++++++++++---- 3 files changed, 151 insertions(+), 34 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs b/dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs index d26a1cc3e40..2db5d4921a1 100644 --- a/dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs +++ b/dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs @@ -30,12 +30,12 @@ use nexus_types::fm::ereport::EreportFilters; use nexus_types::support_bundle::BundleDataCategory; use nexus_types::support_bundle::BundleDataSelection; use omicron_uuid_kinds::SupportBundleUuid; -use std::io::Seek; -use std::io::SeekFrom; +use std::io::Write; use std::sync::Arc; use support_bundle_collection::BundleCollection; use support_bundle_collection::BundleInfo; -use support_bundle_collection::zip::bundle_to_zipfile; +use support_bundle_collection::zip::bundle_to_stream; +use support_bundle_collection::zip::bundle_to_writer; /// Arguments to the "omdb support-bundle" subcommand #[derive(Debug, Args)] @@ -60,9 +60,10 @@ struct CollectArgs { #[command(flatten)] db_url_opts: DbUrlOptions, - /// Path where the resulting bundle zip will be written. + /// Optional path where the bundle zip will be written. If omitted, + /// the zip is streamed to stdout (suitable for piping over ssh). #[clap(long, short = 'o')] - output: Utf8PathBuf, + output: Option, /// Reason recorded inside the bundle's metadata. #[clap(long, default_value = "collected via omdb")] @@ -171,20 +172,30 @@ impl CollectArgs { let _ = cancel_handle.await; let report = collect_result?; - let zip_tempdir = self.tempdir.clone(); let output = self.output.clone(); tokio::task::spawn_blocking(move || -> anyhow::Result<()> { - let mut tempfile = bundle_to_zipfile(&dir, &zip_tempdir)?; - tempfile.seek(SeekFrom::Start(0))?; - let mut out = std::fs::File::create(&output) - .with_context(|| format!("creating {output}"))?; - std::io::copy(&mut tempfile, &mut out)?; + match output { + Some(path) => { + let file = std::fs::File::create(&path) + .with_context(|| format!("creating {path}"))?; + bundle_to_writer(&dir, &file)?; + } + None => { + let mut stdout = std::io::stdout().lock(); + bundle_to_stream(&dir, &mut stdout)?; + stdout.flush()?; + } + } Ok(()) }) .await .context("zip task panicked")??; - eprintln!("Wrote bundle to {}", self.output); + if let Some(path) = &self.output { + eprintln!("Wrote bundle to {path}"); + } else { + eprintln!("Bundle streamed to stdout"); + } eprintln!("{} steps executed:", report.steps.len()); for step in &report.steps { let dur = step.end - step.start; diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index 93e63c04155..d1d1beb670d 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -473,6 +473,51 @@ async fn test_omdb_success_cases() { ); } + // Now exercise the stdout-streaming path: omit `--output` and + // capture stdout as bytes. Verifies the data-descriptor zip variant + // produced by `bundle_to_stream` is well-formed and contains the + // expected metadata. + let stdout_path = tmpdir.path().join("bundle-stdout.zip"); + let stdout_file = + std::fs::File::create(&stdout_path).expect("create stdout capture"); + let cmd_path_owned = cmd_path.to_path_buf(); + let p = postgres_url.clone(); + let dns = cptestctx.internal_dns.dns_server.local_address().to_string(); + let stream_tempdir = tmpdir.path().to_owned(); + let exit_status = tokio::task::spawn_blocking(move || { + Exec::cmd(&cmd_path_owned) + .env("OMDB_DB_URL", &p) + .env("OMDB_DNS_SERVER", &dns) + .env("RUST_BACKTRACE", "1") + .env("RUST_LIB_BACKTRACE", "0") + .args(&[ + "support-bundle", + "collect", + "--tempdir", + stream_tempdir.as_str(), + "--reason", + "integration test (stdout)", + ]) + .stdout(subprocess::Redirection::File(stdout_file)) + .join() + .expect("running omdb to stream a bundle") + }) + .await + .expect("spawn_blocking"); + assert!(exit_status.success(), "stdout streaming failed: {exit_status:?}"); + let zip_file = std::fs::File::open(&stdout_path) + .expect("captured stdout file should exist"); + let mut archive = zip::ZipArchive::new(zip_file) + .expect("streamed bundle is a valid zip archive"); + for required in + ["bundle_id.txt", "meta/reason_for_creation.txt", "meta/trace.json"] + { + assert!( + archive.by_name(required).is_ok(), + "streamed bundle zip is missing expected entry {required}", + ); + } + let ox_invocation = &["oximeter", "list-producers"]; let mut ox_output = String::new(); let ox = ox_url.clone(); diff --git a/support-bundle-collection/src/zip.rs b/support-bundle-collection/src/zip.rs index b0065e8b326..d8fd4d942c0 100644 --- a/support-bundle-collection/src/zip.rs +++ b/support-bundle-collection/src/zip.rs @@ -2,11 +2,19 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Helpers for converting a collected bundle directory into a zipfile. +//! Helpers for converting a collected bundle directory into a zip archive. //! -//! These are used by callers that need to produce a single archive from -//! the directory of collected data — both Nexus (for storing on a sled -//! agent) and omdb (for writing to local storage). +//! Three entry points: +//! +//! - [`bundle_to_writer`] writes a standard zip into any `Write + Seek` +//! sink. Used by omdb when `--output` is a regular file. +//! - [`bundle_to_stream`] writes a zip with data descriptors into a +//! non-seekable sink. Used by omdb when streaming the bundle to stdout +//! (e.g. to pipe over ssh from a switch zone). +//! - [`bundle_to_zipfile`] is a thin convenience that allocates a tempfile +//! and delegates to [`bundle_to_writer`]. Retained for Nexus's +//! chunked-upload path, which needs an owned seekable `File` for +//! hashing + per-chunk `try_clone` / `seek`. use ::zip::ZipWriter; use ::zip::write::FullFileOptions; @@ -15,23 +23,46 @@ use camino::Utf8DirEntry; use camino::Utf8Path; use camino_tempfile::Utf8TempDir; use camino_tempfile::tempfile_in; +use std::io::Write; + +/// Write a bundle zip into a seekable destination. Produces a standard +/// zip (no data descriptors). +pub fn bundle_to_writer( + dir: &Utf8TempDir, + writer: W, +) -> Result<()> { + write_zip(dir, ZipWriter::new(writer)) +} + +/// Write a bundle zip into a non-seekable destination. The resulting +/// archive uses zip data descriptors (~16 bytes of overhead per entry) +/// and is readable by any standard unzip tool. +pub fn bundle_to_stream(dir: &Utf8TempDir, writer: W) -> Result<()> { + write_zip(dir, ZipWriter::new_stream(writer)) +} -/// Takes the contents of `dir`, and zips them into a single zipfile -/// stored as a tempfile under `tempdir`. +/// Zip the contents of `dir` into a tempfile under `tempdir` and return +/// the owned file handle. Used by Nexus's chunked-upload path. pub fn bundle_to_zipfile( dir: &Utf8TempDir, tempdir: &Utf8Path, ) -> Result { - let tempfile = tempfile_in(tempdir)?; - let mut zip = ZipWriter::new(tempfile); + let mut tempfile = tempfile_in(tempdir)?; + bundle_to_writer(dir, &mut tempfile)?; + Ok(tempfile) +} +fn write_zip( + dir: &Utf8TempDir, + mut zip: ZipWriter, +) -> Result<()> { recursively_add_directory_to_zipfile(&mut zip, dir.path(), dir.path())?; - - Ok(zip.finish()?) + zip.finish()?; + Ok(()) } -fn recursively_add_directory_to_zipfile( - zip: &mut ZipWriter, +fn recursively_add_directory_to_zipfile( + zip: &mut ZipWriter, root_path: &Utf8Path, dir_path: &Utf8Path, ) -> Result<()> { @@ -85,24 +116,21 @@ mod test { use super::*; use camino_tempfile::tempdir; + use std::io::Cursor; - // Ensure that we can convert a temporary directory into a zipfile - #[test] - fn test_zipfile_creation() { + fn make_sample_bundle() -> Utf8TempDir { let dir = tempdir().unwrap(); - let tempdir_for_zip = tempdir().unwrap(); - std::fs::create_dir_all(dir.path().join("dir-a")).unwrap(); std::fs::create_dir_all(dir.path().join("dir-b")).unwrap(); std::fs::write(dir.path().join("dir-a").join("file-a"), "some data") .unwrap(); std::fs::write(dir.path().join("file-b"), "more data").unwrap(); + dir + } - let zipfile = bundle_to_zipfile(&dir, tempdir_for_zip.path()) - .expect("Should have been able to bundle zipfile"); - let archive = ::zip::read::ZipArchive::new(zipfile).unwrap(); - - // We expect the order to be deterministically alphabetical + fn assert_expected_entries( + archive: ::zip::read::ZipArchive, + ) { let mut names = archive.file_names(); assert_eq!(names.next(), Some("dir-a/")); assert_eq!(names.next(), Some("dir-a/file-a")); @@ -110,4 +138,37 @@ mod test { assert_eq!(names.next(), Some("file-b")); assert_eq!(names.next(), None); } + + // Ensure that bundle_to_writer produces a deterministically-ordered + // archive when given a seekable destination. + #[test] + fn test_bundle_to_writer() { + let dir = make_sample_bundle(); + let mut buf = Cursor::new(Vec::new()); + bundle_to_writer(&dir, &mut buf).unwrap(); + let archive = ::zip::read::ZipArchive::new(buf).unwrap(); + assert_expected_entries(archive); + } + + // Ensure that bundle_to_stream produces the same archive contents + // when given a non-seekable destination (using data descriptors). + #[test] + fn test_bundle_to_stream() { + let dir = make_sample_bundle(); + let mut buf: Vec = Vec::new(); + bundle_to_stream(&dir, &mut buf).unwrap(); + let archive = ::zip::read::ZipArchive::new(Cursor::new(buf)).unwrap(); + assert_expected_entries(archive); + } + + // Ensure that the tempfile-returning convenience still works for the + // Nexus chunked-upload path. + #[test] + fn test_bundle_to_zipfile() { + let dir = make_sample_bundle(); + let tempdir_for_zip = tempdir().unwrap(); + let zipfile = bundle_to_zipfile(&dir, tempdir_for_zip.path()).unwrap(); + let archive = ::zip::read::ZipArchive::new(zipfile).unwrap(); + assert_expected_entries(archive); + } } From 7e4e6bb6f0c18c8e7ef273b6483c72a496058a73 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 22 May 2026 13:06:46 -0700 Subject: [PATCH 6/6] Add destructive argument --- dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs | 6 ++++++ dev-tools/omdb/tests/test_all_output.rs | 2 ++ 2 files changed, 8 insertions(+) diff --git a/dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs b/dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs index 2db5d4921a1..285bd45e2b6 100644 --- a/dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs +++ b/dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs @@ -122,6 +122,12 @@ impl SupportBundleArgs { impl CollectArgs { async fn run(&self, omdb: &Omdb, log: &slog::Logger) -> anyhow::Result<()> { + // Collecting a full bundle stages every file in --tempdir before + // (or while) writing the zip. On the switch zone, where this + // command typically runs during incident response, disk space is + // limited and a large bundle can fill it. Gate the command behind + // -w/--destructive so an operator opts in knowingly. + let _token = omdb.check_allow_destructive()?; self.db_url_opts .with_datastore(omdb, log, async |opctx, datastore| { self.collect(omdb, log, opctx, datastore).await diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index d1d1beb670d..3af437f65d1 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -436,6 +436,7 @@ async fn test_omdb_success_cases() { // contains the expected metadata files. let bundle_path = tmpdir.path().join("bundle.zip"); let bundle_args: &[&str] = &[ + "-w", "support-bundle", "collect", "--output", @@ -491,6 +492,7 @@ async fn test_omdb_success_cases() { .env("RUST_BACKTRACE", "1") .env("RUST_LIB_BACKTRACE", "0") .args(&[ + "-w", "support-bundle", "collect", "--tempdir",