diff --git a/Cargo.lock b/Cargo.lock index f38a163a6cc..d74af4d89b7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8882,6 +8882,7 @@ dependencies = [ "nexus-db-schema", "nexus-inventory", "nexus-lockstep-client", + "nexus-networking", "nexus-reconfigurator-preparation", "nexus-saga-recovery", "nexus-test-utils", @@ -8915,6 +8916,7 @@ dependencies = [ "steno", "strum 0.27.2", "subprocess", + "support-bundle-collection", "support-bundle-viewer", "supports-color 3.0.2", "tabled 0.15.0", @@ -8927,6 +8929,7 @@ dependencies = [ "url", "uuid", "vergen-gitcl", + "zip 4.6.1", ] [[package]] diff --git a/dev-tools/omdb/Cargo.toml b/dev-tools/omdb/Cargo.toml index 1bfef37d333..18d255e9e66 100644 --- a/dev-tools/omdb/Cargo.toml +++ b/dev-tools/omdb/Cargo.toml @@ -19,6 +19,7 @@ base64.workspace = true bootstrap-agent-lockstep-client.workspace = true bytes.workspace = true camino.workspace = true +camino-tempfile.workspace = true chrono.workspace = true clap.workspace = true clickhouse-admin-single-client.workspace = true @@ -55,6 +56,7 @@ nexus-db-queries.workspace = true nexus-db-schema.workspace = true nexus-inventory.workspace = true nexus-lockstep-client.workspace = true +nexus-networking.workspace = true nexus-reconfigurator-preparation.workspace = true nexus-saga-recovery.workspace = true nexus-types.workspace = true @@ -86,6 +88,7 @@ slog.workspace = true slog-error-chain.workspace = true steno.workspace = true strum.workspace = true +support-bundle-collection.workspace = true support-bundle-viewer.workspace = true supports-color.workspace = true tabled.workspace = true @@ -110,6 +113,7 @@ nexus-test-utils.workspace = true nexus-test-utils-macros.workspace = true omicron-test-utils.workspace = true subprocess.workspace = true +zip.workspace = true # Disable doc builds by default for our binaries to work around issue # rust-lang/cargo#8373. These docs would not be very useful anyway. 
diff --git a/dev-tools/omdb/src/bin/omdb/main.rs b/dev-tools/omdb/src/bin/omdb/main.rs index 57b8b8bd508..74f34e29a0d 100644 --- a/dev-tools/omdb/src/bin/omdb/main.rs +++ b/dev-tools/omdb/src/bin/omdb/main.rs @@ -58,6 +58,7 @@ mod oxql; mod reconfigurator; mod sled_agent; mod support_bundle; +mod support_bundle_collect; mod timesync; fn main() -> Result<(), anyhow::Error> { @@ -84,6 +85,7 @@ async fn main_impl() -> Result<(), anyhow::Error> { reconfig.run_cmd(&args, &log).await } OmdbCommands::SledAgent(sled) => sled.run_cmd(&args, &log).await, + OmdbCommands::SupportBundle(sb) => sb.run_cmd(&args, &log).await, OmdbCommands::CrucibleAgent(crucible) => crucible.run_cmd(&args).await, OmdbCommands::CruciblePantry(crucible) => crucible.run_cmd(&args).await, OmdbCommands::ClickhouseAdmin(ch) => ch.run_cmd(&args, &log).await, @@ -299,6 +301,8 @@ enum OmdbCommands { Reconfigurator(reconfigurator::ReconfiguratorArgs), /// Debug a specific Sled SledAgent(sled_agent::SledAgentArgs), + /// Collect or inspect a support bundle + SupportBundle(support_bundle_collect::SupportBundleArgs), /// Monitor time synchronization Timesync(timesync::TimesyncArgs), } diff --git a/dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs b/dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs new file mode 100644 index 00000000000..285bd45e2b6 --- /dev/null +++ b/dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs @@ -0,0 +1,225 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! `omdb support-bundle collect` — collect a support bundle locally, +//! without going through Nexus. +//! +//! Unlike the Nexus background task, this path: +//! +//! - Does not register a row in the `support_bundle` table. +//! - Does not transfer the resulting bundle to a sled-agent for durable +//! storage. The zip is written to a local file path. +//! 
- Does not require Nexus to be up. It only needs CRDB, internal
+//!   DNS, MGS, and the rack's sled-agents reachable on the underlay.
+//!
+//! This is intended for incident response, where the operator may need
+//! to collect a bundle precisely because Nexus is unhealthy.
+
+use crate::Omdb;
+use crate::db::DbUrlOptions;
+use anyhow::Context;
+use camino::Utf8PathBuf;
+use camino_tempfile::tempdir_in;
+use clap::Args;
+use clap::Subcommand;
+use clap::ValueEnum;
+use nexus_db_queries::context::OpContext;
+use nexus_db_queries::db::DataStore;
+use nexus_types::fm::ereport::EreportFilters;
+use nexus_types::support_bundle::BundleDataCategory;
+use nexus_types::support_bundle::BundleDataSelection;
+use omicron_uuid_kinds::SupportBundleUuid;
+use std::io::Write;
+use std::sync::Arc;
+use support_bundle_collection::BundleCollection;
+use support_bundle_collection::BundleInfo;
+use support_bundle_collection::zip::bundle_to_stream;
+use support_bundle_collection::zip::bundle_to_writer;
+
+/// Arguments to the "omdb support-bundle" subcommand
+#[derive(Debug, Args)]
+pub struct SupportBundleArgs {
+    #[command(subcommand)]
+    command: SupportBundleCommands,
+}
+
+// NOTE: the `///` comments on the clap-derived items below double as
+// `--help` text, so edits to them change user-visible output.
+#[derive(Debug, Subcommand)]
+enum SupportBundleCommands {
+    /// Collect a support bundle without involving Nexus.
+    ///
+    /// Connects directly to CockroachDB, internal DNS, MGS, and the
+    /// rack's sled-agents — none of which depend on Nexus being up.
+    /// The bundle is written to a local zip file. No row is created
+    /// in the `support_bundle` table.
+    Collect(CollectArgs),
+}
+
+#[derive(Debug, Args)]
+struct CollectArgs {
+    #[command(flatten)]
+    db_url_opts: DbUrlOptions,
+
+    /// Optional path where the bundle zip will be written. If omitted,
+    /// the zip is streamed to stdout (suitable for piping over ssh).
+    #[clap(long, short = 'o')]
+    output: Option<Utf8PathBuf>,
+
+    /// Reason recorded inside the bundle's metadata.
+    #[clap(long, default_value = "collected via omdb")]
+    reason: String,
+
+    /// Directory to use for staging the bundle contents before zipping.
+    #[clap(long, default_value = "/var/tmp")]
+    tempdir: Utf8PathBuf,
+
+    /// Categories of data to collect. May be supplied multiple times.
+    /// Defaults to all categories.
+    #[clap(long, value_enum)]
+    include: Vec<BundleDataCategory>,
+}
+
+impl CollectArgs {
+    /// Translates the `--include` flags into a [`BundleDataSelection`].
+    ///
+    /// An empty `--include` list selects every [`BundleDataCategory`]
+    /// variant. Ereports are always restricted to those newer than seven
+    /// days before the moment this method is called.
+    fn data_selection(&self) -> BundleDataSelection {
+        let categories: &[BundleDataCategory] = if self.include.is_empty() {
+            BundleDataCategory::value_variants()
+        } else {
+            self.include.as_slice()
+        };
+
+        let mut sel = BundleDataSelection::new();
+        for category in categories {
+            // Exhaustive on purpose: adding a category must fail to
+            // compile here until it is wired up.
+            sel = match category {
+                BundleDataCategory::Reconfigurator => sel.with_reconfigurator(),
+                BundleDataCategory::HostInfo => sel.with_all_sleds(),
+                BundleDataCategory::SledCubbyInfo => sel.with_sled_cubby_info(),
+                BundleDataCategory::SpDumps => sel.with_sp_dumps(),
+                BundleDataCategory::Ereports => sel.with_ereports(
+                    EreportFilters::new()
+                        .with_start_time(
+                            omicron_common::now_db_precision()
+                                - chrono::Days::new(7),
+                        )
+                        .expect("no end time set, cannot fail"),
+                ),
+            };
+        }
+        sel
+    }
+}
+
+impl SupportBundleArgs {
+    /// Dispatches the parsed `omdb support-bundle` subcommand.
+    pub async fn run_cmd(
+        &self,
+        omdb: &Omdb,
+        log: &slog::Logger,
+    ) -> anyhow::Result<()> {
+        match &self.command {
+            SupportBundleCommands::Collect(args) => args.run(omdb, log).await,
+        }
+    }
+}
+
+impl CollectArgs {
+    /// Entry point for `omdb support-bundle collect`.
+    ///
+    /// Fails early unless the destructive-operations check passes, then
+    /// opens the datastore and hands off to [`CollectArgs::collect`].
+    async fn run(&self, omdb: &Omdb, log: &slog::Logger) -> anyhow::Result<()> {
+        // Collecting a full bundle stages every file in --tempdir before
+        // (or while) writing the zip. On the switch zone, where this
+        // command typically runs during incident response, disk space is
+        // limited and a large bundle can fill it. Gate the command behind
+        // -w/--destructive so an operator opts in knowingly.
+        let _token = omdb.check_allow_destructive()?;
+        self.db_url_opts
+            .with_datastore(omdb, log, async |opctx, datastore| {
+                self.collect(omdb, log, opctx, datastore).await
+            })
+            .await
+    }
+
+    /// Collects the bundle into a staging directory under `--tempdir`,
+    /// zips it to `--output` (or stdout), and prints a per-step summary
+    /// to stderr.
+    ///
+    /// All human-readable progress goes to stderr so that stdout carries
+    /// nothing but zip bytes when `--output` is omitted.
+    async fn collect(
+        &self,
+        omdb: &Omdb,
+        log: &slog::Logger,
+        opctx: OpContext,
+        datastore: Arc<DataStore>,
+    ) -> anyhow::Result<()> {
+        let resolver = omdb.dns_resolver(log.clone()).await?;
+
+        let bundle = BundleInfo {
+            id: SupportBundleUuid::new_v4(),
+            reason_for_creation: self.reason.clone(),
+        };
+        let bundle_log = log.new(slog::o!("bundle" => bundle.id.to_string()));
+        eprintln!("Collecting support bundle {}", bundle.id);
+
+        let collection = Arc::new(BundleCollection::new(
+            datastore,
+            resolver,
+            bundle_log,
+            opctx,
+            self.data_selection(),
+            bundle,
+        ));
+
+        // Wire Ctrl-C to cancel the in-flight collection.
+        let cancel_handle = tokio::spawn({
+            let token = collection.cancellation_token().clone();
+            async move {
+                let _ = tokio::signal::ctrl_c().await;
+                eprintln!("\nCtrl-C received — cancelling bundle collection.");
+                token.cancel();
+            }
+        });
+
+        let dir = tempdir_in(&self.tempdir).with_context(|| {
+            format!("creating temp dir under {}", self.tempdir)
+        })?;
+        let collect_result = collection.collect_bundle_locally(&dir).await;
+        // Stop the listener before looking at the result so it never
+        // outlives the collection it was meant to cancel.
+        //
+        // NOTE(review): per the tokio `signal::ctrl_c` documentation, once
+        // a listener has been registered the default SIGINT disposition
+        // stays overridden for the life of the process. After this abort
+        // nothing is listening, so a Ctrl-C during the zip phase below
+        // appears to be ignored — confirm, and consider keeping a listener
+        // alive until the zip has been written.
+        cancel_handle.abort();
+        let _ = cancel_handle.await;
+        let report = collect_result?;
+
+        // Zipping is synchronous, potentially long-running file I/O, so it
+        // runs on the blocking pool. `dir` is moved into the closure and
+        // is dropped there once the zip has been written.
+        let output = self.output.clone();
+        tokio::task::spawn_blocking(move || -> anyhow::Result<()> {
+            match output {
+                Some(path) => {
+                    let file = std::fs::File::create(&path)
+                        .with_context(|| format!("creating {path}"))?;
+                    bundle_to_writer(&dir, &file).with_context(|| {
+                        format!("writing bundle zip to {path}")
+                    })?;
+                }
+                None => {
+                    // Stdout is not seekable, so use the streaming
+                    // writer; hold the lock for the whole archive.
+                    let mut stdout = std::io::stdout().lock();
+                    bundle_to_stream(&dir, &mut stdout)
+                        .context("streaming bundle zip to stdout")?;
+                    stdout.flush().context("flushing stdout")?;
+                }
+            }
+            Ok(())
+        })
+        .await
+        .context("zip task panicked")??;
+
+        if let Some(path) = &self.output {
+            eprintln!("Wrote bundle to {path}");
+        } else {
+            eprintln!("Bundle streamed to stdout");
+        }
+        eprintln!("{} steps executed:", report.steps.len());
+        for step in &report.steps {
+            let dur = step.end - step.start;
+            eprintln!(
+                " {:>9}ms {:?} {}",
+                dur.num_milliseconds(),
+                step.status,
+                step.name,
+            );
+        }
+        if let Some(ereports) = &report.ereports {
+            eprintln!(
+                "ereports: {} found, {} collected, {} errors",
+                ereports.n_found,
+                ereports.n_collected,
+                ereports.errors.len(),
+            );
+        }
+        Ok(())
+    }
+}
diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs
index 92c6e4e186d..3af437f65d1 100644
--- a/dev-tools/omdb/tests/test_all_output.rs
+++ b/dev-tools/omdb/tests/test_all_output.rs
@@ -428,6 +428,98 @@ async fn test_omdb_success_cases() {
     );
     assert!(!parsed.collections.is_empty());
 
+    // Exercise `omdb support-bundle collect` end-to-end. We don't add this
+    // to the `successes.out` snapshot because the output includes a
+    // randomly-generated bundle UUID, timing-dependent step durations,
+    // and per-sled step names that would all need redaction. Instead we
+    // run the command and verify the resulting zip is well-formed and
+    // contains the expected metadata files.
+ let bundle_path = tmpdir.path().join("bundle.zip"); + let bundle_args: &[&str] = &[ + "-w", + "support-bundle", + "collect", + "--output", + bundle_path.as_str(), + "--tempdir", + tmpdir.path().as_str(), + "--reason", + "integration test", + ]; + let mut bundle_output = String::new(); + let p = postgres_url.clone(); + let dns = cptestctx.internal_dns.dns_server.local_address().to_string(); + do_run_no_redactions( + &mut bundle_output, + move |exec| exec.env("OMDB_DB_URL", &p).env("OMDB_DNS_SERVER", &dns), + &cmd_path, + bundle_args, + ) + .await; + let zip_file = std::fs::File::open(&bundle_path).unwrap_or_else(|err| { + panic!( + "bundle zip not produced at {bundle_path}: {}\n\ + omdb output was:\n{bundle_output}", + InlineErrorChain::new(&err), + ) + }); + let mut archive = + zip::ZipArchive::new(zip_file).expect("bundle is a valid zip archive"); + for required in + ["bundle_id.txt", "meta/reason_for_creation.txt", "meta/trace.json"] + { + assert!( + archive.by_name(required).is_ok(), + "bundle zip is missing expected entry {required}", + ); + } + + // Now exercise the stdout-streaming path: omit `--output` and + // capture stdout as bytes. Verifies the data-descriptor zip variant + // produced by `bundle_to_stream` is well-formed and contains the + // expected metadata. 
+ let stdout_path = tmpdir.path().join("bundle-stdout.zip"); + let stdout_file = + std::fs::File::create(&stdout_path).expect("create stdout capture"); + let cmd_path_owned = cmd_path.to_path_buf(); + let p = postgres_url.clone(); + let dns = cptestctx.internal_dns.dns_server.local_address().to_string(); + let stream_tempdir = tmpdir.path().to_owned(); + let exit_status = tokio::task::spawn_blocking(move || { + Exec::cmd(&cmd_path_owned) + .env("OMDB_DB_URL", &p) + .env("OMDB_DNS_SERVER", &dns) + .env("RUST_BACKTRACE", "1") + .env("RUST_LIB_BACKTRACE", "0") + .args(&[ + "-w", + "support-bundle", + "collect", + "--tempdir", + stream_tempdir.as_str(), + "--reason", + "integration test (stdout)", + ]) + .stdout(subprocess::Redirection::File(stdout_file)) + .join() + .expect("running omdb to stream a bundle") + }) + .await + .expect("spawn_blocking"); + assert!(exit_status.success(), "stdout streaming failed: {exit_status:?}"); + let zip_file = std::fs::File::open(&stdout_path) + .expect("captured stdout file should exist"); + let mut archive = zip::ZipArchive::new(zip_file) + .expect("streamed bundle is a valid zip archive"); + for required in + ["bundle_id.txt", "meta/reason_for_creation.txt", "meta/trace.json"] + { + assert!( + archive.by_name(required).is_ok(), + "streamed bundle zip is missing expected entry {required}", + ); + } + let ox_invocation = &["oximeter", "list-producers"]; let mut ox_output = String::new(); let ox = ox_url.clone(); diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index b439a8d3cb7..3059c4b8031 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -19,6 +19,7 @@ Commands: oxql Enter the Oximeter Query Language shell for interactive querying reconfigurator Interact with the Reconfigurator system sled-agent Debug a specific Sled + support-bundle Collect or inspect a support bundle timesync Monitor time synchronization help Print this message or the help 
of the given subcommand(s) @@ -55,6 +56,7 @@ Commands: oxql Enter the Oximeter Query Language shell for interactive querying reconfigurator Interact with the Reconfigurator system sled-agent Debug a specific Sled + support-bundle Collect or inspect a support bundle timesync Monitor time synchronization help Print this message or the help of the given subcommand(s) diff --git a/nexus/types/src/support_bundle.rs b/nexus/types/src/support_bundle.rs index 4c95dcb3233..c35b3032c24 100644 --- a/nexus/types/src/support_bundle.rs +++ b/nexus/types/src/support_bundle.rs @@ -16,7 +16,17 @@ use std::collections::HashSet; use std::fmt; /// Describes the category of support bundle data. -#[derive(Debug, Clone, Copy, Hash, Eq, PartialEq, Serialize, Deserialize)] +#[derive( + Debug, + Clone, + Copy, + Hash, + Eq, + PartialEq, + Serialize, + Deserialize, + clap::ValueEnum, +)] #[cfg_attr(test, derive(test_strategy::Arbitrary))] pub enum BundleDataCategory { /// Collects reconfigurator state (some of the latest blueprints, diff --git a/support-bundle-collection/src/zip.rs b/support-bundle-collection/src/zip.rs index b0065e8b326..d8fd4d942c0 100644 --- a/support-bundle-collection/src/zip.rs +++ b/support-bundle-collection/src/zip.rs @@ -2,11 +2,19 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Helpers for converting a collected bundle directory into a zipfile. +//! Helpers for converting a collected bundle directory into a zip archive. //! -//! These are used by callers that need to produce a single archive from -//! the directory of collected data — both Nexus (for storing on a sled -//! agent) and omdb (for writing to local storage). +//! Three entry points: +//! +//! - [`bundle_to_writer`] writes a standard zip into any `Write + Seek` +//! sink. Used by omdb when `--output` is a regular file. +//! - [`bundle_to_stream`] writes a zip with data descriptors into a +//! non-seekable sink. 
Used by omdb when streaming the bundle to stdout
+//!   (e.g. to pipe over ssh from a switch zone).
+//! - [`bundle_to_zipfile`] is a thin convenience that allocates a tempfile
+//!   and delegates to [`bundle_to_writer`]. Retained for Nexus's
+//!   chunked-upload path, which needs an owned seekable `File` for
+//!   hashing + per-chunk `try_clone` / `seek`.
 
 use ::zip::ZipWriter;
 use ::zip::write::FullFileOptions;
@@ -15,23 +23,46 @@ use camino::Utf8DirEntry;
 use camino::Utf8Path;
 use camino_tempfile::Utf8TempDir;
 use camino_tempfile::tempfile_in;
+use std::io::Write;
+
+/// Write a bundle zip into a seekable destination. Produces a standard
+/// zip (no data descriptors).
+pub fn bundle_to_writer(
+    dir: &Utf8TempDir,
+    writer: W,
+) -> Result<()> {
+    write_zip(dir, ZipWriter::new(writer))
+}
+
+/// Write a bundle zip into a non-seekable destination. The resulting
+/// archive uses zip data descriptors (~16 bytes of overhead per entry)
+/// and is readable by any standard unzip tool.
+pub fn bundle_to_stream(dir: &Utf8TempDir, writer: W) -> Result<()> {
+    write_zip(dir, ZipWriter::new_stream(writer))
+}
 
-/// Takes the contents of `dir`, and zips them into a single zipfile
-/// stored as a tempfile under `tempdir`.
+/// Zip the contents of `dir` into a tempfile under `tempdir` and return
+/// the owned file handle. Used by Nexus's chunked-upload path.
 pub fn bundle_to_zipfile(
     dir: &Utf8TempDir,
     tempdir: &Utf8Path,
 ) -> Result {
-    let tempfile = tempfile_in(tempdir)?;
-    let mut zip = ZipWriter::new(tempfile);
+    let mut tempfile = tempfile_in(tempdir)?;
+    bundle_to_writer(dir, &mut tempfile)?;
+    Ok(tempfile)
+}
 
+/// Adds everything under `dir` to `zip` via
+/// `recursively_add_directory_to_zipfile`, then finalizes the archive
+/// with `finish()`. Shared by the seekable and the streaming entry
+/// points, which differ only in how they construct the `ZipWriter`.
+fn write_zip(
+    dir: &Utf8TempDir,
+    mut zip: ZipWriter,
+) -> Result<()> {
     recursively_add_directory_to_zipfile(&mut zip, dir.path(), dir.path())?;
-
-    Ok(zip.finish()?)
+    zip.finish()?;
+    Ok(())
 }
 
-fn recursively_add_directory_to_zipfile(
-    zip: &mut ZipWriter,
+fn recursively_add_directory_to_zipfile(
+    zip: &mut ZipWriter,
     root_path: &Utf8Path,
     dir_path: &Utf8Path,
 ) -> Result<()> {
@@ -85,24 +116,21 @@ mod test {
     use super::*;
 
     use camino_tempfile::tempdir;
+    use std::io::Cursor;
 
-    // Ensure that we can convert a temporary directory into a zipfile
-    #[test]
-    fn test_zipfile_creation() {
+    // Builds a small on-disk bundle fixture shared by the tests below:
+    // `dir-a/file-a`, an empty `dir-b/`, and a top-level `file-b`.
+    fn make_sample_bundle() -> Utf8TempDir {
         let dir = tempdir().unwrap();
-        let tempdir_for_zip = tempdir().unwrap();
-
         std::fs::create_dir_all(dir.path().join("dir-a")).unwrap();
         std::fs::create_dir_all(dir.path().join("dir-b")).unwrap();
         std::fs::write(dir.path().join("dir-a").join("file-a"), "some data")
             .unwrap();
         std::fs::write(dir.path().join("file-b"), "more data").unwrap();
+        dir
+    }
 
-        let zipfile = bundle_to_zipfile(&dir, tempdir_for_zip.path())
-            .expect("Should have been able to bundle zipfile");
-        let archive = ::zip::read::ZipArchive::new(zipfile).unwrap();
-
-        // We expect the order to be deterministically alphabetical
+    // Checks the archive's entry names one by one, in order. We expect
+    // the order to be deterministically alphabetical.
+    fn assert_expected_entries(
+        archive: ::zip::read::ZipArchive,
+    ) {
         let mut names = archive.file_names();
         assert_eq!(names.next(), Some("dir-a/"));
         assert_eq!(names.next(), Some("dir-a/file-a"));
@@ -110,4 +138,37 @@ mod test {
         assert_eq!(names.next(), Some("file-b"));
         assert_eq!(names.next(), None);
     }
+
+    // Ensure that bundle_to_writer produces a deterministically-ordered
+    // archive when given a seekable destination.
+    #[test]
+    fn test_bundle_to_writer() {
+        let dir = make_sample_bundle();
+        let mut buf = Cursor::new(Vec::new());
+        bundle_to_writer(&dir, &mut buf).unwrap();
+        let archive = ::zip::read::ZipArchive::new(buf).unwrap();
+        assert_expected_entries(archive);
+    }
+
+    // Ensure that bundle_to_stream produces the same archive contents
+    // when given a non-seekable destination (using data descriptors).
+    #[test]
+    fn test_bundle_to_stream() {
+        let dir = make_sample_bundle();
+        let mut buf: Vec = Vec::new();
+        bundle_to_stream(&dir, &mut buf).unwrap();
+        let archive = ::zip::read::ZipArchive::new(Cursor::new(buf)).unwrap();
+        assert_expected_entries(archive);
+    }
+
+    // Ensure that the tempfile-returning convenience still works for the
+    // Nexus chunked-upload path.
+    #[test]
+    fn test_bundle_to_zipfile() {
+        let dir = make_sample_bundle();
+        let tempdir_for_zip = tempdir().unwrap();
+        let zipfile = bundle_to_zipfile(&dir, tempdir_for_zip.path()).unwrap();
+        let archive = ::zip::read::ZipArchive::new(zipfile).unwrap();
+        assert_expected_entries(archive);
+    }
 }