diff --git a/Cargo.lock b/Cargo.lock index f61caae2532..47ac15b35b9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8765,6 +8765,7 @@ dependencies = [ "steno", "strum 0.27.2", "subprocess", + "support-bundle-collection", "swrite", "tempfile", "term 0.7.0", @@ -8874,6 +8875,7 @@ dependencies = [ "nexus-db-schema", "nexus-inventory", "nexus-lockstep-client", + "nexus-networking", "nexus-reconfigurator-preparation", "nexus-saga-recovery", "nexus-test-utils", @@ -8906,6 +8908,7 @@ dependencies = [ "steno", "strum 0.27.2", "subprocess", + "support-bundle-collection", "support-bundle-viewer", "supports-color 3.0.2", "tabled 0.15.0", @@ -8918,6 +8921,7 @@ dependencies = [ "url", "uuid", "vergen-gitcl", + "zip 4.6.1", ] [[package]] @@ -14496,6 +14500,46 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" +[[package]] +name = "support-bundle-collection" +version = "0.1.0" +dependencies = [ + "anyhow", + "base64 0.22.1", + "camino", + "camino-tempfile", + "chrono", + "dropshot 0.17.0", + "futures", + "gateway-client", + "gateway-types", + "internal-dns-resolver", + "internal-dns-types", + "jiff", + "nexus-db-model", + "nexus-db-queries", + "nexus-networking", + "nexus-reconfigurator-preparation", + "nexus-types", + "omicron-common", + "omicron-rpaths", + "omicron-uuid-kinds", + "omicron-workspace-hack", + "parallel-task-set", + "pq-sys", + "serde", + "serde_json", + "sha2", + "sled-agent-client", + "slog", + "slog-error-chain", + "tokio", + "tokio-util", + "tufaceous-artifact", + "uuid", + "zip 4.6.1", +] + [[package]] name = "support-bundle-viewer" version = "0.1.2" diff --git a/Cargo.toml b/Cargo.toml index 3e5adc43e60..daa23af3dd2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -161,6 +161,7 @@ members = [ "sled-storage", "sled-storage/zfs-test-harness", "sp-sim", + "support-bundle-collection", "test-utils", "trust-quorum", "trust-quorum/gfss", @@ -347,6 +348,7 @@ default-members = [ "sled-storage", "sled-storage/zfs-test-harness", "sp-sim", + "support-bundle-collection", "trust-quorum", "trust-quorum/gfss", "trust-quorum/protocol", @@ -823,6 +825,7 @@ strum = { version = "0.27.2", features = [ "derive" ] } subprocess = "0.2.9" subtle = "2.6.1" supports-color = "3.0.2" +support-bundle-collection = { path = "support-bundle-collection" } support-bundle-viewer = "0.1.2" swrite = "0.1.0" sync-ptr = "0.1.4" diff --git a/dev-tools/ls-apis/tests/api_dependencies.out b/dev-tools/ls-apis/tests/api_dependencies.out index a6cd556f371..a876ebff042 100644 --- a/dev-tools/ls-apis/tests/api_dependencies.out +++ b/dev-tools/ls-apis/tests/api_dependencies.out @@ -49,7 +49,7 @@ Management Gateway Service (client: gateway-client) consumed by: dpd (dendrite/dpd) via 1 path consumed by: lldpd (lldp/lldpd) via 1 path consumed by: mgd (maghemite/mgd) via 1 path - consumed by: omicron-nexus (omicron/nexus) via 5 paths + consumed by: omicron-nexus (omicron/nexus) via 6 paths consumed by: omicron-sled-agent (omicron/sled-agent) via 1 path consumed by: wicketd (omicron/wicketd) via 3 paths @@ -96,7 +96,7 @@ Repo Depot API (client: repo-depot-client) consumed by: omicron-sled-agent (omicron/sled-agent) via 1 path Sled Agent (client: sled-agent-client) - consumed by: omicron-nexus (omicron/nexus) via 8 paths + consumed by: omicron-nexus (omicron/nexus) via 9 paths Wicketd (client: wicketd-client) diff --git a/dev-tools/omdb/Cargo.toml b/dev-tools/omdb/Cargo.toml index 27de5c27917..3ecacb60934 100644 --- a/dev-tools/omdb/Cargo.toml +++ b/dev-tools/omdb/Cargo.toml @@ -19,6 +19,7 @@ base64.workspace = true bootstrap-agent-lockstep-client.workspace = true bytes.workspace = true camino.workspace = true +camino-tempfile.workspace = true chrono.workspace = true clap.workspace = true clickhouse-admin-single-client.workspace = true @@ -54,6 +55,7 @@ nexus-db-queries.workspace = true nexus-db-schema.workspace = true nexus-inventory.workspace = true nexus-lockstep-client.workspace = true +nexus-networking.workspace = true nexus-reconfigurator-preparation.workspace = true nexus-saga-recovery.workspace = true nexus-types.workspace = true @@ -83,6 +85,7 @@ slog.workspace = true slog-error-chain.workspace = true steno.workspace = true strum.workspace = true +support-bundle-collection.workspace = true support-bundle-viewer.workspace = true supports-color.workspace = true tabled.workspace = true @@ -104,6 +107,7 @@ nexus-test-utils-macros.workspace = true omicron-nexus.workspace = true omicron-test-utils.workspace = true subprocess.workspace = true +zip.workspace = true # Disable doc builds by default for our binaries to work around issue # rust-lang/cargo#8373. These docs would not be very useful anyway. diff --git a/dev-tools/omdb/src/bin/omdb/main.rs b/dev-tools/omdb/src/bin/omdb/main.rs index 8d3524e96a7..ca8df0783ee 100644 --- a/dev-tools/omdb/src/bin/omdb/main.rs +++ b/dev-tools/omdb/src/bin/omdb/main.rs @@ -58,6 +58,7 @@ mod oxql; mod reconfigurator; mod sled_agent; mod support_bundle; +mod support_bundle_collect; fn main() -> Result<(), anyhow::Error> { sigpipe::reset(); @@ -83,6 +84,7 @@ async fn main_impl() -> Result<(), anyhow::Error> { reconfig.run_cmd(&args, &log).await } OmdbCommands::SledAgent(sled) => sled.run_cmd(&args, &log).await, + OmdbCommands::SupportBundle(sb) => sb.run_cmd(&args, &log).await, OmdbCommands::CrucibleAgent(crucible) => crucible.run_cmd(&args).await, OmdbCommands::CruciblePantry(crucible) => crucible.run_cmd(&args).await, OmdbCommands::ClickhouseAdmin(ch) => ch.run_cmd(&args, &log).await, @@ -297,6 +299,8 @@ enum OmdbCommands { Reconfigurator(reconfigurator::ReconfiguratorArgs), /// Debug a specific Sled SledAgent(sled_agent::SledAgentArgs), + /// Collect or inspect a support bundle + SupportBundle(support_bundle_collect::SupportBundleArgs), } fn parse_dropshot_log_level( diff --git a/dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs b/dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs new file mode 100644 index 00000000000..8b527b84023 --- /dev/null +++ b/dev-tools/omdb/src/bin/omdb/support_bundle_collect.rs @@ -0,0 +1,247 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! `omdb support-bundle collect` — collect a support bundle locally, +//! without going through Nexus. +//! +//! Unlike the Nexus background task, this path: +//! +//! - Does not register a row in the `support_bundle` table. +//! - Does not transfer the resulting bundle to a sled-agent for durable +//! storage. The zip is written to a local file path. +//! - Does not require Nexus to be up. It only needs CRDB, internal +//! DNS, MGS, and the rack's sled-agents reachable on the underlay. +//! +//! This is intended for incident response, where the operator may need +//! to collect a bundle precisely because Nexus is unhealthy. + +use crate::Omdb; +use crate::db::DbUrlOptions; +use anyhow::Context; +use camino::Utf8PathBuf; +use camino_tempfile::tempdir_in; +use clap::Args; +use clap::Subcommand; +use clap::ValueEnum; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_types::fm::ereport::EreportFilters; +use nexus_types::support_bundle::BundleDataSelection; +use nexus_types::support_bundle::BundleTimeRange; +use omicron_uuid_kinds::SupportBundleUuid; +use std::io::Seek; +use std::io::SeekFrom; +use std::sync::Arc; +use std::time::Duration; +use support_bundle_collection::BundleCollection; +use support_bundle_collection::BundleInfo; +use support_bundle_collection::zip::bundle_to_zipfile; + +/// Categories of data the bundle collector knows how to gather. +/// +/// Mirrors `nexus_types::support_bundle::BundleDataCategory`, but is +/// declared here so it can derive `clap::ValueEnum` without making +/// `nexus-types` depend on clap. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, ValueEnum)] +enum BundleCategory { + Reconfigurator, + HostInfo, + SledCubbyInfo, + SpDumps, + Ereports, +} + +/// Arguments to the "omdb support-bundle" subcommand +#[derive(Debug, Args)] +pub struct SupportBundleArgs { + #[command(subcommand)] + command: SupportBundleCommands, +} + +#[derive(Debug, Subcommand)] +enum SupportBundleCommands { + /// Collect a support bundle without involving Nexus. + /// + /// Connects directly to CockroachDB, internal DNS, MGS, and the + /// rack's sled-agents — none of which depend on Nexus being up. + /// The bundle is written to a local zip file. No row is created + /// in the `support_bundle` table. + Collect(CollectArgs), +} + +#[derive(Debug, Args)] +struct CollectArgs { + #[command(flatten)] + db_url_opts: DbUrlOptions, + + /// Path where the resulting bundle zip will be written. + #[clap(long, short = 'o')] + output: Utf8PathBuf, + + /// Reason recorded inside the bundle's metadata. + #[clap(long, default_value = "collected via omdb")] + reason: String, + + /// Directory to use for staging the bundle contents before zipping. + #[clap(long, default_value = "/var/tmp")] + tempdir: Utf8PathBuf, + + /// Categories of data to collect. May be supplied multiple times. + /// Defaults to all categories. + #[clap(long, value_enum)] + include: Vec, + + /// Collect data newer than this duration ago. Applies to both + /// service logs and ereports. Defaults to 7 days when `--since` + /// is not set. + #[clap(long, value_parser = humantime::parse_duration)] + since: Option, + + /// Collect data older than this duration ago. Truncates the + /// upper bound of the window. Applies to both service logs and + /// ereports. Defaults to no upper bound (file mtimes can't be + /// in the future, so this effectively means "up to now"). + #[clap(long, value_parser = humantime::parse_duration)] + until: Option, +} + +impl CollectArgs { + fn data_selection(&self) -> BundleDataSelection { + let categories: &[BundleCategory] = if self.include.is_empty() { + BundleCategory::value_variants() + } else { + self.include.as_slice() + }; + + let mut sel = BundleDataSelection::new(); + for category in categories { + sel = match category { + BundleCategory::Reconfigurator => sel.with_reconfigurator(), + BundleCategory::HostInfo => sel.with_all_sleds(), + BundleCategory::SledCubbyInfo => sel.with_sled_cubby_info(), + BundleCategory::SpDumps => sel.with_sp_dumps(), + BundleCategory::Ereports => { + sel.with_ereports(EreportFilters::new()) + } + }; + } + + // Apply a bundle-wide time range. Each flag's default fires + // independently: `--since` defaults to 7 days, `--until` + // defaults to no upper bound. + let now = omicron_common::now_db_precision(); + let range = BundleTimeRange { + start: self + .since + .and_then(|d| chrono::Duration::from_std(d).ok()) + .map(|d| now - d) + .or(Some(now - chrono::Days::new(7))), + end: self + .until + .and_then(|d| chrono::Duration::from_std(d).ok()) + .map(|d| now - d), + }; + sel.with_time_range(range) + } +} + +impl SupportBundleArgs { + pub async fn run_cmd( + &self, + omdb: &Omdb, + log: &slog::Logger, + ) -> anyhow::Result<()> { + match &self.command { + SupportBundleCommands::Collect(args) => args.run(omdb, log).await, + } + } +} + +impl CollectArgs { + async fn run(&self, omdb: &Omdb, log: &slog::Logger) -> anyhow::Result<()> { + self.db_url_opts + .with_datastore(omdb, log, async |opctx, datastore| { + self.collect(omdb, log, opctx, datastore).await + }) + .await + } + + async fn collect( + &self, + omdb: &Omdb, + log: &slog::Logger, + opctx: OpContext, + datastore: Arc, + ) -> anyhow::Result<()> { + let resolver = omdb.dns_resolver(log.clone()).await?; + + let bundle = BundleInfo { + id: SupportBundleUuid::new_v4(), + reason_for_creation: self.reason.clone(), + }; + let bundle_log = log.new(slog::o!("bundle" => bundle.id.to_string())); + eprintln!("Collecting support bundle {}", bundle.id); + + let collection = Arc::new(BundleCollection::new( + datastore, + resolver, + bundle_log, + opctx, + self.data_selection(), + bundle, + )); + + // Wire Ctrl-C to cancel the in-flight collection. + let cancel_handle = tokio::spawn({ + let token = collection.cancellation_token().clone(); + async move { + let _ = tokio::signal::ctrl_c().await; + eprintln!("\nCtrl-C received — cancelling bundle collection."); + token.cancel(); + } + }); + + let dir = tempdir_in(&self.tempdir).with_context(|| { + format!("creating temp dir under {}", self.tempdir) + })?; + let collect_result = collection.collect_bundle_locally(&dir).await; + cancel_handle.abort(); + let _ = cancel_handle.await; + let report = collect_result?; + + let zip_tempdir = self.tempdir.clone(); + let output = self.output.clone(); + tokio::task::spawn_blocking(move || -> anyhow::Result<()> { + let mut tempfile = bundle_to_zipfile(&dir, &zip_tempdir)?; + tempfile.seek(SeekFrom::Start(0))?; + let mut out = std::fs::File::create(&output) + .with_context(|| format!("creating {output}"))?; + std::io::copy(&mut tempfile, &mut out)?; + Ok(()) + }) + .await + .context("zip task panicked")??; + + eprintln!("Wrote bundle to {}", self.output); + eprintln!("{} steps executed:", report.steps.len()); + for step in &report.steps { + let dur = step.end - step.start; + eprintln!( + " {:>9}ms {:?} {}", + dur.num_milliseconds(), + step.status, + step.name, + ); + } + if let Some(ereports) = &report.ereports { + eprintln!( + "ereports: {} found, {} collected, {} errors", + ereports.n_found, + ereports.n_collected, + ereports.errors.len(), + ); + } + Ok(()) + } +} diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 8662889facf..fcdc1a6de67 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -747,23 +747,25 @@ task: "fm_sitrep_gc" configured period: every s last completed activation: , triggered by started at (s ago) and ran for ms - batch size: 1000 - orphaned sitreps deleted: 0 - batches: 1 - orphaned fm_alert_request rows deleted: 0 - batches: 1 - orphaned fm_case rows deleted: 0 - batches: 1 - orphaned fm_ereport_in_case rows deleted: 0 - batches: 1 - orphaned fm_support_bundle_request rows deleted: 0 - batches: 1 - orphaned fm_support_bundle_request_data_selection_ereports rows deleted: 0 - batches: 1 - orphaned fm_support_bundle_request_data_selection_flags rows deleted: 0 - batches: 1 - orphaned fm_support_bundle_request_data_selection_host_info rows deleted: 0 - batches: 1 + batch size: 1000 + orphaned sitreps deleted: 0 + batches: 1 + orphaned fm_alert_request rows deleted: 0 + batches: 1 + orphaned fm_case rows deleted: 0 + batches: 1 + orphaned fm_ereport_in_case rows deleted: 0 + batches: 1 + orphaned fm_support_bundle_request rows deleted: 0 + batches: 1 + orphaned fm_support_bundle_request_data_selection_ereports rows deleted: 0 + batches: 1 + orphaned fm_support_bundle_request_data_selection_flags rows deleted: 0 + batches: 1 + orphaned fm_support_bundle_request_data_selection_host_info rows deleted: 0 + batches: 1 + orphaned fm_support_bundle_request_data_selection_time_range rows deleted: 0 + batches: 1 task: "fm_sitrep_loader" configured period: every s @@ -1428,23 +1430,25 @@ task: "fm_sitrep_gc" configured period: every s last completed activation: , triggered by started at (s ago) and ran for ms - batch size: 1000 - orphaned sitreps deleted: 0 - batches: 1 - orphaned fm_alert_request rows deleted: 0 - batches: 1 - orphaned fm_case rows deleted: 0 - batches: 1 - orphaned fm_ereport_in_case rows deleted: 0 - batches: 1 - orphaned fm_support_bundle_request rows deleted: 0 - batches: 1 - orphaned fm_support_bundle_request_data_selection_ereports rows deleted: 0 - batches: 1 - orphaned fm_support_bundle_request_data_selection_flags rows deleted: 0 - batches: 1 - orphaned fm_support_bundle_request_data_selection_host_info rows deleted: 0 - batches: 1 + batch size: 1000 + orphaned sitreps deleted: 0 + batches: 1 + orphaned fm_alert_request rows deleted: 0 + batches: 1 + orphaned fm_case rows deleted: 0 + batches: 1 + orphaned fm_ereport_in_case rows deleted: 0 + batches: 1 + orphaned fm_support_bundle_request rows deleted: 0 + batches: 1 + orphaned fm_support_bundle_request_data_selection_ereports rows deleted: 0 + batches: 1 + orphaned fm_support_bundle_request_data_selection_flags rows deleted: 0 + batches: 1 + orphaned fm_support_bundle_request_data_selection_host_info rows deleted: 0 + batches: 1 + orphaned fm_support_bundle_request_data_selection_time_range rows deleted: 0 + batches: 1 task: "fm_sitrep_loader" configured period: every s diff --git a/dev-tools/omdb/tests/test_all_output.rs b/dev-tools/omdb/tests/test_all_output.rs index b787dceefd9..0dd1b55843f 100644 --- a/dev-tools/omdb/tests/test_all_output.rs +++ b/dev-tools/omdb/tests/test_all_output.rs @@ -436,6 +436,51 @@ async fn test_omdb_success_cases() { ); assert!(!parsed.collections.is_empty()); + // Exercise `omdb support-bundle collect` end-to-end. We don't add this + // to the `successes.out` snapshot because the output includes a + // randomly-generated bundle UUID, timing-dependent step durations, + // and per-sled step names that would all need redaction. Instead we + // run the command and verify the resulting zip is well-formed and + // contains the expected metadata files. + let bundle_path = tmpdir.path().join("bundle.zip"); + let bundle_args: &[&str] = &[ + "support-bundle", + "collect", + "--output", + bundle_path.as_str(), + "--tempdir", + tmpdir.path().as_str(), + "--reason", + "integration test", + ]; + let mut bundle_output = String::new(); + let p = postgres_url.clone(); + let dns = cptestctx.internal_dns.dns_server.local_address().to_string(); + do_run_no_redactions( + &mut bundle_output, + move |exec| exec.env("OMDB_DB_URL", &p).env("OMDB_DNS_SERVER", &dns), + &cmd_path, + bundle_args, + ) + .await; + let zip_file = std::fs::File::open(&bundle_path).unwrap_or_else(|err| { + panic!( + "bundle zip not produced at {bundle_path}: {}\n\ + omdb output was:\n{bundle_output}", + InlineErrorChain::new(&err), + ) + }); + let mut archive = + zip::ZipArchive::new(zip_file).expect("bundle is a valid zip archive"); + for required in + ["bundle_id.txt", "meta/reason_for_creation.txt", "meta/trace.json"] + { + assert!( + archive.by_name(required).is_ok(), + "bundle zip is missing expected entry {required}", + ); + } + let ox_invocation = &["oximeter", "list-producers"]; let mut ox_output = String::new(); let ox = ox_url.clone(); diff --git a/dev-tools/omdb/tests/usage_errors.out b/dev-tools/omdb/tests/usage_errors.out index b940a1f991f..db57bc2686c 100644 --- a/dev-tools/omdb/tests/usage_errors.out +++ b/dev-tools/omdb/tests/usage_errors.out @@ -19,6 +19,7 @@ Commands: oxql Enter the Oximeter Query Language shell for interactive querying reconfigurator Interact with the Reconfigurator system sled-agent Debug a specific Sled + support-bundle Collect or inspect a support bundle help Print this message or the help of the given subcommand(s) Options: @@ -54,6 +55,7 @@ Commands: oxql Enter the Oximeter Query Language shell for interactive querying reconfigurator Interact with the Reconfigurator system sled-agent Debug a specific Sled + support-bundle Collect or inspect a support bundle help Print this message or the help of the given subcommand(s) Options: diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 0a24e5eb3e2..18ffa16c8e4 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -159,6 +159,7 @@ raw-cpuid = { workspace = true, features = ["std"] } rustls = { workspace = true } rustls-pemfile = { workspace = true } scim2-rs.workspace = true +support-bundle-collection.workspace = true update-common.workspace = true update-engine.workspace = true omicron-workspace-hack.workspace = true diff --git a/nexus/db-model/src/fm/support_bundle_request.rs b/nexus/db-model/src/fm/support_bundle_request.rs index dc62f3e7653..9164fc06695 100644 --- a/nexus/db-model/src/fm/support_bundle_request.rs +++ b/nexus/db-model/src/fm/support_bundle_request.rs @@ -11,10 +11,11 @@ use nexus_db_schema::schema::{ fm_support_bundle_request_data_selection_ereports, fm_support_bundle_request_data_selection_flags, fm_support_bundle_request_data_selection_host_info, + fm_support_bundle_request_data_selection_time_range, }; use nexus_types::fm; -use nexus_types::fm::ereport::{EreportFilters, EreportFiltersParams}; -use nexus_types::support_bundle::{BundleData, SledSelection}; +use nexus_types::fm::ereport::EreportFilters; +use nexus_types::support_bundle::{BundleData, BundleTimeRange, SledSelection}; use omicron_uuid_kinds::{ CaseKind, GenericUuid, SitrepKind, SledUuid, SupportBundleKind, }; @@ -120,14 +121,14 @@ impl HostInfo { impl From for BundleData { fn from(row: HostInfo) -> Self { let HostInfo { sitrep_id: _, request_id: _, all_sleds, sled_ids } = row; - let selection = if all_sleds { + let sleds = if all_sleds { SledSelection::All } else { SledSelection::Specific( sled_ids.into_iter().map(SledUuid::from_untyped_uuid).collect(), ) }; - BundleData::HostInfo(selection) + BundleData::HostInfo(sleds) } } @@ -136,8 +137,6 @@ impl From for BundleData { pub struct Ereports { pub sitrep_id: DbTypedUuid, pub request_id: DbTypedUuid, - pub start_time: Option>, - pub end_time: Option>, pub only_serials: Vec, pub only_classes: Vec, } @@ -151,39 +150,64 @@ impl Ereports { Ereports { sitrep_id: sitrep_id.into(), request_id: request_id.into(), - start_time: filters.start_time(), - end_time: filters.end_time(), only_serials: filters.only_serials().to_vec(), only_classes: filters.only_classes().to_vec(), } } } -impl TryFrom for BundleData { - type Error = omicron_common::api::external::Error; - - fn try_from(row: Ereports) -> Result { +impl From for BundleData { + fn from(row: Ereports) -> Self { let Ereports { sitrep_id: _, request_id: _, - start_time, - end_time, only_serials, only_classes, } = row; - EreportFiltersParams { - start_time, - end_time, - only_serials, - only_classes, + BundleData::Ereports( + EreportFilters::new() + .with_serials(only_serials) + .with_classes(only_classes), + ) + } +} + +/// Bundle-wide time bound persisted alongside the per-category data +/// selection rows. Reads are merged into +/// [`nexus_types::support_bundle::BundleDataSelection::time_range`]. +#[derive(Queryable, Insertable, Clone, Debug, Selectable)] +#[diesel(table_name = fm_support_bundle_request_data_selection_time_range)] +pub struct TimeRange { + pub sitrep_id: DbTypedUuid, + pub request_id: DbTypedUuid, + pub start_time: Option>, + pub end_time: Option>, +} + +impl TimeRange { + pub fn from_sitrep( + sitrep_id: impl Into>, + request_id: impl Into>, + range: &BundleTimeRange, + ) -> Self { + TimeRange { + sitrep_id: sitrep_id.into(), + request_id: request_id.into(), + start_time: range.start, + end_time: range.end, } - .try_into() - .map(BundleData::Ereports) } } -/// Joined query result: flags + optional host_info + optional ereports. -/// All fields use `#[diesel(embed)]` so no `table_name` is needed. +impl From for BundleTimeRange { + fn from(row: TimeRange) -> Self { + BundleTimeRange { start: row.start_time, end: row.end_time } + } +} + +/// Joined query result: flags + optional host_info + optional ereports + +/// optional bundle-wide time range. All fields use `#[diesel(embed)]` so +/// no `table_name` is needed. #[derive(Queryable, Selectable)] pub struct BundleDataSelection { #[diesel(embed)] @@ -192,14 +216,14 @@ pub struct BundleDataSelection { pub host_info: Option, #[diesel(embed)] pub ereports: Option, + #[diesel(embed)] + pub time_range: Option, } -impl TryFrom +impl From for nexus_types::support_bundle::BundleDataSelection { - type Error = omicron_common::api::external::Error; - - fn try_from(row: BundleDataSelection) -> Result { + fn from(row: BundleDataSelection) -> Self { let mut selection = nexus_types::support_bundle::BundleDataSelection::new(); if row.flags.include_reconfigurator { @@ -215,8 +239,9 @@ impl TryFrom selection.insert(host_info.into()); } if let Some(ereports) = row.ereports { - selection.insert(ereports.try_into()?); + selection.insert(ereports.into()); } - Ok(selection) + selection.set_time_range(row.time_range.map(BundleTimeRange::from)); + selection } } diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index e09c123b732..c17e078db4c 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(256, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(257, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ pub static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(257, "support-bundle-time-range"), KnownVersion::new(256, "bgp-unnumbered-peer-cleanup"), KnownVersion::new(255, "blueprint-add-external-networking-generation"), KnownVersion::new( diff --git a/nexus/db-model/src/support_bundle.rs b/nexus/db-model/src/support_bundle.rs index d1f65b5c2e2..70b7925a2b7 100644 --- a/nexus/db-model/src/support_bundle.rs +++ b/nexus/db-model/src/support_bundle.rs @@ -9,13 +9,15 @@ use nexus_db_schema::schema::{ support_bundle_data_selection_ereports, support_bundle_data_selection_flags, support_bundle_data_selection_host_info, + support_bundle_data_selection_time_range, }; use chrono::{DateTime, Utc}; use nexus_types::external_api::support_bundle as support_bundle_types; -use nexus_types::fm::ereport::{EreportFilters, EreportFiltersParams}; +use nexus_types::fm::ereport::EreportFilters; use nexus_types::internal_api::views as internal_views; use nexus_types::support_bundle::BundleData; +use nexus_types::support_bundle::BundleTimeRange; use nexus_types::support_bundle::SledSelection; use omicron_uuid_kinds::CaseKind; use omicron_uuid_kinds::CaseUuid; @@ -201,14 +203,14 @@ impl HostInfo { impl From for BundleData { fn from(row: HostInfo) -> Self { let HostInfo { bundle_id: _, all_sleds, sled_ids } = row; - let selection = if all_sleds { + let sleds = if all_sleds { SledSelection::All } else { SledSelection::Specific( sled_ids.into_iter().map(SledUuid::from_untyped_uuid).collect(), ) }; - BundleData::HostInfo(selection) + BundleData::HostInfo(sleds) } } @@ -216,8 +218,6 @@ impl From for BundleData { #[diesel(table_name = support_bundle_data_selection_ereports)] pub struct Ereports { pub bundle_id: DbTypedUuid, - pub start_time: Option>, - pub end_time: Option>, pub only_serials: Vec, pub only_classes: Vec, } @@ -229,38 +229,56 @@ impl Ereports { ) -> Self { Ereports { bundle_id: bundle_id.into(), - start_time: filters.start_time(), - end_time: filters.end_time(), only_serials: filters.only_serials().to_vec(), only_classes: filters.only_classes().to_vec(), } } } -impl TryFrom for BundleData { - type Error = omicron_common::api::external::Error; +impl From for BundleData { + fn from(row: Ereports) -> Self { + let Ereports { bundle_id: _, only_serials, only_classes } = row; + BundleData::Ereports( + EreportFilters::new() + .with_serials(only_serials) + .with_classes(only_classes), + ) + } +} + +/// Bundle-wide time bound persisted alongside the per-category data +/// selection rows. Reads are merged into +/// [`nexus_types::support_bundle::BundleDataSelection::time_range`]. +#[derive(Queryable, Insertable, Clone, Debug, Selectable)] +#[diesel(table_name = support_bundle_data_selection_time_range)] +pub struct TimeRange { + pub bundle_id: DbTypedUuid, + pub start_time: Option>, + pub end_time: Option>, +} - fn try_from(row: Ereports) -> Result { - let Ereports { - bundle_id: _, - start_time, - end_time, - only_serials, - only_classes, - } = row; - EreportFiltersParams { - start_time, - end_time, - only_serials, - only_classes, +impl TimeRange { + pub fn new( + bundle_id: impl Into>, + range: &BundleTimeRange, + ) -> Self { + TimeRange { + bundle_id: bundle_id.into(), + start_time: range.start, + end_time: range.end, } - .try_into() - .map(BundleData::Ereports) } } -/// Joined query result: flags + optional host_info + optional ereports. -/// All fields use `#[diesel(embed)]` so no `table_name` is needed. +impl From for BundleTimeRange { + fn from(row: TimeRange) -> Self { + BundleTimeRange { start: row.start_time, end: row.end_time } + } +} + +/// Joined query result: flags + optional host_info + optional ereports + +/// optional bundle-wide time range. All fields use `#[diesel(embed)]` so +/// no `table_name` is needed. #[derive(Queryable, Selectable)] pub struct BundleDataSelection { #[diesel(embed)] @@ -269,14 +287,14 @@ pub struct BundleDataSelection { pub host_info: Option, #[diesel(embed)] pub ereports: Option, + #[diesel(embed)] + pub time_range: Option, } -impl TryFrom +impl From for nexus_types::support_bundle::BundleDataSelection { - type Error = omicron_common::api::external::Error; - - fn try_from(row: BundleDataSelection) -> Result { + fn from(row: BundleDataSelection) -> Self { let mut selection = nexus_types::support_bundle::BundleDataSelection::new(); if row.flags.include_reconfigurator { @@ -292,8 +310,9 @@ impl TryFrom selection.insert(host_info.into()); } if let Some(ereports) = row.ereports { - selection.insert(ereports.try_into()?); + selection.insert(ereports.into()); } - Ok(selection) + selection.set_time_range(row.time_range.map(BundleTimeRange::from)); + selection } } diff --git a/nexus/db-queries/src/db/datastore/ereport.rs b/nexus/db-queries/src/db/datastore/ereport.rs index 54aa6f6418d..147c5e13063 100644 --- a/nexus/db-queries/src/db/datastore/ereport.rs +++ b/nexus/db-queries/src/db/datastore/ereport.rs @@ -32,6 +32,7 @@ use nexus_db_schema::schema::ereport::dsl; use nexus_types::fm::ereport as fm; use nexus_types::fm::ereport::EreportFilters; use nexus_types::fm::ereport::EreportId; +use nexus_types::support_bundle::BundleTimeRange; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; @@ -96,11 +97,13 @@ impl DataStore { &self, opctx: &OpContext, filters: &EreportFilters, + time_range: Option<&BundleTimeRange>, pagparams: &DataPageParams<'_, (Uuid, DbEna)>, ) -> ListResultVec { opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; - let query = Self::ereport_fetch_matching_query(filters, pagparams); + let query = + Self::ereport_fetch_matching_query(filters, time_range, pagparams); query .load_async(&*self.pool_connection_authorized(opctx).await?) .await @@ -109,6 +112,7 @@ impl DataStore { fn ereport_fetch_matching_query( filters: &EreportFilters, + time_range: Option<&BundleTimeRange>, pagparams: &DataPageParams<'_, (Uuid, DbEna)>, ) -> impl RunnableQuery + use<> { let mut query = paginated_multicolumn( @@ -119,12 +123,13 @@ impl DataStore { .filter(dsl::time_deleted.is_null()) .select(Ereport::as_select()); - if let Some(start) = filters.start_time() { - query = query.filter(dsl::time_collected.ge(start)); - } - - if let Some(end) = filters.end_time() { - query = query.filter(dsl::time_collected.le(end)); + if let Some(range) = time_range { + if let Some(start) = range.start { + query = query.filter(dsl::time_collected.ge(start)); + } + if let Some(end) = range.end { + query = query.filter(dsl::time_collected.le(end)); + } } if !filters.only_serials().is_empty() { @@ -553,23 +558,20 @@ mod tests { #[tokio::test] async fn explain_ereport_fetch_matching_only_time() { - explain_fetch_matching_query( + explain_fetch_matching_query_with_time( "explain_ereport_fetch_matching_only_time", - EreportFilters::new() - .with_end_time(chrono::Utc::now()) - .expect("no start time set"), + EreportFilters::new(), + BundleTimeRange { start: None, end: Some(chrono::Utc::now()) }, ) .await } #[tokio::test] async fn explain_ereport_fetch_matching_time_and_serials() { - explain_fetch_matching_query( + explain_fetch_matching_query_with_time( "explain_ereport_fetch_matching_only_time", - EreportFilters::new() - .with_serials(["BRM6900420", "BRM5555555"]) - .with_end_time(chrono::Utc::now()) - .expect("no start time set"), + EreportFilters::new().with_serials(["BRM6900420", "BRM5555555"]), + BundleTimeRange { start: None, end: Some(chrono::Utc::now()) }, ) .await } @@ -577,6 +579,23 @@ mod tests { async fn explain_fetch_matching_query( test_name: &str, filters: EreportFilters, + ) { + explain_fetch_matching_query_inner(test_name, filters, None).await + } + + async fn explain_fetch_matching_query_with_time( + test_name: &str, + filters: EreportFilters, + time_range: BundleTimeRange, + ) { + explain_fetch_matching_query_inner(test_name, filters, Some(time_range)) + .await + } + + async fn explain_fetch_matching_query_inner( + test_name: &str, + filters: EreportFilters, + time_range: Option, ) { let logctx = dev::test_setup_log(test_name); let db = TestDatabase::new_with_pool(&logctx.log).await; @@ -590,8 +609,11 @@ mod tests { }; eprintln!("--- filters: {filters:#?}\n"); - let query = - DataStore::ereport_fetch_matching_query(&filters, &pagparams); + let query = DataStore::ereport_fetch_matching_query( + &filters, + time_range.as_ref(), + &pagparams, + ); let explanation = query .explain_async(&conn) @@ -795,19 +817,25 @@ mod tests { }; let found_default = datastore - .ereport_fetch_matching(opctx, &Default::default(), &pagparams) + .ereport_fetch_matching( + opctx, + &Default::default(), + None, + &pagparams, + ) .await .expect("fetch matching with default filters should succeed"); check_results(dbg!(found_default), &id, &ereport); + let time_range = BundleTimeRange { + start: Some(ereport.time_collected - Duration::from_secs(600)), + end: None, + }; let found_by_time_range = datastore .ereport_fetch_matching( opctx, - &EreportFilters::new() - .with_start_time( - ereport.time_collected - Duration::from_secs(600), - ) - .expect("no end time set"), + &EreportFilters::new(), + Some(&time_range), &pagparams, ) .await @@ -818,6 +846,7 @@ mod tests { .ereport_fetch_matching( opctx, &EreportFilters::new().with_serials(["my cool serial"]), + None, &pagparams, ) .await @@ -828,6 +857,7 @@ mod tests { .ereport_fetch_matching( opctx, &EreportFilters::new().with_classes(["my cool ereport"]), + None, &pagparams, ) .await diff --git a/nexus/db-queries/src/db/datastore/fm.rs b/nexus/db-queries/src/db/datastore/fm.rs index 1552d463dfb..a3ff3121fdc 100644 --- a/nexus/db-queries/src/db/datastore/fm.rs +++ b/nexus/db-queries/src/db/datastore/fm.rs @@ -119,6 +119,7 @@ sitrep_child_tables! { SupportBundleRequestDataSelectionFlags => { table: "fm_support_bundle_request_data_selection_flags" }, SupportBundleRequestDataSelectionHostInfo => { table: "fm_support_bundle_request_data_selection_host_info" }, SupportBundleRequestDataSelectionEreports => { table: "fm_support_bundle_request_data_selection_ereports" }, + SupportBundleRequestDataSelectionTimeRange => { table: "fm_support_bundle_request_data_selection_time_range" }, SupportBundleRequest => { table: "fm_support_bundle_request" }, Case => { table: "fm_case" }, } @@ -576,6 +577,7 @@ impl DataStore { use nexus_db_schema::schema::fm_support_bundle_request_data_selection_ereports::dsl as ereports_dsl; use nexus_db_schema::schema::fm_support_bundle_request_data_selection_flags::dsl as flags_dsl; use nexus_db_schema::schema::fm_support_bundle_request_data_selection_host_info::dsl as host_info_dsl; + use nexus_db_schema::schema::fm_support_bundle_request_data_selection_time_range::dsl as time_range_dsl; let sitrep_uuid = id.into_untyped_uuid(); let mut selections = @@ -600,6 +602,11 @@ impl DataStore { .on(ereports_dsl::sitrep_id.eq(flags_dsl::sitrep_id) .and(ereports_dsl::request_id.eq(flags_dsl::request_id))), ) + .left_join( + time_range_dsl::fm_support_bundle_request_data_selection_time_range + .on(time_range_dsl::sitrep_id.eq(flags_dsl::sitrep_id) + .and(time_range_dsl::request_id.eq(flags_dsl::request_id))), + ) .select(DbBundleDataSelection::as_select()) .load_async(conn) .await @@ -611,11 +618,7 @@ impl DataStore { paginator = p.found_batch(&batch, &|row| row.flags.request_id); for row in batch { let request_id: SupportBundleUuid = row.flags.request_id.into(); - let domain_selection: BundleDataSelection = - row.try_into().map_err(|e: Error| { - e.internal_context("failed to convert data selection") - })?; - selections.insert(request_id, domain_selection); + selections.insert(request_id, row.into()); } } @@ -847,11 +850,12 @@ impl DataStore { requests: Vec, data_selections: Vec<(SupportBundleUuid, BundleDataSelection)>, ) -> Result<(), InsertSitrepError> { - use model::fm::{DataSelectionFlags, Ereports, HostInfo}; + use model::fm::{DataSelectionFlags, Ereports, HostInfo, TimeRange}; use nexus_db_schema::schema::fm_support_bundle_request::dsl as support_bundle_req_dsl; use nexus_db_schema::schema::fm_support_bundle_request_data_selection_ereports::dsl as ereports_dsl; use nexus_db_schema::schema::fm_support_bundle_request_data_selection_flags::dsl as flags_dsl; use nexus_db_schema::schema::fm_support_bundle_request_data_selection_host_info::dsl as host_info_dsl; + use nexus_db_schema::schema::fm_support_bundle_request_data_selection_time_range::dsl as time_range_dsl; if !requests.is_empty() { diesel::insert_into( @@ -870,6 +874,7 @@ impl DataStore { let mut flags_rows = Vec::new(); let mut host_info_rows = Vec::new(); let mut ereports_rows = Vec::new(); + let mut time_range_rows = Vec::new(); for (req_id, data_selection) in data_selections { flags_rows.push(DataSelectionFlags::from_sitrep( @@ -877,6 +882,10 @@ impl DataStore { req_id, &data_selection, )); + if let Some(range) = data_selection.time_range() { + time_range_rows + .push(TimeRange::from_sitrep(sitrep_id, req_id, range)); + } for data in data_selection { match data { BundleData::Reconfigurator @@ -932,6 +941,20 @@ impl DataStore { .internal_context("failed to insert sb_req_ereports rows") })?; } + if !time_range_rows.is_empty() { + diesel::insert_into( + time_range_dsl::fm_support_bundle_request_data_selection_time_range, + ) + .values(time_range_rows) + .execute_async(conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context( + "failed to insert sb_req_time_range rows", + ) + })?; + } Ok(()) } @@ -1484,6 +1507,7 @@ impl DataStore { use nexus_db_schema::schema::fm_support_bundle_request_data_selection_ereports::dsl as ereports_dsl; use nexus_db_schema::schema::fm_support_bundle_request_data_selection_flags::dsl as flags_dsl; use nexus_db_schema::schema::fm_support_bundle_request_data_selection_host_info::dsl as host_info_dsl; + use nexus_db_schema::schema::fm_support_bundle_request_data_selection_time_range::dsl as time_range_dsl; diesel::delete( flags_dsl::fm_support_bundle_request_data_selection_flags @@ -1503,6 +1527,12 @@ impl DataStore { ) .execute_async(conn) .await?; + diesel::delete( + time_range_dsl::fm_support_bundle_request_data_selection_time_range + .filter(time_range_dsl::sitrep_id.eq_any(sitrep_ids.clone())), + ) + .execute_async(conn) + .await?; diesel::delete( support_bundle_req_dsl::fm_support_bundle_request .filter(support_bundle_req_dsl::sitrep_id.eq_any(sitrep_ids)), @@ -2213,7 +2243,7 @@ mod tests { }) .unwrap(); // A request with a nontrivial data_selection, including - // HostInfo(Specific) and ereport time filters. + // HostInfo(Specific) and a bundle-wide time range. { use nexus_types::fm::ereport::EreportFilters; use nexus_types::support_bundle::*; @@ -2224,13 +2254,11 @@ mod tests { .with_specific_sleds([ omicron_uuid_kinds::SledUuid::new_v4(), ]) - .with_ereports( - EreportFilters::new() - .with_start_time(now - chrono::Duration::hours(1)) - .unwrap() - .with_end_time(now) - .unwrap(), - ); + .with_ereports(EreportFilters::new()) + .with_time_range(BundleTimeRange { + start: Some(now - chrono::Duration::hours(1)), + end: Some(now), + }); support_bundles_requested .insert_unique(fm::case::SupportBundleRequest { id: SupportBundleUuid::new_v4(), @@ -2378,13 +2406,13 @@ mod tests { .with_specific_sleds([sled1, sled2]) .with_ereports( EreportFilters::new() - .with_start_time(now - chrono::Duration::hours(2)) - .unwrap() - .with_end_time(now) - .unwrap() .with_serials(["BRM42"]) .with_classes(["ereport.io"]), - ); + ) + .with_time_range(BundleTimeRange { + start: Some(now - chrono::Duration::hours(2)), + end: Some(now), + }); support_bundles_requested .insert_unique(fm::case::SupportBundleRequest { id: SupportBundleUuid::new_v4(), diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index 71b2ca5e727..742bbcd94a7 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -261,33 +261,36 @@ impl DataStore { use nexus_db_schema::schema::support_bundle_data_selection_ereports::dsl as ereports_dsl; use nexus_db_schema::schema::support_bundle_data_selection_flags::dsl as flags_dsl; use nexus_db_schema::schema::support_bundle_data_selection_host_info::dsl as host_info_dsl; + use nexus_db_schema::schema::support_bundle_data_selection_time_range::dsl as time_range_dsl; let conn = self.pool_connection_authorized(opctx).await?; let bundle_uuid = bundle_id.into_untyped_uuid(); - flags_dsl::support_bundle_data_selection_flags - .filter(flags_dsl::bundle_id.eq(bundle_uuid)) - .left_join( - host_info_dsl::support_bundle_data_selection_host_info - .on(host_info_dsl::bundle_id.eq(flags_dsl::bundle_id)), - ) - .left_join( - ereports_dsl::support_bundle_data_selection_ereports - .on(ereports_dsl::bundle_id.eq(flags_dsl::bundle_id)), - ) - .select(DbBundleDataSelection::as_select()) - .first_async(&*conn) - .await - .map_err(|e| { - public_error_from_diesel(e, ErrorHandler::Server) - .internal_context( - "failed to query data selection for bundle", - ) - })? - .try_into() - .map_err(|e: Error| { - e.internal_context("failed to convert data selection") - }) + let row: DbBundleDataSelection = + flags_dsl::support_bundle_data_selection_flags + .filter(flags_dsl::bundle_id.eq(bundle_uuid)) + .left_join( + host_info_dsl::support_bundle_data_selection_host_info + .on(host_info_dsl::bundle_id.eq(flags_dsl::bundle_id)), + ) + .left_join( + ereports_dsl::support_bundle_data_selection_ereports + .on(ereports_dsl::bundle_id.eq(flags_dsl::bundle_id)), + ) + .left_join( + time_range_dsl::support_bundle_data_selection_time_range + .on(time_range_dsl::bundle_id.eq(flags_dsl::bundle_id)), + ) + .select(DbBundleDataSelection::as_select()) + .first_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context( + "failed to query data selection for bundle", + ) + })?; + Ok(row.into()) } /// Looks up a single support bundle @@ -645,10 +648,13 @@ impl DataStore { bundle_id: SupportBundleUuid, data_selection: BundleDataSelection, ) -> Result<(), diesel::result::Error> { - use crate::db::model::{DataSelectionFlags, Ereports, HostInfo}; + use crate::db::model::{ + DataSelectionFlags, Ereports, HostInfo, TimeRange, + }; use nexus_db_schema::schema::support_bundle_data_selection_ereports::dsl as ereports_dsl; use nexus_db_schema::schema::support_bundle_data_selection_flags::dsl as flags_dsl; use nexus_db_schema::schema::support_bundle_data_selection_host_info::dsl as host_info_dsl; + use nexus_db_schema::schema::support_bundle_data_selection_time_range::dsl as time_range_dsl; // Always insert a flags row. diesel::insert_into(flags_dsl::support_bundle_data_selection_flags) @@ -663,6 +669,17 @@ impl DataStore { .execute_async(conn) .await?; + // Insert the bundle-wide time-range row (if set), independent + // of any per-category rows. + if let Some(range) = data_selection.time_range() { + diesel::insert_into( + time_range_dsl::support_bundle_data_selection_time_range, + ) + .values(TimeRange::new(bundle_id, range)) + .execute_async(conn) + .await?; + } + // Insert payload tables for variants that carry data. for data in data_selection { match data { @@ -704,6 +721,7 @@ impl DataStore { use nexus_db_schema::schema::support_bundle_data_selection_ereports::dsl as ereports_dsl; use nexus_db_schema::schema::support_bundle_data_selection_flags::dsl as flags_dsl; use nexus_db_schema::schema::support_bundle_data_selection_host_info::dsl as host_info_dsl; + use nexus_db_schema::schema::support_bundle_data_selection_time_range::dsl as time_range_dsl; diesel::delete(flags_dsl::support_bundle_data_selection_flags) .filter(flags_dsl::bundle_id.eq_any(bundle_ids.clone())) @@ -714,9 +732,15 @@ impl DataStore { .execute_async(conn) .await?; diesel::delete(ereports_dsl::support_bundle_data_selection_ereports) - .filter(ereports_dsl::bundle_id.eq_any(bundle_ids)) + .filter(ereports_dsl::bundle_id.eq_any(bundle_ids.clone())) .execute_async(conn) .await?; + diesel::delete( + time_range_dsl::support_bundle_data_selection_time_range, + ) + .filter(time_range_dsl::bundle_id.eq_any(bundle_ids)) + .execute_async(conn) + .await?; Ok(()) } diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 85383eac200..dc42d9ba15c 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1620,17 +1620,24 @@ table! { table! { support_bundle_data_selection_ereports (bundle_id) { bundle_id -> Uuid, - start_time -> Nullable, - end_time -> Nullable, only_serials -> Array, only_classes -> Array, } } +table! { + support_bundle_data_selection_time_range (bundle_id) { + bundle_id -> Uuid, + start_time -> Nullable, + end_time -> Nullable, + } +} + allow_tables_to_appear_in_same_query!( support_bundle_data_selection_flags, support_bundle_data_selection_host_info, support_bundle_data_selection_ereports, + support_bundle_data_selection_time_range, ); /* hardware inventory */ @@ -3297,17 +3304,25 @@ table! { fm_support_bundle_request_data_selection_ereports (sitrep_id, request_id) { sitrep_id -> Uuid, request_id -> Uuid, - start_time -> Nullable, - end_time -> Nullable, only_serials -> Array, only_classes -> Array, } } +table! { + fm_support_bundle_request_data_selection_time_range (sitrep_id, request_id) { + sitrep_id -> Uuid, + request_id -> Uuid, + start_time -> Nullable, + end_time -> Nullable, + } +} + allow_tables_to_appear_in_same_query!( fm_support_bundle_request_data_selection_flags, fm_support_bundle_request_data_selection_host_info, fm_support_bundle_request_data_selection_ereports, + fm_support_bundle_request_data_selection_time_range, ); table! { diff --git a/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs b/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs index f228f68d961..1299a32faed 100644 --- a/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs +++ b/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs @@ -941,6 +941,18 @@ mod api_impl { unimplemented!() } + async fn support_logs_download_v1( + _request_context: RequestContext, + _path_params: Path< + sled_agent_types_versions::v1::diagnostics::SledDiagnosticsLogsDownloadPathParam, + >, + _query_params: dropshot::Query< + sled_agent_types_versions::v1::diagnostics::SledDiagnosticsLogsDownloadQueryParam, + >, + ) -> Result, HttpError> { + unimplemented!() + } + async fn chicken_switch_destroy_orphaned_datasets_get_v1( _request_context: RequestContext, ) -> Result< diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs index fdcb45ef8d0..e4438208813 100644 --- a/nexus/src/app/background/tasks/mod.rs +++ b/nexus/src/app/background/tasks/mod.rs @@ -49,7 +49,6 @@ pub mod region_snapshot_replacement_step; pub mod saga_recovery; pub mod service_firewall_rules; pub mod session_cleanup; -pub mod support_bundle; pub mod support_bundle_collector; pub mod sync_service_zone_nat; pub mod sync_switch_configuration; diff --git a/nexus/src/app/background/tasks/support_bundle/collection.rs b/nexus/src/app/background/tasks/support_bundle/collection.rs deleted file mode 100644 index 08c9df8dbde..00000000000 --- a/nexus/src/app/background/tasks/support_bundle/collection.rs +++ /dev/null @@ -1,678 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! The entrypoint to all support bundle collection. -//! -//! These are the primitives used to look up everything else within the bundle. - -use crate::app::background::tasks::support_bundle::cache::Cache; -use crate::app::background::tasks::support_bundle::perfetto; -use crate::app::background::tasks::support_bundle::step::CollectionStep; -use crate::app::background::tasks::support_bundle::steps; -use nexus_types::support_bundle::BundleDataSelection; - -use anyhow::Context; -use camino::Utf8DirEntry; -use camino::Utf8Path; -use camino_tempfile::Utf8TempDir; -use camino_tempfile::tempdir_in; -use camino_tempfile::tempfile_in; -use internal_dns_resolver::Resolver; -use nexus_db_model::SupportBundle; -use nexus_db_model::SupportBundleState; -use nexus_db_queries::context::OpContext; -use nexus_db_queries::db::DataStore; -use nexus_types::internal_api::background::SupportBundleCollectionReport; -use omicron_common::api::external::Error; -use omicron_uuid_kinds::DatasetUuid; -use omicron_uuid_kinds::SupportBundleUuid; -use omicron_uuid_kinds::ZpoolUuid; -use parallel_task_set::ParallelTaskSet; -use serde_json::json; -use sha2::Digest; -use sha2::Sha256; -use slog_error_chain::InlineErrorChain; -use std::io::Write; -use std::num::NonZeroU64; -use std::sync::Arc; -use tokio::io::AsyncReadExt; -use tokio::io::AsyncSeekExt; -use tokio::io::SeekFrom; -use tokio_util::sync::CancellationToken; -use tufaceous_artifact::ArtifactHash; -use zip::ZipWriter; -use zip::write::FullFileOptions; - -/// We use "/var/tmp" to use Nexus' filesystem for temporary storage, -/// rather than "/tmp", which would keep this collected data in-memory. -pub const TEMPDIR: &str = "/var/tmp"; - -/// The size of piece of a support bundle to transfer to the sled agent -/// within a single streaming request. -pub const CHUNK_SIZE: NonZeroU64 = NonZeroU64::new(1024 * 1024 * 1024).unwrap(); - -/// Wraps up all arguments to perform a single support bundle collection -pub struct BundleCollection { - datastore: Arc, - resolver: Resolver, - log: slog::Logger, - opctx: OpContext, - data_selection: BundleDataSelection, - bundle: SupportBundle, - transfer_chunk_size: NonZeroU64, - cancellation_token: CancellationToken, -} - -impl BundleCollection { - pub fn new( - datastore: Arc, - resolver: Resolver, - log: slog::Logger, - opctx: OpContext, - data_selection: BundleDataSelection, - bundle: SupportBundle, - transfer_chunk_size: NonZeroU64, - ) -> Self { - Self { - datastore, - resolver, - log, - opctx, - data_selection, - bundle, - transfer_chunk_size, - cancellation_token: CancellationToken::new(), - } - } - - pub fn datastore(&self) -> &Arc { - &self.datastore - } - - pub fn resolver(&self) -> &Resolver { - &self.resolver - } - - pub fn log(&self) -> &slog::Logger { - &self.log - } - - pub fn opctx(&self) -> &OpContext { - &self.opctx - } - - pub fn data_selection(&self) -> &BundleDataSelection { - &self.data_selection - } - - pub fn bundle(&self) -> &SupportBundle { - &self.bundle - } - - /// Returns true if this bundle collection has been cancelled. - /// - /// Use for cooperative cancellation checks before cancel-unsafe - /// operations (filesystem writes, `spawn_blocking`). - pub fn is_cancelled(&self) -> bool { - self.cancellation_token.is_cancelled() - } - - /// Returns a reference to the cancellation token. - /// - /// Pass to helper functions that need to `select!` on cancellation - /// independently (e.g., futures inside `FuturesUnordered`). - pub fn cancellation_token(&self) -> &CancellationToken { - &self.cancellation_token - } - - /// Returns a future that completes when cancellation is requested. - /// - /// Use in `tokio::select!` with cancel-safe operations (HTTP requests, - /// DB queries) for immediate cancellation at await points. - pub async fn cancelled(&self) { - self.cancellation_token.cancelled().await - } - - /// Collect the bundle within Nexus, and store it on a target sled. - pub async fn collect_bundle_and_store_on_sled( - self: &Arc, - ) -> anyhow::Result { - // Create a temporary directory where we'll store the support bundle - // as it's being collected. - let dir = tempdir_in(TEMPDIR)?; - - let report = self.collect_bundle_locally(&dir).await?; - self.store_bundle_on_sled(dir).await?; - Ok(report) - } - - // Create the support bundle, placing the contents into a user-specified - // directory. - // - // Does not attempt to convert the contents into a zipfile, nor send them - // to any durable storage. - async fn collect_bundle_locally( - self: &Arc, - dir: &Utf8TempDir, - ) -> anyhow::Result { - // Spawn a background task that periodically checks whether this - // bundle should still be collected. If not, it cancels the - // `CancellationToken`. - // - // Cancellation is hybrid: - // - // - Cancel-safe operations (HTTP requests, DB queries) use - // `tokio::select!` with the token for immediate cancellation. - // Dropping these futures is safe — no local side effects. - // - // - Cancel-unsafe operations (filesystem writes via `tokio::fs`, - // `spawn_blocking`) use cooperative `is_cancelled()` checks. - // These are never dropped mid-flight, ensuring all - // `spawn_blocking` work completes before the TempDir is dropped. - // - // `run_collect_bundle_steps` checks the token before spawning new - // steps and drains all in-flight tasks before returning. - // - // Previous iterations used a top-level `tokio::select!` to race - // cancellation against the entire collection. This had two problems: - // - // 1. Dropping the collection future left `spawn_blocking` file - // writes in-flight, racing with TempDir cleanup. - // See: https://github.com/oxidecomputer/omicron/issues/10198 - // - // 2. Earlier versions of the `select!` did async DB work in the - // cancellation branch body while the collection branch held a - // connection pool claim, causing deadlocks. - // See: https://github.com/oxidecomputer/omicron/issues/9259 - // - // The current design avoids both: cancel-unsafe futures are never - // dropped, and the DB check runs in a separate spawned task. - let cancel_task = tokio::spawn({ - let this = Arc::clone(self); - async move { this.check_for_cancellation().await } - }); - - // Run the collection. It checks self.cancelled and drains all - // in-flight work before returning. - let report = self.collect_bundle_as_file(dir).await; - - // Collection is done — stop the cancellation checker. - cancel_task.abort(); - let _ = cancel_task.await; - - if self.is_cancelled() { - warn!( - &self.log, - "Support Bundle cancelled - stopping collection"; - "bundle" => %self.bundle.id, - "state" => ?self.bundle.state - ); - return Err(anyhow::anyhow!("Support Bundle Cancelled")); - } - - info!( - &self.log, - "Bundle Collection completed"; - "bundle" => %self.bundle.id - ); - report - } - - async fn store_bundle_on_sled( - &self, - dir: Utf8TempDir, - ) -> anyhow::Result<()> { - // Create the zipfile as a temporary file - let mut zipfile = tokio::fs::File::from_std(bundle_to_zipfile(&dir)?); - let total_len = zipfile.metadata().await?.len(); - - // Collect the hash locally before we send it over the network - // - // We'll use this later during finalization to confirm the bundle - // has been stored successfully. - zipfile.seek(SeekFrom::Start(0)).await?; - let hash = sha2_hash(&mut zipfile).await?; - - // Find the sled where we're storing this bundle. - let sled_id = self - .datastore - .zpool_get_sled_if_in_service( - &self.opctx, - self.bundle.zpool_id.into(), - ) - .await?; - let sled_client = nexus_networking::sled_client( - &self.datastore, - &self.opctx, - sled_id, - &self.log, - ) - .await?; - - let zpool = ZpoolUuid::from(self.bundle.zpool_id); - let dataset = DatasetUuid::from(self.bundle.dataset_id); - let support_bundle = SupportBundleUuid::from(self.bundle.id); - - // Tell this sled to create the bundle. - let creation_result = sled_client - .support_bundle_start_creation(&zpool, &dataset, &support_bundle) - .await - .with_context(|| "Support bundle failed to start creation")?; - - if matches!( - creation_result.state, - sled_agent_client::types::SupportBundleState::Complete - ) { - // Early exit case: the bundle was already created -- we must have either - // crashed or failed between "finalizing" and "writing to the database that we - // finished". - info!(&self.log, "Support bundle was already collected"; "bundle" => %self.bundle.id); - return Ok(()); - } - info!(&self.log, "Support bundle creation started"; "bundle" => %self.bundle.id); - - let mut offset = 0; - while offset < total_len { - // Stream the zipfile to the sled where it should be kept - let mut file = zipfile - .try_clone() - .await - .with_context(|| "Failed to clone zipfile")?; - file.seek(SeekFrom::Start(offset)).await.with_context(|| { - format!("Failed to seek to offset {offset} / {total_len} within zipfile") - })?; - - // Only stream at most "transfer_chunk_size" bytes at once - let chunk_size = std::cmp::min( - self.transfer_chunk_size.get(), - total_len - offset, - ); - - let limited_file = file.take(chunk_size); - let stream = tokio_util::io::ReaderStream::new(limited_file); - let body = reqwest::Body::wrap_stream(stream); - - info!( - &self.log, - "Streaming bundle chunk"; - "bundle" => %self.bundle.id, - "offset" => offset, - "length" => chunk_size, - ); - - sled_client.support_bundle_transfer( - &zpool, &dataset, &support_bundle, offset, body - ).await.with_context(|| { - format!("Failed to transfer bundle: {chunk_size}@{offset} of {total_len} to sled") - })?; - - offset += chunk_size; - } - - sled_client - .support_bundle_finalize( - &zpool, - &dataset, - &support_bundle, - &hash.to_string(), - ) - .await - .with_context(|| "Failed to finalize bundle")?; - - // Returning from this method should drop all temporary storage - // allocated locally for this support bundle. - Ok(()) - } - - // Indefinitely perform periodic checks about whether or not we should - // cancel the bundle. - // - // Cancels `self.cancellation_token` and returns if: - // - The bundle state is no longer SupportBundleState::Collecting - // (which happens if the bundle has been explicitly cancelled, or - // if the backing storage has been expunged). - // - The bundle has been deleted - // - // Otherwise, keeps checking indefinitely while polled. - async fn check_for_cancellation(&self) { - let work_duration = tokio::time::Duration::from_secs(5); - let mut yield_interval = tokio::time::interval_at( - tokio::time::Instant::now() + work_duration, - work_duration, - ); - - loop { - // Timer fired mid-collection - check if we should stop. - yield_interval.tick().await; - trace!( - self.log, - "Checking if Bundle Collection cancelled"; - "bundle" => %self.bundle.id - ); - - match self - .datastore - .support_bundle_get(&self.opctx, self.bundle.id.into()) - .await - { - Ok(SupportBundle { - state: SupportBundleState::Collecting, - .. - }) => { - // Bundle still collecting; continue... - continue; - } - Ok(_) => { - // Not collecting, for any reason: Time to exit - self.cancellation_token.cancel(); - return; - } - Err(Error::ObjectNotFound { .. } | Error::NotFound { .. }) => { - self.cancellation_token.cancel(); - return; - } - Err(err) => { - warn!( - self.log, - "Database error checking bundle cancellation"; - InlineErrorChain::new(&err) - ); - - // If we cannot contact the database, retry later - continue; - } - } - } - } - - async fn run_collect_bundle_steps( - self: &Arc, - output: &Utf8TempDir, - mut steps: Vec, - ) -> SupportBundleCollectionReport { - let mut report = - SupportBundleCollectionReport::new(self.bundle.id.into()); - - const MAX_CONCURRENT_STEPS: usize = 16; - let mut tasks = - ParallelTaskSet::new_with_parallelism(MAX_CONCURRENT_STEPS); - - loop { - // Check for cancellation before spawning new work. - if self.is_cancelled() { - info!( - &self.log, - "Cancellation detected — draining in-flight steps"; - "bundle" => %self.bundle.id, - ); - break; - } - - // Process all the currently-planned steps - while let Some(step) = steps.pop() { - let previous_result = tasks - .spawn({ - let collection = self.clone(); - let dir = output.path().to_path_buf(); - let log = self.log.clone(); - async move { - debug!(log, "Running step"; "step" => &step.name); - step.run(&collection, dir.as_path(), &log).await - } - }) - .await; - - if let Some(output) = previous_result { - output.process(&mut report, &mut steps); - }; - } - - // If we've run out of tasks to spawn, join any of the previously - // spawned tasks, if any exist. - if let Some(output) = tasks.join_next().await { - output.process(&mut report, &mut steps); - - // As soon as any task completes, see if we can spawn more work - // immediately. This ensures that the ParallelTaskSet is - // saturated as much as it can be. - continue; - } - - // Executing steps may create additional steps, as follow-up work. - // - // Only finish if we've exhausted all possible steps and joined - // all spawned work. - if steps.is_empty() { - // Write trace file before returning - if let Err(err) = self.write_trace_file(output, &report).await { - warn!( - self.log, - "Failed to write trace file"; - "error" => ?err - ); - } - return report; - } - } - - // Drain all in-flight tasks. This ensures any tokio::fs operations - // (which internally use spawn_blocking) complete before we return, - // so the TempDir can be safely dropped by our caller. - while let Some(output) = tasks.join_next().await { - output.process(&mut report, &mut steps); - // Ignore newly-spawned steps from drained tasks — we're - // stopping. - } - - report - } - - // Write a Perfetto Event format JSON file for visualization - async fn write_trace_file( - &self, - output: &Utf8TempDir, - report: &SupportBundleCollectionReport, - ) -> anyhow::Result<()> { - let meta_dir = output.path().join("meta"); - tokio::fs::create_dir_all(&meta_dir).await.with_context(|| { - format!("Failed to create meta directory {meta_dir}") - })?; - - let trace_path = meta_dir.join("trace.json"); - - // Convert steps to Perfetto Trace Event format. - // Sort steps by start time and assign each a unique sequential ID. - // - // This is necessary because the trace event format does not like - // multiple slices to overlap - so we make each slice distinct. - // - // Ideally we'd be able to correlate these with actual tokio tasks, - // but it's hard to convert tokio::task::Id to a u64 because - // of https://github.com/tokio-rs/tokio/issues/7430 - let mut sorted_steps: Vec<_> = report.steps.iter().collect(); - sorted_steps.sort_by_key(|s| s.start); - - // Generate trace events - each step gets a unique ID (1, 2, 3, ...) - // based on its start time order - let trace_events: Vec<_> = sorted_steps - .iter() - .enumerate() - .map(|(i, step)| { - let start_us = step.start.timestamp_micros(); - let duration_us = (step.end - step.start) - .num_microseconds() - .unwrap_or(0) - .max(0); - let step_id = i + 1; - - perfetto::TraceEvent { - name: step.name.clone(), - cat: "bundle_collection".to_string(), - ph: "X".to_string(), - ts: start_us, - dur: duration_us, - pid: 1, - tid: step_id, - args: json!({ - "status": step.status.to_string(), - }), - } - }) - .collect(); - - let trace = perfetto::Trace { - trace_events, - display_time_unit: "ms".to_string(), - }; - - let trace_content = serde_json::to_string_pretty(&trace) - .context("Failed to serialize trace JSON")?; - - tokio::fs::write(&trace_path, trace_content).await.with_context( - || format!("Failed to write trace file to {trace_path}"), - )?; - - info!( - self.log, - "Wrote trace file"; - "path" => %trace_path, - "num_events" => trace.trace_events.len() - ); - - Ok(()) - } - - // Perform the work of collecting the support bundle into a temporary - // directory. - // - // "dir" is an output directory where data can be stored. - // - // If a partial bundle can be collected, it should be returned as - // an Ok(SupportBundleCollectionReport). Any failures from this function - // will prevent the support bundle from being collected altogether. - // - // Cancellation is hybrid: cancel-safe operations (HTTP, DB) use - // `select!` with the token for immediate cancellation; cancel-unsafe - // operations (filesystem writes) use cooperative `is_cancelled()`. - // In-flight work is drained before returning. - async fn collect_bundle_as_file( - self: &Arc, - dir: &Utf8TempDir, - ) -> anyhow::Result { - let log = &self.log; - - info!(&log, "Collecting bundle as local file"); - - let cache = Cache::new(); - let steps = steps::all(&cache); - Ok(self.run_collect_bundle_steps(dir, steps).await) - } -} - -// Takes a directory "dir", and zips the contents into a single zipfile. -fn bundle_to_zipfile(dir: &Utf8TempDir) -> anyhow::Result { - let tempfile = tempfile_in(TEMPDIR)?; - let mut zip = ZipWriter::new(tempfile); - - recursively_add_directory_to_zipfile(&mut zip, dir.path(), dir.path())?; - - Ok(zip.finish()?) -} - -fn recursively_add_directory_to_zipfile( - zip: &mut ZipWriter, - root_path: &Utf8Path, - dir_path: &Utf8Path, -) -> anyhow::Result<()> { - // Readdir might return entries in a non-deterministic order. - // Let's sort it for the zipfile, to be nice. - let mut entries = dir_path - .read_dir_utf8()? - .filter_map(Result::ok) - .collect::>(); - entries.sort_by(|a, b| a.file_name().cmp(&b.file_name())); - - for entry in &entries { - // Remove the "/tmp/..." prefix from the path when we're storing it in the - // zipfile. - let dst = entry.path().strip_prefix(root_path)?; - - let file_type = entry.file_type()?; - if file_type.is_file() { - let src = entry.path(); - - let zip_time = entry - .path() - .metadata() - .and_then(|m| m.modified()) - .ok() - .and_then(|sys_time| jiff::Zoned::try_from(sys_time).ok()) - .and_then(|zoned| { - zip::DateTime::try_from(zoned.datetime()).ok() - }) - .unwrap_or_else(zip::DateTime::default); - - let opts = FullFileOptions::default() - .last_modified_time(zip_time) - .compression_method(zip::CompressionMethod::Deflated) - .large_file(true); - - zip.start_file_from_path(dst, opts)?; - let mut file = std::fs::File::open(&src)?; - std::io::copy(&mut file, zip)?; - } - if file_type.is_dir() { - let opts = FullFileOptions::default(); - zip.add_directory_from_path(dst, opts)?; - recursively_add_directory_to_zipfile(zip, root_path, entry.path())?; - } - } - Ok(()) -} - -async fn sha2_hash(file: &mut tokio::fs::File) -> anyhow::Result { - let mut buf = vec![0u8; 65536]; - let mut ctx = Sha256::new(); - loop { - let n = file.read(&mut buf).await?; - if n == 0 { - break; - } - ctx.write_all(&buf[0..n])?; - } - - let digest = ctx.finalize(); - Ok(ArtifactHash(digest.as_slice().try_into()?)) -} - -#[cfg(test)] -mod test { - use super::*; - - use camino_tempfile::tempdir; - - // Ensure that we can convert a temporary directory into a zipfile - #[test] - fn test_zipfile_creation() { - let dir = tempdir().unwrap(); - - std::fs::create_dir_all(dir.path().join("dir-a")).unwrap(); - std::fs::create_dir_all(dir.path().join("dir-b")).unwrap(); - std::fs::write(dir.path().join("dir-a").join("file-a"), "some data") - .unwrap(); - std::fs::write(dir.path().join("file-b"), "more data").unwrap(); - - let zipfile = bundle_to_zipfile(&dir) - .expect("Should have been able to bundle zipfile"); - let archive = zip::read::ZipArchive::new(zipfile).unwrap(); - - // We expect the order to be deterministically alphabetical - let mut names = archive.file_names(); - assert_eq!(names.next(), Some("dir-a/")); - assert_eq!(names.next(), Some("dir-a/file-a")); - assert_eq!(names.next(), Some("dir-b/")); - assert_eq!(names.next(), Some("file-b")); - assert_eq!(names.next(), None); - } -} diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index d9091f1cc98..6f173d1b4a0 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -6,6 +6,9 @@ use crate::app::background::BackgroundTask; use anyhow::Context; +use camino::Utf8Path; +use camino_tempfile::Utf8TempDir; +use camino_tempfile::tempdir_in; use futures::FutureExt; use futures::future::BoxFuture; use internal_dns_resolver::Resolver; @@ -26,13 +29,29 @@ use omicron_uuid_kinds::SledUuid; use omicron_uuid_kinds::SupportBundleUuid; use omicron_uuid_kinds::ZpoolUuid; use serde_json::json; +use sha2::Digest; +use sha2::Sha256; use sled_agent_types::support_bundle::NESTED_DATASET_NOT_FOUND; use slog_error_chain::InlineErrorChain; +use std::io::Write; use std::num::NonZeroU64; use std::sync::Arc; - -use super::support_bundle::collection::BundleCollection; -use super::support_bundle::collection::CHUNK_SIZE; +use support_bundle_collection::BundleCollection; +use support_bundle_collection::BundleInfo; +use support_bundle_collection::zip::bundle_to_zipfile; +use tokio::io::AsyncReadExt; +use tokio::io::AsyncSeekExt; +use tokio::io::SeekFrom; +use tokio_util::sync::CancellationToken; +use tufaceous_artifact::ArtifactHash; + +/// We use "/var/tmp" to use Nexus' filesystem for temporary storage, +/// rather than "/tmp", which would keep this collected data in-memory. +pub const TEMPDIR: &str = "/var/tmp"; + +/// The size of piece of a support bundle to transfer to the sled agent +/// within a single streaming request. +pub const CHUNK_SIZE: NonZeroU64 = NonZeroU64::new(1024 * 1024 * 1024).unwrap(); fn authz_support_bundle_from_id(id: SupportBundleUuid) -> authz::SupportBundle { authz::SupportBundle::new(authz::FLEET, id, LookupType::by_id(id)) @@ -370,17 +389,59 @@ impl SupportBundleCollector { .await .context("failed to query bundle data selection")?; + let bundle_log = + opctx.log.new(slog::o!("bundle" => bundle.id.to_string())); let collection = Arc::new(BundleCollection::new( self.datastore.clone(), self.resolver.clone(), - opctx.log.new(slog::o!("bundle" => bundle.id.to_string())), + bundle_log.clone(), opctx.child(std::collections::BTreeMap::new()), data_selection, - bundle.clone(), - self.transfer_chunk_size, + BundleInfo { + id: bundle.id.into(), + reason_for_creation: bundle.reason_for_creation.clone(), + }, )); - let mut report = collection.collect_bundle_and_store_on_sled().await?; + // Spawn a task that periodically polls the `support_bundle` row's + // state. If the bundle has been cancelled (or removed), it cancels + // the inner `CancellationToken`, prompting `collect_bundle_locally` + // to drain in-flight work and return. + // + // This is a separate task rather than a top-level + // `tokio::select!`: previous designs raced collection against an + // async DB check in the cancellation branch body, deadlocking + // when the collection branch held a connection pool claim. See + // https://github.com/oxidecomputer/omicron/issues/9259. + let cancel_task = tokio::spawn({ + let datastore = self.datastore.clone(); + let cancel_opctx = opctx.child(std::collections::BTreeMap::new()); + let token = collection.cancellation_token().clone(); + let log = bundle_log.clone(); + let bundle_id = bundle.id; + async move { + check_for_cancellation( + datastore, + cancel_opctx, + token, + log, + bundle_id.into(), + ) + .await + } + }); + + // Create a temporary directory where we'll store the support bundle + // as it's being collected. + let dir = tempdir_in(TEMPDIR)?; + let collect_result = collection.collect_bundle_locally(&dir).await; + + // Collection is done — stop the cancellation checker. + cancel_task.abort(); + let _ = cancel_task.await; + + let mut report = collect_result?; + self.store_bundle_on_sled(&bundle_log, opctx, &bundle, dir).await?; if let Err(err) = self .datastore .support_bundle_update( @@ -411,6 +472,191 @@ impl SupportBundleCollector { report.activated_in_db_ok = true; Ok(Some(report)) } + + // Zip the collected bundle directory and stream it to the sled-agent + // that owns the target zpool/dataset for durable storage. + async fn store_bundle_on_sled( + &self, + log: &slog::Logger, + opctx: &OpContext, + bundle: &SupportBundle, + dir: Utf8TempDir, + ) -> anyhow::Result<()> { + // Create the zipfile as a temporary file + let mut zipfile = tokio::fs::File::from_std(bundle_to_zipfile( + &dir, + Utf8Path::new(TEMPDIR), + )?); + let total_len = zipfile.metadata().await?.len(); + + // Collect the hash locally before we send it over the network + // + // We'll use this later during finalization to confirm the bundle + // has been stored successfully. + zipfile.seek(SeekFrom::Start(0)).await?; + let hash = sha2_hash(&mut zipfile).await?; + + // Find the sled where we're storing this bundle. + let sled_id = self + .datastore + .zpool_get_sled_if_in_service(opctx, bundle.zpool_id.into()) + .await?; + let sled_client = + nexus_networking::sled_client(&self.datastore, opctx, sled_id, log) + .await?; + + let zpool = ZpoolUuid::from(bundle.zpool_id); + let dataset = DatasetUuid::from(bundle.dataset_id); + let support_bundle = SupportBundleUuid::from(bundle.id); + + // Tell this sled to create the bundle. + let creation_result = sled_client + .support_bundle_start_creation(&zpool, &dataset, &support_bundle) + .await + .with_context(|| "Support bundle failed to start creation")?; + + if matches!( + creation_result.state, + sled_agent_client::types::SupportBundleState::Complete + ) { + // Early exit case: the bundle was already created -- we must have either + // crashed or failed between "finalizing" and "writing to the database that we + // finished". + info!(log, "Support bundle was already collected"; "bundle" => %bundle.id); + return Ok(()); + } + info!(log, "Support bundle creation started"; "bundle" => %bundle.id); + + let mut offset = 0; + while offset < total_len { + // Stream the zipfile to the sled where it should be kept + let mut file = zipfile + .try_clone() + .await + .with_context(|| "Failed to clone zipfile")?; + file.seek(SeekFrom::Start(offset)).await.with_context(|| { + format!("Failed to seek to offset {offset} / {total_len} within zipfile") + })?; + + // Only stream at most "transfer_chunk_size" bytes at once + let chunk_size = std::cmp::min( + self.transfer_chunk_size.get(), + total_len - offset, + ); + + let limited_file = file.take(chunk_size); + let stream = tokio_util::io::ReaderStream::new(limited_file); + let body = reqwest::Body::wrap_stream(stream); + + info!( + log, + "Streaming bundle chunk"; + "bundle" => %bundle.id, + "offset" => offset, + "length" => chunk_size, + ); + + sled_client.support_bundle_transfer( + &zpool, &dataset, &support_bundle, offset, body + ).await.with_context(|| { + format!("Failed to transfer bundle: {chunk_size}@{offset} of {total_len} to sled") + })?; + + offset += chunk_size; + } + + sled_client + .support_bundle_finalize( + &zpool, + &dataset, + &support_bundle, + &hash.to_string(), + ) + .await + .with_context(|| "Failed to finalize bundle")?; + + // Returning from this method should drop all temporary storage + // allocated locally for this support bundle. + Ok(()) + } +} + +// Indefinitely poll the database for the bundle's state. Cancels the +// supplied token when: +// +// - The bundle's state is no longer `Collecting` (typically because +// the bundle was explicitly cancelled by a user, or because its +// backing storage was expunged). +// - The bundle row no longer exists. +// +// The mechanism layer is otherwise oblivious to the bundle lifecycle — +// this is the management bridge that turns DB state changes into +// cancellation. +async fn check_for_cancellation( + datastore: Arc, + opctx: OpContext, + cancellation_token: CancellationToken, + log: slog::Logger, + bundle_id: SupportBundleUuid, +) { + let work_duration = tokio::time::Duration::from_secs(5); + let mut yield_interval = tokio::time::interval_at( + tokio::time::Instant::now() + work_duration, + work_duration, + ); + + loop { + // Timer fired mid-collection - check if we should stop. + yield_interval.tick().await; + trace!( + log, + "Checking if Bundle Collection cancelled"; + "bundle" => %bundle_id + ); + + match datastore.support_bundle_get(&opctx, bundle_id).await { + Ok(SupportBundle { + state: SupportBundleState::Collecting, .. + }) => { + // Bundle still collecting; continue... + continue; + } + Ok(_) => { + // Not collecting, for any reason: Time to exit + cancellation_token.cancel(); + return; + } + Err(Error::ObjectNotFound { .. } | Error::NotFound { .. }) => { + cancellation_token.cancel(); + return; + } + Err(err) => { + warn!( + log, + "Database error checking bundle cancellation"; + InlineErrorChain::new(&err) + ); + + // If we cannot contact the database, retry later + continue; + } + } + } +} + +async fn sha2_hash(file: &mut tokio::fs::File) -> anyhow::Result { + let mut buf = vec![0u8; 65536]; + let mut ctx = Sha256::new(); + loop { + let n = file.read(&mut buf).await?; + if n == 0 { + break; + } + ctx.write_all(&buf[0..n])?; + } + + let digest = ctx.finalize(); + Ok(ArtifactHash(digest.as_slice().try_into()?)) } impl BackgroundTask for SupportBundleCollector { @@ -459,7 +705,6 @@ impl BackgroundTask for SupportBundleCollector { mod test { use super::*; - use crate::app::background::tasks::support_bundle::perfetto; use crate::app::support_bundles::SupportBundleQueryType; use http_body_util::BodyExt; use nexus_db_model::PhysicalDisk; @@ -490,6 +735,7 @@ mod test { }; use sled_agent_types::inventory::ZpoolHealth; use std::num::NonZeroU64; + use support_bundle_collection::perfetto; use uuid::Uuid; type ControlPlaneTestContext = diff --git a/nexus/tests/integration_tests/support_bundles.rs b/nexus/tests/integration_tests/support_bundles.rs index 4ddd5365e16..4b296a50a26 100644 --- a/nexus/tests/integration_tests/support_bundles.rs +++ b/nexus/tests/integration_tests/support_bundles.rs @@ -6,14 +6,20 @@ use anyhow::Context; use anyhow::Result; +use camino::Utf8Path; +use camino::Utf8PathBuf; +use chrono::DateTime; +use chrono::Utc; use dropshot::HttpErrorResponseBody; use dropshot::test_util::ClientTestContext; use futures::TryStreamExt; use http::StatusCode; use http::method::Method; +use internal_dns_resolver::Resolver; use nexus_db_model::SupportBundleState as DbSupportBundleState; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; use nexus_lockstep_client::types::LastResult; use nexus_test_utils::http_testing::AuthnMode; use nexus_test_utils::http_testing::NexusRequest; @@ -25,10 +31,15 @@ use nexus_types::internal_api::background::SupportBundleCleanupReport; use nexus_types::internal_api::background::SupportBundleCollectionReport; use nexus_types::internal_api::background::SupportBundleCollectionStep; use nexus_types::internal_api::background::SupportBundleEreportStatus; +use nexus_types::support_bundle::BundleDataSelection; +use nexus_types::support_bundle::BundleTimeRange; use omicron_common::api::external::LookupType; use omicron_uuid_kinds::SupportBundleUuid; use serde::Deserialize; use std::io::Cursor; +use std::sync::Arc; +use support_bundle_collection::BundleCollection; +use support_bundle_collection::BundleInfo; use zip::read::ZipArchive; type ControlPlaneTestContext = @@ -1013,3 +1024,158 @@ async fn test_support_bundle_fm_case_id(cptestctx: &ControlPlaneTestContext) { .into_inner(); assert_eq!(fetched_fm.fm_case_id, Some(case_id)); } + +/// The collector writes per-sled logs at +/// `/rack//sled//logs//`. Find the +/// first matching directory; tests that use this only inject data +/// on one sled. +fn find_first_zone_log_dir(root: &Utf8Path, zone: &str) -> Utf8PathBuf { + let rack_root = root.join("rack"); + let mut entries = rack_root + .read_dir_utf8() + .expect("read rack dir") + .filter_map(Result::ok); + let rack = entries.next().expect("at least one rack").path().to_owned(); + let mut sled_iter = rack + .join("sled") + .read_dir_utf8() + .expect("read sled dir") + .filter_map(Result::ok); + loop { + let sled = + sled_iter.next().expect("at least one sled with injected logs"); + let candidate = sled.path().join("logs").join(zone); + if candidate.exists() { + return candidate; + } + } +} + +/// Build a `BundleCollection` with `start` as the bundle-wide lower +/// bound and run `collect_bundle_locally` into a fresh temp dir. +/// Bypasses the Nexus state machine and `support_bundle` row; +/// useful only for exercising the collection mechanism directly. +async fn collect_with_window( + cptestctx: &ControlPlaneTestContext, + opctx: &OpContext, + datastore: &Arc, + resolver: Resolver, + start: DateTime, +) -> camino_tempfile::Utf8TempDir { + let bundle_id = SupportBundleUuid::new_v4(); + let log = cptestctx.logctx.log.new(slog::o!( + "component" => "test_support_bundle_log_time_range", + "bundle" => bundle_id.to_string(), + )); + let selection = BundleDataSelection::new() + .with_all_sleds() + .with_time_range(BundleTimeRange { start: Some(start), end: None }); + let collection = Arc::new(BundleCollection::new( + datastore.clone(), + resolver, + log, + opctx.child(std::collections::BTreeMap::new()), + selection, + BundleInfo { + id: bundle_id, + reason_for_creation: "log time-range test".into(), + }, + )); + let dir = camino_tempfile::tempdir().expect("creating output tempdir"); + collection + .collect_bundle_locally(&dir) + .await + .expect("collect_bundle_locally"); + dir +} + +// Exercise the host-info time-range filter end-to-end: +// inject synthetic log entries on the sim sled-agent with crafted mtimes, +// then run the bundle collector and confirm that only entries within the +// requested window land in the output directory. +#[nexus_test] +async fn test_support_bundle_log_time_range( + cptestctx: &ControlPlaneTestContext, +) { + use chrono::TimeDelta; + use omicron_sled_agent::sim::SimLogEntry; + + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let resolver = nexus.resolver(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.clone(), datastore.clone()); + + // Inject four synthetic entries with mtimes that bracket the two + // windows we test below: 30 m and 6 h ago land inside both, 3 d + // ago lands only inside the 7 d window, and 30 d ago is outside + // both. + let now = Utc::now(); + let recent = now - TimeDelta::minutes(30); + let middle = now - TimeDelta::hours(6); + let between = now - TimeDelta::days(3); + let ancient = now - TimeDelta::days(30); + let zone = "test_zone"; + let sled_agent = cptestctx.first_sled_agent(); + for (filename, mtime) in [ + ("recent.log", recent), + ("middle.log", middle), + ("between.log", between), + ("ancient.log", ancient), + ] { + sled_agent.insert_support_log( + zone, + SimLogEntry { + filename: filename.into(), + contents: filename.as_bytes().to_vec(), + mtime, + }, + ); + } + + // 24h window: recent + middle inside; between (3 d) and ancient + // (30 d) excluded. + let dir_24h = collect_with_window( + cptestctx, + &opctx, + datastore, + resolver.clone(), + now - TimeDelta::hours(24), + ) + .await; + let log_dir = find_first_zone_log_dir(dir_24h.path(), zone); + assert!( + log_dir.join("recent.log").exists(), + "24h window: recent.log should be present at {log_dir}", + ); + assert!( + log_dir.join("middle.log").exists(), + "24h window: middle.log should be present at {log_dir}", + ); + assert!( + !log_dir.join("between.log").exists(), + "24h window: between.log (3 d) should NOT be present at {log_dir}", + ); + assert!( + !log_dir.join("ancient.log").exists(), + "24h window: ancient.log should NOT be present at {log_dir}", + ); + + // 7d window: between (3 d) now in scope; ancient (30 d) still out. + let dir_7d = collect_with_window( + cptestctx, + &opctx, + datastore, + resolver.clone(), + now - TimeDelta::days(7), + ) + .await; + let log_dir = find_first_zone_log_dir(dir_7d.path(), zone); + assert!(log_dir.join("recent.log").exists()); + assert!(log_dir.join("middle.log").exists()); + assert!( + log_dir.join("between.log").exists(), + "7d window: between.log (3 d) should be present at {log_dir}", + ); + assert!(!log_dir.join("ancient.log").exists()); +} diff --git a/nexus/types/src/fm/ereport.rs b/nexus/types/src/fm/ereport.rs index 2cc219392c8..672e97fa267 100644 --- a/nexus/types/src/fm/ereport.rs +++ b/nexus/types/src/fm/ereport.rs @@ -6,7 +6,6 @@ use crate::inventory::SpType; use chrono::{DateTime, Utc}; -use omicron_common::api::external::Error; use omicron_uuid_kinds::EreporterRestartUuid; use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::SitrepUuid; @@ -239,23 +238,8 @@ fn get_sp_metadata_string( /// .with_classes(["hw.pwr.*"]); /// ``` /// -/// Note: JSON deserialization validates the start_time/end_time constraints -/// using `TryFrom`. #[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize)] -#[serde(try_from = "EreportFiltersParams")] pub struct EreportFilters { - /// If present, include only ereports that were collected at the specified - /// timestamp or later. - /// - /// If `end_time` is also present, this value *must* be at or before - /// `end_time`. This invariant is enforced by [`Self::with_start_time`]. - start_time: Option>, - /// If present, include only ereports that were collected at the specified - /// timestamp or before. - /// - /// If `start_time` is also present, this value *must* be at or after - /// `start_time`. This invariant is enforced by [`Self::with_end_time`]. - end_time: Option>, /// If this list is non-empty, include only ereports that were reported by /// systems with the provided serial numbers. only_serials: Vec, @@ -265,40 +249,6 @@ pub struct EreportFilters { only_classes: Vec, } -/// Helper for [`EreportFilters`] that validates the `start_time <= end_time` -/// invariant. Used by serde deserialization above, as well as in conversions -/// from Diesel `Ereports` model types. -#[derive(Deserialize)] -#[must_use = "this struct does nothing unless converted to EreportFilters"] -pub struct EreportFiltersParams { - pub start_time: Option>, - pub end_time: Option>, - pub only_serials: Vec, - pub only_classes: Vec, -} - -impl TryFrom for EreportFilters { - type Error = Error; - - fn try_from(params: EreportFiltersParams) -> Result { - let EreportFiltersParams { - start_time, - end_time, - only_serials, - only_classes, - } = params; - let mut f = Self::new(); - if let Some(t) = start_time { - f = f.with_start_time(t)?; - } - if let Some(t) = end_time { - f = f.with_end_time(t)?; - } - f = f.with_serials(only_serials).with_classes(only_classes); - Ok(f) - } -} - /// Displayer for pretty-printing [`EreportFilters`]. #[must_use = "this struct does nothing unless displayed"] pub struct DisplayEreportFilters<'a> { @@ -311,39 +261,6 @@ impl EreportFilters { Self::default() } - /// Includes only ereports collected at or after `time`. - /// - /// Returns an error if `time` is after a previously set end time. - pub fn with_start_time( - mut self, - time: DateTime, - ) -> Result { - if let Some(end) = self.end_time { - if time > end { - return Err(Error::invalid_request( - "start time must be before end time", - )); - } - } - self.start_time = Some(time); - Ok(self) - } - - /// Includes only ereports collected at or before `time`. - /// - /// Returns an error if `time` is before a previously set start time. - pub fn with_end_time(mut self, time: DateTime) -> Result { - if let Some(start) = self.start_time { - if start > time { - return Err(Error::invalid_request( - "start time must be before end time", - )); - } - } - self.end_time = Some(time); - Ok(self) - } - /// Adds serial numbers to the inclusion filter. /// /// When one or more serials are present, only ereports reported by @@ -368,16 +285,6 @@ impl EreportFilters { self } - /// Returns the start time filter, if set. - pub fn start_time(&self) -> Option> { - self.start_time - } - - /// Returns the end time filter, if set. - pub fn end_time(&self) -> Option> { - self.end_time - } - /// Returns the serial number inclusion filter. pub fn only_serials(&self) -> &[String] { &self.only_serials @@ -411,12 +318,6 @@ impl fmt::Display for DisplayEreportFilters<'_> { f.write_fmt(args) }; - if let Some(start) = filters.start_time() { - fmt_part(f, format_args!("start: {start}"))?; - } - if let Some(end) = filters.end_time() { - fmt_part(f, format_args!("end: {end}"))?; - } if !filters.only_serials().is_empty() { fmt_part( f, @@ -449,41 +350,17 @@ pub(crate) mod test_utils { use super::*; use proptest::prelude::*; - fn arb_datetime() -> impl Strategy> { - // Generate timestamps in a reasonable range (2020-2030). - (1577836800i64..1893456000i64) - .prop_map(|secs| DateTime::from_timestamp(secs, 0).unwrap()) - } - impl Arbitrary for EreportFilters { type Parameters = (); type Strategy = BoxedStrategy; fn arbitrary_with(_: Self::Parameters) -> Self::Strategy { ( - prop::option::of(arb_datetime()), - prop::option::of(arb_datetime()), prop::collection::vec(".*", 0..=3), prop::collection::vec(".*", 0..=3), ) - .prop_map(|(time_a, time_b, only_serials, only_classes)| { - // Ensure start <= end when both are present. - let (start_time, end_time) = match (time_a, time_b) { - (Some(a), Some(b)) if a > b => (Some(b), Some(a)), - other => other, - }; - let mut filters = EreportFilters::new(); - if let Some(t) = start_time { - filters = filters - .with_start_time(t) - .expect("no end time set yet"); - } - if let Some(t) = end_time { - filters = filters - .with_end_time(t) - .expect("start <= end by construction"); - } - filters + .prop_map(|(only_serials, only_classes)| { + EreportFilters::new() .with_serials(only_serials) .with_classes(only_classes) }) diff --git a/nexus/types/src/support_bundle.rs b/nexus/types/src/support_bundle.rs index 4c95dcb3233..c4dd07d5ef3 100644 --- a/nexus/types/src/support_bundle.rs +++ b/nexus/types/src/support_bundle.rs @@ -8,6 +8,8 @@ //! They are shared between the support bundle collector and FM case types. use crate::fm::ereport::EreportFilters; +use chrono::DateTime; +use chrono::Utc; use itertools::Itertools; use omicron_uuid_kinds::SledUuid; use serde::{Deserialize, Serialize}; @@ -49,6 +51,19 @@ pub enum BundleData { Ereports(EreportFilters), } +/// Inclusive time bound applied bundle-wide to time-bounded categories +/// (currently host-info logs and ereports). +/// +/// `None` on either side means unbounded on that side. Stored as a +/// single field on [`BundleDataSelection`] rather than smeared across +/// per-category filters; the `time_range()` accessor surfaces it to +/// consumers. +#[derive(Debug, Clone, Default, Eq, PartialEq, Serialize, Deserialize)] +pub struct BundleTimeRange { + pub start: Option>, + pub end: Option>, +} + impl BundleData { fn category(&self) -> BundleDataCategory { match self { @@ -77,8 +92,8 @@ impl fmt::Display for DisplayBundleData<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self.data { BundleData::Reconfigurator => write!(f, "reconfigurator"), - BundleData::HostInfo(selection) => { - write!(f, "host_info({})", selection.display()) + BundleData::HostInfo(sleds) => { + write!(f, "host_info({})", sleds.display()) } BundleData::SledCubbyInfo => write!(f, "sled_cubby_info"), BundleData::SpDumps => write!(f, "sp_dumps"), @@ -94,33 +109,38 @@ impl fmt::Display for DisplayBundleData<'_> { /// This wrapper ensures that categories and data always match - you can't /// insert (BundleDataCategory::Reconfigurator, BundleData::SpDumps) /// because each BundleData determines its own category. -#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)] +/// +/// `time_range`, when set, bounds every time-bounded category's +/// collection (host-info logs and ereports). Stored as one field +/// here rather than copied into per-category filters. +#[derive(Debug, Clone, Default, Eq, PartialEq, Serialize, Deserialize)] pub struct BundleDataSelection { data: HashMap, + time_range: Option, } impl BundleDataSelection { /// Creates an empty selection with no data categories. pub fn new() -> Self { - Self { data: HashMap::new() } + Self::default() } /// Returns a selection containing all default data categories - /// (i.e. "collect everything"). + /// (i.e. "collect everything") with a bundle-wide 7-day time + /// window applied to host-info logs and ereports. pub fn all() -> Self { Self::new() .with_reconfigurator() .with_all_sleds() .with_sled_cubby_info() .with_sp_dumps() - .with_ereports( - EreportFilters::new() - .with_start_time( - omicron_common::now_db_precision() - - chrono::Days::new(7), - ) - .expect("no end time set, cannot fail"), - ) + .with_ereports(EreportFilters::new()) + .with_time_range(BundleTimeRange { + start: Some( + omicron_common::now_db_precision() - chrono::Days::new(7), + ), + end: None, + }) } /// Adds reconfigurator state collection. @@ -164,12 +184,26 @@ impl BundleDataSelection { self } + /// Sets the bundle-wide time range. Affects every time-bounded + /// category (host-info logs and ereports) at collection time. + pub fn with_time_range(mut self, range: BundleTimeRange) -> Self { + self.time_range = Some(range); + self + } + /// Inserts a [`BundleData`] value. If a value with the same category /// already exists, the last write wins. pub fn insert(&mut self, bundle_data: BundleData) { self.data.insert(bundle_data.category(), bundle_data); } + /// Sets the bundle-wide time range in place (used by code paths + /// that build the selection incrementally, e.g. database read + /// paths). + pub fn set_time_range(&mut self, range: Option) { + self.time_range = range; + } + /// Returns `true` if reconfigurator state should be collected. pub fn contains_reconfigurator(&self) -> bool { self.data.contains_key(&BundleDataCategory::Reconfigurator) @@ -202,6 +236,12 @@ impl BundleDataSelection { _ => None, } } + + /// Returns the bundle-wide time range, if any was set. Applies + /// to every time-bounded category at collection time. + pub fn time_range(&self) -> Option<&BundleTimeRange> { + self.time_range.as_ref() + } } impl IntoIterator for BundleDataSelection { @@ -260,12 +300,6 @@ impl fmt::Display for DisplayBundleDataSelection<'_> { } } -impl Default for BundleDataSelection { - fn default() -> Self { - Self::new() - } -} - /// The set of sleds to include. This can either be all sleds, or a set of /// specific sleds. #[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)] @@ -311,13 +345,41 @@ pub(crate) mod test_utils { use super::*; use proptest::prelude::*; + fn arb_datetime() -> impl Strategy> { + // Span the full representable range of `DateTime` so + // round-trip tests exercise far-past and far-future times, + // not just a hand-picked window that drifts out of date. + let min = DateTime::::MIN_UTC.timestamp(); + let max = DateTime::::MAX_UTC.timestamp(); + (min..=max).prop_map(|secs| DateTime::from_timestamp(secs, 0).unwrap()) + } + + impl Arbitrary for BundleTimeRange { + type Parameters = (); + type Strategy = BoxedStrategy; + + fn arbitrary_with(_: Self::Parameters) -> Self::Strategy { + (prop::option::of(arb_datetime()), prop::option::of(arb_datetime())) + .prop_map(|(start, end)| BundleTimeRange { start, end }) + .boxed() + } + } + impl Arbitrary for BundleDataSelection { type Parameters = (); type Strategy = BoxedStrategy; fn arbitrary_with(_: Self::Parameters) -> Self::Strategy { - prop::collection::vec(any::(), 0..=5) - .prop_map(|data| data.into_iter().collect()) + ( + prop::collection::vec(any::(), 0..=5), + prop::option::of(any::()), + ) + .prop_map(|(data, time_range)| { + let mut sel: BundleDataSelection = + data.into_iter().collect(); + sel.set_time_range(time_range); + sel + }) .boxed() } } diff --git a/openapi/sled-agent/sled-agent-37.0.0-a1f825.json.gitstub b/openapi/sled-agent/sled-agent-37.0.0-a1f825.json.gitstub new file mode 100644 index 00000000000..8352e4453ff --- /dev/null +++ b/openapi/sled-agent/sled-agent-37.0.0-a1f825.json.gitstub @@ -0,0 +1 @@ +71da5e7d49981b8b676a4737e0d93f5765450b80:openapi/sled-agent/sled-agent-37.0.0-a1f825.json diff --git a/openapi/sled-agent/sled-agent-37.0.0-a1f825.json b/openapi/sled-agent/sled-agent-38.0.0-1db2eb.json similarity index 99% rename from openapi/sled-agent/sled-agent-37.0.0-a1f825.json rename to openapi/sled-agent/sled-agent-38.0.0-1db2eb.json index d479abd971d..4f5b6777cbf 100644 --- a/openapi/sled-agent/sled-agent-37.0.0-a1f825.json +++ b/openapi/sled-agent/sled-agent-38.0.0-1db2eb.json @@ -7,7 +7,7 @@ "url": "https://oxide.computer", "email": "api@oxide.computer" }, - "version": "37.0.0" + "version": "38.0.0" }, "paths": { "/artifacts": { @@ -704,16 +704,36 @@ "type": "string" } }, + { + "in": "query", + "name": "end_time", + "description": "Upper bound (inclusive) on log file `mtime`. If absent, no upper bound is applied.", + "schema": { + "nullable": true, + "type": "string", + "format": "date-time" + } + }, { "in": "query", "name": "max_rotated", - "description": "The max number of rotated logs to include in the final support bundle", - "required": true, + "description": "The max number of rotated logs to include in the final support bundle. If absent, no count cap is applied.", "schema": { + "nullable": true, "type": "integer", "format": "uint", "minimum": 0 } + }, + { + "in": "query", + "name": "start_time", + "description": "Lower bound (inclusive) on log file `mtime`. If absent, no lower bound is applied.", + "schema": { + "nullable": true, + "type": "string", + "format": "date-time" + } } ], "responses": { diff --git a/openapi/sled-agent/sled-agent-latest.json b/openapi/sled-agent/sled-agent-latest.json index ccb5f6634a4..d78b9ad95bf 120000 --- a/openapi/sled-agent/sled-agent-latest.json +++ b/openapi/sled-agent/sled-agent-latest.json @@ -1 +1 @@ -sled-agent-37.0.0-a1f825.json \ No newline at end of file +sled-agent-38.0.0-1db2eb.json \ No newline at end of file diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index caa7558304a..79f3634c237 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -3201,13 +3201,26 @@ CREATE TABLE IF NOT EXISTS omicron.public.support_bundle_data_selection_host_inf CREATE TABLE IF NOT EXISTS omicron.public.support_bundle_data_selection_ereports ( bundle_id UUID NOT NULL, - start_time TIMESTAMPTZ, - end_time TIMESTAMPTZ, only_serials TEXT[] NOT NULL DEFAULT ARRAY[], only_classes TEXT[] NOT NULL DEFAULT ARRAY[], + PRIMARY KEY (bundle_id) +); + +/* + * Bundle-wide time bound applied to time-bounded categories + * (host-info logs and ereports). Persisted at the bundle level + * rather than per-category; absent rows imply no time bound. + */ +CREATE TABLE IF NOT EXISTS omicron.public.support_bundle_data_selection_time_range ( + bundle_id UUID NOT NULL, + start_time TIMESTAMPTZ, + end_time TIMESTAMPTZ, + PRIMARY KEY (bundle_id), - CHECK (start_time IS NULL OR end_time IS NULL OR start_time <= end_time) + CONSTRAINT start_before_end CHECK ( + start_time IS NULL OR end_time IS NULL OR start_time <= end_time + ) ); /*******************************************************************/ @@ -7718,13 +7731,27 @@ CREATE TABLE IF NOT EXISTS omicron.public.fm_support_bundle_request_data_selecti CREATE TABLE IF NOT EXISTS omicron.public.fm_support_bundle_request_data_selection_ereports ( sitrep_id UUID NOT NULL, request_id UUID NOT NULL, - start_time TIMESTAMPTZ, - end_time TIMESTAMPTZ, only_serials TEXT[] NOT NULL DEFAULT ARRAY[], only_classes TEXT[] NOT NULL DEFAULT ARRAY[], + PRIMARY KEY (sitrep_id, request_id) +); + +/* + * Bundle-wide time bound for FM-driven support bundle requests. + * Mirrors `support_bundle_data_selection_time_range` but keyed by + * the FM (sitrep_id, request_id) tuple. + */ +CREATE TABLE IF NOT EXISTS omicron.public.fm_support_bundle_request_data_selection_time_range ( + sitrep_id UUID NOT NULL, + request_id UUID NOT NULL, + start_time TIMESTAMPTZ, + end_time TIMESTAMPTZ, + PRIMARY KEY (sitrep_id, request_id), - CHECK (start_time IS NULL OR end_time IS NULL OR start_time <= end_time) + CONSTRAINT start_before_end CHECK ( + start_time IS NULL OR end_time IS NULL OR start_time <= end_time + ) ); /* @@ -8557,7 +8584,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '256.0.0', NULL) + (TRUE, NOW(), NOW(), '257.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/support-bundle-time-range/up1.sql b/schema/crdb/support-bundle-time-range/up1.sql new file mode 100644 index 00000000000..d07032ebc58 --- /dev/null +++ b/schema/crdb/support-bundle-time-range/up1.sql @@ -0,0 +1,10 @@ +CREATE TABLE IF NOT EXISTS omicron.public.support_bundle_data_selection_time_range ( + bundle_id UUID NOT NULL, + start_time TIMESTAMPTZ, + end_time TIMESTAMPTZ, + + PRIMARY KEY (bundle_id), + CONSTRAINT start_before_end CHECK ( + start_time IS NULL OR end_time IS NULL OR start_time <= end_time + ) +); diff --git a/schema/crdb/support-bundle-time-range/up2.sql b/schema/crdb/support-bundle-time-range/up2.sql new file mode 100644 index 00000000000..880a8516666 --- /dev/null +++ b/schema/crdb/support-bundle-time-range/up2.sql @@ -0,0 +1,11 @@ +CREATE TABLE IF NOT EXISTS omicron.public.fm_support_bundle_request_data_selection_time_range ( + sitrep_id UUID NOT NULL, + request_id UUID NOT NULL, + start_time TIMESTAMPTZ, + end_time TIMESTAMPTZ, + + PRIMARY KEY (sitrep_id, request_id), + CONSTRAINT start_before_end CHECK ( + start_time IS NULL OR end_time IS NULL OR start_time <= end_time + ) +); diff --git a/schema/crdb/support-bundle-time-range/up3.sql b/schema/crdb/support-bundle-time-range/up3.sql new file mode 100644 index 00000000000..149a7383a9c --- /dev/null +++ b/schema/crdb/support-bundle-time-range/up3.sql @@ -0,0 +1,23 @@ +-- Migrate any pre-existing time bounds from the per-category ereports +-- tables into the new bundle-level time_range tables. Idempotent via +-- ON CONFLICT DO NOTHING (the new tables are keyed by the same id(s) +-- as the source rows). +-- +-- Reading every row of the ereports tables is the point here, so +-- override CockroachDB's full-table-scan guardrail for this txn. + +SET LOCAL disallow_full_table_scans = off; + +INSERT INTO omicron.public.support_bundle_data_selection_time_range + (bundle_id, start_time, end_time) +SELECT bundle_id, start_time, end_time +FROM omicron.public.support_bundle_data_selection_ereports +WHERE start_time IS NOT NULL OR end_time IS NOT NULL +ON CONFLICT (bundle_id) DO NOTHING; + +INSERT INTO omicron.public.fm_support_bundle_request_data_selection_time_range + (sitrep_id, request_id, start_time, end_time) +SELECT sitrep_id, request_id, start_time, end_time +FROM omicron.public.fm_support_bundle_request_data_selection_ereports +WHERE start_time IS NOT NULL OR end_time IS NOT NULL +ON CONFLICT (sitrep_id, request_id) DO NOTHING; diff --git a/schema/crdb/support-bundle-time-range/up4.sql b/schema/crdb/support-bundle-time-range/up4.sql new file mode 100644 index 00000000000..ac370e165eb --- /dev/null +++ b/schema/crdb/support-bundle-time-range/up4.sql @@ -0,0 +1,7 @@ +-- Drop start_time from the support_bundle ereports table. CASCADE +-- removes the dependent (anonymous) CHECK constraint on +-- start_time/end_time, which is no longer relevant once the time +-- columns live in support_bundle_data_selection_time_range. + +ALTER TABLE omicron.public.support_bundle_data_selection_ereports + DROP COLUMN IF EXISTS start_time CASCADE; diff --git a/schema/crdb/support-bundle-time-range/up5.sql b/schema/crdb/support-bundle-time-range/up5.sql new file mode 100644 index 00000000000..f9e594245ce --- /dev/null +++ b/schema/crdb/support-bundle-time-range/up5.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.support_bundle_data_selection_ereports + DROP COLUMN IF EXISTS end_time; diff --git a/schema/crdb/support-bundle-time-range/up6.sql b/schema/crdb/support-bundle-time-range/up6.sql new file mode 100644 index 00000000000..e12b991eaf3 --- /dev/null +++ b/schema/crdb/support-bundle-time-range/up6.sql @@ -0,0 +1,5 @@ +-- Drop start_time from the FM ereports table. CASCADE removes the +-- dependent (anonymous) CHECK constraint. + +ALTER TABLE omicron.public.fm_support_bundle_request_data_selection_ereports + DROP COLUMN IF EXISTS start_time CASCADE; diff --git a/schema/crdb/support-bundle-time-range/up7.sql b/schema/crdb/support-bundle-time-range/up7.sql new file mode 100644 index 00000000000..a7a0940bfda --- /dev/null +++ b/schema/crdb/support-bundle-time-range/up7.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.fm_support_bundle_request_data_selection_ereports + DROP COLUMN IF EXISTS end_time; diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs index 501ddfc322d..d5986fb876e 100644 --- a/sled-agent/api/src/lib.rs +++ b/sled-agent/api/src/lib.rs @@ -38,6 +38,7 @@ api_versions!([ // | example for the next person. // v // (next_int, IDENT), + (38, ADD_LOG_TIME_RANGE), (37, MODIFY_SVC_ENABLED_NOT_ONLINE_STATE), (36, DROPSHOT_FREEFORM_BODY_DESC), (35, INLINE_ROUTER_PEER_IP_ADDR), @@ -1332,6 +1333,7 @@ pub trait SledAgentApi { #[endpoint { method = GET, path = "/support/logs/download/{zone}", + versions = VERSION_ADD_LOG_TIME_RANGE.., }] async fn support_logs_download( request_context: RequestContext, @@ -1343,6 +1345,25 @@ pub trait SledAgentApi { >, ) -> Result, HttpError>; + /// Pre-`VERSION_ADD_LOG_TIME_RANGE` shape of the zone-logs download + /// endpoint: takes only `max_rotated`, with no time-range query + /// parameters. Newer clients use the entry above. + #[endpoint { + operation_id = "support_logs_download", + method = GET, + path = "/support/logs/download/{zone}", + versions = ..VERSION_ADD_LOG_TIME_RANGE, + }] + async fn support_logs_download_v1( + request_context: RequestContext, + path_params: Path< + v1::diagnostics::SledDiagnosticsLogsDownloadPathParam, + >, + query_params: Query< + v1::diagnostics::SledDiagnosticsLogsDownloadQueryParam, + >, + ) -> Result, HttpError>; + /// This endpoint reports the status of the `destroy_orphaned_datasets` /// chicken switch. It will be removed with omicron#6177. #[endpoint { diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 2eb525d3e53..6b92548ae9a 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -1430,12 +1430,48 @@ impl SledAgentApi for SledAgentImpl { let sa = request_context.context(); let SledDiagnosticsLogsDownloadPathParam { zone } = path_params.into_inner(); - let SledDiagnosticsLogsDownloadQueryParam { max_rotated } = - query_params.into_inner(); + let SledDiagnosticsLogsDownloadQueryParam { + max_rotated, + start_time, + end_time, + } = query_params.into_inner(); + let window = sled_diagnostics::LogTimeWindow { + start: start_time, + end: end_time, + }; sa.latencies() .instrument_dropshot_handler(&request_context, async { sa.as_support_bundle_logs() - .get_logs_for_zone(zone, max_rotated) + .get_logs_for_zone(zone, max_rotated, window) + .await + .map_err(HttpError::from) + }) + .await + } + + async fn support_logs_download_v1( + request_context: RequestContext, + path_params: Path< + v1::diagnostics::SledDiagnosticsLogsDownloadPathParam, + >, + query_params: Query< + v1::diagnostics::SledDiagnosticsLogsDownloadQueryParam, + >, + ) -> Result, HttpError> { + let sa = request_context.context(); + let v1::diagnostics::SledDiagnosticsLogsDownloadPathParam { zone } = + path_params.into_inner(); + let v1::diagnostics::SledDiagnosticsLogsDownloadQueryParam { + max_rotated, + } = query_params.into_inner(); + sa.latencies() + .instrument_dropshot_handler(&request_context, async { + sa.as_support_bundle_logs() + .get_logs_for_zone( + zone, + Some(max_rotated), + sled_diagnostics::LogTimeWindow::default(), + ) .await .map_err(HttpError::from) }) diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index 8e65a9a4574..2ed7fc647d7 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -997,18 +997,61 @@ impl SledAgentApi for SledAgentSimImpl { } async fn support_logs( - _request_context: RequestContext, + request_context: RequestContext, ) -> Result>, HttpError> { - // Return an empty zone list for testing. - Ok(HttpResponseOk(Default::default())) + let sa = request_context.context(); + // Return whatever zones tests have injected entries for. By + // default this is empty. + let zones = + sa.support_logs.lock().unwrap().keys().cloned().collect::>(); + Ok(HttpResponseOk(zones)) } async fn support_logs_download( - _request_context: RequestContext, - _path_params: Path, - _query_params: Query, + request_context: RequestContext, + path_params: Path, + query_params: Query, ) -> Result, HttpError> { - method_unimplemented() + let sa = request_context.context(); + let SledDiagnosticsLogsDownloadPathParam { zone } = + path_params.into_inner(); + let SledDiagnosticsLogsDownloadQueryParam { + max_rotated, + start_time, + end_time, + } = query_params.into_inner(); + super::sim_support_logs::serve_zip( + sa, + &zone, + max_rotated, + sled_diagnostics::LogTimeWindow { + start: start_time, + end: end_time, + }, + ) + } + + async fn support_logs_download_v1( + request_context: RequestContext, + path_params: Path< + v1::diagnostics::SledDiagnosticsLogsDownloadPathParam, + >, + query_params: Query< + v1::diagnostics::SledDiagnosticsLogsDownloadQueryParam, + >, + ) -> Result, HttpError> { + let sa = request_context.context(); + let v1::diagnostics::SledDiagnosticsLogsDownloadPathParam { zone } = + path_params.into_inner(); + let v1::diagnostics::SledDiagnosticsLogsDownloadQueryParam { + max_rotated, + } = query_params.into_inner(); + super::sim_support_logs::serve_zip( + sa, + &zone, + Some(max_rotated), + sled_diagnostics::LogTimeWindow::default(), + ) } async fn chicken_switch_destroy_orphaned_datasets_get_v1( diff --git a/sled-agent/src/sim/mod.rs b/sled-agent/src/sim/mod.rs index e00e964e600..aee92a7533f 100644 --- a/sled-agent/src/sim/mod.rs +++ b/sled-agent/src/sim/mod.rs @@ -12,6 +12,7 @@ mod http_entrypoints_pantry; mod http_entrypoints_storage; mod instance; mod server; +mod sim_support_logs; mod simulatable; mod sled_agent; mod storage; @@ -23,6 +24,7 @@ pub use config::{ TEST_HARDWARE_THREADS, TEST_RESERVOIR_RAM, ZpoolConfig, }; pub use server::{RssArgs, Server, run_standalone_server}; +pub use sled_agent::SimLogEntry; pub use sled_agent::SledAgent; pub use storage::PantryServer; pub(crate) use storage::Storage; diff --git a/sled-agent/src/sim/sim_support_logs.rs b/sled-agent/src/sim/sim_support_logs.rs new file mode 100644 index 00000000000..55f97bf492e --- /dev/null +++ b/sled-agent/src/sim/sim_support_logs.rs @@ -0,0 +1,78 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Simulated implementation of the sled-agent's `support_logs_download` +//! endpoint. +//! +//! Tests inject [`super::SimLogEntry`] entries via +//! `SledAgent::insert_support_log`. This module renders the entries +//! for a given zone into an in-memory zip, applying the same +//! `start_time`/`end_time`/`max_rotated` filters as the real +//! handler. + +use super::sled_agent::SledAgent; +use dropshot::HttpError; +use sled_diagnostics::LogTimeWindow; +use std::io::Write; + +/// Build a zip of synthetic log entries for `zone` and return it as +/// an `application/zip` HTTP response. Entries with mtime outside the +/// requested `window` are excluded; if `max_rotated` is `Some(n)`, no +/// more than `n` entries are returned (after time filtering, sorted +/// newest-first). +pub(super) fn serve_zip( + sa: &SledAgent, + zone: &str, + max_rotated: Option, + window: LogTimeWindow, +) -> Result, HttpError> { + let mut entries = { + let logs = sa.support_logs.lock().unwrap(); + logs.get(zone).cloned().unwrap_or_default() + }; + + entries.retain(|e| { + if let Some(start) = window.start { + if e.mtime < start { + return false; + } + } + if let Some(end) = window.end { + if e.mtime > end { + return false; + } + } + true + }); + + // Sort newest-first so the count cap takes the most recent. + entries.sort_by(|a, b| b.mtime.cmp(&a.mtime)); + if let Some(n) = max_rotated { + entries.truncate(n); + } + + let mut buf = Vec::new(); + { + let mut zip = zip::ZipWriter::new(std::io::Cursor::new(&mut buf)); + for entry in &entries { + let opts: zip::write::FileOptions<'_, ()> = + zip::write::FileOptions::default() + .compression_method(zip::CompressionMethod::Stored); + zip.start_file(&entry.filename, opts).map_err(|err| { + HttpError::for_internal_error(err.to_string()) + })?; + zip.write_all(&entry.contents).map_err(|err| { + HttpError::for_internal_error(err.to_string()) + })?; + } + zip.finish() + .map_err(|err| HttpError::for_internal_error(err.to_string()))?; + } + + Ok(http::Response::builder() + .status(http::StatusCode::OK) + .header(http::header::CONTENT_TYPE, "application/zip") + .body(dropshot::Body::from(buf)) + .unwrap()) +} diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 384a1ed6ea7..75452f2b33b 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -20,6 +20,7 @@ use anyhow::Context; use anyhow::bail; use bootstore::schemes::v0 as bootstore; use bytes::Bytes; +use chrono::DateTime; use chrono::Utc; use dropshot::Body; use dropshot::HttpError; @@ -79,6 +80,21 @@ use std::time::Duration; use tufaceous_artifact::ArtifactHash; use uuid::Uuid; +/// A synthetic log file the simulated sled-agent will return from +/// `support_logs_download`. +/// +/// Tests construct these with controlled mtimes to exercise the +/// support-bundle log time-range filtering end-to-end. +#[derive(Clone, Debug)] +pub struct SimLogEntry { + /// File name to write inside the per-zone zip. + pub filename: String, + /// File contents. + pub contents: Vec, + /// Modification time used for the time-range filter. + pub mtime: DateTime, +} + /// Simulates management of the control plane on a sled /// /// The current implementation simulates a server directly in this program. @@ -117,6 +133,12 @@ pub struct SledAgent { /// counter and return 503 Service Unavailable. local_storage_error_count: AtomicU32, pub bootstore_network_config: Mutex, + /// Test-only log entries keyed by zone name. Populated via + /// [`SledAgent::insert_support_log`]; consulted by the + /// `support_logs` / `support_logs_download` HTTP handlers. + /// Empty by default. + pub(crate) support_logs: + Mutex>>, pub(super) repo_depot: dropshot::HttpServer>, pub log: Logger, @@ -203,6 +225,7 @@ impl SledAgent { repo_depot, log, bootstore_network_config, + support_logs: Mutex::new(std::collections::BTreeMap::new()), health_monitor, }) } @@ -1094,6 +1117,26 @@ impl SledAgent { .map_err(|err| err.into()) } + /// Test helper: append a synthetic log entry that the sim + /// `support_logs_download` handler will return for `zone`. + pub fn insert_support_log( + &self, + zone: impl Into, + entry: SimLogEntry, + ) { + self.support_logs + .lock() + .unwrap() + .entry(zone.into()) + .or_default() + .push(entry); + } + + /// Test helper: drop all injected synthetic log entries. + pub fn clear_support_logs(&self) { + self.support_logs.lock().unwrap().clear(); + } + pub fn datasets_ensure( &self, config: DatasetsConfig, diff --git a/sled-agent/src/support_bundle/logs.rs b/sled-agent/src/support_bundle/logs.rs index bd158920f63..37de74d9f3b 100644 --- a/sled-agent/src/support_bundle/logs.rs +++ b/sled-agent/src/support_bundle/logs.rs @@ -10,6 +10,7 @@ use futures::StreamExt; use futures::stream::FuturesUnordered; use range_requests::make_get_response; use sled_agent_config_reconciler::AvailableDatasetsReceiver; +use sled_diagnostics::LogTimeWindow; use slog::Logger; use slog_error_chain::InlineErrorChain; use tokio::io::AsyncSeekExt; @@ -72,10 +73,15 @@ impl<'a> SupportBundleLogs<'a> { /// For a given zone and its services create a zip file of all logs /// found in that zone and stream it out via an `HttpResponse`. + /// + /// `max_rotated` caps the number of rotated files per service when + /// `Some(_)`. `window` bounds files by `mtime`; either side may be + /// `None` for "unbounded on that side." pub async fn get_logs_for_zone( &self, zone: Z, - max_rotated: usize, + max_rotated: Option, + window: LogTimeWindow, ) -> Result, Error> where Z: Into, @@ -88,7 +94,9 @@ impl<'a> SupportBundleLogs<'a> { let zip_file = { let handle = sled_diagnostics::LogsHandle::new(log); - match handle.get_zone_logs(&zone, max_rotated, &mut tempfile).await + match handle + .get_zone_logs(&zone, max_rotated, window, &mut tempfile) + .await { Ok(_) => Ok(tempfile), Err(e) => Err(e), diff --git a/sled-agent/types/versions/src/add_log_time_range/diagnostics.rs b/sled-agent/types/versions/src/add_log_time_range/diagnostics.rs new file mode 100644 index 00000000000..d7ea2c0b3b2 --- /dev/null +++ b/sled-agent/types/versions/src/add_log_time_range/diagnostics.rs @@ -0,0 +1,48 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Diagnostics types for Sled Agent API `ADD_LOG_TIME_RANGE`. + +use chrono::DateTime; +use chrono::Utc; +use schemars::JsonSchema; +use serde::Deserialize; + +use crate::v1; + +/// Query parameters for sled-diagnostics log download requests. +/// +/// `max_rotated` becomes optional in this version: callers using a +/// time-range bound typically don't want a count cap on top. +#[derive(Deserialize, JsonSchema)] +pub struct SledDiagnosticsLogsDownloadQueryParam { + /// The max number of rotated logs to include in the final support + /// bundle. If absent, no count cap is applied. + #[serde(default)] + pub max_rotated: Option, + + /// Lower bound (inclusive) on log file `mtime`. If absent, no + /// lower bound is applied. + #[serde(default)] + pub start_time: Option>, + + /// Upper bound (inclusive) on log file `mtime`. If absent, no + /// upper bound is applied. + #[serde(default)] + pub end_time: Option>, +} + +impl From + for SledDiagnosticsLogsDownloadQueryParam +{ + fn from( + old: v1::diagnostics::SledDiagnosticsLogsDownloadQueryParam, + ) -> Self { + Self { + max_rotated: Some(old.max_rotated), + start_time: None, + end_time: None, + } + } +} diff --git a/sled-agent/types/versions/src/add_log_time_range/mod.rs b/sled-agent/types/versions/src/add_log_time_range/mod.rs new file mode 100644 index 00000000000..e9035775756 --- /dev/null +++ b/sled-agent/types/versions/src/add_log_time_range/mod.rs @@ -0,0 +1,13 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Version `ADD_LOG_TIME_RANGE` of the Sled Agent API. +//! +//! This version reshapes the `support_logs_download` endpoint's query +//! parameters to support time-bounded log collection: `max_rotated` +//! becomes optional, and `start_time`/`end_time` are added so callers +//! can request only logs whose `mtime` falls within the supplied +//! window. + +pub mod diagnostics; diff --git a/sled-agent/types/versions/src/latest.rs b/sled-agent/types/versions/src/latest.rs index 7ceb9d668c9..97dbd50f909 100644 --- a/sled-agent/types/versions/src/latest.rs +++ b/sled-agent/types/versions/src/latest.rs @@ -41,7 +41,7 @@ pub mod debug { pub mod diagnostics { pub use crate::v1::diagnostics::SledDiagnosticsLogsDownloadPathParam; pub use crate::v1::diagnostics::SledDiagnosticsLogsDownloadPathParm; - pub use crate::v1::diagnostics::SledDiagnosticsLogsDownloadQueryParam; + pub use crate::v38::diagnostics::SledDiagnosticsLogsDownloadQueryParam; } pub mod disk { diff --git a/sled-agent/types/versions/src/lib.rs b/sled-agent/types/versions/src/lib.rs index 8b8fc8071c0..40ce26f39aa 100644 --- a/sled-agent/types/versions/src/lib.rs +++ b/sled-agent/types/versions/src/lib.rs @@ -81,6 +81,8 @@ pub mod v33; pub mod v34; #[path = "modify_svc_enabled_not_online_state/mod.rs"] pub mod v37; +#[path = "add_log_time_range/mod.rs"] +pub mod v38; #[path = "add_nexus_lockstep_port_to_inventory/mod.rs"] pub mod v4; #[path = "add_probe_put_endpoint/mod.rs"] diff --git a/sled-diagnostics/src/lib.rs b/sled-diagnostics/src/lib.rs index 659468051a5..420c8fba238 100644 --- a/sled-diagnostics/src/lib.rs +++ b/sled-diagnostics/src/lib.rs @@ -22,7 +22,7 @@ cfg_if::cfg_if! { } pub mod logs; -pub use logs::{LogError, LogsHandle}; +pub use logs::{LogError, LogTimeWindow, LogsHandle}; mod queries; pub use crate::queries::{ diff --git a/sled-diagnostics/src/logs.rs b/sled-diagnostics/src/logs.rs index f2d4b1b36d4..5e399f1d47c 100644 --- a/sled-diagnostics/src/logs.rs +++ b/sled-diagnostics/src/logs.rs @@ -11,6 +11,8 @@ use std::{ }; use camino::{Utf8Path, Utf8PathBuf}; +use chrono::DateTime; +use chrono::Utc; use fs_err::File; use illumos_utils::zfs::{ CreateSnapshotError, DestroySnapshotError, GetValueError, @@ -566,7 +568,8 @@ impl LogsHandle { pub async fn get_zone_logs( &self, zone: &str, - max_rotated: usize, + max_rotated: Option, + window: LogTimeWindow, writer: &mut W, ) -> Result<(), LogError> { // We are opting to use oxlog to find logs rather than using a similar @@ -575,6 +578,14 @@ impl LogsHandle { // internal structures like a list of running zones. Instead we operate // on all of the log paths that oxlog is capable of discovering via the // filesystem directly. + // + // NOTE: We deliberately pass `date_range: None` to oxlog and apply + // the time-range filter ourselves below. oxlog's filter would + // exclude "current" SMF log files whose mtime falls outside the + // window — but the active log file is exactly the place fresh + // entries land, so we always want it included regardless of the + // requested window. Filtering here lets us keep current logs + // unconditionally and gate only the archived/extra files. let zones = oxlog::Zones::load().map_err(|e| LogError::OxLog(e))?; let zone_logs = zones.zone_logs( zone, @@ -599,10 +610,13 @@ impl LogsHandle { // we'll leak snapshots. let mut log_snapshots = LogSnapshots::new(); + let mtime_range = window.to_mtime_range(); + let result = self .get_zone_logs_inner( zone_logs, max_rotated, + mtime_range, zip, &mut log_snapshots, ) @@ -616,12 +630,19 @@ impl LogsHandle { async fn get_zone_logs_inner( &self, zone_logs: BTreeMap, - max_rotated: usize, + max_rotated: Option, + mtime_range: MtimeRange, mut zip: zip::ZipWriter, mut log_snapshots: &mut LogSnapshots, ) -> Result<(), LogError> { for (service, service_logs) in zone_logs { // - Grab all of the service's SMF logs - + + // The "current" log is always included, regardless of mtime + // bounds — the file is where fresh entries land, and a + // long-quiet but still-active service shouldn't lose its + // current log just because its last write predates the + // window. if let Some(current) = service_logs.current { self.process_logs( &service, @@ -643,6 +664,7 @@ impl LogsHandle { .archived .into_iter() .filter(|log| log.path.as_str().contains("crypt/debug")) + .filter(|log| mtime_range.contains(log.modified)) .collect(); // Since these logs can be spread out across multiple U.2 devices @@ -655,12 +677,19 @@ impl LogsHandle { .unwrap_or(0) }); - for file in archived.iter().rev().take(max_rotated) { + // Apply the count cap (`max_rotated`) only when the caller + // specified one. With a time range and no count cap, take + // everything in the window. + let mut to_process: Vec<&LogFile> = archived.iter().rev().collect(); + if let Some(n) = max_rotated { + to_process.truncate(n); + } + for file in to_process { self.process_logs( &service, &mut zip, &mut log_snapshots, - &file, + file, LogType::Archive, ) .await?; @@ -692,8 +721,18 @@ impl LogsHandle { .await?; } - // We clamp the number of rotated logs we grab to 5. - for log in logs.rotated.iter().rev().take(max_rotated) { + // Apply mtime + count cap to rotated extras. + let mut rotated: Vec<&LogFile> = logs + .rotated + .iter() + .copied() + .rev() + .filter(|log| mtime_range.contains(log.modified)) + .collect(); + if let Some(n) = max_rotated { + rotated.truncate(n); + } + for log in rotated { self.process_logs( &service, &mut zip, @@ -712,6 +751,60 @@ impl LogsHandle { } } +/// Inclusive log-collection time window expressed in chrono terms. +/// +/// Bundles `start_time`/`end_time` into one type so callers can't +/// accidentally swap them at the call site — the two fields have the +/// same type and same `Option`-ness, and would otherwise be adjacent +/// positional arguments. +#[derive(Clone, Copy, Debug, Default)] +pub struct LogTimeWindow { + pub start: Option>, + pub end: Option>, +} + +impl LogTimeWindow { + fn to_mtime_range(self) -> MtimeRange { + MtimeRange { + start: self.start.map(chrono_to_jiff), + end: self.end.map(chrono_to_jiff), + } + } +} + +/// Inclusive `mtime` window applied to archived and extra log files. +#[derive(Clone, Copy, Debug, Default)] +struct MtimeRange { + start: Option, + end: Option, +} + +impl MtimeRange { + /// Returns true if a file with the given `modified` time should be + /// included. Files with unknown mtime (`None`) are included — over- + /// inclusive matches the calling pattern. + fn contains(&self, modified: Option) -> bool { + let Some(mtime) = modified else { + return true; + }; + if let Some(start) = self.start { + if mtime < start { + return false; + } + } + if let Some(end) = self.end { + if mtime > end { + return false; + } + } + true + } +} + +fn chrono_to_jiff(ts: DateTime) -> jiff::Timestamp { + jiff::Timestamp::from_second(ts.timestamp()).unwrap_or(jiff::Timestamp::MIN) +} + fn write_log_to_zip( logger: &Logger, service: &str, diff --git a/support-bundle-collection/Cargo.toml b/support-bundle-collection/Cargo.toml new file mode 100644 index 00000000000..6b892921db7 --- /dev/null +++ b/support-bundle-collection/Cargo.toml @@ -0,0 +1,47 @@ +[package] +name = "support-bundle-collection" +version = "0.1.0" +edition.workspace = true + +[lints] +workspace = true + +[build-dependencies] +omicron-rpaths.workspace = true + +[dependencies] +anyhow.workspace = true +base64.workspace = true +camino.workspace = true +camino-tempfile.workspace = true +chrono.workspace = true +dropshot.workspace = true +futures.workspace = true +gateway-client.workspace = true +gateway-types.workspace = true +internal-dns-resolver.workspace = true +internal-dns-types.workspace = true +jiff.workspace = true +nexus-db-model.workspace = true +nexus-db-queries.workspace = true +nexus-networking.workspace = true +nexus-reconfigurator-preparation.workspace = true +nexus-types.workspace = true +omicron-common.workspace = true +omicron-uuid-kinds.workspace = true +parallel-task-set.workspace = true +# See omicron-rpaths for more about the "pq-sys" dependency. +pq-sys = "*" +serde.workspace = true +serde_json.workspace = true +sha2.workspace = true +sled-agent-client.workspace = true +slog.workspace = true +slog-error-chain.workspace = true +tokio.workspace = true +tokio-util.workspace = true +tufaceous-artifact.workspace = true +uuid.workspace = true +zip.workspace = true + +omicron-workspace-hack.workspace = true diff --git a/nexus/src/app/background/tasks/support_bundle/README.md b/support-bundle-collection/README.md similarity index 100% rename from nexus/src/app/background/tasks/support_bundle/README.md rename to support-bundle-collection/README.md diff --git a/nexus/src/app/background/tasks/support_bundle/mod.rs b/support-bundle-collection/build.rs similarity index 50% rename from nexus/src/app/background/tasks/support_bundle/mod.rs rename to support-bundle-collection/build.rs index ba8fb9e4cab..1ba9acd41c9 100644 --- a/nexus/src/app/background/tasks/support_bundle/mod.rs +++ b/support-bundle-collection/build.rs @@ -2,10 +2,9 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Support bundle related types and utilities - -mod cache; -pub mod collection; -pub mod perfetto; -mod step; -mod steps; +// See omicron-rpaths for documentation. +// NOTE: This file MUST be kept in sync with the other build.rs files in this +// repository. +fn main() { + omicron_rpaths::configure_default_omicron_rpaths(); +} diff --git a/nexus/src/app/background/tasks/support_bundle/cache.rs b/support-bundle-collection/src/cache.rs similarity index 97% rename from nexus/src/app/background/tasks/support_bundle/cache.rs rename to support-bundle-collection/src/cache.rs index 314345c64b7..60f87919a5d 100644 --- a/nexus/src/app/background/tasks/support_bundle/cache.rs +++ b/support-bundle-collection/src/cache.rs @@ -7,7 +7,7 @@ //! This is used to share data which may be used by multiple //! otherwise independent steps. -use crate::app::background::tasks::support_bundle::collection::BundleCollection; +use crate::collection::BundleCollection; use gateway_client::Client as MgsClient; use internal_dns_types::names::ServiceName; diff --git a/support-bundle-collection/src/collection.rs b/support-bundle-collection/src/collection.rs new file mode 100644 index 00000000000..f3a3fb9e8a7 --- /dev/null +++ b/support-bundle-collection/src/collection.rs @@ -0,0 +1,350 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! The mechanism layer of support bundle collection. +//! +//! Given a datastore handle, internal-DNS resolver, [`OpContext`], a +//! [`BundleDataSelection`], a synthesized [`BundleInfo`] (id + +//! reason_for_creation), and an output directory, gather data into the +//! directory and produce a [`SupportBundleCollectionReport`]. +//! +//! This layer never reads the `support_bundle` table, never transfers +//! data to a sled-agent's bundle storage endpoints, and never polls +//! bundle state. Those responsibilities belong to the caller. + +use crate::cache::Cache; +use crate::perfetto; +use crate::step::CollectionStep; +use crate::steps; +use nexus_types::support_bundle::BundleDataSelection; + +use anyhow::Context; +use camino_tempfile::Utf8TempDir; +use internal_dns_resolver::Resolver; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_types::internal_api::background::SupportBundleCollectionReport; +use omicron_uuid_kinds::SupportBundleUuid; +use parallel_task_set::ParallelTaskSet; +use serde_json::json; +use std::sync::Arc; +use tokio_util::sync::CancellationToken; + +/// Minimal bundle metadata needed by the mechanism layer. +/// +/// Built by the caller — either drawn from an existing `support_bundle` +/// DB row or synthesized fresh. +#[derive(Clone, Debug)] +pub struct BundleInfo { + pub id: SupportBundleUuid, + pub reason_for_creation: String, +} + +/// Wraps up all arguments to perform a single support bundle collection +pub struct BundleCollection { + datastore: Arc, + resolver: Resolver, + log: slog::Logger, + opctx: OpContext, + data_selection: BundleDataSelection, + bundle: BundleInfo, + cancellation_token: CancellationToken, +} + +impl BundleCollection { + pub fn new( + datastore: Arc, + resolver: Resolver, + log: slog::Logger, + opctx: OpContext, + data_selection: BundleDataSelection, + bundle: BundleInfo, + ) -> Self { + Self { + datastore, + resolver, + log, + opctx, + data_selection, + bundle, + cancellation_token: CancellationToken::new(), + } + } + + pub fn datastore(&self) -> &Arc { + &self.datastore + } + + pub fn resolver(&self) -> &Resolver { + &self.resolver + } + + pub fn log(&self) -> &slog::Logger { + &self.log + } + + pub fn opctx(&self) -> &OpContext { + &self.opctx + } + + pub fn data_selection(&self) -> &BundleDataSelection { + &self.data_selection + } + + pub fn bundle(&self) -> &BundleInfo { + &self.bundle + } + + /// Returns true if this bundle collection has been cancelled. + /// + /// Use for cooperative cancellation checks before cancel-unsafe + /// operations (filesystem writes, `spawn_blocking`). + pub fn is_cancelled(&self) -> bool { + self.cancellation_token.is_cancelled() + } + + /// Returns a reference to the cancellation token. + /// + /// Cancellation is driven by the caller; this type only observes + /// the token. + pub fn cancellation_token(&self) -> &CancellationToken { + &self.cancellation_token + } + + /// Returns a future that completes when cancellation is requested. + /// + /// Use in `tokio::select!` with cancel-safe operations (HTTP requests, + /// DB queries) for immediate cancellation at await points. + pub async fn cancelled(&self) { + self.cancellation_token.cancelled().await + } + + /// Collect the bundle into the supplied temporary directory. + /// + /// The caller drives cancellation via `cancellation_token()`; this + /// function drains in-flight work before returning. The caller is + /// responsible for whatever happens to the directory afterward + /// (zipping, transferring to durable storage, writing to stdout, + /// etc.). + pub async fn collect_bundle_locally( + self: &Arc, + dir: &Utf8TempDir, + ) -> anyhow::Result { + let report = self.collect_bundle_as_file(dir).await; + + if self.is_cancelled() { + warn!( + &self.log, + "Support Bundle cancelled - stopping collection"; + "bundle" => %self.bundle.id, + ); + return Err(anyhow::anyhow!("Support Bundle Cancelled")); + } + + info!( + &self.log, + "Bundle Collection completed"; + "bundle" => %self.bundle.id + ); + report + } + + // Cancellation is hybrid: + // + // - Cancel-safe operations (HTTP requests, DB queries) use + // `tokio::select!` with the token for immediate cancellation. + // Dropping these futures is safe — no local side effects. + // + // - Cancel-unsafe operations (filesystem writes via `tokio::fs`, + // `spawn_blocking`) use cooperative `is_cancelled()` checks. + // These are never dropped mid-flight, ensuring all + // `spawn_blocking` work completes before the output directory + // is dropped. + // + // This loop checks the token before spawning new steps and drains + // all in-flight tasks before returning. Failing to do so previously + // raced `spawn_blocking` writes against `TempDir` cleanup; see + // https://github.com/oxidecomputer/omicron/issues/10198. + async fn run_collect_bundle_steps( + self: &Arc, + output: &Utf8TempDir, + mut steps: Vec, + ) -> SupportBundleCollectionReport { + let mut report = SupportBundleCollectionReport::new(self.bundle.id); + + const MAX_CONCURRENT_STEPS: usize = 16; + let mut tasks = + ParallelTaskSet::new_with_parallelism(MAX_CONCURRENT_STEPS); + + loop { + // Check for cancellation before spawning new work. + if self.is_cancelled() { + info!( + &self.log, + "Cancellation detected — draining in-flight steps"; + "bundle" => %self.bundle.id, + ); + break; + } + + // Process all the currently-planned steps + while let Some(step) = steps.pop() { + let previous_result = tasks + .spawn({ + let collection = self.clone(); + let dir = output.path().to_path_buf(); + let log = self.log.clone(); + async move { + debug!(log, "Running step"; "step" => &step.name); + step.run(&collection, dir.as_path(), &log).await + } + }) + .await; + + if let Some(output) = previous_result { + output.process(&mut report, &mut steps); + }; + } + + // If we've run out of tasks to spawn, join any of the previously + // spawned tasks, if any exist. + if let Some(output) = tasks.join_next().await { + output.process(&mut report, &mut steps); + + // As soon as any task completes, see if we can spawn more work + // immediately. This ensures that the ParallelTaskSet is + // saturated as much as it can be. + continue; + } + + // Executing steps may create additional steps, as follow-up work. + // + // Only finish if we've exhausted all possible steps and joined + // all spawned work. + if steps.is_empty() { + // Write trace file before returning + if let Err(err) = self.write_trace_file(output, &report).await { + warn!( + self.log, + "Failed to write trace file"; + "error" => ?err + ); + } + return report; + } + } + + // Drain all in-flight tasks. This ensures any tokio::fs operations + // (which internally use spawn_blocking) complete before we return, + // so the TempDir can be safely dropped by our caller. + while let Some(output) = tasks.join_next().await { + output.process(&mut report, &mut steps); + // Ignore newly-spawned steps from drained tasks — we're + // stopping. + } + + report + } + + // Write a Perfetto Event format JSON file for visualization + async fn write_trace_file( + &self, + output: &Utf8TempDir, + report: &SupportBundleCollectionReport, + ) -> anyhow::Result<()> { + let meta_dir = output.path().join("meta"); + tokio::fs::create_dir_all(&meta_dir).await.with_context(|| { + format!("Failed to create meta directory {meta_dir}") + })?; + + let trace_path = meta_dir.join("trace.json"); + + // Convert steps to Perfetto Trace Event format. + // Sort steps by start time and assign each a unique sequential ID. + // + // This is necessary because the trace event format does not like + // multiple slices to overlap - so we make each slice distinct. + // + // Ideally we'd be able to correlate these with actual tokio tasks, + // but it's hard to convert tokio::task::Id to a u64 because + // of https://github.com/tokio-rs/tokio/issues/7430 + let mut sorted_steps: Vec<_> = report.steps.iter().collect(); + sorted_steps.sort_by_key(|s| s.start); + + // Generate trace events - each step gets a unique ID (1, 2, 3, ...) + // based on its start time order + let trace_events: Vec<_> = sorted_steps + .iter() + .enumerate() + .map(|(i, step)| { + let start_us = step.start.timestamp_micros(); + let duration_us = (step.end - step.start) + .num_microseconds() + .unwrap_or(0) + .max(0); + let step_id = i + 1; + + perfetto::TraceEvent { + name: step.name.clone(), + cat: "bundle_collection".to_string(), + ph: "X".to_string(), + ts: start_us, + dur: duration_us, + pid: 1, + tid: step_id, + args: json!({ + "status": step.status.to_string(), + }), + } + }) + .collect(); + + let trace = perfetto::Trace { + trace_events, + display_time_unit: "ms".to_string(), + }; + + let trace_content = serde_json::to_string_pretty(&trace) + .context("Failed to serialize trace JSON")?; + + tokio::fs::write(&trace_path, trace_content).await.with_context( + || format!("Failed to write trace file to {trace_path}"), + )?; + + info!( + self.log, + "Wrote trace file"; + "path" => %trace_path, + "num_events" => trace.trace_events.len() + ); + + Ok(()) + } + + // Perform the work of collecting the support bundle into a temporary + // directory. + // + // "dir" is an output directory where data can be stored. + // + // If a partial bundle can be collected, it should be returned as + // an Ok(SupportBundleCollectionReport). Any failures from this function + // will prevent the support bundle from being collected altogether. + // + // Cancellation is hybrid: cancel-safe operations (HTTP, DB) use + // `select!` with the token for immediate cancellation; cancel-unsafe + // operations (filesystem writes) use cooperative `is_cancelled()`. + // In-flight work is drained before returning. + async fn collect_bundle_as_file( + self: &Arc, + dir: &Utf8TempDir, + ) -> anyhow::Result { + let log = &self.log; + + info!(&log, "Collecting bundle as local file"); + + let cache = Cache::new(); + let steps = steps::all(&cache); + Ok(self.run_collect_bundle_steps(dir, steps).await) + } +} diff --git a/support-bundle-collection/src/lib.rs b/support-bundle-collection/src/lib.rs new file mode 100644 index 00000000000..6a604b5fb21 --- /dev/null +++ b/support-bundle-collection/src/lib.rs @@ -0,0 +1,26 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! The mechanism layer of support bundle collection. +//! +//! This crate provides the data-gathering primitives used to assemble a +//! support bundle. It is consumed by both the Nexus background task +//! (which manages bundle state in the `support_bundle` table) and by +//! `omdb` (which collects bundles ad-hoc, including when Nexus is down). +//! +//! See `README.md` in this crate for a developer-oriented overview of +//! the step framework. + +#[macro_use] +extern crate slog; + +mod cache; +pub mod collection; +pub mod perfetto; +mod step; +mod steps; +pub mod zip; + +pub use collection::BundleCollection; +pub use collection::BundleInfo; diff --git a/nexus/src/app/background/tasks/support_bundle/perfetto.rs b/support-bundle-collection/src/perfetto.rs similarity index 100% rename from nexus/src/app/background/tasks/support_bundle/perfetto.rs rename to support-bundle-collection/src/perfetto.rs diff --git a/nexus/src/app/background/tasks/support_bundle/step.rs b/support-bundle-collection/src/step.rs similarity index 98% rename from nexus/src/app/background/tasks/support_bundle/step.rs rename to support-bundle-collection/src/step.rs index 5909265b976..df222127403 100644 --- a/nexus/src/app/background/tasks/support_bundle/step.rs +++ b/support-bundle-collection/src/step.rs @@ -4,7 +4,7 @@ //! Support bundle collection step execution framework -use crate::app::background::tasks::support_bundle::collection::BundleCollection; +use crate::collection::BundleCollection; use camino::Utf8Path; use chrono::DateTime; diff --git a/nexus/src/app/background/tasks/support_bundle/steps/ereports.rs b/support-bundle-collection/src/steps/ereports.rs similarity index 94% rename from nexus/src/app/background/tasks/support_bundle/steps/ereports.rs rename to support-bundle-collection/src/steps/ereports.rs index ff4a58cbbad..e306f1f351f 100644 --- a/nexus/src/app/background/tasks/support_bundle/steps/ereports.rs +++ b/support-bundle-collection/src/steps/ereports.rs @@ -4,13 +4,14 @@ //! Collect ereports for support bundles -use crate::app::background::tasks::support_bundle::collection::BundleCollection; -use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; +use crate::collection::BundleCollection; +use crate::step::CollectionStepOutput; use nexus_types::fm::ereport::EreportFilters; use anyhow::Context; use camino::Utf8Path; use camino::Utf8PathBuf; +use dropshot::PaginationOrder; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_db_queries::db::datastore; @@ -34,6 +35,7 @@ pub async fn collect( debug!(log, "Support bundle: ereports not requested"); return Ok(CollectionStepOutput::Skipped); }; + let time_range = collection.data_selection().time_range().cloned(); let ereports_dir = dir.join("ereports"); let mut status = SupportBundleEreportStatus::default(); if let Err(err) = save_ereports( @@ -42,6 +44,7 @@ pub async fn collect( opctx, datastore, ereport_filters.clone(), + time_range, ereports_dir, &mut status, ) @@ -66,25 +69,25 @@ pub async fn collect( // // Cancel-**unsafe**: writes to the filesystem via `tokio::fs`. // DB queries are eagerly cancelled via `select!`. +#[allow(clippy::too_many_arguments)] async fn save_ereports( collection: &BundleCollection, log: &Logger, opctx: &OpContext, datastore: &Arc, filters: EreportFilters, + time_range: Option, dir: Utf8PathBuf, status: &mut SupportBundleEreportStatus, ) -> anyhow::Result<()> { - let mut paginator = Paginator::new( - datastore::SQL_BATCH_SIZE, - dropshot::PaginationOrder::Ascending, - ); + let mut paginator = + Paginator::new(datastore::SQL_BATCH_SIZE, PaginationOrder::Ascending); while let Some(p) = paginator.next() { let pagparams = p.current_pagparams(); let ereports = tokio::select! { _ = collection.cancelled() => return Ok(()), result = datastore.ereport_fetch_matching( - &opctx, &filters, &pagparams + &opctx, &filters, time_range.as_ref(), &pagparams ) => { result.map_err(|e| { e.internal_context("failed to query for ereports") diff --git a/nexus/src/app/background/tasks/support_bundle/steps/host_info.rs b/support-bundle-collection/src/steps/host_info.rs similarity index 91% rename from nexus/src/app/background/tasks/support_bundle/steps/host_info.rs rename to support-bundle-collection/src/steps/host_info.rs index 4aa0dc7fdb4..cf60749209c 100644 --- a/nexus/src/app/background/tasks/support_bundle/steps/host_info.rs +++ b/support-bundle-collection/src/steps/host_info.rs @@ -4,10 +4,10 @@ //! Collect host information from sleds for support bundles -use crate::app::background::tasks::support_bundle::cache::Cache; -use crate::app::background::tasks::support_bundle::collection::BundleCollection; -use crate::app::background::tasks::support_bundle::step::CollectionStep; -use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; +use crate::cache::Cache; +use crate::collection::BundleCollection; +use crate::step::CollectionStep; +use crate::step::CollectionStepOutput; use anyhow::Context; use anyhow::bail; @@ -31,6 +31,8 @@ pub async fn spawn_query_all_sleds( else { return Ok(CollectionStepOutput::Skipped); }; + let sled_selection = sled_selection.clone(); + let time_range = collection.data_selection().time_range().cloned(); let all_sleds = tokio::select! { _ = collection.cancelled() => return Ok(CollectionStepOutput::None), @@ -48,12 +50,16 @@ pub async fn spawn_query_all_sleds( } let sled = sled.clone(); + let time_range = time_range.clone(); extra_steps.push(CollectionStep::new( format!("sled data for sled {}", sled.id()), Box::new({ move |collection, dir| { async move { - collect_data_from_sled(collection, sled, dir).await + collect_data_from_sled( + collection, sled, time_range, dir, + ) + .await } .boxed() } @@ -79,6 +85,7 @@ pub async fn spawn_query_all_sleds( async fn collect_data_from_sled( collection: &BundleCollection, sled: Sled, + time_range: Option, dir: &Utf8Path, ) -> anyhow::Result { let (log, opctx, datastore) = @@ -240,6 +247,7 @@ async fn collect_data_from_sled( &sled_client, zone, &sled_path, + time_range.as_ref(), cancellation_token, ) }) @@ -317,15 +325,23 @@ async fn save_zone_log_zip_or_error( client: &sled_agent_client::Client, zone: &str, path: &Utf8Path, + time_range: Option<&nexus_types::support_bundle::BundleTimeRange>, cancellation_token: &CancellationToken, ) -> anyhow::Result<()> { - // In the future when support bundle collection exposes tuning parameters - // this can turn into a collection parameter. - const DEFAULT_MAX_ROTATED_LOGS: u32 = 5; + // Destructure with field names so the positional Progenitor call + // below can't accidentally swap start_time and end_time (both have + // the same `Option<&DateTime>` type). + let nexus_types::support_bundle::BundleTimeRange { start, end } = + time_range.cloned().unwrap_or_default(); let download_result = tokio::select! { _ = cancellation_token.cancelled() => return Ok(()), - result = client.support_logs_download(zone, DEFAULT_MAX_ROTATED_LOGS) => result, + result = client.support_logs_download( + zone, + /* end_time */ end.as_ref(), + /* max_rotated*/ None, + /* start_time */ start.as_ref(), + ) => result, }; match download_result { diff --git a/nexus/src/app/background/tasks/support_bundle/steps/metadata.rs b/support-bundle-collection/src/steps/metadata.rs similarity index 86% rename from nexus/src/app/background/tasks/support_bundle/steps/metadata.rs rename to support-bundle-collection/src/steps/metadata.rs index 726e3387d43..fcf7946b1f4 100644 --- a/nexus/src/app/background/tasks/support_bundle/steps/metadata.rs +++ b/support-bundle-collection/src/steps/metadata.rs @@ -4,8 +4,8 @@ //! Collects metadata about the bundle itself -use crate::app::background::tasks::support_bundle::collection::BundleCollection; -use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; +use crate::collection::BundleCollection; +use crate::step::CollectionStepOutput; use camino::Utf8Path; /// Writes the bundle ID to a file diff --git a/nexus/src/app/background/tasks/support_bundle/steps/mod.rs b/support-bundle-collection/src/steps/mod.rs similarity index 95% rename from nexus/src/app/background/tasks/support_bundle/steps/mod.rs rename to support-bundle-collection/src/steps/mod.rs index 93952861b2c..57ddda584ed 100644 --- a/nexus/src/app/background/tasks/support_bundle/steps/mod.rs +++ b/support-bundle-collection/src/steps/mod.rs @@ -4,8 +4,8 @@ //! Individual support bundle collection steps -use crate::app::background::tasks::support_bundle::cache::Cache; -use crate::app::background::tasks::support_bundle::step::CollectionStep; +use crate::cache::Cache; +use crate::step::CollectionStep; use futures::FutureExt; use nexus_types::internal_api::background::SupportBundleCollectionStep; diff --git a/nexus/src/app/background/tasks/support_bundle/steps/reconfigurator.rs b/support-bundle-collection/src/steps/reconfigurator.rs similarity index 93% rename from nexus/src/app/background/tasks/support_bundle/steps/reconfigurator.rs rename to support-bundle-collection/src/steps/reconfigurator.rs index d1c8d79c4f8..d8ad62ee6ed 100644 --- a/nexus/src/app/background/tasks/support_bundle/steps/reconfigurator.rs +++ b/support-bundle-collection/src/steps/reconfigurator.rs @@ -4,8 +4,8 @@ //! Collect reconfigurator state for support bundles -use crate::app::background::tasks::support_bundle::collection::BundleCollection; -use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; +use crate::collection::BundleCollection; +use crate::step::CollectionStepOutput; use anyhow::Context; use camino::Utf8Path; diff --git a/nexus/src/app/background/tasks/support_bundle/steps/sled_cubby.rs b/support-bundle-collection/src/steps/sled_cubby.rs similarity index 95% rename from nexus/src/app/background/tasks/support_bundle/steps/sled_cubby.rs rename to support-bundle-collection/src/steps/sled_cubby.rs index 1bd6dc59a36..91ce615e101 100644 --- a/nexus/src/app/background/tasks/support_bundle/steps/sled_cubby.rs +++ b/support-bundle-collection/src/steps/sled_cubby.rs @@ -4,9 +4,9 @@ //! Collect sled cubby information for support bundles -use crate::app::background::tasks::support_bundle::cache::Cache; -use crate::app::background::tasks::support_bundle::collection::BundleCollection; -use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; +use crate::cache::Cache; +use crate::collection::BundleCollection; +use crate::step::CollectionStepOutput; use anyhow::Context; use anyhow::bail; diff --git a/nexus/src/app/background/tasks/support_bundle/steps/sp_dumps.rs b/support-bundle-collection/src/steps/sp_dumps.rs similarity index 92% rename from nexus/src/app/background/tasks/support_bundle/steps/sp_dumps.rs rename to support-bundle-collection/src/steps/sp_dumps.rs index 07027abcc5a..496be1eb065 100644 --- a/nexus/src/app/background/tasks/support_bundle/steps/sp_dumps.rs +++ b/support-bundle-collection/src/steps/sp_dumps.rs @@ -4,11 +4,11 @@ //! Collect SP task dumps for support bundles -use crate::app::background::tasks::support_bundle::cache::Cache; -use crate::app::background::tasks::support_bundle::collection::BundleCollection; -use crate::app::background::tasks::support_bundle::step::CollectionStep; -use crate::app::background::tasks::support_bundle::step::CollectionStepOutput; -use crate::app::background::tasks::support_bundle::steps; +use crate::cache::Cache; +use crate::collection::BundleCollection; +use crate::step::CollectionStep; +use crate::step::CollectionStepOutput; +use crate::steps; use anyhow::Context; use anyhow::bail; diff --git a/support-bundle-collection/src/zip.rs b/support-bundle-collection/src/zip.rs new file mode 100644 index 00000000000..b0065e8b326 --- /dev/null +++ b/support-bundle-collection/src/zip.rs @@ -0,0 +1,113 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Helpers for converting a collected bundle directory into a zipfile. +//! +//! These are used by callers that need to produce a single archive from +//! the directory of collected data — both Nexus (for storing on a sled +//! agent) and omdb (for writing to local storage). + +use ::zip::ZipWriter; +use ::zip::write::FullFileOptions; +use anyhow::Result; +use camino::Utf8DirEntry; +use camino::Utf8Path; +use camino_tempfile::Utf8TempDir; +use camino_tempfile::tempfile_in; + +/// Takes the contents of `dir`, and zips them into a single zipfile +/// stored as a tempfile under `tempdir`. +pub fn bundle_to_zipfile( + dir: &Utf8TempDir, + tempdir: &Utf8Path, +) -> Result { + let tempfile = tempfile_in(tempdir)?; + let mut zip = ZipWriter::new(tempfile); + + recursively_add_directory_to_zipfile(&mut zip, dir.path(), dir.path())?; + + Ok(zip.finish()?) +} + +fn recursively_add_directory_to_zipfile( + zip: &mut ZipWriter, + root_path: &Utf8Path, + dir_path: &Utf8Path, +) -> Result<()> { + // Readdir might return entries in a non-deterministic order. + // Let's sort it for the zipfile, to be nice. + let mut entries = dir_path + .read_dir_utf8()? + .filter_map(Result::ok) + .collect::>(); + entries.sort_by(|a, b| a.file_name().cmp(&b.file_name())); + + for entry in &entries { + // Strip the tempdir prefix when storing the path in the zipfile. + let dst = entry.path().strip_prefix(root_path)?; + + let file_type = entry.file_type()?; + if file_type.is_file() { + let src = entry.path(); + + let zip_time = entry + .path() + .metadata() + .and_then(|m| m.modified()) + .ok() + .and_then(|sys_time| jiff::Zoned::try_from(sys_time).ok()) + .and_then(|zoned| { + ::zip::DateTime::try_from(zoned.datetime()).ok() + }) + .unwrap_or_else(::zip::DateTime::default); + + let opts = FullFileOptions::default() + .last_modified_time(zip_time) + .compression_method(::zip::CompressionMethod::Deflated) + .large_file(true); + + zip.start_file_from_path(dst, opts)?; + let mut file = std::fs::File::open(&src)?; + std::io::copy(&mut file, zip)?; + } + if file_type.is_dir() { + let opts = FullFileOptions::default(); + zip.add_directory_from_path(dst, opts)?; + recursively_add_directory_to_zipfile(zip, root_path, entry.path())?; + } + } + Ok(()) +} + +#[cfg(test)] +mod test { + use super::*; + + use camino_tempfile::tempdir; + + // Ensure that we can convert a temporary directory into a zipfile + #[test] + fn test_zipfile_creation() { + let dir = tempdir().unwrap(); + let tempdir_for_zip = tempdir().unwrap(); + + std::fs::create_dir_all(dir.path().join("dir-a")).unwrap(); + std::fs::create_dir_all(dir.path().join("dir-b")).unwrap(); + std::fs::write(dir.path().join("dir-a").join("file-a"), "some data") + .unwrap(); + std::fs::write(dir.path().join("file-b"), "more data").unwrap(); + + let zipfile = bundle_to_zipfile(&dir, tempdir_for_zip.path()) + .expect("Should have been able to bundle zipfile"); + let archive = ::zip::read::ZipArchive::new(zipfile).unwrap(); + + // We expect the order to be deterministically alphabetical + let mut names = archive.file_names(); + assert_eq!(names.next(), Some("dir-a/")); + assert_eq!(names.next(), Some("dir-a/file-a")); + assert_eq!(names.next(), Some("dir-b/")); + assert_eq!(names.next(), Some("file-b")); + assert_eq!(names.next(), None); + } +}