-
Notifications
You must be signed in to change notification settings - Fork 84
[3/3] Add omdb support-bundle collect subcommand
#10376
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
1d364a4
863c24b
d180d48
641724e
e3e45e4
c1a72bb
7e4e6bb
707d566
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,225 @@ | ||
| // This Source Code Form is subject to the terms of the Mozilla Public | ||
| // License, v. 2.0. If a copy of the MPL was not distributed with this | ||
| // file, You can obtain one at https://mozilla.org/MPL/2.0/. | ||
|
|
||
| //! `omdb support-bundle collect` — collect a support bundle locally, | ||
| //! without going through Nexus. | ||
| //! | ||
| //! Unlike the Nexus background task, this path: | ||
| //! | ||
| //! - Does not register a row in the `support_bundle` table. | ||
| //! - Does not transfer the resulting bundle to a sled-agent for durable | ||
| //! storage. The zip is written to a local file path, or streamed to | ||
| //! stdout when no output path is given. | ||
| //! - Does not require Nexus to be up. It only needs CRDB, internal | ||
| //! DNS, MGS, and the rack's sled-agents reachable on the underlay. | ||
| //! | ||
| //! This is intended for incident response, where the operator may need | ||
| //! to collect a bundle precisely because Nexus is unhealthy. | ||
|
|
||
| use crate::Omdb; | ||
| use crate::db::DbUrlOptions; | ||
| use anyhow::Context; | ||
| use camino::Utf8PathBuf; | ||
| use camino_tempfile::tempdir_in; | ||
| use clap::Args; | ||
| use clap::Subcommand; | ||
| use clap::ValueEnum; | ||
| use nexus_db_queries::context::OpContext; | ||
| use nexus_db_queries::db::DataStore; | ||
| use nexus_types::fm::ereport::EreportFilters; | ||
| use nexus_types::support_bundle::BundleDataCategory; | ||
| use nexus_types::support_bundle::BundleDataSelection; | ||
| use omicron_uuid_kinds::SupportBundleUuid; | ||
| use std::io::Write; | ||
| use std::sync::Arc; | ||
| use support_bundle_collection::BundleCollection; | ||
| use support_bundle_collection::BundleInfo; | ||
| use support_bundle_collection::zip::bundle_to_stream; | ||
| use support_bundle_collection::zip::bundle_to_writer; | ||
|
|
||
| /// Arguments to the "omdb support-bundle" subcommand | ||
| #[derive(Debug, Args)] | ||
| pub struct SupportBundleArgs { | ||
| #[command(subcommand)] | ||
| command: SupportBundleCommands, | ||
| } | ||
|
|
||
| #[derive(Debug, Subcommand)] | ||
| enum SupportBundleCommands { | ||
| /// Collect a support bundle without involving Nexus. | ||
| /// | ||
| /// Connects directly to CockroachDB, internal DNS, MGS, and the | ||
| /// rack's sled-agents — none of which depend on Nexus being up. | ||
| /// The bundle is written to a local zip file. No row is created | ||
| /// in the `support_bundle` table. | ||
| Collect(CollectArgs), | ||
| } | ||
|
|
||
| #[derive(Debug, Args)] | ||
| struct CollectArgs { | ||
| #[command(flatten)] | ||
| db_url_opts: DbUrlOptions, | ||
|
|
||
| /// Optional path where the bundle zip will be written. If omitted, | ||
| /// the zip is streamed to stdout (suitable for piping over ssh). | ||
| #[clap(long, short = 'o')] | ||
| output: Option<Utf8PathBuf>, | ||
|
|
||
| /// Reason recorded inside the bundle's metadata. | ||
| #[clap(long, default_value = "collected via omdb")] | ||
| reason: String, | ||
|
|
||
| /// Directory to use for staging the bundle contents before zipping. | ||
| #[clap(long, default_value = "/var/tmp")] | ||
| tempdir: Utf8PathBuf, | ||
|
|
||
| /// Categories of data to collect. May be supplied multiple times. | ||
| /// Defaults to all categories. | ||
| #[clap(long, value_enum)] | ||
| include: Vec<BundleDataCategory>, | ||
| } | ||
|
|
||
| impl CollectArgs { | ||
| fn data_selection(&self) -> BundleDataSelection { | ||
| let categories: &[BundleDataCategory] = if self.include.is_empty() { | ||
| BundleDataCategory::value_variants() | ||
| } else { | ||
| self.include.as_slice() | ||
| }; | ||
|
|
||
| let mut sel = BundleDataSelection::new(); | ||
| for category in categories { | ||
| sel = match category { | ||
| BundleDataCategory::Reconfigurator => sel.with_reconfigurator(), | ||
| BundleDataCategory::HostInfo => sel.with_all_sleds(), | ||
| BundleDataCategory::SledCubbyInfo => sel.with_sled_cubby_info(), | ||
| BundleDataCategory::SpDumps => sel.with_sp_dumps(), | ||
| BundleDataCategory::Ereports => sel.with_ereports( | ||
| EreportFilters::new() | ||
| .with_start_time( | ||
| omicron_common::now_db_precision() | ||
| - chrono::Days::new(7), | ||
| ) | ||
| .expect("no end time set, cannot fail"), | ||
| ), | ||
| }; | ||
| } | ||
| sel | ||
| } | ||
| } | ||
|
|
||
| impl SupportBundleArgs { | ||
| pub async fn run_cmd( | ||
| &self, | ||
| omdb: &Omdb, | ||
| log: &slog::Logger, | ||
| ) -> anyhow::Result<()> { | ||
| match &self.command { | ||
| SupportBundleCommands::Collect(args) => args.run(omdb, log).await, | ||
| } | ||
| } | ||
| } | ||
|
|
||
| impl CollectArgs { | ||
| async fn run(&self, omdb: &Omdb, log: &slog::Logger) -> anyhow::Result<()> { | ||
| // Collecting a full bundle stages every file in --tempdir before | ||
| // (or while) writing the zip. On the switch zone, where this | ||
| // command typically runs during incident response, disk space is | ||
| // limited and a large bundle can fill it. Gate the command behind | ||
| // -w/--destructive so an operator opts in knowingly. | ||
| let _token = omdb.check_allow_destructive()?; | ||
| self.db_url_opts | ||
| .with_datastore(omdb, log, async |opctx, datastore| { | ||
| self.collect(omdb, log, opctx, datastore).await | ||
| }) | ||
| .await | ||
| } | ||
|
|
||
| async fn collect( | ||
| &self, | ||
| omdb: &Omdb, | ||
| log: &slog::Logger, | ||
| opctx: OpContext, | ||
| datastore: Arc<DataStore>, | ||
| ) -> anyhow::Result<()> { | ||
| let resolver = omdb.dns_resolver(log.clone()).await?; | ||
|
|
||
| let bundle = BundleInfo { | ||
| id: SupportBundleUuid::new_v4(), | ||
| reason_for_creation: self.reason.clone(), | ||
| }; | ||
| let bundle_log = log.new(slog::o!("bundle" => bundle.id.to_string())); | ||
| eprintln!("Collecting support bundle {}", bundle.id); | ||
|
|
||
| let collection = Arc::new(BundleCollection::new( | ||
| datastore, | ||
| resolver, | ||
| bundle_log, | ||
| opctx, | ||
| self.data_selection(), | ||
| bundle, | ||
| )); | ||
|
|
||
| // Wire Ctrl-C to cancel the in-flight collection. | ||
| let cancel_handle = tokio::spawn({ | ||
| let token = collection.cancellation_token().clone(); | ||
| async move { | ||
| let _ = tokio::signal::ctrl_c().await; | ||
| eprintln!("\nCtrl-C received — cancelling bundle collection."); | ||
| token.cancel(); | ||
| } | ||
| }); | ||
|
|
||
| let dir = tempdir_in(&self.tempdir).with_context(|| { | ||
| format!("creating temp dir under {}", self.tempdir) | ||
| })?; | ||
| let collect_result = collection.collect_bundle_locally(&dir).await; | ||
| cancel_handle.abort(); | ||
| let _ = cancel_handle.await; | ||
| let report = collect_result?; | ||
|
|
||
| let output = self.output.clone(); | ||
| tokio::task::spawn_blocking(move || -> anyhow::Result<()> { | ||
| match output { | ||
| Some(path) => { | ||
| let file = std::fs::File::create(&path) | ||
| .with_context(|| format!("creating {path}"))?; | ||
| bundle_to_writer(&dir, &file)?; | ||
| } | ||
| None => { | ||
| let mut stdout = std::io::stdout().lock(); | ||
| bundle_to_stream(&dir, &mut stdout)?; | ||
| stdout.flush()?; | ||
| } | ||
| } | ||
| Ok(()) | ||
| }) | ||
| .await | ||
| .context("zip task panicked")??; | ||
|
|
||
| if let Some(path) = &self.output { | ||
| eprintln!("Wrote bundle to {path}"); | ||
| } else { | ||
| eprintln!("Bundle streamed to stdout"); | ||
| } | ||
| eprintln!("{} steps executed:", report.steps.len()); | ||
| for step in &report.steps { | ||
| let dur = step.end - step.start; | ||
| eprintln!( | ||
| " {:>9}ms {:?} {}", | ||
| dur.num_milliseconds(), | ||
| step.status, | ||
| step.name, | ||
| ); | ||
| } | ||
| if let Some(ereports) = &report.ereports { | ||
| eprintln!( | ||
| "ereports: {} found, {} collected, {} errors", | ||
| ereports.n_found, | ||
| ereports.n_collected, | ||
| ereports.errors.len(), | ||
| ); | ||
| } | ||
|
Comment on lines
+215
to
+222
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not necessarily opposed to this, but I'm curious why ereports merit a special call-out here, versus, say, the number of SP dumps.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it is a little arbitrary - ereports are currently the only support bundle collection step that can silently skip individual items (where "n_found != n_collected", with I could try to store this some other way, if we want, but it's currently describing the format of I could try to save this summary info within the bundle itself, but maybe that should be a follow-up PR, as it changes the contents of bundles themselves?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for explaining. Agree nothing needs to change here, should we file an issue for adding a bundle summary?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I just went ahead and made a new PR: #10500 |
||
| Ok(()) | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.