diff --git a/Cargo.lock b/Cargo.lock index 4fc5e5a215f..2bd09cba675 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2207,7 +2207,7 @@ source = "git+https://github.com/oxidecomputer/crucible?rev=bd9a0e2abe6b6b89aec8 dependencies = [ "crucible-workspace-hack", "libc", - "num-derive 0.4.2", + "num-derive", "num-traits", "thiserror 2.0.18", ] @@ -6164,12 +6164,12 @@ dependencies = [ [[package]] name = "libscf-sys" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12f02d0eda38e8cc453c5ec5d49945545d8d9eb0e59cb2ce4152ba6518f373e7" +checksum = "d0d7bd6cfd9b5d32738cebd83a1b68060d96b1ca10d88bf9a5cb10dfac0f1cdf" dependencies = [ "libc", - "num-derive 0.3.3", + "num-derive", "num-traits", ] @@ -8026,17 +8026,6 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf97ec579c3c42f953ef76dbf8d55ac91fb219dde70e49aa4a6b7d74e9919050" -[[package]] -name = "num-derive" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "876a53fff98e03a936a674b29568b0e605f06b29372c2489ff4de23f1949743d" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "num-derive" version = "0.4.2" @@ -10811,7 +10800,7 @@ dependencies = [ "anyhow", "convert_case 0.4.0", "libm", - "num-derive 0.4.2", + "num-derive", "num-traits", "ron", "serde", @@ -11419,9 +11408,9 @@ dependencies = [ [[package]] name = "proptest" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37566cb3fdacef14c0737f9546df7cfeadbfbc9fef10991038bf5015d0c80532" +checksum = "4b45fcc2344c680f5025fe57779faef368840d0bd1f42f216291f0dc4ace4744" dependencies = [ "bit-set 0.8.0", "bit-vec 0.8.0", @@ -13726,6 +13715,43 @@ dependencies = [ "tufaceous-artifact", ] +[[package]] +name = "sled-agent-scrimlet-reconcilers" +version = "0.1.0" +dependencies = [ + "anyhow", + "assert_matches", + "chrono", + "daft", + "dpd-client 0.1.0 (git+https://github.com/oxidecomputer/dendrite?rev=cc0c307c617f2988aafdca4e3bd35ea178b64801)", + "dropshot", + "either", + "futures", + "gateway-client", + "gateway-messages", + "gateway-test-utils", + "gateway-types", + "httpmock", + "iddqd", + "macaddr", + "mg-admin-client", + "omicron-common", + "omicron-test-utils", + "omicron-uuid-kinds", + "omicron-workspace-hack", + "oxnet", + "proptest", + "rdb-types", + "reqwest 0.13.2", + "serde_json", + "sled-agent-types", + "slog", + "slog-error-chain", + "test-strategy", + "thiserror 2.0.18", + "tokio", +] + [[package]] name = "sled-agent-types" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 7b2a8470a87..8d019d1f216 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -153,6 +153,7 @@ members = [ "sled-agent/measurements", "sled-agent/rack-setup", "sled-agent/repo-depot-api", + "sled-agent/scrimlet-reconcilers", "sled-agent/types", "sled-agent/types/versions", "sled-agent/resolvable-files", @@ -342,6 +343,7 @@ default-members = [ "sled-agent/measurements", "sled-agent/rack-setup", "sled-agent/repo-depot-api", + "sled-agent/scrimlet-reconcilers", "sled-agent/types", "sled-agent/types/versions", "sled-agent/resolvable-files", @@ -738,7 +740,7 @@ propolis_api_types = { git = "https://github.com/oxidecomputer/propolis", rev = propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "58ab73bde89ade637b0ca8118682ee9575da6c2a" } propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "58ab73bde89ade637b0ca8118682ee9575da6c2a" } # NOTE: see above! -proptest = "1.7.0" +proptest = "1.11.0" qorb = "0.4.1" quote = "1.0" # Some dependencies still require rand 0.8.x. @@ -803,6 +805,7 @@ sled-agent-early-networking = { path = "sled-agent/early-networking" } sled-agent-health-monitor = { path = "sled-agent/health-monitor" } sled-agent-measurements = { path = "sled-agent/measurements" } sled-agent-rack-setup = { path = "sled-agent/rack-setup" } +sled-agent-scrimlet-reconcilers = { path = "sled-agent/scrimlet-reconcilers" } sled-agent-types = { path = "sled-agent/types" } sled-agent-types-versions = { path = "sled-agent/types/versions" } sled-agent-resolvable-files = { path = "sled-agent/resolvable-files" } diff --git a/gateway-test-utils/src/setup.rs b/gateway-test-utils/src/setup.rs index ab5a1dc0fd9..e51f8b83d66 100644 --- a/gateway-test-utils/src/setup.rs +++ b/gateway-test-utils/src/setup.rs @@ -38,7 +38,7 @@ pub const DEFAULT_SP_SIM_CONFIG: &str = pub struct GatewayTestContext { pub client: gateway_client::Client, pub server: omicron_gateway::Server, - pub port: u16, + port: u16, pub simrack: SimRack, pub logctx: LogContext, pub gateway_id: Uuid, @@ -47,6 +47,10 @@ pub struct GatewayTestContext { } impl GatewayTestContext { + pub fn address(&self) -> SocketAddrV6 { + SocketAddrV6::new(Ipv6Addr::LOCALHOST, self.port, 0, 0) + } + pub fn mgs_backends(&self) -> watch::Receiver { self.resolver_backends.clone() } diff --git a/nexus/reconfigurator/execution/src/test_utils.rs b/nexus/reconfigurator/execution/src/test_utils.rs index cd46adacd0b..5070e51b691 100644 --- a/nexus/reconfigurator/execution/src/test_utils.rs +++ b/nexus/reconfigurator/execution/src/test_utils.rs @@ -109,7 +109,8 @@ pub fn overridables_for_test( for (id_str, switch_slot) in scrimlets { let sled_id = id_str.parse().unwrap(); let ip = Ipv6Addr::LOCALHOST; - let mgs_port = cptestctx.gateway.get(&switch_slot).unwrap().port; + let mgs_port = + cptestctx.gateway.get(&switch_slot).unwrap().address().port(); let dendrite_port = cptestctx.dendrite.read().unwrap().get(&switch_slot).unwrap().port; let mgd_port = cptestctx.mgd.get(&switch_slot).unwrap().port; diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs index 0899ffcd474..b42a4d747e9 100644 --- a/nexus/src/app/sagas/instance_start.rs +++ b/nexus/src/app/sagas/instance_start.rs @@ -1163,7 +1163,6 @@ async fn sis_ensure_running( #[cfg(test)] mod test { use core::time::Duration; - use std::net::SocketAddrV6; use crate::app::sagas::disk_delete::test::ExpungeTestHarness; use crate::app::sagas::disk_delete::test::create_disk; @@ -1501,8 +1500,7 @@ mod test { // Reuse the port number from the removed Switch0 to start a new dendrite instance let nexus_address = cptestctx.internal_client.bind_address; let mgs = cptestctx.gateway.get(&SwitchSlot::Switch0).unwrap(); - let mgs_address = - SocketAddrV6::new(Ipv6Addr::LOCALHOST, mgs.port, 0, 0).into(); + let mgs_address = mgs.address().into(); // Test fault recovery for nat propogation // Start a new dendrite instance for switch0 diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 4660c88b419..df3dfe98ea0 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -2811,13 +2811,13 @@ mod test { cptestctx: &ControlPlaneTestContext, switch0_port: u16, ) { - use std::net::Ipv6Addr; - use std::net::SocketAddrV6; - let nexus_address = cptestctx.internal_client.bind_address; - let mgs = cptestctx.gateway.get(&SwitchSlot::Switch0).unwrap(); - let mgs_address = - SocketAddrV6::new(Ipv6Addr::LOCALHOST, mgs.port, 0, 0).into(); + let mgs_address = cptestctx + .gateway + .get(&SwitchSlot::Switch0) + .unwrap() + .address() + .into(); let new_switch0 = omicron_test_utils::dev::dendrite::DendriteInstance::start( diff --git a/nexus/test-utils/src/nexus_test.rs b/nexus/test-utils/src/nexus_test.rs index 693aea88732..733ec41a5af 100644 --- a/nexus/test-utils/src/nexus_test.rs +++ b/nexus/test-utils/src/nexus_test.rs @@ -259,13 +259,7 @@ impl ControlPlaneTestContext { }; let mgs = self.gateway.get(&switch_slot).unwrap(); - let mgs_addr = std::net::SocketAddrV6::new( - std::net::Ipv6Addr::LOCALHOST, - mgs.port, - 0, - 0, - ) - .into(); + let mgs_addr = mgs.address().into(); let dendrite = omicron_test_utils::dev::dendrite::DendriteInstance::start( diff --git a/nexus/test-utils/src/starter.rs b/nexus/test-utils/src/starter.rs index 3822be75d1f..5f54d487e0f 100644 --- a/nexus/test-utils/src/starter.rs +++ b/nexus/test-utils/src/starter.rs @@ -422,8 +422,7 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { let log = &self.logctx.log; debug!(log, "Starting Dendrite"; "switch_slot" => ?switch_slot); let mgs = self.gateway.get(&switch_slot).unwrap(); - let mgs_addr = - SocketAddrV6::new(Ipv6Addr::LOCALHOST, mgs.port, 0, 0).into(); + let mgs_addr = mgs.address().into(); // Set up a stub instance of dendrite let dendrite = dev::dendrite::DendriteInstance::start( @@ -448,9 +447,7 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { pub async fn start_mgd(&mut self, switch_slot: SwitchSlot) { let log = &self.logctx.log; debug!(log, "Starting mgd"; "switch_slot" => ?switch_slot); - let mgs = self.gateway.get(&switch_slot).unwrap(); - let mgs_addr = - SocketAddrV6::new(Ipv6Addr::LOCALHOST, mgs.port, 0, 0).into(); + let mgs_addr = self.gateway.get(&switch_slot).unwrap().address().into(); // Set up an instance of mgd let mgd = @@ -484,7 +481,7 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { sled_id, Ipv6Addr::LOCALHOST, self.dendrite.read().unwrap().get(&switch_slot).unwrap().port, - self.gateway.get(&switch_slot).unwrap().port, + self.gateway.get(&switch_slot).unwrap().address().port(), self.mgd.get(&switch_slot).unwrap().port, ) .unwrap() diff --git a/sled-agent/scrimlet-reconcilers/Cargo.toml b/sled-agent/scrimlet-reconcilers/Cargo.toml new file mode 100644 index 00000000000..8e0c9ef4e06 --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/Cargo.toml @@ -0,0 +1,48 @@ +[package] +name = "sled-agent-scrimlet-reconcilers" +version = "0.1.0" +edition.workspace = true +license = "MPL-2.0" + +[lints] +workspace = true + +[dependencies] +chrono.workspace = true +daft.workspace = true +dpd-client.workspace = true +either.workspace = true +futures.workspace = true +gateway-client.workspace = true +gateway-types.workspace = true +iddqd.workspace = true +macaddr.workspace = true +mg-admin-client.workspace = true +omicron-common.workspace = true +omicron-uuid-kinds.workspace = true +oxnet.workspace = true +rdb-types.workspace = true +reqwest.workspace = true +sled-agent-types.workspace = true +slog.workspace = true +slog-error-chain.workspace = true +thiserror.workspace = true +tokio.workspace = true + +omicron-workspace-hack.workspace = true + +[dev-dependencies] +anyhow.workspace = true +assert_matches.workspace = true +dropshot.workspace = true +gateway-messages.workspace = true +gateway-test-utils.workspace = true +httpmock.workspace = true +omicron-test-utils.workspace = true +proptest.workspace = true +serde_json.workspace = true +sled-agent-types = { workspace = true, features = ["testing"] } +test-strategy.workspace = true + +[features] +testing = [] diff --git a/sled-agent/scrimlet-reconcilers/src/dpd_reconciler.rs b/sled-agent/scrimlet-reconcilers/src/dpd_reconciler.rs new file mode 100644 index 00000000000..445c9144604 --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/dpd_reconciler.rs @@ -0,0 +1,99 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Reconciler responsible for configuration of `dpd` within a scrimlet's switch +//! zone. + +use crate::handle::ScrimletReconcilersMode; +use crate::reconciler_task::Reconciler; +use crate::switch_zone_slot::ThisSledSwitchSlot; +use dpd_client::Client; +use sled_agent_types::system_networking::SystemNetworkingConfig; +use slog::Logger; +use std::time::Duration; + +mod nat; +mod port_reconciler; + +pub use nat::DpdNatReconcilerStatus; +pub use nat::DpdNatReconcilerStatusNatEntry; +pub use nat::DpdNatReconcilerStatusNatEntryFailure; +pub use port_reconciler::DpdPortOperationFailure; +pub use port_reconciler::DpdPortReconcilerStatus; +use port_reconciler::PortReconciler; + +#[derive(Debug, Clone)] +pub struct DpdReconcilerStatus { + /// Result of reconciling port settings + pub port_settings_status: DpdPortReconcilerStatus, + /// Result of reconciling service zone NAT entries + pub nat_status: DpdNatReconcilerStatus, +} + +impl slog::KV for DpdReconcilerStatus { + fn serialize( + &self, + record: &slog::Record<'_>, + serializer: &mut dyn slog::Serializer, + ) -> slog::Result { + let Self { port_settings_status, nat_status } = self; + port_settings_status.serialize(record, serializer)?; + nat_status.serialize(record, serializer)?; + Ok(()) + } +} + +#[derive(Debug)] +pub(crate) struct DpdReconciler { + client: Client, + switch_slot: ThisSledSwitchSlot, + port_reconciler: PortReconciler, +} + +impl Reconciler for DpdReconciler { + type Status = DpdReconcilerStatus; + + const LOGGER_COMPONENT_NAME: &'static str = "DpdReconciler"; + const RE_RECONCILE_INTERVAL: Duration = Duration::from_secs(30); + + fn new( + mode: ScrimletReconcilersMode, + switch_slot: ThisSledSwitchSlot, + parent_log: &Logger, + ) -> Self { + Self { + client: mode.dpd_client(parent_log), + switch_slot, + port_reconciler: PortReconciler::default(), + } + } + + async fn do_reconciliation( + &mut self, + system_networking_config: &SystemNetworkingConfig, + log: &Logger, + ) -> Self::Status { + let port_settings_status = self + .port_reconciler + .reconcile( + &self.client, + &system_networking_config.rack_network_config, + self.switch_slot, + log, + ) + .await; + + let nat_status = if let Some(nat_entries) = system_networking_config + .blueprint_external_networking_config + .as_ref() + .map(|config| &config.service_zone_nat_entries) + { + nat::reconcile(&self.client, nat_entries, log).await + } else { + DpdNatReconcilerStatus::NoNatEntriesConfig + }; + + DpdReconcilerStatus { port_settings_status, nat_status } + } +} diff --git a/sled-agent/scrimlet-reconcilers/src/dpd_reconciler/nat.rs b/sled-agent/scrimlet-reconcilers/src/dpd_reconciler/nat.rs new file mode 100644 index 00000000000..aa211a14ba4 --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/dpd_reconciler/nat.rs @@ -0,0 +1,647 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Reconciliation of Omicron service NAT entries. +//! +//! Does not modify non-service NAT entries. + +use daft::Diffable; +use dpd_client::Client; +use futures::Stream; +use futures::TryStreamExt; +use macaddr::MacAddr6; +use omicron_common::api::external::MacAddr; +use omicron_common::api::external::Vni; +use omicron_uuid_kinds::OmicronZoneUuid; +use sled_agent_types::system_networking::ServiceZoneNatEntries; +use sled_agent_types::system_networking::ServiceZoneNatEntry; +use slog::Logger; +use slog::info; +use slog::warn; +use slog_error_chain::InlineErrorChain; +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use std::net::IpAddr; +use std::net::Ipv6Addr; +use std::num::NonZeroU32; + +type DpdClientError = dpd_client::Error; + +const SINGLE_REQUEST_LIMIT: Option = + Some(NonZeroU32::new(128).unwrap()); + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +pub struct DpdNatReconcilerStatusNatEntry { + pub external_ip: IpAddr, + pub first_port: u16, + pub last_port: u16, +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +pub struct DpdNatReconcilerStatusNatEntryFailure { + pub entry: DpdNatReconcilerStatusNatEntry, + pub error: String, +} + +/// Status of reconciling service zone NAT entries with `dpd`. +#[derive(Debug, Clone)] +pub enum DpdNatReconcilerStatus { + /// Reconciliation was skipped because the bootstore contains no NAT entry + /// config information. + NoNatEntriesConfig, + + /// Reconciliation failed while attempting to read the current set of + /// entries from `dpd`. + FailedReadingCurrentDpdNatEntries(String), + + /// Reconciliation failed because the bootstore config contained an illegal + /// combination of entries (e.g., two zones with identical NAT entries). + InvalidSystemNetworkingConfig(String), + + /// Reconciliation completed successfully. + Success { + /// Set of zone IDs whose NAT entries were already correct in `dpd` and + /// left unchanged. + unchanged: BTreeSet, + + /// List of NAT entries removed. + removed: Vec, + + /// Map of zone NAT entries created. + created: BTreeMap, + }, + + /// Reconciliation completed but had at least one failure. + PartialSuccess { + /// Set of zone IDs whose NAT entries were already correct in `dpd` and + /// left unchanged. + unchanged: BTreeSet, + + /// List of NAT entries successfully removed. + removed: Vec, + + /// List of NAT entries we tried but failed to remove. + remove_failures: Vec, + + /// Map of zone NAT entries successfully created. + created: BTreeMap, + + /// Map of zone NAT entries we tried but failed to create. + create_failures: + BTreeMap, + }, +} + +impl slog::KV for DpdNatReconcilerStatus { + fn serialize( + &self, + _record: &slog::Record<'_>, + serializer: &mut dyn slog::Serializer, + ) -> slog::Result { + match self { + DpdNatReconcilerStatus::NoNatEntriesConfig => serializer.emit_str( + "nat-reconciler-skipped".into(), + "no NAT entries present in config", + ), + DpdNatReconcilerStatus::FailedReadingCurrentDpdNatEntries( + reason, + ) => serializer.emit_str("nat-reconciler-failed".into(), reason), + DpdNatReconcilerStatus::InvalidSystemNetworkingConfig(reason) => { + serializer.emit_arguments( + "nat-reconciler-failed".into(), + &format_args!("invalid system networking config: {reason}"), + ) + } + DpdNatReconcilerStatus::Success { unchanged, removed, created } => { + // Only show a summary count; we have individual log statements + // for each create/remove. + for (key, val) in [ + ("nat-entries-unchanged", unchanged.len()), + ("nat-entries-successfully-removed", removed.len()), + ("nat-entries-failed-to-remove", 0), + ("nat-entries-successfully-created", created.len()), + ("nat-entries-failed-to-create", 0), + ] { + serializer.emit_usize(key.into(), val)?; + } + Ok(()) + } + DpdNatReconcilerStatus::PartialSuccess { + unchanged, + removed, + remove_failures, + created, + create_failures, + } => { + // Only show a summary count; we have individual log statements + // for each create/remove. + for (key, val) in [ + ("nat-entries-unchanged", unchanged.len()), + ("nat-entries-successfully-removed", removed.len()), + ("nat-entries-failed-to-remove", remove_failures.len()), + ("nat-entries-successfully-created", created.len()), + ("nat-entries-failed-to-create", create_failures.len()), + ] { + serializer.emit_usize(key.into(), val)?; + } + Ok(()) + } + } + } +} + +/// Perform reconciliation. +/// +/// On successful completion, the `dpd` reachable via `Client` will contain all +/// the service NAT entries described by `desired_nat_entries`, and no other NAT +/// entries in the service zone VNI(s). +pub(super) async fn reconcile( + client: &Client, + desired_nat_entries: &ServiceZoneNatEntries, + log: &Logger, +) -> DpdNatReconcilerStatus { + let dpd_current_entries = match CurrentDpdEntriesAssembler::assemble( + client, + desired_nat_entries, + log, + ) + .await + { + Ok(entries) => entries, + Err(err) => { + return DpdNatReconcilerStatus::FailedReadingCurrentDpdNatEntries( + format!( + "failed to read current NAT entries from dpd: {}", + InlineErrorChain::new(&err), + ), + ); + } + }; + + let plan = match ReconciliationPlan::new( + &dpd_current_entries, + desired_nat_entries, + log, + ) { + Ok(plan) => plan, + Err(err) => { + return DpdNatReconcilerStatus::InvalidSystemNetworkingConfig(err); + } + }; + + apply_plan(client, plan, log).await +} + +/// Apply the contents of `plan` to dpd via `client`. +/// +/// This requires `plan.to_remove.len() + plan.to_create.len()` independent +/// calls to `dpd`. We do not short circuit on failure: we'll always attempt to +/// make every call required. This may not be the right choice, but some +/// arguments in favor: +/// +/// * We'd like to eventually replace this with fewer calls, if we add different +/// APIs to `dpd` for applying NAT settings in bulk. +/// +/// * In practice we expect the number of calls here to be small. On startup we +/// expect ~10 `to_create` calls (one for each service, which is typically 2 +/// boundary NTP, 3 Nexus, and 1-5 external DNS), and for every reconciliation +/// attempt after that we expect 0-1 (either no changes, or a single new +/// service has been added or removed; it's possible we'll see multiple, but +/// unlikely given we re-reconcile on every networking config change). +/// * We always want to report the status of every step described by `plan`, and +/// implementing stop-on-first-failure means we'd need to record a "didn't +/// attempt because of an earlier failure" status for some steps. That's +/// doable but annoying. +async fn apply_plan( + client: &Client, + plan: ReconciliationPlan, + log: &Logger, +) -> DpdNatReconcilerStatus { + let ReconciliationPlan { unchanged, to_remove, to_create } = plan; + + // Always remove first. DPD keys NAT entries by IP address and lower port; + // if we've changed which zone is associated with a given IP/port pair, we + // need to ensure we remove the old entry before attempting to create a new + // one. + // + // If we fail to remove an entry that shares an IP/port with an entry in + // `to_create`, the create will also fail. We could optimize this by + // skipping creates if they had a corresponding remove failure, but that's + // quite a lot of bookkeeping for a rare case that isn't very problematic + // anyway (remove failure means the create will fail, but if the remove + // failed the create very likely could have failed anyway!). + let mut removed = Vec::new(); + let mut remove_failures = Vec::new(); + for entry in to_remove { + let result = match entry.target_ip { + IpAddr::V4(ip) => { + client.nat_ipv4_delete(&ip, entry.first_port).await + } + IpAddr::V6(ip) => { + client.nat_ipv6_delete(&ip, entry.first_port).await + } + }; + + match result { + Ok(_) => { + info!(log, "successfully removed NAT entry"; "entry" => ?entry); + removed.push(entry.into()); + } + Err(err) => { + let err = InlineErrorChain::new(&err); + warn!( + log, "failed to remove NAT entry"; + "entry" => ?entry, + &err, + ); + remove_failures.push(DpdNatReconcilerStatusNatEntryFailure { + entry: entry.into(), + error: err.to_string(), + }); + } + } + } + + let mut created = BTreeMap::new(); + let mut create_failures = BTreeMap::new(); + for (zone_id, entry) in to_create { + match create_nat_entry(client, &entry).await { + Ok(()) => { + info!( + log, "successfully created NAT entry"; + "zone-id" => %zone_id, + "entry" => ?entry, + ); + created.insert(zone_id, entry.into()); + } + Err(err) => { + let err = InlineErrorChain::new(&err); + warn!( + log, "failed to create NAT entry"; + "zone-id" => %zone_id, + "entry" => ?entry, + &err, + ); + create_failures.insert( + zone_id, + DpdNatReconcilerStatusNatEntryFailure { + entry: entry.into(), + error: err.to_string(), + }, + ); + } + } + } + + if remove_failures.is_empty() && create_failures.is_empty() { + DpdNatReconcilerStatus::Success { unchanged, removed, created } + } else { + DpdNatReconcilerStatus::PartialSuccess { + unchanged, + removed, + remove_failures, + created, + create_failures, + } + } +} + +#[derive(Debug, PartialEq, Eq)] +struct ReconciliationPlan { + // Set of zones whose NAT entries already exist in DPD. + unchanged: BTreeSet, + + // Set of NAT entries that exist in DPD but not our desired set; each of + // these should be removed. + to_remove: BTreeSet, + + // Set of NAT entries that don't exist in DPD but are in our desired set; + // each of these should be created. + to_create: BTreeMap, +} + +impl ReconciliationPlan { + /// Construct a new plan by diffing the current entries against the desired + /// entries. + /// + /// # Errors + /// + /// Fails if `service_nat_entries` contains invalid data (this should be + /// impossible). + fn new( + dpd_current_entries: &BTreeSet, + service_nat_entries: &ServiceZoneNatEntries, + log: &Logger, + ) -> Result { + // Convert `service_nat_entries` into both a set of `NatEntry`s (so we + // can diff it against `dpd_current_entries` via `daft`) and a map of + // `NatEntry` back to the zone ID that needs it (for our status + // reporting). + let mut desired_nat_entries = BTreeSet::new(); + let mut nat_to_zone_id = BTreeMap::new(); + for entry in service_nat_entries.iter() { + let zone_id = entry.zone_id; + let entry = NatEntry::from(entry); + + // We should have no duplicates; if we do, we have two different + // zones that want the same NAT entry. Refuse to reconcile. This + // should be impossible by construction: `ServiceZoneNatEntries` + // rejects overlapping entries. We double-check here in case that + // changes or is buggy. + if let Some(prev_zone_id) = nat_to_zone_id.insert(entry, zone_id) { + return Err(format!( + "invalid SystemNetworkingConfig: zones {zone_id} and \ + {prev_zone_id} want the same NAT entry: {entry:?}", + )); + } + + // We don't have to check again for duplicates here; we just + // confirmed every `entry` is unique. + desired_nat_entries.insert(entry); + } + + let nat_entry_diff = dpd_current_entries.diff(&desired_nat_entries); + + let unchanged = nat_entry_diff + .common + .into_iter() + .map(|entry| { + nat_to_zone_id + .get(entry) + .copied() + .expect("nat_to_zone_id has a value for every common entry") + }) + .collect::>(); + let to_remove = nat_entry_diff + .removed + .into_iter() + .copied() + .collect::>(); + let to_create = nat_entry_diff + .added + .into_iter() + .map(|entry| { + let zone_id = nat_to_zone_id + .get(entry) + .copied() + .expect("nat_to_zone_id has a value for every added entry"); + (zone_id, *entry) + }) + .collect::>(); + + info!( + log, + "generated NAT reconciliation plan"; + "entries-unchanged" => unchanged.len(), + "entries-to-remove" => to_remove.len(), + "entries-to-create" => to_create.len(), + ); + + Ok(Self { unchanged, to_remove, to_create }) + } +} + +async fn create_nat_entry( + client: &Client, + entry: &NatEntry, +) -> Result<(), DpdClientError> { + let nat_target = dpd_client::types::NatTarget { + inner_mac: dpd_client::types::MacAddr { a: entry.nic_mac.into_array() }, + internal_ip: entry.sled_underlay_ip, + vni: entry.vni.as_u32().into(), + }; + match entry.target_ip { + IpAddr::V4(ip) => client + .nat_ipv4_create( + &ip, + entry.first_port, + entry.last_port, + &nat_target, + ) + .await + .map(|response| response.into_inner()), + IpAddr::V6(ip) => client + .nat_ipv6_create( + &ip, + entry.first_port, + entry.last_port, + &nat_target, + ) + .await + .map(|response| response.into_inner()), + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Diffable)] +#[cfg_attr(test, derive(test_strategy::Arbitrary))] +struct NatEntry { + sled_underlay_ip: Ipv6Addr, + target_ip: IpAddr, + #[cfg_attr(test, strategy(proptest::strategy::Just(0)))] + first_port: u16, + #[cfg_attr(test, strategy(proptest::strategy::Just(65535)))] + last_port: u16, + nic_mac: MacAddr, + vni: Vni, +} + +impl From for DpdNatReconcilerStatusNatEntry { + fn from(value: NatEntry) -> Self { + Self { + external_ip: value.target_ip, + first_port: value.first_port, + last_port: value.last_port, + } + } +} + +impl From<&'_ ServiceZoneNatEntry> for NatEntry { + fn from(value: &'_ ServiceZoneNatEntry) -> Self { + let (first_port, last_port) = value.kind.nat_port_range(); + Self { + sled_underlay_ip: value.sled_underlay_ip, + target_ip: value.kind.external_ip(), + first_port, + last_port, + nic_mac: value.nic_mac, + vni: value.vni, + } + } +} + +#[derive(Debug, Clone, Copy, thiserror::Error)] +#[error("invalid VNI: {0}")] +struct BadVni(u32); + +impl TryFrom for NatEntry { + type Error = BadVni; + + fn try_from( + value: dpd_client::types::Ipv4Nat, + ) -> Result { + let vni = Vni::try_from(value.target.vni.0) + .map_err(|_| BadVni(value.target.vni.0))?; + Ok(Self { + sled_underlay_ip: value.target.internal_ip, + target_ip: value.external.into(), + nic_mac: MacAddr6::from(value.target.inner_mac.a).into(), + first_port: value.low, + last_port: value.high, + vni, + }) + } +} + +impl TryFrom for NatEntry { + type Error = BadVni; + + fn try_from( + value: dpd_client::types::Ipv6Nat, + ) -> Result { + let vni = Vni::try_from(value.target.vni.0) + .map_err(|_| BadVni(value.target.vni.0))?; + Ok(Self { + sled_underlay_ip: value.target.internal_ip, + target_ip: value.external.into(), + nic_mac: MacAddr6::from(value.target.inner_mac.a).into(), + first_port: value.low, + last_port: value.high, + vni, + }) + } +} + +/// Helper to assemble all IPv4 and IPv6 NAT entries in the relevant VNI(s). +/// +/// Currently this requires listing _all_ NAT entries in `dpd` and filtering on +/// our side down to just the VNIs we care about. We'd like a nicer API on the +/// dpd side: . +struct CurrentDpdEntriesAssembler<'a> { + client: &'a Client, + service_vnis: BTreeSet, + current_entries: BTreeSet, +} + +struct RelevantEntryCount { + service_vni: u64, + non_service_vni: u64, +} + +impl<'a> CurrentDpdEntriesAssembler<'a> { + async fn assemble( + client: &'a Client, + desired_nat_entries: &ServiceZoneNatEntries, + log: &Logger, + ) -> Result, DpdClientError> { + // We want to reconcile "all service NAT entries", but + // `desired_nat_entries` only tells us what should exist, not what needs + // to be removed. Build a set of the `Vni`s used in all our services; in + // practice, we expect this to always be a set of length one containing + // the `Vni::SERVICES_VNI` constant, because all services use that Vni. + // But we've written this as we have so that if we decide to split each + // kind of service into a separate Vni, this reconciliation still works. + // + // We assume there are never any service NAT entries in `dpd` that have + // a Vni other than one of the ones present in `desired_nat_entries`. + // `ServiceZoneNatEntries`, by construction, enforces that it's not + // empty, which guarantees that this set will be nonempty too. + let service_vnis: BTreeSet = + desired_nat_entries.iter().map(|entry| entry.vni).collect(); + let mut builder = + Self { client, service_vnis, current_entries: BTreeSet::new() }; + builder.read_ipv4_entries(log).await?; + builder.read_ipv6_entries(log).await?; + Ok(builder.current_entries) + } + + async fn read_ipv4_entries( + &mut self, + log: &Logger, + ) -> Result<(), DpdClientError> { + let mut stream_addresses = + self.client.nat_ipv4_addresses_list_stream(SINGLE_REQUEST_LIMIT); + + let mut counts = + RelevantEntryCount { service_vni: 0, non_service_vni: 0 }; + + while let Some(ip) = stream_addresses.try_next().await? { + self.assemble_entries_from_stream( + self.client.nat_ipv4_list_stream(&ip, SINGLE_REQUEST_LIMIT), + &mut counts, + ) + .await?; + } + + info!( + log, + "finished fetching current ipv4 NAT entries from dpd"; + "service-nat-entries" => counts.service_vni, + "non-service-nat-entries" => counts.non_service_vni, + "service-vnis" => ?self.service_vnis, + ); + + Ok(()) + } + + async fn read_ipv6_entries( + &mut self, + log: &Logger, + ) -> Result<(), DpdClientError> { + let mut stream_addresses = + self.client.nat_ipv6_addresses_list_stream(SINGLE_REQUEST_LIMIT); + + let mut counts = + RelevantEntryCount { service_vni: 0, non_service_vni: 0 }; + + while let Some(ip) = stream_addresses.try_next().await? { + self.assemble_entries_from_stream( + self.client.nat_ipv6_list_stream(&ip, SINGLE_REQUEST_LIMIT), + &mut counts, + ) + .await?; + } + + info!( + log, + "finished fetching current ipv6 NAT entries from dpd"; + "service-nat-entries" => counts.service_vni, + "non-service-nat-entries" => counts.non_service_vni, + "service-vnis" => ?self.service_vnis, + ); + + Ok(()) + } + + async fn assemble_entries_from_stream( + &mut self, + mut stream: S, + counts: &mut RelevantEntryCount, + ) -> Result<(), DpdClientError> + where + S: Stream> + Unpin, + T: TryInto, + { + while let Some(entry) = stream.try_next().await? { + // The only way we can fail to convert a dpd `NatEntry` is if the + // Vni from dpd isn't a valid omicron Vni (the `BadVni` in our + // generic bound). This should never happen, but if it does, we know + // this isn't an entry we care about: we're only looking for entries + // that match our services' Vni(s). + match entry.try_into() { + Ok(entry) if self.service_vnis.contains(&entry.vni) => { + self.current_entries.insert(entry); + counts.service_vni += 1; + } + Ok(_) | Err(BadVni(_)) => { + counts.non_service_vni += 1; + } + } + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests; diff --git a/sled-agent/scrimlet-reconcilers/src/dpd_reconciler/nat/tests.rs b/sled-agent/scrimlet-reconcilers/src/dpd_reconciler/nat/tests.rs new file mode 100644 index 00000000000..262ce2bad28 --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/dpd_reconciler/nat/tests.rs @@ -0,0 +1,411 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::*; +use anyhow::Context; +use gateway_messages::SpPort; +use omicron_common::OMICRON_DPD_TAG; +use omicron_test_utils::dev; +use proptest::prelude::any; +use proptest::prelude::proptest as proptest_macro; +use proptest::strategy::Strategy; +use test_strategy::Arbitrary; +use test_strategy::proptest; +use tokio::task::block_in_place; + +#[proptest] +fn proptest_plan_all_unchanged(service_nat_entries: ServiceZoneNatEntries) { + let logctx = + omicron_test_utils::dev::test_setup_log("proptest_plan_all_unchanged"); + let log = &logctx.log; + + let dpd_current_entries = + service_nat_entries.iter().map(NatEntry::from).collect(); + let plan = ReconciliationPlan::new( + &dpd_current_entries, + &service_nat_entries, + log, + ) + .expect("planning should succeed"); + + let expected = ReconciliationPlan { + unchanged: service_nat_entries.iter().map(|e| e.zone_id).collect(), + to_remove: BTreeSet::new(), + to_create: BTreeMap::new(), + }; + + assert_eq!(plan, expected); + + logctx.cleanup_successful(); +} + +#[proptest] +fn proptest_plan_create_all(service_nat_entries: ServiceZoneNatEntries) { + let logctx = + omicron_test_utils::dev::test_setup_log("proptest_plan_create_all"); + let log = &logctx.log; + + let dpd_current_entries = BTreeSet::new(); + let plan = ReconciliationPlan::new( + &dpd_current_entries, + &service_nat_entries, + log, + ) + .expect("planning should succeed"); + + let expected = ReconciliationPlan { + unchanged: BTreeSet::new(), + to_remove: BTreeSet::new(), + to_create: service_nat_entries + .iter() + .map(|e| (e.zone_id, NatEntry::from(e))) + .collect(), + }; + + assert_eq!(plan, expected); + + logctx.cleanup_successful(); +} + +#[derive(Debug, Clone, Arbitrary)] +struct TestInput { + // Complete set of desired `ServiceZoneNatEntries` we want to exist after + // reconciliation. This is divided into two: the first + // 0..preexisting_service_nat_entries entries should already exist in dpd, + // and the remaining should not (until we perform reconciliation). + desired_service_nat_entries: ServiceZoneNatEntries, + #[strategy(0..=#desired_service_nat_entries.len())] + preexisting_service_nat_entries: usize, + + // Stale service entries. We should remove all of these. + #[strategy(proptest::collection::btree_set( + any::(), + 0..32, + ).prop_filter( + "no overlap with desired_service_nat_entries", + move |entries| { + filter_out_overlapping_arbitrary_service_entries( + &#desired_service_nat_entries, + entries, + ) + } + ))] + stale_service_entries: BTreeSet, + + // Non-service entries. We should never touch these. + #[strategy(proptest::collection::btree_set( + any::().prop_filter( + "non-service VNI", + |entry| entry.vni != Vni::SERVICES_VNI + ), + 0..32, + ).prop_filter( + "no duplicate IPs", + move |entries| filter_out_overlapping_arbitrary_non_service_entries( + &#desired_service_nat_entries, + &#stale_service_entries, + entries, + ), + ))] + preexisting_non_service_entries: BTreeSet, +} + +// Helper to collapse ipv4-mapped-ipv6 (generated by proptests sometimes) +// down to just ipv4. +fn collapse_ipv4_mapped(ip: IpAddr) -> IpAddr { + match ip { + ip @ IpAddr::V4(_) => ip, + ip @ IpAddr::V6(ip6) => match ip6.to_ipv4_mapped() { + Some(ip4) => IpAddr::V4(ip4), + None => ip, + }, + } +} + +// Proptest filter to ensure we don't get conflicts from dpd on external IPs: +// return false if `entries` contains duplicate IPs or any IP also used by +// `desired_service_nat_entries`. +fn filter_out_overlapping_arbitrary_service_entries( + desired_service_nat_entries: &ServiceZoneNatEntries, + entries: &BTreeSet, +) -> bool { + let mut entry_ips = BTreeSet::new(); + for entry in entries { + if desired_service_nat_entries.contains_key(&entry.zone_id) { + return false; + } + if !entry_ips.insert(collapse_ipv4_mapped(entry.kind.external_ip())) { + return false; + } + } + + desired_service_nat_entries.iter().all(|other| { + !entry_ips.contains(&collapse_ipv4_mapped(other.kind.external_ip())) + }) +} + +// Proptest filter to ensure we don't get conflicts from dpd on external IPs: +// return false if `entries` contains duplicate IPs or any IP also used by +// `desired_service_nat_entries` or `stale_service_entries`. +fn filter_out_overlapping_arbitrary_non_service_entries( + desired_service_nat_entries: &ServiceZoneNatEntries, + stale_service_entries: &BTreeSet, + entries: &BTreeSet, +) -> bool { + let prior = desired_service_nat_entries + .iter() + .chain(stale_service_entries.iter()) + .map(|e| collapse_ipv4_mapped(e.kind.external_ip())) + .collect::>(); + let mut entry_ips = BTreeSet::new(); + for e in entries { + if !entry_ips.insert(collapse_ipv4_mapped(e.target_ip)) { + return false; + } + } + prior.intersection(&entry_ips).next().is_none() +} + +impl TestInput { + // All entries we should set up in dpd before reconciling. + fn entries_that_should_exist_at_start_of_test( + &self, + ) -> impl Iterator + '_ { + self.desired_service_nat_entries + .iter() + .take(self.preexisting_service_nat_entries) + .chain(self.stale_service_entries.iter()) + .map(NatEntry::from) + .chain(self.preexisting_non_service_entries.iter().copied()) + } + + // All entries we expect to exist in dpd after reconciling. + fn entries_that_should_exist_after_reconciliation( + &self, + ) -> impl Iterator + '_ { + self.desired_service_nat_entries + .iter() + .map(NatEntry::from) + .chain(self.preexisting_non_service_entries.iter().copied()) + } + + async fn apply_initial_to_dpd( + &self, + client: &dpd_client::Client, + ) -> anyhow::Result<()> { + // Reset from any previous test. (One `dendrite` instance is used across + // multiple proptest invocations). + self.erase_all_nat_entries(client) + .await + .context("failed to erase all NAT entries at start of test")?; + + for entry in self.entries_that_should_exist_at_start_of_test() { + create_nat_entry(client, &entry).await.with_context(|| { + format!("failed to create initial entry {entry:?}") + })?; + } + + Ok(()) + } + + async fn validate_post_reconciliation( + &self, + client: &dpd_client::Client, + ) -> anyhow::Result<()> { + let dpd_entries = self.collect_all_dpd_entries(client).await?; + let expected_entries = self + .entries_that_should_exist_after_reconciliation() + .collect::>(); + assert_eq!(expected_entries, dpd_entries); + Ok(()) + } + + async fn collect_all_dpd_entries( + &self, + client: &dpd_client::Client, + ) -> anyhow::Result> { + let mut all_entries = BTreeSet::new(); + + // Collect IPv4 + let mut stream_addresses = + client.nat_ipv4_addresses_list_stream(SINGLE_REQUEST_LIMIT); + while let Some(ip) = stream_addresses + .try_next() + .await + .context("failed while streaming ipv4 addrs")? + { + let mut entries = + client.nat_ipv4_list_stream(&ip, SINGLE_REQUEST_LIMIT); + while let Some(e) = entries + .try_next() + .await + .context("failed while streaming ipv4 NAT entries")? + { + all_entries.insert( + NatEntry::try_from(e.clone()).with_context(|| { + format!("failed to convert {e:?} to NatEntry") + })?, + ); + } + } + + // Collect IPv6 + let mut stream_addresses = + client.nat_ipv6_addresses_list_stream(SINGLE_REQUEST_LIMIT); + while let Some(ip) = stream_addresses + .try_next() + .await + .context("failed while streaming ipv6 addrs")? + { + let mut entries = + client.nat_ipv6_list_stream(&ip, SINGLE_REQUEST_LIMIT); + while let Some(e) = entries + .try_next() + .await + .context("failed while streaming ipv6 NAT entries")? + { + all_entries.insert( + NatEntry::try_from(e.clone()).with_context(|| { + format!("failed to convert {e:?} to NatEntry") + })?, + ); + } + } + + Ok(all_entries) + } + + async fn erase_all_nat_entries( + &self, + client: &dpd_client::Client, + ) -> anyhow::Result<()> { + for e in self.collect_all_dpd_entries(client).await? { + match e.target_ip { + IpAddr::V4(ip) => { + client + .nat_ipv4_delete(&ip, e.first_port) + .await + .with_context(|| format!("failed to delete {e:?}"))?; + } + IpAddr::V6(ip) => { + client + .nat_ipv6_delete(&ip, e.first_port) + .await + .with_context(|| format!("failed to delete {e:?}"))?; + } + } + } + + Ok(()) + } +} + +#[proptest] +fn proptest_plan_mix(input: TestInput) { + let logctx = omicron_test_utils::dev::test_setup_log("proptest_plan_mix"); + let log = &logctx.log; + + // We expect to remove all stale service entries. + let expected_to_remove = input + .stale_service_entries + .iter() + .map(NatEntry::from) + .collect::>(); + + // The "current entries" includes all stale entries plus a subset of the + // desired entries; start with the stale ones, and we add the subset in the + // loop below. + let mut dpd_current_entries = expected_to_remove.clone(); + + // Now add the number of common entries we expect. + let mut expected_unchanged = BTreeSet::new(); + let mut expected_to_create = BTreeMap::new(); + for (i, entry) in input.desired_service_nat_entries.iter().enumerate() { + if i < input.preexisting_service_nat_entries { + dpd_current_entries.insert(NatEntry::from(entry)); + expected_unchanged.insert(entry.zone_id); + } else { + expected_to_create.insert(entry.zone_id, NatEntry::from(entry)); + } + } + + let plan = ReconciliationPlan::new( + &dpd_current_entries, + &input.desired_service_nat_entries, + log, + ) + .expect("planning should succeed"); + + let expected = ReconciliationPlan { + unchanged: expected_unchanged, + to_remove: expected_to_remove, + to_create: expected_to_create, + }; + + assert_eq!(plan, expected); + + logctx.cleanup_successful(); +} + +#[tokio::test(flavor = "multi_thread")] +async fn proptest_full_reconciliation() { + let logctx = + omicron_test_utils::dev::test_setup_log("proptest_full_reconciliation"); + let mgsctx = gateway_test_utils::setup::test_setup( + "proptest_full_reconciliation", + SpPort::One, + ) + .await; + let mut dpdctx = dev::dendrite::DendriteInstance::start( + 0, + None, + Some(mgsctx.address().into()), + ) + .await + .expect("started dendrite"); + let client = dpd_client::Client::new( + &format!("http://{}", dpdctx.address()), + dpd_client::ClientState { + tag: OMICRON_DPD_TAG.to_owned(), + log: logctx.log.clone(), + }, + ); + let rt = tokio::runtime::Handle::current(); + + let one_test_invocation = async |input: TestInput| { + input + .apply_initial_to_dpd(&client) + .await + .expect("applied initial setup to dpd"); + + let status = + reconcile(&client, &input.desired_service_nat_entries, &logctx.log) + .await; + + match status { + DpdNatReconcilerStatus::NoNatEntriesConfig + | DpdNatReconcilerStatus::FailedReadingCurrentDpdNatEntries(_) + | DpdNatReconcilerStatus::InvalidSystemNetworkingConfig(_) + | DpdNatReconcilerStatus::PartialSuccess { .. } => { + panic!("unexpected reconciler status: {status:?}"); + } + DpdNatReconcilerStatus::Success { .. } => { + input + .validate_post_reconciliation(&client) + .await + .expect("validated post reconciliation entries"); + } + } + }; + + proptest_macro!(|(input: TestInput)| { + // Do a little dance to call our async `one_test_invocation` within the + // non-async `proptest_macro!()` context. + block_in_place(|| rt.block_on(one_test_invocation(input))); + }); + + dpdctx.cleanup().await.expect("dpd cleanup succeeded"); + mgsctx.teardown().await; + logctx.cleanup_successful(); +} diff --git a/sled-agent/scrimlet-reconcilers/src/dpd_reconciler/port_reconciler.rs b/sled-agent/scrimlet-reconcilers/src/dpd_reconciler/port_reconciler.rs new file mode 100644 index 00000000000..2acf1f67164 --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/dpd_reconciler/port_reconciler.rs @@ -0,0 +1,631 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Reconciliation of QSFP port settings. + +use crate::switch_zone_slot::ThisSledSwitchSlot; +use daft::Diffable; +use dpd_client::Client; +use dpd_client::types::LinkCreate as DpdLinkCreate; +use dpd_client::types::LinkId as DpdLinkId; +use dpd_client::types::LinkSettings as DpdLinkSettings; +use dpd_client::types::PortFec as DpdPortFec; +use dpd_client::types::PortId as DpdPortId; +use dpd_client::types::PortSettings as DpdPortSettings; +use dpd_client::types::PortSpeed as DpdPortSpeed; +use dpd_client::types::Qsfp as DpdQsfp; +use dpd_client::types::TxEq as DpdTxEq; +use iddqd::IdOrdItem; +use iddqd::IdOrdMap; +use iddqd::id_ord_map; +use omicron_common::OMICRON_DPD_TAG; +use sled_agent_types::early_networking::LinkFec; +use sled_agent_types::early_networking::LinkSpeed; +use sled_agent_types::early_networking::PortConfig; +use sled_agent_types::early_networking::RackNetworkConfig; +use sled_agent_types::early_networking::TxEqConfig; +use sled_agent_types::early_networking::UplinkAddress; +use sled_agent_types::early_networking::UplinkAddressConfig; +use slog::Logger; +use slog::info; +use slog::warn; +use slog_error_chain::InlineErrorChain; +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use std::collections::HashMap; +use std::net::IpAddr; + +type DpdClientError = dpd_client::Error; + +const DPD_TAG: Option<&'static str> = Some(OMICRON_DPD_TAG); + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +pub struct DpdPortOperationFailure { + pub port_id: DpdQsfp, + pub error: String, +} + +/// Status of reconciling QSFP port settings with `dpd`. +#[derive(Debug, Clone)] +pub enum DpdPortReconcilerStatus { + /// Reconciliation failed while attempting to read the current settings from + /// `dpd`. + FailedReadingCurrentSettings(String), + + /// Reconciliation failed because the data prevented us from constructing a + /// plan - this should be impossible absent bugs. + FailedGeneratingPlan(String), + + /// Reconciliation completed successfully. + Success { + unchanged: BTreeSet, + cleared: BTreeSet, + applied: BTreeSet, + }, + + /// Reconciliation completed but had at least one failure. + PartialSuccess { + unchanged: BTreeSet, + cleared: BTreeSet, + clear_failures: Vec, + applied: BTreeSet, + apply_failures: Vec, + }, +} + +impl slog::KV for DpdPortReconcilerStatus { + fn serialize( + &self, + _record: &slog::Record<'_>, + serializer: &mut dyn slog::Serializer, + ) -> slog::Result { + let skipped_key = "port-reconciler-skipped"; + match self { + Self::FailedReadingCurrentSettings(reason) => { + serializer.emit_str(skipped_key.into(), reason) + } + Self::FailedGeneratingPlan(reason) => { + serializer.emit_str(skipped_key.into(), reason) + } + Self::Success { unchanged, cleared, applied } => { + // Only show a summary count; we have individual log statements + // for each clear/apply. + for (key, val) in [ + ("port-settings-unchanged", unchanged.len()), + ("port-settings-successfully-cleared", cleared.len()), + ("port-settings-failed-to-clear", 0), + ("port-settings-successfully-applied", applied.len()), + ("port-settings-failed-to-apply", 0), + ] { + serializer.emit_usize(key.into(), val)?; + } + Ok(()) + } + Self::PartialSuccess { + unchanged, + cleared, + clear_failures, + applied, + apply_failures, + } => { + // Only show a summary count; we have individual log statements + // for each clear/apply. + for (key, val) in [ + ("port-settings-unchanged", unchanged.len()), + ("port-settings-successfully-cleared", cleared.len()), + ("port-settings-failed-to-clear", clear_failures.len()), + ("port-settings-successfully-applied", applied.len()), + ("port-settings-failed-to-apply", apply_failures.len()), + ] { + serializer.emit_usize(key.into(), val)?; + } + Ok(()) + } + } + } +} + +#[derive(Default, Debug)] +pub(super) struct PortReconciler { + // The set of QSFP ports is a physical property; it never changes for a + // given switch, so we can cache this value once. The first time dpd + // successfully returns the list of ports, we save that value here and reuse + // it forever; it can never change. + cached_qsfp_ports: Option>, +} + +impl PortReconciler { + pub(super) async fn reconcile( + &mut self, + client: &Client, + desired_config: &RackNetworkConfig, + our_switch_slot: ThisSledSwitchSlot, + log: &Logger, + ) -> DpdPortReconcilerStatus { + let dpd_current_settings = match self + .dpd_get_current_settings(client, log) + .await + { + Ok(settings) => settings, + Err(err) => { + return DpdPortReconcilerStatus::FailedReadingCurrentSettings( + format!( + "failed to read current port settings from dpd: {}", + InlineErrorChain::new(&err), + ), + ); + } + }; + + let plan = match ReconciliationPlan::new( + dpd_current_settings, + desired_config, + our_switch_slot, + log, + ) { + Ok(plan) => plan, + Err(err) => { + // Ensure `err` is actually a string; if it changes to a proper + // error type, we need to use `InlineErrorChain` here instead. + let err: &str = &err; + return DpdPortReconcilerStatus::FailedGeneratingPlan(format!( + "failed to generate plan to apply port settings: {err}", + )); + } + }; + + apply_plan(client, plan, log).await + } + + async fn dpd_get_current_settings( + &mut self, + client: &Client, + log: &Logger, + ) -> Result, DpdClientError> { + let qsfp_ports = match self.cached_qsfp_ports.as_mut() { + Some(cached) => cached.as_slice(), + None => { + let ports = client + .port_list() + .await? + .into_inner() + .into_iter() + .filter_map(|port| match port { + // We're only responsible for applying settings to QSFP + // ports; any other kind of port cannot have settings + // populated in `RackNetworkConfig` and is + // internal-to-dpd. + DpdPortId::Internal(_) | DpdPortId::Rear(_) => None, + DpdPortId::Qsfp(qsfp) => Some(qsfp), + }) + .collect::>(); + info!( + log, "cached set of qsfp port IDs"; + "num-qsfp-ports" => ports.len(), + ); + self.cached_qsfp_ports.insert(ports).as_slice() + } + }; + + let mut config_by_port = BTreeMap::new(); + for port_id in qsfp_ports.iter().cloned() { + let settings = client + .port_settings_get(&DpdPortId::Qsfp(port_id.clone()), DPD_TAG) + .await? + .into_inner(); + // Check for empty ports and filter those out. + // + // TODO-performance We could consider caching "ports known to have + // no links", as that _shouldn't_ change unless we apply settings to + // a port. But "shouldn't" is doing a lot of work here, and caching + // has all the usual problems! For now, we'll just fetch all the + // port settings every time, and we can tune that down in the future + // if needed. + let are_port_settings_clear = { + let DpdPortSettings { links } = &settings; + links.is_empty() + }; + if !are_port_settings_clear { + config_by_port.insert(port_id, settings); + } + } + + Ok(config_by_port) + } +} + +/// Apply the contents of `plan` to dpd via `client`. +/// +/// This requires `plan.to_clear.len() + plan.to_apply.len()` independent +/// calls to `dpd`. We do not short circuit on failure: we'll always attempt to +/// make every call required. This may not be the right choice, but some +/// arguments in favor: +/// +/// * In practice we expect the number of calls here to be small. On startup we +/// expect 1-32 `to_apply` calls (one for each configured uplink), and for +/// every reconciliation attempt after that we expect 0-1 (either no changes, +/// or a single port has had its settings changed; it's possible we'll see +/// multiple ports change at once, but at most 32). +/// * We always want to report the status of every step described by `plan`, and +/// implementing stop-on-first-failure means we'd need to record a "didn't +/// attempt because of an earlier failure" status for some steps. That's +/// doable but annoying. +async fn apply_plan( + client: &Client, + plan: ReconciliationPlan, + log: &Logger, +) -> DpdPortReconcilerStatus { + let ReconciliationPlan { unchanged, to_clear, to_apply } = plan; + + let mut cleared = BTreeSet::new(); + let mut clear_failures = Vec::new(); + for port_id in to_clear { + match client + .port_settings_clear(&DpdPortId::Qsfp(port_id.clone()), DPD_TAG) + .await + { + Ok(_) => { + info!( + log, "successfully cleared settings for port"; + "port_id" => port_id.to_string(), + ); + cleared.insert(port_id); + } + Err(err) => { + let err = InlineErrorChain::new(&err); + warn!( + log, "failed to clear port settings"; + "port_id" => port_id.to_string(), + &err, + ); + clear_failures.push(DpdPortOperationFailure { + port_id, + error: err.to_string(), + }); + } + } + } + + let mut applied = BTreeSet::new(); + let mut apply_failures = Vec::new(); + for (port_id, settings) in to_apply { + match client + .port_settings_apply( + &DpdPortId::Qsfp(port_id.clone()), + DPD_TAG, + &settings, + ) + .await + { + Ok(_) => { + info!( + log, "successfully applied settings for port"; + "port_id" => port_id.to_string(), + ); + applied.insert(port_id); + } + Err(err) => { + let err = InlineErrorChain::new(&err); + warn!( + log, "failed to apply port settings"; + "port_id" => port_id.to_string(), + &err, + ); + apply_failures.push(DpdPortOperationFailure { + port_id, + error: err.to_string(), + }); + } + } + } + + if clear_failures.is_empty() && apply_failures.is_empty() { + DpdPortReconcilerStatus::Success { unchanged, cleared, applied } + } else { + DpdPortReconcilerStatus::PartialSuccess { + unchanged, + cleared, + clear_failures, + applied, + apply_failures, + } + } +} + +#[derive(Debug, PartialEq)] +struct ReconciliationPlan { + // Set of ports whose settings are already correct in `dpd`. + unchanged: BTreeSet, + + // Set of ports that have settings in `dpd` but not in our desired config; + // these should be cleared. + to_clear: BTreeSet, + + // Set of ports whose settings in `dpd` don't match our desired config or + // don't exist at all; these need to be applied. + to_apply: BTreeMap, +} + +impl ReconciliationPlan { + fn new( + dpd_current_settings: BTreeMap, + config: &RackNetworkConfig, + our_switch_slot: ThisSledSwitchSlot, + log: &Logger, + ) -> Result { + // Helper for all the places in this method where we have to convert a + // string back into a `DpdQsfp`. We never expect this to fail in + // practice, but want to report the source of the bad port name if it + // happens. + fn parse_port_id( + port_id: &str, + source: &str, + ) -> Result { + port_id.parse().map_err(|err| { + format!( + "invalid port ID `{port_id}` in {source}: {}", + InlineErrorChain::new(&err) + ) + }) + } + + // Convert dpd settings into a diffable form. + let dpd_current_settings = dpd_current_settings + .into_iter() + .map(DiffablePortSettings::try_from) + .collect::, _>>() + .map_err(|err| InlineErrorChain::new(&err).to_string())?; + + // Convert desired config into a diffable form. + let desired_settings = config + .ports + .iter() + .filter(|p| p.switch == our_switch_slot) + .map(DiffablePortSettings::from) + .collect::>(); + + let id_ord_map::Diff { common, added, removed } = + dpd_current_settings.diff(&desired_settings); + + // Any entries removed are ports that have settings in dpd but not + // `config`; we need to clear them. + let to_clear = removed + .into_iter() + .map(|item| parse_port_id(&item.port_id, "dpd")) + .collect::, _>>()?; + + // Any entries added are ports that have settings in `config` but not + // dpd; we need to add them. + let mut to_apply = added + .into_iter() + .map(|p| { + let port_id = parse_port_id(&p.port_id, "rack network config")?; + Ok::<_, String>((port_id, DpdPortSettings::from(p))) + }) + .collect::, _>>()?; + let mut unchanged = BTreeSet::new(); + + // For any entries in common (i.e., the key exists in both dpd and + // `config`), we have to check whether any values changed. For every + // leaf in common, we'll either add its port ID to `unchanged` (if the + // values match) or we'll add the desired value to `to_apply`. + for leaf in common { + if leaf.is_unchanged() { + unchanged.insert(parse_port_id( + leaf.key(), + "rack network config AND dpd", + )?); + } else { + let port_id = + parse_port_id(leaf.key(), "rack network config AND dpd")?; + + // `common` is a map of unique keys that must be distinct from + // the `added` keys used to seed `to_apply`, so these inserts + // are guaranteed to all be unique. + to_apply.insert(port_id, (*leaf.after()).into()); + } + } + + info!( + log, + "generated dpd port settings reconciliation plan"; + "ports_unchanged" => unchanged.len(), + "ports_to_clear" => to_clear.len(), + "ports_to_apply" => to_apply.len(), + ); + + Ok(Self { unchanged, to_clear, to_apply }) + } +} + +// We convert both `RackNetworkConfig`'s port settings and the `DpdPortSettings` +// we read from `dpd` into this type to compute the diff. +#[derive(Debug, Clone, PartialEq, Eq, daft::Diffable)] +struct DiffablePortSettings { + port_id: String, + autoneg: bool, + tx_eq: Option, + fec: Option, + speed: LinkSpeed, + addrs: BTreeSet, +} + +impl IdOrdItem for DiffablePortSettings { + type Key<'a> = &'a str; + + fn key(&self) -> Self::Key<'_> { + &self.port_id + } + + iddqd::id_upcast!(); +} + +impl From<&'_ PortConfig> for DiffablePortSettings { + fn from(port: &'_ PortConfig) -> Self { + Self { + port_id: port.port.clone(), + autoneg: port.autoneg, + tx_eq: port.tx_eq, + fec: port.uplink_port_fec, + speed: port.uplink_port_speed, + addrs: port + .addresses + .iter() + .filter_map(|a| { + let UplinkAddressConfig { + address, + // Discard `vlan_id` - that's handled by `uplinkd`. + vlan_id: _, + } = a; + + match address { + UplinkAddress::AddrConf => None, + UplinkAddress::Static { ip_net } => { + // Discard the `ip_net` prefix, which is also + // handled by `uplinkd`. We only need the IP. + Some(ip_net.addr()) + } + } + }) + .collect(), + } + } +} + +#[derive(Debug, thiserror::Error)] +#[error( + "expected exactly 1 link per port in dpd, but got {nlinks} on port {port}" +)] +struct UnexpectedLinkCount { + nlinks: usize, + port: String, +} + +impl TryFrom<(DpdQsfp, DpdPortSettings)> for DiffablePortSettings { + type Error = UnexpectedLinkCount; + + fn try_from( + value: (DpdQsfp, DpdPortSettings), + ) -> Result { + let (port_id, DpdPortSettings { links }) = value; + + // We only expect to be constructed if there's exactly one link + // configured: + // + // * 0 links is an empty port; those should be filtered out by + // `dpd_get_current_settings()` + // * 2 or more links cannot be represented in `RackNetworkConfig` today, + // so it shouldn't be possible for `dpd` to report that on any port + if links.len() != 1 { + return Err(UnexpectedLinkCount { + nlinks: links.len(), + port: port_id.to_string(), + }); + } + // We just confirmed there's exactly one link; take ownership of it. + let sole_link = links.into_values().next().unwrap(); + + let tx_eq = sole_link.params.tx_eq.map(|t| TxEqConfig { + main: t.main, + post1: t.post1, + post2: t.post2, + pre1: t.pre1, + pre2: t.pre2, + }); + + let fec = sole_link.params.fec.map(|f| match f { + DpdPortFec::Firecode => LinkFec::Firecode, + DpdPortFec::None => LinkFec::None, + DpdPortFec::Rs => LinkFec::Rs, + }); + + let speed = match sole_link.params.speed { + DpdPortSpeed::Speed0G => LinkSpeed::Speed0G, + DpdPortSpeed::Speed1G => LinkSpeed::Speed1G, + DpdPortSpeed::Speed10G => LinkSpeed::Speed10G, + DpdPortSpeed::Speed25G => LinkSpeed::Speed25G, + DpdPortSpeed::Speed40G => LinkSpeed::Speed40G, + DpdPortSpeed::Speed50G => LinkSpeed::Speed50G, + DpdPortSpeed::Speed100G => LinkSpeed::Speed100G, + DpdPortSpeed::Speed200G => LinkSpeed::Speed200G, + DpdPortSpeed::Speed400G => LinkSpeed::Speed400G, + }; + + // dont consider link local addresses in change computation + let addrs = sole_link + .addrs + .into_iter() + .filter(|ip| match ip { + IpAddr::V6(ip) if ip.is_unicast_link_local() => false, + _ => true, + }) + .collect(); + + Ok(Self { + port_id: port_id.to_string(), + autoneg: sole_link.params.autoneg, + tx_eq, + fec, + speed, + addrs, + }) + } +} + +impl From<&'_ DiffablePortSettings> for DpdPortSettings { + fn from(port: &DiffablePortSettings) -> Self { + let autoneg = port.autoneg; + let addrs = port.addrs.iter().copied().collect(); + let kr = false; //NOTE: kr does not apply to user configurable links. + + let fec = port.fec.map(|f| match f { + LinkFec::Firecode => DpdPortFec::Firecode, + LinkFec::None => DpdPortFec::None, + LinkFec::Rs => DpdPortFec::Rs, + }); + + let speed = match port.speed { + LinkSpeed::Speed0G => DpdPortSpeed::Speed0G, + LinkSpeed::Speed1G => DpdPortSpeed::Speed1G, + LinkSpeed::Speed10G => DpdPortSpeed::Speed10G, + LinkSpeed::Speed25G => DpdPortSpeed::Speed25G, + LinkSpeed::Speed40G => DpdPortSpeed::Speed40G, + LinkSpeed::Speed50G => DpdPortSpeed::Speed50G, + LinkSpeed::Speed100G => DpdPortSpeed::Speed100G, + LinkSpeed::Speed200G => DpdPortSpeed::Speed200G, + LinkSpeed::Speed400G => DpdPortSpeed::Speed400G, + }; + + let tx_eq = port.tx_eq.map(|t| DpdTxEq { + main: t.main, + post1: t.post1, + post2: t.post2, + pre1: t.pre1, + pre2: t.pre2, + }); + + // TODO breakouts? + let mut links = HashMap::with_capacity(1); + let link_id = DpdLinkId(0); + links.insert( + link_id.to_string(), + DpdLinkSettings { + addrs, + params: DpdLinkCreate { + autoneg, + fec, + kr, + lane: Some(link_id), + speed, + tx_eq, + }, + }, + ); + + DpdPortSettings { links } + } +} + +#[cfg(test)] +mod tests; diff --git a/sled-agent/scrimlet-reconcilers/src/dpd_reconciler/port_reconciler/tests.rs b/sled-agent/scrimlet-reconcilers/src/dpd_reconciler/port_reconciler/tests.rs new file mode 100644 index 00000000000..e077f03a81c --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/dpd_reconciler/port_reconciler/tests.rs @@ -0,0 +1,929 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::*; +use dpd_client::types::LinkCreate as DpdLinkCreate; +use dpd_client::types::LinkId as DpdLinkId; +use dpd_client::types::LinkSettings as DpdLinkSettings; +use dpd_client::types::PortFec as DpdPortFec; +use dpd_client::types::PortSpeed as DpdPortSpeed; +use gateway_messages::SpPort; +use omicron_test_utils::dev; +use proptest::prelude::proptest as proptest_macro; +use proptest::strategy::Strategy; +use sled_agent_types::early_networking::LinkFec; +use sled_agent_types::early_networking::LinkSpeed; +use sled_agent_types::early_networking::PortConfig; +use sled_agent_types::early_networking::RackNetworkConfig; +use sled_agent_types::early_networking::SwitchSlot; +use sled_agent_types::early_networking::UplinkAddress; +use sled_agent_types::early_networking::UplinkAddressConfig; +use sled_agent_types::early_networking::UplinkIpNet; +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use std::collections::HashMap; +use std::net::IpAddr; +use test_strategy::Arbitrary; +use test_strategy::proptest; +use tokio::sync::Mutex; +use tokio::task::block_in_place; + +use crate::switch_zone_slot::ThisSledSwitchSlot; + +/// Build a minimal `PortConfig` for a single port. +fn port_config( + port: &DpdQsfp, + switch: SwitchSlot, + speed: LinkSpeed, + fec: Option, + autoneg: bool, + addrs: &[UplinkIpNet], +) -> PortConfig { + PortConfig { + routes: Vec::new(), + addresses: addrs + .iter() + .copied() + .map(|ip_net| UplinkAddressConfig { + address: UplinkAddress::Static { ip_net }, + vlan_id: None, + }) + .collect(), + switch, + port: port.to_string(), + uplink_port_speed: speed, + uplink_port_fec: fec, + bgp_peers: Vec::new(), + autoneg, + lldp: None, + tx_eq: None, + } +} + +/// Build `DpdPortSettings` with a single link (the only layout we support). +fn dpd_port_settings( + speed: DpdPortSpeed, + fec: Option, + autoneg: bool, + addrs: Vec, +) -> DpdPortSettings { + let mut links = HashMap::new(); + let link_id = DpdLinkId(0); + links.insert( + link_id.to_string(), + DpdLinkSettings { + addrs, + params: DpdLinkCreate { + autoneg, + fec, + kr: false, + lane: Some(link_id), + speed, + tx_eq: None, + }, + }, + ); + DpdPortSettings { links } +} + +fn rack_config(ports: Vec) -> RackNetworkConfig { + RackNetworkConfig { + rack_subnet: "fd00::/48".parse().unwrap(), + infra_ip_first: "10.0.0.1".parse().unwrap(), + infra_ip_last: "10.0.0.100".parse().unwrap(), + ports, + bgp: Vec::new(), + bfd: Vec::new(), + } +} + +#[test] +fn plan_all_unchanged() { + let logctx = dev::test_setup_log("plan_all_unchanged"); + let log = &logctx.log; + + let qsfp0: DpdQsfp = "qsfp0".parse().unwrap(); + let addr: IpAddr = "10.0.0.1".parse().unwrap(); + let ip_net: UplinkIpNet = format!("{addr}/24").parse().unwrap(); + + // Desired config: one port on Switch0. + let config = rack_config(vec![port_config( + &qsfp0, + SwitchSlot::Switch0, + LinkSpeed::Speed100G, + Some(LinkFec::Rs), + true, + &[ip_net], + )]); + + // Current dpd state matches exactly. + let dpd_current = BTreeMap::from([( + qsfp0.clone(), + dpd_port_settings( + DpdPortSpeed::Speed100G, + Some(DpdPortFec::Rs), + true, + vec![addr], + ), + )]); + + let plan = ReconciliationPlan::new( + dpd_current, + &config, + ThisSledSwitchSlot::TEST_FAKE, + log, + ) + .expect("plan should succeed"); + + assert_eq!(plan.unchanged, BTreeSet::from([qsfp0])); + assert!(plan.to_clear.is_empty()); + assert!(plan.to_apply.is_empty()); + + logctx.cleanup_successful(); +} + +#[test] +fn plan_create_all() { + let logctx = dev::test_setup_log("plan_create_all"); + let log = &logctx.log; + + let qsfp0: DpdQsfp = "qsfp0".parse().unwrap(); + let qsfp1: DpdQsfp = "qsfp1".parse().unwrap(); + + // Desired config: two ports on Switch0. + let config = rack_config(vec![ + port_config( + &qsfp0, + SwitchSlot::Switch0, + LinkSpeed::Speed100G, + Some(LinkFec::Rs), + true, + &["10.0.0.1/24".parse().unwrap()], + ), + port_config( + &qsfp1, + SwitchSlot::Switch0, + LinkSpeed::Speed25G, + None, + false, + &["10.0.0.2/24".parse().unwrap()], + ), + ]); + + // dpd has no settings at all. + let dpd_current = BTreeMap::new(); + + let plan = ReconciliationPlan::new( + dpd_current, + &config, + ThisSledSwitchSlot::TEST_FAKE, + log, + ) + .expect("plan should succeed"); + + assert!(plan.unchanged.is_empty()); + assert!(plan.to_clear.is_empty()); + assert_eq!(plan.to_apply.keys().collect::>(), vec![&qsfp0, &qsfp1]); + + logctx.cleanup_successful(); +} + +#[test] +fn plan_clear_all() { + let logctx = dev::test_setup_log("plan_clear_all"); + let log = &logctx.log; + + let qsfp0: DpdQsfp = "qsfp0".parse().unwrap(); + let qsfp1: DpdQsfp = "qsfp1".parse().unwrap(); + + // Desired config: no ports. + let config = rack_config(vec![]); + + // dpd has settings for two ports. + let dpd_current = BTreeMap::from([ + ( + qsfp0.clone(), + dpd_port_settings( + DpdPortSpeed::Speed100G, + Some(DpdPortFec::Rs), + true, + vec!["10.0.0.1".parse().unwrap()], + ), + ), + ( + qsfp1.clone(), + dpd_port_settings( + DpdPortSpeed::Speed25G, + None, + false, + vec!["10.0.0.2".parse().unwrap()], + ), + ), + ]); + + let plan = ReconciliationPlan::new( + dpd_current, + &config, + ThisSledSwitchSlot::TEST_FAKE, + log, + ) + .expect("plan should succeed"); + + assert!(plan.unchanged.is_empty()); + assert_eq!(plan.to_clear, BTreeSet::from([qsfp0, qsfp1])); + assert!(plan.to_apply.is_empty()); + + logctx.cleanup_successful(); +} + +#[test] +fn plan_mix() { + let logctx = dev::test_setup_log("plan_mix"); + let log = &logctx.log; + + let qsfp0: DpdQsfp = "qsfp0".parse().unwrap(); + let qsfp1: DpdQsfp = "qsfp1".parse().unwrap(); + let qsfp2: DpdQsfp = "qsfp2".parse().unwrap(); + let qsfp3: DpdQsfp = "qsfp3".parse().unwrap(); + let ip0: IpAddr = "10.0.0.1".parse().unwrap(); + let ip1: IpAddr = "10.0.0.2".parse().unwrap(); + let ip2: IpAddr = "10.0.0.3".parse().unwrap(); + let ip3: IpAddr = "10.0.0.4".parse().unwrap(); + let ip_net0: UplinkIpNet = format!("{ip0}/24").parse().unwrap(); + let ip_net1: UplinkIpNet = format!("{ip1}/24").parse().unwrap(); + let ip_net3: UplinkIpNet = format!("{ip3}/24").parse().unwrap(); + + // Desired config: qsfp0 unchanged, qsfp1 changed speed, qsfp3 new. + // qsfp2 should be cleared (exists in dpd but not in desired config). + let config = rack_config(vec![ + port_config( + &qsfp0, + SwitchSlot::Switch0, + LinkSpeed::Speed100G, + Some(LinkFec::Rs), + true, + &[ip_net0], + ), + port_config( + &qsfp1, + SwitchSlot::Switch0, + LinkSpeed::Speed100G, + None, + false, + &[ip_net1], + ), + port_config( + &qsfp3, + SwitchSlot::Switch0, + LinkSpeed::Speed50G, + None, + true, + &[ip_net3], + ), + ]); + + let dpd_current = BTreeMap::from([ + ( + qsfp0.clone(), + dpd_port_settings( + DpdPortSpeed::Speed100G, + Some(DpdPortFec::Rs), + true, + vec![ip0], + ), + ), + ( + qsfp1.clone(), + dpd_port_settings(DpdPortSpeed::Speed25G, None, false, vec![ip1]), + ), + ( + qsfp2.clone(), + dpd_port_settings(DpdPortSpeed::Speed10G, None, false, vec![ip2]), + ), + ]); + + let plan = ReconciliationPlan::new( + dpd_current, + &config, + ThisSledSwitchSlot::TEST_FAKE, + log, + ) + .expect("plan should succeed"); + + // qsfp0: unchanged + assert_eq!(plan.unchanged, BTreeSet::from([qsfp0])); + // qsfp2: in dpd but not desired + assert_eq!(plan.to_clear, BTreeSet::from([qsfp2])); + // qsfp1: changed, qsfp3: new + assert_eq!( + plan.to_apply.keys().collect::>(), + BTreeSet::from([&qsfp1, &qsfp3]), + ); + + logctx.cleanup_successful(); +} + +#[test] +fn plan_filters_other_switch_slot() { + let logctx = dev::test_setup_log("plan_filters_other_switch_slot"); + let log = &logctx.log; + + let qsfp0: DpdQsfp = "qsfp0".parse().unwrap(); + let qsfp1: DpdQsfp = "qsfp1".parse().unwrap(); + + // Desired config has ports on both Switch0 and Switch1. Our reconciler is + // Switch0 (TEST_FAKE), so Switch1 ports should be ignored entirely. + let config = rack_config(vec![ + port_config( + &qsfp0, + SwitchSlot::Switch0, + LinkSpeed::Speed100G, + None, + true, + &["10.0.0.1/24".parse().unwrap()], + ), + port_config( + &qsfp1, + SwitchSlot::Switch1, // different switch + LinkSpeed::Speed25G, + None, + false, + &["10.0.0.2/24".parse().unwrap()], + ), + ]); + + // dpd is empty. + let dpd_current = BTreeMap::new(); + + let plan = ReconciliationPlan::new( + dpd_current, + &config, + ThisSledSwitchSlot::TEST_FAKE, + log, + ) + .expect("plan should succeed"); + + // Only qsfp0 should appear; qsfp1 is on the wrong switch. + assert!(plan.unchanged.is_empty()); + assert!(plan.to_clear.is_empty()); + assert_eq!(plan.to_apply.keys().collect::>(), vec![&qsfp0],); + + logctx.cleanup_successful(); +} + +#[test] +fn plan_link_local_addrs_ignored_from_dpd() { + let logctx = dev::test_setup_log("plan_link_local_addrs_ignored_from_dpd"); + let log = &logctx.log; + + let qsfp0: DpdQsfp = "qsfp0".parse().unwrap(); + let addr: IpAddr = "10.0.0.1".parse().unwrap(); + let ip_net: UplinkIpNet = format!("{addr}/24").parse().unwrap(); + let link_local: IpAddr = "fe80::1".parse().unwrap(); + + // Desired config: one port with one address. + let config = rack_config(vec![port_config( + &qsfp0, + SwitchSlot::Switch0, + LinkSpeed::Speed100G, + None, + true, + &[ip_net], + )]); + + // dpd has the same config but also reports a link-local address. + // The reconciler should ignore the link-local and see no diff. + let dpd_current = BTreeMap::from([( + qsfp0.clone(), + dpd_port_settings( + DpdPortSpeed::Speed100G, + None, + true, + vec![addr, link_local], + ), + )]); + + let plan = ReconciliationPlan::new( + dpd_current, + &config, + ThisSledSwitchSlot::TEST_FAKE, + log, + ) + .expect("plan should succeed"); + + assert_eq!(plan.unchanged, BTreeSet::from([qsfp0])); + assert!(plan.to_clear.is_empty()); + assert!(plan.to_apply.is_empty()); + + logctx.cleanup_successful(); +} + +#[test] +fn plan_rejects_multi_link_dpd_port() { + let logctx = dev::test_setup_log("plan_rejects_multi_link_dpd_port"); + let log = &logctx.log; + + let qsfp0: DpdQsfp = "qsfp0".parse().unwrap(); + let config = rack_config(vec![]); + + // Build DpdPortSettings with two links -- this is not representable in + // RackNetworkConfig and should cause plan generation to fail. + let mut links = HashMap::new(); + let link0 = DpdLinkId(0); + let link1 = DpdLinkId(1); + let link_params = DpdLinkCreate { + autoneg: true, + fec: None, + kr: false, + lane: Some(link0), + speed: DpdPortSpeed::Speed100G, + tx_eq: None, + }; + links.insert( + link0.to_string(), + DpdLinkSettings { + addrs: vec!["10.0.0.1".parse().unwrap()], + params: link_params.clone(), + }, + ); + links.insert( + link1.to_string(), + DpdLinkSettings { + addrs: vec!["10.0.0.2".parse().unwrap()], + params: link_params, + }, + ); + let dpd_current = BTreeMap::from([(qsfp0, DpdPortSettings { links })]); + + let err = ReconciliationPlan::new( + dpd_current, + &config, + ThisSledSwitchSlot::TEST_FAKE, + log, + ) + .expect_err("plan should fail with multi-link port"); + + assert!(err.contains("expected exactly 1 link"), "unexpected error: {err}",); + + logctx.cleanup_successful(); +} + +#[test] +fn plan_created_settings_round_trip_correctly() { + let logctx = + dev::test_setup_log("plan_created_settings_round_trip_correctly"); + let log = &logctx.log; + + let qsfp5: DpdQsfp = "qsfp5".parse().unwrap(); + let ip1: IpAddr = "10.0.0.1".parse().unwrap(); + let ip2: IpAddr = "10.0.0.2".parse().unwrap(); + let ip_net1: UplinkIpNet = format!("{ip1}/24").parse().unwrap(); + let ip_net2: UplinkIpNet = format!("{ip2}/24").parse().unwrap(); + + // Desired config with various settings populated. + let config = rack_config(vec![port_config( + &qsfp5, + SwitchSlot::Switch0, + LinkSpeed::Speed100G, + Some(LinkFec::Rs), + true, + &[ip_net1, ip_net2], + )]); + + // dpd is empty, so qsfp5 should be in `to_apply`. + let dpd_current = BTreeMap::new(); + + let plan = ReconciliationPlan::new( + dpd_current, + &config, + ThisSledSwitchSlot::TEST_FAKE, + log, + ) + .expect("plan should succeed"); + + let created = plan.to_apply.get(&qsfp5).expect("qsfp5 should exist"); + + // Verify the DpdPortSettings matches what we'd expect. + assert_eq!(created.links.len(), 1); + let link = created.links.values().next().unwrap(); + assert_eq!(link.params.speed, DpdPortSpeed::Speed100G); + assert_eq!(link.params.fec, Some(DpdPortFec::Rs)); + assert!(link.params.autoneg); + assert_eq!( + link.addrs.iter().copied().collect::>(), + BTreeSet::from([ip1, ip2]), + ); + + logctx.cleanup_successful(); +} + +#[derive(Debug, Clone, PartialEq, Eq, Arbitrary)] +struct ArbitraryPortSettings { + autoneg: bool, + tx_eq: Option, + fec: Option, + speed: LinkSpeed, + #[strategy(proptest::collection::btree_set( + proptest::arbitrary::any::(), + 0..=4, + ))] + addrs: BTreeSet, +} + +impl ArbitraryPortSettings { + fn to_dpd_settings(&self, port_id: &PortId) -> DpdPortSettings { + DpdPortSettings::from(&DiffablePortSettings { + port_id: port_id.0.clone(), + autoneg: self.autoneg, + tx_eq: self.tx_eq, + fec: self.fec, + speed: self.speed, + addrs: self + .addrs + .iter() + .filter_map(|addr| match addr.address { + UplinkAddress::AddrConf => None, + UplinkAddress::Static { ip_net } => Some(ip_net.addr()), + }) + .collect(), + }) + } +} + +#[derive(Debug, Clone, Arbitrary)] +enum SwitchPortSettingsTestInput { + // port only exists in dpd + DpdOnly(ArbitraryPortSettings), + + // port only exists in desired config for switch 0 or switch 1 + DesiredSwitch0(ArbitraryPortSettings), + DesiredSwitch1(ArbitraryPortSettings), + + // port exists in dpd and desired switch 0 with the same settings + DpdAndSwitch0Same(ArbitraryPortSettings), + + // port exists in dpd and desired switch 0, but the settings may be + // different (usually will be, but randomly could be the same still!) + DpdAndSwitch0Changed { + dpd: ArbitraryPortSettings, + switch0: ArbitraryPortSettings, + }, +} + +// We cap the arbitrary port generation here to at most `qsfp30`, because the +// `dpd` stub binary has a bug that makes `qsfp31` unusable: +// . +#[derive(Debug, Clone, Arbitrary, PartialEq, Eq, PartialOrd, Ord)] +struct PortId(#[strategy((0..31).prop_map(|n| format!("qsfp{n}")))] String); + +impl PortId { + fn to_dpd(&self) -> DpdQsfp { + self.0.parse().unwrap() + } +} + +#[derive(Debug, Clone, Arbitrary)] +struct TestInput { + ports: BTreeMap, +} + +impl TestInput { + // Map all our arbitrary inputs into just the map that should exist in dpd + // prior to reconciliation. + fn initial_dpd_settings(&self) -> BTreeMap { + self.ports + .iter() + .filter_map(|(port_id, input)| match input { + SwitchPortSettingsTestInput::DpdOnly(dpd) + | SwitchPortSettingsTestInput::DpdAndSwitch0Same(dpd) + | SwitchPortSettingsTestInput::DpdAndSwitch0Changed { + dpd, + .. + } => Some((port_id.to_dpd(), dpd.to_dpd_settings(port_id))), + SwitchPortSettingsTestInput::DesiredSwitch0(_) + | SwitchPortSettingsTestInput::DesiredSwitch1(_) => None, + }) + .collect() + } + + // Map all our arbitrary inputs into the desired `RackNetworkConfig`. + fn desired_rack_config(&self) -> RackNetworkConfig { + let switch0 = + self.ports.iter().filter_map(|(port_id, input)| match input { + SwitchPortSettingsTestInput::DesiredSwitch0(switch0) + | SwitchPortSettingsTestInput::DpdAndSwitch0Same(switch0) + | SwitchPortSettingsTestInput::DpdAndSwitch0Changed { + switch0, + .. + } => Some(diffable_to_port_config( + SwitchSlot::Switch0, + port_id, + switch0, + )), + SwitchPortSettingsTestInput::DpdOnly(_) + | SwitchPortSettingsTestInput::DesiredSwitch1(_) => None, + }); + let switch1 = + self.ports.iter().filter_map(|(port_id, input)| match input { + SwitchPortSettingsTestInput::DesiredSwitch1(switch1) => { + Some(diffable_to_port_config( + SwitchSlot::Switch1, + port_id, + switch1, + )) + } + SwitchPortSettingsTestInput::DpdOnly(_) + | SwitchPortSettingsTestInput::DesiredSwitch0(_) + | SwitchPortSettingsTestInput::DpdAndSwitch0Same(_) + | SwitchPortSettingsTestInput::DpdAndSwitch0Changed { + .. + } => None, + }); + rack_config(switch0.chain(switch1).collect()) + } + + // Build the set of expected unchanged port names based on our arbitrary + // inputs. + fn expected_unchanged(&self) -> BTreeSet { + self.ports + .iter() + .filter_map(|(port_id, input)| match input { + SwitchPortSettingsTestInput::DpdOnly(_) + | SwitchPortSettingsTestInput::DesiredSwitch0(_) + | SwitchPortSettingsTestInput::DesiredSwitch1(_) => None, + SwitchPortSettingsTestInput::DpdAndSwitch0Same(_) => { + Some(port_id.to_dpd()) + } + SwitchPortSettingsTestInput::DpdAndSwitch0Changed { + dpd, + switch0, + } => { + // We have to convert to dpd settings for this comparison; + // if addrconf addresses are involved, they need to be + // filtered out. + if dpd.to_dpd_settings(port_id) + == switch0.to_dpd_settings(port_id) + { + Some(port_id.to_dpd()) + } else { + None + } + } + }) + .collect() + } + + // Build the set of ports we expect to clear in dpd. + fn expected_to_clear(&self) -> BTreeSet { + self.ports + .iter() + .filter_map(|(port_id, input)| match input { + SwitchPortSettingsTestInput::DpdOnly(_) => { + Some(port_id.to_dpd()) + } + SwitchPortSettingsTestInput::DpdAndSwitch0Changed { + .. + } + | SwitchPortSettingsTestInput::DesiredSwitch0(_) + | SwitchPortSettingsTestInput::DesiredSwitch1(_) + | SwitchPortSettingsTestInput::DpdAndSwitch0Same(_) => None, + }) + .collect() + } + + // Build the set of ports we expect to apply because they're new or changed. + fn expected_to_apply(&self) -> BTreeMap { + self.ports + .iter() + .filter_map(|(port_id, input)| match input { + SwitchPortSettingsTestInput::DpdOnly(_) + | SwitchPortSettingsTestInput::DesiredSwitch1(_) + | SwitchPortSettingsTestInput::DpdAndSwitch0Same(_) => None, + SwitchPortSettingsTestInput::DesiredSwitch0(switch0) => { + Some((port_id.to_dpd(), switch0.to_dpd_settings(port_id))) + } + SwitchPortSettingsTestInput::DpdAndSwitch0Changed { + dpd, + switch0, + } => { + // We have to convert to dpd settings for this comparison; + // if addrconf addresses are involved, they need to be + // filtered out. + if dpd.to_dpd_settings(port_id) + != switch0.to_dpd_settings(port_id) + { + Some(( + port_id.to_dpd(), + switch0.to_dpd_settings(port_id), + )) + } else { + None + } + } + }) + .collect() + } + + async fn validate_post_reconciliation( + &self, + client: &dpd_client::Client, + ) -> anyhow::Result<()> { + // Fetch all settings in dpd. + let mut dpd_ports_with_settings = BTreeMap::new(); + for port_id in 0..32 { + let port_id = format!("qsfp{port_id}").parse::().unwrap(); + let settings = client + .port_settings_get(&DpdPortId::Qsfp(port_id.clone()), DPD_TAG) + .await? + .into_inner(); + if !settings.links.is_empty() { + dpd_ports_with_settings.insert(port_id, settings); + } + } + + // We expect to see all the settings we applied plus all unchanged + // ports. + let mut expected_settings = self + .ports + .iter() + .filter_map(|(port_id, input)| match input { + SwitchPortSettingsTestInput::DpdOnly(_) + | SwitchPortSettingsTestInput::DesiredSwitch1(_) => None, + SwitchPortSettingsTestInput::DesiredSwitch0(switch0) + | SwitchPortSettingsTestInput::DpdAndSwitch0Same(switch0) + | SwitchPortSettingsTestInput::DpdAndSwitch0Changed { + switch0, + .. + } => Some((port_id.to_dpd(), switch0.to_dpd_settings(port_id))), + }) + .collect::>(); + + // To compare the state in dpd with what we expect, we need to ignore + // ordering in the vec of addresses. We'll sort them all. + for links in dpd_ports_with_settings + .values_mut() + .chain(expected_settings.values_mut()) + .flat_map(|settings| settings.links.values_mut()) + { + links.addrs.sort_unstable(); + } + + assert_eq!( + dpd_ports_with_settings, expected_settings, + "unexpected settings in dpd after reconciliation" + ); + + Ok(()) + } +} + +fn diffable_to_port_config( + switch: SwitchSlot, + port_id: &PortId, + config: &ArbitraryPortSettings, +) -> PortConfig { + let ArbitraryPortSettings { autoneg, tx_eq, fec, speed, addrs } = config; + PortConfig { + addresses: addrs.into_iter().copied().collect(), + switch, + port: port_id.0.clone(), + uplink_port_speed: *speed, + uplink_port_fec: *fec, + autoneg: *autoneg, + tx_eq: *tx_eq, + + // Fields that aren't involved in dpd configuration + routes: Vec::new(), + bgp_peers: Vec::new(), + lldp: None, + } +} + +#[proptest] +fn proptest_plan(input: TestInput) { + let logctx = dev::test_setup_log("proptest_plan"); + let log = &logctx.log; + + let plan = ReconciliationPlan::new( + input.initial_dpd_settings(), + &input.desired_rack_config(), + ThisSledSwitchSlot::TEST_FAKE, + log, + ) + .expect("plan should succeed"); + + let ReconciliationPlan { unchanged, to_clear, to_apply } = plan; + + assert_eq!(unchanged, input.expected_unchanged(), "incorrect unchanged"); + assert_eq!(to_clear, input.expected_to_clear(), "incorrect to_clear"); + assert_eq!(to_apply, input.expected_to_apply(), "incorrect to_apply"); + + logctx.cleanup_successful(); +} + +#[tokio::test(flavor = "multi_thread")] +async fn proptest_full_reconciliation() { + let logctx = dev::test_setup_log("proptest_full_reconciliation"); + let mgsctx = gateway_test_utils::setup::test_setup( + "proptest_full_reconciliation", + SpPort::One, + ) + .await; + let mut dpdctx = dev::dendrite::DendriteInstance::start( + 0, + None, + Some(mgsctx.address().into()), + ) + .await + .expect("started dendrite"); + let client = dpd_client::Client::new( + &format!("http://{}", dpdctx.address()), + dpd_client::ClientState { + tag: OMICRON_DPD_TAG.to_owned(), + log: logctx.log.clone(), + }, + ); + let rt = tokio::runtime::Handle::current(); + + let reconciler = Mutex::new(PortReconciler::default()); + let one_test_invocation = async |input: TestInput| { + // Clear all ports from a previous proptest invocation. + for port_id in 0..32 { + let port_id = format!("qsfp{port_id}").parse().unwrap(); + client + .port_settings_clear(&DpdPortId::Qsfp(port_id), DPD_TAG) + .await + .expect("cleared port"); + } + + // Apply all initial settings. + for (port_id, settings) in input.initial_dpd_settings() { + client + .port_settings_apply( + &DpdPortId::Qsfp(port_id), + DPD_TAG, + &settings, + ) + .await + .expect("applied initial settings"); + } + + // Perform reconciliation. + let status = reconciler + .lock() + .await + .reconcile( + &client, + &input.desired_rack_config(), + ThisSledSwitchSlot::TEST_FAKE, + &logctx.log, + ) + .await; + + match status { + DpdPortReconcilerStatus::FailedReadingCurrentSettings(_) + | DpdPortReconcilerStatus::FailedGeneratingPlan(_) + | DpdPortReconcilerStatus::PartialSuccess { .. } => { + panic!("unexpected reconciler status: {status:?}"); + } + DpdPortReconcilerStatus::Success { + unchanged, + cleared, + applied, + } => { + assert_eq!( + unchanged, + input.expected_unchanged(), + "incorrect unchanged" + ); + assert_eq!( + cleared, + input.expected_to_clear(), + "incorrect cleared" + ); + let expected_applied = input + .expected_to_apply() + .keys() + .cloned() + .collect::>(); + assert_eq!(applied, expected_applied, "incorrect applied"); + + input + .validate_post_reconciliation(&client) + .await + .expect("validated post reconciliation settings"); + } + } + }; + + proptest_macro!(|(input: TestInput)| { + // Do a little dance to call our async `one_test_invocation` within the + // non-async `proptest_macro!()` context. + block_in_place(|| rt.block_on(one_test_invocation(input))); + }); + + dpdctx.cleanup().await.expect("dpd cleanup succeeded"); + mgsctx.teardown().await; + logctx.cleanup_successful(); +} diff --git a/sled-agent/scrimlet-reconcilers/src/handle.rs b/sled-agent/scrimlet-reconcilers/src/handle.rs new file mode 100644 index 00000000000..d4e74f10390 --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/handle.rs @@ -0,0 +1,400 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! [`ScrimletReconcilers`] is the entry point into this crate for `sled-agent`; +//! it provides a suitable handle for `sled-agent`'s "long running tasks", and +//! contains a handle to each of the inner service-specific reconcilers. + +use crate::DetermineSwitchSlotStatus; +use crate::dpd_reconciler::DpdReconciler; +use crate::mgd_reconciler::MgdReconciler; +use crate::reconciler_task::ReconcilerTaskHandle; +use crate::status::ScrimletReconcilersStatus; +use crate::status::ScrimletStatus; +use crate::switch_zone_slot::ThisSledSwitchSlot; +use crate::uplinkd_reconciler::UplinkdReconciler; +use omicron_common::address::DENDRITE_PORT; +use omicron_common::address::MGD_PORT; +use omicron_common::address::MGS_PORT; +use sled_agent_types::sled::ThisSledSwitchZoneUnderlayIpAddr; +use sled_agent_types::system_networking::SystemNetworkingConfig; +use slog::Logger; +use slog::info; +use std::net::SocketAddr; +use std::net::SocketAddrV6; +use std::sync::Arc; +use std::sync::OnceLock; +use std::time::Duration; +use tokio::sync::watch; + +/// Mode in which the scrimlet reconcilers should run. +/// +/// This exists to support tests where we don't have a real switch zone like +/// we expect to have on real hardware. The production sled-agent will always +/// pass `SwitchZone(ip)`; in this mode, we'll run reconcilers that talk to +/// services at the provided IP with their well-known ports and that communicate +/// with SMF within the switch zone. In the `Test { .. }` mode, we'll point the +/// reconcilers at the specified addresses for some services, and won't run the +/// SMF-based reconcilers at all. +#[derive(Debug, Clone, Copy)] +pub enum ScrimletReconcilersMode { + SwitchZone(ThisSledSwitchZoneUnderlayIpAddr), + #[cfg(any(test, feature = "testing"))] + Test { + mgs_addr: SocketAddr, + dpd_addr: SocketAddr, + mgd_addr: SocketAddr, + }, +} + +impl ScrimletReconcilersMode { + // Build a `reqwest` client with different timeout settings depending on + // whether we're expecting to contact a real service or a test one. + fn reqwest_client(&self) -> reqwest::Client { + match self { + ScrimletReconcilersMode::SwitchZone(_) => { + // Build a custom reqwest client, primarily to set a lower + // `pool_idle_timeout`. dropshot's default connection timeout is + // 30 seconds. We want to ensure we don't hit + // in any + // reconcilers that try to re-reconcile on a 30 second interval, + // so we choose a much lower `pool_idle_timeout`: 10 seconds is + // long enough to reuse a connection for all the requests made + // during one reconciliation pass, but is short enough we should + // discard it before the server wants to time us out. + // + // The 15 second connect and read timeout are consistent with + // progenitor's normal defaults. + reqwest::ClientBuilder::new() + .connect_timeout(Duration::from_secs(15)) + .pool_idle_timeout(Duration::from_secs(10)) + .build() + .expect("reqwest parameters are valid") + } + #[cfg(any(test, feature = "testing"))] + ScrimletReconcilersMode::Test { .. } => { + // Some of our tests use tokio's paused time. We want to + // construct a reqwest client that does not specify any timeouts + // at all, allowing it to wait forever; this plays nicely with + // paused time. (Paused time + timeouts cause the timeouts to + // elapse instantly, which doesn't mesh well with establishing + // TCP connections.) + reqwest::Client::new() + } + } + } + + pub(crate) fn mgs_client( + self, + parent_log: &Logger, + ) -> gateway_client::Client { + let addr: SocketAddr = match self { + ScrimletReconcilersMode::SwitchZone(ip) => { + SocketAddrV6::new(ip.into(), MGS_PORT, 0, 0).into() + } + #[cfg(any(test, feature = "testing"))] + ScrimletReconcilersMode::Test { mgs_addr, .. } => mgs_addr, + }; + let baseurl = format!("http://{addr}"); + gateway_client::Client::new_with_client( + &baseurl, + self.reqwest_client(), + parent_log + .new(slog::o!("component" => "ThisSledSwitchSlotMgsClient")), + ) + } + + pub(crate) fn dpd_client(self, parent_log: &Logger) -> dpd_client::Client { + use omicron_common::OMICRON_DPD_TAG; + + let addr: SocketAddr = match self { + ScrimletReconcilersMode::SwitchZone(ip) => { + SocketAddrV6::new(ip.into(), DENDRITE_PORT, 0, 0).into() + } + #[cfg(any(test, feature = "testing"))] + ScrimletReconcilersMode::Test { dpd_addr, .. } => dpd_addr, + }; + let baseurl = format!("http://{addr}"); + dpd_client::Client::new_with_client( + &baseurl, + self.reqwest_client(), + dpd_client::ClientState { + tag: OMICRON_DPD_TAG.to_owned(), + log: parent_log + .new(slog::o!("component" => "DpdReconcilerClient")), + }, + ) + } + + pub(crate) fn mgd_client( + self, + parent_log: &Logger, + ) -> mg_admin_client::Client { + let addr: SocketAddr = match self { + ScrimletReconcilersMode::SwitchZone(ip) => { + SocketAddrV6::new(ip.into(), MGD_PORT, 0, 0).into() + } + #[cfg(any(test, feature = "testing"))] + ScrimletReconcilersMode::Test { mgd_addr, .. } => mgd_addr, + }; + let baseurl = format!("http://{addr}"); + mg_admin_client::Client::new_with_client( + &baseurl, + self.reqwest_client(), + parent_log.new(slog::o!("component" => "MgdReconcilerClient")), + ) + } +} + +/// Information required to enable all the scrimlet reconciler tasks provided +/// by this crate. +#[derive(Debug, Clone)] +pub struct SledAgentNetworkingInfo { + pub system_networking_config_rx: watch::Receiver, + pub mode: ScrimletReconcilersMode, +} + +/// Handle to tasks that reconcile network configuration with services within a +/// scrimlet's local switch zone. +/// +/// [`ScrimletReconcilers`] has a two- or three-phase initialization process +/// (depending on whether the sled is a non-scrimlet or a scrimlet) to support +/// being included in `sled-agent`'s set of "long running tasks". +/// [`ScrimletReconcilers::new()`] can be constructed at any time (in +/// particular: very soon after `sled-agent` starts). +/// +/// After a [`ScrimletReconcilers`] has been created, `sled-agent` is +/// responsible for calling +/// [`ScrimletReconcilers::set_sled_agent_networking_info_once()`] exactly one +/// time: this provides the reconcilers with a watch channel to receive the +/// current [`SystemNetworkingConfig`] and the IP address of this sled's switch +/// zone (should it have one). +/// +/// After `sled-agent` has provided those prerequisites, on all sleds, +/// [`ScrimletReconcilers`] spawns a tokio task that waits until both of these +/// have occurred: +/// +/// 1. [`ScrimletReconcilers::set_scrimlet_status()`] is called with +/// [`ScrimletStatus::Scrimlet`]. +/// 2. We successfully contact MGS within our switch zone to determine which +/// switch slot we are. +/// +/// On non-scrimlet sleds, step 1 never happens, so the reconciler tasks are +/// never spawned. +/// +/// Once both of these are satisfied on scrimlets, all reconciliation tasks are +/// spawned and begin running. They will reactivate periodically and in response +/// to any changes to the [`SystemNetworkingConfig`] from sled-agent, and will +/// go inert if [`ScrimletReconcilers::set_scrimlet_status()`] is called with +/// [`ScrimletStatus::NotScrimlet`] (remaining inert until we are told we are a +/// scrimlet again). +pub struct ScrimletReconcilers { + // Sending half of the channel used to communicate to all the reconcilers + // whether we're still a scrimlet. + scrimlet_status_tx: watch::Sender, + + // These once locks hold the second and third phases of initialization + // described in the doc comment above. + determining_switch_slot: + OnceLock>, + running_reconcilers: Arc>, + + parent_log: Logger, +} + +impl ScrimletReconcilers { + pub fn new(parent_log: &Logger) -> Self { + // We discard the receiver here, and create new subscribers if and when + // we spawn tasks that need to consume it. + let (scrimlet_status_tx, _scrimlet_status_rx) = + watch::channel(ScrimletStatus::NotScrimlet); + + Self { + scrimlet_status_tx, + determining_switch_slot: OnceLock::new(), + running_reconcilers: Arc::new(OnceLock::new()), + parent_log: parent_log.clone(), + } + } + + pub fn status(&self) -> ScrimletReconcilersStatus { + // Do we have running reconcilers? If so, report their status. + if let Some(running) = self.running_reconcilers.get() { + let RunningReconcilers { + dpd_reconciler, + mgd_reconciler, + uplinkd_reconciler, + } = running; + ScrimletReconcilersStatus::Running { + dpd_reconciler: dpd_reconciler.status(), + mgd_reconciler: mgd_reconciler.status(), + uplinkd_reconciler: uplinkd_reconciler.status(), + } + } + // Otherwise, have we started determining our switch slot? + else if let Some(status_rx) = self.determining_switch_slot.get() { + ScrimletReconcilersStatus::DeterminingSwitchSlot( + status_rx.borrow().clone(), + ) + } + // Otherwise, we're still waiting for the networking info. + else { + ScrimletReconcilersStatus::WaitingForSledAgentNetworkingInfo + } + } + + /// Set whether this sled is a scrimlet or not. + /// + /// This doesn't change _much_ at runtime, but it can: we may start out "not + /// a scrimlet" and then discover an attached switch later after boot, at + /// which point we become a scrimlet, or we may become "not a scrimlet" if + /// the switch is detached at runtime. + pub fn set_scrimlet_status(&self, status: ScrimletStatus) { + self.scrimlet_status_tx.send_if_modified(|prev| { + if *prev == status { + false + } else { + *prev = status; + true + } + }); + } + + /// Provide the networking information necessary to start the scrimlet + /// reconciler tasks. + /// + /// # Panics + /// + /// This method panics if called more than once; this is considered a + /// programmer error. + /// + /// One of the bits inside `info` is a watch channel: receiving a second + /// channel is a sign of control flow gone very wrong, as all the tasks will + /// already be operating based on the first channel received. + pub fn set_sled_agent_networking_info_once( + &self, + info: SledAgentNetworkingInfo, + ) { + let (determining_switch_slot_tx, determining_switch_slot_rx) = + watch::channel(DetermineSwitchSlotStatus::NotScrimlet); + + // Ensure we're only called once. + if self.determining_switch_slot.set(determining_switch_slot_rx).is_err() + { + panic!( + "set_sled_agent_networking_info_once() called more than \ + once - scrimlet reconcilers are already set up and \ + running based on the initial information provided!" + ); + } + + // We now know this is the one and only time we've been called; spawn a + // task that waits until we're a scrimlet, then waits until we can + // determine our switch slot, then spawns all the running reconcilers + // (populating `self.running_reconcilers`). + // + // We don't hang on to the join handle from this task; it exits either + // when it populates `self.running_reconcilers`, or when we're dropped + // (because it will exit when `self.scrimlet_status_tx` is closed). + tokio::spawn(determine_switch_slot( + Arc::clone(&self.running_reconcilers), + determining_switch_slot_tx, + self.scrimlet_status_tx.subscribe(), + info.mode.mgs_client(&self.parent_log), + info, + self.parent_log.clone(), + )); + } +} + +async fn determine_switch_slot( + running_reconcilers: Arc>, + determining_switch_slot_tx: watch::Sender, + mut scrimlet_status_rx: watch::Receiver, + mgs_client: gateway_client::Client, + networking_info: SledAgentNetworkingInfo, + parent_log: Logger, +) { + let log = parent_log + .new(slog::o!("component" => "ThisSledSwitchSlotDetermination")); + + // Block until either we successfully contact MGS at our switch zone + // underlay IP or the sending half of `scrimlet_status_rx` is closed (which + // means our parent `ScrimletReconcilers` was dropped - this only happens in + // tests). + let this_sled_switch_slot = + match ThisSledSwitchSlot::determine_retrying_forever( + determining_switch_slot_tx, + &mut scrimlet_status_rx, + &mgs_client, + &log, + ) + .await + { + Ok(slot) => slot, + Err(_recv_error) => { + return; + } + }; + info!( + log, "determined this sled's switch slot"; + "slot" => ?this_sled_switch_slot, + ); + + // We know `running_reconcilers` must be unset, because we're the only one + // that sets it, and we're only spawned from + // `set_sled_agent_networking_info_once()` (which itself guarantees it's + // only called once). + running_reconcilers + .set(RunningReconcilers::spawn_all( + scrimlet_status_rx, + networking_info, + this_sled_switch_slot, + &parent_log, + )) + .expect("running reconcilers is only set once"); +} + +#[derive(Debug)] +struct RunningReconcilers { + dpd_reconciler: ReconcilerTaskHandle, + mgd_reconciler: ReconcilerTaskHandle, + uplinkd_reconciler: ReconcilerTaskHandle, +} + +impl RunningReconcilers { + fn spawn_all( + scrimlet_status_rx: watch::Receiver, + networking_info: SledAgentNetworkingInfo, + this_sled_switch_slot: ThisSledSwitchSlot, + parent_log: &Logger, + ) -> Self { + let dpd_reconciler = ReconcilerTaskHandle::::spawn( + scrimlet_status_rx.clone(), + networking_info.system_networking_config_rx.clone(), + networking_info.mode, + this_sled_switch_slot, + parent_log, + ); + let mgd_reconciler = ReconcilerTaskHandle::::spawn( + scrimlet_status_rx.clone(), + networking_info.system_networking_config_rx.clone(), + networking_info.mode, + this_sled_switch_slot, + parent_log, + ); + let uplinkd_reconciler = + ReconcilerTaskHandle::::spawn( + scrimlet_status_rx, + networking_info.system_networking_config_rx, + networking_info.mode, + this_sled_switch_slot, + parent_log, + ); + Self { dpd_reconciler, mgd_reconciler, uplinkd_reconciler } + } +} + +#[cfg(test)] +mod tests; diff --git a/sled-agent/scrimlet-reconcilers/src/handle/tests.rs b/sled-agent/scrimlet-reconcilers/src/handle/tests.rs new file mode 100644 index 00000000000..f8ee5184130 --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/handle/tests.rs @@ -0,0 +1,424 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::*; +use assert_matches::assert_matches; +use dropshot::ConfigLogging; +use dropshot::ConfigLoggingLevel; +use dropshot::test_util::LogContext; +use gateway_messages::SpPort; +use gateway_test_utils::setup::GatewayTestContext; +use httpmock::Mock; +use httpmock::MockServer; +use omicron_test_utils::dev; +use sled_agent_types::early_networking::RackNetworkConfig; +use std::time::Duration; + +// For "happy path" tests, we spin up a real MGS instances (pointed at a +// simulated SP). +// +// For "sad path" tests, we use a mock MGS so we can inject failures. +trait MgsFlavor { + fn address(&self) -> SocketAddr; + async fn teardown(self); +} + +impl MgsFlavor for GatewayTestContext { + fn address(&self) -> SocketAddr { + SocketAddr::V6(GatewayTestContext::address(self)) + } + + async fn teardown(self) { + GatewayTestContext::teardown(self).await + } +} + +impl MgsFlavor for MockServer { + fn address(&self) -> SocketAddr { + *MockServer::address(self) + } + + async fn teardown(self) {} +} + +struct Harness { + handle: ScrimletReconcilers, + networking_config_tx: watch::Sender, + mgs: T, + logctx: LogContext, +} + +impl Harness { + fn new_common(logctx: LogContext, mgs: T) -> Self { + let (networking_config_tx, _) = + watch::channel(SystemNetworkingConfig { + rack_network_config: RackNetworkConfig { + rack_subnet: "fd00:1122:3344:0100::/56".parse().unwrap(), + infra_ip_first: "192.0.2.10".parse().unwrap(), + infra_ip_last: "192.0.2.100".parse().unwrap(), + ports: Vec::new(), + bgp: Vec::new(), + bfd: Vec::new(), + }, + blueprint_external_networking_config: None, + }); + + let handle = ScrimletReconcilers::new(&logctx.log); + + Self { handle, networking_config_tx, mgs, logctx } + } + + async fn teardown(self) { + self.logctx.cleanup_successful(); + self.mgs.teardown().await; + } + + fn sled_agent_networking_info(&self) -> SledAgentNetworkingInfo { + let dummy_addr = "0.0.0.0:0".parse().unwrap(); + SledAgentNetworkingInfo { + system_networking_config_rx: self.networking_config_tx.subscribe(), + mode: ScrimletReconcilersMode::Test { + mgs_addr: self.mgs.address(), + dpd_addr: dummy_addr, + mgd_addr: dummy_addr, + }, + } + } + + async fn wait_for_task_status(&self, matches: F) + where + F: Fn(&ScrimletReconcilersStatus) -> bool, + { + let mut status = self.handle.status(); + let start = tokio::time::Instant::now(); + while start.elapsed() < Duration::from_secs(30) { + if matches(&status) { + return; + } + tokio::time::sleep(Duration::from_millis(10)).await; + status = self.handle.status(); + } + panic!("timeout waiting for task status (got {status:?})"); + } +} + +impl Harness { + async fn new_real_mgs(logctx: LogContext) -> Self { + let ctx = gateway_test_utils::setup::test_setup( + logctx.test_name(), + SpPort::One, + ) + .await; + Self::new_common(logctx, ctx) + } +} + +impl Harness { + fn new_mock_mgs(logctx: LogContext) -> Self { + Self::new_common(logctx, MockServer::start()) + } + + fn mock_mgs_set_switch_slot(&self, slot: u16) -> Mock<'_> { + let body = + serde_json::json!({ "type": "switch", "slot": slot }).to_string(); + self.mgs.mock(|when, then| { + when.method(httpmock::Method::GET).path("/local/switch-id"); + then.status(200) + .header("content-type", "application/json") + .body(body); + }) + } + + fn mock_mgs_set_503(&self) -> Mock<'_> { + self.mgs.mock(|when, then| { + when.method(httpmock::Method::GET).path("/local/switch-id"); + then.status(503).header("content-type", "application/json").body( + serde_json::json!({ + "request_id": "test", + "message": "service unavailable", + }) + .to_string(), + ); + }) + } + + async fn wait_for_mock_to_be_called(&self, mock: &Mock<'_>) { + // Tricky bit: we use tokio's paused time in tests below both for + // consistency and test speed, but we need to wait for _real_ time for + // httpmock to receive requests. Use `std::time::Instant` here so we + // wait for 10 real seconds for a request to be received. + let start = std::time::Instant::now(); + while start.elapsed() < Duration::from_secs(10) { + if mock.calls_async().await == 0 { + // advance paused time... + tokio::time::sleep(Duration::from_secs(1)).await; + // and also _really_ sleep briefly to avoid spinning 100% CPU + std::thread::sleep(Duration::from_millis(10)); + continue; + } + + // We got at least 1 call; assert and return. + return mock.assert_async().await; + } + + panic!("timed out wait for mock to be called"); + } +} + +#[tokio::test] +#[should_panic( + expected = "set_sled_agent_networking_info_once() called more than once" +)] +async fn calling_set_sled_agent_networking_info_once_multiple_times_panics() { + // Set up a stderr logger - we're going to panic and won't have the + // opportunity to clean up a file-based one. + let log_config = + ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Trace }; + let logctx = LogContext::new( + "calling_set_sled_agent_networking_info_once_multiple_times_panics", + &log_config, + ); + let harness = Harness::new_mock_mgs(logctx); + + harness.handle.set_sled_agent_networking_info_once( + harness.sled_agent_networking_info(), + ); + harness.handle.set_sled_agent_networking_info_once( + harness.sled_agent_networking_info(), + ); +} + +// Happy-path test for non-scrimlets. +#[tokio::test] +async fn non_scrimlet_two_phase_initialization() { + let logctx = dev::test_setup_log("non_scrimlet_two_phase_initialization"); + let harness = Harness::new_mock_mgs(logctx); + + // initial status + assert_matches!( + harness.handle.status(), + ScrimletReconcilersStatus::WaitingForSledAgentNetworkingInfo + ); + + harness.handle.set_sled_agent_networking_info_once( + harness.sled_agent_networking_info(), + ); + + // terminal status for non-scrimlets + assert_matches!( + harness.handle.status(), + ScrimletReconcilersStatus::DeterminingSwitchSlot( + DetermineSwitchSlotStatus::NotScrimlet + ) + ); + + harness.teardown().await; +} + +// Happy-path test for scrimlets where we find out we're a scrimlet after +// getting our networking info. +#[tokio::test] +async fn scrimlet_three_phase_initialization_info_then_scrimlet() { + let logctx = dev::test_setup_log( + "scrimlet_three_phase_initialization_info_then_scrimlet", + ); + let harness = Harness::new_real_mgs(logctx).await; + + // initial status + assert_matches!( + harness.handle.status(), + ScrimletReconcilersStatus::WaitingForSledAgentNetworkingInfo + ); + + harness.handle.set_sled_agent_networking_info_once( + harness.sled_agent_networking_info(), + ); + + // status before we become a scrimlet + assert_matches!( + harness.handle.status(), + ScrimletReconcilersStatus::DeterminingSwitchSlot( + DetermineSwitchSlotStatus::NotScrimlet + ) + ); + + harness.handle.set_scrimlet_status(ScrimletStatus::Scrimlet); + + harness + .wait_for_task_status(|status| { + matches!(status, ScrimletReconcilersStatus::Running { .. }) + }) + .await; + + harness.teardown().await; +} + +// Happy-path test for scrimlets where we find out we're a scrimlet before +// getting our networking info. +#[tokio::test] +async fn scrimlet_three_phase_initialization_scrimlet_then_info() { + let logctx = dev::test_setup_log( + "scrimlet_three_phase_initialization_scrimlet_then_info", + ); + let harness = Harness::new_real_mgs(logctx).await; + + // Become a scrimlet right away. + harness.handle.set_scrimlet_status(ScrimletStatus::Scrimlet); + + // initial status + assert_matches!( + harness.handle.status(), + ScrimletReconcilersStatus::WaitingForSledAgentNetworkingInfo + ); + + harness.handle.set_sled_agent_networking_info_once( + harness.sled_agent_networking_info(), + ); + + // We're already a scrimlet, so we contact MGS then transition to Running. + harness + .wait_for_task_status(|status| { + matches!(status, ScrimletReconcilersStatus::Running { .. }) + }) + .await; + + harness.teardown().await; +} + +// Scrimlet test case where MGS fails the first time we ask then succeeds later. +#[tokio::test(start_paused = true)] +async fn scrimlet_mgs_fails_first_attempt() { + let logctx = dev::test_setup_log("scrimlet_mgs_fails_first_attempt"); + let harness = Harness::new_mock_mgs(logctx); + + // Configure our mock MGS to return an HTTP 503. + let mut error_mock = harness.mock_mgs_set_503(); + + // initial status + assert_matches!( + harness.handle.status(), + ScrimletReconcilersStatus::WaitingForSledAgentNetworkingInfo + ); + + harness.handle.set_sled_agent_networking_info_once( + harness.sled_agent_networking_info(), + ); + + // status before we become a scrimlet + assert_matches!( + harness.handle.status(), + ScrimletReconcilersStatus::DeterminingSwitchSlot( + DetermineSwitchSlotStatus::NotScrimlet + ) + ); + + harness.handle.set_scrimlet_status(ScrimletStatus::Scrimlet); + + // Wait for our mock to receive a request and return a 503; we should then + // see the `WaitingToRetry` state with an error. + harness.wait_for_mock_to_be_called(&error_mock).await; + // We should first see the "waiting to retry" status... + harness + .wait_for_task_status(|status| { + matches!( + status, + ScrimletReconcilersStatus::DeterminingSwitchSlot( + DetermineSwitchSlotStatus::WaitingToRetry { + prev_attempt_err, + } + ) if prev_attempt_err.contains("status: 503") + ) + }) + .await; + + // Changing MGS to return success should transition us to `Running`. + error_mock.delete(); + let success_mock = harness.mock_mgs_set_switch_slot(0); + + harness.wait_for_mock_to_be_called(&success_mock).await; + harness + .wait_for_task_status(|status| { + matches!(status, ScrimletReconcilersStatus::Running { .. }) + }) + .await; + + harness.teardown().await; +} + +// Scrimlet test case where MGS fails, then we become "not a scrimlet", then we +// become a scrimlet again and MGS succeeds. +#[tokio::test(start_paused = true)] +async fn scrimlet_mgs_fails_then_we_become_not_a_scrimlet() { + let logctx = + dev::test_setup_log("scrimlet_mgs_fails_then_we_become_not_a_scrimlet"); + let harness = Harness::new_mock_mgs(logctx); + + // Configure our mock MGS to return an HTTP 503. + let mut error_mock = harness.mock_mgs_set_503(); + + // initial status + assert_matches!( + harness.handle.status(), + ScrimletReconcilersStatus::WaitingForSledAgentNetworkingInfo + ); + + harness.handle.set_sled_agent_networking_info_once( + harness.sled_agent_networking_info(), + ); + + // status before we become a scrimlet + assert_matches!( + harness.handle.status(), + ScrimletReconcilersStatus::DeterminingSwitchSlot( + DetermineSwitchSlotStatus::NotScrimlet + ) + ); + + harness.handle.set_scrimlet_status(ScrimletStatus::Scrimlet); + + // Wait until we've failed to determine our switch slot. + harness.wait_for_mock_to_be_called(&error_mock).await; + harness + .wait_for_task_status(|status| { + matches!( + status, + ScrimletReconcilersStatus::DeterminingSwitchSlot( + DetermineSwitchSlotStatus::WaitingToRetry { + prev_attempt_err, + } + ) if prev_attempt_err.contains("status: 503") + ) + }) + .await; + + // Now sled-agent tells us we're not a scrimlet; we should transition back + // to `NotScrimlet`. + harness.handle.set_scrimlet_status(ScrimletStatus::NotScrimlet); + harness + .wait_for_task_status(|status| { + matches!( + status, + ScrimletReconcilersStatus::DeterminingSwitchSlot( + DetermineSwitchSlotStatus::NotScrimlet + ) + ) + }) + .await; + + // Reconfigure MGS to succeed. + error_mock.delete(); + let success_mock = harness.mock_mgs_set_switch_slot(0); + + // Become a scrimlet again - we should transition through `ContactingMgs` + // (with no previous error) then to `Running`. + harness.handle.set_scrimlet_status(ScrimletStatus::Scrimlet); + harness.wait_for_mock_to_be_called(&success_mock).await; + harness + .wait_for_task_status(|status| { + matches!(status, ScrimletReconcilersStatus::Running { .. }) + }) + .await; + + harness.teardown().await; +} diff --git a/sled-agent/scrimlet-reconcilers/src/lib.rs b/sled-agent/scrimlet-reconcilers/src/lib.rs new file mode 100644 index 00000000000..a8acb314f26 --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/lib.rs @@ -0,0 +1,72 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! This crate implements long-running reconciler tasks responsible for +//! configuration of services within a scrimlet's switch zone. +//! +//! These tasks _only_ talk to services on the same sled as the sled-agent +//! executing these tasks; we attempt to ensure this at runtime via types like +//! [`ThisSledSwitchZoneUnderlayIpAddr`]. A scrimlet running these tasks should +//! never attempt to talk to another scrimlet's switch zone, and a non-scrimlet +//! running these tasks should never attempt to talk to anything. (Non-scrimlet +//! sleds still create a [`ScrimletReconcilers`] handle, as sled-agent can't +//! easily tell the difference between "not a scrimlet because we're in a +//! different cubby" and "not a scrimlet because we should have a switch but it +//! isn't connected / isn't powered on / etc.", but the reconciliation tasks are +//! only spawned under conditions that only scrimlets can satisfy; see +//! [`ScrimletReconcilers`] for more detail.) +//! +//! These tasks are responsible for applying system-level networking, including: +//! +//! * Configuration of uplink ports within `dpd` +//! * Configuration of NAT entries for system-level services (boundary NTP, +//! Nexus, external DNS - notably _not_ instances) within `dpd` +//! * Configuration of BGP within `mgd` +//! * Configuration of BFD within `mgd` +//! * Configuration of static routes within `mgd` +//! * Configuration of SMF properties for `uplinkd` and `lldpd` +//! +//! The specific configuration that should be applied comes from Nexus (or RSS, +//! at rack setup time) and is sent to `sled-agent` via the bootstore. +//! +//! In the past, responsibility for this configuration was split: sled-agent was +//! responsible for applying an initial config on sled boot (required for cold +//! boot of the rack), and Nexus was responsible for continuously keeping the +//! config in sync afterwards. This had a variety of problems; see +//! . The split is now +//! that Nexus is responsible for maintaining what the configuration should be, +//! and each scrimlet is responsible for applying that configuration to its own +//! switch zone's services; the latter is implemented via this crate. +//! +//! [`ThisSledSwitchZoneUnderlayIpAddr`]: +//! sled_agent_types::sled::ThisSledSwitchZoneUnderlayIpAddr + +mod dpd_reconciler; +mod handle; +mod mgd_reconciler; +mod reconciler_task; +mod status; +mod switch_zone_slot; +mod uplinkd_reconciler; + +pub use dpd_reconciler::DpdNatReconcilerStatusNatEntry; +pub use dpd_reconciler::DpdNatReconcilerStatusNatEntryFailure; +pub use dpd_reconciler::DpdPortOperationFailure; +pub use dpd_reconciler::DpdPortReconcilerStatus; +pub use dpd_reconciler::DpdReconcilerStatus; +pub use handle::ScrimletReconcilers; +pub use handle::ScrimletReconcilersMode; +pub use handle::SledAgentNetworkingInfo; +pub use mgd_reconciler::MgdReconcilerStatus; +pub use mgd_reconciler::MgdStaticRouteReconcilerStatus; +pub use status::DetermineSwitchSlotStatus; +pub use status::ReconcilerActivationReason; +pub use status::ReconcilerCurrentStatus; +pub use status::ReconcilerInertReason; +pub use status::ReconcilerRunningStatus; +pub use status::ReconcilerStatus; +pub use status::ReconciliationCompletedStatus; +pub use status::ScrimletReconcilersStatus; +pub use status::ScrimletStatus; +pub use uplinkd_reconciler::UplinkdReconcilerStatus; diff --git a/sled-agent/scrimlet-reconcilers/src/mgd_reconciler.rs b/sled-agent/scrimlet-reconcilers/src/mgd_reconciler.rs new file mode 100644 index 00000000000..e25a68de725 --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/mgd_reconciler.rs @@ -0,0 +1,72 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Reconciler responsible for configuration of `mgd` within a scrimlet's switch +//! zone. + +use crate::ScrimletReconcilersMode; +use crate::reconciler_task::Reconciler; +use crate::switch_zone_slot::ThisSledSwitchSlot; +use mg_admin_client::Client; +use sled_agent_types::system_networking::SystemNetworkingConfig; +use slog::Logger; +use std::time::Duration; + +mod static_route_reconciler; + +pub use static_route_reconciler::MgdStaticRouteReconcilerStatus; + +#[derive(Debug, Clone)] +pub struct MgdReconcilerStatus { + pub static_routes_status: MgdStaticRouteReconcilerStatus, +} + +impl slog::KV for MgdReconcilerStatus { + fn serialize( + &self, + record: &slog::Record<'_>, + serializer: &mut dyn slog::Serializer, + ) -> slog::Result { + let Self { static_routes_status } = self; + static_routes_status.serialize(record, serializer)?; + Ok(()) + } +} + +#[derive(Debug)] +pub(crate) struct MgdReconciler { + client: Client, + switch_slot: ThisSledSwitchSlot, +} + +impl Reconciler for MgdReconciler { + type Status = MgdReconcilerStatus; + + const LOGGER_COMPONENT_NAME: &'static str = "MgdReconciler"; + const RE_RECONCILE_INTERVAL: Duration = Duration::from_secs(30); + + fn new( + mode: ScrimletReconcilersMode, + switch_slot: ThisSledSwitchSlot, + parent_log: &Logger, + ) -> Self { + Self { client: mode.mgd_client(parent_log), switch_slot } + } + + async fn do_reconciliation( + &mut self, + system_networking_config: &SystemNetworkingConfig, + log: &Logger, + ) -> Self::Status { + let static_routes_status = static_route_reconciler::reconcile( + &self.client, + &system_networking_config.rack_network_config, + self.switch_slot, + log, + ) + .await; + + MgdReconcilerStatus { static_routes_status } + } +} diff --git a/sled-agent/scrimlet-reconcilers/src/mgd_reconciler/static_route_reconciler.rs b/sled-agent/scrimlet-reconcilers/src/mgd_reconciler/static_route_reconciler.rs new file mode 100644 index 00000000000..9ca8d2484a5 --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/mgd_reconciler/static_route_reconciler.rs @@ -0,0 +1,585 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Reconciliation of static routes within mgd. + +use crate::switch_zone_slot::ThisSledSwitchSlot; +use daft::BTreeSetDiff; +use daft::Diffable; +use either::Either; +use mg_admin_client::Client; +use mg_admin_client::types::AddStaticRoute4Request as MgdAddStaticRoute4Request; +use mg_admin_client::types::AddStaticRoute6Request as MgdAddStaticRoute6Request; +use mg_admin_client::types::DeleteStaticRoute4Request as MgdDeleteStaticRoute4Request; +use mg_admin_client::types::DeleteStaticRoute6Request as MgdDeleteStaticRoute6Request; +use mg_admin_client::types::Path as MgdPath; +use mg_admin_client::types::StaticRoute4 as MgdStaticRoute4; +use mg_admin_client::types::StaticRoute4List as MgdStaticRoute4List; +use mg_admin_client::types::StaticRoute6 as MgdStaticRoute6; +use mg_admin_client::types::StaticRoute6List as MgdStaticRoute6List; +use oxnet::IpNet; +use oxnet::IpNetParseError; +use oxnet::Ipv4Net; +use oxnet::Ipv6Net; +use rdb_types::Prefix4 as MgdPrefix4; +use rdb_types::Prefix6 as MgdPrefix6; +use sled_agent_types::early_networking::RackNetworkConfig; +use sled_agent_types::early_networking::RouteConfig; +use slog::Logger; +use slog::info; +use slog::warn; +use slog_error_chain::InlineErrorChain; +use std::collections::BTreeSet; +use std::collections::HashMap; +use std::iter; +use std::net::IpAddr; +use std::net::Ipv4Addr; +use std::net::Ipv6Addr; + +// This is the default RIB Priority used for static routes. This mirrors +// the const defined in maghemite in rdb/src/lib.rs. +// +// TODO-cleanup Once https://github.com/oxidecomputer/maghemite/pull/757 lands +// and is available in omicron, we should remove this in favor of using the +// exported constant from maghemite. +const DEFAULT_RIB_PRIORITY_STATIC: u8 = 1; + +type MgdClientError = mg_admin_client::Error; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum MgdStaticRouteReconcilerStatus { + /// Reconciliation was skipped because we couldn't fetch the current set of + /// static routes from MGD. + FailedReadingStaticRoutes(String), + + /// Reconciliation was skipped because we couldn't determine a plan for + /// changes to make. + /// + /// This should never happen - it indicates there's some faulty data + /// somewhere (either coming from mgd or in the rack network config). + FailedGeneratingPlan(String), + + /// Reconciliation completed successfully. + /// + /// mgd operations are performed in bulk, so each item here contains the + /// count of items involved. + Success { + unchanged: usize, + deleted_v4: usize, + deleted_v6: usize, + added_v4: usize, + added_v6: usize, + }, + + /// Reconciliation completed with at least one failure. + /// + /// mgd operations are performed in bulk, so each item here contains the + /// count of items applied on success. + PartialSuccess { + unchanged: usize, + delete_v4_result: Result, + delete_v6_result: Result, + add_v4_result: Result, + add_v6_result: Result, + }, +} + +impl slog::KV for MgdStaticRouteReconcilerStatus { + fn serialize( + &self, + _record: &slog::Record<'_>, + serializer: &mut dyn slog::Serializer, + ) -> slog::Result { + let skipped_key = "static-routes-reconciler-skipped"; + match self { + Self::FailedReadingStaticRoutes(reason) => { + serializer.emit_str(skipped_key.into(), reason) + } + Self::FailedGeneratingPlan(reason) => { + serializer.emit_str(skipped_key.into(), reason) + } + Self::Success { + unchanged, + deleted_v4, + deleted_v6, + added_v4, + added_v6, + } => { + serializer + .emit_usize("static-routes-unchanged".into(), *unchanged)?; + for (key, items) in [ + ("static-routes-delete-v4", deleted_v4), + ("static-routes-delete-v6", deleted_v6), + ("static-routes-add-v4", added_v4), + ("static-routes-add-v6", added_v6), + ] { + serializer.emit_arguments( + key.into(), + &format_args!("success ({items} routes affected)"), + )?; + } + Ok(()) + } + Self::PartialSuccess { + unchanged, + delete_v4_result, + delete_v6_result, + add_v4_result, + add_v6_result, + } => { + serializer + .emit_usize("static-routes-unchanged".into(), *unchanged)?; + for (key, result) in [ + ("static-routes-delete-v4", delete_v4_result), + ("static-routes-delete-v6", delete_v6_result), + ("static-routes-add-v4", add_v4_result), + ("static-routes-add-v6", add_v6_result), + ] { + match result { + Ok(items) => serializer.emit_arguments( + key.into(), + &format_args!("success ({items} routes affected)"), + )?, + Err(err) => { + serializer.emit_str(key.into(), &err)?; + } + } + } + Ok(()) + } + } + } +} + +pub(super) async fn reconcile( + client: &Client, + desired_config: &RackNetworkConfig, + our_switch_slot: ThisSledSwitchSlot, + log: &Logger, +) -> MgdStaticRouteReconcilerStatus { + let current_routes = match MgdCurrentRoutes::fetch(client).await { + Ok(routes) => routes, + Err(err) => { + return MgdStaticRouteReconcilerStatus::FailedReadingStaticRoutes( + format!( + "failed to read current static routes from mgd: {}", + InlineErrorChain::new(&err) + ), + ); + } + }; + + let plan = match ReconciliationPlan::new( + current_routes, + desired_config, + our_switch_slot, + log, + ) { + Ok(plan) => plan, + Err(err) => { + // Ensure `err` is actually a string; if it changes to a proper + // error type, we need to use `InlineErrorChain` here instead. + let err: &str = &err; + return MgdStaticRouteReconcilerStatus::FailedGeneratingPlan( + format!( + "failed to generate plan to apply static routes: {err}", + ), + ); + } + }; + + apply_plan(client, plan, log).await +} + +/// Apply the contents of `plan` to mgd via `client`. +async fn apply_plan( + client: &Client, + plan: ReconciliationPlan, + log: &Logger, +) -> MgdStaticRouteReconcilerStatus { + let ReconciliationPlan { unchanged_count, to_delete, to_add } = plan; + + // Delete before adding in case there are any conflicting routes for new + // routes we want to add. + // + // We will always attempt to add whatever routes we need to add even if one + // or both of these deletes fail. That may fail too, if there are + // conflicting routes we failed to delete! But it may succeed (or partially + // succeed) if there are also routes we need to add that don't conflict. + let (delete_v4_result, delete_v6_result) = { + // Assemble all the v4 and v6 route deletes into two requests. + let mut delete_v4_req = MgdDeleteStaticRoute4Request { + routes: MgdStaticRoute4List { list: Vec::new() }, + }; + let mut delete_v6_req = MgdDeleteStaticRoute6Request { + routes: MgdStaticRoute6List { list: Vec::new() }, + }; + for route in to_delete { + match route.into_mgd_static_route() { + MgdStaticRoute::V4(r) => delete_v4_req.routes.list.push(r), + MgdStaticRoute::V6(r) => delete_v6_req.routes.list.push(r), + } + } + ( + apply_bulk_operation_if_needed( + "deleting static v4 routes", + delete_v4_req.routes.list.len(), + || client.static_remove_v4_route(&delete_v4_req), + log, + ) + .await, + apply_bulk_operation_if_needed( + "deleting static v6 routes", + delete_v6_req.routes.list.len(), + || client.static_remove_v6_route(&delete_v6_req), + log, + ) + .await, + ) + }; + + let (add_v4_result, add_v6_result) = { + // Do the same for all route additions. + let mut add_v4_req = MgdAddStaticRoute4Request { + routes: MgdStaticRoute4List { list: Vec::new() }, + }; + let mut add_v6_req = MgdAddStaticRoute6Request { + routes: MgdStaticRoute6List { list: Vec::new() }, + }; + for route in to_add { + match route.into_mgd_static_route() { + MgdStaticRoute::V4(r) => add_v4_req.routes.list.push(r), + MgdStaticRoute::V6(r) => add_v6_req.routes.list.push(r), + } + } + ( + apply_bulk_operation_if_needed( + "adding static v4 routes", + add_v4_req.routes.list.len(), + || client.static_add_v4_route(&add_v4_req), + log, + ) + .await, + apply_bulk_operation_if_needed( + "adding static v6 routes", + add_v6_req.routes.list.len(), + || client.static_add_v6_route(&add_v6_req), + log, + ) + .await, + ) + }; + + match (&add_v4_result, &add_v6_result, &delete_v4_result, &delete_v6_result) + { + (Ok(added_v4), Ok(added_v6), Ok(deleted_v4), Ok(deleted_v6)) => { + MgdStaticRouteReconcilerStatus::Success { + unchanged: unchanged_count, + deleted_v4: *deleted_v4, + deleted_v6: *deleted_v6, + added_v4: *added_v4, + added_v6: *added_v6, + } + } + _ => MgdStaticRouteReconcilerStatus::PartialSuccess { + unchanged: unchanged_count, + delete_v4_result, + delete_v6_result, + add_v4_result, + add_v6_result, + }, + } +} + +// Helper to optionally perform `op`. +// +// If `nitems` is 0, `op` is never called, and we return +// `MgdStaticRouteBulkOperationResult::SkippedNoItems`. +// +// If `nitems` is not 0, we await the future returned by `op` and return either +// success or failure according to its returned value. +async fn apply_bulk_operation_if_needed( + description: &str, + nitems: usize, + op: F, + log: &Logger, +) -> Result +where + F: FnOnce() -> Fut, + Fut: Future>, +{ + if nitems == 0 { + return Ok(0); + } + + match op().await { + Ok(_) => { + info!(log, "{description} succeeded"; "num-routes" => %nitems); + Ok(nitems) + } + Err(err) => { + let err = InlineErrorChain::new(&err); + warn!(log, "{description} failed"; &err); + Err(format!("{description} failed: {err}")) + } + } +} + +#[derive(Debug, PartialEq)] +struct ReconciliationPlan { + // Count of routes that remained unchanged in this reconciliation. + unchanged_count: usize, + + // Routes to delete. + to_delete: BTreeSet, + + // Routes to add. + to_add: BTreeSet, +} + +impl ReconciliationPlan { + fn new( + mgd_current_routes: MgdCurrentRoutes, + config: &RackNetworkConfig, + our_switch_slot: ThisSledSwitchSlot, + log: &Logger, + ) -> Result { + // Convert current routes into diffable form. + let mgd_current_routes = match mgd_current_routes.try_into_diffable() { + Ok(routes) => routes, + Err(err) => { + return Err(format!( + "invalid route fetched from mgd: {}", + InlineErrorChain::new(&err) + )); + } + }; + + // Convert desired config into diffable form. + let desired_routes = config + .ports + .iter() + .filter(|port| port.switch == our_switch_slot) + .flat_map(|port| port.routes.iter()) + .map(DiffableStaticRoute::try_from) + .collect::, _>>()?; + + let BTreeSetDiff { common, added, removed } = + mgd_current_routes.diff(&desired_routes); + + let unchanged_count = common.len(); + let to_delete = removed.into_iter().copied().collect::>(); + let to_add = added.into_iter().copied().collect::>(); + + info!( + log, + "generated mgd static route reconciliation plan"; + "routes-unchanged" => unchanged_count, + "routes-to-delete" => to_delete.len(), + "routes-to-add" => to_add.len(), + ); + + Ok(Self { unchanged_count, to_delete, to_add }) + } +} + +#[derive(Debug, Default)] +struct MgdCurrentRoutes { + v4: HashMap>, + v6: HashMap>, +} + +impl MgdCurrentRoutes { + async fn fetch(client: &Client) -> Result { + let v4 = client.static_list_v4_routes().await?.into_inner(); + let v6 = client.static_list_v6_routes().await?.into_inner(); + + Ok(Self { v4, v6 }) + } + + fn try_into_diffable( + self, + ) -> Result, BadMgdRoute> { + let v4_routes = self.v4.into_iter().flat_map(|(prefix, paths)| { + DiffableStaticRoute::try_from_v4(prefix, paths) + }); + let v6_routes = self.v6.into_iter().flat_map(|(prefix, paths)| { + DiffableStaticRoute::try_from_v6(prefix, paths) + }); + + v4_routes.chain(v6_routes).collect() + } +} + +#[derive( + Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, daft::Diffable, +)] +#[cfg_attr(test, derive(test_strategy::Arbitrary))] +struct DiffableStaticRoute { + description: DiffableStaticRouteDescription, + vlan_id: Option, + priority: u8, +} + +impl TryFrom<&'_ RouteConfig> for DiffableStaticRoute { + type Error = String; + + fn try_from(route: &'_ RouteConfig) -> Result { + let description = match (route.nexthop, route.destination) { + (IpAddr::V4(nexthop), IpNet::V4(prefix)) => { + DiffableStaticRouteDescription::V4 { nexthop, prefix } + } + (IpAddr::V6(nexthop), IpNet::V6(prefix)) => { + DiffableStaticRouteDescription::V6 { nexthop, prefix } + } + (nexthop, prefix) => { + return Err(format!( + "rack network config route has mixed IP families \ + for nexthop and prefix: {nexthop}, {prefix}" + )); + } + }; + + Ok(Self { + description, + vlan_id: route.vlan_id, + // TODO The rack network config uses `None` as a sentinel for "the + // default priority". This isn't what we want long-term; see + // https://github.com/oxidecomputer/maghemite/issues/646#issuecomment-3948331208. + priority: route.rib_priority.unwrap_or(DEFAULT_RIB_PRIORITY_STATIC), + }) + } +} + +#[derive(Debug, thiserror::Error)] +enum BadMgdRoute { + #[error("could not parse {family} route prefix `{prefix}`")] + ParsePrefix { + family: &'static str, + prefix: String, + #[source] + err: IpNetParseError, + }, + #[error( + "expected {family} nexthop in prefix `{prefix}`, but got {nexthop}" + )] + BadNexthopFamily { family: &'static str, prefix: String, nexthop: String }, +} + +impl DiffableStaticRoute { + fn into_mgd_static_route(self) -> MgdStaticRoute { + match self.description { + DiffableStaticRouteDescription::V4 { nexthop, prefix } => { + MgdStaticRoute::V4(MgdStaticRoute4 { + nexthop, + prefix: MgdPrefix4 { + value: prefix.addr(), + length: prefix.width(), + }, + rib_priority: self.priority, + vlan_id: self.vlan_id, + }) + } + DiffableStaticRouteDescription::V6 { nexthop, prefix } => { + MgdStaticRoute::V6(MgdStaticRoute6 { + nexthop, + prefix: MgdPrefix6 { + value: prefix.addr(), + length: prefix.width(), + }, + rib_priority: self.priority, + vlan_id: self.vlan_id, + }) + } + } + } + + fn try_from_v4( + prefix: String, + paths: Vec, + ) -> impl Iterator> { + let prefix = match prefix.parse::() { + Ok(prefix) => prefix, + Err(err) => { + return Either::Left(iter::once(Err( + BadMgdRoute::ParsePrefix { family: "ipv4", prefix, err }, + ))); + } + }; + + Either::Right(paths.into_iter().map(move |path| { + let nexthop = match path.nexthop { + IpAddr::V4(ip) => ip, + IpAddr::V6(ip) => { + return Err(BadMgdRoute::BadNexthopFamily { + family: "ipv4", + prefix: prefix.to_string(), + nexthop: ip.to_string(), + }); + } + }; + + let description = + DiffableStaticRouteDescription::V4 { nexthop, prefix }; + + Ok(Self { + description, + vlan_id: path.vlan_id, + priority: path.rib_priority, + }) + })) + } + + fn try_from_v6( + prefix: String, + paths: Vec, + ) -> impl Iterator> { + let prefix = match prefix.parse::() { + Ok(prefix) => prefix, + Err(err) => { + return Either::Left(iter::once(Err( + BadMgdRoute::ParsePrefix { family: "ipv6", prefix, err }, + ))); + } + }; + + Either::Right(paths.into_iter().map(move |path| { + let nexthop = match path.nexthop { + IpAddr::V6(ip) => ip, + IpAddr::V4(ip) => { + return Err(BadMgdRoute::BadNexthopFamily { + family: "ipv6", + prefix: prefix.to_string(), + nexthop: ip.to_string(), + }); + } + }; + + let description = + DiffableStaticRouteDescription::V6 { nexthop, prefix }; + + Ok(Self { + description, + vlan_id: path.vlan_id, + priority: path.rib_priority, + }) + })) + } +} + +enum MgdStaticRoute { + V4(MgdStaticRoute4), + V6(MgdStaticRoute6), +} + +#[derive( + Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, daft::Diffable, +)] +enum DiffableStaticRouteDescription { + V4 { nexthop: Ipv4Addr, prefix: Ipv4Net }, + V6 { nexthop: Ipv6Addr, prefix: Ipv6Net }, +} + +#[cfg(test)] +mod tests; diff --git a/sled-agent/scrimlet-reconcilers/src/mgd_reconciler/static_route_reconciler/tests.rs b/sled-agent/scrimlet-reconcilers/src/mgd_reconciler/static_route_reconciler/tests.rs new file mode 100644 index 00000000000..44a9ca7f095 --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/mgd_reconciler/static_route_reconciler/tests.rs @@ -0,0 +1,1183 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::*; +use crate::switch_zone_slot::ThisSledSwitchSlot; +use gateway_messages::SpPort; +use mg_admin_client::types::Path as MgdPath; +use omicron_test_utils::dev; +use proptest::prelude::Arbitrary; +use proptest::prelude::Just; +use proptest::prelude::any; +use proptest::prelude::proptest as proptest_macro; +use proptest::strategy::Strategy; +use sled_agent_types::early_networking::LinkSpeed; +use sled_agent_types::early_networking::PortConfig; +use sled_agent_types::early_networking::RackNetworkConfig; +use sled_agent_types::early_networking::RouteConfig; +use sled_agent_types::early_networking::SwitchSlot; +use sled_agent_types::early_networking::UplinkIpNet; +use std::collections::BTreeMap; +use test_strategy::proptest; +use tokio::task::block_in_place; + +impl Arbitrary for DiffableStaticRouteDescription { + type Parameters = (); + type Strategy = proptest::prelude::BoxedStrategy; + + fn arbitrary_with(_args: Self::Parameters) -> Self::Strategy { + // mgd enforces the same requirements as `UplinkIpNet` for prefixes, so + // we'll lean on its `Arbitrary` implementation to avoid invalid IPs + // like loopback and ipv4-mapped ipv6 addrs + // + // We have to do some work here to generate a matching v4-v4 or v6-v6 + // nexthop address; that address currently has no requirements (although + // it may eventually: + // ), so we + // generate any arbitrary nexthop that matches the protocol family. + any::() + .prop_flat_map(|prefix| { + let prefix = IpNet::from(prefix); + + let (prefix, nexthop_strategy) = match prefix { + IpNet::V4(prefix) => { + let prefix = Ipv4Net::new( + prefix.addr() & prefix.mask_addr(), + prefix.width(), + ) + .expect("still valid after applying mask"); + + ( + IpNet::from(prefix), + any::().prop_map(IpAddr::from).boxed(), + ) + } + IpNet::V6(prefix) => { + let prefix = Ipv6Net::new( + prefix.addr() & prefix.mask_addr(), + prefix.width(), + ) + .expect("still valid after applying mask"); + + ( + IpNet::from(prefix), + any::().prop_map(IpAddr::from).boxed(), + ) + } + }; + (Just(prefix), nexthop_strategy) + }) + .prop_map(|(prefix, nexthop)| match (prefix, nexthop) { + (IpNet::V4(prefix), IpAddr::V4(nexthop)) => { + Self::V4 { nexthop, prefix } + } + (IpNet::V6(prefix), IpAddr::V6(nexthop)) => { + Self::V6 { nexthop, prefix } + } + (IpNet::V4(_), IpAddr::V6(_)) + | (IpNet::V6(_), IpAddr::V4(_)) => { + unreachable!("invalid v4/v6 combo in Arbitrary impl") + } + }) + .boxed() + } +} + +/// Build a minimal `PortConfig` on the given switch with the given routes. +fn port_config(switch: SwitchSlot, routes: Vec) -> PortConfig { + PortConfig { + routes, + addresses: Vec::new(), + switch, + port: "qsfp0".to_string(), + uplink_port_speed: LinkSpeed::Speed100G, + uplink_port_fec: None, + bgp_peers: Vec::new(), + autoneg: false, + lldp: None, + tx_eq: None, + } +} + +fn rack_config(ports: Vec) -> RackNetworkConfig { + RackNetworkConfig { + rack_subnet: "fd00::/48".parse().unwrap(), + infra_ip_first: "10.0.0.1".parse().unwrap(), + infra_ip_last: "10.0.0.100".parse().unwrap(), + ports, + bgp: Vec::new(), + bfd: Vec::new(), + } +} + +fn make_route( + destination: &str, + nexthop: &str, + vlan_id: Option, + rib_priority: Option, +) -> RouteConfig { + RouteConfig { + destination: destination.parse().unwrap(), + nexthop: nexthop.parse().unwrap(), + vlan_id, + rib_priority, + } +} + +fn mgd_path(nexthop: &str, rib_priority: u8, vlan_id: Option) -> MgdPath { + MgdPath { + nexthop: nexthop.parse().unwrap(), + rib_priority, + vlan_id, + shutdown: false, + bgp: None, + nexthop_interface: None, + } +} + +fn mgd_routes( + v4: Vec<(&str, Vec)>, + v6: Vec<(&str, Vec)>, +) -> MgdCurrentRoutes { + MgdCurrentRoutes { + v4: v4.into_iter().map(|(k, v)| (k.to_string(), v)).collect(), + v6: v6.into_iter().map(|(k, v)| (k.to_string(), v)).collect(), + } +} + +#[test] +fn plan_all_unchanged() { + let logctx = dev::test_setup_log("plan_all_unchanged"); + let log = &logctx.log; + + // Desired: one v4 route on Switch0. + let config = rack_config(vec![port_config( + SwitchSlot::Switch0, + vec![make_route("10.0.0.0/24", "10.0.0.1", None, None)], + )]); + + // mgd has the same route (with default priority filled in). + let current = mgd_routes( + vec![("10.0.0.0/24", vec![mgd_path("10.0.0.1", 1, None)])], + vec![], + ); + + let plan = ReconciliationPlan::new( + current, + &config, + ThisSledSwitchSlot::TEST_FAKE, + log, + ) + .expect("plan should succeed"); + + assert_eq!(plan.unchanged_count, 1); + assert!(plan.to_delete.is_empty()); + assert!(plan.to_add.is_empty()); + + logctx.cleanup_successful(); +} + +#[test] +fn plan_add_all() { + let logctx = dev::test_setup_log("plan_add_all"); + let log = &logctx.log; + + // Desired: two routes (one v4, one v6). + let config = rack_config(vec![port_config( + SwitchSlot::Switch0, + vec![ + make_route("10.0.0.0/24", "10.0.0.1", None, Some(5)), + make_route("2001:db8::/64", "2001:db8::1", None, Some(3)), + ], + )]); + + // mgd is empty. + let current = mgd_routes(vec![], vec![]); + + let plan = ReconciliationPlan::new( + current, + &config, + ThisSledSwitchSlot::TEST_FAKE, + log, + ) + .expect("plan should succeed"); + + assert_eq!(plan.unchanged_count, 0); + assert!(plan.to_delete.is_empty()); + assert_eq!(plan.to_add.len(), 2); + + logctx.cleanup_successful(); +} + +#[test] +fn plan_delete_all() { + let logctx = dev::test_setup_log("plan_delete_all"); + let log = &logctx.log; + + // Desired: no routes. + let config = rack_config(vec![]); + + // mgd has two routes. + let current = mgd_routes( + vec![("10.0.0.0/24", vec![mgd_path("10.0.0.1", 1, None)])], + vec![("2001:db8::/64", vec![mgd_path("2001:db8::1", 1, None)])], + ); + + let plan = ReconciliationPlan::new( + current, + &config, + ThisSledSwitchSlot::TEST_FAKE, + log, + ) + .expect("plan should succeed"); + + assert_eq!(plan.unchanged_count, 0); + assert_eq!(plan.to_delete.len(), 2); + assert!(plan.to_add.is_empty()); + + logctx.cleanup_successful(); +} + +#[test] +fn plan_mix() { + let logctx = dev::test_setup_log("plan_mix"); + let log = &logctx.log; + + // Desired: keep 10.0.0.0/24, add 10.1.0.0/24, remove 10.2.0.0/24. + let config = rack_config(vec![port_config( + SwitchSlot::Switch0, + vec![ + make_route("10.0.0.0/24", "10.0.0.1", None, None), + make_route("10.1.0.0/24", "10.1.0.1", None, None), + ], + )]); + + let current = mgd_routes( + vec![ + ("10.0.0.0/24", vec![mgd_path("10.0.0.1", 1, None)]), + ("10.2.0.0/24", vec![mgd_path("10.2.0.1", 1, None)]), + ], + vec![], + ); + + let plan = ReconciliationPlan::new( + current, + &config, + ThisSledSwitchSlot::TEST_FAKE, + log, + ) + .expect("plan should succeed"); + + assert_eq!(plan.unchanged_count, 1); + assert_eq!(plan.to_delete.len(), 1); + assert_eq!(plan.to_add.len(), 1); + + logctx.cleanup_successful(); +} + +#[test] +fn plan_filters_other_switch_slot() { + let logctx = dev::test_setup_log("plan_filters_other_switch_slot"); + let log = &logctx.log; + + // Desired config has routes on both Switch0 and Switch1. Our reconciler is + // Switch0 (TEST_FAKE), so Switch1 routes should be ignored entirely. + let config = rack_config(vec![ + port_config( + SwitchSlot::Switch0, + vec![make_route("10.0.0.0/24", "10.0.0.1", None, None)], + ), + port_config( + SwitchSlot::Switch1, + vec![make_route("10.1.0.0/24", "10.1.0.1", None, None)], + ), + ]); + + // mgd is empty. + let current = mgd_routes(vec![], vec![]); + + let plan = ReconciliationPlan::new( + current, + &config, + ThisSledSwitchSlot::TEST_FAKE, + log, + ) + .expect("plan should succeed"); + + // Only the Switch0 route should be added. + assert_eq!(plan.unchanged_count, 0); + assert!(plan.to_delete.is_empty()); + assert_eq!(plan.to_add.len(), 1); + + logctx.cleanup_successful(); +} + +#[test] +fn plan_default_rib_priority() { + let logctx = dev::test_setup_log("plan_default_rib_priority"); + let log = &logctx.log; + + // Desired route has rib_priority = None (should default to 1). + let config = rack_config(vec![port_config( + SwitchSlot::Switch0, + vec![make_route("10.0.0.0/24", "10.0.0.1", None, None)], + )]); + + // mgd has the same route with priority 1 (the default). + let current = mgd_routes( + vec![( + "10.0.0.0/24", + vec![mgd_path("10.0.0.1", DEFAULT_RIB_PRIORITY_STATIC, None)], + )], + vec![], + ); + + let plan = ReconciliationPlan::new( + current, + &config, + ThisSledSwitchSlot::TEST_FAKE, + log, + ) + .expect("plan should succeed"); + + // Should be unchanged because None defaults to 1. + assert_eq!(plan.unchanged_count, 1); + assert!(plan.to_delete.is_empty()); + assert!(plan.to_add.is_empty()); + + logctx.cleanup_successful(); +} + +#[test] +fn plan_priority_change_is_delete_plus_add() { + let logctx = dev::test_setup_log("plan_priority_change_is_delete_plus_add"); + let log = &logctx.log; + + // Desired: priority 5 for a route. + let config = rack_config(vec![port_config( + SwitchSlot::Switch0, + vec![make_route("10.0.0.0/24", "10.0.0.1", None, Some(5))], + )]); + + // mgd has the same route but with priority 1. + let current = mgd_routes( + vec![("10.0.0.0/24", vec![mgd_path("10.0.0.1", 1, None)])], + vec![], + ); + + let plan = ReconciliationPlan::new( + current, + &config, + ThisSledSwitchSlot::TEST_FAKE, + log, + ) + .expect("plan should succeed"); + + // Different priority means old route deleted, new route added. + assert_eq!(plan.unchanged_count, 0); + assert_eq!(plan.to_delete.len(), 1); + assert_eq!(plan.to_add.len(), 1); + + logctx.cleanup_successful(); +} + +#[test] +fn plan_vlan_id_matters() { + let logctx = dev::test_setup_log("plan_vlan_id_matters"); + let log = &logctx.log; + + // Desired: route with vlan_id = Some(100). + let config = rack_config(vec![port_config( + SwitchSlot::Switch0, + vec![make_route("10.0.0.0/24", "10.0.0.1", Some(100), None)], + )]); + + // mgd has the same route but with no vlan_id. + let current = mgd_routes( + vec![("10.0.0.0/24", vec![mgd_path("10.0.0.1", 1, None)])], + vec![], + ); + + let plan = ReconciliationPlan::new( + current, + &config, + ThisSledSwitchSlot::TEST_FAKE, + log, + ) + .expect("plan should succeed"); + + // Different vlan_id: delete the old, add the new. + assert_eq!(plan.unchanged_count, 0); + assert_eq!(plan.to_delete.len(), 1); + assert_eq!(plan.to_add.len(), 1); + + logctx.cleanup_successful(); +} + +#[test] +fn plan_multiple_nexthops_per_prefix() { + let logctx = dev::test_setup_log("plan_multiple_nexthops_per_prefix"); + let log = &logctx.log; + + // Desired: two nexthops for the same prefix. + let config = rack_config(vec![port_config( + SwitchSlot::Switch0, + vec![ + make_route("10.0.0.0/24", "10.0.0.1", None, None), + make_route("10.0.0.0/24", "10.0.0.2", None, None), + ], + )]); + + // mgd has one of them. + let current = mgd_routes( + vec![("10.0.0.0/24", vec![mgd_path("10.0.0.1", 1, None)])], + vec![], + ); + + let plan = ReconciliationPlan::new( + current, + &config, + ThisSledSwitchSlot::TEST_FAKE, + log, + ) + .expect("plan should succeed"); + + assert_eq!(plan.unchanged_count, 1); + assert!(plan.to_delete.is_empty()); + assert_eq!(plan.to_add.len(), 1); + + logctx.cleanup_successful(); +} + +#[test] +fn plan_rejects_bad_mgd_prefix() { + let logctx = dev::test_setup_log("plan_rejects_bad_mgd_prefix"); + let log = &logctx.log; + + let config = rack_config(vec![]); + + // mgd returns an unparseable prefix. + let current = mgd_routes( + vec![("not-a-prefix", vec![mgd_path("10.0.0.1", 1, None)])], + vec![], + ); + + let err = ReconciliationPlan::new( + current, + &config, + ThisSledSwitchSlot::TEST_FAKE, + log, + ) + .expect_err("plan should fail with bad prefix"); + + assert!( + err.contains("invalid route fetched from mgd"), + "unexpected error: {err}", + ); + + logctx.cleanup_successful(); +} + +#[test] +fn plan_rejects_mixed_ip_families_in_config() { + let logctx = + dev::test_setup_log("plan_rejects_mixed_ip_families_in_config"); + let log = &logctx.log; + + // A route with a v4 nexthop and a v6 destination. + let config = rack_config(vec![port_config( + SwitchSlot::Switch0, + vec![RouteConfig { + destination: "2001:db8::/64".parse().unwrap(), + nexthop: "10.0.0.1".parse().unwrap(), + vlan_id: None, + rib_priority: None, + }], + )]); + + let current = mgd_routes(vec![], vec![]); + + let err = ReconciliationPlan::new( + current, + &config, + ThisSledSwitchSlot::TEST_FAKE, + log, + ) + .expect_err("plan should fail with mixed families"); + + assert!(err.contains("mixed IP families"), "unexpected error: {err}",); + + logctx.cleanup_successful(); +} + +#[test] +fn plan_rejects_wrong_nexthop_family_from_mgd() { + let logctx = + dev::test_setup_log("plan_rejects_wrong_nexthop_family_from_mgd"); + let log = &logctx.log; + + let config = rack_config(vec![]); + + // mgd has a v4 prefix but a v6 nexthop. + let current = mgd_routes( + vec![("10.0.0.0/24", vec![mgd_path("2001:db8::1", 1, None)])], + vec![], + ); + + let err = ReconciliationPlan::new( + current, + &config, + ThisSledSwitchSlot::TEST_FAKE, + log, + ) + .expect_err("plan should fail with wrong nexthop family"); + + assert!( + err.contains("invalid route fetched from mgd"), + "unexpected error: {err}", + ); + + logctx.cleanup_successful(); +} + +#[test] +fn plan_both_empty() { + let logctx = dev::test_setup_log("plan_both_empty"); + let log = &logctx.log; + + let config = rack_config(vec![]); + let current = mgd_routes(vec![], vec![]); + + let plan = ReconciliationPlan::new( + current, + &config, + ThisSledSwitchSlot::TEST_FAKE, + log, + ) + .expect("plan should succeed"); + + assert_eq!(plan.unchanged_count, 0); + assert!(plan.to_delete.is_empty()); + assert!(plan.to_add.is_empty()); + + logctx.cleanup_successful(); +} + +#[test] +fn plan_v6_routes() { + let logctx = dev::test_setup_log("plan_v6_routes"); + let log = &logctx.log; + + // Desired: one v6 route. + let config = rack_config(vec![port_config( + SwitchSlot::Switch0, + vec![make_route("2001:db8::/64", "2001:db8::1", None, Some(2))], + )]); + + // mgd has a different v6 route. + let current = mgd_routes( + vec![], + vec![("fd00::/48", vec![mgd_path("fd00::1", 1, None)])], + ); + + let plan = ReconciliationPlan::new( + current, + &config, + ThisSledSwitchSlot::TEST_FAKE, + log, + ) + .expect("plan should succeed"); + + assert_eq!(plan.unchanged_count, 0); + assert_eq!(plan.to_delete.len(), 1); + assert_eq!(plan.to_add.len(), 1); + + logctx.cleanup_successful(); +} + +#[test] +fn plan_routes_from_multiple_ports() { + let logctx = dev::test_setup_log("plan_routes_from_multiple_ports"); + let log = &logctx.log; + + // Desired: two ports on our switch, each with a route. + let config = rack_config(vec![ + port_config( + SwitchSlot::Switch0, + vec![make_route("10.0.0.0/24", "10.0.0.1", None, None)], + ), + port_config( + SwitchSlot::Switch0, + vec![make_route("10.1.0.0/24", "10.1.0.1", None, None)], + ), + ]); + + // mgd is empty. + let current = mgd_routes(vec![], vec![]); + + let plan = ReconciliationPlan::new( + current, + &config, + ThisSledSwitchSlot::TEST_FAKE, + log, + ) + .expect("plan should succeed"); + + // Both routes from both ports should be added. + assert_eq!(plan.unchanged_count, 0); + assert!(plan.to_delete.is_empty()); + assert_eq!(plan.to_add.len(), 2); + + logctx.cleanup_successful(); +} + +#[derive(Debug, Clone, Copy, test_strategy::Arbitrary)] +enum StaticRouteTestInput { + // route only exists in mgd + MgdOnly, + + // route only exists in desired config for switch 0 or switch 1 + DesiredConfigOnly(SwitchSlot), + + // route exists in mgd and our desired switch 0 config + MgdAndSwitch0, + + // route exists in mgd and our desired switch 0 config, but we want to + // change the priority + // + // The `u8` here is the new priority we want in the desired config; the + // prior mgd priority will come from our parent `StaticRouteTestValue`. They + // may randomly be the same in some proptest runs. + MgdAndSwitch0NewPriority(u8), +} + +// mgd's API for describing static routes is looser than it should be; we'll +// generate a map using keys of the tuple (prefix, nexthop, vlan_id). This is +// more consistent with what mgd does internally: +// . +#[derive( + Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, test_strategy::Arbitrary, +)] +struct StaticRouteTestKey { + description: DiffableStaticRouteDescription, + vlan_id: Option, +} + +#[derive(Debug, Clone, Copy, test_strategy::Arbitrary)] +struct StaticRouteTestValue { + input: StaticRouteTestInput, + priority: u8, +} + +impl StaticRouteTestKey { + fn to_route(&self, priority: u8) -> DiffableStaticRoute { + DiffableStaticRoute { + description: self.description, + vlan_id: self.vlan_id, + priority, + } + } +} + +#[derive(Debug, Clone, test_strategy::Arbitrary)] +struct TestInput { + routes: BTreeMap, +} + +impl From<&'_ DiffableStaticRoute> for RouteConfig { + fn from(value: &'_ DiffableStaticRoute) -> Self { + let destination = match value.description { + DiffableStaticRouteDescription::V4 { prefix, .. } => { + IpNet::V4(prefix) + } + DiffableStaticRouteDescription::V6 { prefix, .. } => { + IpNet::V6(prefix) + } + }; + let nexthop = match value.description { + DiffableStaticRouteDescription::V4 { nexthop, .. } => { + IpAddr::V4(nexthop) + } + DiffableStaticRouteDescription::V6 { nexthop, .. } => { + IpAddr::V6(nexthop) + } + }; + Self { + destination, + nexthop, + vlan_id: value.vlan_id, + rib_priority: Some(value.priority), + } + } +} + +impl TestInput { + fn initial_mgd_routes(&self) -> MgdCurrentRoutes { + Self::routes_from_iter( + self.routes + .iter() + .filter(|(_, val)| match val.input { + StaticRouteTestInput::MgdOnly + | StaticRouteTestInput::MgdAndSwitch0 + | StaticRouteTestInput::MgdAndSwitch0NewPriority(_) => true, + StaticRouteTestInput::DesiredConfigOnly(_) => false, + }) + .map(|(key, val)| key.to_route(val.priority)), + ) + } + + fn post_reconciliation_mgd_routes(&self) -> MgdCurrentRoutes { + Self::routes_from_iter(self.routes.iter().filter_map(|(key, val)| { + match val.input { + StaticRouteTestInput::DesiredConfigOnly( + SwitchSlot::Switch1, + ) + | StaticRouteTestInput::MgdOnly => None, + + StaticRouteTestInput::DesiredConfigOnly( + SwitchSlot::Switch0, + ) + | StaticRouteTestInput::MgdAndSwitch0 => { + Some(key.to_route(val.priority)) + } + StaticRouteTestInput::MgdAndSwitch0NewPriority(priority) => { + Some(key.to_route(priority)) + } + } + })) + } + + fn routes_from_iter( + mgd_routes: impl Iterator, + ) -> MgdCurrentRoutes { + let mut v4: HashMap> = HashMap::new(); + let mut v6: HashMap> = HashMap::new(); + for route in mgd_routes { + match route.description { + DiffableStaticRouteDescription::V4 { nexthop, prefix } => { + v4.entry(prefix.to_string()).or_default().push(MgdPath { + bgp: None, + nexthop: IpAddr::V4(nexthop), + nexthop_interface: None, + rib_priority: route.priority, + shutdown: false, + vlan_id: route.vlan_id, + }); + } + DiffableStaticRouteDescription::V6 { nexthop, prefix } => { + v6.entry(prefix.to_string()).or_default().push(MgdPath { + bgp: None, + nexthop: IpAddr::V6(nexthop), + nexthop_interface: None, + rib_priority: route.priority, + shutdown: false, + vlan_id: route.vlan_id, + }); + } + } + } + + MgdCurrentRoutes { v4, v6 } + } + + fn desired_rack_config(&self) -> RackNetworkConfig { + let switch0 = + self.routes.iter().filter_map(|(key, val)| match val.input { + StaticRouteTestInput::MgdOnly + | StaticRouteTestInput::DesiredConfigOnly( + SwitchSlot::Switch1, + ) => None, + StaticRouteTestInput::MgdAndSwitch0 + | StaticRouteTestInput::DesiredConfigOnly( + SwitchSlot::Switch0, + ) => Some(RouteConfig::from(&key.to_route(val.priority))), + StaticRouteTestInput::MgdAndSwitch0NewPriority(priority) => { + Some(RouteConfig::from(&key.to_route(priority))) + } + }); + let switch1 = + self.routes.iter().filter_map(|(key, val)| match val.input { + StaticRouteTestInput::MgdOnly + | StaticRouteTestInput::MgdAndSwitch0 + | StaticRouteTestInput::MgdAndSwitch0NewPriority(_) + | StaticRouteTestInput::DesiredConfigOnly( + SwitchSlot::Switch0, + ) => None, + StaticRouteTestInput::DesiredConfigOnly( + SwitchSlot::Switch1, + ) => Some(RouteConfig::from(&key.to_route(val.priority))), + }); + rack_config(vec![ + port_config(SwitchSlot::Switch0, switch0.collect()), + port_config(SwitchSlot::Switch1, switch1.collect()), + ]) + } + + fn expected_unchanged_count(&self) -> usize { + self.routes + .values() + .filter(|val| match val.input { + StaticRouteTestInput::MgdOnly + | StaticRouteTestInput::DesiredConfigOnly(_) => false, + StaticRouteTestInput::MgdAndSwitch0 => true, + StaticRouteTestInput::MgdAndSwitch0NewPriority(priority) => { + val.priority == priority + } + }) + .count() + } + + fn expected_to_add(&self) -> BTreeSet { + self.routes + .iter() + .filter_map(|(key, val)| match val.input { + StaticRouteTestInput::MgdOnly + | StaticRouteTestInput::MgdAndSwitch0 + | StaticRouteTestInput::DesiredConfigOnly( + SwitchSlot::Switch1, + ) => None, + StaticRouteTestInput::DesiredConfigOnly( + SwitchSlot::Switch0, + ) => Some(key.to_route(val.priority)), + StaticRouteTestInput::MgdAndSwitch0NewPriority(priority) => { + if priority != val.priority { + Some(key.to_route(priority)) + } else { + None + } + } + }) + .collect() + } + + fn expected_to_delete(&self) -> BTreeSet { + self.routes + .iter() + .filter_map(|(key, val)| match val.input { + StaticRouteTestInput::MgdAndSwitch0 + | StaticRouteTestInput::DesiredConfigOnly(_) => None, + StaticRouteTestInput::MgdOnly => { + Some(key.to_route(val.priority)) + } + StaticRouteTestInput::MgdAndSwitch0NewPriority(priority) => { + if priority != val.priority { + Some(key.to_route(val.priority)) + } else { + None + } + } + }) + .collect() + } + + fn expected_reconciliation_status(&self) -> MgdStaticRouteReconcilerStatus { + let mut unchanged = 0; + let mut deleted_v4 = 0; + let mut deleted_v6 = 0; + let mut added_v4 = 0; + let mut added_v6 = 0; + + for (key, val) in &self.routes { + match val.input { + StaticRouteTestInput::MgdOnly => match key.description { + DiffableStaticRouteDescription::V4 { .. } => { + deleted_v4 += 1; + } + DiffableStaticRouteDescription::V6 { .. } => { + deleted_v6 += 1; + } + }, + StaticRouteTestInput::DesiredConfigOnly( + SwitchSlot::Switch0, + ) => match key.description { + DiffableStaticRouteDescription::V4 { .. } => { + added_v4 += 1; + } + DiffableStaticRouteDescription::V6 { .. } => { + added_v6 += 1; + } + }, + StaticRouteTestInput::MgdAndSwitch0NewPriority(priority) => { + if priority == val.priority { + unchanged += 1; + } else { + match key.description { + DiffableStaticRouteDescription::V4 { .. } => { + deleted_v4 += 1; + added_v4 += 1; + } + DiffableStaticRouteDescription::V6 { .. } => { + deleted_v6 += 1; + added_v6 += 1; + } + } + } + } + StaticRouteTestInput::DesiredConfigOnly( + SwitchSlot::Switch1, + ) => (), + StaticRouteTestInput::MgdAndSwitch0 => { + unchanged += 1; + } + } + } + + MgdStaticRouteReconcilerStatus::Success { + unchanged, + deleted_v4, + deleted_v6, + added_v4, + added_v6, + } + } +} + +#[proptest] +fn proptest_plan(input: TestInput) { + let logctx = dev::test_setup_log("proptest_plan"); + let log = &logctx.log; + + let plan = ReconciliationPlan::new( + input.initial_mgd_routes(), + &input.desired_rack_config(), + ThisSledSwitchSlot::TEST_FAKE, + log, + ) + .expect("plan should succeed"); + + let ReconciliationPlan { unchanged_count, to_delete, to_add } = plan; + + assert_eq!(unchanged_count, input.expected_unchanged_count()); + assert_eq!(to_delete, input.expected_to_delete(), "incorrect to_delete"); + assert_eq!(to_add, input.expected_to_add(), "incorrect to_add"); + + logctx.cleanup_successful(); +} + +struct MgdStaticRouteLists { + v4: MgdStaticRoute4List, + v6: MgdStaticRoute6List, +} + +impl From for MgdStaticRouteLists { + fn from(value: MgdCurrentRoutes) -> Self { + Self { + v4: MgdStaticRoute4List { + list: value + .v4 + .into_iter() + .flat_map(|(prefix, paths)| { + let prefix = prefix + .parse::() + .expect("valid MgdPrefix4"); + paths.into_iter().map(move |path| { + let nexthop = match path.nexthop { + IpAddr::V4(ip) => ip, + IpAddr::V6(_) => { + panic!("invalid path: v4 with v6 nexthop") + } + }; + MgdStaticRoute4 { + nexthop, + prefix, + rib_priority: path.rib_priority, + vlan_id: path.vlan_id, + } + }) + }) + .collect(), + }, + v6: MgdStaticRoute6List { + list: value + .v6 + .into_iter() + .flat_map(|(prefix, paths)| { + let prefix = prefix + .parse::() + .expect("valid MgdPrefix6"); + paths.into_iter().map(move |path| { + let nexthop = match path.nexthop { + IpAddr::V6(ip) => ip, + IpAddr::V4(_) => { + panic!("invalid path: v6 with v4 nexthop") + } + }; + MgdStaticRoute6 { + nexthop, + prefix, + rib_priority: path.rib_priority, + vlan_id: path.vlan_id, + } + }) + }) + .collect(), + }, + } + } +} + +async fn remove_all_current_routes( + client: &Client, + current_routes: MgdCurrentRoutes, +) { + let MgdStaticRouteLists { v4, v6 } = current_routes.into(); + + client + .static_remove_v4_route(&MgdDeleteStaticRoute4Request { routes: v4 }) + .await + .expect("removed v4 routes"); + client + .static_remove_v6_route(&MgdDeleteStaticRoute6Request { routes: v6 }) + .await + .expect("removed v6 routes"); +} + +async fn create_initial_routes( + client: &Client, + initial_routes: MgdCurrentRoutes, +) { + let MgdStaticRouteLists { v4, v6 } = initial_routes.into(); + + client + .static_add_v4_route(&MgdAddStaticRoute4Request { routes: v4 }) + .await + .expect("added v4 routes"); + client + .static_add_v6_route(&MgdAddStaticRoute6Request { routes: v6 }) + .await + .expect("added v6 routes"); +} + +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] +struct ComparableMgdPath<'a> { + nexthop: &'a IpAddr, + nexthop_interface: &'a Option, + rib_priority: &'a u8, + shutdown: &'a bool, + vlan_id: &'a Option, +} + +impl<'a> From<&'a MgdPath> for ComparableMgdPath<'a> { + fn from(value: &'a MgdPath) -> Self { + let MgdPath { + bgp, + nexthop, + nexthop_interface, + rib_priority, + shutdown, + vlan_id, + } = value; + assert!(bgp.is_none(), "static route tests never use BGP"); + + Self { nexthop, nexthop_interface, rib_priority, shutdown, vlan_id } + } +} + +fn assert_routes_eq( + actual: MgdCurrentRoutes, + expected: MgdCurrentRoutes, + description: &str, +) { + // convert to BTree{Map,Set} for nicer output from assert_eq!() on failure + #[derive(Debug, PartialEq, Eq)] + struct BTreeMgdCurrentRoutes<'a> { + v4: BTreeMap<&'a str, BTreeSet>>, + v6: BTreeMap<&'a str, BTreeSet>>, + } + + impl<'a> From<&'a MgdCurrentRoutes> for BTreeMgdCurrentRoutes<'a> { + fn from(routes: &'a MgdCurrentRoutes) -> Self { + fn convert_map<'b>( + input: &'b HashMap>, + ) -> BTreeMap<&'b str, BTreeSet>> + { + let mut output: BTreeMap<_, BTreeSet<_>> = BTreeMap::new(); + for (k, paths) in input { + let paths = paths.iter().map(ComparableMgdPath::from); + output.insert(k.as_str(), paths.collect()); + } + output + } + + Self { v4: convert_map(&routes.v4), v6: convert_map(&routes.v6) } + } + } + + let actual = BTreeMgdCurrentRoutes::from(&actual); + let expected = BTreeMgdCurrentRoutes::from(&expected); + eprintln!("--- checking {description} ---"); + assert_eq!(actual, expected); +} + +#[tokio::test(flavor = "multi_thread")] +async fn proptest_full_reconciliation() { + let logctx = dev::test_setup_log("proptest_full_reconciliation"); + let mgsctx = gateway_test_utils::setup::test_setup( + "proptest_full_reconciliation", + SpPort::One, + ) + .await; + let mut mgdctx = + dev::maghemite::MgdInstance::start(0, mgsctx.address().into()) + .await + .expect("started mgd"); + let client = Client::new( + &format!("http://{}", mgdctx.address()), + logctx.log.clone(), + ); + let rt = tokio::runtime::Handle::current(); + + let one_test_invocation = async |input: TestInput| { + // Clear all routes from a previous proptest invocation. + let current_routes = + MgdCurrentRoutes::fetch(&client).await.expect("fetched all routes"); + remove_all_current_routes(&client, current_routes).await; + let current_routes = + MgdCurrentRoutes::fetch(&client).await.expect("fetched all routes"); + assert_routes_eq( + current_routes, + MgdCurrentRoutes::default(), + "cleared all routes", + ); + + // Apply all initial settings. + create_initial_routes(&client, input.initial_mgd_routes()).await; + let current_routes = + MgdCurrentRoutes::fetch(&client).await.expect("fetched all routes"); + assert_routes_eq( + current_routes, + input.initial_mgd_routes(), + "initial setup mismatch", + ); + + // Perform reconciliation. + let status = reconcile( + &client, + &input.desired_rack_config(), + ThisSledSwitchSlot::TEST_FAKE, + &logctx.log, + ) + .await; + + // Check reported status and mgd state + assert_eq!(status, input.expected_reconciliation_status()); + let current_routes = + MgdCurrentRoutes::fetch(&client).await.expect("fetched all routes"); + assert_routes_eq( + current_routes, + input.post_reconciliation_mgd_routes(), + "post-reconciliation mismatch", + ); + }; + + proptest_macro!(|(input: TestInput)| { + // Do a little dance to call our async `one_test_invocation` within the + // non-async `proptest_macro!()` context. + block_in_place(|| rt.block_on(one_test_invocation(input))); + }); + + mgdctx.cleanup().await.expect("mgd cleanup succeeded"); + mgsctx.teardown().await; + logctx.cleanup_successful(); +} diff --git a/sled-agent/scrimlet-reconcilers/src/reconciler_task.rs b/sled-agent/scrimlet-reconcilers/src/reconciler_task.rs new file mode 100644 index 00000000000..988e4651a3b --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/reconciler_task.rs @@ -0,0 +1,315 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! The general framework for one of the service-specific reconciler tasks +//! implemented in this crate. +//! +//! Each reconciler follows the same structure: +//! +//! 1. If we ever stop being a scrimlet (i.e., the attached switch goes away), +//! go inert until we become a scrimlet again (i.e., the switch reappears). +//! This can happen during sidecar updates if it powers off briefly to reset +//! internal FPGAs, or in a variety of other less common and more rainy-day +//! situations. +//! 2. Periodically or when the `SystemNetworkingConfig` changes, perform +//! service-specific reconciliation. This is provided by implementors of the +//! [`Reconciler`] trait elsewhere in this crate. +//! 3. Report status of this task in an output watch channel, suitable for +//! reporting in the sled-agent inventory. +//! +//! [`ReconcilerTask::run()`] handles 1 and 3, and service-specific +//! implementations of [`Reconciler`] provide 2. + +use crate::handle::ScrimletReconcilersMode; +use crate::status::ReconcilerActivationReason; +use crate::status::ReconcilerCurrentStatus; +use crate::status::ReconcilerInertReason; +use crate::status::ReconcilerRunningStatus; +use crate::status::ReconcilerStatus; +use crate::status::ReconciliationCompletedStatus; +use crate::status::ScrimletStatus; +use crate::switch_zone_slot::ThisSledSwitchSlot; +use chrono::SecondsFormat; +use chrono::Utc; +use sled_agent_types::system_networking::SystemNetworkingConfig; +use slog::Logger; +use slog::error; +use slog::info; +use std::convert::Infallible; +use std::time::Duration; +use std::time::Instant; +use tokio::sync::watch; +use tokio::sync::watch::error::RecvError; +use tokio::task::JoinHandle; + +/// Trait that should be implemented by the service-specific reconciler tasks +/// elsewhere in this crate. +pub(crate) trait Reconciler: Send + 'static { + type Status: slog::KV + Clone + Send + Sync + 'static; + + const LOGGER_COMPONENT_NAME: &'static str; + const RE_RECONCILE_INTERVAL: Duration; + + /// Construct a new instance of this `Reconciler`. + /// + /// Typically builds a client for the relevant service based on `mode` and + /// record `switch_slot` for use inside future calls to + /// `do_reconciliation()`. + fn new( + mode: ScrimletReconcilersMode, + switch_slot: ThisSledSwitchSlot, + parent_log: &Logger, + ) -> Self; + + /// Perform any required reconciliation based on the current contents of + /// `system_networking_config`. + /// + /// This method is infallible; any errors must be described by + /// `Self::Status`. + fn do_reconciliation( + &mut self, + system_networking_config: &SystemNetworkingConfig, + log: &Logger, + ) -> impl Future + Send; +} + +#[derive(Debug)] +pub(crate) struct ReconcilerTaskHandle { + status_rx: watch::Receiver>, + + // We never wait on this task: the only way it exits is by panicking (which + // results in a process abort, since we build sled-agent with panic=abort) + // or if the watch channels it's reading are dropped, which itself can only + // happen by a panic elsewhere. + _task: JoinHandle<()>, +} + +impl ReconcilerTaskHandle { + pub(crate) fn spawn( + scrimlet_status_rx: watch::Receiver, + system_networking_config_rx: watch::Receiver, + mode: ScrimletReconcilersMode, + this_sled_switch_slot: ThisSledSwitchSlot, + parent_log: &Logger, + ) -> Self { + Self::spawn_impl( + scrimlet_status_rx, + system_networking_config_rx, + mode, + this_sled_switch_slot, + parent_log, + T::new, + ) + } + + // Separate, private function that allows unit tests to customize how `T` is + // constructed. Production passes `T::new` as `inner_constructor`; i.e., + // just call the constructor we know exists from the `Reconciler` trait. + fn spawn_impl( + scrimlet_status_rx: watch::Receiver, + system_networking_config_rx: watch::Receiver, + mode: ScrimletReconcilersMode, + this_sled_switch_slot: ThisSledSwitchSlot, + parent_log: &Logger, + inner_constructor: F, + ) -> Self + where + F: FnOnce(ScrimletReconcilersMode, ThisSledSwitchSlot, &Logger) -> T + + Send + + Sync + + 'static, + { + let (status_tx, status_rx) = watch::channel(ReconcilerStatus { + current_status: ReconcilerCurrentStatus::Idle, + last_completion: None, + }); + + let log = parent_log + .new(slog::o!("scrimlet_reconciler" => T::LOGGER_COMPONENT_NAME)); + + let mut inner_task = ReconcilerTask { + scrimlet_status_rx, + system_networking_config_rx, + status_tx, + inner: inner_constructor(mode, this_sled_switch_slot, parent_log), + log, + }; + + let task = tokio::spawn(async move { + match inner_task.run().await { + // `inner_task.run()` runs forever... + Ok(never_returns) => match never_returns {}, + + // ... unless one of its input watch channels has closed. + Err(_recv_error) => { + inner_task.status_tx.send_modify(|status| { + status.current_status = ReconcilerCurrentStatus::Inert( + ReconcilerInertReason::TaskExitedUnexpectedly, + ); + }); + error!( + inner_task.log, + "exited due to watch channel closure \ + (unexpected except during shutdown in tests)" + ); + } + } + }); + + Self { status_rx, _task: task } + } + + pub(crate) fn status(&self) -> ReconcilerStatus { + self.status_rx.borrow().clone() + } +} + +struct ReconcilerTask { + scrimlet_status_rx: watch::Receiver, + system_networking_config_rx: watch::Receiver, + status_tx: watch::Sender>, + inner: T, + log: Logger, +} + +impl ReconcilerTask { + async fn run(&mut self) -> Result { + let mut activation_reason = ReconcilerActivationReason::Startup; + let mut activation_count: u64 = 0; + + loop { + // We know we _were_ a scrimlet at some point, because we determined + // our switch slot by contacting MGS within our own switch zone. But + // it's possible we could become "not a scrimlet" in the future + // (e.g., if the switch disappears out from under us). In such a + // case, block until it comes back. + self.wait_if_this_sled_is_no_longer_a_scrimlet().await?; + + // We _are_ a scrimlet; perform reconciliation. + let start_instant = Instant::now(); + let start_time = Utc::now(); + info!( + self.log, "starting reconciliation attempt"; + "activation_reason" => ?activation_reason, + "activation_count" => activation_count, + ); + + // Snapshot the current networking config so we hold the watch + // channel as little as possible. + let system_networking_config = + self.system_networking_config_rx.borrow_and_update().clone(); + let running_status = + ReconcilerRunningStatus::new(activation_reason); + + // Update our status. + self.status_tx.send_modify(|status| { + status.current_status = + ReconcilerCurrentStatus::Running(running_status); + }); + + // Actually perform reconciliation. + let status_result = self + .inner + .do_reconciliation(&system_networking_config, &self.log) + .await; + + // Update our output watch channel with the result. + info!( + self.log, "reconciliation attempt complete"; + "activation_reason" => ?activation_reason, + "activation_count" => activation_count, + "started_at" => start_time.to_rfc3339_opts( + SecondsFormat::Millis, + /* use_z */ true, + ), + "elapsed" => ?start_instant.elapsed(), + &status_result, + ); + self.status_tx.send_modify(|status| { + status.current_status = ReconcilerCurrentStatus::Idle; + status.last_completion = + Some(Box::new(ReconciliationCompletedStatus { + activation_reason, + completed_at_time: Utc::now(), + ran_for: running_status.elapsed_since_start(), + activation_count, + status: status_result, + })); + }); + activation_count = activation_count.wrapping_add(1); + + // Wait until we should perform reconciliation again: our + // re-reconciliation periodic timer fires or one of our input watch + // channels changes. + // + // All arms are cancel-safe and we do not `.await` within the body + // of any arm, avoiding any opportunity for futurelock. + activation_reason = tokio::select! { + () = tokio::time::sleep(T::RE_RECONCILE_INTERVAL) => { + ReconcilerActivationReason::PeriodicTimer + } + + result = self.system_networking_config_rx.changed() => { + () = result?; + ReconcilerActivationReason::SystemNetworkingConfigChanged + } + + result = self.scrimlet_status_rx.changed() => { + () = result?; + ReconcilerActivationReason::ScrimletStatusChanged + } + }; + } + } + + async fn wait_if_this_sled_is_no_longer_a_scrimlet( + &mut self, + ) -> Result<(), RecvError> { + let mut logged_not_scrimlet = false; + + loop { + let status = *self.scrimlet_status_rx.borrow_and_update(); + match status { + ScrimletStatus::Scrimlet => { + return Ok(()); + } + ScrimletStatus::NotScrimlet => { + if !logged_not_scrimlet { + info!( + self.log, + "not a scrimlet - reconciler going inert" + ); + logged_not_scrimlet = true; + } + self.status_tx.send_modify(|status| { + status.current_status = ReconcilerCurrentStatus::Inert( + ReconcilerInertReason::NoLongerAScrimlet, + ); + }); + + // Select over both input channels so we can detect channel + // closure and exit cleanly if either channel goes away. If + // the rack network config changes here, we'll spuriously + // loop around and reread the scrimlet status, but that's no + // big deal. + // + // Both arms are cancel-safe and we do not `.await` within + // the body of any arm, avoiding any opportunity for + // futurelock. + tokio::select! { + result = self.scrimlet_status_rx.changed() => { + () = result?; + } + result = self.system_networking_config_rx.changed() => { + () = result?; + } + } + } + } + } + } +} + +#[cfg(test)] +mod tests; diff --git a/sled-agent/scrimlet-reconcilers/src/reconciler_task/tests.rs b/sled-agent/scrimlet-reconcilers/src/reconciler_task/tests.rs new file mode 100644 index 00000000000..b0cce407532 --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/reconciler_task/tests.rs @@ -0,0 +1,745 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::*; +use assert_matches::assert_matches; +use sled_agent_types::early_networking::RackNetworkConfig; +use std::mem; +use std::sync::Arc; +use std::sync::Mutex; +use tokio::sync::mpsc; +use tokio::time::Instant; + +struct MockReconciler { + do_reconciliation_calls: Arc>>, + do_reconciliation_results: mpsc::UnboundedReceiver, +} + +#[derive(Debug, Clone)] +struct MockReconcilerStatus(String); + +impl slog::KV for MockReconcilerStatus { + fn serialize( + &self, + _record: &slog::Record<'_>, + serializer: &mut dyn slog::Serializer, + ) -> slog::Result { + serializer.emit_str("mock-reconciler".into(), &self.0) + } +} + +impl Reconciler for MockReconciler { + type Status = MockReconcilerStatus; + + const LOGGER_COMPONENT_NAME: &'static str = "MockReconciler"; + const RE_RECONCILE_INTERVAL: Duration = Duration::from_secs(30); + + fn new( + _mode: ScrimletReconcilersMode, + _switch_slot: ThisSledSwitchSlot, + _parent_log: &Logger, + ) -> Self { + unimplemented!("not called by tests") + } + + async fn do_reconciliation( + &mut self, + system_networking_config: &SystemNetworkingConfig, + _log: &Logger, + ) -> Self::Status { + self.do_reconciliation_calls + .lock() + .unwrap() + .push(system_networking_config.clone()); + MockReconcilerStatus( + self.do_reconciliation_results + .recv() + .await + .expect("test never closes sending side of channel"), + ) + } +} + +fn test_system_networking_config_1() -> SystemNetworkingConfig { + SystemNetworkingConfig { + rack_network_config: RackNetworkConfig { + rack_subnet: "fd00:1122:3344:0100::/56".parse().unwrap(), + infra_ip_first: "192.0.2.10".parse().unwrap(), + infra_ip_last: "192.0.2.100".parse().unwrap(), + ports: Vec::new(), + bgp: Vec::new(), + bfd: Vec::new(), + }, + blueprint_external_networking_config: None, + } +} + +fn test_system_networking_config_2() -> SystemNetworkingConfig { + SystemNetworkingConfig { + rack_network_config: RackNetworkConfig { + rack_subnet: "fd00:aabb:ccdd:0200::/56".parse().unwrap(), + infra_ip_first: "192.0.2.20".parse().unwrap(), + infra_ip_last: "192.0.2.200".parse().unwrap(), + ports: Vec::new(), + bgp: Vec::new(), + bfd: Vec::new(), + }, + blueprint_external_networking_config: None, + } +} + +struct Harness { + task: ReconcilerTaskHandle, + scrimlet_status_tx: watch::Sender, + system_networking_config_tx: watch::Sender, + do_reconciliation_results_tx: mpsc::UnboundedSender, + do_reconciliation_calls: Arc>>, +} + +impl Harness { + const WAIT_FOR_STATUS_CHECK_INTERVAL: Duration = Duration::from_millis(100); + + fn new(log: &Logger) -> Self { + let (scrimlet_status_tx, scrimlet_status_rx) = + watch::channel(ScrimletStatus::Scrimlet); + let (system_networking_config_tx, system_networking_config_rx) = + watch::channel(test_system_networking_config_1()); + + let (do_reconciliation_results_tx, do_reconciliation_results) = + mpsc::unbounded_channel(); + let do_reconciliation_calls = Arc::new(Mutex::new(Vec::new())); + + let task = { + let do_reconciliation_calls = Arc::clone(&do_reconciliation_calls); + let dummy_addr = "0.0.0.0:0".parse().unwrap(); + ReconcilerTaskHandle::spawn_impl( + scrimlet_status_rx, + system_networking_config_rx, + ScrimletReconcilersMode::Test { + mgs_addr: dummy_addr, + dpd_addr: dummy_addr, + mgd_addr: dummy_addr, + }, + ThisSledSwitchSlot::TEST_FAKE, + log, + |_ip, _slot, _log| MockReconciler { + do_reconciliation_calls, + do_reconciliation_results, + }, + ) + }; + + Self { + task, + scrimlet_status_tx, + system_networking_config_tx, + do_reconciliation_results_tx, + do_reconciliation_calls, + } + } + + fn set_scrimlet_status(&self, status: ScrimletStatus) { + self.scrimlet_status_tx.send_modify(|s| { + *s = status; + }); + } + + async fn wait_for_do_reconciliation_call_count(&self, count: usize) { + let mut last_seen = self.do_reconciliation_calls.lock().unwrap().len(); + let start = Instant::now(); + while start.elapsed() < Duration::from_secs(5) { + if last_seen == count { + return; + } + tokio::time::sleep(Self::WAIT_FOR_STATUS_CHECK_INTERVAL).await; + last_seen = self.do_reconciliation_calls.lock().unwrap().len(); + } + panic!( + "timeout waiting for do_reconciliation call count {count} \ + (got {last_seen})" + ); + } + + async fn wait_for_task_status( + &self, + description: &str, + matches: F, + ) -> ReconcilerStatus + where + F: Fn(&ReconcilerCurrentStatus) -> bool, + { + let mut status = self.task.status(); + let start = Instant::now(); + while start.elapsed() < Duration::from_secs(5) { + if matches(&status.current_status) { + return status; + } + tokio::time::sleep(Self::WAIT_FOR_STATUS_CHECK_INTERVAL).await; + status = self.task.status(); + } + panic!( + "timeout waiting for task status {description} (got {status:?})" + ); + } + + async fn wait_for_task_status_no_longer_a_scrimlet( + &self, + ) -> ReconcilerStatus { + self.wait_for_task_status("Inert(NoLongerAScrimlet)", |status| { + matches!( + status, + ReconcilerCurrentStatus::Inert( + ReconcilerInertReason::NoLongerAScrimlet + ) + ) + }) + .await + } + + async fn wait_for_task_status_idle( + &self, + ) -> ReconcilerStatus { + self.wait_for_task_status("Idle", |status| { + matches!(status, ReconcilerCurrentStatus::Idle) + }) + .await + } + + async fn shutdown_cleanly(self) { + let Self { + task, scrimlet_status_tx, do_reconciliation_results_tx, .. + } = self; + + // Dropping this watch channel should cause the task to exit. + mem::drop(scrimlet_status_tx); + + task._task.await.expect("task didn't panic"); + let final_status = task.status_rx.borrow(); + assert_matches!( + final_status.current_status, + ReconcilerCurrentStatus::Inert( + ReconcilerInertReason::TaskExitedUnexpectedly + ) + ); + + // Explicitly drop this _after_ the task exits so we're guaranteed + // not to hit the `.expect()` in + // `MockReconciler::do_reconciliation()` above. + mem::drop(do_reconciliation_results_tx); + } +} + +// Test the first activation. +#[tokio::test(start_paused = true)] +async fn first_activation() { + let logctx = omicron_test_utils::dev::test_setup_log("first_activation"); + let harness = Harness::new(&logctx.log); + + // Confirm we start in Idle. + assert_matches!( + harness.task.status().current_status, + ReconcilerCurrentStatus::Idle + ); + + // Task should call do_reconciliation() for the first time. + harness.wait_for_do_reconciliation_call_count(1).await; + harness.do_reconciliation_results_tx.send("first".to_string()).unwrap(); + let status = harness.wait_for_task_status_idle().await; + + let completion = + status.last_completion.expect("last_completion should be preserved"); + assert_eq!(completion.activation_count, 0); + assert_eq!(completion.status.0, "first"); + assert_matches!( + completion.activation_reason, + ReconcilerActivationReason::Startup + ); + + harness.shutdown_cleanly().await; + logctx.cleanup_successful(); +} + +// Test: after completing a reconciliation and reaching the select!, +// setting NotScrimlet causes the task to loop back to +// wait_if_this_sled_is_no_longer_a_scrimlet and go inert — without +// performing another reconciliation. +#[tokio::test(start_paused = true)] +async fn scrimlet_becomes_not_scrimlet_during_select() { + let logctx = omicron_test_utils::dev::test_setup_log( + "scrimlet_becomes_not_scrimlet_during_select", + ); + let harness = Harness::new(&logctx.log); + + // Complete the first reconciliation so the task reaches the select!. + harness.wait_for_do_reconciliation_call_count(1).await; + harness.do_reconciliation_results_tx.send("first".to_string()).unwrap(); + harness.wait_for_task_status_idle().await; + + // Become NotScrimlet → the select! fires with ScrimletStatusChanged, + // the loop goes back to wait_if_this_sled_is_no_longer_a_scrimlet, and the + // task becomes Inert(NoLongerAScrimlet). + harness.set_scrimlet_status(ScrimletStatus::NotScrimlet); + let status = harness.wait_for_task_status_no_longer_a_scrimlet().await; + + // No additional do_reconciliation call should have happened: the + // ScrimletStatusChanged activation saw NotScrimlet and went inert + // instead of reconciling. + assert_eq!(harness.do_reconciliation_calls.lock().unwrap().len(), 1); + + // last_completion from the prior run should still be present. + let completion = + status.last_completion.expect("last_completion should be preserved"); + assert_eq!(completion.activation_count, 0); + assert_eq!(completion.status.0, "first"); + + harness.shutdown_cleanly().await; + logctx.cleanup_successful(); +} + +// Test: after the first reconciliation completes, changing the +// SystemNetworkingConfig triggers a second reconciliation with +// activation_reason = SystemNetworkingConfigChanged and the new config. +#[tokio::test(start_paused = true)] +async fn system_networking_config_change_triggers_re_reconciliation() { + let logctx = omicron_test_utils::dev::test_setup_log( + "system_networking_config_change_triggers_re_reconciliation", + ); + let harness = Harness::new(&logctx.log); + + // Wait for the first do_reconciliation call (Startup). + harness.wait_for_do_reconciliation_call_count(1).await; + + // Complete the first reconciliation. + harness.do_reconciliation_results_tx.send("first".to_string()).unwrap(); + harness.wait_for_task_status_idle().await; + + // Send a new SystemNetworkingConfig. + let first_config = test_system_networking_config_1(); + let second_config = test_system_networking_config_2(); + assert_ne!(first_config, second_config); + harness.system_networking_config_tx.send(second_config.clone()).unwrap(); + + // Wait for the second do_reconciliation call. + harness.wait_for_do_reconciliation_call_count(2).await; + + // The second call should have received the new config. + let received_configs = + harness.do_reconciliation_calls.lock().unwrap().clone(); + assert_eq!(received_configs.len(), 2); + assert_eq!(received_configs[0], first_config); + assert_eq!(received_configs[1], second_config); + + // Status should be Running with SystemNetworkingConfigChanged. + let status = harness.task.status(); + match &status.current_status { + ReconcilerCurrentStatus::Running(running) => { + assert_matches!( + running.activation_reason(), + ReconcilerActivationReason::SystemNetworkingConfigChanged + ); + } + other => panic!("expected Running status, got {other:?}"), + } + + // Complete the second reconciliation. + harness.do_reconciliation_results_tx.send("second".to_string()).unwrap(); + let status = harness.wait_for_task_status_idle().await; + + let completion = + status.last_completion.expect("should have last_completion"); + assert_matches!( + completion.activation_reason, + ReconcilerActivationReason::SystemNetworkingConfigChanged + ); + assert_eq!(completion.activation_count, 1); + assert_eq!(completion.status.0, "second"); + + harness.shutdown_cleanly().await; + logctx.cleanup_successful(); +} + +// Test: after the first reconciliation completes, the periodic timer +// fires after RE_RECONCILE_INTERVAL and triggers a second +// reconciliation with activation_reason = PeriodicTimer. +#[tokio::test(start_paused = true)] +async fn periodic_timer_triggers_re_reconciliation() { + let logctx = omicron_test_utils::dev::test_setup_log( + "periodic_timer_triggers_re_reconciliation", + ); + let harness = Harness::new(&logctx.log); + + // Complete the first reconciliation (Startup). + harness.wait_for_do_reconciliation_call_count(1).await; + harness.do_reconciliation_results_tx.send("first".to_string()).unwrap(); + harness.wait_for_task_status_idle().await; + + // Advance time just short of the interval — no second call yet. + tokio::time::advance( + MockReconciler::RE_RECONCILE_INTERVAL - Duration::from_millis(1), + ) + .await; + assert_eq!(harness.do_reconciliation_calls.lock().unwrap().len(), 1); + + // Advance past the interval — periodic timer fires. + tokio::time::advance(Duration::from_millis(1)).await; + harness.wait_for_do_reconciliation_call_count(2).await; + + // Status should be Running with PeriodicTimer. + let status = harness.task.status(); + match &status.current_status { + ReconcilerCurrentStatus::Running(running) => { + assert_matches!( + running.activation_reason(), + ReconcilerActivationReason::PeriodicTimer + ); + } + other => panic!("expected Running status, got {other:?}"), + } + + // Complete and verify. + harness.do_reconciliation_results_tx.send("periodic".to_string()).unwrap(); + let status = harness.wait_for_task_status_idle().await; + let completion = + status.last_completion.expect("should have last_completion"); + assert_matches!( + completion.activation_reason, + ReconcilerActivationReason::PeriodicTimer + ); + assert_eq!(completion.activation_count, 1); + assert_eq!(completion.status.0, "periodic"); + + harness.shutdown_cleanly().await; + logctx.cleanup_successful(); +} + +// Test: if the SystemNetworkingConfig changes while do_reconciliation is +// in-flight, the task should notice when it reaches the select! and +// immediately perform another reconciliation with +// activation_reason = SystemNetworkingConfigChanged using the latest config. +#[tokio::test(start_paused = true)] +async fn config_change_during_inflight_reconciliation() { + let logctx = omicron_test_utils::dev::test_setup_log( + "config_change_during_inflight_reconciliation", + ); + let harness = Harness::new(&logctx.log); + + // Wait for the first do_reconciliation call (Startup) to be entered. + harness.wait_for_do_reconciliation_call_count(1).await; + + // While the first reconciliation is still in-flight, change the + // config. The task won't see this until it finishes and hits the + // select!. + harness + .system_networking_config_tx + .send(test_system_networking_config_2()) + .unwrap(); + + // Complete the first reconciliation. + harness.do_reconciliation_results_tx.send("first".to_string()).unwrap(); + + // The task should immediately start a second reconciliation because + // system_networking_config_rx.changed() fires in the select!. Because we + // have time paused, the elapsed time should be exactly one check interval + // of our test harness. + let before = tokio::time::Instant::now(); + harness.wait_for_do_reconciliation_call_count(2).await; + assert_eq!(before.elapsed(), Harness::WAIT_FOR_STATUS_CHECK_INTERVAL); + + // The second call should have received the new config (via + // borrow_and_update()). + let received_configs = + harness.do_reconciliation_calls.lock().unwrap().clone(); + assert_eq!(received_configs[0], test_system_networking_config_1()); + assert_eq!(received_configs[1], test_system_networking_config_2()); + + // Status should be Running with SystemNetworkingConfigChanged. + let status = harness.task.status(); + match &status.current_status { + ReconcilerCurrentStatus::Running(running) => { + assert_matches!( + running.activation_reason(), + ReconcilerActivationReason::SystemNetworkingConfigChanged + ); + } + other => panic!("expected Running status, got {other:?}"), + } + + // Complete the second reconciliation and verify. + harness.do_reconciliation_results_tx.send("second".to_string()).unwrap(); + let status = harness.wait_for_task_status_idle().await; + let completion = + status.last_completion.expect("should have last_completion"); + assert_matches!( + completion.activation_reason, + ReconcilerActivationReason::SystemNetworkingConfigChanged + ); + assert_eq!(completion.activation_count, 1); + assert_eq!(completion.status.0, "second"); + + harness.shutdown_cleanly().await; + logctx.cleanup_successful(); +} + +// Test: full scrimlet status round-trip. Start as scrimlet, complete +// reconciliation #0 (Startup). Set NotScrimlet → task goes inert. Set +// Scrimlet again → reconciliation #1 fires with activation_reason = +// ScrimletStatusChanged and activation_count = 1. +#[tokio::test(start_paused = true)] +async fn scrimlet_status_round_trip() { + let logctx = + omicron_test_utils::dev::test_setup_log("scrimlet_status_round_trip"); + let harness = Harness::new(&logctx.log); + + // First reconciliation (Startup). + harness.wait_for_do_reconciliation_call_count(1).await; + harness.do_reconciliation_results_tx.send("first".to_string()).unwrap(); + let status = harness.wait_for_task_status_idle().await; + let completion = + status.last_completion.expect("should have last_completion"); + assert_matches!( + completion.activation_reason, + ReconcilerActivationReason::Startup + ); + assert_eq!(completion.activation_count, 0); + + // Become NotScrimlet → task should go inert. + harness.set_scrimlet_status(ScrimletStatus::NotScrimlet); + harness.wait_for_task_status_no_longer_a_scrimlet().await; + + // No additional do_reconciliation call should have happened. + assert_eq!(harness.do_reconciliation_calls.lock().unwrap().len(), 1); + + // Become Scrimlet again → reconciliation #1 fires. + harness.set_scrimlet_status(ScrimletStatus::Scrimlet); + harness.wait_for_do_reconciliation_call_count(2).await; + + // Status should be Running with ScrimletStatusChanged. + let status = harness.task.status(); + match &status.current_status { + ReconcilerCurrentStatus::Running(running) => { + assert_matches!( + running.activation_reason(), + ReconcilerActivationReason::ScrimletStatusChanged + ); + } + other => panic!("expected Running status, got {other:?}"), + } + + // Complete the second reconciliation and check last_completion. + harness.do_reconciliation_results_tx.send("second".to_string()).unwrap(); + let status = harness.wait_for_task_status_idle().await; + let completion = + status.last_completion.expect("should have last_completion"); + assert_matches!( + completion.activation_reason, + ReconcilerActivationReason::ScrimletStatusChanged + ); + assert_eq!(completion.activation_count, 1); + assert_eq!(completion.status.0, "second"); + + harness.shutdown_cleanly().await; + logctx.cleanup_successful(); +} + +// Test: dropping the system_networking_config sender while the task is in +// the select! (after completing a reconciliation) causes the task to +// exit with TaskExitedUnexpectedly. +#[tokio::test(start_paused = true)] +async fn channel_closure_system_networking_config_during_select() { + let logctx = omicron_test_utils::dev::test_setup_log( + "channel_closure_system_networking_config_during_select", + ); + let harness = Harness::new(&logctx.log); + + // Complete one reconciliation so the task reaches the select!. + harness.wait_for_do_reconciliation_call_count(1).await; + harness.do_reconciliation_results_tx.send("done".to_string()).unwrap(); + harness.wait_for_task_status_idle().await; + + // Drop the system_networking_config sender. This closes the watch channel, + // which causes the `system_networking_config_rx.changed()` arm in the + // select! to return Err(RecvError), causing the task to exit. + let Harness { + task, + scrimlet_status_tx, + system_networking_config_tx, + do_reconciliation_results_tx, + do_reconciliation_calls, + } = harness; + + mem::drop(system_networking_config_tx); + + // Wait for the task to exit and verify the final status. + task._task.await.expect("task didn't panic"); + let final_status = task.status_rx.borrow(); + assert_matches!( + final_status.current_status, + ReconcilerCurrentStatus::Inert( + ReconcilerInertReason::TaskExitedUnexpectedly + ) + ); + + // do_reconciliation should have been called exactly once (the initial + // Startup reconciliation); no second call after the channel closed. + assert_eq!(do_reconciliation_calls.lock().unwrap().len(), 1); + + mem::drop(scrimlet_status_tx); + mem::drop(do_reconciliation_results_tx); + + logctx.cleanup_successful(); +} + +// Test: dropping the scrimlet_status sender while the task is in the +// select! (after completing a reconciliation) causes the task to exit +// with TaskExitedUnexpectedly. +#[tokio::test(start_paused = true)] +async fn channel_closure_scrimlet_status_during_select() { + let logctx = omicron_test_utils::dev::test_setup_log( + "channel_closure_scrimlet_status_during_select", + ); + let harness = Harness::new(&logctx.log); + + // Complete one reconciliation so the task reaches the select!. + harness.wait_for_do_reconciliation_call_count(1).await; + harness.do_reconciliation_results_tx.send("done".to_string()).unwrap(); + harness.wait_for_task_status_idle().await; + + // Destructure the harness so we can drop the scrimlet_status sender + // while still holding the other pieces we need. + let Harness { + task, + scrimlet_status_tx, + do_reconciliation_results_tx, + do_reconciliation_calls, + .. + } = harness; + + // Drop the scrimlet_status sender. This closes the watch channel, + // which causes the `scrimlet_status_rx.changed()` arm in the + // select! to return Err(RecvError), causing the task to exit. + mem::drop(scrimlet_status_tx); + + // Wait for the task to exit and verify the final status. + task._task.await.expect("task didn't panic"); + let final_status = task.status_rx.borrow(); + assert_matches!( + final_status.current_status, + ReconcilerCurrentStatus::Inert( + ReconcilerInertReason::TaskExitedUnexpectedly + ) + ); + + // do_reconciliation should have been called exactly once (the initial + // Startup reconciliation); no second call after the channel closed. + assert_eq!(do_reconciliation_calls.lock().unwrap().len(), 1); + + // Explicitly drop after the task exits so we can't hit the .expect() + // in MockReconciler::do_reconciliation(). + mem::drop(do_reconciliation_results_tx); + + logctx.cleanup_successful(); +} + +// Test: dropping the scrimlet_status sender while the task is blocked in +// wait_if_this_sled_is_no_longer_a_scrimlet() waiting for the switch slot +// causes the task to exit with TaskExitedUnexpectedly. +#[tokio::test(start_paused = true)] +async fn channel_closure_scrimlet_status_during_not_scrimlet() { + let logctx = omicron_test_utils::dev::test_setup_log( + "channel_closure_scrimlet_status_during_not_scrimlet", + ); + let harness = Harness::new(&logctx.log); + + // Set ourselves as "not a scrimlet". + harness.set_scrimlet_status(ScrimletStatus::NotScrimlet); + + // Wait for the task to reach Inert(NoLongerAScrimlet). + harness.wait_for_task_status_no_longer_a_scrimlet().await; + + // Destructure the harness so we can drop the scrimlet_status sender + // while still holding the other pieces we need. + let Harness { + task, + scrimlet_status_tx, + system_networking_config_tx, + do_reconciliation_results_tx, + do_reconciliation_calls, + } = harness; + + // Drop the scrimlet_status sender. This closes the watch channel, + // which causes scrimlet_status_rx.changed().await to return + // Err(RecvError) inside wait_if_this_sled_is_no_longer_a_scrimlet(), + // causing the task to exit. + mem::drop(scrimlet_status_tx); + + // Wait for the task to exit and verify the final status. + task._task.await.expect("task didn't panic"); + let final_status = task.status_rx.borrow(); + assert_matches!( + final_status.current_status, + ReconcilerCurrentStatus::Inert( + ReconcilerInertReason::TaskExitedUnexpectedly + ) + ); + + // do_reconciliation() should never have been called. + assert_eq!(do_reconciliation_calls.lock().unwrap().len(), 0); + + // Explicitly drop after the task exits. + mem::drop(do_reconciliation_results_tx); + mem::drop(system_networking_config_tx); + + logctx.cleanup_successful(); +} + +// Test: dropping the system_networking_config sender while the task is blocked +// in wait_if_this_sled_is_no_longer_a_scrimlet() causes the task to exit with +// TaskExitedUnexpectedly. +#[tokio::test(start_paused = true)] +async fn channel_closure_system_networking_config_during_not_scrimlet() { + let logctx = omicron_test_utils::dev::test_setup_log( + "channel_closure_system_networking_config_during_not_scrimlet", + ); + let harness = Harness::new(&logctx.log); + + // Set ourselves as "not a scrimlet". + harness.set_scrimlet_status(ScrimletStatus::NotScrimlet); + + // Wait for the task to reach Inert(NoLongerAScrimlet). + harness.wait_for_task_status_no_longer_a_scrimlet().await; + + // Destructure the harness so we can drop the system_networking_config + // sender while still holding the other pieces we need. + let Harness { + task, + scrimlet_status_tx, + system_networking_config_tx, + do_reconciliation_results_tx, + do_reconciliation_calls, + } = harness; + + // Drop the system_networking_config sender. The task is currently blocked + // waiting to become a scrimlet again, but also monitors this channel for + // closure; it should notice and exit. + mem::drop(system_networking_config_tx); + + // Wait for the task to exit and verify the final status. + task._task.await.expect("task didn't panic"); + let final_status = task.status_rx.borrow(); + assert_matches!( + final_status.current_status, + ReconcilerCurrentStatus::Inert( + ReconcilerInertReason::TaskExitedUnexpectedly + ) + ); + + // do_reconciliation() should never have been called. + assert_eq!(do_reconciliation_calls.lock().unwrap().len(), 0); + + // Explicitly drop after the task exits. + mem::drop(do_reconciliation_results_tx); + mem::drop(scrimlet_status_tx); + + logctx.cleanup_successful(); +} diff --git a/sled-agent/scrimlet-reconcilers/src/status.rs b/sled-agent/scrimlet-reconcilers/src/status.rs new file mode 100644 index 00000000000..d433745b4d6 --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/status.rs @@ -0,0 +1,141 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Types for the status and results of the reconcilers in this crate. + +use crate::DpdReconcilerStatus; +use crate::MgdReconcilerStatus; +use crate::UplinkdReconcilerStatus; +use chrono::DateTime; +use chrono::Utc; +use std::time::Duration; +use std::time::Instant; + +/// Whether or not this sled is a scrimlet. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ScrimletStatus { + Scrimlet, + NotScrimlet, +} + +/// Status of attempting to determine this sled's switch slot via MGS within +/// this sled's switch zone. +#[derive(Debug, Clone)] +pub enum DetermineSwitchSlotStatus { + /// We're not attempting to contact MGS because we're not a scrimlet. + NotScrimlet, + + /// We're currently attempting to contact MGS. + /// + /// If this is not the first attempt, `prev_attempt_err` contains the error + /// we encountered the last time. (If the last time succeeded, we'd be + /// done!) + ContactingMgs { prev_attempt_err: Option }, + + /// We're currently idle waiting for a timeout to retry due to a previous + /// failure. + WaitingToRetry { prev_attempt_err: String }, +} + +/// Why a reconciler task has gone inert. +#[derive(Debug, Clone, Copy)] +pub enum ReconcilerInertReason { + /// The reconciler task started when this sled was a scrimlet, but it has + /// since become "not a scrimlet" (e.g., because the attached switch has + /// gone away). + NoLongerAScrimlet, + + /// The reconciler task exited. This is not expected except in tests; the + /// task runs forever as long as sled-agent holds on to the channels used to + /// communicate with it. + TaskExitedUnexpectedly, +} + +/// Why a reconciler task was activated. +#[derive(Debug, Clone, Copy)] +pub enum ReconcilerActivationReason { + /// Each reconciler runs once on startup. + Startup, + /// The task was activated due to its periodic timer firing. + PeriodicTimer, + /// The task was activated in response to a change in the networking config. + SystemNetworkingConfigChanged, + /// The task was activated in response to the sled becoming a scrimlet again + /// (after previously transitioning to "not a scrimlet"). + ScrimletStatusChanged, +} + +#[derive(Debug, Clone)] +pub struct ReconciliationCompletedStatus { + pub activation_reason: ReconcilerActivationReason, + pub completed_at_time: DateTime, + pub ran_for: Duration, + pub activation_count: u64, + pub status: T, +} + +#[derive(Debug, Clone, Copy)] +pub struct ReconcilerRunningStatus { + activation_reason: ReconcilerActivationReason, + started_at_time: DateTime, + started_at_instant: Instant, +} + +impl ReconcilerRunningStatus { + pub(crate) fn new(activation_reason: ReconcilerActivationReason) -> Self { + Self { + activation_reason, + started_at_time: Utc::now(), + started_at_instant: Instant::now(), + } + } + + pub fn activation_reason(&self) -> ReconcilerActivationReason { + self.activation_reason + } + + pub fn started_at(&self) -> DateTime { + self.started_at_time + } + + pub fn elapsed_since_start(&self) -> Duration { + self.started_at_instant.elapsed() + } +} + +#[derive(Debug, Clone)] +pub enum ReconcilerCurrentStatus { + /// The reconciler is inert: it will not or cannot run for some reason. + Inert(ReconcilerInertReason), + /// The reconciler is currently running. + Running(ReconcilerRunningStatus), + /// The reconciler is not currently running. + Idle, +} + +#[derive(Debug, Clone)] +pub struct ReconcilerStatus { + /// Status of the task at this moment. + pub current_status: ReconcilerCurrentStatus, + /// Final status of the most recent activation of this task. + // Box the inner status to avoid clippy complaining about + // `ScrimletReconcilersStatus::Running { ... }` being overly large. + pub last_completion: Option>>, +} + +#[derive(Debug, Clone)] +pub enum ScrimletReconcilersStatus { + /// `sled-agent` has not yet provided underlay networking information. + WaitingForSledAgentNetworkingInfo, + + /// We're attempting to determine our switch slot. + DeterminingSwitchSlot(DetermineSwitchSlotStatus), + + /// We are a scrimlet and the individual reconcilers are running. + Running { + dpd_reconciler: ReconcilerStatus, + mgd_reconciler: ReconcilerStatus, + uplinkd_reconciler: ReconcilerStatus, + }, +} diff --git a/sled-agent/scrimlet-reconcilers/src/switch_zone_slot.rs b/sled-agent/scrimlet-reconcilers/src/switch_zone_slot.rs new file mode 100644 index 00000000000..d802446fce4 --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/switch_zone_slot.rs @@ -0,0 +1,153 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use crate::DetermineSwitchSlotStatus; +use crate::ScrimletStatus; +use gateway_client::Client; +use gateway_client::ClientInfo; +use gateway_types::component::SpType; +use sled_agent_types::early_networking::SwitchSlot; +use slog::Logger; +use slog::error; +use slog::warn; +use slog_error_chain::InlineErrorChain; +use std::time::Duration; +use tokio::sync::watch; +use tokio::sync::watch::error::RecvError; + +/// Newtype wrapper around [`SwitchSlot`]. This type is always the physical slot +/// of our own, local switch. +/// +/// This information can only be determined by asking MGS inside our own switch +/// zone. An instance of this type can only be created if we are indeed a +/// scrimlet. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub(crate) struct ThisSledSwitchSlot(SwitchSlot); + +impl PartialEq for ThisSledSwitchSlot { + fn eq(&self, other: &SwitchSlot) -> bool { + self.0 == *other + } +} + +impl PartialEq for SwitchSlot { + fn eq(&self, other: &ThisSledSwitchSlot) -> bool { + *self == other.0 + } +} + +impl ThisSledSwitchSlot { + const MGS_RETRY_TIMEOUT: Duration = Duration::from_secs(5); + + #[cfg(test)] + pub(crate) const TEST_FAKE: Self = Self(SwitchSlot::Switch0); + + /// Attempt to determine this sled's switch slot via `client`, which _must_ + /// be an MGS client pointed at the IP address of our switch zone. (We take + /// this as a `Client` instead of a more strict type to allow tests to call + /// this function with a `client` pointed at a non-switch-zone address; the + /// function is `pub(crate)` and we expect callers to respect this + /// requirement in non-test paths.) + /// + /// This function blocks until it either succeeds or the + /// `scrimlet_status_rx` channel is closed. It will retry indefinitely on + /// any failures to communicate via `client`. + pub(crate) async fn determine_retrying_forever( + determine_status_tx: watch::Sender, + scrimlet_status_rx: &mut watch::Receiver, + client: &Client, + log: &Logger, + ) -> Result { + loop { + // Wait until we become a scrimlet; there's no point in trying to + // contact our switch zone if it doesn't exist. + loop { + let scrimlet_status = *scrimlet_status_rx.borrow_and_update(); + match scrimlet_status { + ScrimletStatus::Scrimlet => break, + ScrimletStatus::NotScrimlet => { + determine_status_tx.send_modify(|status| { + *status = DetermineSwitchSlotStatus::NotScrimlet; + }); + scrimlet_status_rx.changed().await?; + continue; + } + } + } + + // Update to our status to `ContactingMgs`, and carry forward any + // error from a previous attempt. + determine_status_tx.send_if_modified(|status| match status { + DetermineSwitchSlotStatus::ContactingMgs { .. } => false, + DetermineSwitchSlotStatus::NotScrimlet => { + *status = DetermineSwitchSlotStatus::ContactingMgs { + prev_attempt_err: None, + }; + true + } + DetermineSwitchSlotStatus::WaitingToRetry { + prev_attempt_err, + } => { + *status = DetermineSwitchSlotStatus::ContactingMgs { + prev_attempt_err: Some(prev_attempt_err.clone()), + }; + true + } + }); + + // We are a scrimlet - see if we know our own slot yet. + let err = match client + .sp_local_switch_id() + .await + .map(|resp| resp.into_inner()) + { + Ok(identity) => match (identity.type_, identity.slot) { + (SpType::Switch, 0) => { + return Ok(ThisSledSwitchSlot(SwitchSlot::Switch0)); + } + (SpType::Switch, 1) => { + return Ok(ThisSledSwitchSlot(SwitchSlot::Switch1)); + } + (sp_type, sp_slot) => { + // We should never get any other response; if we do, + // something has gone very wrong with MGS. It's not + // likely retrying will fix this, but there isn't + // anything else we can do. + error!( + log, + "failed to determine this sled's switch slot: got \ + unexpected identity; will retry"; + "sp_type" => ?sp_type, + "sp_slot" => sp_slot, + ); + format!( + "received invalid SP type/slot combo from MGS {}: \ + {sp_type:?}/{sp_slot}", + client.baseurl() + ) + } + }, + Err(err) => { + let err = InlineErrorChain::new(&err); + warn!( + log, + "failed to determine this sled's switch slot; \ + will retry"; + &err, + ); + err.to_string() + } + }; + + determine_status_tx.send_modify(|status| { + *status = DetermineSwitchSlotStatus::WaitingToRetry { + prev_attempt_err: err, + }; + }); + + // Sleep briefly before retrying. + tokio::time::sleep(Self::MGS_RETRY_TIMEOUT).await; + } + } +} diff --git a/sled-agent/scrimlet-reconcilers/src/uplinkd_reconciler.rs b/sled-agent/scrimlet-reconcilers/src/uplinkd_reconciler.rs new file mode 100644 index 00000000000..4c3fe678e12 --- /dev/null +++ b/sled-agent/scrimlet-reconcilers/src/uplinkd_reconciler.rs @@ -0,0 +1,60 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Reconciler for configuration of `uplinkd` within a scrimlet's switch zone. +//! +//! Unlike most reconcilers in this crate, `uplinkd`'s configuration is managed +//! via SMF, not a dropshot server. + +use crate::ScrimletReconcilersMode; +use crate::reconciler_task::Reconciler; +use crate::switch_zone_slot::ThisSledSwitchSlot; +use sled_agent_types::system_networking::SystemNetworkingConfig; +use slog::Logger; +use std::time::Duration; + +#[derive(Debug, Clone)] +pub struct UplinkdReconcilerStatus { + pub todo_status: (), +} + +impl slog::KV for UplinkdReconcilerStatus { + fn serialize( + &self, + _record: &slog::Record<'_>, + serializer: &mut dyn slog::Serializer, + ) -> slog::Result { + serializer.emit_str("uplinkd-reconciler".into(), "not yet implemented") + } +} + +#[derive(Debug)] +pub(crate) struct UplinkdReconciler { + _switch_slot: ThisSledSwitchSlot, +} + +impl Reconciler for UplinkdReconciler { + type Status = UplinkdReconcilerStatus; + + const LOGGER_COMPONENT_NAME: &'static str = "UplinkdReconciler"; + const RE_RECONCILE_INTERVAL: std::time::Duration = Duration::from_secs(30); + + fn new( + _mode: ScrimletReconcilersMode, + switch_slot: ThisSledSwitchSlot, + _parent_log: &Logger, + ) -> Self { + // TODO: Remain inert if `mode` is `ScrimletReconcilersMode::Test`, + // since that indicates there's no real zone to connect to. + Self { _switch_slot: switch_slot } + } + + async fn do_reconciliation( + &mut self, + _system_networking_config: &SystemNetworkingConfig, + _log: &Logger, + ) -> Self::Status { + UplinkdReconcilerStatus { todo_status: () } + } +} diff --git a/sled-agent/types/versions/src/bootstore_service_nat/system_networking.rs b/sled-agent/types/versions/src/bootstore_service_nat/system_networking.rs index 46e9a80e8e9..d6d6c4a5025 100644 --- a/sled-agent/types/versions/src/bootstore_service_nat/system_networking.rs +++ b/sled-agent/types/versions/src/bootstore_service_nat/system_networking.rs @@ -39,7 +39,17 @@ use std::collections::btree_map::Entry; use std::net::IpAddr; use std::net::Ipv6Addr; -#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq, JsonSchema)] +#[derive( + Clone, + Debug, + Deserialize, + Serialize, + PartialEq, + Eq, + PartialOrd, + Ord, + JsonSchema, +)] #[serde(tag = "kind", rename_all = "snake_case")] #[cfg_attr(any(test, feature = "testing"), derive(test_strategy::Arbitrary))] pub enum ServiceZoneNatKind { @@ -48,12 +58,26 @@ pub enum ServiceZoneNatKind { Nexus { external_ip: IpAddr }, } -#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq, JsonSchema)] +#[derive( + Clone, + Debug, + Deserialize, + Serialize, + PartialEq, + Eq, + PartialOrd, + Ord, + JsonSchema, +)] #[cfg_attr(any(test, feature = "testing"), derive(test_strategy::Arbitrary))] pub struct ServiceZoneNatEntry { pub zone_id: OmicronZoneUuid, pub sled_underlay_ip: Ipv6Addr, pub nic_mac: MacAddr, + #[cfg_attr( + any(test, feature = "testing"), + strategy(proptest::strategy::Just(Vni::SERVICES_VNI)) + )] pub vni: Vni, pub kind: ServiceZoneNatKind, } diff --git a/sled-agent/types/versions/src/impls/early_networking.rs b/sled-agent/types/versions/src/impls/early_networking.rs index 83f5152a4e3..121a19f47d3 100644 --- a/sled-agent/types/versions/src/impls/early_networking.rs +++ b/sled-agent/types/versions/src/impls/early_networking.rs @@ -104,6 +104,66 @@ impl std::fmt::Display for UplinkIpNet { } } +#[cfg(any(test, feature = "testing"))] +impl proptest::arbitrary::Arbitrary for UplinkIpNet { + type Parameters = (); + type Strategy = proptest::prelude::BoxedStrategy; + + fn arbitrary_with(_args: Self::Parameters) -> Self::Strategy { + use proptest::prelude::*; + use std::net::Ipv4Addr; + use test_strategy::Arbitrary; + + // `UplinkIpNet::try_from()` rejects quite a lot of IP addresses that + // the default `Arbitrary` implementation likes to produce: loopback + // addresses, unspecified addresses, ipv4-mapped-ipv6, etc. `ValidIpNet` + // has restrictions on the first octet (ipv4) or segment (ipv6) that + // causes it to produce IPs that are valid `UplinkIpNet`s. We also skip + // plenty of valid `UplinkIpNet`s, but should provide sufficient + // coverage for any downstream tests. + #[derive(Debug, Clone, Copy, Arbitrary)] + enum ValidIpNet { + V4 { + #[strategy(1_u8..127)] + octet0: u8, + other_octets: [u8; 3], + #[strategy(0_u8..32)] + prefix: u8, + }, + V6 { + #[strategy(1_u16..0xfe00)] + segment0: u16, + other_segments: [u16; 7], + #[strategy(0_u8..128)] + prefix: u8, + }, + } + + any::() + .prop_map(|arb| { + let ipnet = match arb { + ValidIpNet::V4 { octet0, other_octets, prefix } => { + let mut octets = [0; 4]; + octets[0] = octet0; + octets[1..].copy_from_slice(&other_octets); + let ip = Ipv4Addr::from_octets(octets); + IpNet::new(ip.into(), prefix).expect("valid prefix") + } + ValidIpNet::V6 { segment0, other_segments, prefix } => { + let mut segments = [0; 8]; + segments[0] = segment0; + segments[1..].copy_from_slice(&other_segments); + let ip = Ipv6Addr::from_segments(segments); + IpNet::new(ip.into(), prefix).expect("valid prefix") + } + }; + UplinkIpNet::try_from(ipnet) + .expect("ValidIpNet produces valid UplinkIpNets") + }) + .boxed() + } +} + impl std::fmt::Display for RouterPeerIpAddr { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.0.fmt(f) diff --git a/sled-agent/types/versions/src/impls/system_networking.rs b/sled-agent/types/versions/src/impls/system_networking.rs index e9fe0689947..38ee85c8798 100644 --- a/sled-agent/types/versions/src/impls/system_networking.rs +++ b/sled-agent/types/versions/src/impls/system_networking.rs @@ -102,6 +102,22 @@ impl proptest::arbitrary::Arbitrary for ServiceZoneNatEntries { for e in extra { entries.insert_overwrite(e); } + // Filter out sets that contain overlapping IPs. + let mut unique_ips = std::collections::BTreeSet::new(); + for e in &entries { + let ip = match e.kind.external_ip() { + ip @ IpAddr::V4(_) => ip, + ip @ IpAddr::V6(ip6) => { + match ip6.to_ipv4_mapped() { + Some(ip4) => IpAddr::V4(ip4), + None => ip, + } + } + }; + if !unique_ips.insert(ip) { + return None; + } + } ServiceZoneNatEntries::try_from(entries).ok() }, ) diff --git a/sled-agent/types/versions/src/initial/early_networking.rs b/sled-agent/types/versions/src/initial/early_networking.rs index cd539d51424..760e87eeec8 100644 --- a/sled-agent/types/versions/src/initial/early_networking.rs +++ b/sled-agent/types/versions/src/initial/early_networking.rs @@ -245,6 +245,7 @@ pub struct LldpPortConfig { #[derive( Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq, Hash, JsonSchema, )] +#[cfg_attr(any(test, feature = "testing"), derive(test_strategy::Arbitrary))] pub struct TxEqConfig { /// Pre-cursor tap1 pub pre1: Option, @@ -298,6 +299,7 @@ pub struct PortConfig { strum::EnumIter, )] #[serde(rename_all = "snake_case")] +#[cfg_attr(any(test, feature = "testing"), derive(test_strategy::Arbitrary))] pub enum SwitchSlot { /// Switch in upper slot Switch0, @@ -307,9 +309,19 @@ pub enum SwitchSlot { /// The speed of a link. #[derive( - Copy, Clone, Debug, Deserialize, Serialize, PartialEq, Eq, JsonSchema, Hash, + Copy, + Clone, + Debug, + Deserialize, + Serialize, + PartialEq, + Eq, + JsonSchema, + Hash, + daft::Diffable, )] #[serde(rename_all = "snake_case")] +#[cfg_attr(any(test, feature = "testing"), derive(test_strategy::Arbitrary))] pub enum LinkSpeed { /// Zero gigabits per second. #[serde(alias = "0G")] @@ -345,6 +357,7 @@ pub enum LinkSpeed { Copy, Clone, Debug, Deserialize, Serialize, PartialEq, Eq, JsonSchema, Hash, )] #[serde(rename_all = "snake_case")] +#[cfg_attr(any(test, feature = "testing"), derive(test_strategy::Arbitrary))] pub enum LinkFec { /// Firecode forward error correction. Firecode, diff --git a/sled-agent/types/versions/src/stronger_bgp_unnumbered_types/early_networking.rs b/sled-agent/types/versions/src/stronger_bgp_unnumbered_types/early_networking.rs index 709f3d9fe0b..b0bc87873af 100644 --- a/sled-agent/types/versions/src/stronger_bgp_unnumbered_types/early_networking.rs +++ b/sled-agent/types/versions/src/stronger_bgp_unnumbered_types/early_networking.rs @@ -69,6 +69,7 @@ pub enum InvalidIpAddrError { Ord, )] #[serde(tag = "type", rename_all = "snake_case")] +#[cfg_attr(any(test, feature = "testing"), derive(test_strategy::Arbitrary))] pub enum UplinkAddress { // `#[serde(rename)]` this to `addrconf` so when this shows up in // config-rss.toml development files, it's not phrased as `addr_conf`. This @@ -297,8 +298,19 @@ impl TryFrom for RouterPeerIpAddr { } #[derive( - Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq, JsonSchema, Hash, + Clone, + Copy, + Debug, + Deserialize, + Serialize, + PartialEq, + Eq, + JsonSchema, + Hash, + PartialOrd, + Ord, )] +#[cfg_attr(any(test, feature = "testing"), derive(test_strategy::Arbitrary))] pub struct UplinkAddressConfig { /// The address to be used on the uplink. pub address: UplinkAddress, diff --git a/test-utils/src/dev/dendrite.rs b/test-utils/src/dev/dendrite.rs index 7992fcf48f6..dce6b4689d1 100644 --- a/test-utils/src/dev/dendrite.rs +++ b/test-utils/src/dev/dendrite.rs @@ -4,7 +4,7 @@ //! Tools for managing Dendrite during development -use std::net::SocketAddr; +use std::net::{Ipv6Addr, SocketAddr, SocketAddrV6}; use std::path::{Path, PathBuf}; use std::process::Stdio; use std::time::Duration; @@ -96,6 +96,10 @@ impl DendriteInstance { Ok(Self { port, args, child: Some(child), data_dir: Some(temp_dir) }) } + pub fn address(&self) -> SocketAddrV6 { + SocketAddrV6::new(Ipv6Addr::LOCALHOST, self.port, 0, 0) + } + pub async fn cleanup(&mut self) -> Result<(), anyhow::Error> { if let Some(mut child) = self.child.take() { child.start_kill().context("Sending SIGKILL to child")?; diff --git a/test-utils/src/dev/maghemite.rs b/test-utils/src/dev/maghemite.rs index eaae1af8cd4..988c3a42375 100644 --- a/test-utils/src/dev/maghemite.rs +++ b/test-utils/src/dev/maghemite.rs @@ -4,7 +4,7 @@ //! Tools for managing Maghemite during development -use std::net::SocketAddr; +use std::net::{Ipv6Addr, SocketAddr, SocketAddrV6}; use std::path::{Path, PathBuf}; use std::process::Stdio; use std::time::Duration; @@ -88,6 +88,10 @@ impl MgdInstance { Ok(Self { port, args, child, data_dir: Some(temp_dir) }) } + pub fn address(&self) -> SocketAddrV6 { + SocketAddrV6::new(Ipv6Addr::LOCALHOST, self.port, 0, 0) + } + pub async fn cleanup(&mut self) -> Result<(), anyhow::Error> { if let Some(mut child) = self.child.take() { child.start_kill().context("Sending SIGKILL to child")?; diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index d13b339e431..e665ee40e65 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -102,7 +102,7 @@ postgres-types = { version = "0.2.12", default-features = false, features = ["wi ppv-lite86 = { version = "0.2.21", default-features = false, features = ["simd", "std"] } predicates = { version = "3.1.4" } proc-macro2 = { version = "1.0.106" } -proptest = { version = "1.10.0" } +proptest = { version = "1.11.0" } quote = { version = "1.0.45" } rand-274715c4dabd11b0 = { package = "rand", version = "0.9.2" } rand-c38e5c1d305a1b54 = { package = "rand", version = "0.8.6" } @@ -248,7 +248,7 @@ postgres-types = { version = "0.2.12", default-features = false, features = ["wi ppv-lite86 = { version = "0.2.21", default-features = false, features = ["simd", "std"] } predicates = { version = "3.1.4" } proc-macro2 = { version = "1.0.106" } -proptest = { version = "1.10.0" } +proptest = { version = "1.11.0" } quote = { version = "1.0.45" } rand-274715c4dabd11b0 = { package = "rand", version = "0.9.2" } rand-c38e5c1d305a1b54 = { package = "rand", version = "0.8.6" }