From 438019dedcc40a0c554bf32837d62e0ae0967347 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Sun, 17 May 2026 02:13:31 +0000 Subject: [PATCH 01/12] bug(health): gate switch hosts and bmcs in spawn to avoid redfish calls on switch hosts Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/src/discovery/spawn.rs | 212 ++++++++++++++++++++------- crates/health/src/endpoint/model.rs | 11 ++ 2 files changed, 169 insertions(+), 54 deletions(-) diff --git a/crates/health/src/discovery/spawn.rs b/crates/health/src/discovery/spawn.rs index 2bfbc901e8..d9b91460f9 100644 --- a/crates/health/src/discovery/spawn.rs +++ b/crates/health/src/discovery/spawn.rs @@ -28,7 +28,7 @@ use crate::collectors::{ StreamingCollectorStartContext, }; use crate::config::{Configurable, LogCollectionMode}; -use crate::endpoint::{BmcEndpoint, EndpointMetadata}; +use crate::endpoint::BmcEndpoint; use crate::sink::DataSink; fn logs_state_file_path(template: &str, endpoint_id: &str) -> PathBuf { @@ -40,9 +40,23 @@ pub(super) async fn spawn_collectors_for_endpoint( endpoint: &Arc, data_sink: Option>, metrics_prefix: &str, +) -> Result<(), HealthError> { + if endpoint.switch_data().is_some() { + spawn_switch_collectors(ctx, endpoint, data_sink, metrics_prefix) + } else { + spawn_generic_redfish_collectors(ctx, endpoint, data_sink, metrics_prefix) + } +} + +fn spawn_generic_redfish_collectors( + ctx: &mut DiscoveryLoopContext, + endpoint: &Arc, + data_sink: Option>, + metrics_prefix: &str, ) -> Result<(), HealthError> { let key = endpoint.key(); let endpoint_arc = endpoint.clone(); + if let Configurable::Enabled(sensor_cfg) = &ctx.sensors_config && !ctx.collectors.contains(CollectorKind::Sensor, &key) { @@ -216,7 +230,7 @@ pub(super) async fn spawn_collectors_for_endpoint( metrics_prefix, )?); match Collector::start::>( - endpoint_arc.clone(), + endpoint_arc, LeakDetectorCollectorConfig { data_sink: data_sink.clone(), state_refresh_interval: leak_detector_cfg.state_refresh_interval, @@ -250,9 +264,20 @@ pub(super) async fn spawn_collectors_for_endpoint( } } + Ok(()) +} + +fn spawn_switch_collectors( + ctx: &mut DiscoveryLoopContext, + endpoint: &Arc, + data_sink: Option>, + metrics_prefix: &str, +) -> Result<(), HealthError> { + let key = endpoint.key(); + let endpoint_arc = endpoint.clone(); + if let Configurable::Enabled(nmxt_cfg) = &ctx.nmxt_config && !ctx.collectors.contains(CollectorKind::Nmxt, &key) - && matches!(endpoint.metadata, Some(EndpointMetadata::Switch(_))) { let collector_registry = Arc::new( ctx.metrics_manager @@ -279,7 +304,7 @@ pub(super) async fn spawn_collectors_for_endpoint( tracing::info!( endpoint_key = %key, total_nmxt_collectors = ctx.collectors.len(CollectorKind::Nmxt), - "Started NMX-T collection for BMC endpoint" + "Started NMX-T collection for switch endpoint" ); } Err(error) => { @@ -295,7 +320,6 @@ pub(super) async fn spawn_collectors_for_endpoint( if let Configurable::Enabled(nvue_cfg) = &ctx.nvue_config && let Configurable::Enabled(rest_cfg) = &nvue_cfg.rest && !ctx.collectors.contains(CollectorKind::NvueRest, &key) - && matches!(endpoint.metadata, Some(EndpointMetadata::Switch(_))) { let collector_registry = Arc::new( ctx.metrics_manager @@ -322,7 +346,7 @@ pub(super) async fn spawn_collectors_for_endpoint( tracing::info!( endpoint_key = %key, total_nvue_rest_collectors = ctx.collectors.len(CollectorKind::NvueRest), - "Started NVUE REST collection for BMC endpoint" + "Started NVUE REST collection for switch endpoint" ); } Err(error) => { @@ -347,59 +371,155 @@ mod tests { use super::*; use crate::config::{Config, Configurable}; - use crate::endpoint::{BmcAddr, BmcCredentials, EndpointMetadata, SwitchData}; + use crate::endpoint::{BmcAddr, BmcCredentials, EndpointMetadata, MachineData, SwitchData}; use crate::limiter::{NoopLimiter, RateLimiter}; use crate::metrics::MetricsManager; + use crate::sink::{CollectorEvent, EventContext}; - #[test] - fn test_logs_state_file_path_replaces_endpoint_id() { - let path = logs_state_file_path("/tmp/logs_{machine_id}.json", "endpoint-42"); - assert_eq!(path, PathBuf::from("/tmp/logs_endpoint-42.json")); + struct NoopSink; + + impl DataSink for NoopSink { + fn sink_type(&self) -> &'static str { + "noop" + } + + fn handle_event(&self, _context: &EventContext, _event: &CollectorEvent) {} } - #[test] - fn test_endpoint_log_identity_falls_back_to_mac_without_metadata() { - let endpoint = BmcEndpoint::with_fixed_credentials( + fn context_with_config(config: Config, metrics_name: &str) -> DiscoveryLoopContext { + let limiter: Arc = Arc::new(NoopLimiter); + let metrics_manager = + Arc::new(MetricsManager::new(metrics_name).expect("metrics manager should initialize")); + DiscoveryLoopContext::new(limiter, metrics_manager, Arc::new(config)) + .expect("context should initialize") + } + + fn test_endpoint( + ip: Ipv4Addr, + mac: &str, + metadata: Option, + ) -> Arc { + Arc::new(BmcEndpoint::with_fixed_credentials( BmcAddr { - ip: IpAddr::V4(Ipv4Addr::new(10, 0, 0, 1)), + ip: IpAddr::V4(ip), port: Some(443), - mac: MacAddress::from_str("aa:bb:cc:dd:ee:ff").unwrap(), + mac: MacAddress::from_str(mac).expect("valid mac address"), }, BmcCredentials::UsernamePassword { username: "user".to_string(), password: Some("pass".to_string()), }, + metadata, None, - None, - ); + )) + } + + fn switch_metadata() -> EndpointMetadata { + EndpointMetadata::Switch(SwitchData { + id: None, + serial: "switch-serial-1".to_string(), + slot_number: None, + tray_index: None, + }) + } + + fn machine_metadata() -> EndpointMetadata { + EndpointMetadata::Machine(MachineData { + machine_id: "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0" + .parse() + .expect("valid machine id"), + machine_serial: None, + slot_number: None, + tray_index: None, + nvlink_domain_uuid: None, + }) + } + + #[test] + fn test_logs_state_file_path_replaces_endpoint_id() { + let path = logs_state_file_path("/tmp/logs_{machine_id}.json", "endpoint-42"); + assert_eq!(path, PathBuf::from("/tmp/logs_endpoint-42.json")); + } + + #[test] + fn test_endpoint_log_identity_falls_back_to_mac_without_metadata() { + let endpoint = test_endpoint(Ipv4Addr::new(10, 0, 0, 1), "aa:bb:cc:dd:ee:ff", None); assert_eq!(endpoint.log_identity().as_ref(), "AA:BB:CC:DD:EE:FF"); } #[test] fn test_endpoint_log_identity_uses_switch_serial_when_available() { - let endpoint = BmcEndpoint::with_fixed_credentials( - BmcAddr { - ip: IpAddr::V4(Ipv4Addr::new(10, 0, 0, 2)), - port: Some(443), - mac: MacAddress::from_str("11:22:33:44:55:66").unwrap(), - }, - BmcCredentials::UsernamePassword { - username: "user".to_string(), - password: Some("pass".to_string()), - }, - Some(EndpointMetadata::Switch(SwitchData { - id: None, - serial: "switch-serial-1".to_string(), - slot_number: None, - tray_index: None, - })), - None, + let endpoint = test_endpoint( + Ipv4Addr::new(10, 0, 0, 2), + "11:22:33:44:55:66", + Some(switch_metadata()), ); assert_eq!(endpoint.log_identity().as_ref(), "switch-serial-1"); } + #[tokio::test] + async fn test_switch_endpoint_does_not_start_generic_redfish_collectors() { + let mut config = Config::default(); + config.collectors.sensors = Configurable::Enabled(Default::default()); + config.collectors.logs = Configurable::Enabled(Default::default()); + config.collectors.firmware = Configurable::Enabled(Default::default()); + config.collectors.leak_detector = Configurable::Enabled(Default::default()); + config.collectors.nmxt = Configurable::Disabled; + config.collectors.nvue = Configurable::Disabled; + + let mut ctx = context_with_config(config, "test_switch_generic_redfish_gate"); + let endpoint = test_endpoint( + Ipv4Addr::new(10, 0, 0, 6), + "55:66:77:88:99:aa", + Some(switch_metadata()), + ); + + spawn_collectors_for_endpoint( + &mut ctx, + &endpoint, + Some(Arc::new(NoopSink)), + "test_switch_generic_redfish_gate", + ) + .await + .expect("spawn should succeed"); + + assert_eq!(ctx.collectors.len(CollectorKind::Sensor), 0); + assert_eq!(ctx.collectors.len(CollectorKind::Logs), 0); + assert_eq!(ctx.collectors.len(CollectorKind::Firmware), 0); + assert_eq!(ctx.collectors.len(CollectorKind::LeakDetector), 0); + } + + #[tokio::test] + async fn test_machine_endpoint_still_starts_sse_logs_collector() { + let mut config = Config::default(); + config.collectors.sensors = Configurable::Disabled; + config.collectors.logs = Configurable::Enabled(Default::default()); + config.collectors.firmware = Configurable::Disabled; + config.collectors.leak_detector = Configurable::Disabled; + config.collectors.nmxt = Configurable::Disabled; + config.collectors.nvue = Configurable::Disabled; + + let mut ctx = context_with_config(config, "test_machine_sse_logs_collector"); + let endpoint = test_endpoint( + Ipv4Addr::new(10, 0, 0, 7), + "66:77:88:99:aa:bb", + Some(machine_metadata()), + ); + + spawn_collectors_for_endpoint( + &mut ctx, + &endpoint, + Some(Arc::new(NoopSink)), + "test_machine_sse_logs_collector", + ) + .await + .expect("spawn should succeed"); + + assert_eq!(ctx.collectors.len(CollectorKind::Logs), 1); + } + #[tokio::test] async fn test_spawn_is_idempotent_when_collectors_are_disabled() { let mut config = Config::default(); @@ -408,26 +528,10 @@ mod tests { config.collectors.firmware = Configurable::Disabled; config.collectors.leak_detector = Configurable::Disabled; config.collectors.nmxt = Configurable::Disabled; + config.collectors.nvue = Configurable::Disabled; - let limiter: Arc = Arc::new(NoopLimiter); - let metrics_manager = - Arc::new(MetricsManager::new("test").expect("metrics manager should initialize")); - let mut ctx = DiscoveryLoopContext::new(limiter, metrics_manager, Arc::new(config)) - .expect("context should initialize"); - - let endpoint = Arc::new(BmcEndpoint::with_fixed_credentials( - BmcAddr { - ip: IpAddr::V4(Ipv4Addr::new(10, 0, 0, 1)), - port: Some(443), - mac: MacAddress::from_str("aa:bb:cc:dd:ee:ff").unwrap(), - }, - BmcCredentials::UsernamePassword { - username: "user".to_string(), - password: Some("pass".to_string()), - }, - None, - None, - )); + let mut ctx = context_with_config(config, "test_disabled_collectors"); + let endpoint = test_endpoint(Ipv4Addr::new(10, 0, 0, 1), "aa:bb:cc:dd:ee:ff", None); spawn_collectors_for_endpoint(&mut ctx, &endpoint, None, "test") .await diff --git a/crates/health/src/endpoint/model.rs b/crates/health/src/endpoint/model.rs index 06b2cf2a3a..825a3497c8 100644 --- a/crates/health/src/endpoint/model.rs +++ b/crates/health/src/endpoint/model.rs @@ -106,6 +106,10 @@ impl BmcEndpoint { } } + pub fn switch_data(&self) -> Option<&SwitchData> { + self.metadata.as_ref().and_then(EndpointMetadata::as_switch) + } + pub fn credentials(&self) -> BmcCredentials { self.credentials.read().expect("lock poisoned").to_owned() } @@ -128,6 +132,13 @@ pub enum EndpointMetadata { } impl EndpointMetadata { + pub fn as_switch(&self) -> Option<&SwitchData> { + match self { + EndpointMetadata::Switch(switch) => Some(switch), + _ => None, + } + } + pub fn serial_number(&self) -> Option<&str> { match self { EndpointMetadata::Machine(machine) => machine.machine_serial.as_deref(), From db6dca42f64931cf735ba8ed8344427b70efad69 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Mon, 18 May 2026 23:45:12 +0000 Subject: [PATCH 02/12] feat(health): add SwitchEndpointRole to distinguish switch BMC from Host Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/src/api_client.rs | 5 ++++- crates/health/src/endpoint/mod.rs | 2 +- crates/health/src/endpoint/model.rs | 9 +++++++++ crates/health/src/endpoint/sources.rs | 5 ++++- 4 files changed, 18 insertions(+), 3 deletions(-) diff --git a/crates/health/src/api_client.rs b/crates/health/src/api_client.rs index b49c78310b..6390f69bc2 100644 --- a/crates/health/src/api_client.rs +++ b/crates/health/src/api_client.rs @@ -31,7 +31,7 @@ use url::Url; use crate::HealthError; use crate::endpoint::{ BmcAddr, BmcCredentials, BmcEndpoint, BoxFuture, CredentialProvider, EndpointMetadata, - EndpointSource, MachineData, PowerShelfData, SwitchData, + EndpointSource, MachineData, PowerShelfData, SwitchData, SwitchEndpointRole, }; #[derive(Clone)] @@ -268,6 +268,9 @@ impl ApiClientWrapper { .placement_in_rack .as_ref() .and_then(|placement| placement.tray_index), + endpoint_role: SwitchEndpointRole::Bmc, + is_primary: switch.is_primary, + nmxt_enabled: false, })), None, ) diff --git a/crates/health/src/endpoint/mod.rs b/crates/health/src/endpoint/mod.rs index 191679ad79..b44a985d45 100644 --- a/crates/health/src/endpoint/mod.rs +++ b/crates/health/src/endpoint/mod.rs @@ -20,7 +20,7 @@ mod sources; pub use model::{ BmcAddr, BmcCredentials, BmcEndpoint, BoxFuture, CredentialProvider, EndpointMetadata, - EndpointSource, MachineData, PowerShelfData, SwitchData, + EndpointSource, MachineData, PowerShelfData, SwitchData, SwitchEndpointRole, }; pub use sources::{CompositeEndpointSource, StaticEndpointSource}; diff --git a/crates/health/src/endpoint/model.rs b/crates/health/src/endpoint/model.rs index 825a3497c8..bebfba1d10 100644 --- a/crates/health/src/endpoint/model.rs +++ b/crates/health/src/endpoint/model.rs @@ -163,12 +163,21 @@ pub struct PowerShelfData { pub serial: String, } +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum SwitchEndpointRole { + Bmc, + Host, +} + #[derive(Clone, Debug)] pub struct SwitchData { pub id: Option, pub serial: String, pub slot_number: Option, pub tray_index: Option, + pub endpoint_role: SwitchEndpointRole, + pub is_primary: bool, + pub nmxt_enabled: bool, } #[derive(Clone)] diff --git a/crates/health/src/endpoint/sources.rs b/crates/health/src/endpoint/sources.rs index e2bc6bf8a0..f71af23227 100644 --- a/crates/health/src/endpoint/sources.rs +++ b/crates/health/src/endpoint/sources.rs @@ -26,7 +26,7 @@ use crate::HealthError; use crate::config::StaticBmcEndpoint; use crate::endpoint::{ BmcAddr, BmcCredentials, BmcEndpoint, BoxFuture, EndpointMetadata, EndpointSource, MachineData, - PowerShelfData, SwitchData, + PowerShelfData, SwitchData, SwitchEndpointRole, }; pub struct StaticEndpointSource { @@ -99,6 +99,9 @@ impl StaticEndpointSource { serial, slot_number: switch.slot_number, tray_index: switch.tray_index, + endpoint_role: SwitchEndpointRole::Host, + is_primary: false, + nmxt_enabled: false, })) } else if let Some(machine) = &cfg.machine { let machine_id = &machine.id; From 3fbc376006fbba725fcfbfb58384738549fbba65 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Tue, 19 May 2026 00:18:54 +0000 Subject: [PATCH 03/12] feat(health): add static config shape for switch bmc/host Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/example/config.example.toml | 10 +++++++++- crates/health/src/config.rs | 17 +++++++++++++++++ crates/health/src/endpoint/sources.rs | 13 +++++++++---- 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/crates/health/example/config.example.toml b/crates/health/example/config.example.toml index 12067a6f3f..5f1f3c5c65 100644 --- a/crates/health/example/config.example.toml +++ b/crates/health/example/config.example.toml @@ -46,7 +46,15 @@ port = 443 mac = "11:22:33:44:55:66" username = "admin" password = "secret" -switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SWITCH-001", slot_number = 7, tray_index = 3 } +switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SWITCH-BMC-001", endpoint_role = "bmc", slot_number = 7, tray_index = 3 } + +[[endpoint_sources.static_bmc_endpoints]] +ip = "10.0.1.2" +port = 443 +mac = "11:22:33:44:55:77" +username = "admin" +password = "secret" +switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SWITCH-HOST-001", endpoint_role = "host", is_primary = true, slot_number = 7, tray_index = 3 } [[endpoint_sources.static_bmc_endpoints]] ip = "10.0.2.1" diff --git a/crates/health/src/config.rs b/crates/health/src/config.rs index c94d99ce6b..837f1be96b 100644 --- a/crates/health/src/config.rs +++ b/crates/health/src/config.rs @@ -123,6 +123,17 @@ pub struct StaticPowerShelfEndpoint { pub serial: Option, } +#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Deserialize, serde::Serialize)] +#[serde(rename_all = "snake_case")] +pub enum StaticSwitchEndpointRole { + Bmc, + Host, +} + +fn default_static_switch_endpoint_role() -> StaticSwitchEndpointRole { + StaticSwitchEndpointRole::Host +} + #[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] #[serde(deny_unknown_fields)] pub struct StaticSwitchEndpoint { @@ -130,6 +141,12 @@ pub struct StaticSwitchEndpoint { pub serial: Option, pub slot_number: Option, pub tray_index: Option, + #[serde(default = "default_static_switch_endpoint_role")] + pub endpoint_role: StaticSwitchEndpointRole, + #[serde(default)] + pub is_primary: bool, + #[serde(default)] + pub nmxt_enabled: Option, } impl Debug for StaticBmcEndpoint { diff --git a/crates/health/src/endpoint/sources.rs b/crates/health/src/endpoint/sources.rs index f71af23227..6023f20c2b 100644 --- a/crates/health/src/endpoint/sources.rs +++ b/crates/health/src/endpoint/sources.rs @@ -23,7 +23,7 @@ use carbide_uuid::rack::RackId; use mac_address::MacAddress; use crate::HealthError; -use crate::config::StaticBmcEndpoint; +use crate::config::{StaticBmcEndpoint, StaticSwitchEndpointRole}; use crate::endpoint::{ BmcAddr, BmcCredentials, BmcEndpoint, BoxFuture, EndpointMetadata, EndpointSource, MachineData, PowerShelfData, SwitchData, SwitchEndpointRole, @@ -93,15 +93,20 @@ impl StaticEndpointSource { .clone() .or_else(|| switch.id.clone()) .unwrap_or_else(|| cfg.mac.clone()); + let endpoint_role = match switch.endpoint_role { + StaticSwitchEndpointRole::Bmc => SwitchEndpointRole::Bmc, + StaticSwitchEndpointRole::Host => SwitchEndpointRole::Host, + }; + let nmxt_enabled = switch.nmxt_enabled.unwrap_or(switch.is_primary); Some(EndpointMetadata::Switch(SwitchData { id, serial, slot_number: switch.slot_number, tray_index: switch.tray_index, - endpoint_role: SwitchEndpointRole::Host, - is_primary: false, - nmxt_enabled: false, + endpoint_role, + is_primary: switch.is_primary, + nmxt_enabled, })) } else if let Some(machine) = &cfg.machine { let machine_id = &machine.id; From 817c2f34178f317263c4b8eb699f8e72cf734185 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Tue, 19 May 2026 00:32:51 +0000 Subject: [PATCH 04/12] feat(health): gate switch collection by endpoint role (host/bmc) Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/src/discovery/spawn.rs | 41 ++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/crates/health/src/discovery/spawn.rs b/crates/health/src/discovery/spawn.rs index d9b91460f9..d33a9e0753 100644 --- a/crates/health/src/discovery/spawn.rs +++ b/crates/health/src/discovery/spawn.rs @@ -28,7 +28,7 @@ use crate::collectors::{ StreamingCollectorStartContext, }; use crate::config::{Configurable, LogCollectionMode}; -use crate::endpoint::BmcEndpoint; +use crate::endpoint::{BmcEndpoint, SwitchEndpointRole}; use crate::sink::DataSink; fn logs_state_file_path(template: &str, endpoint_id: &str) -> PathBuf { @@ -41,13 +41,28 @@ pub(super) async fn spawn_collectors_for_endpoint( data_sink: Option>, metrics_prefix: &str, ) -> Result<(), HealthError> { - if endpoint.switch_data().is_some() { - spawn_switch_collectors(ctx, endpoint, data_sink, metrics_prefix) - } else { - spawn_generic_redfish_collectors(ctx, endpoint, data_sink, metrics_prefix) + match endpoint.switch_data().map(|switch| switch.endpoint_role) { + Some(SwitchEndpointRole::Host) => { + spawn_switch_host_collectors(ctx, endpoint, data_sink, metrics_prefix) + } + Some(SwitchEndpointRole::Bmc) | None => { + spawn_generic_redfish_collectors(ctx, endpoint, data_sink, metrics_prefix) + } } } +fn is_switch_host_endpoint(endpoint: &BmcEndpoint) -> bool { + endpoint + .switch_data() + .is_some_and(|switch| switch.endpoint_role == SwitchEndpointRole::Host) +} + +fn switch_host_nmxt_enabled(endpoint: &BmcEndpoint) -> bool { + endpoint.switch_data().is_some_and(|switch| { + switch.endpoint_role == SwitchEndpointRole::Host && switch.nmxt_enabled + }) +} + fn spawn_generic_redfish_collectors( ctx: &mut DiscoveryLoopContext, endpoint: &Arc, @@ -267,7 +282,7 @@ fn spawn_generic_redfish_collectors( Ok(()) } -fn spawn_switch_collectors( +fn spawn_switch_host_collectors( ctx: &mut DiscoveryLoopContext, endpoint: &Arc, data_sink: Option>, @@ -276,7 +291,8 @@ fn spawn_switch_collectors( let key = endpoint.key(); let endpoint_arc = endpoint.clone(); - if let Configurable::Enabled(nmxt_cfg) = &ctx.nmxt_config + if switch_host_nmxt_enabled(endpoint) + && let Configurable::Enabled(nmxt_cfg) = &ctx.nmxt_config && !ctx.collectors.contains(CollectorKind::Nmxt, &key) { let collector_registry = Arc::new( @@ -304,20 +320,21 @@ fn spawn_switch_collectors( tracing::info!( endpoint_key = %key, total_nmxt_collectors = ctx.collectors.len(CollectorKind::Nmxt), - "Started NMX-T collection for switch endpoint" + "Started NMX-T collection for switch host endpoint" ); } Err(error) => { tracing::error!( ?error, - "Could not start NMX-T collector for: {:?}", + "Could not start NMX-T collector for switch host: {:?}", endpoint.addr ) } } } - if let Configurable::Enabled(nvue_cfg) = &ctx.nvue_config + if is_switch_host_endpoint(endpoint) + && let Configurable::Enabled(nvue_cfg) = &ctx.nvue_config && let Configurable::Enabled(rest_cfg) = &nvue_cfg.rest && !ctx.collectors.contains(CollectorKind::NvueRest, &key) { @@ -346,13 +363,13 @@ fn spawn_switch_collectors( tracing::info!( endpoint_key = %key, total_nvue_rest_collectors = ctx.collectors.len(CollectorKind::NvueRest), - "Started NVUE REST collection for switch endpoint" + "Started NVUE REST collection for switch host endpoint" ); } Err(error) => { tracing::error!( ?error, - "Could not start NVUE REST collector for: {:?}", + "Could not start NVUE REST collector for switch host: {:?}", endpoint.addr ) } From 83f05fe82a7fc61f8c51e92807394609c5d6bc73 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Tue, 19 May 2026 01:31:31 +0000 Subject: [PATCH 05/12] feat(api): expose switch host endpoints and nvos credentials for host monitoring Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/api/src/api.rs | 14 ++++++++ crates/api/src/auth/internal_rbac_rules.rs | 5 +++ crates/api/src/handlers/credential.rs | 31 ++++++++++++++++ crates/api/src/handlers/switch.rs | 41 ++++++++++++++++++++++ crates/rpc/build.rs | 15 +++++++- crates/rpc/proto/forge.proto | 17 +++++++++ 6 files changed, 122 insertions(+), 1 deletion(-) diff --git a/crates/api/src/api.rs b/crates/api/src/api.rs index 6eeed31538..0689bd244b 100644 --- a/crates/api/src/api.rs +++ b/crates/api/src/api.rs @@ -366,6 +366,13 @@ impl Forge for Api { crate::handlers::switch::find_by_ids(self, request).await } + async fn find_switch_host_endpoints( + &self, + request: Request, + ) -> Result, Status> { + crate::handlers::switch::find_host_endpoints(self, request).await + } + async fn delete_switch( &self, request: Request, @@ -899,6 +906,13 @@ impl Forge for Api { crate::handlers::credential::get_bmc_credentals(self, request).await } + async fn get_switch_nvos_credentials( + &self, + request: Request, + ) -> Result, Status> { + crate::handlers::credential::get_switch_nvos_credentials(self, request).await + } + /// Network status of each managed host, as reported by forge-dpu-agent. /// For use by forge-admin-cli /// diff --git a/crates/api/src/auth/internal_rbac_rules.rs b/crates/api/src/auth/internal_rbac_rules.rs index 57e2d2ab09..fba8237069 100644 --- a/crates/api/src/auth/internal_rbac_rules.rs +++ b/crates/api/src/auth/internal_rbac_rules.rs @@ -249,6 +249,7 @@ impl InternalRBACRules { x.perm("DeleteTenantKeyset", vec![SiteAgent]); x.perm("ValidateTenantPublicKey", vec![SiteAgent, Ssh, SshRs]); x.perm("GetBmcCredentials", vec![Health]); + x.perm("GetSwitchNvosCredentials", vec![Health]); x.perm("GetAllManagedHostNetworkStatus", vec![ForgeAdminCLI]); x.perm( "GetSiteExplorationReport", @@ -718,6 +719,10 @@ impl InternalRBACRules { "FindSwitchesByIds", vec![ForgeAdminCLI, Machineatron, Flow, Health], ); + x.perm( + "FindSwitchHostEndpoints", + vec![ForgeAdminCLI, Machineatron, Flow, Health], + ); x.perm("CreateSwitch", vec![ForgeAdminCLI, Machineatron]); x.perm("DeleteSwitch", vec![ForgeAdminCLI, Machineatron]); x.perm("AddExpectedSwitch", vec![ForgeAdminCLI, Machineatron, Flow]); diff --git a/crates/api/src/handlers/credential.rs b/crates/api/src/handlers/credential.rs index f93a0ef3eb..140fb6eae2 100644 --- a/crates/api/src/handlers/credential.rs +++ b/crates/api/src/handlers/credential.rs @@ -424,6 +424,37 @@ pub(crate) async fn get_bmc_credentals( })) } +pub(crate) async fn get_switch_nvos_credentials( + api: &Api, + request: tonic::Request, +) -> Result, tonic::Status> { + crate::api::log_request_data(&request); + + let req = request.into_inner(); + + let bmc_mac_address: mac_address::MacAddress = req + .bmc_mac_addr + .parse() + .map_err(CarbideError::MacAddressParseError)?; + + let credentials = api + .credential_manager + .get_credentials(&CredentialKey::SwitchNvosAdmin { bmc_mac_address }) + .await + .map_err(|e| CarbideError::internal(e.to_string()))? + .ok_or_else(|| CarbideError::internal("missing credentials".to_string()))?; + + let Credentials::UsernamePassword { username, password } = credentials; + + Ok(Response::new(rpc::GetBmcCredentialsResponse { + credentials: Some(rpc::BmcCredentials { + r#type: Some(rpc::bmc_credentials::Type::UsernamePassword( + rpc::UsernamePassword { username, password }, + )), + }), + })) +} + async fn set_sitewide_bmc_root_credentials( api: &Api, password: String, diff --git a/crates/api/src/handlers/switch.rs b/crates/api/src/handlers/switch.rs index 599b50b085..c8479f1486 100644 --- a/crates/api/src/handlers/switch.rs +++ b/crates/api/src/handlers/switch.rs @@ -203,6 +203,47 @@ pub async fn find_by_ids( Ok(Response::new(rpc::SwitchList { switches })) } +pub async fn find_host_endpoints( + api: &Api, + request: Request, +) -> Result, Status> { + log_request_data(&request); + + let switch_ids = request.into_inner().switch_ids; + + let max_find_by_ids = api.runtime_config.max_find_by_ids as usize; + if switch_ids.len() > max_find_by_ids { + return Err(CarbideError::InvalidArgument(format!( + "no more than {max_find_by_ids} IDs can be accepted" + )) + .into()); + } else if switch_ids.is_empty() { + return Err( + CarbideError::InvalidArgument("at least one ID must be provided".to_string()).into(), + ); + } + + let rows = db_switch::find_switch_endpoints_by_ids(&mut api.db_reader(), &switch_ids).await?; + + let endpoints = rows + .into_iter() + .filter_map(|row| { + let (Some(host_mac), Some(host_ip)) = (row.nvos_mac, row.nvos_ip) else { + return None; + }; + + Some(rpc::SwitchHostEndpoint { + switch_id: Some(row.switch_id), + bmc_mac: row.bmc_mac.to_string(), + host_mac: host_mac.to_string(), + host_ip: host_ip.to_string(), + }) + }) + .collect(); + + Ok(Response::new(rpc::SwitchHostEndpointList { endpoints })) +} + pub async fn find_switch_state_histories( api: &Api, request: Request, diff --git a/crates/rpc/build.rs b/crates/rpc/build.rs index 03ac72ab11..16bb61480c 100644 --- a/crates/rpc/build.rs +++ b/crates/rpc/build.rs @@ -837,7 +837,20 @@ fn main() -> Result<(), Box> { .type_attribute( "forge.GetBmcCredentialsRequest", "#[derive(serde::Serialize)]", - ).type_attribute( + ) + .type_attribute( + "forge.GetSwitchNvosCredentialsRequest", + "#[derive(serde::Serialize)]", + ) + .type_attribute( + "forge.SwitchHostEndpoint", + "#[derive(serde::Serialize)]", + ) + .type_attribute( + "forge.SwitchHostEndpointList", + "#[derive(serde::Serialize)]", + ) + .type_attribute( "forge.PlacementInRack", "#[derive(serde::Serialize)]", ) diff --git a/crates/rpc/proto/forge.proto b/crates/rpc/proto/forge.proto index 5b1783c71a..a476af963d 100644 --- a/crates/rpc/proto/forge.proto +++ b/crates/rpc/proto/forge.proto @@ -101,6 +101,7 @@ service Forge { rpc FindSwitches(SwitchQuery) returns (SwitchList); rpc FindSwitchIds(SwitchSearchFilter) returns (SwitchIdList); rpc FindSwitchesByIds(SwitchesByIdsRequest) returns (SwitchList); + rpc FindSwitchHostEndpoints(SwitchesByIdsRequest) returns (SwitchHostEndpointList); rpc DeleteSwitch(SwitchDeletionRequest) returns (SwitchDeletionResult); // Force deletes a Switch and optionally its associated interfaces from the database. rpc AdminForceDeleteSwitch(AdminForceDeleteSwitchRequest) returns (AdminForceDeleteSwitchResponse); @@ -263,6 +264,7 @@ service Forge { // Query for BMC Credentials rpc GetBmcCredentials(GetBmcCredentialsRequest) returns (GetBmcCredentialsResponse); + rpc GetSwitchNvosCredentials(GetSwitchNvosCredentialsRequest) returns (GetBmcCredentialsResponse); // Admin CLI actions @@ -2192,6 +2194,17 @@ message SwitchesByIdsRequest { repeated common.SwitchId switch_ids = 1; } +message SwitchHostEndpoint { + common.SwitchId switch_id = 1; + string bmc_mac = 2; + string host_mac = 3; + string host_ip = 4; +} + +message SwitchHostEndpointList { + repeated SwitchHostEndpoint endpoints = 1; +} + message ExpectedSwitch { string bmc_mac_address = 1; string bmc_username = 2; @@ -3741,6 +3754,10 @@ message GetBmcCredentialsRequest { string mac_addr = 1; } +message GetSwitchNvosCredentialsRequest { + string bmc_mac_addr = 1; +} + message GetBmcCredentialsResponse { BmcCredentials credentials = 1; } From 039a020eb9b9ea90056f0daeb5c1024a4ed2c673 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Tue, 19 May 2026 16:03:18 +0000 Subject: [PATCH 06/12] feat(health): wire switch host endpoint to health discovery Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/src/api_client.rs | 188 ++++++++++++++++++++++++++------ 1 file changed, 154 insertions(+), 34 deletions(-) diff --git a/crates/health/src/api_client.rs b/crates/health/src/api_client.rs index 6390f69bc2..d8f5c75161 100644 --- a/crates/health/src/api_client.rs +++ b/crates/health/src/api_client.rs @@ -15,12 +15,14 @@ * limitations under the License. */ +use std::collections::HashMap; use std::convert::TryFrom; use std::net::IpAddr; use std::str::FromStr; use std::sync::{Arc, RwLock}; use carbide_uuid::rack::RackId; +use carbide_uuid::switch::SwitchId; use forge_tls::client_config::ClientCert; use mac_address::MacAddress; use rpc::forge::MachineSearchConfig; @@ -42,6 +44,13 @@ pub struct ApiClientWrapper { #[derive(Clone)] struct ApiCredentialProvider { client: ForgeApiClient, + kind: ApiCredentialKind, +} + +#[derive(Clone)] +enum ApiCredentialKind { + Bmc, + SwitchNvosAdmin { bmc_mac: MacAddress }, } impl CredentialProvider for ApiCredentialProvider { @@ -50,24 +59,68 @@ impl CredentialProvider for ApiCredentialProvider { endpoint: &'a BmcAddr, ) -> BoxFuture<'a, Result> { Box::pin(async move { - let request = rpc::forge::GetBmcCredentialsRequest { - mac_addr: endpoint.mac.to_string(), + let response = match &self.kind { + ApiCredentialKind::Bmc => { + let request = rpc::forge::GetBmcCredentialsRequest { + mac_addr: endpoint.mac.to_string(), + }; + + self.client + .get_bmc_credentials(request) + .await + .map_err(HealthError::ApiInvocationError)? + } + ApiCredentialKind::SwitchNvosAdmin { bmc_mac } => { + let request = rpc::forge::GetSwitchNvosCredentialsRequest { + bmc_mac_addr: bmc_mac.to_string(), + }; + + self.client + .get_switch_nvos_credentials(request) + .await + .map_err(HealthError::ApiInvocationError)? + } }; - self.client - .get_bmc_credentials(request) - .await - .map_err(HealthError::ApiInvocationError)? + response .credentials .and_then(|credentials| credentials.r#type) .map(Into::into) .ok_or_else(|| { - HealthError::GenericError("missing BMC credentials in API response".to_string()) + HealthError::GenericError("missing credentials in API response".to_string()) }) }) } } +fn switch_endpoint_metadata( + switch: &rpc::forge::Switch, + endpoint_role: SwitchEndpointRole, + nmxt_enabled: bool, +) -> Result { + let serial = switch + .config + .as_ref() + .map(|config| config.name.clone()) + .ok_or_else(|| HealthError::GenericError("switch endpoint does not have serial".into()))?; + + Ok(EndpointMetadata::Switch(SwitchData { + id: switch.id, + serial, + slot_number: switch + .placement_in_rack + .as_ref() + .and_then(|placement| placement.slot_number), + tray_index: switch + .placement_in_rack + .as_ref() + .and_then(|placement| placement.tray_index), + endpoint_role, + is_primary: switch.is_primary, + nmxt_enabled, + })) +} + impl ApiClientWrapper { pub fn new(root_ca: String, client_cert: String, client_key: String, api_url: &Url) -> Self { let client_config = ForgeClientConfig::new( @@ -146,10 +199,20 @@ impl ApiClientWrapper { match self.client.find_switches(switch_request).await { Ok(response) => { + let switches = response.switches; + let switch_ids = switches + .iter() + .filter_map(|switch| switch.id) + .collect::>(); + let switches_by_id = switches + .iter() + .filter_map(|switch| switch.id.map(|id| (id, switch))) + .collect::>(); + let mut endpoints = Vec::new(); - for switch in response.switches { - match self.extract_switch_endpoint(&switch).await { + for switch in &switches { + match self.extract_switch_endpoint(switch).await { Ok(endpoint) => endpoints.push(Arc::new(endpoint)), Err(error) => tracing::warn!( ?switch, @@ -159,6 +222,33 @@ impl ApiClientWrapper { } } + if !switch_ids.is_empty() { + match self + .client + .find_switch_host_endpoints(rpc::forge::SwitchesByIdsRequest { switch_ids }) + .await + { + Ok(response) => { + for host_endpoint in response.endpoints { + match self + .extract_switch_host_endpoint(&host_endpoint, &switches_by_id) + .await + { + Ok(endpoint) => endpoints.push(Arc::new(endpoint)), + Err(error) => tracing::warn!( + ?host_endpoint, + ?error, + "Could not add switch host endpoint due to error" + ), + } + } + } + Err(error) => { + tracing::warn!(?error, "Failed to fetch switch host endpoints") + } + } + } + tracing::debug!(count = endpoints.len(), "Fetched switch endpoints"); endpoints } @@ -233,8 +323,13 @@ impl ApiClientWrapper { }) }); - self.endpoint_with_auth(addr, metadata, machine.rack_id.clone()) - .await + self.endpoint_with_auth( + addr, + metadata, + machine.rack_id.clone(), + ApiCredentialKind::Bmc, + ) + .await } async fn extract_switch_endpoint( @@ -247,32 +342,54 @@ impl ApiClientWrapper { )); }; let addr = BmcAddr::try_from(bmc_info)?; - let serial = switch - .config - .as_ref() - .map(|config| config.name.clone()) - .ok_or(HealthError::GenericError( - "Switch endpont does not have serial".to_string(), - ))?; self.endpoint_with_auth( addr, - Some(EndpointMetadata::Switch(SwitchData { - id: switch.id, - serial, - slot_number: switch - .placement_in_rack - .as_ref() - .and_then(|placement| placement.slot_number), - tray_index: switch - .placement_in_rack - .as_ref() - .and_then(|placement| placement.tray_index), - endpoint_role: SwitchEndpointRole::Bmc, - is_primary: switch.is_primary, - nmxt_enabled: false, - })), - None, + Some(switch_endpoint_metadata( + switch, + SwitchEndpointRole::Bmc, + false, + )?), + switch.rack_id.clone(), + ApiCredentialKind::Bmc, + ) + .await + } + + async fn extract_switch_host_endpoint( + &self, + host_endpoint: &rpc::forge::SwitchHostEndpoint, + switches_by_id: &HashMap, + ) -> Result { + let switch_id = host_endpoint.switch_id.ok_or_else(|| { + HealthError::GenericError("switch host endpoint missing switch ID".to_string()) + })?; + let switch = *switches_by_id.get(&switch_id).ok_or_else(|| { + HealthError::GenericError( + "switch host endpoint did not match fetched switch".to_string(), + ) + })?; + let addr = BmcAddr { + ip: host_endpoint + .host_ip + .parse::() + .map_err(|error| HealthError::GenericError(error.to_string()))?, + port: None, + mac: MacAddress::from_str(&host_endpoint.host_mac) + .map_err(|error| HealthError::GenericError(error.to_string()))?, + }; + let bmc_mac = MacAddress::from_str(&host_endpoint.bmc_mac) + .map_err(|error| HealthError::GenericError(error.to_string()))?; + + self.endpoint_with_auth( + addr, + Some(switch_endpoint_metadata( + switch, + SwitchEndpointRole::Host, + switch.is_primary, + )?), + switch.rack_id.clone(), + ApiCredentialKind::SwitchNvosAdmin { bmc_mac }, ) .await } @@ -302,6 +419,7 @@ impl ApiClientWrapper { serial, })), None, + ApiCredentialKind::Bmc, ) .await } @@ -311,9 +429,11 @@ impl ApiClientWrapper { addr: BmcAddr, metadata: Option, rack_id: Option, + credential_kind: ApiCredentialKind, ) -> Result { let provider = ApiCredentialProvider { client: self.client.clone(), + kind: credential_kind, }; let credentials = provider.fetch_credentials(&addr).await?; From 58e8c24408e3677b4f6cb578d65dac0d857f5d2e Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Tue, 19 May 2026 22:15:17 +0000 Subject: [PATCH 07/12] feat(health): discover switch bmc and host endpoints, respectively Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/api/src/tests/credential.rs | 40 ++++++- crates/api/src/tests/switch_find.rs | 57 +++++++++ crates/health/src/config.rs | 87 +++++++++++++- crates/health/src/discovery/iteration.rs | 7 +- crates/health/src/discovery/spawn.rs | 142 ++++++++++++++++++++++- crates/health/src/endpoint/mod.rs | 6 + crates/health/src/otlp/convert.rs | 5 +- crates/health/src/sink/prometheus.rs | 5 +- 8 files changed, 339 insertions(+), 10 deletions(-) diff --git a/crates/api/src/tests/credential.rs b/crates/api/src/tests/credential.rs index 2fa72f01ba..f9d25a72d1 100644 --- a/crates/api/src/tests/credential.rs +++ b/crates/api/src/tests/credential.rs @@ -16,7 +16,7 @@ */ use forge_secrets::credentials::{ - BgpCredentialType, CredentialKey, CredentialReader, CredentialType, Credentials, + BgpCredentialType, CredentialKey, CredentialReader, CredentialType, Credentials, CredentialWriter, }; use rpc::forge::forge_server::Forge; use rpc::forge::{ @@ -243,3 +243,41 @@ async fn test_create_bgp_credential_validates_max_password_length(pool: sqlx::Pg }) ); } + +#[crate::sqlx_test] +async fn test_get_switch_nvos_credentials(pool: sqlx::PgPool) -> eyre::Result<()> { + let env = create_test_env(pool).await; + let bmc_mac_address = "00:11:22:33:44:55".parse()?; + + env.test_credential_manager + .set_credentials( + &CredentialKey::SwitchNvosAdmin { bmc_mac_address }, + &Credentials::UsernamePassword { + username: "nvos-admin".to_string(), + password: "nvos-secret".to_string(), + }, + ) + .await?; + + let response = env + .api + .get_switch_nvos_credentials(tonic::Request::new( + rpc::forge::GetSwitchNvosCredentialsRequest { + bmc_mac_addr: bmc_mac_address.to_string(), + }, + )) + .await? + .into_inner(); + + let credentials = response.credentials.expect("credentials"); + let Some(rpc::forge::bmc_credentials::Type::UsernamePassword(username_password)) = + credentials.r#type + else { + panic!("expected username/password credentials"); + }; + + assert_eq!(username_password.username, "nvos-admin"); + assert_eq!(username_password.password, "nvos-secret"); + + Ok(()) +} diff --git a/crates/api/src/tests/switch_find.rs b/crates/api/src/tests/switch_find.rs index bbbe39904a..aa3c574677 100644 --- a/crates/api/src/tests/switch_find.rs +++ b/crates/api/src/tests/switch_find.rs @@ -262,3 +262,60 @@ async fn test_find_switches_by_ids_response_fields( Ok(()) } + +#[crate::sqlx_test] +async fn test_find_switch_host_endpoints_returns_resolved_nvos_host( + pool: sqlx::PgPool, +) -> Result<(), Box> { + let env = create_test_env(pool).await; + let switch_id = new_switch(&env, Some("Switch1".to_string()), None).await?; + + let mut rows = db::switch::find_switch_endpoints_by_ids(&env.pool, &[switch_id]).await?; + let expected = rows.pop().expect("switch endpoint row"); + let host_mac = expected.nvos_mac.expect("nvos mac"); + let host_ip = expected.nvos_ip.expect("nvos ip"); + + let response = env + .api + .find_switch_host_endpoints(tonic::Request::new(rpc::forge::SwitchesByIdsRequest { + switch_ids: vec![switch_id], + })) + .await? + .into_inner(); + + assert_eq!(response.endpoints.len(), 1); + assert_eq!(response.endpoints[0].switch_id, Some(switch_id)); + assert_eq!(response.endpoints[0].bmc_mac, expected.bmc_mac.to_string()); + assert_eq!(response.endpoints[0].host_mac, host_mac.to_string()); + assert_eq!(response.endpoints[0].host_ip, host_ip.to_string()); + + Ok(()) +} + +#[crate::sqlx_test] +async fn test_find_switch_host_endpoints_skips_switch_without_nvos_host( + pool: sqlx::PgPool, +) -> Result<(), Box> { + let env = create_test_env(pool).await; + let switch_id = new_switch(&env, Some("Switch1".to_string()), None).await?; + let rows = db::switch::find_switch_endpoints_by_ids(&env.pool, &[switch_id]).await?; + let bmc_mac = rows.first().expect("switch endpoint row").bmc_mac; + + { + let mut txn = env.pool.begin().await?; + db::expected_switch::update_nvos_mac_addresses(txn.as_mut(), bmc_mac, &[]).await?; + txn.commit().await?; + } + + let response = env + .api + .find_switch_host_endpoints(tonic::Request::new(rpc::forge::SwitchesByIdsRequest { + switch_ids: vec![switch_id], + })) + .await? + .into_inner(); + + assert!(response.endpoints.is_empty()); + + Ok(()) +} diff --git a/crates/health/src/config.rs b/crates/health/src/config.rs index 837f1be96b..eba3028ac9 100644 --- a/crates/health/src/config.rs +++ b/crates/health/src/config.rs @@ -1424,6 +1424,66 @@ power_shelf = { id = "fps100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1 ); } + #[test] + fn test_static_switch_host_accepts_primary_without_nmxt_override() { + let toml_content = r#" +[endpoint_sources.carbide_api] +enabled = false + +[[endpoint_sources.static_bmc_endpoints]] +ip = "10.0.1.1" +mac = "11:22:33:44:55:66" +username = "admin" +password = "pass" +switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SW-001", endpoint_role = "host", is_primary = true } +"#; + + let config: Config = Figment::new() + .merge(Serialized::defaults(Config::default())) + .merge(Toml::string(toml_content)) + .extract() + .expect("static switch host config should parse"); + + let switch = config.endpoint_sources.static_bmc_endpoints[0] + .switch + .as_ref() + .expect("switch metadata"); + + assert_eq!(switch.endpoint_role, StaticSwitchEndpointRole::Host); + assert!(switch.is_primary); + assert_eq!(switch.nmxt_enabled, None); + } + + #[test] + fn test_static_switch_host_accepts_nmxt_override() { + let toml_content = r#" +[endpoint_sources.carbide_api] +enabled = false + +[[endpoint_sources.static_bmc_endpoints]] +ip = "10.0.1.2" +mac = "11:22:33:44:55:77" +username = "admin" +password = "pass" +switch = { id = "fsw100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0", serial = "SN-SW-002", endpoint_role = "host", is_primary = false, nmxt_enabled = true } +"#; + + let config: Config = Figment::new() + .merge(Serialized::defaults(Config::default())) + .merge(Toml::string(toml_content)) + .extract() + .expect("static switch host config should parse"); + + let switch = config.endpoint_sources.static_bmc_endpoints[0] + .switch + .as_ref() + .expect("switch metadata"); + + assert_eq!(switch.endpoint_role, StaticSwitchEndpointRole::Host); + assert!(!switch.is_primary); + assert_eq!(switch.nmxt_enabled, Some(true)); + } + #[test] fn test_static_machine_endpoint_accepts_placement_and_nvlink_metadata() { let toml_content = r#" @@ -1492,7 +1552,7 @@ switch = { serial = "SN-SW-001" } .extract() .expect("could not parse config toml file"); - assert_eq!(config.endpoint_sources.static_bmc_endpoints.len(), 3); + assert_eq!(config.endpoint_sources.static_bmc_endpoints.len(), 4); assert!( config.endpoint_sources.static_bmc_endpoints[0] .switch @@ -1521,17 +1581,38 @@ switch = { serial = "SN-SW-001" } .switch .as_ref() .and_then(|switch| switch.serial.as_deref()), - Some("SN-SWITCH-001") + Some("SN-SWITCH-BMC-001") + ); + assert_eq!( + config.endpoint_sources.static_bmc_endpoints[1] + .switch + .as_ref() + .map(|switch| switch.endpoint_role), + Some(StaticSwitchEndpointRole::Bmc) + ); + assert_eq!( + config.endpoint_sources.static_bmc_endpoints[2] + .switch + .as_ref() + .and_then(|switch| switch.serial.as_deref()), + Some("SN-SWITCH-HOST-001") ); assert_eq!( config.endpoint_sources.static_bmc_endpoints[2] + .switch + .as_ref() + .map(|switch| switch.endpoint_role), + Some(StaticSwitchEndpointRole::Host) + ); + assert_eq!( + config.endpoint_sources.static_bmc_endpoints[3] .power_shelf .as_ref() .and_then(|power_shelf| power_shelf.id.as_deref()), Some("fps100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0") ); assert_eq!( - config.endpoint_sources.static_bmc_endpoints[2] + config.endpoint_sources.static_bmc_endpoints[3] .power_shelf .as_ref() .and_then(|power_shelf| power_shelf.serial.as_deref()), diff --git a/crates/health/src/discovery/iteration.rs b/crates/health/src/discovery/iteration.rs index 5eecd13878..4bb1407f5c 100644 --- a/crates/health/src/discovery/iteration.rs +++ b/crates/health/src/discovery/iteration.rs @@ -100,7 +100,9 @@ mod tests { use mac_address::MacAddress; use super::*; - use crate::endpoint::{BmcAddr, BmcCredentials, EndpointMetadata, SwitchData}; + use crate::endpoint::{ + BmcAddr, BmcCredentials, EndpointMetadata, SwitchData, SwitchEndpointRole, + }; fn endpoint(mac: MacAddress, switch: bool) -> Arc { Arc::new(BmcEndpoint::with_fixed_credentials( @@ -119,6 +121,9 @@ mod tests { serial: format!("serial-{mac}"), slot_number: None, tray_index: None, + endpoint_role: SwitchEndpointRole::Host, + is_primary: false, + nmxt_enabled: false, })) } else { None diff --git a/crates/health/src/discovery/spawn.rs b/crates/health/src/discovery/spawn.rs index d33a9e0753..d10f06ccd3 100644 --- a/crates/health/src/discovery/spawn.rs +++ b/crates/health/src/discovery/spawn.rs @@ -388,7 +388,9 @@ mod tests { use super::*; use crate::config::{Config, Configurable}; - use crate::endpoint::{BmcAddr, BmcCredentials, EndpointMetadata, MachineData, SwitchData}; + use crate::endpoint::{ + BmcAddr, BmcCredentials, EndpointMetadata, MachineData, SwitchData, SwitchEndpointRole, + }; use crate::limiter::{NoopLimiter, RateLimiter}; use crate::metrics::MetricsManager; use crate::sink::{CollectorEvent, EventContext}; @@ -431,15 +433,27 @@ mod tests { )) } - fn switch_metadata() -> EndpointMetadata { + fn switch_metadata_with_role( + endpoint_role: SwitchEndpointRole, + is_primary: bool, + nmxt_enabled: bool, + serial: &str, + ) -> EndpointMetadata { EndpointMetadata::Switch(SwitchData { id: None, - serial: "switch-serial-1".to_string(), + serial: serial.to_string(), slot_number: None, tray_index: None, + endpoint_role, + is_primary, + nmxt_enabled, }) } + fn switch_metadata() -> EndpointMetadata { + switch_metadata_with_role(SwitchEndpointRole::Host, false, false, "switch-serial-1") + } + fn machine_metadata() -> EndpointMetadata { EndpointMetadata::Machine(MachineData { machine_id: "fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0" @@ -508,6 +522,128 @@ mod tests { assert_eq!(ctx.collectors.len(CollectorKind::LeakDetector), 0); } + #[tokio::test] + async fn test_switch_bmc_endpoint_starts_redfish_but_not_switch_host_collectors() { + let mut config = Config::default(); + config.collectors.sensors = Configurable::Enabled(Default::default()); + config.collectors.logs = Configurable::Disabled; + config.collectors.firmware = Configurable::Disabled; + config.collectors.leak_detector = Configurable::Disabled; + config.collectors.nmxt = Configurable::Enabled(Default::default()); + config.collectors.nvue = Configurable::Enabled(Default::default()); + + let mut ctx = context_with_config(config, "test_switch_bmc_redfish_only"); + let endpoint = test_endpoint( + Ipv4Addr::new(10, 0, 0, 8), + "55:66:77:88:99:bb", + Some(switch_metadata_with_role( + SwitchEndpointRole::Bmc, + true, + false, + "switch-bmc", + )), + ); + + spawn_collectors_for_endpoint(&mut ctx, &endpoint, None, "test_switch_bmc_redfish_only") + .await + .expect("spawn should succeed"); + + assert_eq!(ctx.collectors.len(CollectorKind::Sensor), 1); + assert_eq!(ctx.collectors.len(CollectorKind::Nmxt), 0); + assert_eq!(ctx.collectors.len(CollectorKind::NvueRest), 0); + } + + #[tokio::test] + async fn test_switch_host_primary_starts_nmxt_and_nvue_rest_when_globally_enabled() { + let mut config = Config::default(); + config.collectors.sensors = Configurable::Disabled; + config.collectors.logs = Configurable::Disabled; + config.collectors.firmware = Configurable::Disabled; + config.collectors.leak_detector = Configurable::Disabled; + config.collectors.nmxt = Configurable::Enabled(Default::default()); + config.collectors.nvue = Configurable::Enabled(Default::default()); + + let mut ctx = context_with_config(config, "test_switch_host_nmxt_nvue_enabled"); + let endpoint = test_endpoint( + Ipv4Addr::new(10, 0, 0, 9), + "55:66:77:88:99:cc", + Some(switch_metadata_with_role( + SwitchEndpointRole::Host, + true, + true, + "switch-host", + )), + ); + + spawn_collectors_for_endpoint(&mut ctx, &endpoint, None, "test") + .await + .expect("spawn should succeed"); + + assert_eq!(ctx.collectors.len(CollectorKind::Sensor), 0); + assert_eq!(ctx.collectors.len(CollectorKind::Nmxt), 1); + assert_eq!(ctx.collectors.len(CollectorKind::NvueRest), 1); + } + + #[tokio::test] + async fn test_switch_host_policy_gates_nmxt_but_not_nvue_rest() { + let mut config = Config::default(); + config.collectors.sensors = Configurable::Disabled; + config.collectors.logs = Configurable::Disabled; + config.collectors.firmware = Configurable::Disabled; + config.collectors.leak_detector = Configurable::Disabled; + config.collectors.nmxt = Configurable::Enabled(Default::default()); + config.collectors.nvue = Configurable::Enabled(Default::default()); + + let mut ctx = context_with_config(config, "test_switch_host_nmxt_endpoint_disabled"); + let endpoint = test_endpoint( + Ipv4Addr::new(10, 0, 0, 10), + "55:66:77:88:99:dd", + Some(switch_metadata_with_role( + SwitchEndpointRole::Host, + false, + false, + "switch-host", + )), + ); + + spawn_collectors_for_endpoint(&mut ctx, &endpoint, None, "test") + .await + .expect("spawn should succeed"); + + assert_eq!(ctx.collectors.len(CollectorKind::Nmxt), 0); + assert_eq!(ctx.collectors.len(CollectorKind::NvueRest), 1); + } + + #[tokio::test] + async fn test_switch_host_does_not_start_host_collectors_when_globally_disabled() { + let mut config = Config::default(); + config.collectors.sensors = Configurable::Disabled; + config.collectors.logs = Configurable::Disabled; + config.collectors.firmware = Configurable::Disabled; + config.collectors.leak_detector = Configurable::Disabled; + config.collectors.nmxt = Configurable::Disabled; + config.collectors.nvue = Configurable::Disabled; + + let mut ctx = context_with_config(config, "test_switch_host_collectors_global_disabled"); + let endpoint = test_endpoint( + Ipv4Addr::new(10, 0, 0, 11), + "55:66:77:88:99:ee", + Some(switch_metadata_with_role( + SwitchEndpointRole::Host, + true, + true, + "switch-host", + )), + ); + + spawn_collectors_for_endpoint(&mut ctx, &endpoint, None, "test") + .await + .expect("spawn should succeed"); + + assert_eq!(ctx.collectors.len(CollectorKind::Nmxt), 0); + assert_eq!(ctx.collectors.len(CollectorKind::NvueRest), 0); + } + #[tokio::test] async fn test_machine_endpoint_still_starts_sse_logs_collector() { let mut config = Config::default(); diff --git a/crates/health/src/endpoint/mod.rs b/crates/health/src/endpoint/mod.rs index b44a985d45..081caef8d8 100644 --- a/crates/health/src/endpoint/mod.rs +++ b/crates/health/src/endpoint/mod.rs @@ -186,6 +186,9 @@ mod tests { serial: Some("SN-001".to_string()), slot_number: Some(7), tray_index: Some(3), + endpoint_role: crate::config::StaticSwitchEndpointRole::Host, + is_primary: true, + nmxt_enabled: None, }), rack_id: None, }]; @@ -200,6 +203,9 @@ mod tests { assert_eq!(s.serial, "SN-001"); assert_eq!(s.slot_number, Some(7)); assert_eq!(s.tray_index, Some(3)); + assert_eq!(s.endpoint_role, SwitchEndpointRole::Host); + assert!(s.is_primary); + assert!(s.nmxt_enabled); } other => panic!("expected Switch metadata, got {other:?}"), } diff --git a/crates/health/src/otlp/convert.rs b/crates/health/src/otlp/convert.rs index fab1b521a6..168c453862 100644 --- a/crates/health/src/otlp/convert.rs +++ b/crates/health/src/otlp/convert.rs @@ -204,7 +204,7 @@ mod tests { use mac_address::MacAddress; use super::*; - use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData, SwitchData}; + use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData, SwitchData, SwitchEndpointRole}; use crate::sink::{ Classification, HealthReport, HealthReportAlert, LogRecord, Probe, ReportSource, }; @@ -302,6 +302,9 @@ mod tests { serial: "SN-SWITCH-001".to_string(), slot_number: Some(7), tray_index: Some(3), + endpoint_role: SwitchEndpointRole::Host, + is_primary: false, + nmxt_enabled: false, })), rack_id: None, }; diff --git a/crates/health/src/sink/prometheus.rs b/crates/health/src/sink/prometheus.rs index e1c70aae6f..e36c1e472f 100644 --- a/crates/health/src/sink/prometheus.rs +++ b/crates/health/src/sink/prometheus.rs @@ -241,7 +241,7 @@ mod tests { use mac_address::MacAddress; use super::*; - use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData, SwitchData}; + use crate::endpoint::{BmcAddr, EndpointMetadata, MachineData, SwitchData, SwitchEndpointRole}; fn test_switch_id(label: &str) -> SwitchId { let mut hash = [0u8; 32]; @@ -309,6 +309,9 @@ mod tests { serial: "SN-SWITCH-001".to_string(), slot_number: Some(7), tray_index: Some(3), + endpoint_role: SwitchEndpointRole::Host, + is_primary: false, + nmxt_enabled: false, })), rack_id: None, }; From 4f8e492b3301d7186de80790c86364d3d89f66bb Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Wed, 20 May 2026 14:30:35 +0200 Subject: [PATCH 08/12] fix(health): remove unnecessary helpers Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/health/src/api_client.rs | 11 ++++------- crates/health/src/discovery/spawn.rs | 17 ++--------------- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/crates/health/src/api_client.rs b/crates/health/src/api_client.rs index d8f5c75161..e6f1d08fd3 100644 --- a/crates/health/src/api_client.rs +++ b/crates/health/src/api_client.rs @@ -200,14 +200,10 @@ impl ApiClientWrapper { match self.client.find_switches(switch_request).await { Ok(response) => { let switches = response.switches; - let switch_ids = switches - .iter() - .filter_map(|switch| switch.id) - .collect::>(); - let switches_by_id = switches + let switches_by_id: HashMap<_, _> = switches .iter() .filter_map(|switch| switch.id.map(|id| (id, switch))) - .collect::>(); + .collect(); let mut endpoints = Vec::new(); @@ -222,7 +218,8 @@ impl ApiClientWrapper { } } - if !switch_ids.is_empty() { + if !switches_by_id.is_empty() { + let switch_ids = switches_by_id.keys().copied().collect(); match self .client .find_switch_host_endpoints(rpc::forge::SwitchesByIdsRequest { switch_ids }) diff --git a/crates/health/src/discovery/spawn.rs b/crates/health/src/discovery/spawn.rs index d10f06ccd3..e50f2fcb88 100644 --- a/crates/health/src/discovery/spawn.rs +++ b/crates/health/src/discovery/spawn.rs @@ -51,18 +51,6 @@ pub(super) async fn spawn_collectors_for_endpoint( } } -fn is_switch_host_endpoint(endpoint: &BmcEndpoint) -> bool { - endpoint - .switch_data() - .is_some_and(|switch| switch.endpoint_role == SwitchEndpointRole::Host) -} - -fn switch_host_nmxt_enabled(endpoint: &BmcEndpoint) -> bool { - endpoint.switch_data().is_some_and(|switch| { - switch.endpoint_role == SwitchEndpointRole::Host && switch.nmxt_enabled - }) -} - fn spawn_generic_redfish_collectors( ctx: &mut DiscoveryLoopContext, endpoint: &Arc, @@ -291,7 +279,7 @@ fn spawn_switch_host_collectors( let key = endpoint.key(); let endpoint_arc = endpoint.clone(); - if switch_host_nmxt_enabled(endpoint) + if endpoint.switch_data().is_some_and(|switch| switch.nmxt_enabled) && let Configurable::Enabled(nmxt_cfg) = &ctx.nmxt_config && !ctx.collectors.contains(CollectorKind::Nmxt, &key) { @@ -333,8 +321,7 @@ fn spawn_switch_host_collectors( } } - if is_switch_host_endpoint(endpoint) - && let Configurable::Enabled(nvue_cfg) = &ctx.nvue_config + if let Configurable::Enabled(nvue_cfg) = &ctx.nvue_config && let Configurable::Enabled(rest_cfg) = &nvue_cfg.rest && !ctx.collectors.contains(CollectorKind::NvueRest, &key) { From eb11bb4652e707648368a61b573c1a175fa33ad9 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Wed, 20 May 2026 15:57:48 +0200 Subject: [PATCH 09/12] fix(api): use proper errors when bmc/switch host credentials are not found Signed-off-by: mkoci <26286151+mkoci@users.noreply.github.com> --- crates/api/src/handlers/credential.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/crates/api/src/handlers/credential.rs b/crates/api/src/handlers/credential.rs index 140fb6eae2..eb956237ec 100644 --- a/crates/api/src/handlers/credential.rs +++ b/crates/api/src/handlers/credential.rs @@ -409,7 +409,10 @@ pub(crate) async fn get_bmc_credentals( }) .await .map_err(|e| CarbideError::internal(e.to_string()))? - .ok_or_else(|| CarbideError::internal("missing credentials".to_string()))?; + .ok_or_else(|| CarbideError::NotFoundError { + kind: "bmc_root_credentials", + id: req.mac_addr.clone(), + })?; let (username, password) = match credentials { Credentials::UsernamePassword { username, password } => (username, password), @@ -442,7 +445,10 @@ pub(crate) async fn get_switch_nvos_credentials( .get_credentials(&CredentialKey::SwitchNvosAdmin { bmc_mac_address }) .await .map_err(|e| CarbideError::internal(e.to_string()))? - .ok_or_else(|| CarbideError::internal("missing credentials".to_string()))?; + .ok_or_else(|| CarbideError::NotFoundError { + kind: "switch_nvos_credentials", + id: req.bmc_mac_addr.clone(), + })?; let Credentials::UsernamePassword { username, password } = credentials; From 4099f6204ad41eb59a5047623f08a3c8e00100e4 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Wed, 20 May 2026 21:18:25 +0200 Subject: [PATCH 10/12] fix(health): lint-police --- crates/api/src/tests/credential.rs | 3 ++- crates/health/src/discovery/spawn.rs | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/crates/api/src/tests/credential.rs b/crates/api/src/tests/credential.rs index f9d25a72d1..1d69c312ce 100644 --- a/crates/api/src/tests/credential.rs +++ b/crates/api/src/tests/credential.rs @@ -16,7 +16,8 @@ */ use forge_secrets::credentials::{ - BgpCredentialType, CredentialKey, CredentialReader, CredentialType, Credentials, CredentialWriter, + BgpCredentialType, CredentialKey, CredentialReader, CredentialType, CredentialWriter, + Credentials, }; use rpc::forge::forge_server::Forge; use rpc::forge::{ diff --git a/crates/health/src/discovery/spawn.rs b/crates/health/src/discovery/spawn.rs index e50f2fcb88..e86401c8d6 100644 --- a/crates/health/src/discovery/spawn.rs +++ b/crates/health/src/discovery/spawn.rs @@ -279,7 +279,9 @@ fn spawn_switch_host_collectors( let key = endpoint.key(); let endpoint_arc = endpoint.clone(); - if endpoint.switch_data().is_some_and(|switch| switch.nmxt_enabled) + if endpoint + .switch_data() + .is_some_and(|switch| switch.nmxt_enabled) && let Configurable::Enabled(nmxt_cfg) = &ctx.nmxt_config && !ctx.collectors.contains(CollectorKind::Nmxt, &key) { From 804fd0f3086c5500595a54f0de51f9c5c3048064 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Thu, 21 May 2026 02:26:12 +0200 Subject: [PATCH 11/12] fix(api, health): reshape Switch API surface with switch bmc and nvos info in one endpoint. Remove dead code --- crates/api-db/src/switch.rs | 61 +------- crates/api-model/src/rpc_conv/switch.rs | 1 + crates/api/src/api.rs | 7 - crates/api/src/auth/internal_rbac_rules.rs | 4 - crates/api/src/handlers/credential.rs | 34 ++++- crates/api/src/handlers/switch.rs | 155 +++++++++------------ crates/api/src/tests/credential.rs | 10 +- crates/api/src/tests/switch_find.rs | 60 ++++++-- crates/health/src/api_client.rs | 79 +++-------- crates/rpc/build.rs | 8 -- crates/rpc/proto/forge.proto | 17 +-- 11 files changed, 180 insertions(+), 256 deletions(-) diff --git a/crates/api-db/src/switch.rs b/crates/api-db/src/switch.rs index 40c1fcb638..9b2510279a 100644 --- a/crates/api-db/src/switch.rs +++ b/crates/api-db/src/switch.rs @@ -22,6 +22,7 @@ use carbide_uuid::switch::SwitchId; use chrono::prelude::*; use config_version::{ConfigVersion, Versioned}; use health_report::{HealthReport, HealthReportApplyMode}; +use mac_address::MacAddress; use model::controller_outcome::PersistentStateHandlerOutcome; use model::metadata::Metadata; use model::rack::RackFirmwareUpgradeStatus; @@ -459,34 +460,6 @@ pub async fn update(switch: &Switch, txn: &mut PgConnection) -> Result DatabaseResult> { - let sql = r#" - SELECT - es.serial_number, - es.bmc_mac_address, - mia.address as ip_address - FROM expected_switches es - JOIN machine_interfaces mi ON mi.mac_address = es.bmc_mac_address - JOIN machine_interface_addresses mia ON mia.interface_id = mi.id - JOIN network_segments ns ON ns.id = mi.segment_id - WHERE ns.network_segment_type = 'underlay' - "#; - - sqlx::query_as(sql) - .fetch_all(txn) - .await - .map_err(|err| DatabaseError::new("list_switch_bmc_info", err)) -} - /// Resolve SwitchIds to BMC IPs via the FK path: /// switches.bmc_mac_address -> expected_switches.bmc_mac_address /// -> machine_interfaces -> machine_interface_addresses (underlay) -> IP @@ -611,38 +584,6 @@ pub async fn update_metadata( } } -#[derive(Debug, sqlx::FromRow)] -pub struct SwitchBmcRow { - pub switch_id: SwitchId, - pub bmc_mac: MacAddress, - pub bmc_ip: IpAddr, -} - -/// Resolve SwitchIds to BMC MAC + IP via machine_interfaces. -pub async fn find_bmc_info_by_switch_ids( - db: impl crate::db_read::DbReader<'_>, - switch_ids: &[SwitchId], -) -> DatabaseResult> { - let sql = r#" - SELECT DISTINCT ON (mi.switch_id) - mi.switch_id, - mi.mac_address AS bmc_mac, - mia.address AS bmc_ip - FROM machine_interfaces mi - JOIN machine_interface_addresses mia ON mia.interface_id = mi.id - JOIN network_segments ns ON ns.id = mi.segment_id - WHERE mi.switch_id = ANY($1) - AND ns.network_segment_type = 'underlay' - ORDER BY mi.switch_id - "#; - - sqlx::query_as(sql) - .bind(switch_ids) - .fetch_all(db) - .await - .map_err(|err| DatabaseError::new("switch::find_bmc_info_by_switch_ids", err)) -} - /// A switch resolved by its BMC MAC address, along with the rack it belongs /// to. Used by the Component Manager state controller wrapper to build a /// rack-level `MaintenanceScope` for the switches it's been asked to act on. diff --git a/crates/api-model/src/rpc_conv/switch.rs b/crates/api-model/src/rpc_conv/switch.rs index 5a2c1fd1f5..8dde250c2c 100644 --- a/crates/api-model/src/rpc_conv/switch.rs +++ b/crates/api-model/src/rpc_conv/switch.rs @@ -187,6 +187,7 @@ impl TryFrom for rpc::Switch { deleted, controller_state, bmc_info: None, + nvos_info: None, state_version, metadata: Some(src.metadata.into()), version: src.version.version_string(), diff --git a/crates/api/src/api.rs b/crates/api/src/api.rs index 48146aaec8..5e6278fdc8 100644 --- a/crates/api/src/api.rs +++ b/crates/api/src/api.rs @@ -366,13 +366,6 @@ impl Forge for Api { crate::handlers::switch::find_by_ids(self, request).await } - async fn find_switch_host_endpoints( - &self, - request: Request, - ) -> Result, Status> { - crate::handlers::switch::find_host_endpoints(self, request).await - } - async fn delete_switch( &self, request: Request, diff --git a/crates/api/src/auth/internal_rbac_rules.rs b/crates/api/src/auth/internal_rbac_rules.rs index 152a1e15f8..bd1a7c78c2 100644 --- a/crates/api/src/auth/internal_rbac_rules.rs +++ b/crates/api/src/auth/internal_rbac_rules.rs @@ -723,10 +723,6 @@ impl InternalRBACRules { "FindSwitchesByIds", vec![ForgeAdminCLI, Machineatron, Flow, Health], ); - x.perm( - "FindSwitchHostEndpoints", - vec![ForgeAdminCLI, Machineatron, Flow, Health], - ); x.perm("CreateSwitch", vec![ForgeAdminCLI, Machineatron]); x.perm("DeleteSwitch", vec![ForgeAdminCLI, Machineatron]); x.perm("AddExpectedSwitch", vec![ForgeAdminCLI, Machineatron, Flow]); diff --git a/crates/api/src/handlers/credential.rs b/crates/api/src/handlers/credential.rs index eb956237ec..79c7085686 100644 --- a/crates/api/src/handlers/credential.rs +++ b/crates/api/src/handlers/credential.rs @@ -434,11 +434,33 @@ pub(crate) async fn get_switch_nvos_credentials( crate::api::log_request_data(&request); let req = request.into_inner(); - - let bmc_mac_address: mac_address::MacAddress = req - .bmc_mac_addr - .parse() - .map_err(CarbideError::MacAddressParseError)?; + let switch_id = req + .switch_id + .ok_or_else(|| CarbideError::InvalidArgument("switch_id is required".to_string()))?; + + let bmc_mac_address = { + let mut txn = api.txn_begin().await?; + let switches = db::switch::find_by( + &mut txn, + db::ObjectColumnFilter::One(db::switch::IdColumn, &switch_id), + ) + .await?; + let _ = txn.rollback().await; + + let switch = switches + .first() + .ok_or_else(|| CarbideError::NotFoundError { + kind: "switch", + id: switch_id.to_string(), + })?; + + switch + .bmc_mac_address + .ok_or_else(|| CarbideError::NotFoundError { + kind: "switch_bmc_mac_address", + id: switch_id.to_string(), + })? + }; let credentials = api .credential_manager @@ -447,7 +469,7 @@ pub(crate) async fn get_switch_nvos_credentials( .map_err(|e| CarbideError::internal(e.to_string()))? .ok_or_else(|| CarbideError::NotFoundError { kind: "switch_nvos_credentials", - id: req.bmc_mac_addr.clone(), + id: switch_id.to_string(), })?; let Credentials::UsernamePassword { username, password } = credentials; diff --git a/crates/api/src/handlers/switch.rs b/crates/api/src/handlers/switch.rs index c8479f1486..c15a3ef3a8 100644 --- a/crates/api/src/handlers/switch.rs +++ b/crates/api/src/handlers/switch.rs @@ -68,27 +68,17 @@ pub async fn find_switch( })? }; - let bmc_info_map: std::collections::HashMap = { - let rows = db_switch::list_switch_bmc_info(&mut txn) + let switch_ids: Vec<_> = switch_list.iter().map(|switch| switch.id).collect(); + let endpoint_info_map: std::collections::HashMap<_, _> = if switch_ids.is_empty() { + std::collections::HashMap::new() + } else { + db_switch::find_switch_endpoints_by_ids(&mut *txn, &switch_ids) .await .map_err(|e| CarbideError::Internal { - message: format!("Failed to get switch BMC info: {}", e), - })?; - - rows.into_iter() - .map(|row| { - ( - row.bmc_mac_address.to_string(), - rpc::BmcInfo { - ip: Some(row.ip_address.to_string()), - mac: Some(row.bmc_mac_address.to_string()), - version: None, - firmware_version: None, - port: None, - machine_interface_id: None, - }, - ) - }) + message: format!("Failed to get switch endpoint info: {}", e), + })? + .into_iter() + .map(|row| (row.switch_id, row)) .collect() }; @@ -99,13 +89,33 @@ pub async fn find_switch( let switches: Vec = switch_list .into_iter() .map(|s| { - let bmc_info = s - .bmc_mac_address - .as_ref() - .and_then(|mac| bmc_info_map.get(&mac.to_string()).cloned()); + let endpoint_info = endpoint_info_map.get(&s.id); rpc::Switch::try_from(s).map(|mut rpc_switch| { - rpc_switch.bmc_info = bmc_info; + rpc_switch.bmc_info = endpoint_info.map(|row| rpc::BmcInfo { + ip: Some(row.bmc_ip.to_string()), + mac: Some(row.bmc_mac.to_string()), + version: None, + firmware_version: None, + port: None, + machine_interface_id: None, + }); + rpc_switch.nvos_info = endpoint_info.and_then(|row| { + let (Some(nvos_mac), Some(nvos_ip)) = + (row.nvos_mac.as_ref(), row.nvos_ip.as_ref()) + else { + return None; + }; + + Some(rpc::BmcInfo { + ip: Some(nvos_ip.to_string()), + mac: Some(nvos_mac.to_string()), + version: None, + firmware_version: None, + port: None, + machine_interface_id: None, + }) + }); rpc_switch }) }) @@ -158,40 +168,48 @@ pub async fn find_by_ids( ) .await?; - let bmc_info_map: std::collections::HashMap<_, _> = { - let rows = db_switch::find_bmc_info_by_switch_ids(&mut txn, &switch_ids) + let endpoint_info_map: std::collections::HashMap<_, _> = + db_switch::find_switch_endpoints_by_ids(&mut txn, &switch_ids) .await .map_err(|e| CarbideError::Internal { - message: format!("Failed to get switch BMC info: {}", e), - })?; - - rows.into_iter() - .map(|row| { - ( - row.switch_id, - rpc::BmcInfo { - ip: Some(row.bmc_ip.to_string()), - mac: Some(row.bmc_mac.to_string()), - version: None, - firmware_version: None, - port: None, - machine_interface_id: None, - }, - ) - }) - .collect() - }; + message: format!("Failed to get switch endpoint info: {}", e), + })? + .into_iter() + .map(|row| (row.switch_id, row)) + .collect(); let _ = txn.rollback().await; let switches: Vec = switch_list .into_iter() .map(|s| { - let id = s.id; - let bmc_info = bmc_info_map.get(&id).cloned(); + let endpoint_info = endpoint_info_map.get(&s.id); rpc::Switch::try_from(s).map(|mut rpc_switch| { - rpc_switch.bmc_info = bmc_info; + rpc_switch.bmc_info = endpoint_info.map(|row| rpc::BmcInfo { + ip: Some(row.bmc_ip.to_string()), + mac: Some(row.bmc_mac.to_string()), + version: None, + firmware_version: None, + port: None, + machine_interface_id: None, + }); + rpc_switch.nvos_info = endpoint_info.and_then(|row| { + let (Some(nvos_mac), Some(nvos_ip)) = + (row.nvos_mac.as_ref(), row.nvos_ip.as_ref()) + else { + return None; + }; + + Some(rpc::BmcInfo { + ip: Some(nvos_ip.to_string()), + mac: Some(nvos_mac.to_string()), + version: None, + firmware_version: None, + port: None, + machine_interface_id: None, + }) + }); rpc_switch }) }) @@ -203,47 +221,6 @@ pub async fn find_by_ids( Ok(Response::new(rpc::SwitchList { switches })) } -pub async fn find_host_endpoints( - api: &Api, - request: Request, -) -> Result, Status> { - log_request_data(&request); - - let switch_ids = request.into_inner().switch_ids; - - let max_find_by_ids = api.runtime_config.max_find_by_ids as usize; - if switch_ids.len() > max_find_by_ids { - return Err(CarbideError::InvalidArgument(format!( - "no more than {max_find_by_ids} IDs can be accepted" - )) - .into()); - } else if switch_ids.is_empty() { - return Err( - CarbideError::InvalidArgument("at least one ID must be provided".to_string()).into(), - ); - } - - let rows = db_switch::find_switch_endpoints_by_ids(&mut api.db_reader(), &switch_ids).await?; - - let endpoints = rows - .into_iter() - .filter_map(|row| { - let (Some(host_mac), Some(host_ip)) = (row.nvos_mac, row.nvos_ip) else { - return None; - }; - - Some(rpc::SwitchHostEndpoint { - switch_id: Some(row.switch_id), - bmc_mac: row.bmc_mac.to_string(), - host_mac: host_mac.to_string(), - host_ip: host_ip.to_string(), - }) - }) - .collect(); - - Ok(Response::new(rpc::SwitchHostEndpointList { endpoints })) -} - pub async fn find_switch_state_histories( api: &Api, request: Request, diff --git a/crates/api/src/tests/credential.rs b/crates/api/src/tests/credential.rs index 1d69c312ce..ad45c83168 100644 --- a/crates/api/src/tests/credential.rs +++ b/crates/api/src/tests/credential.rs @@ -27,6 +27,7 @@ use tonic::Code; use crate::handlers::credential::MAX_BGP_PASSWORD_LENGTH; use crate::tests::common::api_fixtures::create_test_env; +use crate::tests::common::api_fixtures::site_explorer::new_switch; #[crate::sqlx_test] async fn test_create_host_uefi_credential_when_missing(pool: sqlx::PgPool) { @@ -248,7 +249,12 @@ async fn test_create_bgp_credential_validates_max_password_length(pool: sqlx::Pg #[crate::sqlx_test] async fn test_get_switch_nvos_credentials(pool: sqlx::PgPool) -> eyre::Result<()> { let env = create_test_env(pool).await; - let bmc_mac_address = "00:11:22:33:44:55".parse()?; + let switch_id = new_switch(&env, Some("Switch1".to_string()), None).await?; + let bmc_mac_address = db::switch::find_switch_endpoints_by_ids(&env.pool, &[switch_id]) + .await? + .first() + .expect("switch endpoint row") + .bmc_mac; env.test_credential_manager .set_credentials( @@ -264,7 +270,7 @@ async fn test_get_switch_nvos_credentials(pool: sqlx::PgPool) -> eyre::Result<() .api .get_switch_nvos_credentials(tonic::Request::new( rpc::forge::GetSwitchNvosCredentialsRequest { - bmc_mac_addr: bmc_mac_address.to_string(), + switch_id: Some(switch_id), }, )) .await? diff --git a/crates/api/src/tests/switch_find.rs b/crates/api/src/tests/switch_find.rs index aa3c574677..73de7cd3b4 100644 --- a/crates/api/src/tests/switch_find.rs +++ b/crates/api/src/tests/switch_find.rs @@ -264,7 +264,7 @@ async fn test_find_switches_by_ids_response_fields( } #[crate::sqlx_test] -async fn test_find_switch_host_endpoints_returns_resolved_nvos_host( +async fn test_find_switches_by_ids_includes_resolved_nvos_info( pool: sqlx::PgPool, ) -> Result<(), Box> { let env = create_test_env(pool).await; @@ -277,23 +277,62 @@ async fn test_find_switch_host_endpoints_returns_resolved_nvos_host( let response = env .api - .find_switch_host_endpoints(tonic::Request::new(rpc::forge::SwitchesByIdsRequest { + .find_switches_by_ids(tonic::Request::new(rpc::forge::SwitchesByIdsRequest { switch_ids: vec![switch_id], })) .await? .into_inner(); - assert_eq!(response.endpoints.len(), 1); - assert_eq!(response.endpoints[0].switch_id, Some(switch_id)); - assert_eq!(response.endpoints[0].bmc_mac, expected.bmc_mac.to_string()); - assert_eq!(response.endpoints[0].host_mac, host_mac.to_string()); - assert_eq!(response.endpoints[0].host_ip, host_ip.to_string()); + assert_eq!(response.switches.len(), 1); + let switch = &response.switches[0]; + assert_eq!(switch.id, Some(switch_id)); + assert_eq!( + switch.bmc_info.as_ref().and_then(|info| info.mac.clone()), + Some(expected.bmc_mac.to_string()) + ); + assert_eq!( + switch.bmc_info.as_ref().and_then(|info| info.ip.clone()), + Some(expected.bmc_ip.to_string()) + ); + + let nvos_info = switch.nvos_info.as_ref().expect("nvos info"); + assert_eq!(nvos_info.mac, Some(host_mac.to_string())); + assert_eq!(nvos_info.ip, Some(host_ip.to_string())); + + Ok(()) +} + +#[crate::sqlx_test] +async fn test_find_switches_includes_resolved_nvos_info( + pool: sqlx::PgPool, +) -> Result<(), Box> { + let env = create_test_env(pool).await; + let switch_id = new_switch(&env, Some("Switch1".to_string()), None).await?; + + let mut rows = db::switch::find_switch_endpoints_by_ids(&env.pool, &[switch_id]).await?; + let expected = rows.pop().expect("switch endpoint row"); + let host_mac = expected.nvos_mac.expect("nvos mac"); + let host_ip = expected.nvos_ip.expect("nvos ip"); + + let response = env + .api + .find_switches(tonic::Request::new(rpc::forge::SwitchQuery { + name: None, + switch_id: Some(switch_id), + })) + .await? + .into_inner(); + + assert_eq!(response.switches.len(), 1); + let nvos_info = response.switches[0].nvos_info.as_ref().expect("nvos info"); + assert_eq!(nvos_info.mac, Some(host_mac.to_string())); + assert_eq!(nvos_info.ip, Some(host_ip.to_string())); Ok(()) } #[crate::sqlx_test] -async fn test_find_switch_host_endpoints_skips_switch_without_nvos_host( +async fn test_find_switches_by_ids_returns_no_nvos_info_when_unresolved( pool: sqlx::PgPool, ) -> Result<(), Box> { let env = create_test_env(pool).await; @@ -309,13 +348,14 @@ async fn test_find_switch_host_endpoints_skips_switch_without_nvos_host( let response = env .api - .find_switch_host_endpoints(tonic::Request::new(rpc::forge::SwitchesByIdsRequest { + .find_switches_by_ids(tonic::Request::new(rpc::forge::SwitchesByIdsRequest { switch_ids: vec![switch_id], })) .await? .into_inner(); - assert!(response.endpoints.is_empty()); + assert_eq!(response.switches.len(), 1); + assert!(response.switches[0].nvos_info.is_none()); Ok(()) } diff --git a/crates/health/src/api_client.rs b/crates/health/src/api_client.rs index e6f1d08fd3..50e1f098b3 100644 --- a/crates/health/src/api_client.rs +++ b/crates/health/src/api_client.rs @@ -15,7 +15,6 @@ * limitations under the License. */ -use std::collections::HashMap; use std::convert::TryFrom; use std::net::IpAddr; use std::str::FromStr; @@ -50,7 +49,7 @@ struct ApiCredentialProvider { #[derive(Clone)] enum ApiCredentialKind { Bmc, - SwitchNvosAdmin { bmc_mac: MacAddress }, + SwitchNvosAdmin { switch_id: SwitchId }, } impl CredentialProvider for ApiCredentialProvider { @@ -70,9 +69,9 @@ impl CredentialProvider for ApiCredentialProvider { .await .map_err(HealthError::ApiInvocationError)? } - ApiCredentialKind::SwitchNvosAdmin { bmc_mac } => { + ApiCredentialKind::SwitchNvosAdmin { switch_id } => { let request = rpc::forge::GetSwitchNvosCredentialsRequest { - bmc_mac_addr: bmc_mac.to_string(), + switch_id: Some(*switch_id), }; self.client @@ -199,16 +198,10 @@ impl ApiClientWrapper { match self.client.find_switches(switch_request).await { Ok(response) => { - let switches = response.switches; - let switches_by_id: HashMap<_, _> = switches - .iter() - .filter_map(|switch| switch.id.map(|id| (id, switch))) - .collect(); - let mut endpoints = Vec::new(); - for switch in &switches { - match self.extract_switch_endpoint(switch).await { + for switch in response.switches { + match self.extract_switch_endpoint(&switch).await { Ok(endpoint) => endpoints.push(Arc::new(endpoint)), Err(error) => tracing::warn!( ?switch, @@ -216,32 +209,16 @@ impl ApiClientWrapper { "Could not add switch endpoint due to error" ), } - } - if !switches_by_id.is_empty() { - let switch_ids = switches_by_id.keys().copied().collect(); - match self - .client - .find_switch_host_endpoints(rpc::forge::SwitchesByIdsRequest { switch_ids }) - .await - { - Ok(response) => { - for host_endpoint in response.endpoints { - match self - .extract_switch_host_endpoint(&host_endpoint, &switches_by_id) - .await - { - Ok(endpoint) => endpoints.push(Arc::new(endpoint)), - Err(error) => tracing::warn!( - ?host_endpoint, - ?error, - "Could not add switch host endpoint due to error" - ), - } - } - } + match self.extract_switch_host_endpoint(&switch).await { + Ok(Some(endpoint)) => endpoints.push(Arc::new(endpoint)), + Ok(None) => {} Err(error) => { - tracing::warn!(?error, "Failed to fetch switch host endpoints") + tracing::warn!( + ?switch, + ?error, + "Could not add switch host endpoint due to error" + ); } } } @@ -355,28 +332,15 @@ impl ApiClientWrapper { async fn extract_switch_host_endpoint( &self, - host_endpoint: &rpc::forge::SwitchHostEndpoint, - switches_by_id: &HashMap, - ) -> Result { - let switch_id = host_endpoint.switch_id.ok_or_else(|| { + switch: &rpc::forge::Switch, + ) -> Result, HealthError> { + let Some(nvos_info) = switch.nvos_info.as_ref() else { + return Ok(None); + }; + let switch_id = switch.id.ok_or_else(|| { HealthError::GenericError("switch host endpoint missing switch ID".to_string()) })?; - let switch = *switches_by_id.get(&switch_id).ok_or_else(|| { - HealthError::GenericError( - "switch host endpoint did not match fetched switch".to_string(), - ) - })?; - let addr = BmcAddr { - ip: host_endpoint - .host_ip - .parse::() - .map_err(|error| HealthError::GenericError(error.to_string()))?, - port: None, - mac: MacAddress::from_str(&host_endpoint.host_mac) - .map_err(|error| HealthError::GenericError(error.to_string()))?, - }; - let bmc_mac = MacAddress::from_str(&host_endpoint.bmc_mac) - .map_err(|error| HealthError::GenericError(error.to_string()))?; + let addr = BmcAddr::try_from(nvos_info)?; self.endpoint_with_auth( addr, @@ -386,9 +350,10 @@ impl ApiClientWrapper { switch.is_primary, )?), switch.rack_id.clone(), - ApiCredentialKind::SwitchNvosAdmin { bmc_mac }, + ApiCredentialKind::SwitchNvosAdmin { switch_id }, ) .await + .map(Some) } async fn extract_power_shelf_endpoint( diff --git a/crates/rpc/build.rs b/crates/rpc/build.rs index a1a352ba22..48c0aea609 100644 --- a/crates/rpc/build.rs +++ b/crates/rpc/build.rs @@ -848,14 +848,6 @@ fn main() -> Result<(), Box> { "forge.GetSwitchNvosCredentialsRequest", "#[derive(serde::Serialize)]", ) - .type_attribute( - "forge.SwitchHostEndpoint", - "#[derive(serde::Serialize)]", - ) - .type_attribute( - "forge.SwitchHostEndpointList", - "#[derive(serde::Serialize)]", - ) .type_attribute( "forge.PlacementInRack", "#[derive(serde::Serialize)]", diff --git a/crates/rpc/proto/forge.proto b/crates/rpc/proto/forge.proto index 31b96d8492..a23c2df851 100644 --- a/crates/rpc/proto/forge.proto +++ b/crates/rpc/proto/forge.proto @@ -101,7 +101,6 @@ service Forge { rpc FindSwitches(SwitchQuery) returns (SwitchList); rpc FindSwitchIds(SwitchSearchFilter) returns (SwitchIdList); rpc FindSwitchesByIds(SwitchesByIdsRequest) returns (SwitchList); - rpc FindSwitchHostEndpoints(SwitchesByIdsRequest) returns (SwitchHostEndpointList); rpc DeleteSwitch(SwitchDeletionRequest) returns (SwitchDeletionResult); // Force deletes a Switch and optionally its associated interfaces from the database. rpc AdminForceDeleteSwitch(AdminForceDeleteSwitchRequest) returns (AdminForceDeleteSwitchResponse); @@ -2136,6 +2135,9 @@ message Switch { optional PlacementInRack placement_in_rack = 11; reserved 12, 13; bool is_primary = 14; + + // NVOS host endpoint + BmcInfo nvos_info = 15; } message SwitchList { @@ -2202,17 +2204,6 @@ message SwitchesByIdsRequest { repeated common.SwitchId switch_ids = 1; } -message SwitchHostEndpoint { - common.SwitchId switch_id = 1; - string bmc_mac = 2; - string host_mac = 3; - string host_ip = 4; -} - -message SwitchHostEndpointList { - repeated SwitchHostEndpoint endpoints = 1; -} - message ExpectedSwitch { string bmc_mac_address = 1; string bmc_username = 2; @@ -3762,7 +3753,7 @@ message GetBmcCredentialsRequest { } message GetSwitchNvosCredentialsRequest { - string bmc_mac_addr = 1; + common.SwitchId switch_id = 1; } message GetBmcCredentialsResponse { From 4ebcf2e33fcb8679b1c4a2b7b67264b148c7eb60 Mon Sep 17 00:00:00 2001 From: mkoci <26286151+mkoci@users.noreply.github.com> Date: Thu, 21 May 2026 05:43:39 +0200 Subject: [PATCH 12/12] fix(api): broken test --- crates/api/src/tests/switch_find.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/api/src/tests/switch_find.rs b/crates/api/src/tests/switch_find.rs index 73de7cd3b4..34b2021d4d 100644 --- a/crates/api/src/tests/switch_find.rs +++ b/crates/api/src/tests/switch_find.rs @@ -254,10 +254,10 @@ async fn test_find_switches_by_ids_response_fields( // state_version should be populated assert!(!switch.state_version.is_empty()); - // bmc_info is None when no machine_interface discovery data exists + // bmc_info should be populated from the seeded machine_interface discovery data assert!( - switch.bmc_info.is_none(), - "bmc_info should be None when no discovery data exists" + switch.bmc_info.is_some(), + "bmc_info should be present when discovery data exists" ); Ok(())