diff --git a/crates/admin-cli/src/machine/health_report/cmd.rs b/crates/admin-cli/src/machine/health_report/cmd.rs index bab399894b..20f771bd6e 100644 --- a/crates/admin-cli/src/machine/health_report/cmd.rs +++ b/crates/admin-cli/src/machine/health_report/cmd.rs @@ -136,10 +136,11 @@ pub fn get_health_report(template: HealthReportTemplates, message: Option { - report.source = "request-online-repair".to_string(); + report.source = health_report::REQUEST_ONLINE_REPAIR_MERGE_SOURCE.to_string(); report.alerts[0].id = HealthProbeId::from_str("RequestOnlineRepair") .expect("RequestOnlineRepair is a valid non-empty HealthProbeId"); - report.alerts[0].target = Some("request-online-repair".to_string()); + report.alerts[0].target = + Some(health_report::REQUEST_ONLINE_REPAIR_MERGE_SOURCE.to_string()); report.alerts[0].classifications = vec![ HealthAlertClassification::prevent_allocations(), HealthAlertClassification::suppress_external_alerting(), @@ -150,7 +151,7 @@ pub fn get_health_report(template: HealthReportTemplates, message: Option { - report.source = "repair-request".to_string(); + report.source = health_report::REPAIR_REQUEST_MERGE_SOURCE.to_string(); report.alerts[0].id = HealthProbeId::from_str("RequestRepair") .expect("RequestRepair is a valid non-empty HealthProbeId"); report.alerts[0].target = Some("repair-requested".to_string()); @@ -261,7 +262,7 @@ mod tests { Some("Hardware diagnostics indicate memory failure".to_string()), ); - assert_eq!(report.source, "repair-request"); + assert_eq!(report.source, health_report::REPAIR_REQUEST_MERGE_SOURCE); assert_eq!(report.alerts.len(), 1); let alert = &report.alerts[0]; @@ -299,7 +300,7 @@ mod tests { fn test_request_repair_template_with_empty_message() { let report = get_health_report(HealthReportTemplates::RequestRepair, None); - assert_eq!(report.source, "repair-request"); + assert_eq!(report.source, health_report::REPAIR_REQUEST_MERGE_SOURCE); assert_eq!(report.alerts[0].message, ""); } @@ -350,7 +351,10 @@ mod tests { Some("Online repair handoff for stuck repair workflow".to_string()), ); - assert_eq!(report.source, "request-online-repair"); + assert_eq!( + report.source, + health_report::REQUEST_ONLINE_REPAIR_MERGE_SOURCE + ); assert_eq!(report.alerts.len(), 1); let alert = &report.alerts[0]; @@ -358,7 +362,10 @@ mod tests { alert.id, HealthProbeId::from_str("RequestOnlineRepair").unwrap() ); - assert_eq!(alert.target, Some("request-online-repair".to_string())); + assert_eq!( + alert.target, + Some(health_report::REQUEST_ONLINE_REPAIR_MERGE_SOURCE.to_string()) + ); assert_eq!( alert.message, "Online repair handoff for stuck repair workflow" @@ -387,7 +394,10 @@ mod tests { fn test_request_online_repair_template_with_empty_message() { let report = get_health_report(HealthReportTemplates::RequestOnlineRepair, None); - assert_eq!(report.source, "request-online-repair"); + assert_eq!( + report.source, + health_report::REQUEST_ONLINE_REPAIR_MERGE_SOURCE + ); assert_eq!(report.alerts[0].message, ""); } diff --git a/crates/api-model/src/health.rs b/crates/api-model/src/health.rs index a6e6e8c4af..498b8d7791 100644 --- a/crates/api-model/src/health.rs +++ b/crates/api-model/src/health.rs @@ -46,6 +46,16 @@ pub struct HealthReportSources { } impl HealthReportSources { + /// True when a repair-related health merge override is active (`repair-request` or + /// `request-online-repair`). + pub fn repair_merge_active(&self) -> bool { + self.merges + .contains_key(health_report::REPAIR_REQUEST_MERGE_SOURCE) + || self + .merges + .contains_key(health_report::REQUEST_ONLINE_REPAIR_MERGE_SOURCE) + } + #[allow(clippy::should_implement_trait)] pub fn iter(&self) -> impl Iterator { self.merges diff --git a/crates/api-model/src/instance/status/tenant.rs b/crates/api-model/src/instance/status/tenant.rs index 1aa97b2a8d..003061848e 100644 --- a/crates/api-model/src/instance/status/tenant.rs +++ b/crates/api-model/src/instance/status/tenant.rs @@ -59,6 +59,10 @@ pub enum TenantState { Failed, /// Not sure what happened. Check log for more info Invalid, + /// Instance is undergoing online repair while otherwise tenant-ready. Set by + /// `instance_status_tenant_state` in the RPC model layer when a repair health merge + /// is active and the instance would otherwise be [`Ready`]. + Repairing, } #[cfg(test)] diff --git a/crates/api/src/handlers/instance.rs b/crates/api/src/handlers/instance.rs index 05f5bed239..292dd91671 100644 --- a/crates/api/src/handlers/instance.rs +++ b/crates/api/src/handlers/instance.rs @@ -330,7 +330,7 @@ fn create_tenant_reported_issue_override( /// Creates a RequestRepair health override template fn create_request_repair_override(issue: &rpc::Issue) -> HealthReport { HealthReport { - source: "repair-request".to_string(), + source: health_report::REPAIR_REQUEST_MERGE_SOURCE.to_string(), observed_at: Some(chrono::Utc::now()), alerts: vec![HealthProbeAlert { id: HealthProbeId::from_str("RequestRepair") @@ -443,7 +443,10 @@ async fn handle_instance_release_from_repair_tenant( machine: &model::machine::Machine, tenant_organization_id: &str, ) -> Result<(), CarbideError> { - let has_request_repair = machine.health_reports.merges.contains_key("repair-request"); + let has_request_repair = machine + .health_reports + .merges + .contains_key(health_report::REPAIR_REQUEST_MERGE_SOURCE); if !has_request_repair { // No existing RequestRepair override @@ -489,7 +492,7 @@ async fn handle_instance_release_from_repair_tenant( remove_health_override( txn, machine_id, - "repair-request", + health_report::REPAIR_REQUEST_MERGE_SOURCE, "RequestRepair removed - repair completed successfully", ) .await?; @@ -530,7 +533,7 @@ async fn handle_instance_release_from_repair_tenant( remove_health_override( txn, machine_id, - "repair-request", + health_report::REPAIR_REQUEST_MERGE_SOURCE, "RequestRepair removed for incomplete repair", ) .await?; diff --git a/crates/api/src/tests/dpu_reprovisioning.rs b/crates/api/src/tests/dpu_reprovisioning.rs index 63a675252a..ff74bb57d1 100644 --- a/crates/api/src/tests/dpu_reprovisioning.rs +++ b/crates/api/src/tests/dpu_reprovisioning.rs @@ -618,7 +618,8 @@ async fn assert_reprov_tenant_state( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant diff --git a/crates/api/src/tests/host_bmc_firmware_test.rs b/crates/api/src/tests/host_bmc_firmware_test.rs index 1c5febbb06..e4e8c0e4e2 100644 --- a/crates/api/src/tests/host_bmc_firmware_test.rs +++ b/crates/api/src/tests/host_bmc_firmware_test.rs @@ -1253,7 +1253,8 @@ async fn test_instance_upgrading_actual_part_2( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant @@ -1292,7 +1293,8 @@ async fn test_instance_upgrading_actual_part_2( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant @@ -1341,7 +1343,8 @@ async fn test_instance_upgrading_actual_part_2( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant @@ -1387,7 +1390,8 @@ async fn test_instance_upgrading_actual_part_2( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant @@ -1423,7 +1427,8 @@ async fn test_instance_upgrading_actual_part_2( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant @@ -1481,7 +1486,8 @@ async fn test_instance_upgrading_actual_part_2( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant @@ -1517,7 +1523,8 @@ async fn test_instance_upgrading_actual_part_2( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant @@ -1565,7 +1572,8 @@ async fn test_instance_upgrading_actual_part_2( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant @@ -1633,7 +1641,8 @@ async fn test_instance_upgrading_actual_part_2( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant @@ -1697,7 +1706,8 @@ async fn test_instance_upgrading_actual_part_2( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant @@ -1733,7 +1743,8 @@ async fn test_instance_upgrading_actual_part_2( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant @@ -1767,7 +1778,8 @@ async fn test_instance_upgrading_actual_part_2( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant @@ -1796,7 +1808,8 @@ async fn test_instance_upgrading_actual_part_2( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant diff --git a/crates/api/src/tests/instance.rs b/crates/api/src/tests/instance.rs index c5089b8ec0..f1363ec722 100644 --- a/crates/api/src/tests/instance.rs +++ b/crates/api/src/tests/instance.rs @@ -5315,7 +5315,7 @@ async fn test_instance_release_backward_compatibility(_: PgPoolOptions, options: !host_machine .health_reports .merges - .contains_key("repair-request"), + .contains_key(health_report::REPAIR_REQUEST_MERGE_SOURCE), "Backward compatibility: RequestRepair override should NOT be applied without issue field" ); @@ -5424,7 +5424,7 @@ async fn test_instance_release_repair_tenant(_: PgPoolOptions, options: PgConnec let has_repair_request_override = host_machine .health_reports .merges - .contains_key("repair-request"); + .contains_key(health_report::REPAIR_REQUEST_MERGE_SOURCE); assert!( !has_tenant_reported_override, @@ -5522,7 +5522,7 @@ async fn test_instance_release_combined_enhancements(_: PgPoolOptions, options: let has_repair_request_override = host_machine .health_reports .merges - .contains_key("repair-request"); + .contains_key(health_report::REPAIR_REQUEST_MERGE_SOURCE); assert!( !has_repair_request_override, @@ -5716,14 +5716,18 @@ async fn test_instance_release_auto_repair_enabled(_: PgPoolOptions, options: Pg host_machine .health_reports .merges - .contains_key("repair-request"), + .contains_key(health_report::REPAIR_REQUEST_MERGE_SOURCE), "Should have RequestRepair override when auto-repair is enabled" ); // 4. Verify the RequestRepair override content - let repair_override = &host_machine.health_reports.merges["repair-request"]; + let repair_override = + &host_machine.health_reports.merges[health_report::REPAIR_REQUEST_MERGE_SOURCE]; let repair_report: health_report::HealthReport = repair_override.clone(); - assert_eq!(repair_report.source, "repair-request"); + assert_eq!( + repair_report.source, + health_report::REPAIR_REQUEST_MERGE_SOURCE + ); assert_eq!(repair_report.alerts.len(), 1); assert_eq!(repair_report.alerts[0].id.to_string(), "RequestRepair"); assert!( diff --git a/crates/api/src/tests/machine_health.rs b/crates/api/src/tests/machine_health.rs index 31296f3ba7..ec103142dc 100644 --- a/crates/api/src/tests/machine_health.rs +++ b/crates/api/src/tests/machine_health.rs @@ -1136,7 +1136,7 @@ async fn test_request_repair_health_override_template( // Create a RequestRepair health override using the API let repair_request_override = health_report::HealthReport { - source: "repair-request".to_string(), + source: health_report::REPAIR_REQUEST_MERGE_SOURCE.to_string(), triggered_by: None, observed_at: Some(chrono::Utc::now()), successes: vec![], @@ -1174,7 +1174,10 @@ async fn test_request_repair_health_override_template( machine.health_sources[1].mode, HealthReportApplyMode::Merge as i32 ); - assert_eq!(machine.health_sources[1].source, "repair-request"); + assert_eq!( + machine.health_sources[1].source, + health_report::REPAIR_REQUEST_MERGE_SOURCE + ); // Verify aggregate health includes the override let aggregate_health = aggregate(machine).unwrap(); @@ -1232,7 +1235,7 @@ async fn test_tenant_reported_issue_and_request_repair_combined( }; let repair_request_override = health_report::HealthReport { - source: "repair-request".to_string(), + source: health_report::REPAIR_REQUEST_MERGE_SOURCE.to_string(), triggered_by: None, observed_at: Some(chrono::Utc::now()), successes: vec![], @@ -1278,7 +1281,7 @@ async fn test_tenant_reported_issue_and_request_repair_combined( .map(|o| o.source.clone()) .collect(); assert!(sources.contains(&"tenant-reported-issue".to_string())); - assert!(sources.contains(&"repair-request".to_string())); + assert!(sources.contains(&health_report::REPAIR_REQUEST_MERGE_SOURCE.to_string())); // All should be merge mode for override_entry in &machine.health_sources { diff --git a/crates/health-report/src/lib.rs b/crates/health-report/src/lib.rs index 2f3ba83068..73ddca0bfc 100644 --- a/crates/health-report/src/lib.rs +++ b/crates/health-report/src/lib.rs @@ -21,6 +21,11 @@ use std::str::FromStr; use serde::{Deserialize, Serialize}; +/// `HealthReportSources::merges` key for the auto-repair (`RequestRepair`) override. +pub const REPAIR_REQUEST_MERGE_SOURCE: &str = "repair-request"; +/// `HealthReportSources::merges` key for online repair gating (`RequestOnlineRepair` override). +pub const REQUEST_ONLINE_REPAIR_MERGE_SOURCE: &str = "request-online-repair"; + /// Reports the aggregate health of a system or subsystem #[derive(PartialEq, Eq, Debug, Clone, Serialize, Deserialize)] pub struct HealthReport { @@ -763,13 +768,13 @@ mod tests { // Shape matches admin-cli `HealthReportTemplates::RequestOnlineRepair` (merge source // `request-online-repair`, probe id `RequestOnlineRepair`). let report = HealthReport { - source: "request-online-repair".to_string(), + source: REQUEST_ONLINE_REPAIR_MERGE_SOURCE.to_string(), triggered_by: None, observed_at: Some(chrono::Utc::now()), successes: vec![], alerts: vec![HealthProbeAlert { id: HealthProbeId::from_str("RequestOnlineRepair").unwrap(), - target: Some("request-online-repair".to_string()), + target: Some(REQUEST_ONLINE_REPAIR_MERGE_SOURCE.to_string()), in_alert_since: None, message: "test".to_string(), tenant_message: None, diff --git a/crates/rpc/proto/forge.proto b/crates/rpc/proto/forge.proto index 135f90eab8..1cb834280d 100644 --- a/crates/rpc/proto/forge.proto +++ b/crates/rpc/proto/forge.proto @@ -1777,6 +1777,8 @@ enum TenantState { // Something Wrong happened at carbide. Check logs for more info. INVALID = 9; + // Instance is undergoing online repair (health override active). + REPAIRING = 10; } // Describe the desired configuration of an IBPartition diff --git a/crates/rpc/src/model/instance/snapshot.rs b/crates/rpc/src/model/instance/snapshot.rs index 1db9c17f71..b6cf8ae121 100644 --- a/crates/rpc/src/model/instance/snapshot.rs +++ b/crates/rpc/src/model/instance/snapshot.rs @@ -26,6 +26,7 @@ use model::machine::nvlink::MachineNvLinkStatusObservation; use model::machine::{ManagedHostState, ReprovisionRequest}; use crate::errors::RpcDataConversionError; +use model::health::HealthReportSources; use crate::model::instance::status::instance_status_from_config_and_observation; /// Derives the tenant and site-admin facing [`InstanceStatus`] from the @@ -37,6 +38,7 @@ pub fn instance_snapshot_derive_status( reprovision_request: Option, ib_status: Option<&MachineInfinibandStatusObservation>, nvlink_status: Option<&MachineNvLinkStatusObservation>, + host_health: &HealthReportSources, ) -> Result { instance_status_from_config_and_observation( dpu_id_to_device_map, @@ -55,5 +57,6 @@ pub fn instance_snapshot_derive_status( ib_status, nvlink_status, snapshot.update_network_config_request.is_some(), + host_health, ) } diff --git a/crates/rpc/src/model/instance/status.rs b/crates/rpc/src/model/instance/status.rs index 0976188a1d..997a9d1ba4 100644 --- a/crates/rpc/src/model/instance/status.rs +++ b/crates/rpc/src/model/instance/status.rs @@ -78,6 +78,7 @@ pub fn instance_status_from_config_and_observation( ib_status: Option<&MachineInfinibandStatusObservation>, nvlink_status: Option<&MachineNvLinkStatusObservation>, is_network_config_request_pending: bool, + host_health: &model::health::HealthReportSources, ) -> Result { let mut instance_config_synced = SyncState::Synced; @@ -150,6 +151,7 @@ pub fn instance_status_from_config_and_observation( instance_config.os.phone_home_enabled, phone_home_last_contact, extension_services_ready, + host_health.repair_merge_active(), )?, true => { // If instance deletion was requested, we always confirm the @@ -187,12 +189,91 @@ impl TryFrom for rpc::SyncState { #[cfg(test)] mod tests { + use std::collections::HashMap; use std::str::FromStr; + use config_version::ConfigVersion; + use health_report::{HealthReport, REPAIR_REQUEST_MERGE_SOURCE}; + use model::instance::config::InstanceConfig; + use model::instance::config::extension_services::InstanceExtensionServicesConfig; + use model::instance::config::infiniband::InstanceInfinibandConfig; + use model::instance::config::network::InstanceNetworkConfig; + use model::instance::config::nvlink::InstanceNvLinkConfig; + use model::instance::config::tenant_config::TenantConfig; + use model::instance::status::InstanceStatusObservations; use model::instance::status::tenant::TenantState; - use model::machine::{DpuReprovisionStates, ManagedHostState, ReprovisionState}; + use model::machine::{DpuReprovisionStates, InstanceState, ManagedHostState, ReprovisionState}; + use model::os::{OperatingSystem, OperatingSystemVariant}; + use model::tenant::TenantOrganizationId; + use uuid::Uuid; use super::*; + use model::health::HealthReportSources; + + fn minimal_instance_config() -> InstanceConfig { + InstanceConfig { + tenant: TenantConfig { + tenant_organization_id: TenantOrganizationId::try_from("TenantA".to_string()) + .unwrap(), + tenant_keyset_ids: vec![], + hostname: None, + }, + os: OperatingSystem { + user_data: None, + variant: OperatingSystemVariant::OsImage(Uuid::nil()), + phone_home_enabled: false, + run_provisioning_instructions_on_every_boot: false, + }, + network: InstanceNetworkConfig::default(), + infiniband: InstanceInfinibandConfig::default(), + network_security_group_id: None, + extension_services: InstanceExtensionServicesConfig::default(), + nvlink: InstanceNvLinkConfig::default(), + } + } + + #[test] + fn repair_merge_active_yields_repairing_via_status_pipeline() { + let config = minimal_instance_config(); + let version = ConfigVersion::initial(); + let mut health = HealthReportSources::default(); + health.merges.insert( + REPAIR_REQUEST_MERGE_SOURCE.to_string(), + HealthReport { + source: REPAIR_REQUEST_MERGE_SOURCE.to_string(), + ..Default::default() + }, + ); + + let status = instance_status_from_config_and_observation( + HashMap::new(), + Versioned::new(&config, version), + Versioned::new(&config.network, version), + Versioned::new(&config.infiniband, version), + Versioned::new(&config.extension_services, version), + Versioned::new(&config.nvlink, version), + &InstanceStatusObservations { + network: HashMap::new(), + extension_services: HashMap::new(), + phone_home_last_contact: None, + }, + ManagedHostState::Assigned { + instance_state: InstanceState::Ready, + }, + false, + None, + None, + None, + false, + &health, + ) + .unwrap(); + + assert_eq!( + status.tenant.as_ref().map(|t| t.state), + Some(TenantState::Repairing) + ); + } #[test] fn test_tenant_state() { @@ -214,6 +295,7 @@ mod tests { false, None, false, + false, ) .unwrap(), TenantState::Invalid diff --git a/crates/rpc/src/model/instance/status/tenant.rs b/crates/rpc/src/model/instance/status/tenant.rs index 831ff53055..791b13a832 100644 --- a/crates/rpc/src/model/instance/status/tenant.rs +++ b/crates/rpc/src/model/instance/status/tenant.rs @@ -22,13 +22,19 @@ use model::machine::{InstanceState, ManagedHostState}; use crate as rpc; use crate::errors::RpcDataConversionError; -/// Tries to convert Machine state to tenant state. +/// Converts machine state into the tenant-visible [`TenantState`]. +/// +/// When `repair_active` is true, [`TenantState::Repairing`] is returned only if the +/// instance would otherwise be tenant-ready (`InstanceState::Ready` with synced configs +/// and extension services ready). It does not override Failed, Updating, Configuring, +/// Provisioning, or Terminating. pub fn instance_status_tenant_state( machine_state: ManagedHostState, configs_synced: SyncState, phone_home_enrolled: bool, phone_home_last_contact: Option>, extension_services_ready: bool, + repair_active: bool, ) -> Result { // At this point, we are sure that instance is created. // If machine state is still ready, means state machine has not processed this instance @@ -61,7 +67,8 @@ pub fn instance_status_tenant_state( (false, _, false) => TenantState::Configuring, // If there is no pending phone-home and extension services are ready, - // return Ready (this was the default before phone_home) + // the instance is tenant-ready; surface online repair only in this case. + (false, SyncState::Synced, true) if repair_active => TenantState::Repairing, (false, SyncState::Synced, true) => TenantState::Ready, // If there is a pending phone-home, we're still @@ -125,6 +132,127 @@ impl TryFrom for rpc::TenantState { TenantState::HostReprovisioning => rpc::TenantState::HostReprovisioning, TenantState::Updating => rpc::TenantState::Updating, TenantState::Invalid => rpc::TenantState::Invalid, + TenantState::Repairing => rpc::TenantState::Repairing, }) } } + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::str::FromStr; + + use carbide_uuid::machine::MachineId; + use chrono::Utc; + use health_report::{HealthReport, REPAIR_REQUEST_MERGE_SOURCE}; + + use super::*; + use model::health::HealthReportSources; + use model::instance::status::SyncState; + use model::machine::{ + DpuReprovisionStates, FailureCause, FailureDetails, FailureSource, InstanceState, + ManagedHostState, + }; + + #[test] + fn repair_merge_active_detects_merge_sources() { + let mut health = HealthReportSources::default(); + assert!(!health.repair_merge_active()); + health.merges.insert( + REPAIR_REQUEST_MERGE_SOURCE.to_string(), + HealthReport { + source: REPAIR_REQUEST_MERGE_SOURCE.to_string(), + ..Default::default() + }, + ); + assert!(health.repair_merge_active()); + } + + #[test] + fn repair_merge_tenant_state_precedence() { + let machine_id = + MachineId::from_str("fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0") + .unwrap(); + let failed = InstanceState::Failed { + details: FailureDetails { + cause: FailureCause::NoError, + failed_at: Utc::now(), + source: FailureSource::StateMachine, + }, + machine_id, + }; + + struct Case { + name: &'static str, + machine_state: ManagedHostState, + configs_synced: SyncState, + repair_active: bool, + expected: TenantState, + } + + let cases = [ + Case { + name: "tenant-ready with repair merge", + machine_state: ManagedHostState::Assigned { + instance_state: InstanceState::Ready, + }, + configs_synced: SyncState::Synced, + repair_active: true, + expected: TenantState::Repairing, + }, + Case { + name: "terminating with repair merge", + machine_state: ManagedHostState::Assigned { + instance_state: InstanceState::SwitchToAdminNetwork, + }, + configs_synced: SyncState::Synced, + repair_active: true, + expected: TenantState::Terminating, + }, + Case { + name: "reprovision with repair merge", + machine_state: ManagedHostState::Assigned { + instance_state: InstanceState::DPUReprovision { + dpu_states: DpuReprovisionStates { + states: HashMap::new(), + }, + }, + }, + configs_synced: SyncState::Synced, + repair_active: true, + expected: TenantState::Updating, + }, + Case { + name: "configuring with repair merge", + machine_state: ManagedHostState::Assigned { + instance_state: InstanceState::Ready, + }, + configs_synced: SyncState::Pending, + repair_active: true, + expected: TenantState::Configuring, + }, + Case { + name: "failed with repair merge", + machine_state: ManagedHostState::Assigned { + instance_state: failed, + }, + configs_synced: SyncState::Synced, + repair_active: true, + expected: TenantState::Failed, + }, + ]; + + for case in cases { + let state = instance_status_tenant_state( + case.machine_state, + case.configs_synced, + false, + None, + true, + case.repair_active, + ) + .unwrap_or_else(|_| panic!("case {:?} failed conversion", case.name)); + assert_eq!(state, case.expected, "case: {}", case.name); + } + } +} diff --git a/crates/rpc/src/model/machine/mod.rs b/crates/rpc/src/model/machine/mod.rs index ff16199eba..9bb72cf092 100644 --- a/crates/rpc/src/model/machine/mod.rs +++ b/crates/rpc/src/model/machine/mod.rs @@ -87,6 +87,7 @@ impl RpcTryFrom for Option { .infiniband_status_observation .as_ref(), snapshot.host_snapshot.nvlink_status_observation.as_ref(), + &snapshot.host_snapshot.health_reports, )?; Ok(Some(rpc::Instance { diff --git a/rest-api/api/pkg/api/model/instance.go b/rest-api/api/pkg/api/model/instance.go index d95d28fb85..ad194bb443 100644 --- a/rest-api/api/pkg/api/model/instance.go +++ b/rest-api/api/pkg/api/model/instance.go @@ -1805,6 +1805,8 @@ type APIInstanceStats struct { Terminating int `json:"terminating"` // Ready is the total number of ready Instances Ready int `json:"ready"` + // Repairing is the total number of Instances undergoing online repair + Repairing int `json:"repairing"` // Updating is the total number of Instances receiving system updates Updating int `json:"updating"` // Registering is the total number of registering Instances @@ -1821,7 +1823,9 @@ func getAggregatedInstanceStatus(status string, powerStatus *string) string { return agStatus } - if status != cdbm.InstanceStatusReady { + // Repairing is only stored when the instance is otherwise tenant-ready (same as Ready). + // Overlay reboot/error power state for Ready and Repairing; other statuses keep DB status. + if status != cdbm.InstanceStatusReady && status != cdbm.InstanceStatusRepairing { return agStatus } diff --git a/rest-api/api/pkg/api/model/instance_test.go b/rest-api/api/pkg/api/model/instance_test.go index 4fb5445d7c..05b7e2d2bb 100644 --- a/rest-api/api/pkg/api/model/instance_test.go +++ b/rest-api/api/pkg/api/model/instance_test.go @@ -2304,6 +2304,30 @@ func Test_getAggregatedInstanceStatus(t *testing.T) { }, want: cdbm.InstancePowerStatusError, }, + { + name: "Repairing overlays Rebooting power status like Ready", + args: args{ + status: cdbm.InstanceStatusRepairing, + powerStatus: cdb.GetStrPtr(cdbm.InstancePowerStatusRebooting), + }, + want: cdbm.InstancePowerStatusRebooting, + }, + { + name: "Repairing overlays Error power status like Ready", + args: args{ + status: cdbm.InstanceStatusRepairing, + powerStatus: cdb.GetStrPtr(cdbm.InstancePowerStatusError), + }, + want: cdbm.InstancePowerStatusError, + }, + { + name: "non-ready status does not overlay power status during repair-unrelated states", + args: args{ + status: cdbm.InstanceStatusUpdating, + powerStatus: cdb.GetStrPtr(cdbm.InstancePowerStatusRebooting), + }, + want: cdbm.InstanceStatusUpdating, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { diff --git a/rest-api/api/pkg/api/model/tenant.go b/rest-api/api/pkg/api/model/tenant.go index 86bde5519f..364680ad90 100644 --- a/rest-api/api/pkg/api/model/tenant.go +++ b/rest-api/api/pkg/api/model/tenant.go @@ -123,6 +123,7 @@ func NewAPITenantStats(instancestatsmap map[string]int, vpcstatsmap map[string]i Pending: instancestatsmap[cdbm.InstanceStatusPending], Terminating: instancestatsmap[cdbm.InstanceStatusTerminating], Ready: instancestatsmap[cdbm.InstanceStatusReady], + Repairing: instancestatsmap[cdbm.InstanceStatusRepairing], Updating: instancestatsmap[cdbm.InstanceStatusUpdating], Registering: instancestatsmap[cdbm.InstanceStatusProvisioning], Error: instancestatsmap[cdbm.InstanceStatusError], diff --git a/rest-api/api/pkg/api/model/tenant_test.go b/rest-api/api/pkg/api/model/tenant_test.go index 7bff8a7e4d..61fad8934c 100644 --- a/rest-api/api/pkg/api/model/tenant_test.go +++ b/rest-api/api/pkg/api/model/tenant_test.go @@ -74,6 +74,26 @@ func TestNewAPITenant(t *testing.T) { } } +func TestNewAPITenantStats_maps_repairing_instance_count(t *testing.T) { + instanceStats := map[string]int{ + "total": 3, + cdbm.InstanceStatusReady: 1, + cdbm.InstanceStatusRepairing: 2, + cdbm.InstanceStatusUpdating: 0, + cdbm.InstanceStatusPending: 0, + cdbm.InstanceStatusTerminating: 0, + cdbm.InstanceStatusError: 0, + cdbm.InstanceStatusProvisioning: 0, + } + + stats := NewAPITenantStats(instanceStats, map[string]int{}, map[string]int{}, map[string]int{}) + + assert.Equal(t, 3, stats.Instance.Total) + assert.Equal(t, 1, stats.Instance.Ready) + assert.Equal(t, 2, stats.Instance.Repairing) + assert.Equal(t, 0, stats.Instance.Updating) +} + func TestNewAPITenantSummary(t *testing.T) { dbtn := &cdbm.Tenant{ ID: uuid.New(), diff --git a/rest-api/db/pkg/db/model/instance.go b/rest-api/db/pkg/db/model/instance.go index 5c55783fbc..4c447e024a 100644 --- a/rest-api/db/pkg/db/model/instance.go +++ b/rest-api/db/pkg/db/model/instance.go @@ -42,6 +42,8 @@ const ( InstanceStatusReady = "Ready" // InstanceStatusUpdating indicates that the Instance is receiving system updates InstanceStatusUpdating = "Updating" + // InstanceStatusRepairing indicates that the Instance is undergoing online repair on the site + InstanceStatusRepairing = "Repairing" // InstanceStatusError indicates that the Instance provisioning has failed InstanceStatusError = "Error" // InstanceStatusTerminating indicates that the Instance is being terminated @@ -133,6 +135,7 @@ var ( InstanceStatusPending: true, InstanceStatusReady: true, InstanceStatusUpdating: true, + InstanceStatusRepairing: true, InstanceStatusError: true, InstanceStatusConfiguring: true, InstanceStatusProvisioning: true, diff --git a/rest-api/flow/internal/nicoapi/gen/fmds_grpc.pb.go b/rest-api/flow/internal/nicoapi/gen/fmds_grpc.pb.go index 4ad61340e7..227ca33cde 100644 --- a/rest-api/flow/internal/nicoapi/gen/fmds_grpc.pb.go +++ b/rest-api/flow/internal/nicoapi/gen/fmds_grpc.pb.go @@ -31,7 +31,7 @@ // Code generated by protoc-gen-go-grpc. DO NOT EDIT. // versions: -// - protoc-gen-go-grpc v1.6.1 +// - protoc-gen-go-grpc v1.6.2 // - protoc (unknown) // source: fmds.proto diff --git a/rest-api/flow/internal/nicoapi/gen/nico.pb.go b/rest-api/flow/internal/nicoapi/gen/nico.pb.go index f33d7159a2..2cebe54106 100644 --- a/rest-api/flow/internal/nicoapi/gen/nico.pb.go +++ b/rest-api/flow/internal/nicoapi/gen/nico.pb.go @@ -372,21 +372,24 @@ const ( TenantState_UPDATING TenantState = 8 // Something Wrong happened at nico. Check logs for more info. TenantState_INVALID TenantState = 9 + // Instance is undergoing online repair on the site. + TenantState_REPAIRING TenantState = 10 ) // Enum value maps for TenantState. var ( TenantState_name = map[int32]string{ - 0: "PROVISIONING", - 1: "READY", - 2: "CONFIGURING", - 3: "TERMINATING", - 4: "TERMINATED", - 5: "FAILED", - 6: "DPU_REPROVISIONING", - 7: "HOST_REPROVISIONING", - 8: "UPDATING", - 9: "INVALID", + 0: "PROVISIONING", + 1: "READY", + 2: "CONFIGURING", + 3: "TERMINATING", + 4: "TERMINATED", + 5: "FAILED", + 6: "DPU_REPROVISIONING", + 7: "HOST_REPROVISIONING", + 8: "UPDATING", + 9: "INVALID", + 10: "REPAIRING", } TenantState_value = map[string]int32{ "PROVISIONING": 0, @@ -399,6 +402,7 @@ var ( "HOST_REPROVISIONING": 7, "UPDATING": 8, "INVALID": 9, + "REPAIRING": 10, } ) @@ -53278,7 +53282,7 @@ const file_nico_proto_rawDesc = "" + "\x0fPrefixMatchType\x12\x10\n" + "\fPREFIX_EXACT\x10\x00\x12\x13\n" + "\x0fPREFIX_CONTAINS\x10\x01\x12\x17\n" + - "\x13PREFIX_CONTAINED_BY\x10\x02*\xb4\x01\n" + + "\x13PREFIX_CONTAINED_BY\x10\x02*\xc3\x01\n" + "\vTenantState\x12\x10\n" + "\fPROVISIONING\x10\x00\x12\t\n" + "\x05READY\x10\x01\x12\x0f\n" + @@ -53291,7 +53295,9 @@ const file_nico_proto_rawDesc = "" + "\x12DPU_REPROVISIONING\x10\x06\x12\x17\n" + "\x13HOST_REPROVISIONING\x10\a\x12\f\n" + "\bUPDATING\x10\b\x12\v\n" + - "\aINVALID\x10\t*`\n" + + "\aINVALID\x10\t\x12\r\n" + + "\tREPAIRING\x10\n" + + "*`\n" + "\rDeletedFilter\x12\x1a\n" + "\x16DELETED_FILTER_EXCLUDE\x10\x00\x12\x17\n" + "\x13DELETED_FILTER_ONLY\x10\x01\x12\x1a\n" + diff --git a/rest-api/flow/internal/nicoapi/gen/nico_grpc.pb.go b/rest-api/flow/internal/nicoapi/gen/nico_grpc.pb.go index 883fa70e4f..ee8b67ea34 100644 --- a/rest-api/flow/internal/nicoapi/gen/nico_grpc.pb.go +++ b/rest-api/flow/internal/nicoapi/gen/nico_grpc.pb.go @@ -15,7 +15,7 @@ // Code generated by protoc-gen-go-grpc. DO NOT EDIT. // versions: -// - protoc-gen-go-grpc v1.6.1 +// - protoc-gen-go-grpc v1.6.2 // - protoc (unknown) // source: nico.proto diff --git a/rest-api/flow/internal/nicoapi/nicoproto/nico.proto b/rest-api/flow/internal/nicoapi/nicoproto/nico.proto index bfe7979178..6ad4bb066f 100644 --- a/rest-api/flow/internal/nicoapi/nicoproto/nico.proto +++ b/rest-api/flow/internal/nicoapi/nicoproto/nico.proto @@ -1663,6 +1663,8 @@ enum TenantState { // Something Wrong happened at nico. Check logs for more info. INVALID = 9; + // Instance is undergoing online repair on the site. + REPAIRING = 10; } // Describe the desired configuration of an IBPartition diff --git a/rest-api/openapi/spec.yaml b/rest-api/openapi/spec.yaml index 94c330b7a7..5f242d0766 100644 --- a/rest-api/openapi/spec.yaml +++ b/rest-api/openapi/spec.yaml @@ -12466,6 +12466,7 @@ components: pending: 2 provisioning: 1 ready: 8 + repairing: 1 terminating: 2 error: 1 vpc: @@ -12498,6 +12499,7 @@ components: pending: 2 provisioning: 1 ready: 8 + repairing: 1 terminating: 2 error: 1 properties: @@ -12509,6 +12511,8 @@ components: type: integer ready: type: integer + repairing: + type: integer terminating: type: integer error: @@ -15605,6 +15609,7 @@ components: - Configuring - Ready - Updating + - Repairing - Rebooting - Terminating - Error diff --git a/rest-api/sdk/standard/model_instance_count_by_status.go b/rest-api/sdk/standard/model_instance_count_by_status.go index 75c1881414..b030bcbe83 100644 --- a/rest-api/sdk/standard/model_instance_count_by_status.go +++ b/rest-api/sdk/standard/model_instance_count_by_status.go @@ -40,6 +40,7 @@ type InstanceCountByStatus struct { Pending *int32 `json:"pending,omitempty"` Provisioning *int32 `json:"provisioning,omitempty"` Ready *int32 `json:"ready,omitempty"` + Repairing *int32 `json:"repairing,omitempty"` Terminating *int32 `json:"terminating,omitempty"` Error *int32 `json:"error,omitempty"` } @@ -189,6 +190,38 @@ func (o *InstanceCountByStatus) SetReady(v int32) { o.Ready = &v } +// GetRepairing returns the Repairing field value if set, zero value otherwise. +func (o *InstanceCountByStatus) GetRepairing() int32 { + if o == nil || IsNil(o.Repairing) { + var ret int32 + return ret + } + return *o.Repairing +} + +// GetRepairingOk returns a tuple with the Repairing field value if set, nil otherwise +// and a boolean to check if the value has been set. +func (o *InstanceCountByStatus) GetRepairingOk() (*int32, bool) { + if o == nil || IsNil(o.Repairing) { + return nil, false + } + return o.Repairing, true +} + +// HasRepairing returns a boolean if a field has been set. +func (o *InstanceCountByStatus) HasRepairing() bool { + if o != nil && !IsNil(o.Repairing) { + return true + } + + return false +} + +// SetRepairing gets a reference to the given int32 and assigns it to the Repairing field. +func (o *InstanceCountByStatus) SetRepairing(v int32) { + o.Repairing = &v +} + // GetTerminating returns the Terminating field value if set, zero value otherwise. func (o *InstanceCountByStatus) GetTerminating() int32 { if o == nil || IsNil(o.Terminating) { @@ -275,6 +308,9 @@ func (o InstanceCountByStatus) ToMap() (map[string]interface{}, error) { if !IsNil(o.Ready) { toSerialize["ready"] = o.Ready } + if !IsNil(o.Repairing) { + toSerialize["repairing"] = o.Repairing + } if !IsNil(o.Terminating) { toSerialize["terminating"] = o.Terminating } diff --git a/rest-api/workflow-schema/schema/site-agent/workflows/v1/nico_nico.pb.go b/rest-api/workflow-schema/schema/site-agent/workflows/v1/nico_nico.pb.go index 815bdfb64a..caf58d6375 100644 --- a/rest-api/workflow-schema/schema/site-agent/workflows/v1/nico_nico.pb.go +++ b/rest-api/workflow-schema/schema/site-agent/workflows/v1/nico_nico.pb.go @@ -373,21 +373,24 @@ const ( TenantState_UPDATING TenantState = 8 // Something Wrong happened at nico. Check logs for more info. TenantState_INVALID TenantState = 9 + // Instance is undergoing online repair on the site. + TenantState_REPAIRING TenantState = 10 ) // Enum value maps for TenantState. var ( TenantState_name = map[int32]string{ - 0: "PROVISIONING", - 1: "READY", - 2: "CONFIGURING", - 3: "TERMINATING", - 4: "TERMINATED", - 5: "FAILED", - 6: "DPU_REPROVISIONING", - 7: "HOST_REPROVISIONING", - 8: "UPDATING", - 9: "INVALID", + 0: "PROVISIONING", + 1: "READY", + 2: "CONFIGURING", + 3: "TERMINATING", + 4: "TERMINATED", + 5: "FAILED", + 6: "DPU_REPROVISIONING", + 7: "HOST_REPROVISIONING", + 8: "UPDATING", + 9: "INVALID", + 10: "REPAIRING", } TenantState_value = map[string]int32{ "PROVISIONING": 0, @@ -400,6 +403,7 @@ var ( "HOST_REPROVISIONING": 7, "UPDATING": 8, "INVALID": 9, + "REPAIRING": 10, } ) @@ -53293,7 +53297,7 @@ const file_nico_nico_proto_rawDesc = "" + "\x0fPrefixMatchType\x12\x10\n" + "\fPREFIX_EXACT\x10\x00\x12\x13\n" + "\x0fPREFIX_CONTAINS\x10\x01\x12\x17\n" + - "\x13PREFIX_CONTAINED_BY\x10\x02*\xb4\x01\n" + + "\x13PREFIX_CONTAINED_BY\x10\x02*\xc3\x01\n" + "\vTenantState\x12\x10\n" + "\fPROVISIONING\x10\x00\x12\t\n" + "\x05READY\x10\x01\x12\x0f\n" + @@ -53306,7 +53310,9 @@ const file_nico_nico_proto_rawDesc = "" + "\x12DPU_REPROVISIONING\x10\x06\x12\x17\n" + "\x13HOST_REPROVISIONING\x10\a\x12\f\n" + "\bUPDATING\x10\b\x12\v\n" + - "\aINVALID\x10\t*`\n" + + "\aINVALID\x10\t\x12\r\n" + + "\tREPAIRING\x10\n" + + "*`\n" + "\rDeletedFilter\x12\x1a\n" + "\x16DELETED_FILTER_EXCLUDE\x10\x00\x12\x17\n" + "\x13DELETED_FILTER_ONLY\x10\x01\x12\x1a\n" + diff --git a/rest-api/workflow-schema/schema/site-agent/workflows/v1/nico_nico_grpc.pb.go b/rest-api/workflow-schema/schema/site-agent/workflows/v1/nico_nico_grpc.pb.go index e59208b457..7374f43dac 100644 --- a/rest-api/workflow-schema/schema/site-agent/workflows/v1/nico_nico_grpc.pb.go +++ b/rest-api/workflow-schema/schema/site-agent/workflows/v1/nico_nico_grpc.pb.go @@ -16,7 +16,7 @@ // Code generated by protoc-gen-go-grpc. DO NOT EDIT. // versions: -// - protoc-gen-go-grpc v1.6.1 +// - protoc-gen-go-grpc v1.6.2 // - protoc (unknown) // source: nico_nico.proto diff --git a/rest-api/workflow-schema/site-agent/workflows/v1/nico_nico.proto b/rest-api/workflow-schema/site-agent/workflows/v1/nico_nico.proto index ad0a0c857e..8b184bb77a 100644 --- a/rest-api/workflow-schema/site-agent/workflows/v1/nico_nico.proto +++ b/rest-api/workflow-schema/site-agent/workflows/v1/nico_nico.proto @@ -1651,6 +1651,8 @@ enum TenantState { // Something Wrong happened at nico. Check logs for more info. INVALID = 9; + // Instance is undergoing online repair on the site. + REPAIRING = 10; } // Describe the desired configuration of an IBPartition diff --git a/rest-api/workflow/pkg/activity/instance/instance.go b/rest-api/workflow/pkg/activity/instance/instance.go index b62bfec1f6..5c056a5a6b 100644 --- a/rest-api/workflow/pkg/activity/instance/instance.go +++ b/rest-api/workflow/pkg/activity/instance/instance.go @@ -1163,6 +1163,8 @@ func getNICoInstanceStatus(controllerInstanceTenantState cwsv1.TenantState) (str return cdbm.InstanceStatusUpdating, "Instance is receiving system firmware updates" case cwsv1.TenantState_UPDATING: return cdbm.InstanceStatusUpdating, "Instance is receiving system firmware updates" + case cwsv1.TenantState_REPAIRING: + return cdbm.InstanceStatusRepairing, "Instance is undergoing repair on Site" default: return cdbm.InstanceStatusError, "Instance status is unknown" }