From c502c488b5c8044d0f8f979cc70074ec8b99ff55 Mon Sep 17 00:00:00 2001 From: Sunil Kumar Date: Tue, 19 May 2026 17:22:17 +0000 Subject: [PATCH 1/3] feat(instance): add TenantState REPAIRING for online repair Add REPAIRING tenant state across proto, cloud workflow, and OpenAPI. Surface it only when the instance is tenant-ready and a repair health merge is active; repair_merge_active() and shared merge-source constants preserve Failed, Updating, Configuring, and Terminating precedence. --- .../src/machine/health_report/cmd.rs | 26 ++-- crates/api-model/src/health.rs | 10 ++ .../api-model/src/instance/status/tenant.rs | 3 + .../src/rpc_conv/instance/snapshot.rs | 3 + .../api-model/src/rpc_conv/instance/status.rs | 85 +++++++++++- .../src/rpc_conv/instance/status/tenant.rs | 131 +++++++++++++++++- crates/api-model/src/rpc_conv/machine/mod.rs | 1 + crates/api/src/handlers/instance.rs | 11 +- crates/api/src/tests/dpu_reprovisioning.rs | 3 +- .../api/src/tests/host_bmc_firmware_test.rs | 39 ++++-- crates/api/src/tests/instance.rs | 16 ++- crates/api/src/tests/machine_health.rs | 11 +- crates/health-report/src/lib.rs | 9 +- crates/rpc/proto/forge.proto | 2 + rest-api/db/pkg/db/model/instance.go | 3 + rest-api/flow/internal/nicoapi/gen/nico.pb.go | 8 +- .../internal/nicoapi/nicoproto/nico.proto | 2 + rest-api/openapi/spec.yaml | 1 + .../site-agent/workflows/v1/nico_nico.pb.go | 8 +- .../site-agent/workflows/v1/nico_nico.proto | 2 + .../pkg/activity/instance/instance.go | 2 + 21 files changed, 331 insertions(+), 45 deletions(-) diff --git a/crates/admin-cli/src/machine/health_report/cmd.rs b/crates/admin-cli/src/machine/health_report/cmd.rs index e92121cb99..9b6fe33005 100644 --- a/crates/admin-cli/src/machine/health_report/cmd.rs +++ b/crates/admin-cli/src/machine/health_report/cmd.rs @@ -135,10 +135,11 @@ pub fn get_health_report(template: HealthReportTemplates, message: Option { - report.source = "request-online-repair".to_string(); + report.source = health_report::REQUEST_ONLINE_REPAIR_MERGE_SOURCE.to_string(); report.alerts[0].id = HealthProbeId::from_str("RequestOnlineRepair") .expect("RequestOnlineRepair is a valid non-empty HealthProbeId"); - report.alerts[0].target = Some("request-online-repair".to_string()); + report.alerts[0].target = + Some(health_report::REQUEST_ONLINE_REPAIR_MERGE_SOURCE.to_string()); report.alerts[0].classifications = vec![ HealthAlertClassification::prevent_allocations(), HealthAlertClassification::suppress_external_alerting(), @@ -149,7 +150,7 @@ pub fn get_health_report(template: HealthReportTemplates, message: Option { - report.source = "repair-request".to_string(); + report.source = health_report::REPAIR_REQUEST_MERGE_SOURCE.to_string(); report.alerts[0].id = HealthProbeId::from_str("RequestRepair") .expect("RequestRepair is a valid non-empty HealthProbeId"); report.alerts[0].target = Some("repair-requested".to_string()); @@ -260,7 +261,7 @@ mod tests { Some("Hardware diagnostics indicate memory failure".to_string()), ); - assert_eq!(report.source, "repair-request"); + assert_eq!(report.source, health_report::REPAIR_REQUEST_MERGE_SOURCE); assert_eq!(report.alerts.len(), 1); let alert = &report.alerts[0]; @@ -298,7 +299,7 @@ mod tests { fn test_request_repair_template_with_empty_message() { let report = get_health_report(HealthReportTemplates::RequestRepair, None); - assert_eq!(report.source, "repair-request"); + assert_eq!(report.source, health_report::REPAIR_REQUEST_MERGE_SOURCE); assert_eq!(report.alerts[0].message, ""); } @@ -349,7 +350,10 @@ mod tests { Some("Online repair handoff for stuck repair workflow".to_string()), ); - assert_eq!(report.source, "request-online-repair"); + assert_eq!( + report.source, + health_report::REQUEST_ONLINE_REPAIR_MERGE_SOURCE + ); assert_eq!(report.alerts.len(), 1); let alert = &report.alerts[0]; @@ -357,7 +361,10 @@ mod tests { alert.id, HealthProbeId::from_str("RequestOnlineRepair").unwrap() ); - assert_eq!(alert.target, Some("request-online-repair".to_string())); + assert_eq!( + alert.target, + Some(health_report::REQUEST_ONLINE_REPAIR_MERGE_SOURCE.to_string()) + ); assert_eq!( alert.message, "Online repair handoff for stuck repair workflow" @@ -386,7 +393,10 @@ mod tests { fn test_request_online_repair_template_with_empty_message() { let report = get_health_report(HealthReportTemplates::RequestOnlineRepair, None); - assert_eq!(report.source, "request-online-repair"); + assert_eq!( + report.source, + health_report::REQUEST_ONLINE_REPAIR_MERGE_SOURCE + ); assert_eq!(report.alerts[0].message, ""); } diff --git a/crates/api-model/src/health.rs b/crates/api-model/src/health.rs index a6e6e8c4af..498b8d7791 100644 --- a/crates/api-model/src/health.rs +++ b/crates/api-model/src/health.rs @@ -46,6 +46,16 @@ pub struct HealthReportSources { } impl HealthReportSources { + /// True when a repair-related health merge override is active (`repair-request` or + /// `request-online-repair`). + pub fn repair_merge_active(&self) -> bool { + self.merges + .contains_key(health_report::REPAIR_REQUEST_MERGE_SOURCE) + || self + .merges + .contains_key(health_report::REQUEST_ONLINE_REPAIR_MERGE_SOURCE) + } + #[allow(clippy::should_implement_trait)] pub fn iter(&self) -> impl Iterator { self.merges diff --git a/crates/api-model/src/instance/status/tenant.rs b/crates/api-model/src/instance/status/tenant.rs index 1aa97b2a8d..d35cf347d7 100644 --- a/crates/api-model/src/instance/status/tenant.rs +++ b/crates/api-model/src/instance/status/tenant.rs @@ -59,6 +59,9 @@ pub enum TenantState { Failed, /// Not sure what happened. Check log for more info Invalid, + /// Instance is undergoing repair while otherwise tenant-ready (see + /// [`crate::rpc_conv::instance::status::tenant::instance_status_tenant_state`]). + Repairing, } #[cfg(test)] diff --git a/crates/api-model/src/rpc_conv/instance/snapshot.rs b/crates/api-model/src/rpc_conv/instance/snapshot.rs index 563e3774bd..b33e1f109b 100644 --- a/crates/api-model/src/rpc_conv/instance/snapshot.rs +++ b/crates/api-model/src/rpc_conv/instance/snapshot.rs @@ -21,6 +21,7 @@ use carbide_uuid::machine::MachineId; use config_version::Versioned; use rpc::errors::RpcDataConversionError; +use crate::health::HealthReportSources; use crate::instance::snapshot::InstanceSnapshot; use crate::instance::status::InstanceStatus; use crate::machine::infiniband::MachineInfinibandStatusObservation; @@ -37,6 +38,7 @@ pub fn instance_snapshot_derive_status( reprovision_request: Option, ib_status: Option<&MachineInfinibandStatusObservation>, nvlink_status: Option<&MachineNvLinkStatusObservation>, + host_health: &HealthReportSources, ) -> Result { instance_status_from_config_and_observation( dpu_id_to_device_map, @@ -55,5 +57,6 @@ pub fn instance_snapshot_derive_status( ib_status, nvlink_status, snapshot.update_network_config_request.is_some(), + host_health, ) } diff --git a/crates/api-model/src/rpc_conv/instance/status.rs b/crates/api-model/src/rpc_conv/instance/status.rs index fab4d209ae..4bfe7fc17a 100644 --- a/crates/api-model/src/rpc_conv/instance/status.rs +++ b/crates/api-model/src/rpc_conv/instance/status.rs @@ -77,6 +77,7 @@ pub fn instance_status_from_config_and_observation( ib_status: Option<&MachineInfinibandStatusObservation>, nvlink_status: Option<&MachineNvLinkStatusObservation>, is_network_config_request_pending: bool, + host_health: &crate::health::HealthReportSources, ) -> Result { let mut instance_config_synced = SyncState::Synced; @@ -149,6 +150,7 @@ pub fn instance_status_from_config_and_observation( instance_config.os.phone_home_enabled, phone_home_last_contact, extension_services_ready, + host_health.repair_merge_active(), )?, true => { // If instance deletion was requested, we always confirm the @@ -186,11 +188,91 @@ impl TryFrom for rpc::SyncState { #[cfg(test)] mod tests { + use std::collections::HashMap; use std::str::FromStr; + use config_version::ConfigVersion; + use health_report::{HealthReport, REPAIR_REQUEST_MERGE_SOURCE}; + use uuid::Uuid; + use super::*; + use crate::health::HealthReportSources; + use crate::instance::config::InstanceConfig; + use crate::instance::config::extension_services::InstanceExtensionServicesConfig; + use crate::instance::config::infiniband::InstanceInfinibandConfig; + use crate::instance::config::network::InstanceNetworkConfig; + use crate::instance::config::nvlink::InstanceNvLinkConfig; + use crate::instance::config::tenant_config::TenantConfig; + use crate::instance::status::InstanceStatusObservations; use crate::instance::status::tenant::TenantState; - use crate::machine::{DpuReprovisionStates, ManagedHostState, ReprovisionState}; + use crate::machine::{DpuReprovisionStates, InstanceState, ManagedHostState, ReprovisionState}; + use crate::os::{OperatingSystem, OperatingSystemVariant}; + use crate::tenant::TenantOrganizationId; + + fn minimal_instance_config() -> InstanceConfig { + InstanceConfig { + tenant: TenantConfig { + tenant_organization_id: TenantOrganizationId::try_from("TenantA".to_string()) + .unwrap(), + tenant_keyset_ids: vec![], + hostname: None, + }, + os: OperatingSystem { + user_data: None, + variant: OperatingSystemVariant::OsImage(Uuid::nil()), + phone_home_enabled: false, + run_provisioning_instructions_on_every_boot: false, + }, + network: InstanceNetworkConfig::default(), + infiniband: InstanceInfinibandConfig::default(), + network_security_group_id: None, + extension_services: InstanceExtensionServicesConfig::default(), + nvlink: InstanceNvLinkConfig::default(), + } + } + + #[test] + fn repair_merge_active_yields_repairing_via_status_pipeline() { + let config = minimal_instance_config(); + let version = ConfigVersion::initial(); + let mut health = HealthReportSources::default(); + health.merges.insert( + REPAIR_REQUEST_MERGE_SOURCE.to_string(), + HealthReport { + source: REPAIR_REQUEST_MERGE_SOURCE.to_string(), + ..Default::default() + }, + ); + + let status = instance_status_from_config_and_observation( + HashMap::new(), + Versioned::new(&config, version.clone()), + Versioned::new(&config.network, version.clone()), + Versioned::new(&config.infiniband, version.clone()), + Versioned::new(&config.extension_services, version.clone()), + Versioned::new(&config.nvlink, version), + &InstanceStatusObservations { + network: HashMap::new(), + extension_services: HashMap::new(), + phone_home_last_contact: None, + }, + ManagedHostState::Assigned { + instance_state: InstanceState::Ready, + }, + false, + None, + None, + None, + false, + &health, + ) + .unwrap(); + + assert_eq!( + status.tenant.as_ref().map(|t| t.state), + Some(TenantState::Repairing) + ); + } #[test] fn test_tenant_state() { @@ -212,6 +294,7 @@ mod tests { false, None, false, + false, ) .unwrap(), TenantState::Invalid diff --git a/crates/api-model/src/rpc_conv/instance/status/tenant.rs b/crates/api-model/src/rpc_conv/instance/status/tenant.rs index 430dbe7a2f..cbe6974c35 100644 --- a/crates/api-model/src/rpc_conv/instance/status/tenant.rs +++ b/crates/api-model/src/rpc_conv/instance/status/tenant.rs @@ -21,13 +21,19 @@ use crate::instance::status::SyncState; use crate::instance::status::tenant::{InstanceTenantStatus, TenantState}; use crate::machine::{InstanceState, ManagedHostState}; -/// Tries to convert Machine state to tenant state. +/// Converts machine state into the tenant-visible [`TenantState`]. +/// +/// When `repair_active` is true, [`TenantState::Repairing`] is returned only if the +/// instance would otherwise be tenant-ready (`InstanceState::Ready` with synced configs +/// and extension services ready). It does not override Failed, Updating, Configuring, +/// Provisioning, or Terminating. pub fn instance_status_tenant_state( machine_state: ManagedHostState, configs_synced: SyncState, phone_home_enrolled: bool, phone_home_last_contact: Option>, extension_services_ready: bool, + repair_active: bool, ) -> Result { // At this point, we are sure that instance is created. // If machine state is still ready, means state machine has not processed this instance @@ -60,7 +66,8 @@ pub fn instance_status_tenant_state( (false, _, false) => TenantState::Configuring, // If there is no pending phone-home and extension services are ready, - // return Ready (this was the default before phone_home) + // the instance is tenant-ready; surface online repair only in this case. + (false, SyncState::Synced, true) if repair_active => TenantState::Repairing, (false, SyncState::Synced, true) => TenantState::Ready, // If there is a pending phone-home, we're still @@ -124,6 +131,126 @@ impl TryFrom for rpc::TenantState { TenantState::HostReprovisioning => rpc::TenantState::HostReprovisioning, TenantState::Updating => rpc::TenantState::Updating, TenantState::Invalid => rpc::TenantState::Invalid, + TenantState::Repairing => rpc::TenantState::Repairing, }) } } + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::str::FromStr; + + use super::*; + use crate::health::HealthReportSources; + use crate::instance::status::SyncState; + use crate::machine::{ + DpuReprovisionStates, FailureCause, FailureDetails, FailureSource, InstanceState, + ManagedHostState, + }; + use carbide_uuid::machine::MachineId; + use chrono::Utc; + use health_report::{HealthReport, REPAIR_REQUEST_MERGE_SOURCE}; + + #[test] + fn repair_merge_active_detects_merge_sources() { + let mut health = HealthReportSources::default(); + assert!(!health.repair_merge_active()); + health.merges.insert( + REPAIR_REQUEST_MERGE_SOURCE.to_string(), + HealthReport { + source: REPAIR_REQUEST_MERGE_SOURCE.to_string(), + ..Default::default() + }, + ); + assert!(health.repair_merge_active()); + } + + #[test] + fn repair_merge_tenant_state_precedence() { + let machine_id = + MachineId::from_str("fm100htjtiaehv1n5vh67tbmqq4eabcjdng40f7jupsadbedhruh6rag1l0") + .unwrap(); + let failed = InstanceState::Failed { + details: FailureDetails { + cause: FailureCause::NoError, + failed_at: Utc::now(), + source: FailureSource::StateMachine, + }, + machine_id, + }; + + struct Case { + name: &'static str, + machine_state: ManagedHostState, + configs_synced: SyncState, + repair_active: bool, + expected: TenantState, + } + + let cases = [ + Case { + name: "tenant-ready with repair merge", + machine_state: ManagedHostState::Assigned { + instance_state: InstanceState::Ready, + }, + configs_synced: SyncState::Synced, + repair_active: true, + expected: TenantState::Repairing, + }, + Case { + name: "terminating with repair merge", + machine_state: ManagedHostState::Assigned { + instance_state: InstanceState::SwitchToAdminNetwork, + }, + configs_synced: SyncState::Synced, + repair_active: true, + expected: TenantState::Terminating, + }, + Case { + name: "reprovision with repair merge", + machine_state: ManagedHostState::Assigned { + instance_state: InstanceState::DPUReprovision { + dpu_states: DpuReprovisionStates { + states: HashMap::new(), + }, + }, + }, + configs_synced: SyncState::Synced, + repair_active: true, + expected: TenantState::Updating, + }, + Case { + name: "configuring with repair merge", + machine_state: ManagedHostState::Assigned { + instance_state: InstanceState::Ready, + }, + configs_synced: SyncState::Pending, + repair_active: true, + expected: TenantState::Configuring, + }, + Case { + name: "failed with repair merge", + machine_state: ManagedHostState::Assigned { + instance_state: failed, + }, + configs_synced: SyncState::Synced, + repair_active: true, + expected: TenantState::Failed, + }, + ]; + + for case in cases { + let state = instance_status_tenant_state( + case.machine_state, + case.configs_synced, + false, + None, + true, + case.repair_active, + ) + .unwrap_or_else(|_| panic!("case {:?} failed conversion", case.name)); + assert_eq!(state, case.expected, "case: {}", case.name); + } + } +} diff --git a/crates/api-model/src/rpc_conv/machine/mod.rs b/crates/api-model/src/rpc_conv/machine/mod.rs index a40bded8e9..46f269af58 100644 --- a/crates/api-model/src/rpc_conv/machine/mod.rs +++ b/crates/api-model/src/rpc_conv/machine/mod.rs @@ -85,6 +85,7 @@ impl TryFrom for Option { .infiniband_status_observation .as_ref(), snapshot.host_snapshot.nvlink_status_observation.as_ref(), + &snapshot.host_snapshot.health_reports, )?; Ok(Some(rpc::Instance { diff --git a/crates/api/src/handlers/instance.rs b/crates/api/src/handlers/instance.rs index a8cc8435f7..0187238b91 100644 --- a/crates/api/src/handlers/instance.rs +++ b/crates/api/src/handlers/instance.rs @@ -329,7 +329,7 @@ fn create_tenant_reported_issue_override( /// Creates a RequestRepair health override template fn create_request_repair_override(issue: &rpc::Issue) -> HealthReport { HealthReport { - source: "repair-request".to_string(), + source: health_report::REPAIR_REQUEST_MERGE_SOURCE.to_string(), observed_at: Some(chrono::Utc::now()), alerts: vec![HealthProbeAlert { id: HealthProbeId::from_str("RequestRepair") @@ -442,7 +442,10 @@ async fn handle_instance_release_from_repair_tenant( machine: &model::machine::Machine, tenant_organization_id: &str, ) -> Result<(), CarbideError> { - let has_request_repair = machine.health_reports.merges.contains_key("repair-request"); + let has_request_repair = machine + .health_reports + .merges + .contains_key(health_report::REPAIR_REQUEST_MERGE_SOURCE); if !has_request_repair { // No existing RequestRepair override @@ -488,7 +491,7 @@ async fn handle_instance_release_from_repair_tenant( remove_health_override( txn, machine_id, - "repair-request", + health_report::REPAIR_REQUEST_MERGE_SOURCE, "RequestRepair removed - repair completed successfully", ) .await?; @@ -529,7 +532,7 @@ async fn handle_instance_release_from_repair_tenant( remove_health_override( txn, machine_id, - "repair-request", + health_report::REPAIR_REQUEST_MERGE_SOURCE, "RequestRepair removed for incomplete repair", ) .await?; diff --git a/crates/api/src/tests/dpu_reprovisioning.rs b/crates/api/src/tests/dpu_reprovisioning.rs index 3252369103..b4c9d414b0 100644 --- a/crates/api/src/tests/dpu_reprovisioning.rs +++ b/crates/api/src/tests/dpu_reprovisioning.rs @@ -618,7 +618,8 @@ async fn assert_reprov_tenant_state( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant diff --git a/crates/api/src/tests/host_bmc_firmware_test.rs b/crates/api/src/tests/host_bmc_firmware_test.rs index 18f90c5390..8de845d3df 100644 --- a/crates/api/src/tests/host_bmc_firmware_test.rs +++ b/crates/api/src/tests/host_bmc_firmware_test.rs @@ -1253,7 +1253,8 @@ async fn test_instance_upgrading_actual_part_2( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant @@ -1292,7 +1293,8 @@ async fn test_instance_upgrading_actual_part_2( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant @@ -1341,7 +1343,8 @@ async fn test_instance_upgrading_actual_part_2( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant @@ -1387,7 +1390,8 @@ async fn test_instance_upgrading_actual_part_2( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant @@ -1423,7 +1427,8 @@ async fn test_instance_upgrading_actual_part_2( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant @@ -1481,7 +1486,8 @@ async fn test_instance_upgrading_actual_part_2( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant @@ -1517,7 +1523,8 @@ async fn test_instance_upgrading_actual_part_2( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant @@ -1565,7 +1572,8 @@ async fn test_instance_upgrading_actual_part_2( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant @@ -1633,7 +1641,8 @@ async fn test_instance_upgrading_actual_part_2( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant @@ -1697,7 +1706,8 @@ async fn test_instance_upgrading_actual_part_2( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant @@ -1733,7 +1743,8 @@ async fn test_instance_upgrading_actual_part_2( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant @@ -1767,7 +1778,8 @@ async fn test_instance_upgrading_actual_part_2( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant @@ -1796,7 +1808,8 @@ async fn test_instance_upgrading_actual_part_2( host.state.clone().value, None, None, - None + None, + &host.health_reports, ) .unwrap() .tenant diff --git a/crates/api/src/tests/instance.rs b/crates/api/src/tests/instance.rs index 2028bc1c58..328524e7d4 100644 --- a/crates/api/src/tests/instance.rs +++ b/crates/api/src/tests/instance.rs @@ -5055,7 +5055,7 @@ async fn test_instance_release_backward_compatibility(_: PgPoolOptions, options: !host_machine .health_reports .merges - .contains_key("repair-request"), + .contains_key(health_report::REPAIR_REQUEST_MERGE_SOURCE), "Backward compatibility: RequestRepair override should NOT be applied without issue field" ); @@ -5164,7 +5164,7 @@ async fn test_instance_release_repair_tenant(_: PgPoolOptions, options: PgConnec let has_repair_request_override = host_machine .health_reports .merges - .contains_key("repair-request"); + .contains_key(health_report::REPAIR_REQUEST_MERGE_SOURCE); assert!( !has_tenant_reported_override, @@ -5262,7 +5262,7 @@ async fn test_instance_release_combined_enhancements(_: PgPoolOptions, options: let has_repair_request_override = host_machine .health_reports .merges - .contains_key("repair-request"); + .contains_key(health_report::REPAIR_REQUEST_MERGE_SOURCE); assert!( !has_repair_request_override, @@ -5456,14 +5456,18 @@ async fn test_instance_release_auto_repair_enabled(_: PgPoolOptions, options: Pg host_machine .health_reports .merges - .contains_key("repair-request"), + .contains_key(health_report::REPAIR_REQUEST_MERGE_SOURCE), "Should have RequestRepair override when auto-repair is enabled" ); // 4. Verify the RequestRepair override content - let repair_override = &host_machine.health_reports.merges["repair-request"]; + let repair_override = + &host_machine.health_reports.merges[health_report::REPAIR_REQUEST_MERGE_SOURCE]; let repair_report: health_report::HealthReport = repair_override.clone(); - assert_eq!(repair_report.source, "repair-request"); + assert_eq!( + repair_report.source, + health_report::REPAIR_REQUEST_MERGE_SOURCE + ); assert_eq!(repair_report.alerts.len(), 1); assert_eq!(repair_report.alerts[0].id.to_string(), "RequestRepair"); assert!( diff --git a/crates/api/src/tests/machine_health.rs b/crates/api/src/tests/machine_health.rs index 31296f3ba7..ec103142dc 100644 --- a/crates/api/src/tests/machine_health.rs +++ b/crates/api/src/tests/machine_health.rs @@ -1136,7 +1136,7 @@ async fn test_request_repair_health_override_template( // Create a RequestRepair health override using the API let repair_request_override = health_report::HealthReport { - source: "repair-request".to_string(), + source: health_report::REPAIR_REQUEST_MERGE_SOURCE.to_string(), triggered_by: None, observed_at: Some(chrono::Utc::now()), successes: vec![], @@ -1174,7 +1174,10 @@ async fn test_request_repair_health_override_template( machine.health_sources[1].mode, HealthReportApplyMode::Merge as i32 ); - assert_eq!(machine.health_sources[1].source, "repair-request"); + assert_eq!( + machine.health_sources[1].source, + health_report::REPAIR_REQUEST_MERGE_SOURCE + ); // Verify aggregate health includes the override let aggregate_health = aggregate(machine).unwrap(); @@ -1232,7 +1235,7 @@ async fn test_tenant_reported_issue_and_request_repair_combined( }; let repair_request_override = health_report::HealthReport { - source: "repair-request".to_string(), + source: health_report::REPAIR_REQUEST_MERGE_SOURCE.to_string(), triggered_by: None, observed_at: Some(chrono::Utc::now()), successes: vec![], @@ -1278,7 +1281,7 @@ async fn test_tenant_reported_issue_and_request_repair_combined( .map(|o| o.source.clone()) .collect(); assert!(sources.contains(&"tenant-reported-issue".to_string())); - assert!(sources.contains(&"repair-request".to_string())); + assert!(sources.contains(&health_report::REPAIR_REQUEST_MERGE_SOURCE.to_string())); // All should be merge mode for override_entry in &machine.health_sources { diff --git a/crates/health-report/src/lib.rs b/crates/health-report/src/lib.rs index 2f3ba83068..73ddca0bfc 100644 --- a/crates/health-report/src/lib.rs +++ b/crates/health-report/src/lib.rs @@ -21,6 +21,11 @@ use std::str::FromStr; use serde::{Deserialize, Serialize}; +/// `HealthReportSources::merges` key for the auto-repair (`RequestRepair`) override. +pub const REPAIR_REQUEST_MERGE_SOURCE: &str = "repair-request"; +/// `HealthReportSources::merges` key for online repair gating (`RequestOnlineRepair` override). +pub const REQUEST_ONLINE_REPAIR_MERGE_SOURCE: &str = "request-online-repair"; + /// Reports the aggregate health of a system or subsystem #[derive(PartialEq, Eq, Debug, Clone, Serialize, Deserialize)] pub struct HealthReport { @@ -763,13 +768,13 @@ mod tests { // Shape matches admin-cli `HealthReportTemplates::RequestOnlineRepair` (merge source // `request-online-repair`, probe id `RequestOnlineRepair`). let report = HealthReport { - source: "request-online-repair".to_string(), + source: REQUEST_ONLINE_REPAIR_MERGE_SOURCE.to_string(), triggered_by: None, observed_at: Some(chrono::Utc::now()), successes: vec![], alerts: vec![HealthProbeAlert { id: HealthProbeId::from_str("RequestOnlineRepair").unwrap(), - target: Some("request-online-repair".to_string()), + target: Some(REQUEST_ONLINE_REPAIR_MERGE_SOURCE.to_string()), in_alert_since: None, message: "test".to_string(), tenant_message: None, diff --git a/crates/rpc/proto/forge.proto b/crates/rpc/proto/forge.proto index 1491c20280..f816873772 100644 --- a/crates/rpc/proto/forge.proto +++ b/crates/rpc/proto/forge.proto @@ -1777,6 +1777,8 @@ enum TenantState { // Something Wrong happened at carbide. Check logs for more info. INVALID = 9; + // Instance is undergoing online repair (health override active). + REPAIRING = 10; } // Describe the desired configuration of an IBPartition diff --git a/rest-api/db/pkg/db/model/instance.go b/rest-api/db/pkg/db/model/instance.go index 5c55783fbc..4c447e024a 100644 --- a/rest-api/db/pkg/db/model/instance.go +++ b/rest-api/db/pkg/db/model/instance.go @@ -42,6 +42,8 @@ const ( InstanceStatusReady = "Ready" // InstanceStatusUpdating indicates that the Instance is receiving system updates InstanceStatusUpdating = "Updating" + // InstanceStatusRepairing indicates that the Instance is undergoing online repair on the site + InstanceStatusRepairing = "Repairing" // InstanceStatusError indicates that the Instance provisioning has failed InstanceStatusError = "Error" // InstanceStatusTerminating indicates that the Instance is being terminated @@ -133,6 +135,7 @@ var ( InstanceStatusPending: true, InstanceStatusReady: true, InstanceStatusUpdating: true, + InstanceStatusRepairing: true, InstanceStatusError: true, InstanceStatusConfiguring: true, InstanceStatusProvisioning: true, diff --git a/rest-api/flow/internal/nicoapi/gen/nico.pb.go b/rest-api/flow/internal/nicoapi/gen/nico.pb.go index f33d7159a2..be5c4b3c2b 100644 --- a/rest-api/flow/internal/nicoapi/gen/nico.pb.go +++ b/rest-api/flow/internal/nicoapi/gen/nico.pb.go @@ -372,6 +372,8 @@ const ( TenantState_UPDATING TenantState = 8 // Something Wrong happened at nico. Check logs for more info. TenantState_INVALID TenantState = 9 + // Instance is undergoing online repair on the site. + TenantState_REPAIRING TenantState = 10 ) // Enum value maps for TenantState. @@ -385,8 +387,9 @@ var ( 5: "FAILED", 6: "DPU_REPROVISIONING", 7: "HOST_REPROVISIONING", - 8: "UPDATING", - 9: "INVALID", + 8: "UPDATING", + 9: "INVALID", + 10: "REPAIRING", } TenantState_value = map[string]int32{ "PROVISIONING": 0, @@ -399,6 +402,7 @@ var ( "HOST_REPROVISIONING": 7, "UPDATING": 8, "INVALID": 9, + "REPAIRING": 10, } ) diff --git a/rest-api/flow/internal/nicoapi/nicoproto/nico.proto b/rest-api/flow/internal/nicoapi/nicoproto/nico.proto index bfe7979178..6ad4bb066f 100644 --- a/rest-api/flow/internal/nicoapi/nicoproto/nico.proto +++ b/rest-api/flow/internal/nicoapi/nicoproto/nico.proto @@ -1663,6 +1663,8 @@ enum TenantState { // Something Wrong happened at nico. Check logs for more info. INVALID = 9; + // Instance is undergoing online repair on the site. + REPAIRING = 10; } // Describe the desired configuration of an IBPartition diff --git a/rest-api/openapi/spec.yaml b/rest-api/openapi/spec.yaml index 94c330b7a7..5773efc599 100644 --- a/rest-api/openapi/spec.yaml +++ b/rest-api/openapi/spec.yaml @@ -15605,6 +15605,7 @@ components: - Configuring - Ready - Updating + - Repairing - Rebooting - Terminating - Error diff --git a/rest-api/workflow-schema/schema/site-agent/workflows/v1/nico_nico.pb.go b/rest-api/workflow-schema/schema/site-agent/workflows/v1/nico_nico.pb.go index 815bdfb64a..d86d1cb355 100644 --- a/rest-api/workflow-schema/schema/site-agent/workflows/v1/nico_nico.pb.go +++ b/rest-api/workflow-schema/schema/site-agent/workflows/v1/nico_nico.pb.go @@ -373,6 +373,8 @@ const ( TenantState_UPDATING TenantState = 8 // Something Wrong happened at nico. Check logs for more info. TenantState_INVALID TenantState = 9 + // Instance is undergoing online repair on the site. + TenantState_REPAIRING TenantState = 10 ) // Enum value maps for TenantState. @@ -386,8 +388,9 @@ var ( 5: "FAILED", 6: "DPU_REPROVISIONING", 7: "HOST_REPROVISIONING", - 8: "UPDATING", - 9: "INVALID", + 8: "UPDATING", + 9: "INVALID", + 10: "REPAIRING", } TenantState_value = map[string]int32{ "PROVISIONING": 0, @@ -400,6 +403,7 @@ var ( "HOST_REPROVISIONING": 7, "UPDATING": 8, "INVALID": 9, + "REPAIRING": 10, } ) diff --git a/rest-api/workflow-schema/site-agent/workflows/v1/nico_nico.proto b/rest-api/workflow-schema/site-agent/workflows/v1/nico_nico.proto index ad0a0c857e..8b184bb77a 100644 --- a/rest-api/workflow-schema/site-agent/workflows/v1/nico_nico.proto +++ b/rest-api/workflow-schema/site-agent/workflows/v1/nico_nico.proto @@ -1651,6 +1651,8 @@ enum TenantState { // Something Wrong happened at nico. Check logs for more info. INVALID = 9; + // Instance is undergoing online repair on the site. + REPAIRING = 10; } // Describe the desired configuration of an IBPartition diff --git a/rest-api/workflow/pkg/activity/instance/instance.go b/rest-api/workflow/pkg/activity/instance/instance.go index b62bfec1f6..5c056a5a6b 100644 --- a/rest-api/workflow/pkg/activity/instance/instance.go +++ b/rest-api/workflow/pkg/activity/instance/instance.go @@ -1163,6 +1163,8 @@ func getNICoInstanceStatus(controllerInstanceTenantState cwsv1.TenantState) (str return cdbm.InstanceStatusUpdating, "Instance is receiving system firmware updates" case cwsv1.TenantState_UPDATING: return cdbm.InstanceStatusUpdating, "Instance is receiving system firmware updates" + case cwsv1.TenantState_REPAIRING: + return cdbm.InstanceStatusRepairing, "Instance is undergoing repair on Site" default: return cdbm.InstanceStatusError, "Instance status is unknown" } From 64c6f7ad492c877824e4b756c22b514719d06d4d Mon Sep 17 00:00:00 2001 From: Sunil Kumar Date: Wed, 20 May 2026 09:36:08 +0000 Subject: [PATCH 2/3] fix(api-model): satisfy clippy clone-on-copy in repair status test --- crates/api-model/src/rpc_conv/instance/status.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/api-model/src/rpc_conv/instance/status.rs b/crates/api-model/src/rpc_conv/instance/status.rs index 4bfe7fc17a..ca68ff140f 100644 --- a/crates/api-model/src/rpc_conv/instance/status.rs +++ b/crates/api-model/src/rpc_conv/instance/status.rs @@ -246,10 +246,10 @@ mod tests { let status = instance_status_from_config_and_observation( HashMap::new(), - Versioned::new(&config, version.clone()), - Versioned::new(&config.network, version.clone()), - Versioned::new(&config.infiniband, version.clone()), - Versioned::new(&config.extension_services, version.clone()), + Versioned::new(&config, version), + Versioned::new(&config.network, version), + Versioned::new(&config.infiniband, version), + Versioned::new(&config.extension_services, version), Versioned::new(&config.nvlink, version), &InstanceStatusObservations { network: HashMap::new(), From 445ba59fe6589296cbfac7a483639d2d612992cb Mon Sep 17 00:00:00 2001 From: Sunil Kumar Date: Wed, 20 May 2026 14:59:08 +0000 Subject: [PATCH 3/3] fixed nightly rustfmt errors --- crates/api-model/src/rpc_conv/instance/status/tenant.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/crates/api-model/src/rpc_conv/instance/status/tenant.rs b/crates/api-model/src/rpc_conv/instance/status/tenant.rs index cbe6974c35..2922d6e2f3 100644 --- a/crates/api-model/src/rpc_conv/instance/status/tenant.rs +++ b/crates/api-model/src/rpc_conv/instance/status/tenant.rs @@ -141,6 +141,10 @@ mod tests { use std::collections::HashMap; use std::str::FromStr; + use carbide_uuid::machine::MachineId; + use chrono::Utc; + use health_report::{HealthReport, REPAIR_REQUEST_MERGE_SOURCE}; + use super::*; use crate::health::HealthReportSources; use crate::instance::status::SyncState; @@ -148,9 +152,6 @@ mod tests { DpuReprovisionStates, FailureCause, FailureDetails, FailureSource, InstanceState, ManagedHostState, }; - use carbide_uuid::machine::MachineId; - use chrono::Utc; - use health_report::{HealthReport, REPAIR_REQUEST_MERGE_SOURCE}; #[test] fn repair_merge_active_detects_merge_sources() {