Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 18 additions & 8 deletions crates/admin-cli/src/machine/health_report/cmd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,10 +136,11 @@ pub fn get_health_report(template: HealthReportTemplates, message: Option<String
// (admin machine force-delete is unchanged). Merge source `request-online-repair` is separate
// from `tenant-reported-issue`.
HealthReportTemplates::RequestOnlineRepair => {
report.source = "request-online-repair".to_string();
report.source = health_report::REQUEST_ONLINE_REPAIR_MERGE_SOURCE.to_string();
report.alerts[0].id = HealthProbeId::from_str("RequestOnlineRepair")
.expect("RequestOnlineRepair is a valid non-empty HealthProbeId");
report.alerts[0].target = Some("request-online-repair".to_string());
report.alerts[0].target =
Some(health_report::REQUEST_ONLINE_REPAIR_MERGE_SOURCE.to_string());
report.alerts[0].classifications = vec![
HealthAlertClassification::prevent_allocations(),
HealthAlertClassification::suppress_external_alerting(),
Expand All @@ -150,7 +151,7 @@ pub fn get_health_report(template: HealthReportTemplates, message: Option<String
// Template to indicate that the instance is identified as unhealthy and
// is ready to be picked by Repair System for diagnosis and fix.
HealthReportTemplates::RequestRepair => {
report.source = "repair-request".to_string();
report.source = health_report::REPAIR_REQUEST_MERGE_SOURCE.to_string();
report.alerts[0].id = HealthProbeId::from_str("RequestRepair")
.expect("RequestRepair is a valid non-empty HealthProbeId");
report.alerts[0].target = Some("repair-requested".to_string());
Expand Down Expand Up @@ -261,7 +262,7 @@ mod tests {
Some("Hardware diagnostics indicate memory failure".to_string()),
);

assert_eq!(report.source, "repair-request");
assert_eq!(report.source, health_report::REPAIR_REQUEST_MERGE_SOURCE);
assert_eq!(report.alerts.len(), 1);

let alert = &report.alerts[0];
Expand Down Expand Up @@ -299,7 +300,7 @@ mod tests {
// Builds the RequestRepair template with no message (`None`) and checks that the
// report source matches the repair-request merge source while the single alert's
// message is the empty string.
fn test_request_repair_template_with_empty_message() {
let report = get_health_report(HealthReportTemplates::RequestRepair, None);

// The source is compared against both the literal and the shared constant.
assert_eq!(report.source, "repair-request");
assert_eq!(report.source, health_report::REPAIR_REQUEST_MERGE_SOURCE);
// A missing message is rendered as an empty alert message.
assert_eq!(report.alerts[0].message, "");
}

Expand Down Expand Up @@ -350,15 +351,21 @@ mod tests {
Some("Online repair handoff for stuck repair workflow".to_string()),
);

assert_eq!(report.source, "request-online-repair");
assert_eq!(
report.source,
health_report::REQUEST_ONLINE_REPAIR_MERGE_SOURCE
);
assert_eq!(report.alerts.len(), 1);

let alert = &report.alerts[0];
assert_eq!(
alert.id,
HealthProbeId::from_str("RequestOnlineRepair").unwrap()
);
assert_eq!(alert.target, Some("request-online-repair".to_string()));
assert_eq!(
alert.target,
Some(health_report::REQUEST_ONLINE_REPAIR_MERGE_SOURCE.to_string())
);
assert_eq!(
alert.message,
"Online repair handoff for stuck repair workflow"
Expand Down Expand Up @@ -387,7 +394,10 @@ mod tests {
// Builds the RequestOnlineRepair template with no message (`None`) and checks that
// the report source matches the request-online-repair merge source while the first
// alert's message is the empty string.
fn test_request_online_repair_template_with_empty_message() {
let report = get_health_report(HealthReportTemplates::RequestOnlineRepair, None);

// The source is compared against both the literal and the shared constant.
assert_eq!(report.source, "request-online-repair");
assert_eq!(
report.source,
health_report::REQUEST_ONLINE_REPAIR_MERGE_SOURCE
);
// A missing message is rendered as an empty alert message.
assert_eq!(report.alerts[0].message, "");
}

Expand Down
10 changes: 10 additions & 0 deletions crates/api-model/src/health.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,16 @@ pub struct HealthReportSources {
}

impl HealthReportSources {
/// Reports whether a repair-related merge override is currently present.
///
/// Returns `true` when `merges` holds an entry keyed by either
/// `health_report::REPAIR_REQUEST_MERGE_SOURCE` or
/// `health_report::REQUEST_ONLINE_REPAIR_MERGE_SOURCE`, and `false` otherwise.
pub fn repair_merge_active(&self) -> bool {
    // Checked in this order; the scan stops at the first key that is found.
    let repair_sources = [
        health_report::REPAIR_REQUEST_MERGE_SOURCE,
        health_report::REQUEST_ONLINE_REPAIR_MERGE_SOURCE,
    ];
    repair_sources
        .iter()
        .any(|merge_source| self.merges.contains_key(*merge_source))
}

#[allow(clippy::should_implement_trait)]
pub fn iter(&self) -> impl Iterator<Item = (&HealthReport, HealthReportApplyMode)> {
self.merges
Expand Down
3 changes: 3 additions & 0 deletions crates/api-model/src/instance/status/tenant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ pub enum TenantState {
Failed,
/// Not sure what happened. Check log for more info
Invalid,
/// Instance is undergoing repair while otherwise tenant-ready (see
/// [`crate::rpc_conv::instance::status::tenant::instance_status_tenant_state`]).
Repairing,
}

#[cfg(test)]
Expand Down
3 changes: 3 additions & 0 deletions crates/api-model/src/rpc_conv/instance/snapshot.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use carbide_uuid::machine::MachineId;
use config_version::Versioned;
use rpc::errors::RpcDataConversionError;

use crate::health::HealthReportSources;
use crate::instance::snapshot::InstanceSnapshot;
use crate::instance::status::InstanceStatus;
use crate::machine::infiniband::MachineInfinibandStatusObservation;
Expand All @@ -37,6 +38,7 @@ pub fn instance_snapshot_derive_status(
reprovision_request: Option<ReprovisionRequest>,
ib_status: Option<&MachineInfinibandStatusObservation>,
nvlink_status: Option<&MachineNvLinkStatusObservation>,
host_health: &HealthReportSources,
) -> Result<InstanceStatus, RpcDataConversionError> {
instance_status_from_config_and_observation(
dpu_id_to_device_map,
Expand All @@ -55,5 +57,6 @@ pub fn instance_snapshot_derive_status(
ib_status,
nvlink_status,
snapshot.update_network_config_request.is_some(),
host_health,
)
}
85 changes: 84 additions & 1 deletion crates/api-model/src/rpc_conv/instance/status.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ pub fn instance_status_from_config_and_observation(
ib_status: Option<&MachineInfinibandStatusObservation>,
nvlink_status: Option<&MachineNvLinkStatusObservation>,
is_network_config_request_pending: bool,
host_health: &crate::health::HealthReportSources,
) -> Result<InstanceStatus, RpcDataConversionError> {
let mut instance_config_synced = SyncState::Synced;

Expand Down Expand Up @@ -149,6 +150,7 @@ pub fn instance_status_from_config_and_observation(
instance_config.os.phone_home_enabled,
phone_home_last_contact,
extension_services_ready,
host_health.repair_merge_active(),
)?,
true => {
// If instance deletion was requested, we always confirm the
Expand Down Expand Up @@ -186,11 +188,91 @@ impl TryFrom<SyncState> for rpc::SyncState {

#[cfg(test)]
mod tests {
use std::collections::HashMap;
use std::str::FromStr;

use config_version::ConfigVersion;
use health_report::{HealthReport, REPAIR_REQUEST_MERGE_SOURCE};
use uuid::Uuid;

use super::*;
use crate::health::HealthReportSources;
use crate::instance::config::InstanceConfig;
use crate::instance::config::extension_services::InstanceExtensionServicesConfig;
use crate::instance::config::infiniband::InstanceInfinibandConfig;
use crate::instance::config::network::InstanceNetworkConfig;
use crate::instance::config::nvlink::InstanceNvLinkConfig;
use crate::instance::config::tenant_config::TenantConfig;
use crate::instance::status::InstanceStatusObservations;
use crate::instance::status::tenant::TenantState;
use crate::machine::{DpuReprovisionStates, ManagedHostState, ReprovisionState};
use crate::machine::{DpuReprovisionStates, InstanceState, ManagedHostState, ReprovisionState};
use crate::os::{OperatingSystem, OperatingSystemVariant};
use crate::tenant::TenantOrganizationId;

/// Test fixture: an `InstanceConfig` for tenant organization "TenantA" with an
/// OS-image operating system (nil UUID, phone-home disabled) and every other
/// section left at its default or unset.
fn minimal_instance_config() -> InstanceConfig {
    let organization = TenantOrganizationId::try_from("TenantA".to_string()).unwrap();

    let tenant = TenantConfig {
        tenant_organization_id: organization,
        tenant_keyset_ids: Vec::new(),
        hostname: None,
    };

    let os = OperatingSystem {
        user_data: None,
        variant: OperatingSystemVariant::OsImage(Uuid::nil()),
        phone_home_enabled: false,
        run_provisioning_instructions_on_every_boot: false,
    };

    InstanceConfig {
        tenant,
        os,
        network: InstanceNetworkConfig::default(),
        infiniband: InstanceInfinibandConfig::default(),
        network_security_group_id: None,
        extension_services: InstanceExtensionServicesConfig::default(),
        nvlink: InstanceNvLinkConfig::default(),
    }
}

#[test]
fn repair_merge_active_yields_repairing_via_status_pipeline() {
    // Host health whose only merge entry is keyed by the repair-request source.
    let mut health = HealthReportSources::default();
    let repair_report = HealthReport {
        source: REPAIR_REQUEST_MERGE_SOURCE.to_string(),
        ..Default::default()
    };
    health
        .merges
        .insert(REPAIR_REQUEST_MERGE_SOURCE.to_string(), repair_report);

    // Minimal config at its initial version, with no observations recorded.
    let config = minimal_instance_config();
    let version = ConfigVersion::initial();
    let observations = InstanceStatusObservations {
        network: HashMap::new(),
        extension_services: HashMap::new(),
        phone_home_last_contact: None,
    };
    let host_state = ManagedHostState::Assigned {
        instance_state: InstanceState::Ready,
    };

    let status = instance_status_from_config_and_observation(
        HashMap::new(),
        Versioned::new(&config, version),
        Versioned::new(&config.network, version),
        Versioned::new(&config.infiniband, version),
        Versioned::new(&config.extension_services, version),
        Versioned::new(&config.nvlink, version),
        &observations,
        host_state,
        false,
        None,
        None, // ib_status
        None, // nvlink_status
        false, // is_network_config_request_pending
        &health,
    )
    .unwrap();

    // With the repair merge present, the derived tenant state must be `Repairing`.
    let tenant_state = status.tenant.as_ref().map(|tenant| tenant.state);
    assert_eq!(tenant_state, Some(TenantState::Repairing));
}

#[test]
fn test_tenant_state() {
Expand All @@ -212,6 +294,7 @@ mod tests {
false,
None,
false,
false,
)
.unwrap(),
TenantState::Invalid
Expand Down
Loading
Loading