diff --git a/Cargo.lock b/Cargo.lock index e83da904c92..8c6b5b9def7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7157,6 +7157,7 @@ dependencies = [ "anyhow", "chrono", "ereport-types", + "expectorate", "iddqd", "nexus-reconfigurator-planning", "nexus-types", @@ -7164,10 +7165,12 @@ dependencies = [ "omicron-uuid-kinds", "omicron-workspace-hack", "rand 0.9.2", + "schemars 0.8.22", "serde", "serde_json", "slog", "slog-error-chain", + "strum 0.27.2", "thiserror 2.0.18", "typed-rng", ] diff --git a/nexus/fm/Cargo.toml b/nexus/fm/Cargo.toml index 5e88e88b88a..c4b916bbcd5 100644 --- a/nexus/fm/Cargo.toml +++ b/nexus/fm/Cargo.toml @@ -38,3 +38,7 @@ omicron-workspace-hack.workspace = true omicron-test-utils.workspace = true nexus-reconfigurator-planning.workspace = true ereport-types.workspace = true +expectorate.workspace = true +schemars.workspace = true +strum.workspace = true +omicron-uuid-kinds = { workspace = true, features = ["schemars08"] } diff --git a/nexus/fm/output/README.md b/nexus/fm/output/README.md new file mode 100644 index 00000000000..c982b139555 --- /dev/null +++ b/nexus/fm/output/README.md @@ -0,0 +1,23 @@ +# FM fact payload guardrail goldens + +These files snapshot the persisted representation of every fault-management +diagnosis engine (DE) fact payload, which is serialized into the opaque +`fm_fact.payload` JSONB column: + +- `fm_fact_payload_schemas.json` — the JSON **schema** of each DE's payload type + (structure: variants, fields, types, and embedded-enum domains). Human-readable + `description` text is stripped before snapshotting, so editing a doc comment + (here or on an embedded type) does not register as a shape change. +- `fm_fact_payload_samples.json` — one obviously-fake serialized **sample** per + payload variant (the exact on-disk bytes). + +They are checked by the `payload_stability` tests in +`nexus/fm/src/diagnosis/mod.rs`. **If a test points you here, you changed the +serialized shape of a persisted fact payload — that is a data migration**, not +just a code change, because deployed databases already hold rows in the old +shape. + +Before regenerating with `EXPECTORATE=overwrite cargo nextest run -p nexus-fm`, +read the "Evolving persisted fact payloads" guidance at the top of +`nexus/fm/src/diagnosis/mod.rs` to decide whether the change is additive +(serde-compatible) or breaking (needs a JSONB data migration). diff --git a/nexus/fm/output/fm_fact_payload_samples.json b/nexus/fm/output/fm_fact_payload_samples.json new file mode 100644 index 00000000000..08dbcdd58e0 --- /dev/null +++ b/nexus/fm/output/fm_fact_payload_samples.json @@ -0,0 +1,10 @@ +{ + "physical_disk::zpool_unhealthy": { + "kind": "zpool_unhealthy", + "last_seen_health": "faulted", + "observed_in_inv": "33333333-3333-3333-3333-333333333333", + "physical_disk_id": "11111111-1111-1111-1111-111111111111", + "time_observed": "2000-01-01T00:00:00Z", + "zpool_id": "22222222-2222-2222-2222-222222222222" + } +} diff --git a/nexus/fm/output/fm_fact_payload_schemas.json b/nexus/fm/output/fm_fact_payload_schemas.json new file mode 100644 index 00000000000..2adb7a28446 --- /dev/null +++ b/nexus/fm/output/fm_fact_payload_schemas.json @@ -0,0 +1,124 @@ +{ + "physical_disk": { + "$schema": "http://json-schema.org/draft-07/schema#", + "definitions": { + "CollectionUuid": { + "format": "uuid", + "type": "string", + "x-rust-type": { + "crate": "omicron-uuid-kinds", + "path": "omicron_uuid_kinds::CollectionUuid", + "version": "*" + } + }, + "PhysicalDiskUuid": { + "format": "uuid", + "type": "string", + "x-rust-type": { + "crate": "omicron-uuid-kinds", + "path": "omicron_uuid_kinds::PhysicalDiskUuid", + "version": "*" + } + }, + "ZpoolHealth": { + "oneOf": [ + { + "enum": [ + "online" + ], + "type": "string" + }, + { + "enum": [ + "degraded" + ], + "type": "string" + }, + { + "enum": [ + "faulted" + ], + "type": "string" + }, + { + "enum": [ + "offline" + ], + "type": "string" + }, + { + "enum": [ + "removed" + ], + "type": "string" + }, + { + "enum": [ + "unavailable" + ], + "type": "string" + } + ] + }, + "ZpoolUuid": { + "format": "uuid", + "type": "string", + "x-rust-type": { + "crate": "omicron-uuid-kinds", + "path": "omicron_uuid_kinds::ZpoolUuid", + "version": "*" + } + } + }, + "oneOf": [ + { + "properties": { + "kind": { + "enum": [ + "zpool_unhealthy" + ], + "type": "string" + }, + "last_seen_health": { + "$ref": "#/definitions/ZpoolHealth" + }, + "observed_in_inv": { + "allOf": [ + { + "$ref": "#/definitions/CollectionUuid" + } + ] + }, + "physical_disk_id": { + "allOf": [ + { + "$ref": "#/definitions/PhysicalDiskUuid" + } + ] + }, + "time_observed": { + "format": "date-time", + "type": "string" + }, + "zpool_id": { + "allOf": [ + { + "$ref": "#/definitions/ZpoolUuid" + } + ] + } + }, + "required": [ + "kind", + "last_seen_health", + "observed_in_inv", + "physical_disk_id", + "time_observed", + "zpool_id" + ], + "type": "object" + } + ], + "title": "DiskFact" + } +} diff --git a/nexus/fm/src/diagnosis/mod.rs b/nexus/fm/src/diagnosis/mod.rs index 9e9f1959c96..16881c4ae8e 100644 --- a/nexus/fm/src/diagnosis/mod.rs +++ b/nexus/fm/src/diagnosis/mod.rs @@ -7,6 +7,83 @@ //! Each submodule defines one diagnosis engine (DE). `analyze` dispatches to //! each engine in turn; engines are deterministic and idempotent per RFD 603, //! so the dispatch order does not matter. +//! +//! # Evolving persisted fact payloads +//! +//! Each DE owns a fact-payload type serialized into the opaque `fm_fact.payload` +//! JSONB column (the owning DE of a fact is found via `fm_case.de`). Deployed +//! databases hold rows in whatever shape was current when they were written. +//! The `payload_stability` tests in this module snapshot every DE's payload (a +//! structural JSON-schema snapshot plus per-variant data samples) and fail +//! loudly when the representation changes. +//! +//! ## Is my change a data migration? +//! +//! A change is **backward-compatible** if and only if every value already on +//! disk still deserializes under the new type. Otherwise it is **breaking** and +//! existing rows must be rewritten. +//! +//! Examples: +//! - **Backward-compatible**: a new `#[serde(tag = "kind")]` variant; a new +//! `Option` or `#[serde(default)]` field; *widening* a field's domain. +//! Importantly, old data can still be parsed. +//! - **Breaking**: renaming or removing a field; changing a field's type or +//! value encoding; *narrowing* a domain; renaming or removing an enum variant +//! that may exist in old rows; changing the `kind` tag. +//! +//! ## Writing the migration +//! +//! Rewrite existing rows with a data-only `UPDATE` in a new +//! `schema/crdb//up1.sql`. As a worked example, renaming the +//! `last_seen_health` key to `health` in the disk DE's `zpool_unhealthy` +//! payload: +//! +//! ```sql +//! UPDATE omicron.public.fm_fact +//! SET payload = +//! -- Change the value of the payload +//! jsonb_set( +//! payload, +//! -- INSERT the new 'health' field name. +//! ARRAY['health'], +//! -- Use the value from the old field +//! payload->'last_seen_health' +//! ) +//! -- REMOVE the old field +//! - 'last_seen_health' +//! -- Scope this change to be specific to this DE, and specific to +//! -- this particular variant. +//! WHERE case_id IN ( +//! SELECT id FROM omicron.public.fm_case WHERE de = 'physical_disk' +//! ) +//! AND payload->>'kind' = 'zpool_unhealthy' +//! -- For idempotency: Only run this query if the payload contains the +//! -- old value. +//! AND payload ? 'last_seen_health'; +//! ``` +//! +//! For a value *re-encoding* rather than a key rename, keep the same skeleton and +//! change only the new value, e.g. wrapping the bare string in an object: +//! +//! ```sql +//! SET payload = jsonb_set( +//! payload, +//! ARRAY['last_seen_health'], +//! jsonb_build_object('state', payload->'last_seen_health') +//! ) +//! ``` +//! +//! Then do the usual schema-version bookkeeping: add the `schema/crdb/` +//! directory, bump `KNOWN_VERSIONS` and `dbinit.sql` (see +//! `schema/crdb/README.adoc`) — even though, from CockroachDB's point of view, +//! the column is unchanged JSONB and only the data changes — add a case to +//! `validate_data_migration()` in `nexus/tests/integration_tests/schema.rs`, and +//! regenerate the goldens. +//! +//! Prefer the additive route when you can: rather than rewriting an existing +//! variant in place, add a *new* `kind` variant for the new shape and migrate old +//! rows to it. The old shape stays readable throughout, which matters because +//! migrations here are offline and all-at-once. use crate::SitrepBuilder; use crate::analysis_input::Input; @@ -45,3 +122,192 @@ pub fn analyze( pub fn known_ereport_classes() -> &'static [&'static str] { &[] } + +/// A diagnosis engine's persisted fact-payload type. Each DE's payload enum +/// implements this so the payload-stability guardrail can snapshot it +/// generically: the DE only declares its engine kind and an obviously-fake +/// sample of every variant, and the guardrail derives the schema and the +/// serialized bytes from the type itself. The impl lives in the DE's own +/// module, so changing a payload stays self-contained there. +#[cfg(test)] +trait FactPayload: serde::Serialize + schemars::JsonSchema + Sized { + /// The engine that owns this payload. + const DE: nexus_types::fm::DiagnosisEngineKind; + + /// One obviously-fake sample per variant. The guardrail labels each sample + /// by the serde `kind` tag it serializes to, so there is no separate name + /// to keep in sync. Implementations should `match` exhaustively over their + /// variants, so adding a variant forces a sample to be added here + /// (otherwise it fails to compile). + fn samples() -> Vec; +} + +/// Guardrail tests snapshotting the serialized representation of every persisted +/// DE fact payload. See the module-level "Evolving persisted fact payloads" +/// docs above before regenerating any golden file: a diff here means a +/// persisted payload changed shape, which is a data migration. +#[cfg(test)] +mod payload_stability { + use super::FactPayload; + use nexus_types::fm::DiagnosisEngineKind; + use std::collections::BTreeMap; + + /// The type-erased snapshot of one DE's payload, produced by + /// `snapshot_payload`. Erasing `T` lets `registered` collect heterogeneous + /// DE payload types into one `Vec`. + struct FactPayloadSnapshot { + de: DiagnosisEngineKind, + schema: schemars::schema::RootSchema, + samples: Vec, + } + + /// One obviously-fake serialized sample of a single payload variant. + struct FactPayloadSample { + /// The serde `kind` tag this variant serializes to: golden key + label. + variant: String, + /// The variant serialized to its on-disk JSON form. + json: serde_json::Value, + } + + /// Strips human-readable `description` text from every node of a schema. + struct StripDescriptions; + impl schemars::visit::Visitor for StripDescriptions { + fn visit_schema_object( + &mut self, + schema: &mut schemars::schema::SchemaObject, + ) { + if let Some(metadata) = schema.metadata.as_mut() { + metadata.description = None; + } + // Recurse into subschemas (properties, definitions, `oneOf`, ...). + schemars::visit::visit_schema_object(self, schema); + } + } + + /// Snapshot one DE's payload type `T`: its schema and the serialized bytes + /// of each sample. Fully generic, so no DE-specific logic lives here. + fn snapshot_payload() -> FactPayloadSnapshot { + use schemars::visit::Visitor; + + let samples = T::samples() + .into_iter() + .map(|fact| { + let json = serde_json::to_value(&fact).unwrap(); + // Label the sample by the serde `kind` tag it serializes to, + // i.e. the bytes that land in `fm_fact.payload`. + let variant = json + .get("kind") + .and_then(|k| k.as_str()) + .expect( + "fact payload must serialize with a string `kind` tag", + ) + .to_string(); + FactPayloadSample { variant, json } + }) + .collect(); + let mut schema = schemars::schema_for!(T); + StripDescriptions.visit_root_schema(&mut schema); + FactPayloadSnapshot { de: T::DE, schema, samples } + } + + /// The single source of truth for which DEs the guardrail covers, and the + /// *only* place that names individual DE payload types: adding a DE is one + /// line here, and changing an existing DE's payload touches only that DE's + /// own module (its `FactPayload` impl). + fn registered() -> Vec { + vec![ + snapshot_payload::(), + // FUTURE DEs: snapshot_payload::::OtherFact>(), + ] + } + + /// Every `DiagnosisEngineKind` must either register a fact payload in + /// `registered` or be listed in `NO_PAYLOAD`. Because this iterates the enum + /// (rather than naming kinds by hand), a newly added DE kind that does + /// neither fails this test, instead of silently going uncovered by the + /// guardrail. + #[test] + fn every_de_kind_is_registered_or_exempt() { + use strum::IntoEnumIterator; + + // DE kinds that intentionally have no persisted fact payload (yet). A + // kind belongs here only if it writes no facts; once it gains a payload, + // register it above and remove it from this list. + const NO_PAYLOAD: &[DiagnosisEngineKind] = + &[DiagnosisEngineKind::PowerShelf]; + + let registered_des: Vec = + registered().iter().map(|d| d.de).collect(); + + for de in DiagnosisEngineKind::iter() { + let is_registered = registered_des.contains(&de); + let is_exempt = NO_PAYLOAD.contains(&de); + match (is_registered, is_exempt) { + (true, false) | (false, true) => {} + (false, false) => panic!( + "diagnosis engine `{de}` has no registered fact payload and \ + is not in NO_PAYLOAD: register it in `registered`, or — if \ + it has no persisted facts — add it to the NO_PAYLOAD \ + exemption list", + ), + (true, true) => panic!( + "diagnosis engine `{de}` is both registered and in \ + NO_PAYLOAD: remove it from NO_PAYLOAD", + ), + } + } + } + + /// Structural guardrail: snapshots the JSON schema of every DE payload. A diff + /// here means a field/variant/type changed (including an embedded type owned + /// by another crate). + #[test] + fn fact_payload_schemas_are_stable() { + let mut doc = BTreeMap::new(); + for d in registered() { + if doc + .insert( + d.de.to_string(), + serde_json::to_value(&d.schema).unwrap(), + ) + .is_some() + { + panic!( + "diagnosis engine `{}` registered more than one fact payload \ + type; we currently expect DEs to only register one fact type. \ + Consider using an enum variant.", + d.de, + ); + } + } + let serialized = serde_json::to_string_pretty(&doc).unwrap() + "\n"; + expectorate::assert_contents( + "output/fm_fact_payload_schemas.json", + &serialized, + ); + } + + /// Sample stability guardrail: snapshots the exact on-disk JSON for one + /// sample of every payload variant. A diff here means the serialized + /// contents changed. + #[test] + fn fact_payload_samples_are_stable() { + let mut doc = BTreeMap::new(); + for d in registered() { + for sample in d.samples { + let key = format!("{}::{}", d.de, sample.variant); + if doc.insert(key.clone(), sample.json).is_some() { + panic!( + "duplicate fact payload sample key `{key}`: two \ + variants serialized to the same `de::variant` label", + ); + } + } + } + let serialized = serde_json::to_string_pretty(&doc).unwrap() + "\n"; + expectorate::assert_contents( + "output/fm_fact_payload_samples.json", + &serialized, + ); + } +} diff --git a/nexus/fm/src/diagnosis/physical_disk.rs b/nexus/fm/src/diagnosis/physical_disk.rs index 135433d338f..45f235b006c 100644 --- a/nexus/fm/src/diagnosis/physical_disk.rs +++ b/nexus/fm/src/diagnosis/physical_disk.rs @@ -20,6 +20,16 @@ use std::collections::BTreeMap; /// Per-fact state for the disk diagnosis engine, serialized into the /// `fm_fact.payload` JSONB column. Other diagnosis engines must not /// inspect or modify this; shared FM code treats it as opaque bytes. +/// +/// Its serialized shape is **persisted** in `fm_fact.payload`, so changing it +/// is a data migration. The payload-stability tests in +/// [`super`](crate::diagnosis) snapshot this representation and will fail loudly +/// if it changes; see the migration guidance there before editing. +#[cfg_attr(test, derive(schemars::JsonSchema, strum::EnumDiscriminants))] +#[cfg_attr( + test, + strum_discriminants(name(DiskFactKind), derive(strum::EnumIter)) +)] #[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)] #[serde(tag = "kind", rename_all = "snake_case")] pub enum DiskFact { @@ -28,6 +38,7 @@ pub enum DiskFact { } /// Payload of a [`DiskFact::ZpoolUnhealthy`] fact. +#[cfg_attr(test, derive(schemars::JsonSchema))] #[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)] pub struct ZpoolUnhealthyFactPayload { /// The physical disk this fact (and its parent case) is about. @@ -48,6 +59,41 @@ pub struct ZpoolUnhealthyFactPayload { pub time_observed: DateTime, } +/// Snapshots `DiskFact` for the payload-stability guardrail in +/// [`super`](crate::diagnosis). The guardrail derives the schema, serialized +/// bytes, and deserialize round-trip from the type itself; this impl just +/// declares the engine kind and an obviously-fake sample of every variant. +#[cfg(test)] +impl super::FactPayload for DiskFact { + const DE: DiagnosisEngineKind = DiagnosisEngineKind::PhysicalDisk; + + /// The exhaustive `match` over [`DiskFactKind`] forces a sample to be added + /// whenever a new variant is added (otherwise this fails to compile). + fn samples() -> Vec { + use strum::IntoEnumIterator; + DiskFactKind::iter() + .map(|kind| match kind { + DiskFactKind::ZpoolUnhealthy => { + DiskFact::ZpoolUnhealthy(ZpoolUnhealthyFactPayload { + physical_disk_id: + "11111111-1111-1111-1111-111111111111" + .parse() + .unwrap(), + zpool_id: "22222222-2222-2222-2222-222222222222" + .parse() + .unwrap(), + last_seen_health: ZpoolHealth::Faulted, + observed_in_inv: "33333333-3333-3333-3333-333333333333" + .parse() + .unwrap(), + time_observed: "2000-01-01T00:00:00Z".parse().unwrap(), + }) + } + }) + .collect() + } +} + /// A [`DiskFact::ZpoolUnhealthy`] payload paired with the `FactUuid` it /// lives under. Used to build in-memory indices over facts during /// analysis; not serialized. diff --git a/nexus/types/src/fm.rs b/nexus/types/src/fm.rs index 965db247ae1..97ca1375884 100644 --- a/nexus/types/src/fm.rs +++ b/nexus/types/src/fm.rs @@ -216,6 +216,7 @@ pub type CurrentSitrep = Arc<(SitrepVersion, Sitrep)>; serde::Serialize, serde::Deserialize, strum::Display, + strum::EnumIter, )] #[serde(rename_all = "snake_case")] #[strum(serialize_all = "snake_case")]