|
2 | 2 | # SPDX-License-Identifier: Apache-2.0 |
3 | 3 |
|
4 | 4 | groups: |
5 | | -- name: hypervisorOperator |
| 5 | +- name: hypervisorLifecycle |
6 | 6 | rules: |
7 | | - - alert: HypervisorNotReady |
| 7 | + - alert: HypervisorOnboardingStuck |
8 | 8 | expr: | |
9 | | - kube_customresource_hypervisor_condition{condition="Ready"} == 0 |
10 | | - for: 10m |
| 9 | + kube_customresource_hypervisor_condition{condition="Onboarding"} == 1 |
| 10 | + for: 1h |
11 | 11 | labels: |
12 | | - severity: critical |
| 12 | + severity: warning |
13 | 13 | type: hypervisor_operator |
14 | 14 | annotations: |
15 | | - summary: "Hypervisor {{`{{ $labels.name }}`}} is not ready" |
16 | | - description: "The hypervisor {{`{{ $labels.name }}`}} in zone {{`{{ $labels.zone }}`}} has been in a non-ready state for more than 10 minutes." |
| 15 | + summary: "Hypervisor {{ $labels.name }} onboarding stuck for over 1 hour" |
| 16 | + description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has been onboarding for more than 1 hour. Check nova registration, test VM status, or trait/aggregate sync." |
17 | 17 |
|
18 | | - - alert: HypervisorConditionDegraded |
| 18 | + - alert: HypervisorEvictionStuck |
19 | 19 | expr: | |
20 | | - kube_customresource_hypervisor_condition{condition!="Ready"} == 0 |
21 | | - for: 10m |
| 20 | + kube_customresource_hypervisor_condition{condition="Evicting"} == 1 |
| 21 | + for: 4h |
22 | 22 | labels: |
23 | 23 | severity: warning |
24 | 24 | type: hypervisor_operator |
25 | 25 | annotations: |
26 | | - summary: "Hypervisor {{`{{ $labels.name }}`}} condition {{`{{ $labels.condition }}`}} is degraded" |
27 | | - description: "The hypervisor {{`{{ $labels.name }}`}} in zone {{`{{ $labels.zone }}`}} has condition {{`{{ $labels.condition }}`}} in a False state for more than 10 minutes. Reason: {{`{{ $labels.reason }}`}}." |
| 26 | + summary: "Hypervisor {{ $labels.name }} eviction running for over 4 hours" |
| 27 | + description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has had an active eviction for more than 4 hours. Check for stuck live-migrations or failed VMs." |
28 | 28 |
|
29 | | - - alert: HypervisorEvicted |
| 29 | + - alert: HypervisorEvictedTooLong |
30 | 30 | expr: | |
31 | 31 | kube_customresource_hypervisor_evicted == 1 |
32 | | - for: 2d |
| 32 | + unless on (name) |
| 33 | + kube_customresource_hypervisor_condition{condition="Offboarded"} == 1 |
| 34 | + for: 7d |
| 35 | + labels: |
| 36 | + severity: info |
| 37 | + type: hypervisor_operator |
| 38 | + annotations: |
| 39 | + summary: "Hypervisor {{ $labels.name }} has been evicted for over 7 days" |
| 40 | + description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has been evicted for more than 7 days without being offboarded. Consider re-enabling or decommissioning." |
| 41 | + |
| 42 | +- name: hypervisorSync |
| 43 | + rules: |
| 44 | + - alert: HypervisorTraitSyncFailed |
| 45 | + expr: | |
| 46 | + kube_customresource_hypervisor_condition{condition="TraitsUpdated"} == 0 |
| 47 | + and on (name) |
| 48 | + kube_customresource_hypervisor_condition{condition="Onboarding"} == 0 |
| 49 | + for: 30m |
33 | 50 | labels: |
34 | 51 | severity: warning |
35 | 52 | type: hypervisor_operator |
36 | 53 | annotations: |
37 | | - summary: "Hypervisor {{`{{ $labels.name }}`}} has been evicted" |
38 | | - description: "The hypervisor {{`{{ $labels.name }}`}} in zone {{`{{ $labels.zone }}`}} has been in an evicted state for more than 2 days." |
| 54 | + summary: "Hypervisor {{ $labels.name }} trait sync has been failing" |
| 55 | + description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has had TraitsUpdated=False for more than 30 minutes outside of onboarding. Check OpenStack Placement API connectivity." |
39 | 56 |
|
| 57 | + - alert: HypervisorAggregateSyncFailed |
| 58 | + expr: | |
| 59 | + kube_customresource_hypervisor_condition{condition="AggregatesUpdated"} == 0 |
| 60 | + and on (name) |
| 61 | + kube_customresource_hypervisor_condition{condition="Onboarding"} == 0 |
| 62 | + unless on (name) |
| 63 | + kube_customresource_hypervisor_condition{condition="Evicting"} == 1 |
| 64 | + for: 30m |
| 65 | + labels: |
| 66 | + severity: warning |
| 67 | + type: hypervisor_operator |
| 68 | + annotations: |
| 69 | + summary: "Hypervisor {{ $labels.name }} aggregate sync has been failing" |
| 70 | + description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has had AggregatesUpdated=False for more than 30 minutes outside of onboarding and eviction. Check OpenStack Nova API connectivity." |
| 71 | + |
| 72 | +- name: hypervisorOperatorHealth |
| 73 | + rules: |
40 | 74 | - alert: HypervisorOperatorReconcileErrors |
41 | 75 | expr: | |
42 | | - rate(controller_runtime_reconcile_errors_total[5m]) > 0 |
| 76 | + rate(controller_runtime_reconcile_errors_total[5m]) > 0.01 |
43 | 77 | for: 15m |
44 | 78 | labels: |
45 | 79 | severity: warning |
46 | 80 | type: hypervisor_operator |
47 | 81 | annotations: |
48 | | - summary: "Hypervisor operator controller {{`{{ $labels.controller }}`}} has persistent reconcile errors" |
49 | | - description: "The controller {{`{{ $labels.controller }}`}} has been producing reconciliation errors for more than 15 minutes." |
| 82 | + summary: "Hypervisor operator controller {{ $labels.controller }} has persistent reconcile errors" |
| 83 | + description: "The controller {{ $labels.controller }} has been producing sustained reconciliation errors for more than 15 minutes." |
50 | 84 |
|
51 | 85 | - alert: HypervisorOperatorDown |
52 | 86 | expr: | |
|
0 commit comments