|
| 1 | +# SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and cobaltcore-dev contributors |
| 2 | +# SPDX-License-Identifier: Apache-2.0 |
| 3 | + |
| 4 | +groups: |
| 5 | +- name: hypervisorOperator # rule group; the rules visible below all set the label type: hypervisor_operator |
| 6 | + rules: |
| 7 | + - alert: HypervisorNotReady # critical alert: the Ready condition metric of a hypervisor equals 0 |
| 8 | + expr: | |
| 9 | + kube_customresource_hypervisor_condition{condition="Ready"} == 0 |
| 10 | + for: 10m # the expression must hold continuously for 10 minutes before firing; keep in sync with the description text |
| 11 | + labels: |
| 12 | + severity: critical |
| 13 | + type: hypervisor_operator |
| 14 | + annotations: # NOTE(review): the backtick wrapping presumably escapes an outer template pass (e.g. Helm) so Prometheus receives the inner label template - confirm |
| 15 | + summary: "Hypervisor {{`{{ $labels.name }}`}} is not ready" |
| 16 | + description: "The hypervisor {{`{{ $labels.name }}`}} in zone {{`{{ $labels.zone }}`}} has been in a non-ready state for more than 10 minutes." |
| 17 | + |
| 18 | + - alert: HypervisorConditionDegraded # warning alert: any condition other than Ready has a metric value of 0 |
| 19 | + expr: | # NOTE(review): matches every non-Ready condition; if some conditions are normally False this will fire for them - confirm the condition set |
| 20 | + kube_customresource_hypervisor_condition{condition!="Ready"} == 0 |
| 21 | + for: 10m # the expression must hold continuously for 10 minutes before firing; keep in sync with the description text |
| 22 | + labels: |
| 23 | + severity: warning |
| 24 | + type: hypervisor_operator |
| 25 | + annotations: # NOTE(review): the description reads a reason label; verify the metric actually exports it, otherwise the text ends with an empty reason |
| 26 | + summary: "Hypervisor {{`{{ $labels.name }}`}} condition {{`{{ $labels.condition }}`}} is degraded" |
| 27 | + description: "The hypervisor {{`{{ $labels.name }}`}} in zone {{`{{ $labels.zone }}`}} has condition {{`{{ $labels.condition }}`}} in a False state for more than 10 minutes. Reason: {{`{{ $labels.reason }}`}}." |
| 28 | + |
| 29 | + - alert: HypervisorEvicted # warning alert: the evicted metric of a hypervisor equals 1 |
| 30 | + expr: | |
| 31 | + kube_customresource_hypervisor_evicted == 1 |
| 32 | + for: 2d # fires only after the evicted value has held for two full days; keep in sync with the description text |
| 33 | + labels: |
| 34 | + severity: warning |
| 35 | + type: hypervisor_operator |
| 36 | + annotations: # NOTE(review): the backtick wrapping presumably escapes an outer template pass (e.g. Helm) so Prometheus receives the inner label template - confirm |
| 37 | + summary: "Hypervisor {{`{{ $labels.name }}`}} has been evicted" |
| 38 | + description: "The hypervisor {{`{{ $labels.name }}`}} in zone {{`{{ $labels.zone }}`}} has been in an evicted state for more than 2 days." |
| 39 | + |
| 40 | + - alert: HypervisorOperatorReconcileErrors # warning alert: a controller of the hypervisor operator keeps failing reconciles |
| 41 | + expr: | # scoped by job so reconcile errors from other controller-runtime operators on the same Prometheus do not raise this alert |
| 42 | + rate(controller_runtime_reconcile_errors_total{job=~".*hypervisor-operator.*"}[5m]) > 0 |
| 43 | + for: 15m # the 5m error rate must stay above zero for 15 minutes, so isolated transient failures do not fire |
| 44 | + labels: |
| 45 | + severity: warning |
| 46 | + type: hypervisor_operator |
| 47 | + annotations: # the controller label comes from the reconcile error counter and names the failing controller |
| 48 | + summary: "Hypervisor operator controller {{`{{ $labels.controller }}`}} has persistent reconcile errors" |
| 49 | + description: "The controller {{`{{ $labels.controller }}`}} has been producing reconciliation errors for more than 15 minutes." |
| 50 | + |
| 51 | + - alert: HypervisorOperatorDown # critical alert: the operator scrape target is failing or has disappeared entirely |
| 52 | + expr: | # first branch: target exists but the scrape fails; absent branch: no matching up series at all, which a plain comparison with 0 can never detect |
| 53 | + up{job=~".*hypervisor-operator.*"} == 0 or absent(up{job=~".*hypervisor-operator.*"}) |
| 54 | + for: 5m # either branch must hold continuously for 5 minutes before firing; keep in sync with the description text |
| 55 | + labels: |
| 56 | + severity: critical |
| 57 | + type: hypervisor_operator |
| 58 | + annotations: # no label templates here: the absent branch yields a series without target labels |
| 59 | + summary: "Hypervisor operator is down" |
| 60 | + description: "The hypervisor operator metrics endpoint has been unreachable for more than 5 minutes." |
0 commit comments