File tree Expand file tree Collapse file tree
charts/openstack-hypervisor-operator Expand file tree Collapse file tree Original file line number Diff line number Diff line change 1+ # SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and cobaltcore-dev contributors
2+ # SPDX-License-Identifier: Apache-2.0
3+
# Alerting rules for the lifecycle of Eviction custom resources.
#
# All rules read the kube-state-metrics custom-resource series configured by
# this chart's eviction metrics ConfigMap:
#   - kube_customresource_eviction_condition  labels: name, condition, reason
#   - kube_customresource_eviction_info       labels: name, hypervisor, reason
#   - kube_customresource_eviction_outstanding_ram_mb  labels: name
# The condition gauge is documented there as 1 when the condition status is
# "True" and 0 when it is "False".
groups:
  - name: evictionLifecycle
    rules:
      # The "Evicting" condition has carried reason "Failed" for 5 minutes.
      #
      # The condition series has no `hypervisor` label, so it is joined in from
      # the info series; otherwise {{ $labels.hypervisor }} in the description
      # would render as an empty string. `max by` collapses duplicate info
      # series (e.g. from several scrape targets) so the many-to-one match
      # cannot fail on duplicates. The info series has value 1, so the result
      # value stays 1.
      - alert: EvictionFailed
        expr: |
          (kube_customresource_eviction_condition{condition="Evicting", reason="Failed"} == 1)
          * on (name) group_left (hypervisor)
          max by (name, hypervisor) (kube_customresource_eviction_info)
        for: 5m
        labels:
          severity: warning
          type: hypervisor_operator
        annotations:
          summary: "Eviction {{ $labels.name }} has failed"
          description: "The eviction {{ $labels.name }} for hypervisor {{ $labels.hypervisor }} has reached a terminal failure state. Manual intervention is required — check if the hypervisor exists in OpenStack."

      # "MigratingInstance" reports reason "Failed" while the same eviction
      # (matched on `name`) is still "Evicting" with reason "Running", for 1h.
      - alert: EvictionMigrationFailing
        expr: |
          kube_customresource_eviction_condition{condition="MigratingInstance", reason="Failed"} == 1
          and on (name)
          kube_customresource_eviction_condition{condition="Evicting", reason="Running"} == 1
        for: 1h
        labels:
          severity: warning
          type: hypervisor_operator
        annotations:
          summary: "Eviction {{ $labels.name }} has failing instance migrations for over 1 hour"
          description: "The eviction {{ $labels.name }} has had MigratingInstance=Failed for more than 1 hour while still running. Instances may be in ERROR state, blocking eviction progress."

      # A running eviction (matched on `name`) still reports outstanding RAM
      # above zero after 6h. $value in the description is the gauge value of
      # the outstanding-RAM series (left-hand side of `and`).
      - alert: EvictionOutstandingRamHigh
        expr: |
          kube_customresource_eviction_outstanding_ram_mb > 0
          and on (name)
          kube_customresource_eviction_condition{condition="Evicting", reason="Running"} == 1
        for: 6h
        labels:
          severity: warning
          type: hypervisor_operator
        annotations:
          summary: "Eviction {{ $labels.name }} has outstanding RAM for over 6 hours"
          description: "The eviction {{ $labels.name }} has had {{ $value }}MB of outstanding RAM for more than 6 hours. Check for stuck live-migrations or instances that cannot be moved."
Original file line number Diff line number Diff line change 1+ # SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and cobaltcore-dev contributors
2+ # SPDX-License-Identifier: Apache-2.0
3+
{{- /*
  kube-state-metrics custom-resource configuration for Eviction objects
  (kvm.cloud.sap/v1). Rendered only when customResourceMetrics.create is set
  and customResourceMetrics.disabled.eviction is not truthy.
*/ -}}
{{- if .Values.customResourceMetrics.create }}
{{- $disabled := default dict .Values.customResourceMetrics.disabled }}
{{- if not (index $disabled "eviction") }}
apiVersion: v1
kind: ConfigMap
metadata:
  # Release-scoped name, capped at 63 characters with any trailing "-" removed.
  name: {{ .Release.Name | printf "%s-eviction-resources" | trunc 63 | trimSuffix "-" }}
  labels:
    dev.custom.kube-state-metrics: "true"
    {{- include "openstack-hypervisor-operator.labels" . | nindent 4 }}
data:
  # Three metrics are declared, all labelled with the Eviction's metadata.name:
  #   eviction_info               - Info metric with hypervisor/reason from spec
  #   eviction_outstanding_ram_mb - Gauge from status.outstandingRamMb (nil -> 0)
  #   eviction_condition          - Gauge over status.conditions, labelled by
  #                                 condition type and reason
  eviction-metrics.yaml: |
    spec:
      resources:
        - groupVersionKind:
            group: kvm.cloud.sap
            version: v1
            kind: Eviction
          labelsFromPath:
            name: [metadata, name]
          metrics:
            - name: eviction_info
              help: "Info metric for eviction with hypervisor and reason labels (always 1)"
              each:
                type: Info
                info:
                  labelsFromPath:
                    hypervisor: [spec, hypervisor]
                    reason: [spec, reason]
            - name: eviction_outstanding_ram_mb
              help: "Outstanding RAM in MB to be migrated"
              each:
                type: Gauge
                gauge:
                  path: [status, outstandingRamMb]
                  nilIsZero: true
            - name: eviction_condition
              help: "Eviction condition status (1=True, 0=False)"
              each:
                type: Gauge
                gauge:
                  path: [status, conditions]
                  labelsFromPath:
                    condition: [type]
                    reason: [reason]
                  valueFrom: [status]
                  booleanTrue: "True"
                  booleanFalse: "False"
{{- end }}
{{- end }}
Original file line number Diff line number Diff line change 22# SPDX-License-Identifier: Apache-2.0
33
44{{- if .Values.customResourceMetrics.create }}
5+ {{- $disabled := .Values.customResourceMetrics.disabled | default dict }}
6+ {{- if not (index $disabled "hypervisor") }}
57apiVersion : v1
68kind : ConfigMap
79metadata :
8890 booleanFalse: "false"
8991 nilIsZero: true
9092{{- end }}
93+ {{- end }}
Original file line number Diff line number Diff line change @@ -67,6 +67,9 @@ prometheusRules:
6767 # disabled:
6868 # HypervisorOnboardingStuck: true
6969 # HypervisorEvictedTooLong: true
70+ # EvictionFailed: true
71+ # EvictionMigrationFailing: true
72+ # EvictionOutstandingRamHigh: true
7073 disabled : {}
7174
7275dashboards :
@@ -80,3 +83,8 @@ dashboards:
8083
customResourceMetrics:
  # Gates rendering of the custom resource metrics ConfigMaps.
  create: true
  # Opt out of a single ConfigMap by setting its key to true, for example:
  # disabled:
  #   hypervisor: true
  #   eviction: true
  disabled: {}
You can’t perform that action at this time.
0 commit comments