Skip to content

Commit 4c5e987

Browse files
added eviction-metric
1 parent f17462e commit 4c5e987

4 files changed

Lines changed: 106 additions & 0 deletions

File tree

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and cobaltcore-dev contributors
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
# Alerting rules for the lifecycle of Eviction custom resources.
# Every expression reads the kube_customresource_eviction_* series.
groups:
  - name: evictionLifecycle
    rules:
      # Evicting condition has reported reason=Failed for 5 minutes.
      - alert: EvictionFailed
        expr: |
          kube_customresource_eviction_condition{condition="Evicting", reason="Failed"} == 1
        for: 5m
        labels:
          severity: warning
          type: hypervisor_operator
        annotations:
          summary: 'Eviction {{ $labels.name }} has failed'
          description: >-
            The eviction {{ $labels.name }} for hypervisor
            {{ $labels.hypervisor }} has reached a terminal failure state.
            Manual intervention is required — check if the hypervisor exists
            in OpenStack.

      # MigratingInstance=Failed while the same eviction (matched on `name`)
      # is still Evicting/Running, sustained for 1 hour.
      - alert: EvictionMigrationFailing
        expr: |
          kube_customresource_eviction_condition{condition="MigratingInstance", reason="Failed"} == 1
          and on (name)
          kube_customresource_eviction_condition{condition="Evicting", reason="Running"} == 1
        for: 1h
        labels:
          severity: warning
          type: hypervisor_operator
        annotations:
          summary: 'Eviction {{ $labels.name }} has failing instance migrations for over 1 hour'
          description: >-
            The eviction {{ $labels.name }} has had MigratingInstance=Failed
            for more than 1 hour while still running. Instances may be in
            ERROR state, blocking eviction progress.

      # Outstanding RAM stays above zero while the same eviction (matched on
      # `name`) is Evicting/Running, sustained for 6 hours. $value is the
      # left-hand gauge, i.e. the outstanding RAM in MB.
      - alert: EvictionOutstandingRamHigh
        expr: |
          kube_customresource_eviction_outstanding_ram_mb > 0
          and on (name)
          kube_customresource_eviction_condition{condition="Evicting", reason="Running"} == 1
        for: 6h
        labels:
          severity: warning
          type: hypervisor_operator
        annotations:
          summary: 'Eviction {{ $labels.name }} has outstanding RAM for over 6 hours'
          description: >-
            The eviction {{ $labels.name }} has had {{ $value }}MB of
            outstanding RAM for more than 6 hours. Check for stuck
            live-migrations or instances that cannot be moved.
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and cobaltcore-dev contributors
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
{{- if .Values.customResourceMetrics.create }}
{{- $disabled := .Values.customResourceMetrics.disabled | default dict }}
{{- if not (index $disabled "eviction") }}
# kube-state-metrics custom-resource-state configuration for Eviction objects.
# Rendered only when customResourceMetrics.create is set and
# customResourceMetrics.disabled.eviction is not truthy.
apiVersion: v1
kind: ConfigMap
metadata:
  # Quoted so that an all-digit release name is not rendered as a YAML integer.
  name: {{ printf "%s-eviction-resources" .Release.Name | trunc 63 | trimSuffix "-" | quote }}
  labels:
    dev.custom.kube-state-metrics: "true"
{{ include "openstack-hypervisor-operator.labels" . | indent 4 }}
data:
  eviction-metrics.yaml: |
    spec:
      resources:
        - groupVersionKind:
            group: kvm.cloud.sap
            version: v1
            kind: Eviction
          # Resource-level labels are attached to every metric below.
          # `hypervisor` is exposed here (not only on eviction_info) because
          # the EvictionFailed alert reads {{ "{{ $labels.hypervisor }}" }} from the
          # eviction_condition series, which would otherwise render empty.
          labelsFromPath:
            name: [metadata, name]
            hypervisor: [spec, hypervisor]
          metrics:
            - name: eviction_info
              help: "Info metric for eviction with hypervisor and reason labels (always 1)"
              each:
                type: Info
                info:
                  labelsFromPath:
                    hypervisor: [spec, hypervisor]
                    reason: [spec, reason]
            - name: eviction_outstanding_ram_mb
              help: "Outstanding RAM in MB to be migrated"
              each:
                type: Gauge
                gauge:
                  path: [status, outstandingRamMb]
                  # A missing status field is reported as 0.
                  nilIsZero: true
            - name: eviction_condition
              help: "Eviction condition status (1=True, 0=False)"
              each:
                type: Gauge
                gauge:
                  # One series per entry of status.conditions, labelled with
                  # the condition type and its reason.
                  path: [status, conditions]
                  labelsFromPath:
                    condition: [type]
                    reason: [reason]
                  valueFrom: [status]
                  booleanTrue: "True"
                  booleanFalse: "False"
{{- end }}
{{- end }}

charts/openstack-hypervisor-operator/templates/custom-resource-metrics.yaml renamed to charts/openstack-hypervisor-operator/templates/metrics-hypervisor-cm.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
# SPDX-License-Identifier: Apache-2.0
33

44
{{- if .Values.customResourceMetrics.create }}
5+
{{- $disabled := .Values.customResourceMetrics.disabled | default dict }}
6+
{{- if not (index $disabled "hypervisor") }}
57
apiVersion: v1
68
kind: ConfigMap
79
metadata:
@@ -88,3 +90,4 @@ data:
8890
booleanFalse: "false"
8991
nilIsZero: true
9092
{{- end }}
93+
{{- end }}

charts/openstack-hypervisor-operator/values.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,9 @@ prometheusRules:
6767
# disabled:
6868
# HypervisorOnboardingStuck: true
6969
# HypervisorEvictedTooLong: true
70+
# EvictionFailed: true
71+
# EvictionMigrationFailing: true
72+
# EvictionOutstandingRamHigh: true
7073
disabled: {}
7174

7275
dashboards:
@@ -80,3 +83,8 @@ dashboards:
8083

8184
customResourceMetrics:
8285
create: true
86+
# Disable individual custom resource metrics ConfigMaps by name:
87+
# disabled:
88+
# hypervisor: true
89+
# eviction: true
90+
disabled: {}

0 commit comments

Comments
 (0)