Skip to content

Commit f17462e

Browse files
Replaced alerts and added the possibility to enable/disable individual alerts
1 parent 8163bcb commit f17462e

4 files changed

Lines changed: 86 additions & 28 deletions

File tree

charts/openstack-hypervisor-operator/alerts/operator.yaml

Lines changed: 53 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,51 +2,85 @@
22
# SPDX-License-Identifier: Apache-2.0
33

44
groups:
5-
- name: hypervisorOperator
5+
- name: hypervisorLifecycle
66
rules:
7-
- alert: HypervisorNotReady
7+
- alert: HypervisorOnboardingStuck
88
expr: |
9-
kube_customresource_hypervisor_condition{condition="Ready"} == 0
10-
for: 10m
9+
kube_customresource_hypervisor_condition{condition="Onboarding"} == 1
10+
for: 1h
1111
labels:
12-
severity: critical
12+
severity: warning
1313
type: hypervisor_operator
1414
annotations:
15-
summary: "Hypervisor {{`{{ $labels.name }}`}} is not ready"
16-
description: "The hypervisor {{`{{ $labels.name }}`}} in zone {{`{{ $labels.zone }}`}} has been in a non-ready state for more than 10 minutes."
15+
summary: "Hypervisor {{ $labels.name }} onboarding stuck for over 1 hour"
16+
description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has been onboarding for more than 1 hour. Check nova registration, test VM status, or trait/aggregate sync."
1717

18-
- alert: HypervisorConditionDegraded
18+
- alert: HypervisorEvictionStuck
1919
expr: |
20-
kube_customresource_hypervisor_condition{condition!="Ready"} == 0
21-
for: 10m
20+
kube_customresource_hypervisor_condition{condition="Evicting"} == 1
21+
for: 4h
2222
labels:
2323
severity: warning
2424
type: hypervisor_operator
2525
annotations:
26-
summary: "Hypervisor {{`{{ $labels.name }}`}} condition {{`{{ $labels.condition }}`}} is degraded"
27-
description: "The hypervisor {{`{{ $labels.name }}`}} in zone {{`{{ $labels.zone }}`}} has condition {{`{{ $labels.condition }}`}} in a False state for more than 10 minutes. Reason: {{`{{ $labels.reason }}`}}."
26+
summary: "Hypervisor {{ $labels.name }} eviction running for over 4 hours"
27+
description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has had an active eviction for more than 4 hours. Check for stuck live-migrations or failed VMs."
2828

29-
- alert: HypervisorEvicted
29+
- alert: HypervisorEvictedTooLong
3030
expr: |
3131
kube_customresource_hypervisor_evicted == 1
32-
for: 2d
32+
unless on (name)
33+
kube_customresource_hypervisor_condition{condition="Offboarded"} == 1
34+
for: 7d
35+
labels:
36+
severity: info
37+
type: hypervisor_operator
38+
annotations:
39+
summary: "Hypervisor {{ $labels.name }} has been evicted for over 7 days"
40+
description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has been evicted for more than 7 days without being offboarded. Consider re-enabling or decommissioning."
41+
42+
- name: hypervisorSync
43+
rules:
44+
- alert: HypervisorTraitSyncFailed
45+
expr: |
46+
kube_customresource_hypervisor_condition{condition="TraitsUpdated"} == 0
47+
and on (name)
48+
kube_customresource_hypervisor_condition{condition="Onboarding"} == 0
49+
for: 30m
3350
labels:
3451
severity: warning
3552
type: hypervisor_operator
3653
annotations:
37-
summary: "Hypervisor {{`{{ $labels.name }}`}} has been evicted"
38-
description: "The hypervisor {{`{{ $labels.name }}`}} in zone {{`{{ $labels.zone }}`}} has been in an evicted state for more than 2 days."
54+
summary: "Hypervisor {{ $labels.name }} trait sync has been failing"
55+
description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has had TraitsUpdated=False for more than 30 minutes outside of onboarding. Check OpenStack Placement API connectivity."
3956

57+
- alert: HypervisorAggregateSyncFailed
58+
expr: |
59+
kube_customresource_hypervisor_condition{condition="AggregatesUpdated"} == 0
60+
and on (name)
61+
kube_customresource_hypervisor_condition{condition="Onboarding"} == 0
62+
unless on (name)
63+
kube_customresource_hypervisor_condition{condition="Evicting"} == 1
64+
for: 30m
65+
labels:
66+
severity: warning
67+
type: hypervisor_operator
68+
annotations:
69+
summary: "Hypervisor {{ $labels.name }} aggregate sync has been failing"
70+
description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has had AggregatesUpdated=False for more than 30 minutes outside of onboarding and eviction. Check OpenStack Nova API connectivity."
71+
72+
- name: hypervisorOperatorHealth
73+
rules:
4074
- alert: HypervisorOperatorReconcileErrors
4175
expr: |
42-
rate(controller_runtime_reconcile_errors_total[5m]) > 0
76+
rate(controller_runtime_reconcile_errors_total[5m]) > 0.01
4377
for: 15m
4478
labels:
4579
severity: warning
4680
type: hypervisor_operator
4781
annotations:
48-
summary: "Hypervisor operator controller {{`{{ $labels.controller }}`}} has persistent reconcile errors"
49-
description: "The controller {{`{{ $labels.controller }}`}} has been producing reconciliation errors for more than 15 minutes."
82+
summary: "Hypervisor operator controller {{ $labels.controller }} has persistent reconcile errors"
83+
description: "The controller {{ $labels.controller }} has been producing sustained reconciliation errors for more than 15 minutes."
5084

5185
- alert: HypervisorOperatorDown
5286
expr: |

charts/openstack-hypervisor-operator/templates/alerts.yaml

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,22 +3,38 @@
33

44
{{- if .Values.prometheusRules.create -}}
55
{{- $root := . -}}
6-
{{- $additionalRuleLabels := include "openstack-hypervisor-operator.additionalRuleLabels" $root | trim }}
7-
{{- $labelsLineRegex := "(?m)^(\\s+labels:\\s*\\n)" }}
6+
{{- $disabled := $root.Values.prometheusRules.disabled | default dict -}}
7+
{{- $additionalRuleLabels := $root.Values.prometheusRules.additionalRuleLabels | default dict -}}
88
{{- $docStarted := false }}
99

1010
{{- range $alertPath, $alertRaw := $root.Files.Glob "alerts/*.yaml" }}
1111
{{- $alertName := base $alertPath }}
12-
{{- $alertContent := printf "%s" $alertRaw }}
13-
{{- if $additionalRuleLabels }}
14-
{{- $alertContent = regexReplaceAll $labelsLineRegex $alertContent (printf "$1%s\n" ($additionalRuleLabels | indent 6)) }}
12+
{{- $parsed := fromYaml (printf "%s" $alertRaw) }}
13+
14+
{{/* Filter disabled alerts and inject additionalRuleLabels */}}
15+
{{- $filteredGroups := list }}
16+
{{- range $group := $parsed.groups }}
17+
{{- $filteredRules := list }}
18+
{{- range $rule := $group.rules }}
19+
{{- if not (index $disabled $rule.alert) }}
20+
{{- if $additionalRuleLabels }}
21+
{{- $mergedLabels := merge (deepCopy $additionalRuleLabels) $rule.labels }}
22+
{{- $_ := set $rule "labels" $mergedLabels }}
23+
{{- end }}
24+
{{- $filteredRules = append $filteredRules $rule }}
25+
{{- end }}
26+
{{- end }}
27+
{{- if $filteredRules }}
28+
{{- $filteredGroup := dict "name" $group.name "rules" $filteredRules }}
29+
{{- $filteredGroups = append $filteredGroups $filteredGroup }}
30+
{{- end }}
1531
{{- end }}
1632

33+
{{- if $filteredGroups }}
1734
{{- if $docStarted }}
1835
---
1936
{{- end }}
2037
{{- $docStarted = true }}
21-
2238
apiVersion: monitoring.coreos.com/v1
2339
kind: PrometheusRule
2440
metadata:
@@ -34,6 +50,8 @@ metadata:
3450
{{ toYaml . | indent 4 }}
3551
{{- end }}
3652
spec:
37-
{{ tpl $alertContent $root | indent 2 }}
53+
groups:
54+
{{ toYaml $filteredGroups | indent 4 }}
55+
{{- end }}
3856
{{- end }}
3957
{{- end }}

charts/openstack-hypervisor-operator/templates/dashboards.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
apiVersion: v1
99
kind: ConfigMap
1010
metadata:
11-
name: {{ printf "%s-%s" $root.Release.Name ($path | replace ".json" "" | replace "/" "-" | trunc 63) }}
11+
name: {{ printf "%s-%s" $root.Release.Name ($path | replace ".json" "" | replace "/" "-") | trunc 63 | trimSuffix "-" }}
1212
labels:
1313
{{- include "openstack-hypervisor-operator.dashboardSelectorLabels" (list $path $root) | indent 4 }}
1414
{{ include "openstack-hypervisor-operator.monitoringLabels" (list $path $root) | indent 4 }}
@@ -25,7 +25,7 @@ data:
2525
apiVersion: v1
2626
kind: ConfigMap
2727
metadata:
28-
name: {{ printf "%s-%s-global" $root.Release.Name ($path | replace ".json" "" | replace "/" "-" | trunc 63) }}
28+
name: {{ printf "%s-%s-global" $root.Release.Name ($path | replace ".json" "" | replace "/" "-") | trunc 63 | trimSuffix "-" }}
2929
labels:
3030
{{- include "openstack-hypervisor-operator.globalDashboardSelectorLabels" (list $path $root) | indent 4 }}
3131
{{ include "openstack-hypervisor-operator.monitoringLabels" (list $path $root) | indent 4 }}

charts/openstack-hypervisor-operator/values.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,17 @@ prometheusRules:
6363
labels: {}
6464
annotations: {}
6565
additionalRuleLabels: {}
66+
# Disable individual alerts by name:
67+
# disabled:
68+
# HypervisorOnboardingStuck: true
69+
# HypervisorEvictedTooLong: true
70+
disabled: {}
6671

6772
dashboards:
6873
create: true
6974
global:
7075
create: false
76+
dashboardSelectors: []
7177
dashboardSelectors:
7278
- name: perses.dev/resource
7379
value: '"true"'

0 commit comments

Comments
 (0)