Skip to content

Commit 5e5e9a4

Browse files
added minimal monitoring setup for the plugin enabled basic alert/dashboards
1 parent 80f53bb commit 5e5e9a4

8 files changed

Lines changed: 950 additions & 0 deletions

File tree

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
# SPDX-FileCopyrightText: 2026 SAP SE or an SAP affiliate company and cobaltcore-dev contributors
# SPDX-License-Identifier: Apache-2.0

# Checks the alert rule files shipped in the operator chart with
# `promtool check rules`.
name: Validate Prometheus Alerts

on:
  # Allows the workflow to be started manually.
  workflow_dispatch:
  # Runs on pull requests that touch a rule file in the chart's alerts folder.
  pull_request:
    paths:
      - "charts/openstack-hypervisor-operator/alerts/*.yaml"
      - "charts/openstack-hypervisor-operator/alerts/*.yml"

permissions:
  contents: read

env:
  # Prometheus release whose tarball provides the promtool binary.
  PROMTOOL_VERSION: "3.8.0"

defaults:
  run:
    shell: bash

# One run per ref: a newer run cancels the one still in progress.
concurrency:
  group: validate-prometheus-alerts-${{ github.ref }}
  cancel-in-progress: true

jobs:
  validate-alerts:
    runs-on: ubuntu-latest
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@v6

      - name: Install promtool
        run: |
          set -euo pipefail
          tarball_url="https://github.com/prometheus/prometheus/releases/download/v${PROMTOOL_VERSION}/prometheus-${PROMTOOL_VERSION}.linux-amd64.tar.gz"
          promtool_member="prometheus-${PROMTOOL_VERSION}.linux-amd64/promtool"
          # Stream the release tarball and unpack only the promtool binary;
          # --strip-components=1 drops the release directory prefix so the
          # binary lands in the current directory.
          curl -sSfL "${tarball_url}" \
            | tar xz --strip-components=1 "${promtool_member}"
          sudo install -m 0755 promtool /usr/local/bin/promtool
          promtool --version

      - name: Validate Prometheus alert rules
        run: |
          set -euo pipefail
          # With nullglob, a pattern that matches nothing expands to nothing
          # instead of staying in the list as a literal string.
          shopt -s nullglob
          rule_files=(charts/openstack-hypervisor-operator/alerts/*.{yaml,yml})
          # Fail the job when there is nothing to validate.
          if (( ${#rule_files[@]} == 0 )); then
            echo "No Prometheus rule files found."
            exit 1
          fi
          promtool check rules "${rule_files[@]}"
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
# SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and cobaltcore-dev contributors
# SPDX-License-Identifier: Apache-2.0

# Alert rules for the hypervisor operator and the hypervisor resources it
# manages. The annotation templates are wrapped in backtick-quoted template
# actions, so the inner Prometheus template text is emitted literally when the
# file is rendered as a template.
groups:
  - name: hypervisorOperator
    rules:
      # A hypervisor whose Ready condition series has been 0 for 10 minutes.
      - alert: HypervisorNotReady
        expr: |
          kube_customresource_hypervisor_condition{condition="Ready"} == 0
        for: 10m
        labels:
          severity: critical
          type: hypervisor_operator
        annotations:
          summary: "Hypervisor {{`{{ $labels.name }}`}} is not ready"
          description: "The hypervisor {{`{{ $labels.name }}`}} in zone {{`{{ $labels.zone }}`}} has been in a non-ready state for more than 10 minutes."

      # Any condition other than Ready whose series has been 0 for 10 minutes.
      # NOTE(review): this treats 0 as abnormal for every non-Ready condition
      # type; confirm that holds for all condition types the operator reports.
      - alert: HypervisorConditionDegraded
        expr: |
          kube_customresource_hypervisor_condition{condition!="Ready"} == 0
        for: 10m
        labels:
          severity: warning
          type: hypervisor_operator
        annotations:
          summary: "Hypervisor {{`{{ $labels.name }}`}} condition {{`{{ $labels.condition }}`}} is degraded"
          description: "The hypervisor {{`{{ $labels.name }}`}} in zone {{`{{ $labels.zone }}`}} has condition {{`{{ $labels.condition }}`}} in a False state for more than 10 minutes. Reason: {{`{{ $labels.reason }}`}}."

      # A hypervisor whose evicted series has been 1 for two days.
      - alert: HypervisorEvicted
        expr: |
          kube_customresource_hypervisor_evicted == 1
        for: 2d
        labels:
          severity: warning
          type: hypervisor_operator
        annotations:
          summary: "Hypervisor {{`{{ $labels.name }}`}} has been evicted"
          description: "The hypervisor {{`{{ $labels.name }}`}} in zone {{`{{ $labels.zone }}`}} has been in an evicted state for more than 2 days."

      # A controller whose reconcile error rate has stayed above zero for
      # 15 minutes. The selector uses the same job matcher as
      # HypervisorOperatorDown: controller_runtime_reconcile_errors_total is
      # exported by every controller-runtime based operator, so an unscoped
      # selector would also fire for other operators scraped by the same
      # Prometheus.
      - alert: HypervisorOperatorReconcileErrors
        expr: |
          rate(controller_runtime_reconcile_errors_total{job=~".*hypervisor-operator.*"}[5m]) > 0
        for: 15m
        labels:
          severity: warning
          type: hypervisor_operator
        annotations:
          summary: "Hypervisor operator controller {{`{{ $labels.controller }}`}} has persistent reconcile errors"
          description: "The controller {{`{{ $labels.controller }}`}} has been producing reconciliation errors for more than 15 minutes."

      # The operator's scrape target has reported up == 0 for 5 minutes.
      # NOTE(review): this expression returns nothing when no matching `up`
      # series exists at all; consider whether an absent() check is wanted too.
      - alert: HypervisorOperatorDown
        expr: |
          up{job=~".*hypervisor-operator.*"} == 0
        for: 5m
        labels:
          severity: critical
          type: hypervisor_operator
        annotations:
          summary: "Hypervisor operator is down"
          description: "The hypervisor operator metrics endpoint has been unreachable for more than 5 minutes."

0 commit comments

Comments
 (0)