Skip to content

Commit dcbc86a

Browse files
committed
feat: add monitoring to cronjobs
Closes #1792
1 parent f7b8318 commit dcbc86a

7 files changed

Lines changed: 245 additions & 0 deletions

File tree

.env.example

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,18 @@ PROMETHEUS_METRICS_ENABLED=False
6565
# This maps external port 2001 to the internal Prometheus metrics port
6666
EXTERNAL_PROM_METRICS_PORT=2001
6767

68+
# -----------------------------------------------------------------------------
69+
# Cronjob healthcheck monitoring (optional)
70+
# -----------------------------------------------------------------------------
71+
# Healthchecks.io private tokens/UUIDs used by cronjob monitoring.
72+
# The public base URL is defined in code and not stored in env.
73+
HEALTHCHECK_ID_DELETE_UNUSED_HARDWARE_STATUS=
74+
HEALTHCHECK_ID_NOTIFICATIONS_HARDWARE_SUMMARY=
75+
HEALTHCHECK_ID_NOTIFICATIONS_METRICS_SUMMARY=
76+
HEALTHCHECK_ID_NOTIFICATIONS_NEW_ISSUES=
77+
HEALTHCHECK_ID_NOTIFICATIONS_SUMMARY_MICROSOFT=
78+
HEALTHCHECK_ID_NOTIFICATIONS_SUMMARY_MAESTRO=
79+
6880
# -----------------------------------------------------------------------------
6981
# Email / Notifications (optional)
7082
# -----------------------------------------------------------------------------

backend/kernelCI/settings.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,29 @@ def get_json_env_var(name, default):
124124
# To run cronjobs locally, execute
125125
# poetry run ./manage.py crontab arg
126126
# where "arg" is add, remove or show
127+
128+
HEALTHCHECK_MONITORING_PATH_MAP: dict[str, str] = {
129+
"delete_unused_hardware_status": os.environ.get(
130+
"HEALTHCHECK_ID_DELETE_UNUSED_HARDWARE_STATUS", ""
131+
),
132+
"notifications_hardware_summary": os.environ.get(
133+
"HEALTHCHECK_ID_NOTIFICATIONS_HARDWARE_SUMMARY", ""
134+
),
135+
"notifications_metrics_summary": os.environ.get(
136+
"HEALTHCHECK_ID_NOTIFICATIONS_METRICS_SUMMARY", ""
137+
),
138+
"notifications_new_issues": os.environ.get(
139+
"HEALTHCHECK_ID_NOTIFICATIONS_NEW_ISSUES", ""
140+
),
141+
"notifications_summary_microsoft": os.environ.get(
142+
"HEALTHCHECK_ID_NOTIFICATIONS_SUMMARY_MICROSOFT", ""
143+
),
144+
"notifications_summary_maestro": os.environ.get(
145+
"HEALTHCHECK_ID_NOTIFICATIONS_SUMMARY_MAESTRO", ""
146+
),
147+
}
148+
"""Maps monitoring_id to the relative_path that will be appended to the base healthcheck URL."""
149+
127150
SKIP_CRONJOBS = is_boolean_or_string_true(os.environ.get("SKIP_CRONJOBS", False))
128151
if SKIP_CRONJOBS:
129152
CRONJOBS = []
@@ -133,12 +156,15 @@ def get_json_env_var(name, default):
133156
"CRONTAB_COMMAND_SUFFIX", ">> /proc/1/fd/1 2>&1"
134157
)
135158
CRONJOBS = [
159+
# not using a monitoring_id in the first task since it should
160+
# be removed once the denormalization is set in stone
136161
("0 * * * *", "kernelCI_app.tasks.update_checkout_cache"),
137162
(
138163
"59 * * * *",
139164
"django.core.management.call_command",
140165
[
141166
"notifications",
167+
"--monitoring-id=notifications_new_issues",
142168
"--action=new_issues",
143169
"--to=kernelci-results@groups.io",
144170
"--cc=gus@collabora.com",
@@ -151,6 +177,7 @@ def get_json_env_var(name, default):
151177
"django.core.management.call_command",
152178
[
153179
"notifications",
180+
"--monitoring-id=notifications_summary_microsoft",
154181
"--action=summary",
155182
"--to=kernelcialerts@microsoft.com",
156183
"--cc=kernelci-results@groups.io",
@@ -165,6 +192,7 @@ def get_json_env_var(name, default):
165192
"django.core.management.call_command",
166193
[
167194
"notifications",
195+
"--monitoring-id=notifications_summary_maestro",
168196
"--action=summary",
169197
"--add-mailing-lists",
170198
"--send",
@@ -177,6 +205,7 @@ def get_json_env_var(name, default):
177205
"django.core.management.call_command",
178206
[
179207
"notifications",
208+
"--monitoring-id=notifications_hardware_summary",
180209
"--action=hardware_summary",
181210
"--cc=kernelci-results@groups.io",
182211
"--send",
@@ -188,13 +217,15 @@ def get_json_env_var(name, default):
188217
"django.core.management.call_command",
189218
[
190219
"delete_unused_hardware_status",
220+
"--monitoring-id=delete_unused_hardware_status",
191221
],
192222
),
193223
(
194224
"0 0 * * 6",
195225
"django.core.management.call_command",
196226
[
197227
"notifications",
228+
"--monitoring-id=notifications_metrics_summary",
198229
"--action=metrics_summary",
199230
"--to=kernelci@lists.linux.dev",
200231
"--cc=kernelci-results@groups.io",

backend/kernelCI_app/management/commands/delete_unused_hardware_status.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@
77
import logging
88
from django.core.management.base import BaseCommand
99
from django.db import transaction
10+
from kernelCI_app.management.commands.helpers.healthcheck import (
11+
MONITORING_ID_PARAM_HELP_TEXT,
12+
run_with_healthcheck_monitoring,
13+
)
1014
from kernelCI_app.models import HardwareStatus, LatestCheckout, ProcessedListingItems
1115

1216
logger = logging.getLogger(__name__)
@@ -30,8 +34,21 @@ def add_arguments(self, parser):
3034
default=10000,
3135
help="Number of records to delete per batch (default: 10000)",
3236
)
37+
parser.add_argument(
38+
"--monitoring-id",
39+
type=str,
40+
default=None,
41+
help=MONITORING_ID_PARAM_HELP_TEXT,
42+
)
3343

3444
def handle(self, *args, **options):
    """Run the cleanup, wrapped in healthcheck pings when a monitoring id is given."""

    def _execute():
        # Deferred so the monitoring wrapper controls when the work starts.
        return self._run_action(options)

    return run_with_healthcheck_monitoring(
        monitoring_id=options.get("monitoring_id"),
        action=_execute,
    )
50+
51+
def _run_action(self, options):
3552
dry_run = options["dry_run"]
3653
batch_size = options["batch_size"]
3754

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
from collections.abc import Callable
2+
from typing import Any
3+
4+
from django.conf import settings
5+
6+
import requests
7+
8+
from kernelCI_app.helpers.logger import out
9+
10+
MONITORING_ID_PARAM_HELP_TEXT = (
11+
"Monitoring ID configured in settings for healthcheck.io pings "
12+
"(optional, used only for monitoring the command execution over time)"
13+
)
14+
HEALTHCHECK_BASE_URL = "https://hc-ping.com"
15+
16+
17+
def _resolve_monitoring_url(monitoring_id: str) -> str | None:
    """Build the healthcheck ping URL for ``monitoring_id``.

    The id is looked up in ``settings.HEALTHCHECK_MONITORING_PATH_MAP`` and the
    configured path is joined onto ``HEALTHCHECK_BASE_URL``.

    Args:
        monitoring_id: Key identifying the monitored job in the settings map.

    Returns:
        The full ping URL, or ``None`` when the setting is absent, the id is
        unknown, or the configured path is empty or blank.
    """
    # Treat a missing setting like an empty map: monitoring must never be the
    # reason a cronjob fails to run.
    monitoring_path_map: dict[str, str] = getattr(
        settings, "HEALTHCHECK_MONITORING_PATH_MAP", {}
    )

    # Strip stray whitespace and leading slashes *before* the emptiness check
    # so a blank or "/"-only value does not produce a malformed URL.
    raw_path = monitoring_path_map.get(monitoring_id) or ""
    monitoring_path = raw_path.strip().lstrip("/")

    if not monitoring_path:
        return None

    return f"{HEALTHCHECK_BASE_URL.rstrip('/')}/{monitoring_path}"
25+
26+
27+
def _ping_healthcheck(monitoring_id: str, status: str) -> None:
    """Send one best-effort ping for ``monitoring_id`` with the given ``status``.

    ``status`` is appended to the check URL as a subpath (for example
    ``start`` or ``fail``); ``"success"`` pings the check URL itself with no
    subpath. Request failures are logged and swallowed so that monitoring
    never interrupts the command being monitored.

    Args:
        monitoring_id: Key used to resolve the check URL.
        status: Lifecycle status to report.
    """
    monitoring_url = _resolve_monitoring_url(monitoring_id)
    if not monitoring_url:
        out(
            "No healthcheck URL configured for monitoring_id='%s', skipping %s ping."
            % (monitoring_id, status)
        )
        return

    # Success just needs to ping base healthcheck url + uuid, no subpath
    if status == "success":
        monitoring_status_url = monitoring_url
    else:
        monitoring_status_url = f"{monitoring_url.rstrip('/')}/{status.lstrip('/')}"

    # The path portion of the URL is the private token; keep it out of logs.
    secret_path = monitoring_url.removeprefix(HEALTHCHECK_BASE_URL.rstrip("/")).strip(
        "/"
    )

    try:
        response = requests.get(monitoring_status_url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        error_text = str(e)
        # The exception text may embed the request URL or path; mask the token.
        # Guard against an empty needle, which would corrupt the message.
        if secret_path:
            error_text = error_text.replace(secret_path, "<redacted>")
        out(
            "ERROR: failed to ping healthcheck for monitoring_id='%s' and status='%s': %s"
            % (monitoring_id, status, error_text)
        )
    else:
        # Log the status and id only, never the token-bearing URL.
        out(
            "Success at pinging healthcheck status '%s' with monitoring_id '%s'"
            % (status, monitoring_id)
        )
54+
55+
56+
def run_with_healthcheck_monitoring(
57+
*, monitoring_id: str | None, action: Callable[[], Any]
58+
) -> Any:
59+
if not monitoring_id:
60+
return action()
61+
62+
_ping_healthcheck(monitoring_id, "start")
63+
64+
try:
65+
result = action()
66+
except Exception:
67+
_ping_healthcheck(monitoring_id, "fail")
68+
raise
69+
70+
_ping_healthcheck(monitoring_id, "success")
71+
return result

backend/kernelCI_app/management/commands/notifications.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@
1818
setup_jinja_template,
1919
send_email_report,
2020
)
21+
from kernelCI_app.management.commands.helpers.healthcheck import (
22+
MONITORING_ID_PARAM_HELP_TEXT,
23+
run_with_healthcheck_monitoring,
24+
)
2125

2226
from kernelCI_app.management.commands.helpers.summary import (
2327
SIGNUP_FOLDER,
@@ -892,6 +896,12 @@ def add_arguments(self, parser):
892896
action="store_true",
893897
help="Ignore recipients.yaml file (optional for all actions)",
894898
)
899+
parser.add_argument(
900+
"--monitoring-id",
901+
type=str,
902+
default=None,
903+
help=MONITORING_ID_PARAM_HELP_TEXT,
904+
)
895905

896906
# Action argument (replaces subparsers)
897907
actions = [
@@ -972,6 +982,13 @@ def add_arguments(self, parser):
972982
)
973983

974984
def handle(self, *args, **options):
    """Dispatch the notification action, reporting to the healthcheck monitor when a monitoring id is given."""

    def _execute():
        # Deferred so the monitoring wrapper controls when the work starts.
        return self._run_action(options)

    return run_with_healthcheck_monitoring(
        monitoring_id=options.get("monitoring_id"),
        action=_execute,
    )
990+
991+
def _run_action(self, options):
975992
# Setup connections
976993
service = smtp_setup_connection()
977994

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
from django.test import SimpleTestCase, override_settings
2+
from unittest.mock import Mock, patch
3+
4+
from kernelCI_app.management.commands.helpers.healthcheck import (
5+
_resolve_monitoring_url,
6+
run_with_healthcheck_monitoring,
7+
)
8+
9+
TEST_BASE_URL = "https://example.com"
10+
11+
12+
@override_settings(
    HEALTHCHECK_MONITORING_PATH_MAP={
        "job-1": "private-token",
        "job-2": "something/with/slashes",
    }
)
class TestRunWithHealthcheckMonitoring(SimpleTestCase):
    """Tests for ping URL resolution and the start/success/fail ping lifecycle."""

    @patch(
        "kernelCI_app.management.commands.helpers.healthcheck.HEALTHCHECK_BASE_URL",
        TEST_BASE_URL,
    )
    def test_resolve_monitoring_url_success(self):
        result = _resolve_monitoring_url("job-1")
        self.assertEqual(result, f"{TEST_BASE_URL}/private-token")

    @patch(
        "kernelCI_app.management.commands.helpers.healthcheck.HEALTHCHECK_BASE_URL",
        TEST_BASE_URL,
    )
    def test_resolve_monitoring_url_keeps_inner_slashes(self):
        # Slashes inside the configured path must survive the join untouched.
        result = _resolve_monitoring_url("job-2")
        self.assertEqual(result, f"{TEST_BASE_URL}/something/with/slashes")

    def test_resolve_monitoring_url_unknown_id_returns_none(self):
        self.assertIsNone(_resolve_monitoring_url("missing-id"))

    @patch("kernelCI_app.management.commands.helpers.healthcheck.requests.get")
    @patch(
        "kernelCI_app.management.commands.helpers.healthcheck.HEALTHCHECK_BASE_URL",
        TEST_BASE_URL,
    )
    def test_success_path_pings_start_and_success(self, mock_get):
        response = Mock()
        response.raise_for_status.return_value = None
        mock_get.return_value = response

        result = run_with_healthcheck_monitoring(
            monitoring_id="job-1", action=lambda: "ok"
        )

        self.assertEqual(result, "ok")
        self.assertEqual(mock_get.call_count, 2)
        mock_get.assert_any_call(f"{TEST_BASE_URL}/private-token/start", timeout=10)
        # Success pings the bare check URL, with no status subpath.
        mock_get.assert_any_call(f"{TEST_BASE_URL}/private-token", timeout=10)

    @patch("kernelCI_app.management.commands.helpers.healthcheck.requests.get")
    @patch(
        "kernelCI_app.management.commands.helpers.healthcheck.HEALTHCHECK_BASE_URL",
        TEST_BASE_URL,
    )
    def test_failure_path_pings_start_and_fail(self, mock_get):
        response = Mock()
        response.raise_for_status.return_value = None
        mock_get.return_value = response

        def failing_action():
            raise RuntimeError("boom")

        with self.assertRaisesRegex(RuntimeError, "boom"):
            run_with_healthcheck_monitoring(
                monitoring_id="job-1",
                action=failing_action,
            )

        self.assertEqual(mock_get.call_count, 2)
        mock_get.assert_any_call(f"{TEST_BASE_URL}/private-token/start", timeout=10)
        mock_get.assert_any_call(f"{TEST_BASE_URL}/private-token/fail", timeout=10)

    @patch("kernelCI_app.management.commands.helpers.healthcheck.requests.get")
    def test_no_monitoring_id_skips_pings(self, mock_get):
        result = run_with_healthcheck_monitoring(monitoring_id=None, action=lambda: 42)

        self.assertEqual(result, 42)
        mock_get.assert_not_called()

    @patch("kernelCI_app.management.commands.helpers.healthcheck.requests.get")
    def test_unknown_monitoring_id_skips_network_and_runs_action(self, mock_get):
        result = run_with_healthcheck_monitoring(
            monitoring_id="missing-id", action=lambda: "ran"
        )

        self.assertEqual(result, "ran")
        mock_get.assert_not_called()

docs/monitoring.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,23 @@ The monitoring system supports multi-worker Gunicorn deployments using Prometheu
9797
- `PROMETHEUS_METRICS_PORT`: Port for the metrics aggregator (default: `8001`)
9898
- `PROMETHEUS_MULTIPROC_DIR`: Directory for multiprocess metric files (default: `/tmp/prometheus_multiproc_dir`)
9999

100+
### Cronjob Healthchecks
101+
102+
The backend can ping Healthchecks.io (`hc-ping.com`) for cronjobs that run Django management commands.
103+
104+
- The public base URL is defined in code as `HEALTHCHECK_BASE_URL`.
105+
- Private monitor tokens stay in environment variables and are mapped in Django settings.
106+
- Each monitored cron run pings `/start` first, then either the bare check URL (no subpath) on success or `/fail` on error.
107+
108+
Configure these variables in `.env.backend`:
109+
110+
- `HEALTHCHECK_ID_NOTIFICATIONS_NEW_ISSUES`
111+
- `HEALTHCHECK_ID_NOTIFICATIONS_SUMMARY_MICROSOFT`
112+
- `HEALTHCHECK_ID_NOTIFICATIONS_SUMMARY_MAESTRO`
113+
- `HEALTHCHECK_ID_NOTIFICATIONS_HARDWARE_SUMMARY`
114+
- `HEALTHCHECK_ID_DELETE_UNUSED_HARDWARE_STATUS`
115+
- `HEALTHCHECK_ID_NOTIFICATIONS_METRICS_SUMMARY`
116+
100117
## `prometheus.yml`
101118
- **Target**: `host.docker.internal:8001` (backend running locally)
102119
- **Metrics Path**: `/metrics/`

0 commit comments

Comments
 (0)