From ca3272335a7359b01c4616fe21082c333a1fdd50 Mon Sep 17 00:00:00 2001 From: Giacomo Sanchietti Date: Wed, 22 Apr 2026 15:35:26 +0200 Subject: [PATCH 01/11] feat(victoria): add vmalert alerting Replace Netdata alerting with vmalert: - add vmalert init script (vmalert.initd) to start/stop vmalert service - add vmalert UCI configuration file (vmalert.conf) with datasource settings - add comprehensive alert rules - update Makefile to install vmalert configuration and rules - add detailed documentation of vmalert setup and metrics mapping - support for Mimir integration when configured via ns-plug - add ns-plug-alert-proxy that listens on 127.0.0.1:9095 and receives notifications from vmalert: the proxy verify if an alert is firing or resolved Then it translates selected alerts to the legacy portal format and forwards them to my.nethesis.it or my.nethserver.com - if Mimir credentials are present in ns-plug UCI config, the Mimir alertmanager endpoint is added as a second notifier alongside the proxy - port to Victoria Metrics also alert about non-encrypted backup - add telegraf-mwan Python script that reads /var/run/mwan3/iface_state/ to collect WAN interface connectivity state. - add telegraf-services Python script that queries ubus to collect the running state of all procd-managed services. 
Outputs JSON for Assisted-by: Copilot:Sonnet4.6 --- packages/ns-plug/Makefile | 13 +- packages/ns-plug/README.md | 37 ++-- packages/ns-plug/files/20_ns-plug | 1 - .../ns-plug/files/backup-encryption-alert | 36 ---- packages/ns-plug/files/ns-plug-alert-proxy | 176 ++++++++++++++++++ .../ns-plug/files/ns-plug-alert-proxy.init | 19 ++ packages/ns-storage/Makefile | 2 - packages/ns-storage/README.md | 4 + .../ns-storage/files/ns-storage-alert.init | 29 --- packages/ns-storage/files/storage-alarm | 27 --- packages/telegraf/Makefile | 11 ++ packages/telegraf/README.md | 145 +++++++++++++++ .../telegraf/files/telegraf-backup-encryption | 17 ++ packages/telegraf/files/telegraf-mwan | 54 ++++++ packages/telegraf/files/telegraf-services | 110 +++++++++++ .../telegraf/files/telegraf-storage-status | 21 +++ .../files/telegraf.conf.d/backup.conf | 16 ++ .../telegraf/files/telegraf.conf.d/mwan.conf | 20 ++ .../files/telegraf.conf.d/services.conf | 20 ++ .../files/telegraf.conf.d/storage.conf | 16 ++ packages/victoria-metrics/Makefile | 6 + packages/victoria-metrics/README.md | 138 ++++++++++++++ .../files/vmalert-rules/backup.yaml | 20 ++ .../files/vmalert-rules/host.yaml | 96 ++++++++++ .../files/vmalert-rules/mwan.yaml | 25 +++ .../files/vmalert-rules/services.yaml | 22 +++ .../files/vmalert-rules/storage.yaml | 18 ++ packages/victoria-metrics/files/vmalert.conf | 3 + packages/victoria-metrics/files/vmalert.initd | 69 +++++++ 29 files changed, 1047 insertions(+), 124 deletions(-) delete mode 100644 packages/ns-plug/files/backup-encryption-alert create mode 100644 packages/ns-plug/files/ns-plug-alert-proxy create mode 100644 packages/ns-plug/files/ns-plug-alert-proxy.init delete mode 100644 packages/ns-storage/files/ns-storage-alert.init delete mode 100644 packages/ns-storage/files/storage-alarm create mode 100644 packages/telegraf/README.md create mode 100644 packages/telegraf/files/telegraf-backup-encryption create mode 100644 packages/telegraf/files/telegraf-mwan create mode 
100644 packages/telegraf/files/telegraf-services create mode 100644 packages/telegraf/files/telegraf-storage-status create mode 100644 packages/telegraf/files/telegraf.conf.d/backup.conf create mode 100644 packages/telegraf/files/telegraf.conf.d/mwan.conf create mode 100644 packages/telegraf/files/telegraf.conf.d/services.conf create mode 100644 packages/telegraf/files/telegraf.conf.d/storage.conf create mode 100644 packages/victoria-metrics/README.md create mode 100644 packages/victoria-metrics/files/vmalert-rules/backup.yaml create mode 100644 packages/victoria-metrics/files/vmalert-rules/host.yaml create mode 100644 packages/victoria-metrics/files/vmalert-rules/mwan.yaml create mode 100644 packages/victoria-metrics/files/vmalert-rules/services.yaml create mode 100644 packages/victoria-metrics/files/vmalert-rules/storage.yaml create mode 100644 packages/victoria-metrics/files/vmalert.conf create mode 100644 packages/victoria-metrics/files/vmalert.initd diff --git a/packages/ns-plug/Makefile b/packages/ns-plug/Makefile index b1715109b..5b602cb44 100644 --- a/packages/ns-plug/Makefile +++ b/packages/ns-plug/Makefile @@ -44,6 +44,8 @@ if [ -z "$${IPKG_INSTROOT}" ]; then /etc/init.d/cron restart /usr/libexec/ns-plug/40_ns-plug_mwan_hooks /etc/init.d/ns-plug restart + /etc/init.d/ns-plug-alert-proxy enable + /etc/init.d/ns-plug-alert-proxy restart fi exit 0 endef @@ -55,6 +57,8 @@ if [ -z "$${IPKG_INSTROOT}" ]; then crontab -l | grep -v "/usr/sbin/send-inventory" | sort | uniq | crontab - crontab -l | grep -v "/usr/sbin/send-heartbeat" | sort | uniq | crontab - sed -i '/\/usr\/libexec\/ns-plug\/mwan-hooks/d' /etc/mwan3.user + /etc/init.d/ns-plug-alert-proxy stop + /etc/init.d/ns-plug-alert-proxy disable fi exit 0 endef @@ -68,12 +72,13 @@ define Package/ns-plug/install $(INSTALL_DIR) $(1)/etc/init.d $(INSTALL_DIR) $(1)/etc/config $(INSTALL_DIR) $(1)/etc/uci-defaults - $(INSTALL_DIR) $(1)/etc/netdata $(INSTALL_DIR) $(1)/lib/upgrade/keep.d $(INSTALL_DIR) 
$(1)/usr/libexec/ns-plug $(INSTALL_DIR) $(1)/usr/libexec/mwan-hooks $(INSTALL_BIN) ./files/ns-plug.init $(1)/etc/init.d/ns-plug + $(INSTALL_BIN) ./files/ns-plug-alert-proxy.init $(1)/etc/init.d/ns-plug-alert-proxy $(INSTALL_BIN) ./files/ns-plug $(1)/usr/sbin/ns-plug + $(INSTALL_BIN) ./files/ns-plug-alert-proxy $(1)/usr/sbin/ns-plug-alert-proxy $(INSTALL_BIN) ./files/distfeed-setup $(1)/usr/sbin/distfeed-setup $(INSTALL_BIN) ./files/remote-backup $(1)/usr/sbin $(INSTALL_BIN) ./files/send-backup $(1)/usr/sbin @@ -89,20 +94,14 @@ define Package/ns-plug/install $(INSTALL_BIN) ./files/ns-push-reports $(1)/usr/bin $(INSTALL_BIN) ./files/ns-controller-push-info $(1)/usr/sbin $(INSTALL_BIN) ./files/20_ns-plug $(1)/etc/uci-defaults - $(INSTALL_BIN) ./files/30_ns-plug_alerts $(1)/etc/uci-defaults $(INSTALL_BIN) ./files/40_ns-plug_automatic_updates $(1)/etc/uci-defaults $(INSTALL_BIN) ./files/40_ns-plug_automatic_updates $(1)/usr/libexec/ns-plug $(INSTALL_BIN) ./files/40_ns-plug_mwan_hooks $(1)/etc/uci-defaults $(INSTALL_BIN) ./files/40_ns-plug_mwan_hooks $(1)/usr/libexec/ns-plug - $(INSTALL_BIN) ./files/netadata_enable_alerts $(1)/usr/share/ns-plug/hooks/register/70netadata_enable_alerts - $(INSTALL_BIN) ./files/netadata_disable_alerts $(1)/usr/share/ns-plug/hooks/unregister/70netadata_disable_alerts $(INSTALL_BIN) ./files/enable_automatic_updates $(1)/usr/share/ns-plug/hooks/register/60enable_automatic_updates $(INSTALL_BIN) ./files/disable_automatic_updates $(1)/usr/share/ns-plug/hooks/unregister/60disable_automatic_updates $(INSTALL_CONF) ./files/config $(1)/etc/config/ns-plug $(INSTALL_CONF) files/ns-plug.keep $(1)/lib/upgrade/keep.d/ns-plug - $(INSTALL_CONF) files/health_alarm_notify.conf $(1)/etc/netdata - $(INSTALL_BIN) ./files/send-mwan-alert $(1)/usr/libexec/mwan-hooks - $(INSTALL_BIN) ./files/backup-encryption-alert $(1)/usr/libexec $(INSTALL_BIN) ./files/mwan-hooks $(1)/usr/libexec/ns-plug $(INSTALL_BIN) ./files/ns-plug-rsyslog-fixup.uci-default 
$(1)/etc/uci-defaults/rsyslog-fixup endef diff --git a/packages/ns-plug/README.md b/packages/ns-plug/README.md index 263ea7a7b..670169c90 100644 --- a/packages/ns-plug/README.md +++ b/packages/ns-plug/README.md @@ -110,13 +110,9 @@ the given passphrase: only the encrypted backup will be sent to the remote serve To disable the encryption, just delete the file `/etc/backup.pass`. -If the backup is not encrypted, an alert will be sent to the remote portal (my.nethesis.it or my.nethserver.com). -Unencrypted backups are deprecated and will be removed in the future. -The alert can be disabled using this command: -``` -uci set ns-plug.config.backup_alert_disabled=1 -uci commit ns-plug -``` +Non-encrypted backups are not sent to the remote server for security reasons. +If the backup is not encrypted, an alert will be sent to the remote portal (my.nethesis.it or my.nethserver.com) +so the user can be aware of the risk and take action to secure the backup. ### Restore @@ -141,20 +137,17 @@ Alerts are also logged to `/var/log/messages` and are visible within the netdata Only the following alerts are sent to the remote system: -- disk space occupation -- WAN down events - -When an alert is resolved, netdata will also send a clear command to remote server. +| Alert | Condition | Legacy alert_id | +|---|---|---| +| `WanDown` | WAN interface offline for 2m | `wan::down` | +| `DiskSpaceCritical` | Disk usage > 90% for 2m | `df:root:percent_bytes:free` or `df:boot:percent_bytes:free` | +| `BackupEncryptionDisabled` | Backup passphrase missing | `backup:config:notencrypted` | +| `StorageStatus` | Storage status is error | `storage:status` | -### MultiWAN alerts +All other alert are silently dropped by the proxy. +If the machine is not registered, all alerts are silently dropped. -MultiWAN alerts are managed using `/etc/mwan3.user` script. - -When a WAN changes its status, all executable scripts inside the `/usr/libexec/mwan-hooks/` directory will be executed. 
-If the machine has a valid subscription, the `send-mwan-alert` script will send an alert to my.nethesis.it and my.nethserver.com monitoring portals. -Sent alerts are logged to `/var/log/messages`, example: -``` -Jul 31 12:40:42 NethSec mwan3-alert: Sending alert wan:wanb:down with status FAILURE -... -Jul 31 12:41:04 NethSec mwan3-alert: Sending alert wan:wanb:down with status OK -``` +The proxy starts automatically at boot regardless of registration state. +Firing/resolved state is determined from the Alertmanager-standard `endsAt` field: +if `endsAt` is in the future (or zero/missing) a **FAILURE** is sent; if `endsAt` is in +the past an **OK** is sent. diff --git a/packages/ns-plug/files/20_ns-plug b/packages/ns-plug/files/20_ns-plug index b87f6a92f..4026f4031 100644 --- a/packages/ns-plug/files/20_ns-plug +++ b/packages/ns-plug/files/20_ns-plug @@ -2,7 +2,6 @@ # setup cron jobs for remote servers crontab -l | grep -q '/usr/sbin/send-backup' || echo '02 2 * * * sleep $(( RANDOM % 1800 )); /usr/sbin/send-backup' >> /etc/crontabs/root -crontab -l | grep -q '/usr/libexec/backup-encryption-alert' || echo '02 3 * * * sleep $(( RANDOM % 1800 )); /usr/libexec/backup-encryption-alert' >> /etc/crontabs/root crontab -l | grep -q '/usr/sbin/send-heartbeat' || echo '*/10 * * * * sleep $(( RANDOM % 60 )); /usr/sbin/send-heartbeat' >> /etc/crontabs/root crontab -l | grep -q '/usr/sbin/send-inventory' || echo '05 3 * * * sleep $(( RANDOM % 1800 )); /usr/sbin/send-inventory' >> /etc/crontabs/root diff --git a/packages/ns-plug/files/backup-encryption-alert b/packages/ns-plug/files/backup-encryption-alert deleted file mode 100644 index 1249f5479..000000000 --- a/packages/ns-plug/files/backup-encryption-alert +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash -# -# Copyright (C) 2025 Nethesis S.r.l. 
-# SPDX-License-Identifier: GPL-2.0-only -# - -# Send a backup alert if backup is not encrypted - -lk=$(uci -q get ns-plug.config.system_id) -secret=$(uci -q get ns-plug.config.secret) -url=$(uci -q get ns-plug.config.alerts_url)"alerts/store" - -# Do not send alert if system_id or secret is not set -if [ -z "$lk" ] || [ -z "$secret" ]; then - exit 0 -fi - -# Check if alert is enabled -if [ "$(uci -q get ns-plug.config.backup_alert_disabled)" = "1" ]; then - exit 0 -fi - -# Send the alert -if [ ! -f "/etc/backup.pass" ]; then - status="FAILURE" -else - status="OK" -fi - -alert_id="backup:config:notencrypted" -logger -t backup-alert "Sending alert ${alert_id} with status ${status}" -payload='{"lk": "'$lk'", "alert_id": "'$alert_id'", "status": "'$status'"}' - -/usr/bin/curl -m 30 --retry 3 -L -s \ - --header "Authorization: token ${secret}" --header "Content-Type: application/json" --header "Accept: application/json" \ - --data-raw "${payload}" ${url} > /dev/null diff --git a/packages/ns-plug/files/ns-plug-alert-proxy b/packages/ns-plug/files/ns-plug-alert-proxy new file mode 100644 index 000000000..f50f70d4b --- /dev/null +++ b/packages/ns-plug/files/ns-plug-alert-proxy @@ -0,0 +1,176 @@ +#!/usr/bin/python3 + +# +# Copyright (C) 2026 Nethesis S.r.l. +# SPDX-License-Identifier: GPL-2.0-only +# + +""" +Alert proxy: receives Alertmanager-like notifications from vmalert and +forwards selected alerts to the legacy my.nethesis.it / my.nethserver.com +monitoring portals. + +Only the following alerts are forwarded: + - WanDown → wan::down + - DiskSpaceCritical → df:root:percent_bytes:free (path=/) + df:boot:percent_bytes:free (path=/boot) + - BackupEncryptionDisabled → backup:config:notencrypted + - StorageStatus → storage:status + +All other alerts are silently dropped. +If the machine is not registered (no system_id/secret in UCI), all alerts +are silently dropped. 
+ +Firing/resolved state is determined from the Alertmanager-standard endsAt +field: if endsAt is in the future (or zero/missing) the alert is FAILURE; +if endsAt is in the past the alert is OK. +""" + +import json +import re +import sys +import time +import urllib.request +from datetime import datetime, timezone +from http.server import BaseHTTPRequestHandler, HTTPServer +from socketserver import ThreadingMixIn +from euci import EUci + +LISTEN_ADDR = "127.0.0.1" +LISTEN_PORT = 9095 + +_DISK_PATH_MAP = { + "/": "df:root:percent_bytes:free", + "/boot": "df:boot:percent_bytes:free", +} + +_ZERO_TIME = "0001-01-01T00:00:00Z" +# vmalert uses nanosecond precision; strip to microseconds for Python parsing +_NANO_RE = re.compile(r"(\.\d{6})\d+(Z|[+-]\d{2}:\d{2})$") + + +def _is_firing(alert): + """Return True if the alert is currently firing based on endsAt.""" + ends_at_str = alert.get("endsAt", "") + if not ends_at_str or ends_at_str == _ZERO_TIME: + return True + ends_at_str = _NANO_RE.sub(r"\1\2", ends_at_str) + ends_at_str = ends_at_str.replace("Z", "+00:00") + try: + ends_at = datetime.fromisoformat(ends_at_str) + return ends_at > datetime.now(timezone.utc) + except Exception: + return True + + +def _map_alert_id(alert_name, labels): + """Return the legacy alert_id string, or None if the alert is not mapped.""" + if alert_name == "WanDown": + iface = labels.get("interface", "unknown") + return f"wan:{iface}:down" + if alert_name == "DiskSpaceCritical": + path = labels.get("path", "") + return _DISK_PATH_MAP.get(path) + if alert_name == "BackupEncryptionDisabled": + return "backup:config:notencrypted" + if alert_name == "StorageStatus": + return "storage:status" + return None + + +def _send_alert(system_id, secret, alerts_url, alert_id, status, retry=3): + url = alerts_url.rstrip("/") + "/alerts/store" + payload = json.dumps( + {"lk": system_id, "alert_id": alert_id, "status": status} + ).encode() + req = urllib.request.Request( + url, + data=payload, + 
method="POST", + headers={ + "Authorization": f"token {secret}", + "Content-Type": "application/json", + "Accept": "application/json", + }, + ) + try: + with urllib.request.urlopen(req, timeout=60) as resp: + print(f"Alert sent: {alert_id} {status} → {resp.status}", file=sys.stderr) + except Exception as ex: + if retry > 0: + print( + f"Alert send failed: {alert_id} {ex} — retrying in 20s", file=sys.stderr + ) + time.sleep(20) + _send_alert(system_id, secret, alerts_url, alert_id, status, retry - 1) + else: + print(f"Alert send aborted: {alert_id} {ex}", file=sys.stderr) + + +class _AlertHandler(BaseHTTPRequestHandler): + def log_message(self, format, *args): + # Suppress access log + pass + + def do_GET(self): + self.send_response(200) + self.end_headers() + + def do_POST(self): + if self.system_id is None or self.secret is None or self.alerts_url is None: + # Just drop the alert if not configured + self.send_response(200) + self.end_headers() + return + try: + length = int(self.headers.get("Content-Length", 0)) + body = self.rfile.read(length) + data = json.loads(body) + except Exception as ex: + self.send_response(400) + self.end_headers() + self.wfile.write(str(ex).encode()) + return + + if type(data) is list: + alerts = data + else: + alerts = data.get("alerts", []) + for alert in alerts: + labels = alert.get("labels", {}) + alert_name = labels.get("alertname", "") + legacy_status = "FAILURE" if _is_firing(alert) else "OK" + + alert_id = _map_alert_id(alert_name, labels) + if not alert_id: + print( + f"Alert dropped (no mapping): {alert_name} {labels}", + file=sys.stderr, + ) + continue + + _send_alert(self.system_id, self.secret, self.alerts_url, alert_id, legacy_status) + + self.send_response(200) + self.end_headers() + + def __init__(self, *args, **kwargs): + uci = EUci() + self.system_id = uci.get("ns-plug", "config", "system_id", default=None) + self.secret = uci.get("ns-plug", "config", "secret", default=None) + self.alerts_url = uci.get("ns-plug", 
"config", "alerts_url", default=None) + super().__init__(*args, **kwargs) + + +class _ThreadingHTTPServer(ThreadingMixIn, HTTPServer): + daemon_threads = True + + +def main(): + server = _ThreadingHTTPServer((LISTEN_ADDR, LISTEN_PORT), _AlertHandler) + print(f"alert-proxy listening on {LISTEN_ADDR}:{LISTEN_PORT}", file=sys.stderr) + server.serve_forever() + + +if __name__ == "__main__": + main() diff --git a/packages/ns-plug/files/ns-plug-alert-proxy.init b/packages/ns-plug/files/ns-plug-alert-proxy.init new file mode 100644 index 000000000..31b5fa430 --- /dev/null +++ b/packages/ns-plug/files/ns-plug-alert-proxy.init @@ -0,0 +1,19 @@ +#!/bin/sh /etc/rc.common + +# +# Copyright (C) 2026 Nethesis S.r.l. +# SPDX-License-Identifier: GPL-2.0-only +# + +START=95 +STOP=4 +USE_PROCD=1 + +start_service() { + procd_open_instance + procd_set_param stdout 1 + procd_set_param stderr 1 + procd_set_param command '/usr/sbin/ns-plug-alert-proxy' + procd_set_param respawn 3600 5 0 + procd_close_instance +} diff --git a/packages/ns-storage/Makefile b/packages/ns-storage/Makefile index 27938cfd1..8bd79ac3b 100644 --- a/packages/ns-storage/Makefile +++ b/packages/ns-storage/Makefile @@ -70,8 +70,6 @@ define Package/ns-storage/install $(INSTALL_BIN) ./files/32-ns-storage-convert-uuid.uci-default $(1)/etc/uci-defaults/32-ns-storage-convert-uuid $(INSTALL_CONF) ./files/data.conf $(1)/etc/logrotate.d $(INSTALL_BIN) ./files/storage-status $(1)/usr/sbin - $(INSTALL_BIN) ./files/storage-alarm $(1)/usr/libexec - $(INSTALL_BIN) ./files/ns-storage-alert.init $(1)/etc/init.d/ns-storage-alert $(INSTALL_BIN) ./files/ns-storage-check.init $(1)/etc/init.d/ns-storage-check endef diff --git a/packages/ns-storage/README.md b/packages/ns-storage/README.md index b9554a8b2..ba92773c3 100644 --- a/packages/ns-storage/README.md +++ b/packages/ns-storage/README.md @@ -69,6 +69,10 @@ the system as follow: - rsyslog will write logs also inside `/mnt/data/logs/messages` file - logrotate will rotate 
`/mnt/data/logs/messages` once a week (see `/etc/logrotate/data.conf` for more info) +### Storage status alert + +The storage health check is exported to Telegraf by `/usr/libexec/telegraf-storage-status` and evaluated by vmalert as `StorageStatus`. + ## Data sync customization Every night the cron will run a script named `sync-data` to sync data from in-memory diff --git a/packages/ns-storage/files/ns-storage-alert.init b/packages/ns-storage/files/ns-storage-alert.init deleted file mode 100644 index 26a259041..000000000 --- a/packages/ns-storage/files/ns-storage-alert.init +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/sh /etc/rc.common - -# -# Copyright (C) 2025 Nethesis S.r.l. -# SPDX-License-Identifier: GPL-2.0-only -# - -START=99 -USE_PROCD=1 - -start_service() -{ - procd_open_instance - procd_set_param stdout 1 - procd_set_param stderr 1 - procd_set_param command '/usr/libexec/storage-alarm' - procd_close_instance -} - -reload_service() -{ - start -} - -service_triggers() -{ - procd_add_reload_trigger fstab ns-plug - procd_add_reload_mount_trigger /mnt/data -} diff --git a/packages/ns-storage/files/storage-alarm b/packages/ns-storage/files/storage-alarm deleted file mode 100644 index adaccedad..000000000 --- a/packages/ns-storage/files/storage-alarm +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/sh - -# -# Copyright (C) 2025 Nethesis S.r.l. 
-# SPDX-License-Identifier: GPL-2.0-only -# - -system_id=$(uci -q get ns-plug.config.system_id) -system_secret=$(uci -q get ns-plug.config.secret) -if [ -z "$system_id" ] || [ -z "$system_secret" ]; then - # not subscription - exit 0 -fi -url="$(uci -q get ns-plug.config.alerts_url)/alerts/store" - -storage_status=$(storage-status) -status="OK" -if [ "$storage_status" = "error" ]; then - status="FAILURE" -fi - -/usr/bin/curl -m 180 --retry 10 -L -s \ - --header "Authorization: token $system_secret" \ - --header "Content-Type: application/json" \ - --header "Accept: application/json" \ - --data-binary "{\"lk\": \"$system_id\", \"alert_id\": \"storage:status\", \"status\": \"$status\"}" \ - "$url" > /dev/null diff --git a/packages/telegraf/Makefile b/packages/telegraf/Makefile index e63822acb..1f4175a2d 100644 --- a/packages/telegraf/Makefile +++ b/packages/telegraf/Makefile @@ -76,8 +76,19 @@ define Package/telegraf/install $(INSTALL_DATA) ./files/telegraf.conf $(1)/etc/telegraf.conf $(INSTALL_DIR) $(1)/etc/telegraf.conf.d $(INSTALL_DATA) ./files/telegraf.conf.d/os.conf $(1)/etc/telegraf.conf.d/os.conf + $(INSTALL_DATA) ./files/telegraf.conf.d/backup.conf $(1)/etc/telegraf.conf.d/backup.conf + $(INSTALL_DATA) ./files/telegraf.conf.d/storage.conf $(1)/etc/telegraf.conf.d/storage.conf + $(INSTALL_DATA) ./files/telegraf.conf.d/services.conf $(1)/etc/telegraf.conf.d/services.conf + $(INSTALL_DATA) ./files/telegraf.conf.d/mwan.conf $(1)/etc/telegraf.conf.d/mwan.conf + $(INSTALL_DATA) ./files/telegraf.conf.d/ping.conf $(1)/etc/telegraf.conf.d/ping.conf $(INSTALL_DIR) $(1)/usr/sbin $(INSTALL_BIN) ./files/telegraf-config $(1)/usr/sbin/telegraf-config + $(INSTALL_DIR) $(1)/usr/libexec + $(INSTALL_BIN) ./files/telegraf-services $(1)/usr/libexec/telegraf-services + $(INSTALL_BIN) ./files/telegraf-backup-encryption $(1)/usr/libexec/telegraf-backup-encryption + $(INSTALL_BIN) ./files/telegraf-storage-status $(1)/usr/libexec/telegraf-storage-status + $(INSTALL_BIN) 
./files/telegraf-services $(1)/usr/libexec/telegraf-services + $(INSTALL_BIN) ./files/telegraf-mwan $(1)/usr/libexec/telegraf-mwan endef define Package/telegraf/postinst diff --git a/packages/telegraf/README.md b/packages/telegraf/README.md new file mode 100644 index 000000000..b6cb55a73 --- /dev/null +++ b/packages/telegraf/README.md @@ -0,0 +1,145 @@ +# Telegraf + +## Overview + +Telegraf is the metrics collection agent that gathers host and service metrics and forwards them to Victoria Metrics for storage, alerting, and visualization. + +## Architecture + +``` +/usr/libexec/telegraf-services ← service status via ubus +/var/run/mwan3/iface_state/ ← WAN interface status via mwan3 state files +/proc filesystem ← CPU, memory, disk, network + │ + ▼ + Telegraf (inputs.exec, inputs.cpu, inputs.mem, …) + │ + ▼ +Victoria Metrics (http://127.0.0.1:8428) + │ + └─▶ vmalert (alert rules evaluation) +``` + +## Configuration Files + +| Path | Description | +|------|-------------| +| `/etc/telegraf.conf` | Main Telegraf agent config and InfluxDB output | +| `/etc/telegraf.conf.d/*.conf` | Additional Telegraf input configurations for plugins | + + +## Collected Metrics + +To see the list of metrics collected by Telegraf, use: +```bash +/usr/bin/telegraf --config /etc/telegraf.conf --config-directory /etc/telegraf.conf.d --test +``` + +### Service Health Monitoring (services.conf) + +**How it works**: Every 60 seconds, Telegraf executes `/usr/libexec/telegraf-services`, which queries procd via `ubus call service list`, filters the fixed monitored service whitelist, and converts the matching configured instances to metrics. 
+ +**Metric format**: +``` +procd_service_running{service="nginx", instance="instance1"} = 1 (running) or 0 (down) +procd_service_pid{service="nginx", instance="instance1"} = process_id +procd_service_exit_code{service="nginx", instance="instance1"} = last_exit_code +``` + +Only these services are monitored: + +```text +banip +conntrackd +cron +dedalo +dedalo_users_auth +dnsmasq +dropbear +keepalived +mwan3 +netifyd +nginx +ns-api-server +ns-clm +ns-flashstart +ns-flows +ns-plug +ns-plug-alert-proxy +ns-stats +ns-ui +odhcpd +openvpn +qosify +rpcd +rsyslog +snort +swanctl +sysntpd +telegraf +victoria-metrics +vmalert +``` + +Services with no instances are skipped. + +Notable skipped services: +- `adblock`: excluded because ubus info is not reliable (always shows 1 instance even when disabled) + +##### Querying Service Status + +```bash +# All services and their running state +curl -s 'http://127.0.0.1:8428/api/v1/query?query=procd_service_running' + +# Run collection script manually to preview output +/usr/libexec/telegraf-services +``` + +### Multi-WAN Monitoring (mwan.conf) + +**How it works**: Every 60 seconds, Telegraf executes `/usr/libexec/telegraf-mwan`, which reads `/var/run/mwan3/iface_state/` to determine each WAN interface's online/offline state (maintained by mwan3 in real-time). + +**Metric format**: +``` +mwan_interface_online{interface="wan"} = 1 (online) or 0 (offline) +``` + +#### Querying WAN Status + +```bash +# All WAN interfaces and current state +curl -s 'http://127.0.0.1:8428/api/v1/query?query=mwan_interface_online' + +# Run collection script manually +/usr/libexec/telegraf-mwan +``` + +### Storage Status Monitoring (storage.conf) + +**How it works**: Every 60 seconds, Telegraf executes `/usr/libexec/telegraf-storage-status`, which runs `storage-status` and exports the current storage health as a metric. 
+ +**Metric format**: +``` +storage_status_error = 1 (error) or 0 (ok / not configured) +``` + +## Advanced Configuration + +To add custom metrics or modify collection intervals, edit the `/etc/telegraf.conf.d/` files following [Telegraf documentation](https://docs.influxdata.com/telegraf/). Common customizations: + +- Modify collection intervals: change `interval` in main config +- Add new input plugins: append `[[inputs.plugin_name]]` sections + +After changes, restart Telegraf: +```bash +/etc/init.d/telegraf restart +``` + +## References + +- [Telegraf documentation](https://docs.influxdata.com/telegraf/) +- [Telegraf exec plugin](https://github.com/influxdata/telegraf/tree/master/plugins/inputs/exec) +- [OpenWrt procd init scripts](https://openwrt.org/docs/guide-developer/procd-init-scripts) +- [OpenWrt ubus reference](https://openwrt.org/docs/techref/ubus) +- [Victoria Metrics integration](../victoria-metrics/README.md) diff --git a/packages/telegraf/files/telegraf-backup-encryption b/packages/telegraf/files/telegraf-backup-encryption new file mode 100644 index 000000000..a0026c6f3 --- /dev/null +++ b/packages/telegraf/files/telegraf-backup-encryption @@ -0,0 +1,17 @@ +#!/bin/sh +# +# Copyright (C) 2026 Nethesis S.r.l. +# SPDX-License-Identifier: GPL-2.0-only +# + +# Export backup encryption state for Telegraf. +# +# The metric is 1 when /etc/backup.pass exists and is non-empty, otherwise 0. + +if [ -s /etc/backup.pass ]; then + encrypted=1 +else + encrypted=0 +fi + +printf '[{"encrypted":%s}]\n' "$encrypted" diff --git a/packages/telegraf/files/telegraf-mwan b/packages/telegraf/files/telegraf-mwan new file mode 100644 index 000000000..ab6d12837 --- /dev/null +++ b/packages/telegraf/files/telegraf-mwan @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +# +# Copyright (C) 2026 Nethesis S.r.l. +# SPDX-License-Identifier: GPL-2.0-only +# +# Collect mwan3 WAN interface status from /var/run/mwan3/iface_state/. 
+# +# Each file in that directory is named after an mwan3 interface and contains +# a single word: "online" or "offline". The directory is managed by mwan3 +# and only exists when the daemon is running. +# +# Output metric: mwan_interface +# Tags: interface +# Fields: online (int 0/1) +# +# Prints a JSON array to stdout, consumed by telegraf inputs.exec with +# data_format = "json_v2" (parsers.json_v2 build tag). + +import json +import os + +IFACE_STATE_DIR = "/var/run/mwan3/iface_state" + + +def build_records(): + """Return one record per mwan3 interface found in the state directory.""" + if not os.path.isdir(IFACE_STATE_DIR): + return [] + + records = [] + for name in sorted(os.listdir(IFACE_STATE_DIR)): + path = os.path.join(IFACE_STATE_DIR, name) + if not os.path.isfile(path): + continue + try: + status = open(path).read().strip() + except OSError: + continue + records.append( + { + "interface": name, + "online": 1 if status == "online" else 0, + } + ) + return records + + +def main(): + records = build_records() + print(json.dumps(records)) + + +if __name__ == "__main__": + main() diff --git a/packages/telegraf/files/telegraf-services b/packages/telegraf/files/telegraf-services new file mode 100644 index 000000000..932049020 --- /dev/null +++ b/packages/telegraf/files/telegraf-services @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +# +# Copyright (C) 2026 Nethesis S.r.l. +# SPDX-License-Identifier: GPL-2.0-only +# +# Collect procd service status via ubus. +# +# Monitored services: fixed whitelist of NethSecurity services. +# Services with no instances are ignored because they are disabled or not +# configured. +# +# Usage: +# /usr/libexec/telegraf-services +# +# Output metric: procd_service +# Tags: service, instance +# Fields: running (int 0/1), pid (int), exit_code (int) +# +# Prints a JSON array to stdout, consumed by telegraf inputs.exec with +# data_format = "json_v2" (parsers.json_v2 build tag). 
+# +import json +import subprocess +import sys +MONITORED_SERVICES = { + "banip", + "conntrackd", + "cron", + "dedalo", + "dedalo_users_auth", + "dnsmasq", + "dropbear", + "keepalived", + "mwan3", + "netifyd", + "nginx", + "ns-api-server", + "ns-clm", + "ns-flashstart", + "ns-flows", + "ns-plug", + "ns-plug-alert-proxy", + "ns-stats", + "ns-ui", + "odhcpd", + "openvpn", + "qosify", + "rpcd", + "rsyslog", + "snort", + "swanctl", + "sysntpd", + "telegraf", + "victoria-metrics", + "vmalert", +} + +# Excluded service: adblock + +def get_service_list(): + result = subprocess.run( + ["ubus", "call", "service", "list"], + capture_output=True, + text=True, + timeout=5, + ) + if result.returncode != 0: + print(f"Error calling ubus: {result.stderr}", file=sys.stderr) + sys.exit(1) + return json.loads(result.stdout) + + +def sanitize_tag(value): + # InfluxDB line protocol: tag values must not contain commas, spaces or equals + return value.replace(",", "_").replace(" ", "_").replace("=", "_") + + +def build_records(data): + """Return a list of dicts, one per configured monitored service instance.""" + records = [] + + for svc_name in sorted(MONITORED_SERVICES): + svc_body = data.get(svc_name) + instances = (svc_body or {}).get("instances") or {} + if not instances: + continue + + for inst_name, inst in instances.items(): + records.append( + { + "service": sanitize_tag(svc_name), + "instance": sanitize_tag(inst_name), + "running": 1 if inst.get("running", False) else 0, + "pid": inst.get("pid", 0), + "exit_code": inst.get("exit_code", 0), + } + ) + return records + + +def main(): + data = get_service_list() + records = build_records(data) + + # JSON array — consumed by telegraf inputs.exec with data_format=json_v2 + print(json.dumps(records)) + + +if __name__ == "__main__": + main() diff --git a/packages/telegraf/files/telegraf-storage-status b/packages/telegraf/files/telegraf-storage-status new file mode 100644 index 000000000..f7dd52113 --- /dev/null +++ 
b/packages/telegraf/files/telegraf-storage-status @@ -0,0 +1,21 @@ +#!/bin/sh +# +# Copyright (C) 2026 Nethesis S.r.l. +# SPDX-License-Identifier: GPL-2.0-only +# + +# Export the storage health state for Telegraf. +# +# The metric is 1 when storage-status reports "error", otherwise 0. + +if storage_status=$(/usr/sbin/storage-status 2>/dev/null); then + if [ "$storage_status" = "error" ]; then + error=1 + else + error=0 + fi +else + error=1 +fi + +printf '[{"error":%s}]\n' "$error" diff --git a/packages/telegraf/files/telegraf.conf.d/backup.conf b/packages/telegraf/files/telegraf.conf.d/backup.conf new file mode 100644 index 000000000..65e4fdf1e --- /dev/null +++ b/packages/telegraf/files/telegraf.conf.d/backup.conf @@ -0,0 +1,16 @@ +# Backup encryption status monitoring +# Reports whether /etc/backup.pass is set so vmalert can alert on unencrypted backups. + +[[inputs.exec]] + name_override = "backup_encryption" + commands = ["/usr/libexec/telegraf-backup-encryption"] + interval = "60s" + timeout = "5s" + data_format = "json_v2" + + [[inputs.exec.json_v2]] + [[inputs.exec.json_v2.object]] + path = "@this" + + [inputs.exec.tags] + influxdb_db = "os-metrics" diff --git a/packages/telegraf/files/telegraf.conf.d/mwan.conf b/packages/telegraf/files/telegraf.conf.d/mwan.conf new file mode 100644 index 000000000..8658cfdcf --- /dev/null +++ b/packages/telegraf/files/telegraf.conf.d/mwan.conf @@ -0,0 +1,20 @@ +# mwan3 WAN interface status monitoring +# Reads /var/run/mwan3/iface_state/ — one file per interface, content is +# "online" or "offline". No-ops silently when mwan3 is not running. +# +# Uses parsers.json_v2 — available in the default NethSecurity Telegraf build. 
+ +[[inputs.exec]] + name_override = "mwan_interface" + commands = ["/usr/libexec/telegraf-mwan"] + interval = "60s" + timeout = "10s" + data_format = "json_v2" + + [[inputs.exec.json_v2]] + [[inputs.exec.json_v2.object]] + path = "@this" + tags = ["interface"] + + [inputs.exec.tags] + influxdb_db = "os-metrics" diff --git a/packages/telegraf/files/telegraf.conf.d/services.conf b/packages/telegraf/files/telegraf.conf.d/services.conf new file mode 100644 index 000000000..04c5f98c6 --- /dev/null +++ b/packages/telegraf/files/telegraf.conf.d/services.conf @@ -0,0 +1,20 @@ +# Procd service status monitoring +# Collects running state for a fixed whitelist of persistent services. +# Services with no instances are ignored by the collector. +# +# Uses parsers.json_v2 — available in the default NethSecurity Telegraf build. + +[[inputs.exec]] + name_override = "procd_service" + commands = ["/usr/libexec/telegraf-services"] + interval = "60s" + timeout = "10s" + data_format = "json_v2" + + [[inputs.exec.json_v2]] + [[inputs.exec.json_v2.object]] + path = "@this" + tags = ["service", "instance"] + + [inputs.exec.tags] + influxdb_db = "os-metrics" diff --git a/packages/telegraf/files/telegraf.conf.d/storage.conf b/packages/telegraf/files/telegraf.conf.d/storage.conf new file mode 100644 index 000000000..10e6efee2 --- /dev/null +++ b/packages/telegraf/files/telegraf.conf.d/storage.conf @@ -0,0 +1,16 @@ +# Storage status monitoring +# Reports whether the persistent data storage is mounted so vmalert can alert on storage:status. 
+ +[[inputs.exec]] + name_override = "storage_status" + commands = ["/usr/libexec/telegraf-storage-status"] + interval = "60s" + timeout = "5s" + data_format = "json_v2" + + [[inputs.exec.json_v2]] + [[inputs.exec.json_v2.object]] + path = "@this" + + [inputs.exec.tags] + influxdb_db = "os-metrics" diff --git a/packages/victoria-metrics/Makefile b/packages/victoria-metrics/Makefile index 5fe85f5a8..6510e5a0a 100644 --- a/packages/victoria-metrics/Makefile +++ b/packages/victoria-metrics/Makefile @@ -52,19 +52,25 @@ endef define Package/victoria-metrics/conffiles /etc/config/victoria-metrics +/etc/config/vmalert endef define Package/victoria-metrics/install $(call GoPackage/Package/Install/Bin,$(1)) $(INSTALL_DIR) $(1)/etc/init.d $(INSTALL_BIN) ./files/victoria-metrics.initd $(1)/etc/init.d/victoria-metrics + $(INSTALL_BIN) ./files/vmalert.initd $(1)/etc/init.d/vmalert $(INSTALL_DIR) $(1)/etc/config $(INSTALL_DATA) ./files/victoria-metrics.conf $(1)/etc/config/victoria-metrics + $(INSTALL_DATA) ./files/vmalert.conf $(1)/etc/config/vmalert + $(INSTALL_DIR) $(1)/etc/vmalert/rules + $(INSTALL_DATA) ./files/vmalert-rules/*.yaml $(1)/etc/vmalert/rules/ endef define Package/victoria-metrics/postinst #!/bin/sh [ -z "$${IPKG_INSTROOT}" ] && /etc/init.d/victoria-metrics restart +[ -z "$${IPKG_INSTROOT}" ] && /etc/init.d/vmalert restart exit 0 endef diff --git a/packages/victoria-metrics/README.md b/packages/victoria-metrics/README.md new file mode 100644 index 000000000..d878bf4f1 --- /dev/null +++ b/packages/victoria-metrics/README.md @@ -0,0 +1,138 @@ +# Victoria Metrics + +## Overview + +This package provides **Victoria Metrics** and **vmalert** for time-series metrics storage and alerting in NethSecurity. Metrics are collected by Telegraf, stored in Victoria Metrics, and evaluated by vmalert according to alert rules. 
+ +**Key Components:** +- **victoria-metrics**: Time-series database on port 8428 +- **vmalert**: Alert rule evaluator on port 8082 +- **Telegraf integration**: Host metrics, service health, WAN status, storage status +- **Mimir integration**: Optional centralized alerting (via ns-plug) + +## Quick Start + +### View Active Alerts + +```bash +# List all firing and pending alerts +curl http://127.0.0.1:8082/api/v1/alerts | jq + +# Get a specific alert status +curl 'http://127.0.0.1:8082/api/v1/rules?type=alert' | \ + jq '.data.groups[].rules[] | select(.name == "HighCpuUsage") | {name, state, lastEvaluation}' +``` + +### List Available Metrics + +```bash +# All metrics currently being stored +curl -s 'http://127.0.0.1:8428/api/v1/label/__name__/values' | jq -r '.data[]' | sort +``` + +## Configuration + +Configuration is located at `/etc/config/victoria-metrics`: + +``` +config victoriametrics 'main' + option storage_path '/var/lib/victoriametrics' + option retention_period '1y' + option http_listen_addr '127.0.0.1:8428' +``` + +**Options:** +- `storage_path`: Where to store metrics data +- `retention_period`: How long to keep metrics (`1d`, `7d`, `30d`, `1y`, etc.) +- `http_listen_addr`: Address and port for the HTTP server + +### Accessing the Web UI + +By default the server is accessible only on localhost for security. +The service also exposes a Web UI on port 8428 for browsing metrics and testing queries. + +To access the Web UI, you can change the `http_listen_addr` to `0.0.0.0:8428` to allow external access, but this is not recommended for production environments without proper security measures. +A safer approach is to use SSH port forwarding: +```bash +ssh -L 8428:127.0.0.1:8428 root@remote_host +``` + +Then open `http://127.0.0.1:8428` in your web browser to see all exposed endpoints. +The UI to query metrics is available at `http://127.0.0.1:8428/vmui` + +## Alerting Rules + +All alert rules are defined as YAML files in `/etc/vmalert/rules/*.yaml`.
Each file corresponds to a specific monitoring category. + +Some alerts implement a two-tier severity model with `warning` and `critical` levels and are designed to suppress lower-severity alerts when higher-severity ones are firing. + +Warning alerts use `unless` clauses to suppress them when their critical counterpart is already firing, reducing noise. For example, `HighCpuUsage` warning is silenced when `CriticalCpuUsage` is firing. + +See rule files for specific thresholds and suppression logic. + +An alert can be in one of three states: + +1. **Pending**: Condition is true but hasn't met the required `for` duration +2. **Firing**: Condition has been true for at least the `for` duration +3. **Resolved**: Condition is no longer true + +Example: An alert with `for: 5m` takes 5 minutes to transition from pending → firing. + +### Custom Alert Rules + +To add custom alerts, create a new YAML file in `/etc/vmalert/rules/`. +Example `my_alerts.yaml`: + +```yaml +groups: + - name: "my_alerts" + interval: "30s" + rules: + - alert: MyAlert + expr: 'metric_name > threshold' + for: "5m" + labels: + severity: "warning" + service: "my_service" + annotations: + summary_en: "Alert summary" + summary_it: "Riepilogo avviso" + description_en: "Value is {{ $value }}" +``` + +Then restart vmalert: +```bash +/etc/init.d/vmalert restart +``` + +## Mimir Integration (ns-plug) + +Mimir is a multi-tenant Prometheus-compatible long-term storage and alerting system used by nextgen [my](https://github.com/NethServer/my/) monitoring +platform. +When Mimir is configured via ns-plug, vmalert automatically forwards alerts. No manual vmalert configuration needed. 
+ +**Enable Mimir forwarding:** +```bash +uci set ns-plug.config.my_url='https://mimir.example.com' +uci set ns-plug.config.my_system_key='your_api_key' +uci set ns-plug.config.my_system_secret='your_api_secret' +uci commit ns-plug +/etc/init.d/vmalert restart +``` + +**Disable (alert-proxy only mode):** +```bash +uci delete ns-plug.config.my_url +uci delete ns-plug.config.my_system_key +uci delete ns-plug.config.my_system_secret +uci commit ns-plug +/etc/init.d/vmalert restart +``` + +## References + +- [Victoria Metrics vmalert docs](https://docs.victoriametrics.com/vmalert/) +- [MetricsQL documentation](https://docs.victoriametrics.com/metricsql/) +- [Prometheus alerting rules](https://samber.github.io/awesome-prometheus-alerts/) +- [vmalert documentation](https://docs.victoriametrics.com/vmalert/) +- [Telegraf metrics collection](../telegraf/README.md) diff --git a/packages/victoria-metrics/files/vmalert-rules/backup.yaml b/packages/victoria-metrics/files/vmalert-rules/backup.yaml new file mode 100644 index 000000000..30800186e --- /dev/null +++ b/packages/victoria-metrics/files/vmalert-rules/backup.yaml @@ -0,0 +1,20 @@ +# Victoria Metrics Alert Rules for backup encryption monitoring +# +# Monitors whether /etc/backup.pass is present and non-empty via the +# backup_encryption_encrypted metric collected by Telegraf. + +groups: + - name: "backup" + interval: "60s" + rules: + - alert: BackupEncryptionDisabled + expr: 'backup_encryption_encrypted == 0' + for: "2m" + labels: + severity: "warning" + service: "backup" + annotations: + summary_en: "Backup encryption is disabled" + summary_it: "La cifratura dei backup è disattivata" + description_en: "The backup passphrase file /etc/backup.pass is missing or empty. If the firewall has a subscription, the backup will not be sent to the remote storage server." + description_it: "Il file della passphrase dei backup /etc/backup.pass manca o è vuoto.
Se il firewall ha una subscription, i backup non verranno inviati al server di archiviazione remoto." diff --git a/packages/victoria-metrics/files/vmalert-rules/host.yaml b/packages/victoria-metrics/files/vmalert-rules/host.yaml new file mode 100644 index 000000000..b01dae22f --- /dev/null +++ b/packages/victoria-metrics/files/vmalert-rules/host.yaml @@ -0,0 +1,96 @@ +# Victoria Metrics Alert Rules for Host and Hardware Monitoring +# +# Based on: https://samber.github.io/awesome-prometheus-alerts/rules/basic-resource-monitoring/host-and-hardware/ +# Adapted for Telegraf metrics names + +groups: + - name: "host_and_hardware" + interval: "30s" + rules: + # CPU Monitoring + - alert: HighCpuUsage + expr: 'round(100 - avg(cpu_usage_idle), 0.1) > 70 unless round(100 - avg(cpu_usage_idle), 0.1) > 85' + for: "5m" + labels: + severity: "info" + service: "host" + annotations: + summary_en: "High CPU usage detected" + summary_it: "Utilizzo elevato di CPU rilevato" + description_en: "CPU usage is {{ $value }}%" + description_it: "Utilizzo della CPU è {{ $value }}%" + + - alert: CriticalCpuUsage + expr: 'round(100 - avg(cpu_usage_idle), 0.1) > 85' + for: "2m" + labels: + severity: "warning" + service: "host" + annotations: + summary_en: "Critical CPU usage detected" + summary_it: "Utilizzo critico di CPU rilevato" + description_en: "CPU usage is {{ $value }}%" + description_it: "Utilizzo della CPU è {{ $value }}%" + + # Memory Monitoring + - alert: HighMemoryUsage + expr: 'round((mem_used / mem_total) * 100, 0.1) > 80 unless round((mem_used / mem_total) * 100, 0.1) > 90' + for: "5m" + labels: + severity: "info" + service: "host" + annotations: + summary_en: "High memory usage detected" + summary_it: "Utilizzo elevato di memoria rilevato" + description_en: "Memory usage is {{ $value }}%" + description_it: "Utilizzo della memoria è {{ $value }}%" + + - alert: CriticalMemoryUsage + expr: 'round((mem_used / mem_total) * 100, 0.1) > 90' + for: "2m" + labels: + severity: "warning" + 
service: "host" + annotations: + summary_en: "Critical memory usage detected" + summary_it: "Utilizzo critico di memoria rilevato" + description_en: "Memory usage is {{ $value }}%" + description_it: "Utilizzo della memoria è {{ $value }}%" + + # Disk Space Monitoring + - alert: DiskSpaceWarning + expr: 'round((disk_used / disk_total) * 100, 0.1) > 80 unless round((disk_used / disk_total) * 100, 0.1) > 90' + for: "5m" + labels: + severity: "warning" + service: "storage" + annotations: + summary_en: "Disk space low on {{ $labels.path }}" + summary_it: "Spazio disco in esaurimento su {{ $labels.path }}" + description_en: "Disk usage is {{ $value }}% on {{ $labels.path }}" + description_it: "Utilizzo del disco è {{ $value }}% su {{ $labels.path }}" + + - alert: DiskSpaceCritical + expr: 'round((disk_used / disk_total) * 100, 0.1) > 90' + for: "2m" + labels: + severity: "critical" + service: "storage" + annotations: + summary_en: "Disk space critical on {{ $labels.path }}" + summary_it: "Spazio disco critico su {{ $labels.path }}" + description_en: "Disk usage is {{ $value }}% on {{ $labels.path }}" + description_it: "Utilizzo del disco è {{ $value }}% su {{ $labels.path }}" + + # System Load Monitoring + - alert: HighSystemLoad + expr: 'system_load1 / system_n_cpus > 2' + for: "5m" + labels: + severity: "warning" + service: "host" + annotations: + summary_en: "High system load detected" + summary_it: "Carico di sistema elevato rilevato" + description_en: "System load is {{ $value }}" + description_it: "Carico di sistema è {{ $value }}" diff --git a/packages/victoria-metrics/files/vmalert-rules/mwan.yaml b/packages/victoria-metrics/files/vmalert-rules/mwan.yaml new file mode 100644 index 000000000..a4da2b96d --- /dev/null +++ b/packages/victoria-metrics/files/vmalert-rules/mwan.yaml @@ -0,0 +1,25 @@ +# Victoria Metrics Alert Rules for mwan3 WAN Monitoring +# +# Monitors WAN interface connectivity via the mwan_interface_online metric +# collected by 
/usr/libexec/telegraf-mwan. +# +# The metric is sourced from /var/run/mwan3/iface_state/ which mwan3 +# writes as "online" or "offline" based on its tracking probes. +# Only interfaces present in that directory are monitored — interfaces +# not managed by mwan3 are not included. + +groups: + - name: "mwan" + interval: "60s" + rules: + - alert: WanDown + expr: 'mwan_interface_online == 0' + for: "2m" + labels: + severity: "critical" + service: "network" + annotations: + summary_en: "WAN interface {{ $labels.interface }} is offline" + summary_it: "L'interfaccia WAN {{ $labels.interface }} non è raggiungibile" + description_en: "WAN interface {{ $labels.interface }} is down. Internet connectivity lost." + description_it: "L'interfaccia WAN {{ $labels.interface }} non è raggiungibile. Connettività Internet persa." diff --git a/packages/victoria-metrics/files/vmalert-rules/services.yaml b/packages/victoria-metrics/files/vmalert-rules/services.yaml new file mode 100644 index 000000000..f87b1c38c --- /dev/null +++ b/packages/victoria-metrics/files/vmalert-rules/services.yaml @@ -0,0 +1,22 @@ +# Victoria Metrics Alert Rules for Service Monitoring +# +# Monitors configured procd-managed services via the procd_service_* metrics +# collected by /usr/libexec/telegraf-services. +# +# Services with no instances are ignored by the collector. 
+ +groups: + - name: "services" + interval: "60s" + rules: + - alert: ServiceDown + expr: 'procd_service_running == 0 and procd_service_exit_code != 0' + for: "2m" + labels: + severity: "critical" + alertgroup: "services" + annotations: + summary_en: "Service {{ $labels.service }} is down" + summary_it: "Il servizio {{ $labels.service }} non è attivo" + description_en: "Service {{ $labels.service }} (instance {{ $labels.instance }}) has been down for more than 2 minutes" + description_it: "Il servizio {{ $labels.service }} (istanza {{ $labels.instance }}) non è attivo da più di 2 minuti" diff --git a/packages/victoria-metrics/files/vmalert-rules/storage.yaml b/packages/victoria-metrics/files/vmalert-rules/storage.yaml new file mode 100644 index 000000000..fdcca20cf --- /dev/null +++ b/packages/victoria-metrics/files/vmalert-rules/storage.yaml @@ -0,0 +1,18 @@ +# Storage status monitoring +# +# The alert is driven by the Telegraf storage_status_error metric. + +groups: + - name: "storage" + interval: "30s" + rules: + - alert: StorageStatus + expr: 'storage_status_error == 1' + labels: + severity: "critical" + service: "storage" + annotations: + summary_en: "Storage is in error state" + summary_it: "Lo storage è in stato di errore" + description_en: "The configured data storage is not mounted or is otherwise in error." + description_it: "Lo storage dati configurato non è montato o è in errore." 
diff --git a/packages/victoria-metrics/files/vmalert.conf b/packages/victoria-metrics/files/vmalert.conf new file mode 100644 index 000000000..c40cc9c37 --- /dev/null +++ b/packages/victoria-metrics/files/vmalert.conf @@ -0,0 +1,3 @@ +config main 'main' + option datasource_url 'http://localhost:8428' + option http_listen_addr '127.0.0.1:8082' diff --git a/packages/victoria-metrics/files/vmalert.initd b/packages/victoria-metrics/files/vmalert.initd new file mode 100644 index 000000000..5ff4efd58 --- /dev/null +++ b/packages/victoria-metrics/files/vmalert.initd @@ -0,0 +1,69 @@ +#!/bin/sh /etc/rc.common + +# +# Copyright (C) 2026 Nethesis S.r.l. +# SPDX-License-Identifier: GPL-2.0-only +# + +START=95 +STOP=5 +USE_PROCD=1 + +PROG="/usr/bin/vmalert" +RULE_DIR="/etc/vmalert/rules" + +start_service() { + config_load vmalert 2>/dev/null || true + + local datasource_url http_listen_addr + config_get datasource_url main datasource_url "http://localhost:8428" + config_get http_listen_addr main http_listen_addr "127.0.0.1:8082" + + # Check if Mimir integration is configured in ns-plug + local mimir_url mimir_key mimir_secret notifier_url + config_load ns-plug 2>/dev/null && { + config_get mimir_url config my_url "" + config_get mimir_key config my_system_key "" + config_get mimir_secret config my_system_secret "" + } + + # If all Mimir credentials are present, configure alert forwarding to Mimir + if [ -n "$mimir_url" ] && [ -n "$mimir_key" ] && [ -n "$mimir_secret" ]; then + notifier_url="${mimir_url%/}/collect/api/services/mimir/alertmanager" + else + notifier_url="" + fi + + procd_open_instance + procd_set_param command $PROG + procd_append_param command -rule="$RULE_DIR/*.yaml" + procd_append_param command -httpListenAddr="$http_listen_addr" + procd_append_param command -datasource.url="$datasource_url" + procd_append_param command -remoteRead.url="$datasource_url" + procd_append_param command -remoteWrite.url="$datasource_url" + procd_append_param command
-evaluationInterval=30s + + # Always notify the local alert-proxy (handles unregistered machines gracefully) + procd_append_param command -notifier.url="http://127.0.0.1:9095" + + # Also forward to Mimir if credentials are configured + if [ -n "$notifier_url" ]; then + procd_append_param command -notifier.url="$notifier_url" + procd_append_param command -notifier.basicAuth.username="$mimir_key" + procd_append_param command -notifier.basicAuth.password="$mimir_secret" + fi + + procd_set_param stdout 1 + procd_set_param stderr 1 + procd_set_param respawn 3600 5 5 + procd_close_instance +} + +reload_service() { + stop + start +} + +service_triggers() { + procd_add_reload_trigger vmalert +} From 51f8d6c9dd5d320aa88a5e487a9110ee30c4494e Mon Sep 17 00:00:00 2001 From: Giacomo Sanchietti Date: Thu, 23 Apr 2026 16:33:23 +0200 Subject: [PATCH 02/11] feat(api): replace ns.netdata with ns.telegraf Changes: - migrate ping monitoring from netdata's fping plugin to telegraf's native ping input plugin - expose metrics to the UI The ping plugin uses native method (method="native") which sends ICMP packets directly without external ping command, requiring CAP_NET_RAW capability or root privileges. Metrics are tagged with influxdb_db="ping-metrics" for proper InfluxDB database routing.
Assited-by: Copilot:Sonnet4.6 --- packages/ns-api/Makefile | 6 +- packages/ns-api/README.md | 129 +++++- packages/ns-api/files/ns.dashboard | 29 +- packages/ns-api/files/ns.netdata | 66 --- packages/ns-api/files/ns.netdata.json | 13 - packages/ns-api/files/ns.report | 87 ++-- packages/ns-api/files/ns.telegraf | 383 ++++++++++++++++++ packages/ns-api/files/ns.telegraf.json | 13 + packages/ns-api/openapi.yml | 77 ++++ packages/telegraf/Makefile | 33 +- .../telegraf/files/telegraf.conf.d/ping.conf | 26 ++ 11 files changed, 725 insertions(+), 137 deletions(-) delete mode 100755 packages/ns-api/files/ns.netdata delete mode 100644 packages/ns-api/files/ns.netdata.json create mode 100755 packages/ns-api/files/ns.telegraf create mode 100644 packages/ns-api/files/ns.telegraf.json create mode 100644 packages/telegraf/files/telegraf.conf.d/ping.conf diff --git a/packages/ns-api/Makefile b/packages/ns-api/Makefile index a3bd5ce0a..b4bec3a06 100644 --- a/packages/ns-api/Makefile +++ b/packages/ns-api/Makefile @@ -118,8 +118,10 @@ define Package/ns-api/install $(INSTALL_DATA) ./files/ns.mwan.json $(1)/usr/share/rpcd/acl.d/ $(INSTALL_BIN) ./files/ns.dpi $(1)/usr/libexec/rpcd/ $(INSTALL_DATA) ./files/ns.dpi.json $(1)/usr/share/rpcd/acl.d/ - $(INSTALL_BIN) ./files/ns.netdata $(1)/usr/libexec/rpcd/ - $(INSTALL_DATA) ./files/ns.netdata.json $(1)/usr/share/rpcd/acl.d/ + $(INSTALL_BIN) ./files/ns.telegraf $(1)/usr/libexec/rpcd/ + $(INSTALL_DATA) ./files/ns.telegraf.json $(1)/usr/share/rpcd/acl.d/ + $(LN) ns.telegraf $(1)/usr/libexec/rpcd/ns.netdata + $(LN) ns.telegraf.json $(1)/usr/share/rpcd/acl.d/ns.netdata.json $(INSTALL_BIN) ./files/ns.storage $(1)/usr/libexec/rpcd/ $(INSTALL_DATA) ./files/ns.storage.json $(1)/usr/share/rpcd/acl.d/ $(INSTALL_BIN) ./files/ns.account $(1)/usr/libexec/rpcd/ diff --git a/packages/ns-api/README.md b/packages/ns-api/README.md index 6c2e38672..f3b84620e 100644 --- a/packages/ns-api/README.md +++ b/packages/ns-api/README.md @@ -150,6 +150,124 @@ Response: 
} ``` +## ns.telegraf + +Read and update Telegraf ping monitoring targets, query historical metrics stored in VictoriaMetrics, and list the current alerts evaluated by vmalert. + +### get-configuration + +Get the current list of hosts monitored by the Telegraf ping input: +``` +api-cli ns.telegraf get-configuration +``` + +Output example: +```json +{ + "hosts": [ + "1.1.1.1", + "google.com" + ] +} +``` + +### set-hosts + +Set the list of hosts monitored by the Telegraf ping input and restart Telegraf: +``` +api-cli ns.telegraf set-hosts --data '{"hosts": ["1.1.1.1", "8.8.8.8"]}' +``` + +Parameters: +- `hosts`: array of hostnames or IP addresses to monitor + +Output example: +```json +{ + "success": true +} +``` + +### metrics-history + +Return historical system and network metrics collected by Telegraf and stored in VictoriaMetrics: +``` +api-cli ns.telegraf metrics-history --data '{"start": 1746607800, "end": 1746608400, "step": 60}' +``` + +Parameters: +- `start`: start of the time range as Unix timestamp +- `end`: end of the time range as Unix timestamp +- `step`: sampling interval in seconds + +Output example: +```json +{ + "connections": { + "labels": [1746608100], + "datasets": [{ "label": "Connections", "data": [123] }] + }, + "traffic": {}, + "cpu": { + "labels": [1746608100], + "datasets": [{ "label": "CPU (%)", "data": [14.2] }] + }, + "load": { + "labels": [1746608100], + "datasets": [ + { "label": "1m", "data": [0.12] }, + { "label": "5m", "data": [0.08] }, + { "label": "15m", "data": [0.05] } + ] + }, + "diskio": { "labels": [], "datasets": [] }, + "disk": { "labels": [], "datasets": [] }, + "processes": { "labels": [], "datasets": [] }, + "memory": { "labels": [], "datasets": [] }, + "packets": { "labels": [], "datasets": [] }, + "latency_quality": {} +} +``` + +### list-alerts + +List the current pending and firing alerts evaluated by vmalert: +``` +api-cli ns.telegraf list-alerts +``` + +Output example: +```json +{ + "alerts": [ + { + "state": 
"firing", + "name": "BackupEncryptionDisabled", + "value": "0", + "labels": { + "alertgroup": "backup", + "alertname": "BackupEncryptionDisabled", + "severity": "warning", + "service": "backup" + }, + "annotations": { + "summary_en": "Backup encryption is disabled", + "summary_it": "La cifratura dei backup e disattivata", + "description_en": "The backup passphrase file /etc/backup.pass is missing or empty.", + "description_it": "Il file della passphrase dei backup /etc/backup.pass manca o e vuoto." + }, + "activeAt": "2026-05-07T09:18:00Z", + "expression": "backup_encryption_encrypted == 0", + "source": "http://NethSec:8082/vmalert/alert?group_id=10212661952842894290&alert_id=4214684507782533109" + } + ] +} +``` + +Possible errors: +- `cannot_retrieve_alerts` +- `invalid_alerts_response` + ## ns.firewall ### list-forward-rules @@ -2436,7 +2554,10 @@ Response example: ### traffic-interface -Return an array of point describing the network traffic in the last hour: +Return an array of points describing the network traffic in the last hour. +Data is sourced from Victoria Metrics using `net_bytes_recv` and `net_bytes_sent` Telegraf counters, +converted to kb/s (kilobits per second). Labels are Unix timestamps in descending order (newest first), +with one point every 20 seconds (~180 points total). ``` api-cli ns.dashboard interface-traffic --data '{"interface": "eth0"}' ``` @@ -7932,7 +8053,7 @@ Output example: ### latency-and-quality-report -Report latency metrics (minimum, maximum and average) and connectivy quality data (packet delivery rate) for every host configured in Netdata fping configuration file, located at `/etc/netdata/fping.conf`. +Report latency metrics (minimum, maximum and average) and connectivity quality data (packet loss percentage) for every host configured in the Telegraf ping plugin configuration file, located at `/etc/telegraf.conf.d/ping.conf`. 
Usage example: ``` api-cli ns.report latency-and-quality-report @@ -7982,7 +8103,7 @@ Output example: ], [ 1731485262, - 99.8152174 + 100 ], [ 1731484894, @@ -8032,7 +8153,7 @@ Output example: ], [ 1731485262, - 99.8152174 + 100 ], [ 1731484894, diff --git a/packages/ns-api/files/ns.dashboard b/packages/ns-api/files/ns.dashboard index 1e3954eed..2a98ddae7 100644 --- a/packages/ns-api/files/ns.dashboard +++ b/packages/ns-api/files/ns.dashboard @@ -12,6 +12,8 @@ import os import sys import json import subprocess +import time +import urllib.parse import urllib.request from euci import EUci from nethsec import utils, ovpn @@ -274,17 +276,24 @@ def system_info(): def interface_traffic(interface): ret = {"labels": [], "data": []} - # retrieve from netdata the traffic for the last hour - url = f'http://127.0.0.1:19999/api/v1/data?chart=net.{interface}&after=-3600&points=180&options=abs' - try: - with urllib.request.urlopen(url, timeout=10) as fu: - data = json.loads(fu.read()) - except: - return ret + vm_url = "http://127.0.0.1:8428/api/v1/query_range" + now = int(time.time()) + one_hour_ago = now - 3600 + + def vm_query(expr): + params = urllib.parse.urlencode({"query": expr, "start": one_hour_ago, "end": now, "step": 20}) + with urllib.request.urlopen(f"{vm_url}?{params}", timeout=5) as resp: + data = json.loads(resp.read()) + result = data.get("data", {}).get("result", []) + return result[0].get("values", []) if result else [] - for record in data["data"]: - ret["labels"].append(record[0]) - ret["data"].append([record[1], record[2]]) + try: + recv = vm_query(f'rate(net_bytes_recv{{interface="{interface}"}}[20s]) * 8 / 1000') + sent = vm_query(f'rate(net_bytes_sent{{interface="{interface}"}}[20s]) * 8 / 1000') + ret["labels"] = [int(ts) for ts, _ in reversed(recv)] + ret["data"] = [[float(r), float(s)] for (_, r), (_, s) in zip(reversed(recv), reversed(sent))] + except Exception: + pass return ret diff --git a/packages/ns-api/files/ns.netdata 
b/packages/ns-api/files/ns.netdata deleted file mode 100755 index bb2309181..000000000 --- a/packages/ns-api/files/ns.netdata +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/python3 - -# -# Copyright (C) 2023 Nethesi3 S.r.l. -# SPDX-License-Identifier: GPL-2.0-only -# - -# Read and set fping configuration for netdata - -import os -import sys -import json -import subprocess -import configparser - -fping_conf_file = "/etc/netdata/fping.conf" -netdata_conf_file = "/etc/netdata/netdata.conf" - -def get_config(): - hosts = [] - # create a simpligied fping.conf if not exists - # the file must contain only one line: hosts="" - if not os.path.exists(fping_conf_file): - with open(fping_conf_file, 'w') as fp: - fp.write('hosts=""\n') - # parse the simplified config file - try: - with open(fping_conf_file, 'r') as fp: - line = fp.readline() - line = line[7:-2] - hosts = line.split(" ") - except: - pass - return {"hosts": hosts} - -def set_config(config): - # Enable and disable fping plugin on netdata - nparser = configparser.ConfigParser() - nparser.read(netdata_conf_file) - if len(config['hosts']) > 0: - nparser['plugins']['fping'] = 'yes' - else: - nparser['plugins']['fping'] = 'no' - with open(netdata_conf_file, 'w') as fpc: - nparser.write(fpc) - - try: - with open(fping_conf_file, 'w') as fp: - hosts = " ".join(config['hosts']) - fp.write(f'hosts="{hosts}"\n') - subprocess.run(["/etc/init.d/netdata", "restart"], check=True) - return {"success": True} - except: - return {"success": False} - -cmd = sys.argv[1] - -if cmd == 'list': - print(json.dumps({"get-configuration": {}, "set-hosts": {"hosts": ["1.1.1.1", "google.com"]}})) -else: - action = sys.argv[2] - if action == "get-configuration": - print(json.dumps(get_config())) - elif action == "set-hosts": - args = json.loads(sys.stdin.read()) - print(json.dumps(set_config(args))) diff --git a/packages/ns-api/files/ns.netdata.json b/packages/ns-api/files/ns.netdata.json deleted file mode 100644 index 5764ef6d4..000000000 --- 
a/packages/ns-api/files/ns.netdata.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "netdata-manager": { - "description": "Read and set netdata configuration", - "write": {}, - "read": { - "ubus": { - "ns.netdata": [ - "*" - ] - } - } - } -} diff --git a/packages/ns-api/files/ns.report b/packages/ns-api/files/ns.report index 1eca1eca1..e188c25a3 100755 --- a/packages/ns-api/files/ns.report +++ b/packages/ns-api/files/ns.report @@ -15,6 +15,7 @@ import subprocess from datetime import datetime from collections import defaultdict from nethsec import utils +import urllib.parse import urllib.request from euci import EUci @@ -324,41 +325,69 @@ def ovpnrw_bytes_by_hour_and_user(instance, day, user): return {"hours": hours_bytes} -def get_fping_hosts(): - # read fping hosts from /etc/netdata/fping.conf - try: - with open("/etc/netdata/fping.conf", 'r') as fp: - line = fp.readline() - line = line[7:-2] - hosts = line.split(" ") - return hosts - except: - return [] - - -def get_netdata_chart_data(chart_name): - ret = {"labels": [], "data": []} - # retrieve chart data from netdata - url = f'http://127.0.0.1:19999/api/v1/data?chart={chart_name}&after=-3600&points=180&options=abs' +def get_ping_hosts(): + # read ping hosts from telegraf configuration + ping_conf_file = "/etc/telegraf.conf.d/ping.conf" + hosts = [] + if os.path.exists(ping_conf_file): + try: + with open(ping_conf_file, 'r') as fp: + content = fp.read() + # Find the urls line in TOML format: urls = ["host1", "host2"] + match = re.search(r'urls\s*=\s*(\[[^\]]*\])', content) + if match: + urls_str = match.group(1) + # Parse JSON array + hosts = json.loads(urls_str) + except Exception: + pass + return hosts + + +def get_victoria_metrics_ping_data(host): + """ + Query Victoria Metrics for ping metrics. 
+ Returns: {"latency": {"labels": [...], "data": [...]}, "quality": {"labels": [...], "data": [...]}} + """ + ret_latency = {"labels": ["time", "minimum", "maximum", "average"], "data": []} + ret_quality = {"labels": ["time", "returned"], "data": []} + + vm_url = "http://127.0.0.1:8428/api/v1/query_range" + now = int(time.time()) + one_hour_ago = now - 3600 + timeout = 5 + + def vm_query(metric_expr): + params = urllib.parse.urlencode({'query': metric_expr, 'start': one_hour_ago, 'end': now, 'step': 20}) + with urllib.request.urlopen(f"{vm_url}?{params}", timeout=timeout) as resp: + data = json.loads(resp.read()) + result = data.get('data', {}).get('result', []) + return result[0].get('values', []) if result else [] + try: - with urllib.request.urlopen(url, timeout=10) as fu: - data = json.loads(fu.read()) - except: - return ret - return data + min_values = vm_query(f'ping_minimum_response_ms{{url="{host}"}}') + max_values = vm_query(f'ping_maximum_response_ms{{url="{host}"}}') + avg_values = vm_query(f'ping_average_response_ms{{url="{host}"}}') + + ret_latency["data"] = [ + [int(ts), float(mn), float(mx), float(av)] + for (ts, mn), (_, mx), (_, av) in zip(min_values, max_values, avg_values) + ] + + loss_values = vm_query(f'100 - ping_percent_packet_loss{{url="{host}"}} or 100 - ping_percent_reply_loss{{url="{host}"}}') + ret_quality["data"] = [[int(ts), float(val)] for ts, val in loss_values] + + except Exception as e: + print(f"Error querying Victoria Metrics for {host}: {str(e)}", file=sys.stderr) + + return {"latency": ret_latency, "quality": ret_quality} def latency_and_quality_report(): - hosts = get_fping_hosts() + hosts = get_ping_hosts() ret = {} for host in hosts: - host_replaced = host.replace('.', '_') - latency_chart_data = get_netdata_chart_data(f'fping.{host_replaced}_latency') - quality_chart_data = get_netdata_chart_data(f'fping.{host_replaced}_quality') - ret[host] = { - "latency": latency_chart_data, - "quality": quality_chart_data - } + 
ret[host] = get_victoria_metrics_ping_data(host) return ret diff --git a/packages/ns-api/files/ns.telegraf b/packages/ns-api/files/ns.telegraf new file mode 100755 index 000000000..0a4dd5192 --- /dev/null +++ b/packages/ns-api/files/ns.telegraf @@ -0,0 +1,383 @@ +#!/usr/bin/python3 + +# +# Copyright (C) 2026 Nethesis S.r.l. +# SPDX-License-Identifier: GPL-2.0-only +# + +# Read and set ping configuration for telegraf, and expose metrics history from VictoriaMetrics + +import os +import sys +import json +import subprocess +import re +import time +import urllib.error +import urllib.parse +import urllib.request +from euci import EUci + +ping_conf_file = "/etc/telegraf.conf.d/ping.conf" + + +def _read_ping_hosts(): + """Read the list of monitored ping hosts from the telegraf config file.""" + if not os.path.exists(ping_conf_file): + return [] + try: + with open(ping_conf_file) as fp: + match = re.search(r"urls\s*=\s*(\[[^\]]*\])", fp.read()) + if match: + return json.loads(match.group(1)) + except Exception: + pass + return [] + + +def get_config(): + return {"hosts": _read_ping_hosts()} + + +def set_config(config): + try: + # Ensure directory exists + os.makedirs(os.path.dirname(ping_conf_file), exist_ok=True) + + # Create the telegraf ping configuration + with open(ping_conf_file, "w") as fp: + fp.write("# Ping input plugin configuration\n") + fp.write("[[inputs.ping]]\n") + if len(config["hosts"]) > 0: + # Format hosts as TOML array + hosts_str = json.dumps(config["hosts"]) + fp.write(f" urls = {hosts_str}\n") + fp.write(' method = "native"\n') + fp.write(" count = 5\n") + fp.write(" ping_interval = 1.0\n") + fp.write(" deadline = 10\n") + fp.write(" [inputs.ping.tags]\n") + fp.write(' influxdb_db = "ping-metrics"\n') + else: + # Write empty config to disable + fp.write(" urls = []\n") + + # Restart telegraf service + subprocess.run(["/etc/init.d/telegraf", "restart"], check=True) + return {"success": True} + except Exception as e: + return {"success": False, 
"error": str(e)} + + +VM_URL = "http://127.0.0.1:8428/api/v1/query_range" +VM_ALERTS_URL = "http://127.0.0.1:8082/api/v1/alerts" + + +def _get_interface_to_zone_map(): + """Map physical interface names to firewall zone names and devices. + Returns: {"eth1": {"zone": "wan", "device": "eth1"}, "br-lan": {"zone": "lan", "device": "br-lan"}} + """ + interface_map = {} + try: + u = EUci() + zones = u.get("firewall") + networks = u.get("network") + + if not zones or not networks: + return interface_map + + # First, build a map of network names to devices + network_to_device = {} + for net_name, net_config in networks.items(): + if isinstance(net_config, dict): + device = net_config.get("device") or net_config.get("ifname") + if device: + network_to_device[net_name] = device + + # Now map zones to networks and then to devices + for zone_name, zone_config in zones.items(): + if isinstance(zone_config, dict) and "name" in zone_config: + zone_label = zone_config.get("name", zone_name) + networks_list = zone_config.get("network", ()) + + # Convert to list if needed + if isinstance(networks_list, str): + networks_list = [networks_list] + elif not isinstance(networks_list, (list, tuple)): + networks_list = [] + + for network in networks_list: + network = str(network).strip() + if network in network_to_device: + device = network_to_device[network] + interface_map[device] = {"zone": zone_label, "device": device} + except Exception: + pass + + return interface_map + + +def _vm_query(expr, start, end, step): + """Execute a single PromQL range query against VictoriaMetrics.""" + params = urllib.parse.urlencode( + {"query": expr, "start": start, "end": end, "step": step} + ) + with urllib.request.urlopen(f"{VM_URL}?{params}", timeout=10) as resp: + data = json.loads(resp.read()) + return data.get("data", {}).get("result", []) + + +def _single_series(results): + """Extract timestamps and values from a single-series result.""" + if not results: + return {"labels": [], "data": []} + 
values = results[0].get("values", []) + return { + "labels": [int(ts) for ts, _ in values], + "data": [float(v) for _, v in values], + } + + +def list_alerts(): + try: + with urllib.request.urlopen(VM_ALERTS_URL, timeout=10) as resp: + data = json.loads(resp.read()) + except (TimeoutError, urllib.error.URLError): + return {"error": "cannot_retrieve_alerts"} + except json.JSONDecodeError: + return {"error": "invalid_alerts_response"} + + alerts = data.get("data", {}).get("alerts") + if not isinstance(alerts, list): + return {"error": "invalid_alerts_response"} + + return {"alerts": alerts} + + +def metrics_history(args): + now = int(time.time()) + start = int(args.get("start", now - 86400)) + end = int(args.get("end", now)) + step = int(args.get("step", 300)) + + def q(expr): + try: + return _vm_query(expr, start, end, step) + except Exception: + return [] + + def single(expr): + return _single_series(q(expr)) + + # Connections (conntrack) + s = single("conntrack_ip_conntrack_count") + connections = { + "labels": s["labels"], + "datasets": [{"label": "Connections", "data": s["data"]}], + } + + # Traffic per interface – exclude loopback and intermediate functional blocks (ifb*) + # Organize traffic by interface with zone name labels + interface_map = _get_interface_to_zone_map() + recv_results = q('rate(net_bytes_recv{interface!~"lo|ifb.*"}[5m])') + sent_results = q('rate(net_bytes_sent{interface!~"lo|ifb.*"}[5m])') + + traffic_labels: list = [] + for r in recv_results + sent_results: + if r.get("values"): + traffic_labels = [int(ts) for ts, _ in r["values"]] + break + + # Build traffic data organized by interface + traffic_by_interface = {} + for r in recv_results: + iface = r.get("metric", {}).get("interface", "unknown") + iface_info = interface_map.get(iface, {"zone": iface, "device": iface}) + zone_key = f"{iface_info['zone']}|{iface_info['device']}" # Use composite key + if zone_key not in traffic_by_interface: + traffic_by_interface[zone_key] = { + "labels": 
traffic_labels, + "datasets": [], + "zone": iface_info["zone"], + "device": iface_info["device"], + } + traffic_by_interface[zone_key]["datasets"].append( + {"label": "Download", "data": [float(v) for _, v in r.get("values", [])]} + ) + + for r in sent_results: + iface = r.get("metric", {}).get("interface", "unknown") + iface_info = interface_map.get(iface, {"zone": iface, "device": iface}) + zone_key = f"{iface_info['zone']}|{iface_info['device']}" + if zone_key not in traffic_by_interface: + traffic_by_interface[zone_key] = { + "labels": traffic_labels, + "datasets": [], + "zone": iface_info["zone"], + "device": iface_info["device"], + } + traffic_by_interface[zone_key]["datasets"].append( + {"label": "Upload", "data": [float(v) for _, v in r.get("values", [])]} + ) + + # CPU usage (%) + s = single('100 - (avg(cpu_usage_idle{cpu="cpu-total"}))') + cpu = {"labels": s["labels"], "datasets": [{"label": "CPU (%)", "data": s["data"]}]} + + # System load (1m / 5m / 15m) + s1 = single("system_load1") + s5 = single("system_load5") + s15 = single("system_load15") + load = { + "labels": s1["labels"], + "datasets": [ + {"label": "1m", "data": s1["data"]}, + {"label": "5m", "data": s5["data"]}, + {"label": "15m", "data": s15["data"]}, + ], + } + + # Disk I/O – sum across all non-loop block devices + s_read = single('sum(rate(diskio_read_bytes{name!~"loop.*"}[5m]))') + s_write = single('sum(rate(diskio_write_bytes{name!~"loop.*"}[5m]))') + diskio = { + "labels": s_read["labels"], + "datasets": [ + {"label": "Read", "data": s_read["data"]}, + {"label": "Write", "data": s_write["data"]}, + ], + } + + # Disk usage per real partition (exclude virtual filesystems) + disk_results = q( + 'disk_used_percent{fstype!~"tmpfs|cgroup2|devtmpfs|sysfs|proc|overlay|squashfs"}' + ) + if disk_results: + disk_labels = [int(ts) for ts, _ in disk_results[0].get("values", [])] + disk_datasets = [ + { + "label": r.get("metric", {}).get("path", "unknown"), + "data": [float(v) for _, v in 
r.get("values", [])], + } + for r in disk_results + ] + disk = {"labels": disk_labels, "datasets": disk_datasets} + else: + disk = {"labels": [], "datasets": []} + + # Total processes + s = single("processes_total") + processes = { + "labels": s["labels"], + "datasets": [{"label": "Processes", "data": s["data"]}], + } + + # RAM usage: Used and Free (MB) + s_used = single("mem_used / 1048576") + s_free = single("mem_free / 1048576") + memory = { + "labels": s_used["labels"], + "datasets": [ + {"label": "Used", "data": s_used["data"]}, + {"label": "Free", "data": s_free["data"]}, + ], + } + + # Packets Rx/Tx – sum across all interfaces + s_rx = single("sum(rate(net_packets_recv[5m]))") + s_tx = single("sum(rate(net_packets_sent[5m]))") + packets = { + "labels": s_rx["labels"], + "datasets": [ + {"label": "Rx", "data": s_rx["data"]}, + {"label": "Tx", "data": s_tx["data"]}, + ], + } + + # Latency and quality reports for monitored hosts + latency_quality = latency_and_quality_report(start, end, step) + + return { + "connections": connections, + "traffic": traffic_by_interface, + "cpu": cpu, + "load": load, + "diskio": diskio, + "disk": disk, + "processes": processes, + "memory": memory, + "packets": packets, + "latency_quality": latency_quality, + } + + +def latency_and_quality_report(start, end, step): + """Fetch latency and packet loss data for monitored hosts.""" + + def q_values(expr): + try: + result = _vm_query(expr, start, end, step) + return result[0].get("values", []) if result else [] + except Exception: + return [] + + ret = {} + for host in _read_ping_hosts(): + latency_data = {"labels": [], "datasets": []} + quality_data = {"labels": [], "datasets": []} + + try: + min_values = q_values(f'ping_minimum_response_ms{{url="{host}"}}') + max_values = q_values(f'ping_maximum_response_ms{{url="{host}"}}') + avg_values = q_values(f'ping_average_response_ms{{url="{host}"}}') + loss_values = q_values( + f'100 - ping_percent_packet_loss{{url="{host}"}} or 100 - 
ping_percent_reply_loss{{url="{host}"}}' + ) + + if min_values and max_values and avg_values: + labels = [int(ts) for ts, _ in min_values] + latency_data["labels"] = labels + latency_data["datasets"] = [ + {"label": "Min", "data": [float(v) for _, v in min_values]}, + {"label": "Avg", "data": [float(v) for _, v in avg_values]}, + {"label": "Max", "data": [float(v) for _, v in max_values]}, + ] + + if loss_values: + quality_data["labels"] = [int(ts) for ts, _ in loss_values] + quality_data["datasets"] = [ + {"label": "Delivery %", "data": [float(v) for _, v in loss_values]}, + ] + except Exception: + pass + + ret[host] = {"latency": latency_data, "quality": quality_data} + + return ret + + +cmd = sys.argv[1] + +if cmd == "list": + print( + json.dumps( + { + "get-configuration": {}, + "set-hosts": {"hosts": ["1.1.1.1", "google.com"]}, + "metrics-history": {"start": 0, "end": 0, "step": 300}, + "list-alerts": {}, + } + ) + ) +else: + action = sys.argv[2] + if action == "get-configuration": + print(json.dumps(get_config())) + elif action == "set-hosts": + args = json.loads(sys.stdin.read()) + print(json.dumps(set_config(args))) + elif action == "metrics-history": + args = json.loads(sys.stdin.read()) + print(json.dumps(metrics_history(args))) + elif action == "list-alerts": + print(json.dumps(list_alerts())) diff --git a/packages/ns-api/files/ns.telegraf.json b/packages/ns-api/files/ns.telegraf.json new file mode 100644 index 000000000..972119685 --- /dev/null +++ b/packages/ns-api/files/ns.telegraf.json @@ -0,0 +1,13 @@ +{ + "telegraf-manager": { + "description": "Read and set telegraf ping monitor configuration", + "write": {}, + "read": { + "ubus": { + "ns.telegraf": [ + "*" + ] + } + } + } +} diff --git a/packages/ns-api/openapi.yml b/packages/ns-api/openapi.yml index c5372c489..ece5198e0 100644 --- a/packages/ns-api/openapi.yml +++ b/packages/ns-api/openapi.yml @@ -188,3 +188,80 @@ paths: example: success - $ref: "#/components/schemas/ValidationError" - $ref: 
"#/components/schemas/Error" + POST /ubus/ns.telegraf/list-alerts: + post: + summary: List current monitoring alerts + operationId: ns.telegraf.list-alerts + tags: + - telegraf + responses: + "200": + description: Current pending and firing alerts evaluated by vmalert + content: + application/json: + schema: + oneOf: + - type: object + required: + - alerts + properties: + alerts: + type: array + items: + type: object + required: + - state + - name + - labels + - annotations + - activeAt + properties: + state: + type: string + description: Alert state reported by vmalert + example: firing + name: + type: string + description: Alert name + example: BackupEncryptionDisabled + value: + type: string + description: Evaluated expression value + example: "0" + labels: + type: object + description: Alert labels emitted by vmalert + additionalProperties: + type: string + annotations: + type: object + description: Localized summaries and descriptions emitted by vmalert + additionalProperties: + type: string + activeAt: + type: string + format: date-time + description: Time when the alert became active + example: "2026-05-07T09:18:00Z" + id: + type: string + description: Alert instance identifier + rule_id: + type: string + description: Alert rule identifier + group_id: + type: string + description: Alert group identifier + expression: + type: string + description: Alert rule expression + source: + type: string + description: vmalert source URL for the alert + restored: + type: boolean + description: True when the alert is being restored + stabilizing: + type: boolean + description: True when the alert is stabilizing + - $ref: "#/components/schemas/Error" diff --git a/packages/telegraf/Makefile b/packages/telegraf/Makefile index 1f4175a2d..3129a4ff9 100644 --- a/packages/telegraf/Makefile +++ b/packages/telegraf/Makefile @@ -28,19 +28,26 @@ GO_BUILD_PKG:=github.com/influxdata/telegraf/cmd/$(PKG_NAME) 
GO_PKG_LDFLAGS_X:=github.com/influxdata/telegraf/internal.Version=$(PKG_VERSION) GO_PKG_TAGS:= \ custom \ - inputs.bond \ - inputs.cpu \ - inputs.disk \ - inputs.ethtool \ - inputs.mem \ - inputs.net \ - inputs.netstat \ - inputs.nstat \ - inputs.processes \ - inputs.sensors \ - inputs.system \ - outputs.influxdb \ - outputs.prometheus_client + inputs.bond \ + inputs.cpu \ + inputs.disk \ + inputs.ethtool \ + inputs.exec \ + inputs.file \ + inputs.http_listener_v2 \ + inputs.mem \ + inputs.net \ + inputs.netstat \ + inputs.nftables \ + inputs.nstat \ + inputs.ping \ + inputs.processes \ + inputs.sensors \ + inputs.system \ + inputs.tail \ + outputs.influxdb \ + parsers.grok \ + parsers.json_v2 include $(INCLUDE_DIR)/package.mk include $(TOPDIR)/feeds/packages/lang/golang/golang-package.mk diff --git a/packages/telegraf/files/telegraf.conf.d/ping.conf b/packages/telegraf/files/telegraf.conf.d/ping.conf new file mode 100644 index 000000000..40c7c8087 --- /dev/null +++ b/packages/telegraf/files/telegraf.conf.d/ping.conf @@ -0,0 +1,26 @@ +# Ping input plugin - monitors ICMP ping to configured hosts +# Uses native method for better performance and no external dependencies + +[[inputs.ping]] + # Hosts to send ping packets to + urls = [] + + # Method: "native" for improved compatibility and performance + # Uses privileged raw ICMP sockets (requires CAP_NET_RAW or root) + method = "native" + + # Number of ping packets to send per interval + count = 1 + + # Time to wait between sending ping packets (seconds) + ping_interval = 1.0 + + # Total ping deadline (seconds) + deadline = 10 + + # Data size for ping packets (bytes) + size = 56 + + # Tags for metric routing + [inputs.ping.tags] + influxdb_db = "ping-metrics" From 3a63cf04c2a0bb71cc31af2bf9c8dd0dd79e3b3a Mon Sep 17 00:00:00 2001 From: Giacomo Sanchietti Date: Thu, 30 Apr 2026 16:34:12 +0200 Subject: [PATCH 03/11] feat(telegraf): add missing plugins These plugins are required to replace all Netdata features --- 
packages/telegraf/Makefile | 8 +++----- .../telegraf/files/telegraf.conf.d/os.conf | 20 +++++++++++++++++++ 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/packages/telegraf/Makefile b/packages/telegraf/Makefile index 3129a4ff9..c00cd7384 100644 --- a/packages/telegraf/Makefile +++ b/packages/telegraf/Makefile @@ -29,24 +29,23 @@ GO_PKG_LDFLAGS_X:=github.com/influxdata/telegraf/internal.Version=$(PKG_VERSION) GO_PKG_TAGS:= \ custom \ inputs.bond \ + inputs.conntrack \ inputs.cpu \ inputs.disk \ + inputs.diskio \ inputs.ethtool \ inputs.exec \ inputs.file \ - inputs.http_listener_v2 \ inputs.mem \ inputs.net \ inputs.netstat \ - inputs.nftables \ inputs.nstat \ inputs.ping \ inputs.processes \ inputs.sensors \ inputs.system \ - inputs.tail \ outputs.influxdb \ - parsers.grok \ + outputs.prometheus_client \ parsers.json_v2 include $(INCLUDE_DIR)/package.mk @@ -94,7 +93,6 @@ define Package/telegraf/install $(INSTALL_BIN) ./files/telegraf-services $(1)/usr/libexec/telegraf-services $(INSTALL_BIN) ./files/telegraf-backup-encryption $(1)/usr/libexec/telegraf-backup-encryption $(INSTALL_BIN) ./files/telegraf-storage-status $(1)/usr/libexec/telegraf-storage-status - $(INSTALL_BIN) ./files/telegraf-services $(1)/usr/libexec/telegraf-services $(INSTALL_BIN) ./files/telegraf-mwan $(1)/usr/libexec/telegraf-mwan endef diff --git a/packages/telegraf/files/telegraf.conf.d/os.conf b/packages/telegraf/files/telegraf.conf.d/os.conf index 04f84000e..be6368093 100644 --- a/packages/telegraf/files/telegraf.conf.d/os.conf +++ b/packages/telegraf/files/telegraf.conf.d/os.conf @@ -58,3 +58,23 @@ dump_zeros = true [inputs.nstat.tags] influxdb_db = "os-metrics" + + +# Read metrics about disk I/O +[[inputs.diskio]] + ## By default, telegraf will gather stats for all devices. + ## Setting devices will restrict the stats to the specified devices. + # devices = ["sda", "sdb", "vd*"] + + ## Uncomment the following line if you need disk serial numbers. 
+ # skip_serial_number = false + [inputs.diskio.tags] + influxdb_db = "os-metrics" + + +# Collect metrics from the nf_conntrack kernel module +[[inputs.conntrack]] + files = ["nf_conntrack_count", "nf_conntrack_max"] + dirs = ["/proc/sys/net/netfilter", "/proc/sys/net/netfilter"] + [inputs.conntrack.tags] + influxdb_db = "os-metrics" From 14b425a6287fcb1607465d72e06ac7cd9eb2992e Mon Sep 17 00:00:00 2001 From: Giacomo Sanchietti Date: Tue, 5 May 2026 11:25:08 +0200 Subject: [PATCH 04/11] refactor(ns-plug): remove netdata Netdata has been replaced by Victoria Metrics. --- config/netdata.conf | 2 - packages/ns-api/Makefile | 3 - packages/ns-api/README.md | 33 -------- .../files/post-commit/restart-netdata.py | 14 ---- packages/ns-plug/Makefile | 1 + packages/ns-plug/README.md | 15 ++-- packages/ns-plug/files/30_ns-plug_alerts | 64 ---------------- .../99_ns-plug-netdata-cleanup.uci-default | 13 ++++ .../ns-plug/files/health_alarm_notify.conf | 75 ------------------- .../ns-plug/files/netadata_disable_alerts | 5 -- packages/ns-plug/files/netadata_enable_alerts | 5 -- packages/ns-plug/files/send-mwan-alert | 49 ------------ .../100-netdata-export-promethus.patch | 12 --- 13 files changed, 23 insertions(+), 268 deletions(-) delete mode 100644 config/netdata.conf delete mode 100755 packages/ns-api/files/post-commit/restart-netdata.py delete mode 100644 packages/ns-plug/files/30_ns-plug_alerts create mode 100644 packages/ns-plug/files/99_ns-plug-netdata-cleanup.uci-default delete mode 100644 packages/ns-plug/files/health_alarm_notify.conf delete mode 100644 packages/ns-plug/files/netadata_disable_alerts delete mode 100644 packages/ns-plug/files/netadata_enable_alerts delete mode 100644 packages/ns-plug/files/send-mwan-alert delete mode 100644 patches/feeds/packages/100-netdata-export-promethus.patch diff --git a/config/netdata.conf b/config/netdata.conf deleted file mode 100644 index e4636c07f..000000000 --- a/config/netdata.conf +++ /dev/null @@ -1,2 +0,0 @@ 
-CONFIG_PACKAGE_netdata=y -CONFIG_PACKAGE_fping=y diff --git a/packages/ns-api/Makefile b/packages/ns-api/Makefile index b4bec3a06..f9fff1a3b 100644 --- a/packages/ns-api/Makefile +++ b/packages/ns-api/Makefile @@ -120,8 +120,6 @@ define Package/ns-api/install $(INSTALL_DATA) ./files/ns.dpi.json $(1)/usr/share/rpcd/acl.d/ $(INSTALL_BIN) ./files/ns.telegraf $(1)/usr/libexec/rpcd/ $(INSTALL_DATA) ./files/ns.telegraf.json $(1)/usr/share/rpcd/acl.d/ - $(LN) ns.telegraf $(1)/usr/libexec/rpcd/ns.netdata - $(LN) ns.telegraf.json $(1)/usr/share/rpcd/acl.d/ns.netdata.json $(INSTALL_BIN) ./files/ns.storage $(1)/usr/libexec/rpcd/ $(INSTALL_DATA) ./files/ns.storage.json $(1)/usr/share/rpcd/acl.d/ $(INSTALL_BIN) ./files/ns.account $(1)/usr/libexec/rpcd/ @@ -188,7 +186,6 @@ define Package/ns-api/install $(INSTALL_CONF) ./files/config/ns-api $(1)/etc/config/ns-api $(INSTALL_CONF) ./files/config/ns-wizard $(1)/etc/config/ns-wizard $(INSTALL_CONF) ./files/templates $(1)/etc/config/ - $(INSTALL_BIN) ./files/post-commit/restart-netdata.py $(1)/usr/libexec/ns-api/post-commit/ $(INSTALL_BIN) ./files/pre-commit/fix-redirect-reflections.py $(1)/usr/libexec/ns-api/pre-commit $(INSTALL_BIN) ./files/pre-commit/update-objects.py $(1)/usr/libexec/ns-api/pre-commit $(INSTALL_BIN) ./files/post-commit/reload-ipsets.py $(1)/usr/libexec/ns-api/post-commit diff --git a/packages/ns-api/README.md b/packages/ns-api/README.md index f3b84620e..3f19bdaff 100644 --- a/packages/ns-api/README.md +++ b/packages/ns-api/README.md @@ -4625,39 +4625,6 @@ Error response example: {"error": "restart_failed"} ``` -## ns.netdata - -Configure netdata reporting daemon. 
- -### get-configuration - -Get current netdata configuration: -``` -api-cli ns.netdata get-configuration -``` - -Response example: -```json -{ - "hosts": [ - "1.2.3.4", - "google.it" - ] -} -``` - -### set-hosts - -Configure hosts to be monitored by fping: -``` -api-cli ns.netdata set-hosts --data '{"hosts": ["1.1.1.1", "google.com"]}' -``` - -Response example: -```json -{"result": "success"} -``` - ## ns.factoryreset ### reset diff --git a/packages/ns-api/files/post-commit/restart-netdata.py b/packages/ns-api/files/post-commit/restart-netdata.py deleted file mode 100755 index ae6a7017b..000000000 --- a/packages/ns-api/files/post-commit/restart-netdata.py +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/python - -# -# Copyright (C) 2024 Nethesis S.r.l. -# SPDX-License-Identifier: GPL-2.0-only -# - -# This script restarts netdata is a WAN has changed to update the multiwan chart. - -import subprocess - -# The changes variable is already within the scope from the caller -if 'mwan3' in changes: - subprocess.run(["/etc/init.d/netdata", "restart"]) diff --git a/packages/ns-plug/Makefile b/packages/ns-plug/Makefile index 5b602cb44..020a61948 100644 --- a/packages/ns-plug/Makefile +++ b/packages/ns-plug/Makefile @@ -104,6 +104,7 @@ define Package/ns-plug/install $(INSTALL_CONF) files/ns-plug.keep $(1)/lib/upgrade/keep.d/ns-plug $(INSTALL_BIN) ./files/mwan-hooks $(1)/usr/libexec/ns-plug $(INSTALL_BIN) ./files/ns-plug-rsyslog-fixup.uci-default $(1)/etc/uci-defaults/rsyslog-fixup + $(INSTALL_BIN) ./files/99_ns-plug-netdata-cleanup.uci-default $(1)/etc/uci-defaults/99_ns-plug-netdata-cleanup endef $(eval $(call BuildPackage,ns-plug)) diff --git a/packages/ns-plug/README.md b/packages/ns-plug/README.md index 670169c90..eebc96abb 100644 --- a/packages/ns-plug/README.md +++ b/packages/ns-plug/README.md @@ -129,13 +129,12 @@ remote-backup download $(remote-backup list | jq -r .[0].file) - | gpg --batch - ## Alerts -All system alerts, except MultiWAN ones, are handled by netdata, 
including those from the multiwan monitoring. -Alerts are disabled by default and enabled only if the machine has a valid subscription. -In this case, alerts are automatically sent to the remote server (either my.nethesis.it or my.nethserver.com) using a -custom sender (`/etc/netdata/health_alarm_notify.conf`). -Alerts are also logged to `/var/log/messages` and are visible within the netdata UI. +System alerts are handled by **vmalert** (Victoria Metrics alert evaluation engine) which evaluates +alert rules against metrics collected by telegraf. -Only the following alerts are sent to the remote system: +When an alert fires or resolves, vmalert sends an Alertmanager-format webhook to `ns-plug-alert-proxy` +running on `127.0.0.1:9095`. The proxy forwards the following alerts to the registered monitoring +portal (my.nethesis.it or my.nethserver.com): | Alert | Condition | Legacy alert_id | |---|---|---| @@ -151,3 +150,7 @@ The proxy starts automatically at boot regardless of registration state. Firing/resolved state is determined from the Alertmanager-standard `endsAt` field: if `endsAt` is in the future (or zero/missing) a **FAILURE** is sent; if `endsAt` is in the past an **OK** is sent. +A FAILURE is sent when the alert starts firing and an OK is sent when it resolves. + +If Mimir credentials are configured in ns-plug UCI (`my_url`, `my_system_key`, `my_system_secret`), +vmalert also forwards all alerts to the Mimir alertmanager for cloud-side processing. diff --git a/packages/ns-plug/files/30_ns-plug_alerts b/packages/ns-plug/files/30_ns-plug_alerts deleted file mode 100644 index dfb62a848..000000000 --- a/packages/ns-plug/files/30_ns-plug_alerts +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/sh - -# Custom disk alerts -disks_f="/etc/netdata/health.d/disks.conf" -if [ ! 
-f "$disks_f" ]; then - cat << EOF > "$disks_f" -template: disk_space_usage - on: disk.space - class: Utilization - type: System -component: Disk - os: linux freebsd - hosts: * - families: !/dev !/dev/* !/run !/run/* !/overlay * - calc: \$used * 100 / (\$avail + \$used) - units: % - every: 1m - warn: \$this > ((\$status >= \$WARNING ) ? (80) : (90)) - crit: \$this > ((\$status == \$CRITICAL) ? (90) : (98)) - delay: up 1m down 15m multiplier 1.5 max 1h - info: disk $family space utilization - to: sysadmin -EOF -fi - -# Disable unwanted alerts -files="cpu disks entropy ipc load memory net netfilter processes ram softnet tcp_conn tcp_listen tcp_mem tcp_orphans tcp_resets timex udp_errors" -for f in $files -do - file="/etc/netdata/health.d/${f}.conf" - if [ ! -f $file ]; then - > $file - fi -done - -# Enable mwan chart -sed -i 's/python.d = no/python.d = yes/' /etc/netdata/netdata.conf -python_f="/etc/netdata/python.d.conf" -if [ ! -f "$python_f" ]; then - cat << EOF > "$python_f" -enabled: yes -gc_run: yes -gc_interval: 300 -apache_cache: no -chrony: no -example: no -go_expvar: no -gunicorn_log: no -hpssa: no -logind: no -nginx_log: no -EOF -fi - -# Create mwan alert -cat << EOF > /etc/netdata/health.d/mwan.conf -template: wan_status - on: mwan.score -lookup: min -1m foreach * - every: 1m - warn: \$this < 5 - crit: \$this <= 1 - info: The score of the WAN, 0 means down -EOF diff --git a/packages/ns-plug/files/99_ns-plug-netdata-cleanup.uci-default b/packages/ns-plug/files/99_ns-plug-netdata-cleanup.uci-default new file mode 100644 index 000000000..df1a859c5 --- /dev/null +++ b/packages/ns-plug/files/99_ns-plug-netdata-cleanup.uci-default @@ -0,0 +1,13 @@ +#!/bin/sh +# +# Copyright (C) 2026 Nethesis S.r.l. +# SPDX-License-Identifier: GPL-2.0-only +# + +# Remove bundled netdata configuration files on first boot. +rm -rf /etc/netdata 2>/dev/null || true + +# Remove the legacy backup-encryption alert cron entry if it is still present. 
+if crontab -l 2>/dev/null | grep -q '/usr/libexec/backup-encryption-alert'; then + crontab -l 2>/dev/null | grep -v '/usr/libexec/backup-encryption-alert' | sort | uniq | crontab - +fi diff --git a/packages/ns-plug/files/health_alarm_notify.conf b/packages/ns-plug/files/health_alarm_notify.conf deleted file mode 100644 index 00852cfa9..000000000 --- a/packages/ns-plug/files/health_alarm_notify.conf +++ /dev/null @@ -1,75 +0,0 @@ -# Configuration for alarm notifications - -SEND_EMAIL="NO" -SEND_DYNATRACE="NO" -SEND_STACKPULSE="NO" -SEND_OPSGENIE="NO" -SEND_HANGOUTS="NO" -SEND_PUSHOVER="NO" -SEND_PUSHBULLET="NO" -SEND_TWILIO="NO" -SEND_MESSAGEBIRD="NO" -SEND_KAVENEGAR="NO" -SEND_TELEGRAM="NO" -SEND_SLACK="NO" -SEND_MSTEAMS="NO" -SEND_ROCKETCHAT="NO" -SEND_ALERTA="NO" -SEND_FLOCK="NO" -SEND_DISCORD="NO" -SEND_HIPCHAT="NO" -SEND_KAFKA="NO" -SEND_PD="NO" -SEND_FLEEP="NO" -SEND_IRC="NO" -SEND_SYSLOG="NO" -SEND_PROWL="NO" -SEND_AWSSNS="NO" -SEND_SMS="NO" -SEND_MATRIX="NO" - -# Enable only syslog and custom notification -use_fqdn='YES' -SEND_SYSLOG="YES" -SYSLOG_FACILITY='' -DEFAULT_RECIPIENT_SYSLOG="sysadmin" -SEND_CUSTOM="YES" -DEFAULT_RECIPIENT_CUSTOM="sysadmin" - -# Always generate clear events -clear_alarm_always='YES' - -# Send alerts to my.nethesis.it or my.nethserver.com -custom_sender() { - lk=$(uci -q get ns-plug.config.system_id) - secret=$(uci -q get ns-plug.config.secret) - url=$(uci -q get ns-plug.config.alerts_url)"alerts/store" - alert_id=${name} - if [ "${status}" == "CRITICAL" ]; then - status="FAILURE" - elif [ "${status}" == "CLEAR" ]; then - status="OK" - fi - - # map to old alerts, when possible - if [ "${chart}" == "disk_space._overlay" ] || [ "${chart}" == "disk_space._" ]; then - alert_id="df:root:percent_bytes:free" - elif [ "${chart}" == "disk_space._boot" ]; then - alert_id="df:boot:percent_bytes:free" - else - alert_id="${name}:${chart}" - fi - payload='{"lk": "'$lk'", "alert_id": "'$alert_id'", "status": "'$status'"}' - - # send only if the 
machine is registered - if [ -z "${lk}" ] || [ -z "${secret}" ]; then - return - fi - - # send to remote server - if [ "${status}" == "FAILURE" ] || [ "${status}" == "OK" ]; then - /usr/bin/curl -m 180 --retry 3 -L -s \ - --header "Authorization: token ${secret}" --header "Content-Type: application/json" --header "Accept: application/json" \ - --data-raw "${payload}" ${url} - fi -} diff --git a/packages/ns-plug/files/netadata_disable_alerts b/packages/ns-plug/files/netadata_disable_alerts deleted file mode 100644 index b21473515..000000000 --- a/packages/ns-plug/files/netadata_disable_alerts +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh - -# Disable netdata alerts -sed -i 's/enabled = yes/enabled = no/' /etc/netdata/netdata.conf -/etc/init.d/netdata restart diff --git a/packages/ns-plug/files/netadata_enable_alerts b/packages/ns-plug/files/netadata_enable_alerts deleted file mode 100644 index cf066e58a..000000000 --- a/packages/ns-plug/files/netadata_enable_alerts +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh - -# Enable netdata alerts -sed -i 's/enabled = no/enabled = yes/' /etc/netdata/netdata.conf -/etc/init.d/netdata restart diff --git a/packages/ns-plug/files/send-mwan-alert b/packages/ns-plug/files/send-mwan-alert deleted file mode 100644 index 1a74ec96c..000000000 --- a/packages/ns-plug/files/send-mwan-alert +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash -# -# Copyright (C) 2024 Nethesis S.r.l. 
-# SPDX-License-Identifier: GPL-2.0-only -# - -# Send WAN alert to monitoring portal - -lk=$(uci -q get ns-plug.config.system_id) -secret=$(uci -q get ns-plug.config.secret) -url=$(uci -q get ns-plug.config.alerts_url)"alerts/store" -pidfile="/tmp/mwan3.$INTERFACE" - -# Do not send alert if system_id or secret is not set -if [ -z "$lk" ] || [ -z "$secret" ]; then - exit 0 -fi - -# Ignore ifup and ifdown events, they both triggers connected and disconnected events -if [ "${ACTION}" == "connected" ]; then - pid=$(cat "$pidfile" 2>/dev/null) - # If a wan is connected within 30 seconds from disconnect, assume it's a restart - # and kill the alert sending process - # mwan3 restart should complete within 30 seconds - if [ -n "$pid" ]; then - kill -s SIGHUP "$pid" - rm "$pidfile" - exit 0 - fi - status="OK" -elif [ "${ACTION}" == "disconnected" ]; then - echo $$ > "$pidfile" - # Delay alert by 30 seconds, so that it can be canceled - sleep 30 - rm "$pidfile" - status="FAILURE" -fi - -# Exit if status is not set -if [ -z "$status" ]; then - exit 0 -fi - -alert_id="wan:${INTERFACE}:down" -logger -t mwan3-alert "Sending alert ${alert_id} with status ${status}" -payload='{"lk": "'$lk'", "alert_id": "'$alert_id'", "status": "'$status'"}' -/usr/bin/curl -m 30 --retry 3 -L -s \ - --header "Authorization: token ${secret}" --header "Content-Type: application/json" --header "Accept: application/json" \ - --data-raw "${payload}" ${url} diff --git a/patches/feeds/packages/100-netdata-export-promethus.patch b/patches/feeds/packages/100-netdata-export-promethus.patch deleted file mode 100644 index 4db53cf7c..000000000 --- a/patches/feeds/packages/100-netdata-export-promethus.patch +++ /dev/null @@ -1,12 +0,0 @@ -diff --git a/admin/netdata/Makefile b/admin/netdata/Makefile -index e471f27b2..def44a874 100644 ---- a/admin/netdata/Makefile -+++ b/admin/netdata/Makefile -@@ -62,7 +62,6 @@ CONFIGURE_ARGS += \ - --disable-plugin-freeipmi \ - --disable-plugin-cups \ - --disable-plugin-xenstat 
\ -- --disable-backend-prometheus-remote-write \ - --disable-unit-tests \ - --disable-ml \ - --disable-cloud From 0bc20e2f8087c46a3d88988e896e09d95c28ab12 Mon Sep 17 00:00:00 2001 From: Tommaso Bailetti Date: Thu, 7 May 2026 16:16:40 +0200 Subject: [PATCH 05/11] feat(telegraf): using uci conf for ping config, added migration path from netdata --- packages/ns-api/files/ns.telegraf | 77 +++++----- packages/telegraf/Makefile | 4 +- packages/telegraf/files/telegraf-config | 138 +++++++++--------- .../telegraf/files/telegraf.conf.d/ping.conf | 26 ---- packages/telegraf/files/telegraf.uci | 6 + .../uci-defaults/99-telegraf-migrate-netdata | 36 +++++ 6 files changed, 156 insertions(+), 131 deletions(-) delete mode 100644 packages/telegraf/files/telegraf.conf.d/ping.conf create mode 100644 packages/telegraf/files/uci-defaults/99-telegraf-migrate-netdata diff --git a/packages/ns-api/files/ns.telegraf b/packages/ns-api/files/ns.telegraf index 0a4dd5192..cd18a11cc 100755 --- a/packages/ns-api/files/ns.telegraf +++ b/packages/ns-api/files/ns.telegraf @@ -13,26 +13,36 @@ import json import subprocess import re import time +import ipaddress import urllib.error import urllib.parse import urllib.request from euci import EUci +from nethsec.utils import validation_error -ping_conf_file = "/etc/telegraf.conf.d/ping.conf" +def _is_valid_ip_or_hostname(host): + """Validate if a string is a valid IPv4, IPv6, or hostname.""" + if not isinstance(host, str) or not host.strip(): + return False -def _read_ping_hosts(): - """Read the list of monitored ping hosts from the telegraf config file.""" - if not os.path.exists(ping_conf_file): - return [] + # Try to parse as IP address (IPv4 or IPv6) try: - with open(ping_conf_file) as fp: - match = re.search(r"urls\s*=\s*(\[[^\]]*\])", fp.read()) - if match: - return json.loads(match.group(1)) - except Exception: + ipaddress.ip_address(host) + return True + except ValueError: pass - return [] + + # Hostname pattern (alphanumeric, dots, hyphens) + 
hostname_pattern = r'^([a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?)(\.([a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?))*$' + return bool(re.match(hostname_pattern, host)) + + +def _read_ping_hosts(): + """Read the list of monitored ping hosts from UCI config.""" + e_uci = EUci() + pings = e_uci.get('telegraf', 'internet', 'pings', dtype=str, list=True, default=[]) + return pings def get_config(): @@ -40,33 +50,24 @@ def get_config(): def set_config(config): - try: - # Ensure directory exists - os.makedirs(os.path.dirname(ping_conf_file), exist_ok=True) - - # Create the telegraf ping configuration - with open(ping_conf_file, "w") as fp: - fp.write("# Ping input plugin configuration\n") - fp.write("[[inputs.ping]]\n") - if len(config["hosts"]) > 0: - # Format hosts as TOML array - hosts_str = json.dumps(config["hosts"]) - fp.write(f" urls = {hosts_str}\n") - fp.write(' method = "native"\n') - fp.write(" count = 5\n") - fp.write(" ping_interval = 1.0\n") - fp.write(" deadline = 10\n") - fp.write(" [inputs.ping.tags]\n") - fp.write(' influxdb_db = "ping-metrics"\n') - else: - # Write empty config to disable - fp.write(" urls = []\n") - - # Restart telegraf service - subprocess.run(["/etc/init.d/telegraf", "restart"], check=True) - return {"success": True} - except Exception as e: - return {"success": False, "error": str(e)} + if 'hosts' not in config: + return validation_error('hosts', 'required') + + hosts = config.get('hosts') + if not isinstance(hosts, list): + return validation_error('hosts', 'invalid') + + # Validate each host with per-index error reporting + for idx, host in enumerate(hosts): + if not _is_valid_ip_or_hostname(host): + return validation_error(f'hosts.{idx}', 'invalid', value=host) + + e_uci = EUci() + current = set(e_uci.get('telegraf', 'internet', 'pings', dtype=str, list=True, default=[])) + if current != set(hosts): + e_uci.set('telegraf', 'internet', 'pings', hosts) + e_uci.save('telegraf') + return {"success": True} VM_URL = 
"http://127.0.0.1:8428/api/v1/query_range" diff --git a/packages/telegraf/Makefile b/packages/telegraf/Makefile index c00cd7384..b2e7ffa91 100644 --- a/packages/telegraf/Makefile +++ b/packages/telegraf/Makefile @@ -34,6 +34,7 @@ GO_PKG_TAGS:= \ inputs.disk \ inputs.diskio \ inputs.ethtool \ + inputs.dns_query \ inputs.exec \ inputs.file \ inputs.mem \ @@ -86,7 +87,8 @@ define Package/telegraf/install $(INSTALL_DATA) ./files/telegraf.conf.d/storage.conf $(1)/etc/telegraf.conf.d/storage.conf $(INSTALL_DATA) ./files/telegraf.conf.d/services.conf $(1)/etc/telegraf.conf.d/services.conf $(INSTALL_DATA) ./files/telegraf.conf.d/mwan.conf $(1)/etc/telegraf.conf.d/mwan.conf - $(INSTALL_DATA) ./files/telegraf.conf.d/ping.conf $(1)/etc/telegraf.conf.d/ping.conf + $(INSTALL_DIR) $(1)/etc/uci-defaults + $(INSTALL_BIN) ./files/uci-defaults/99-telegraf-migrate-netdata $(1)/etc/uci-defaults/99-telegraf-migrate-netdata $(INSTALL_DIR) $(1)/usr/sbin $(INSTALL_BIN) ./files/telegraf-config $(1)/usr/sbin/telegraf-config $(INSTALL_DIR) $(1)/usr/libexec diff --git a/packages/telegraf/files/telegraf-config b/packages/telegraf/files/telegraf-config index 235476d86..7cc2b0d0c 100644 --- a/packages/telegraf/files/telegraf-config +++ b/packages/telegraf/files/telegraf-config @@ -10,87 +10,45 @@ import glob from jinja2 import Environment, BaseLoader from euci import EUci -PROMETHEUS_TEMPLATE = """# This file is automatically generated by /usr/sbin/telegraf-config. -# Do not edit manually — changes will be overwritten. - -# Prometheus client output +PROMETHEUS_TEMPLATE = """# Prometheus client output {% if enabled == '1' -%} [[outputs.prometheus_client]] - ## Address to listen on. 
- ## ex: - ## listen = ":9273" - ## listen = "vsock://:9273" listen = "{{ listen_addr }}" - - ## Maximum duration before timing out read of the request - # read_timeout = "10s" - ## Maximum duration before timing out write of the response - # write_timeout = "10s" - - ## Metric version controls the mapping from Prometheus metrics into Telegraf metrics. - ## See "Metric Format Configuration" in plugins/inputs/prometheus/README.md for details. - ## Valid options: 1, 2 - # metric_version = 1 - - ## Use HTTP Basic Authentication. {%- if basic_auth_username and basic_auth_password %} basic_username = "{{ basic_auth_username }}" basic_password = "{{ basic_auth_password }}" -{%- else %} - # basic_username = "Foo" - # basic_password = "Bar" {%- endif %} +{%- endif %} +""" - ## If set, the IP Ranges which are allowed to access metrics. - ## ex: ip_range = ["192.168.0.0/24", "192.168.1.0/30"] - # ip_range = [] - - ## Path to publish the metrics on. - # path = "/metrics" - - ## Expiration interval for each metric. 0 == no expiration - # expiration_interval = "60s" - - ## Collectors to enable, valid entries are "gocollector" and "process". - ## If unset, both are enabled. - # collectors_exclude = ["gocollector", "process"] - - ## Send string metrics as Prometheus labels. - ## Unless set to false all string metrics will be sent as labels. - # string_as_label = true - - ## If set, enable TLS with the given certificate. 
- # tls_cert = "/etc/ssl/telegraf.crt" - # tls_key = "/etc/ssl/telegraf.key" - - ## Set one or more allowed client CA certificate file names to - ## enable mutually authenticated TLS connections - # tls_allowed_cacerts = ["/etc/telegraf/clientca.pem"] +PING_TEMPLATE = """# Ping input plugin - monitors ICMP ping to configured hosts +{% if pings -%} +[[inputs.ping]] + urls = [{{ pings|map('tojson')|join(', ') }}] + method = "native" + count = 1 + ping_interval = 1.0 + deadline = 10 + + [inputs.ping.tags] + influxdb_db = "ping-metrics" +{%- endif %} +""" - ## Export metric collection time. - # export_timestamp = false +DNS_TEMPLATE = """# DNS Query input plugin - monitors DNS resolution +{% if dns_domains -%} +[[inputs.dns_query]] + domains = [{{ dns_domains|map('tojson')|join(', ') }}] + servers = ["127.0.0.1"] - ## Specify the metric type explicitly. - ## This overrides the metric-type of the Telegraf metric. Globbing is allowed. - # [outputs.prometheus_client.metric_types] - # counter = [] - # gauge = [] + [inputs.dns_query.tags] + influxdb_db = "ping-metrics" {%- endif %} """ -SENSORS_TEMPLATE = """# This file is automatically generated by /usr/sbin/telegraf-config. -# Do not edit manually — changes will be overwritten. - -# Monitor sensors, requires lm-sensors package -# This plugin ONLY supports Linux +SENSORS_TEMPLATE = """# Monitor sensors (requires lm-sensors package) {% if sensors_available -%} [[inputs.sensors]] - ## Remove numbers from field names. - ## If true, a field name like 'temp1_input' will be changed to 'temp_input'. - # remove_numbers = true - - ## Timeout is the maximum amount of time that the sensors command can run. 
- # timeout = "5s" [inputs.sensors.tags] influxdb_db = "os-metrics" {%- endif %} @@ -132,6 +90,52 @@ def generate_prometheus_config(): return False +def generate_ping_config(): + """Read UCI config and render ping input section.""" + e_uci = EUci() + try: + pings = e_uci.get('telegraf', 'internet', 'pings', dtype=str, list=True, default=[]) + except Exception: + pings = [] + + template = Environment(loader=BaseLoader()).from_string(PING_TEMPLATE) + rendered = template.render(pings=pings) + + config_file = '/etc/telegraf.conf.d/ping.conf' + os.makedirs(os.path.dirname(config_file), exist_ok=True) + + try: + with open(config_file, 'w') as f: + f.write(rendered) + return True + except Exception as e: + print(f"Error writing config file: {e}") + return False + + +def generate_dns_config(): + """Read UCI config and render DNS query input section.""" + e_uci = EUci() + try: + dns_domains = e_uci.get('telegraf', 'internet', 'dns', dtype=str, list=True, default=[]) + except Exception: + dns_domains = [] + + template = Environment(loader=BaseLoader()).from_string(DNS_TEMPLATE) + rendered = template.render(dns_domains=dns_domains) + + config_file = '/etc/telegraf.conf.d/dns.conf' + os.makedirs(os.path.dirname(config_file), exist_ok=True) + + try: + with open(config_file, 'w') as f: + f.write(rendered) + return True + except Exception as e: + print(f"Error writing config file: {e}") + return False + + def sensors_available(): """Check if sensors command works by running it with -A -u flags.""" try: @@ -165,4 +169,6 @@ def generate_sensors_config(): if __name__ == '__main__': generate_prometheus_config() generate_sensors_config() + generate_ping_config() + generate_dns_config() exit(0) diff --git a/packages/telegraf/files/telegraf.conf.d/ping.conf b/packages/telegraf/files/telegraf.conf.d/ping.conf deleted file mode 100644 index 40c7c8087..000000000 --- a/packages/telegraf/files/telegraf.conf.d/ping.conf +++ /dev/null @@ -1,26 +0,0 @@ -# Ping input plugin - monitors ICMP ping 
to configured hosts -# Uses native method for better performance and no external dependencies - -[[inputs.ping]] - # Hosts to send ping packets to - urls = [] - - # Method: "native" for improved compatibility and performance - # Uses privileged raw ICMP sockets (requires CAP_NET_RAW or root) - method = "native" - - # Number of ping packets to send per interval - count = 1 - - # Time to wait between sending ping packets (seconds) - ping_interval = 1.0 - - # Total ping deadline (seconds) - deadline = 10 - - # Data size for ping packets (bytes) - size = 56 - - # Tags for metric routing - [inputs.ping.tags] - influxdb_db = "ping-metrics" diff --git a/packages/telegraf/files/telegraf.uci b/packages/telegraf/files/telegraf.uci index d1225d452..b7073f50d 100644 --- a/packages/telegraf/files/telegraf.uci +++ b/packages/telegraf/files/telegraf.uci @@ -3,3 +3,9 @@ config output_prometheus 'output_prometheus' option listen_addr ':9273' option basic_auth_username '' option basic_auth_password '' + +config internet 'internet' + list pings '8.8.8.8' + list pings '1.1.1.1' + list dns 'google.com' + list dns 'cloudflare.com' diff --git a/packages/telegraf/files/uci-defaults/99-telegraf-migrate-netdata b/packages/telegraf/files/uci-defaults/99-telegraf-migrate-netdata new file mode 100644 index 000000000..5fa351f41 --- /dev/null +++ b/packages/telegraf/files/uci-defaults/99-telegraf-migrate-netdata @@ -0,0 +1,36 @@ +#!/bin/sh +# +# Copyright (C) 2026 Nethesis S.r.l. +# SPDX-License-Identifier: GPL-2.0-only +# + +# Migrate fping hosts from the legacy netdata configuration to the telegraf +# UCI config (telegraf.internet.pings). 
+# +# The old fping.conf format is a single line: +# hosts="1.1.1.1 8.8.8.8 google.com" + +FPING_CONF="/etc/netdata/fping.conf" + +[ -f "$FPING_CONF" ] || exit 0 + +# Extract the hosts string: strip 'hosts="..."' +hosts_line=$(grep '^hosts=' "$FPING_CONF" | head -n1) +[ -n "$hosts_line" ] || exit 0 + +# Strip key and surrounding quotes +hosts_val="${hosts_line#hosts=}" +hosts_val="${hosts_val#\"}" +hosts_val="${hosts_val%\"}" +[ -n "$hosts_val" ] || exit 0 + +# Add each host to telegraf.internet.pings if not already present +uci -q get telegraf.internet > /dev/null || uci set telegraf.internet='internet' + +for host in $hosts_val; do + # Skip if already in the list + uci -q get telegraf.internet.pings | grep -qw "$host" && continue + uci add_list telegraf.internet.pings="$host" +done + +uci commit telegraf From 41daaa9b66f663624fbb2c25703ecf3875bc463e Mon Sep 17 00:00:00 2001 From: Tommaso Bailetti Date: Thu, 7 May 2026 16:21:45 +0200 Subject: [PATCH 06/11] chore(ns-ui): version bump --- packages/ns-ui/Makefile | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/packages/ns-ui/Makefile b/packages/ns-ui/Makefile index 171b73bde..aba307639 100644 --- a/packages/ns-ui/Makefile +++ b/packages/ns-ui/Makefile @@ -10,10 +10,12 @@ PKG_NAME:=ns-ui PKG_VERSION:=2.20.1 PKG_RELEASE:=1 -PKG_SOURCE:=nethsecurity-ui-$(PKG_VERSION).tar.gz -PKG_BUILD_DIR=$(BUILD_DIR)/nethsecurity-ui-$(PKG_VERSION) -PKG_SOURCE_URL:=https://codeload.github.com/nethserver/nethsecurity-ui/tar.gz/$(PKG_VERSION)? 
-PKG_HASH:=skip +PKG_SOURCE_PROTO:=git +PKG_SOURCE_URL:=https://github.com/NethServer/nethsecurity-ui.git +PKG_SOURCE_VERSION:=efba51830602aa2c879ec41081208b7a33f93351 +PKG_SOURCE_SUBDIR:=nethsecurity-ui-$(PKG_SOURCE_VERSION) +PKG_BUILD_DIR:=$(BUILD_DIR)/$(PKG_SOURCE_SUBDIR) +PKG_MIRROR_HASH:=skip PKG_MAINTAINER:=Giacomo Sanchietti PKG_LICENSE:=GPL-3.0-only From f7e670a94ba8cd794d47957b5235d84cd0a179b5 Mon Sep 17 00:00:00 2001 From: Tommaso Bailetti Date: Thu, 7 May 2026 16:58:04 +0200 Subject: [PATCH 07/11] fix(telegraf): moving pruning of netdata config to telegraf migration --- packages/ns-plug/files/99_ns-plug-netdata-cleanup.uci-default | 3 --- .../telegraf/files/uci-defaults/99-telegraf-migrate-netdata | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/ns-plug/files/99_ns-plug-netdata-cleanup.uci-default b/packages/ns-plug/files/99_ns-plug-netdata-cleanup.uci-default index df1a859c5..e76fe4356 100644 --- a/packages/ns-plug/files/99_ns-plug-netdata-cleanup.uci-default +++ b/packages/ns-plug/files/99_ns-plug-netdata-cleanup.uci-default @@ -4,9 +4,6 @@ # SPDX-License-Identifier: GPL-2.0-only # -# Remove bundled netdata configuration files on first boot. -rm -rf /etc/netdata 2>/dev/null || true - # Remove the legacy backup-encryption alert cron entry if it is still present. if crontab -l 2>/dev/null | grep -q '/usr/libexec/backup-encryption-alert'; then crontab -l 2>/dev/null | grep -v '/usr/libexec/backup-encryption-alert' | sort | uniq | crontab - diff --git a/packages/telegraf/files/uci-defaults/99-telegraf-migrate-netdata b/packages/telegraf/files/uci-defaults/99-telegraf-migrate-netdata index 5fa351f41..2d5921b82 100644 --- a/packages/telegraf/files/uci-defaults/99-telegraf-migrate-netdata +++ b/packages/telegraf/files/uci-defaults/99-telegraf-migrate-netdata @@ -34,3 +34,6 @@ for host in $hosts_val; do done uci commit telegraf + +# Remove bundled netdata configuration files since they are no longer used. 
+rm -rf /etc/netdata 2>/dev/null || true From 399125e90aa1b613de3bc8f4e7b19b8dfc114101 Mon Sep 17 00:00:00 2001 From: Tommaso Bailetti Date: Wed, 13 May 2026 14:27:29 +0200 Subject: [PATCH 08/11] fix(ns-api): using uci config for pings --- packages/ns-api/files/ns.report | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/packages/ns-api/files/ns.report b/packages/ns-api/files/ns.report index e188c25a3..47dc1c315 100755 --- a/packages/ns-api/files/ns.report +++ b/packages/ns-api/files/ns.report @@ -325,25 +325,6 @@ def ovpnrw_bytes_by_hour_and_user(instance, day, user): return {"hours": hours_bytes} -def get_ping_hosts(): - # read ping hosts from telegraf configuration - ping_conf_file = "/etc/telegraf.conf.d/ping.conf" - hosts = [] - if os.path.exists(ping_conf_file): - try: - with open(ping_conf_file, 'r') as fp: - content = fp.read() - # Find the urls line in TOML format: urls = ["host1", "host2"] - match = re.search(r'urls\s*=\s*(\[[^\]]*\])', content) - if match: - urls_str = match.group(1) - # Parse JSON array - hosts = json.loads(urls_str) - except Exception: - pass - return hosts - - def get_victoria_metrics_ping_data(host): """ Query Victoria Metrics for ping metrics. 
@@ -384,7 +365,8 @@ def get_victoria_metrics_ping_data(host): def latency_and_quality_report(): - hosts = get_ping_hosts() + e_uci = EUci() + hosts = e_uci.get('telegraf', 'internet', 'pings', dtype=str, list=True, default=[]) ret = {} for host in hosts: ret[host] = get_victoria_metrics_ping_data(host) From 68a3d7dae5d71d17a7a6551c2bbc4943494f4cc7 Mon Sep 17 00:00:00 2001 From: Tommaso Bailetti Date: Wed, 13 May 2026 14:35:35 +0200 Subject: [PATCH 09/11] fix(ns-plug): added service triggers for alert-proxy --- packages/ns-plug/files/ns-plug-alert-proxy.init | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/packages/ns-plug/files/ns-plug-alert-proxy.init b/packages/ns-plug/files/ns-plug-alert-proxy.init index 31b5fa430..38ce01012 100644 --- a/packages/ns-plug/files/ns-plug-alert-proxy.init +++ b/packages/ns-plug/files/ns-plug-alert-proxy.init @@ -17,3 +17,14 @@ start_service() { procd_set_param respawn 3600 5 0 procd_close_instance } + +service_triggers() +{ + procd_add_reload_trigger "ns-plug" +} + +reload_service() +{ + stop + start +} From 5f02004e17513e2c5c55d80ec1ebcca1668c886d Mon Sep 17 00:00:00 2001 From: Tommaso Bailetti Date: Fri, 15 May 2026 11:42:31 +0200 Subject: [PATCH 10/11] feat(victoria-metrics): added storage configuration --- packages/victoria-metrics/README.md | 10 +++---- .../files/victoria-metrics.conf | 2 -- .../files/victoria-metrics.initd | 27 +++++++++++++++++-- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/packages/victoria-metrics/README.md b/packages/victoria-metrics/README.md index d878bf4f1..cd7c11728 100644 --- a/packages/victoria-metrics/README.md +++ b/packages/victoria-metrics/README.md @@ -36,16 +36,16 @@ Configuration is located at `/etc/config/victoria-metrics`: ``` config victoriametrics 'main' - option storage_path '/var/lib/victoriametrics' - option retention_period '1y' option http_listen_addr '127.0.0.1:8428' ``` -**Options:** -- `storage_path`: Where to store metrics data -- 
`retention_period`: How long to keep metrics (`1d`, `7d`, `30d`, `1y`, etc.) +**Required options:** - `http_listen_addr`: Address and port for the HTTP server +**Optional options:** +- `storage_path`: Where to store metrics data (default: `<ns_data mount target>/victoria-metrics-data` when an `ns_data` mount target is set in the fstab UCI config, otherwise `/var/lib/victoria-metrics-data`) +- `retention_period`: How long to keep metrics (`1d`, `7d`, `30d`, `1y`, etc.) (default: `1y` when an `ns_data` mount target is set in the fstab UCI config, otherwise `7d`) + ### Accessing the Web UI By default the server is accessible only on localhost for security. diff --git a/packages/victoria-metrics/files/victoria-metrics.conf b/packages/victoria-metrics/files/victoria-metrics.conf index 5b6813a89..bf3541fe8 100644 --- a/packages/victoria-metrics/files/victoria-metrics.conf +++ b/packages/victoria-metrics/files/victoria-metrics.conf @@ -1,4 +1,2 @@ config victoriametrics 'main' - option storage_path '/var/lib/victoriametrics' - option retention_period '1d' option http_listen_addr '127.0.0.1:8428' diff --git a/packages/victoria-metrics/files/victoria-metrics.initd b/packages/victoria-metrics/files/victoria-metrics.initd index fd2cee851..148804cab 100644 --- a/packages/victoria-metrics/files/victoria-metrics.initd +++ b/packages/victoria-metrics/files/victoria-metrics.initd @@ -15,10 +15,33 @@ PROG="/usr/bin/victoria-metrics" start_service() { config_load victoria-metrics local storage_path retention_period http_listen_addr - config_get storage_path main storage_path /var/lib/victoriametrics - config_get retention_period main retention_period 1d + config_get storage_path main storage_path + config_get retention_period main retention_period config_get http_listen_addr main http_listen_addr 127.0.0.1:8428 + # Detect if external storage is mounted + local disk_mount + config_load fstab + config_get disk_mount ns_data target + + # Auto-detect storage_path if not customized + if [ -z "$storage_path" ]; then + if [ -n "$disk_mount" ]; then + storage_path="$disk_mount/victoria-metrics-data" + else
+ storage_path="/var/lib/victoria-metrics-data" + fi + fi + + # Set retention_period default based on storage availability + if [ -z "$retention_period" ]; then + if [ -n "$disk_mount" ]; then + retention_period="1y" + else + retention_period="7d" + fi + fi + procd_open_instance procd_set_param stdout 1 procd_set_param stderr 1 From 409d068ceeaf7a5d22e64f1f8a3b4d803eac5ec7 Mon Sep 17 00:00:00 2001 From: Tommaso Bailetti Date: Fri, 15 May 2026 11:42:49 +0200 Subject: [PATCH 11/11] chore(ns-ui): version bump --- packages/ns-ui/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/ns-ui/Makefile b/packages/ns-ui/Makefile index aba307639..ee852564b 100644 --- a/packages/ns-ui/Makefile +++ b/packages/ns-ui/Makefile @@ -12,7 +12,7 @@ PKG_RELEASE:=1 PKG_SOURCE_PROTO:=git PKG_SOURCE_URL:=https://github.com/NethServer/nethsecurity-ui.git -PKG_SOURCE_VERSION:=efba51830602aa2c879ec41081208b7a33f93351 +PKG_SOURCE_VERSION:=6c362f1eb30c8ef6b034d4127885d928e6640553 PKG_SOURCE_SUBDIR:=nethsecurity-ui-$(PKG_SOURCE_VERSION) PKG_BUILD_DIR:=$(BUILD_DIR)/$(PKG_SOURCE_SUBDIR) PKG_MIRROR_HASH:=skip