Skip to content

Commit 56b5e4a

Browse files
committed
Introduce telemetry for observability
This introduces the foundational telemetry infrastructure to improve the observability of LDK Server. It adds a new `/metrics` endpoint on the REST service address that serves Prometheus-compatible metrics. The endpoint is public and does not require HMAC authentication, which allows easy integration with monitoring systems.
- Added the `prometheus` dependency and a `Metrics` utility struct.
- Introduced a basic `ldk_health_score` gauge (0-100) that reflects the node's operational status based on peer connectivity, wallet sync state, and whether the node is running.
This is the first step in a larger effort to provide comprehensive telemetry. Future updates will expand it to include metrics for channels, balances, payments, and other critical node activities.
1 parent 2766533 commit 56b5e4a

8 files changed

Lines changed: 251 additions & 9 deletions

File tree

Cargo.lock

Lines changed: 63 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ldk-server-protos/src/endpoints.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,4 @@ pub const LIST_FORWARDED_PAYMENTS_PATH: &str = "ListForwardedPayments";
2626
pub const UPDATE_CHANNEL_CONFIG_PATH: &str = "UpdateChannelConfig";
2727
pub const GET_PAYMENT_DETAILS_PATH: &str = "GetPaymentDetails";
2828
pub const CONNECT_PEER_PATH: &str = "ConnectPeer";
29+
/// Path (without the leading `/`) of the Prometheus metrics endpoint.
pub const GET_METRICS_PATH: &str = "metrics";

ldk-server/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ toml = { version = "0.8.9", default-features = false, features = ["parse"] }
2323
chrono = { version = "0.4", default-features = false, features = ["clock"] }
2424
log = "0.4.28"
2525
base64 = { version = "0.21", default-features = false, features = ["std"] }
26+
lazy_static = "1.5.0"
27+
prometheus = "0.14.0"
2628

2729
# Required for the RabbitMQ-based EventPublisher. Only enabled for the `events-rabbitmq` feature.
2830
lapin = { version = "2.4.0", features = ["rustls"], default-features = false, optional = true }

ldk-server/src/api/error.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,3 +131,15 @@ impl From<NodeError> for LdkServerError {
131131
LdkServerError::new(error_code, message)
132132
}
133133
}
134+
135+
impl From<prometheus::Error> for LdkServerError {
136+
fn from(e: prometheus::Error) -> Self {
137+
LdkServerError::new(LdkServerErrorCode::InternalServerError, e.to_string())
138+
}
139+
}
140+
141+
impl From<std::string::FromUtf8Error> for LdkServerError {
142+
fn from(e: std::string::FromUtf8Error) -> Self {
143+
LdkServerError::new(LdkServerErrorCode::InternalServerError, e.to_string())
144+
}
145+
}

ldk-server/src/main.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ use crate::io::persist::{
5050
use crate::service::NodeService;
5151
use crate::util::config::{load_config, ChainSource};
5252
use crate::util::logger::ServerLogger;
53+
use crate::util::metrics::{BUILD_METRICS_INTERVAL, METRICS};
5354
use crate::util::proto_adapter::{forwarded_payment_to_proto, payment_to_proto};
5455
use crate::util::tls::get_or_generate_tls_config;
5556

@@ -291,6 +292,16 @@ fn main() {
291292
}
292293
};
293294
let event_node = Arc::clone(&node);
295+
296+
let metrics_node = Arc::clone(&node);
297+
let mut interval = tokio::time::interval(BUILD_METRICS_INTERVAL);
298+
runtime.spawn(async move {
299+
loop {
300+
interval.tick().await;
301+
METRICS.update_service_health_score(&metrics_node);
302+
}
303+
});
304+
294305
let rest_svc_listener = TcpListener::bind(config_file.rest_service_addr)
295306
.await
296307
.expect("Failed to bind listening port");

ldk-server/src/service.rs

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@ use ldk_node::Node;
2121
use ldk_server_protos::endpoints::{
2222
BOLT11_RECEIVE_PATH, BOLT11_SEND_PATH, BOLT12_RECEIVE_PATH, BOLT12_SEND_PATH,
2323
CLOSE_CHANNEL_PATH, CONNECT_PEER_PATH, FORCE_CLOSE_CHANNEL_PATH, GET_BALANCES_PATH,
24-
GET_NODE_INFO_PATH, GET_PAYMENT_DETAILS_PATH, LIST_CHANNELS_PATH, LIST_FORWARDED_PAYMENTS_PATH,
25-
LIST_PAYMENTS_PATH, ONCHAIN_RECEIVE_PATH, ONCHAIN_SEND_PATH, OPEN_CHANNEL_PATH, SPLICE_IN_PATH,
26-
SPLICE_OUT_PATH, UPDATE_CHANNEL_CONFIG_PATH,
24+
GET_METRICS_PATH, GET_NODE_INFO_PATH, GET_PAYMENT_DETAILS_PATH, LIST_CHANNELS_PATH,
25+
LIST_FORWARDED_PAYMENTS_PATH, LIST_PAYMENTS_PATH, ONCHAIN_RECEIVE_PATH, ONCHAIN_SEND_PATH,
26+
OPEN_CHANNEL_PATH, SPLICE_IN_PATH, SPLICE_OUT_PATH, UPDATE_CHANNEL_CONFIG_PATH,
2727
};
2828
use prost::Message;
2929

@@ -47,6 +47,7 @@ use crate::api::open_channel::handle_open_channel;
4747
use crate::api::splice_channel::{handle_splice_in_request, handle_splice_out_request};
4848
use crate::api::update_channel_config::handle_update_channel_config_request;
4949
use crate::io::persist::paginated_kv_store::PaginatedKVStore;
50+
use crate::util::metrics::METRICS;
5051
use crate::util::proto_adapter::to_error_response;
5152

5253
// Maximum request body size: 10 MB
@@ -148,6 +149,25 @@ impl Service<Request<Incoming>> for NodeService {
148149
type Future = Pin<Box<dyn Future<Output = Result<Self::Response, Self::Error>> + Send>>;
149150

150151
fn call(&self, req: Request<Incoming>) -> Self::Future {
152+
// Handle metrics endpoint separately to bypass auth and return plain text
153+
if req.uri().path().len() > 1 && &req.uri().path()[1..] == GET_METRICS_PATH {
154+
return Box::pin(async move {
155+
match METRICS.gather_metrics() {
156+
Ok(metrics) => Ok(Response::builder()
157+
.header("Content-Type", "text/plain")
158+
.body(Full::new(Bytes::from(metrics)))
159+
.unwrap()),
160+
Err(e) => {
161+
let (error_response, status_code) = to_error_response(e);
162+
Ok(Response::builder()
163+
.status(status_code)
164+
.body(Full::new(Bytes::from(error_response.encode_to_vec())))
165+
.unwrap())
166+
},
167+
}
168+
});
169+
}
170+
151171
// Extract auth params from headers (validation happens after body is read)
152172
let auth_params = match extract_auth_params(&req) {
153173
Ok(params) => params,

ldk-server/src/util/metrics.rs

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
// This file is Copyright its original authors, visible in version control
2+
// history.
3+
//
4+
// This file is licensed under the Apache License, Version 2.0 <LICENSE-APACHE
5+
// or http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your option.
7+
// You may not use this file except in accordance with one or both of these
8+
// licenses.
9+
10+
use std::time::Duration;
11+
12+
use lazy_static::lazy_static;
13+
use ldk_node::Node;
14+
use prometheus::{
15+
default_registry, gather, register_int_gauge_with_registry, Encoder, IntGauge, Opts, Registry,
16+
TextEncoder,
17+
};
18+
19+
use crate::api::error::LdkServerError;
20+
21+
pub const BUILD_METRICS_INTERVAL: Duration = Duration::from_secs(60);
22+
23+
lazy_static! {
24+
pub static ref METRICS: Metrics = Metrics::new(default_registry());
25+
}
26+
27+
pub struct Metrics {
28+
pub service_health_score: IntGauge,
29+
}
30+
31+
impl Metrics {
32+
pub fn new(registry: &Registry) -> Self {
33+
Self {
34+
service_health_score: register_int_gauge_with_registry!(
35+
Opts::new("ldk_health_score", "Current health score (0-100)"),
36+
registry
37+
)
38+
.expect("Failed to register metric"),
39+
}
40+
}
41+
42+
pub fn update_service_health_score(&self, node: &Node) {
43+
let score = self.calculate_ldk_server_health_score(node);
44+
self.service_health_score.set(score);
45+
}
46+
47+
/// The health score computation is pretty basic for now and simply
48+
/// calculated based on the impacted events on the components of the
49+
/// `Node`. The events severity and weightage value are as follows:
50+
///
51+
/// - Critical: 0 (Total failure)
52+
/// - Major: 35%
53+
/// - Minor: 25%
54+
///
55+
/// Using the assigned score above, the health score of the `Node` is
56+
/// computed as:
57+
///
58+
/// Health score = Maximum health score - Sum(Event severity score)
59+
///
60+
/// Where:
61+
///
62+
/// - Maximum health score = 100
63+
///
64+
/// If the `Node` is not running/online, i.e `is_running` is false,
65+
/// the severity is critical with a weightage value of -100%.
66+
///
67+
/// If the `Node` is running but isn't connected to any peer yet,
68+
/// the severity is major with a weightage value of -35%.
69+
///
70+
/// If the `Node` is running but the Lightning Wallet hasn't been synced
71+
/// yet, the severity is minor with a weightage value of -25%.
72+
pub fn calculate_ldk_server_health_score(&self, node: &Node) -> i64 {
73+
Self::compute_health_score(
74+
node.status().is_running,
75+
!node.list_peers().is_empty(),
76+
node.status().latest_lightning_wallet_sync_timestamp.is_some(),
77+
)
78+
}
79+
80+
pub fn gather_metrics(&self) -> Result<String, LdkServerError> {
81+
let mut buffer = Vec::new();
82+
let encoder = TextEncoder::new();
83+
84+
let all_metrics = gather();
85+
encoder.encode(&all_metrics, &mut buffer)?;
86+
Ok(String::from_utf8(buffer)?)
87+
}
88+
89+
fn compute_health_score(is_running: bool, has_peers: bool, is_wallet_synced: bool) -> i64 {
90+
if !is_running {
91+
return 0;
92+
}
93+
94+
let mut health_score = 100;
95+
96+
if !has_peers {
97+
health_score -= 35;
98+
}
99+
100+
if !is_wallet_synced {
101+
health_score -= 25;
102+
}
103+
104+
health_score
105+
}
106+
}
107+
108+
#[cfg(test)]
mod tests {
    use super::*;

    /// Exercises the scoring function for a stopped node and for every
    /// peer/sync combination of a running node.
    #[test]
    fn test_compute_health_score() {
        // (is_running, has_peers, is_wallet_synced, expected score)
        let cases = [
            // Node is not running
            (false, true, true, 0),
            (false, false, false, 0),
            // Node is running, connected to a peer and wallet is synced
            (true, true, true, 100),
            // Node is running, not connected to a peer but wallet is synced
            (true, false, true, 65),
            // Node is running, connected to a peer but wallet is not synced
            (true, true, false, 75),
            // Node is running, not connected to a peer and wallet is not synced
            (true, false, false, 40),
        ];

        for &(is_running, has_peers, is_wallet_synced, expected) in cases.iter() {
            assert_eq!(
                Metrics::compute_health_score(is_running, has_peers, is_wallet_synced),
                expected
            );
        }
    }

    /// The encoded output of the global instance must mention the health
    /// score gauge.
    #[test]
    fn test_gather_metrics_format() {
        let gathered = METRICS.gather_metrics();
        assert!(gathered.is_ok());
        assert!(gathered.unwrap().contains("ldk_health_score"));
    }
}

ldk-server/src/util/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,6 @@
99

1010
pub(crate) mod config;
1111
pub(crate) mod logger;
12+
pub(crate) mod metrics;
1213
pub(crate) mod proto_adapter;
1314
pub(crate) mod tls;

0 commit comments

Comments
 (0)