Skip to content

Commit d5b2d67

Browse files
committed
XXX add basic otel tracing
1 parent 29fb013 commit d5b2d67

5 files changed

Lines changed: 203 additions & 1 deletion

File tree

dropshot/Cargo.toml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,11 @@ tokio-rustls = "0.25.0"
4848
toml = "0.8.19"
4949
waitgroup = "0.1.2"
5050

51+
opentelemetry = { version = "0.27", optional = true }
52+
opentelemetry-http = { version = "0.27", features = ["hyper"], optional = true }
53+
opentelemetry-semantic-conventions = { version = "0.27", optional = true }
54+
tracing = { version = "0.1", optional = true }
55+
5156
[dependencies.chrono]
5257
version = "0.4.38"
5358
features = [ "serde", "std", "clock" ]
@@ -136,6 +141,8 @@ version_check = "0.9.5"
136141
[features]
137142
usdt-probes = ["usdt/asm"]
138143
internal-docs = ["simple-mermaid"]
144+
otel-tracing = ["opentelemetry", "opentelemetry-http", "opentelemetry-semantic-conventions"]
145+
tokio-tracing = ["tracing"]
139146

140147
[package.metadata.docs.rs]
141148
features = ["internal-docs"]

dropshot/src/handler.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,8 @@ pub struct RequestContext<Context: ServerContext> {
8686
pub log: Logger,
8787
/// basic request information (method, URI, etc.)
8888
pub request: RequestInfo,
89+
#[cfg(feature = "otel-tracing")]
90+
pub span_context: opentelemetry::trace::SpanContext,
8991
}
9092

9193
// This is deliberately as close to compatible with `hyper::Request` as
@@ -377,6 +379,19 @@ where
377379
}
378380
}
379381

382+
impl std::fmt::Display for HandlerError {
383+
fn fmt(&self, f: &mut Formatter) -> FmtResult {
384+
write!(
385+
f,
386+
"{}",
387+
match self {
388+
Self::Handler { ref message, .. } => message,
389+
Self::Dropshot(ref e) => &e.external_message,
390+
}
391+
)
392+
}
393+
}
394+
380395
/// An error type which can be converted into an HTTP response.
381396
///
382397
/// The error types returned by handlers must implement this trait, so that a

dropshot/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -834,6 +834,8 @@ mod from_map;
834834
mod handler;
835835
mod http_util;
836836
mod logging;
837+
#[cfg(feature = "otel-tracing")]
838+
mod otel;
837839
mod pagination;
838840
mod router;
839841
mod schema_util;

dropshot/src/otel.rs

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
// Copyright 2024 Oxide Computer Company
2+
//! Opentelemetry tracing support
3+
//!
4+
//! Fields that we want to produce to provide comparable
5+
//! functionality to reqwest-tracing[1]:
6+
//!
7+
//! - http.request.method
8+
//! - url.scheme
9+
//! - server.address
10+
//! - server.port
11+
//! - otel.kind
12+
//! - otel.name
13+
//! - otel.status_code
14+
//! - user_agent.original
15+
//! - http.response.status_code
16+
//! - error.message
17+
//! - error.cause_chain
18+
//!
19+
//! [1] <https://docs.rs/reqwest-tracing/0.5.4/reqwest_tracing/macro.reqwest_otel_span.html>
20+
21+
use opentelemetry::{
22+
global, trace::Span, trace::Tracer, trace::TracerProvider,
23+
};
24+
use opentelemetry_http::HeaderExtractor;
25+
use opentelemetry_semantic_conventions::trace;
26+
27+
// - http.request.method
28+
// - url.scheme
29+
// - server.address
30+
// - server.port
31+
// - otel.kind
32+
// - otel.name
33+
// - otel.status_code
34+
// - user_agent.original
35+
#[derive(Debug, Clone, serde::Serialize)]
36+
pub(crate) struct RequestInfo {
37+
pub id: String,
38+
pub local_addr: std::net::SocketAddr,
39+
pub remote_addr: std::net::SocketAddr,
40+
pub method: String,
41+
pub path: String,
42+
pub query: Option<String>,
43+
}
44+
45+
#[derive(Debug, Clone, serde::Serialize)]
46+
pub(crate) struct ResponseInfo {
47+
pub id: String,
48+
pub local_addr: std::net::SocketAddr,
49+
pub remote_addr: std::net::SocketAddr,
50+
pub status_code: u16,
51+
pub message: String,
52+
}
53+
54+
fn extract_context_from_request(
55+
request: &hyper::Request<hyper::body::Incoming>,
56+
) -> opentelemetry::Context {
57+
global::get_text_map_propagator(|propagator| {
58+
propagator.extract(&HeaderExtractor(request.headers()))
59+
})
60+
}
61+
62+
pub fn create_request_span(
63+
request: &hyper::Request<hyper::body::Incoming>,
64+
) -> opentelemetry::global::BoxedSpan {
65+
let tracer_provider = global::tracer_provider();
66+
let scope =
67+
opentelemetry::InstrumentationScope::builder("dropshot_tracing")
68+
.with_version(env!("CARGO_PKG_VERSION"))
69+
.with_schema_url("https://opentelemetry.io/schemas/1.17.0")
70+
.build();
71+
let tracer = tracer_provider.tracer_with_scope(scope);
72+
let parent_cx = extract_context_from_request(&request);
73+
tracer
74+
.span_builder("dropshot_request") //XXX Fixme
75+
.with_kind(opentelemetry::trace::SpanKind::Server)
76+
.start_with_context(&tracer, &parent_cx)
77+
}
78+
79+
pub trait TraceDropshot {
80+
fn trace_request(&mut self, request: RequestInfo);
81+
fn trace_response(&mut self, response: ResponseInfo);
82+
}
83+
84+
impl TraceDropshot for opentelemetry::global::BoxedSpan {
85+
fn trace_request(&mut self, request: RequestInfo) {
86+
self.set_attributes(vec![
87+
opentelemetry::KeyValue::new("http.id".to_string(), request.id),
88+
opentelemetry::KeyValue::new(
89+
"http.method".to_string(),
90+
request.method,
91+
),
92+
opentelemetry::KeyValue::new("http.path".to_string(), request.path),
93+
]);
94+
}
95+
fn trace_response(&mut self, response: ResponseInfo) {
96+
self.set_attributes(vec![
97+
opentelemetry::KeyValue::new(
98+
trace::HTTP_RESPONSE_STATUS_CODE,
99+
i64::from(response.status_code),
100+
),
101+
opentelemetry::KeyValue::new(
102+
"http.message".to_string(),
103+
response.message,
104+
),
105+
]);
106+
}
107+
}

dropshot/src/server.rs

Lines changed: 72 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@ use super::router::HttpRouter;
1313
use super::versioning::VersionPolicy;
1414
use super::ProbeRegistration;
1515

16+
#[cfg(feature = "otel-tracing")]
17+
use crate::{otel, otel::TraceDropshot};
18+
#[cfg(feature = "otel-tracing")]
19+
use opentelemetry::trace::Span;
20+
1621
use async_stream::stream;
1722
use debug_ignore::DebugIgnore;
1823
use futures::future::{
@@ -730,6 +735,7 @@ impl<C: ServerContext> FusedFuture for HttpServer<C> {
730735
/// invoked by Hyper when a new request is received. This function returns a
731736
/// Result that either represents a valid HTTP response or an error (which will
732737
/// also get turned into an HTTP response).
738+
#[cfg_attr(feature = "tokio-tracing", tracing::instrument(err, skip_all,))]
733739
async fn http_request_handle_wrap<C: ServerContext>(
734740
server: Arc<DropshotState<C>>,
735741
remote_addr: SocketAddr,
@@ -774,6 +780,18 @@ async fn http_request_handle_wrap<C: ServerContext>(
774780
}
775781
}
776782

783+
#[cfg(feature = "otel-tracing")]
784+
let mut span = otel::create_request_span(&request);
785+
#[cfg(feature = "otel-tracing")]
786+
span.trace_request(crate::otel::RequestInfo {
787+
id: request_id.clone(),
788+
local_addr: server.local_addr,
789+
remote_addr,
790+
method: request.method().to_string(),
791+
path: request.uri().path().to_string(),
792+
query: request.uri().query().map(|x| x.to_string()),
793+
});
794+
777795
trace!(request_log, "incoming request");
778796
#[cfg(feature = "usdt-probes")]
779797
probes::request__start!(|| {
@@ -790,9 +808,12 @@ async fn http_request_handle_wrap<C: ServerContext>(
790808

791809
// Copy local address to report later during the finish probe, as the
792810
// server is passed by value to the request handler function.
793-
#[cfg(feature = "usdt-probes")]
811+
#[cfg(any(feature = "usdt-probes", feature = "otel-tracing"))]
794812
let local_addr = server.local_addr;
795813

814+
#[cfg(feature = "otel-tracing")]
815+
let span_context = span.span_context().clone();
816+
796817
// In the case the client disconnects early, the scopeguard allows us
797818
// to perform extra housekeeping before this task is dropped.
798819
let on_disconnect = guard((), |_| {
@@ -802,6 +823,18 @@ async fn http_request_handle_wrap<C: ServerContext>(
802823
"latency_us" => latency_us,
803824
);
804825

826+
#[cfg(feature = "otel-tracing")]
827+
span.trace_response(crate::otel::ResponseInfo {
828+
id: request_id.clone(),
829+
local_addr,
830+
remote_addr,
831+
// 499 is a non-standard code popularized by nginx to mean "client disconnected".
832+
status_code: 499,
833+
message: String::from(
834+
"client disconnected before response returned",
835+
),
836+
});
837+
805838
#[cfg(feature = "usdt-probes")]
806839
probes::request__done!(|| {
807840
crate::dtrace::ResponseInfo {
@@ -823,6 +856,8 @@ async fn http_request_handle_wrap<C: ServerContext>(
823856
&request_id,
824857
request_log.new(o!()),
825858
remote_addr,
859+
#[cfg(feature = "otel-tracing")]
860+
span_context,
826861
)
827862
.await;
828863

@@ -838,6 +873,17 @@ async fn http_request_handle_wrap<C: ServerContext>(
838873
let message_external = error.external_message();
839874
let message_internal = error.internal_message();
840875

876+
#[cfg(feature = "otel-tracing")]
877+
span.trace_response(crate::otel::ResponseInfo {
878+
id: request_id.clone(),
879+
local_addr,
880+
remote_addr,
881+
status_code: status.as_u16(),
882+
message: message_external
883+
.cloned()
884+
.unwrap_or_else(|| message_internal.clone()),
885+
});
886+
841887
#[cfg(feature = "usdt-probes")]
842888
probes::request__done!(|| {
843889
crate::dtrace::ResponseInfo {
@@ -869,6 +915,15 @@ async fn http_request_handle_wrap<C: ServerContext>(
869915
"latency_us" => latency_us,
870916
);
871917

918+
#[cfg(feature = "otel-tracing")]
919+
span.trace_response(crate::otel::ResponseInfo {
920+
id: request_id.parse().unwrap(),
921+
local_addr,
922+
remote_addr,
923+
status_code: response.status().as_u16(),
924+
message: "".to_string(),
925+
});
926+
872927
#[cfg(feature = "usdt-probes")]
873928
probes::request__done!(|| {
874929
crate::dtrace::ResponseInfo {
@@ -887,12 +942,26 @@ async fn http_request_handle_wrap<C: ServerContext>(
887942
Ok(response)
888943
}
889944

945+
#[cfg_attr(feature = "tokio-tracing", tracing::instrument(
946+
err,
947+
skip_all,
948+
fields(
949+
http.method = request.method().as_str().to_string(),
950+
http.uri = request.uri().to_string(),
951+
http.version = format!("{:#?}",request.version()),
952+
http.headers.accept = format!("{:#?}", request.headers()["accept"]),
953+
http.headers.host = format!("{:#?}", request.headers()["host"]),
954+
//http.headers.user_agent = format!("{:#?}", request.headers()["user-agent"]),
955+
),
956+
))]
890957
async fn http_request_handle<C: ServerContext>(
891958
server: Arc<DropshotState<C>>,
892959
request: Request<hyper::body::Incoming>,
893960
request_id: &str,
894961
request_log: Logger,
895962
remote_addr: std::net::SocketAddr,
963+
#[cfg(feature = "otel-tracing")]
964+
span_context: opentelemetry::trace::SpanContext,
896965
) -> Result<Response<Body>, HandlerError> {
897966
// TODO-hardening: is it correct to (and do we correctly) read the entire
898967
// request body even if we decide it's too large and are going to send a 400
@@ -916,6 +985,8 @@ async fn http_request_handle<C: ServerContext>(
916985
endpoint: lookup_result.endpoint,
917986
request_id: request_id.to_string(),
918987
log: request_log,
988+
#[cfg(feature = "otel-tracing")]
989+
span_context: span_context,
919990
};
920991
let handler = lookup_result.handler;
921992

0 commit comments

Comments
 (0)