Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
bc42093
Trim per-span work on metrics aggregator publish path
dougqh May 15, 2026
808d63d
Add SpanKindFilter and CoreSpan.isKind for bitmask-based kind checks
dougqh May 15, 2026
6aa620e
Use SpanKindFilter in ConflatingMetricsAggregator
dougqh May 15, 2026
a02d0a9
Add DDSpan-based variant of ConflatingMetricsAggregator JMH benchmark
dougqh May 15, 2026
ed38f18
Tighten SpanKindFilter encapsulation
dougqh May 15, 2026
034afc0
Defer MetricKey construction and cache lookups to the aggregator thread
dougqh May 15, 2026
3a056b3
Report aggregator inbox-full drops via health metrics
dougqh May 15, 2026
3355865
Merge branch 'master' into dougqh/conflating-metrics-producer-wins
dougqh May 18, 2026
950499c
Merge branch 'dougqh/conflating-metrics-producer-wins' into dougqh/co…
dougqh May 18, 2026
8414960
Resize previousCounts for inbox-full health metric
dougqh May 19, 2026
24969db
Skip SpanSnapshot allocation when the inbox is already at capacity
dougqh May 19, 2026
795ba76
Merge remote-tracking branch 'origin/master' into dougqh/conflating-m…
dougqh May 20, 2026
e455801
Introduce slim PeerTagSchema; capture peer-tag values not pairs
dougqh May 20, 2026
e766fd3
Address PR #11381 review (round 2)
dougqh May 21, 2026
8cfa4a5
Cover inbox-full fast-path in ConflatingMetricsAggregator.publish
dougqh May 21, 2026
3644470
Reconcile PeerTagSchema once per reporting cycle on the aggregator th…
dougqh May 21, 2026
e7d0b42
Add bootstrap + reconcile coverage for PeerTagSchema
dougqh May 21, 2026
ba3225c
Use writer.finishBucket() count in bootstrap test for cascade compati…
dougqh May 21, 2026
ec857b2
Merge branch 'master' into dougqh/conflating-metrics-background-work
dougqh May 21, 2026
0b86066
Preserve TRACER_METRICS_MAX_PENDING semantic + drop stale imports
dougqh May 21, 2026
5c78dbb
Add AdversarialMetricsBenchmark for capacity-bound stress testing
dougqh May 21, 2026
70c20ef
Trim AdversarialMetricsBenchmark counters and clarify printout
dougqh May 21, 2026
68848ad
Close PeerTagSchema reconcile race + cover the swap branch
dougqh May 21, 2026
2ea61c5
Clarify materializePeerTags hit-counting loop
dougqh May 21, 2026
5a4685f
Drop unused Tags imports flagged by codenarc
dougqh May 21, 2026
a1863db
Update dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerT…
dougqh May 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,16 @@ public Set<String> peerTags() {
return discoveryState.peerTags;
}

/**
* Wall-clock timestamp ({@link System#currentTimeMillis()}) of the most recent successful
* feature discovery, or {@code 0L} if discovery has never run. Callers (e.g. the client-stats
* aggregator) snapshot this alongside {@link #peerTags()} to detect when discovery has refreshed
* and a cached view of feature state may be stale.
*/
public long getLastTimeDiscovered() {
return discoveryState.lastTimeDiscovered;
}

public String getMetricsEndpoint() {
return discoveryState.metricsEndpoint;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
package datadog.trace.common.metrics;

import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND;
import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND_CLIENT;
import static java.util.concurrent.TimeUnit.SECONDS;

import datadog.trace.api.WellKnownTags;
import datadog.trace.core.CoreSpan;
import datadog.trace.core.monitor.HealthMetrics;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.atomic.LongAdder;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.Threads;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;

/**
* Adversarial JMH benchmark designed to stress the metrics subsystem's capacity bounds.
*
* <p>The metrics aggregator is bounded at every layer:
*
* <ul>
* <li>The aggregate cache caps total entries at {@code tracerMetricsMaxAggregates} (default
* 2048). Beyond that LRU eviction kicks in.
* <li>The producer/consumer inbox is a fixed-size MPSC queue ({@code tracerMetricsMaxPending});
* when full, producer {@code offer} returns false and the snapshot is dropped via {@link
* HealthMetrics#onStatsInboxFull()}.
* <li>Histograms use a bounded dense store -- per-histogram memory is fixed.
* </ul>
*
* <p>The benchmark hammers all of these simultaneously with 8 producer threads, unique labels per
* op (so the aggregate cache fills+evicts repeatedly), random durations across a wide range (so
* histograms accept many distinct bins), and random {@code error}/{@code topLevel} flags (so both
* histograms are exercised). After the run, drop counters are printed so you can see how the
* subsystem absorbed the burst.
*
* <p>What "OOM the metrics subsystem" would look like if the bounds break: producer-thread
* allocation would grow unbounded (snapshots faster than the inbox can drain produces dropped
* snapshots, not heap growth); aggregator-thread heap would grow if entries weren't capped or
* histograms grew past their dense-store limit.
*/
@State(Scope.Benchmark)
@Warmup(iterations = 2, time = 15, timeUnit = SECONDS)
@Measurement(iterations = 5, time = 15, timeUnit = SECONDS)
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(SECONDS)
@Threads(8)
@Fork(value = 1)
public class AdversarialMetricsBenchmark {

private ConflatingMetricsAggregator aggregator;
private CountingHealthMetrics health;

@State(Scope.Thread)
public static class ThreadState {
int cursor;
}

@Setup
public void setup() {
this.health = new CountingHealthMetrics();
this.aggregator =
new ConflatingMetricsAggregator(
new WellKnownTags("", "", "", "", "", ""),
Collections.emptySet(),
new ConflatingMetricsAggregatorBenchmark.FixedAgentFeaturesDiscovery(
Collections.singleton("peer.hostname"), Collections.emptySet()),
this.health,
new ConflatingMetricsAggregatorBenchmark.NullSink(),
2048,
2048,
false);
this.aggregator.start();
}

@TearDown
public void tearDown() {
aggregator.close();
// Counters accumulate across the trial (warmup + measurement iterations), since the
// CountingHealthMetrics instance is created once in @Setup and never reset.
System.err.println(
"[ADVERSARIAL] drops over the trial (8 threads, warmup + measurement combined):");
System.err.println(
" onStatsInboxFull = "
+ health.inboxFull.sum()
+ " (snapshots dropped because the MPSC inbox was full)");
System.err.println(
" onStatsAggregateDropped = "
+ health.aggregateDropped.sum()
+ " (snapshots dropped because the aggregate cache was full with no stale entry)");
}

@Benchmark
public void publish(ThreadState ts, Blackhole blackhole) {
int idx = ts.cursor++;
ThreadLocalRandom rng = ThreadLocalRandom.current();

// Mix indices so labels don't fall into linear order. Distinct labels exceed every reasonable
// working-set bound, so the aggregate cache evicts continuously and most ops force a fresh
// MetricKey construction on the consumer thread.
int scrambled = idx * 0x9E3779B1; // golden ratio multiplier
String service = "svc-" + (scrambled & 0xFFFF);
String operation = "op-" + ((scrambled >>> 8) & 0x3FFFF);
String resource = "res-" + ((scrambled ^ 0x5A5A5A) & 0xFFFFF);
String hostname = "host-" + ((scrambled >>> 12) & 0x7FFF);
boolean error = (idx & 7) == 0;
boolean topLevel = (idx & 3) == 0;
// Wide duration spread forces histogram bins to populate broadly.
long durationNanos = 1L + (rng.nextLong() & 0x3FFFFFFFL); // 1 ns .. ~1.07 s

SimpleSpan span =
new SimpleSpan(
service, operation, resource, "web", true, topLevel, error, 0, durationNanos, 200);
span.setTag(SPAN_KIND, SPAN_KIND_CLIENT);
span.setTag("peer.hostname", hostname);

List<CoreSpan<?>> trace = Collections.singletonList(span);
blackhole.consume(aggregator.publish(trace));
}

/**
* Counts what gets dropped. Uses {@link LongAdder} so the printed totals hold up under 8-way
* contention -- {@code volatile long ++} loses ~20% of updates here, which would mask the
* order-of-magnitude shape the bench is trying to surface (inbox-full vs aggregate-dropped).
*/
static final class CountingHealthMetrics extends HealthMetrics {
final LongAdder inboxFull = new LongAdder();
final LongAdder aggregateDropped = new LongAdder();

@Override
public void onStatsInboxFull() {
inboxFull.increment();
}

@Override
public void onStatsAggregateDropped() {
aggregateDropped.increment();
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,27 @@ public AggregateMetric recordDurations(int count, AtomicLongArray durations) {
return this;
}

/**
* Records a single hit. {@code tagAndDuration} carries the duration nanos with optional {@link
* #ERROR_TAG} / {@link #TOP_LEVEL_TAG} bits OR-ed in.
*/
public AggregateMetric recordOneDuration(long tagAndDuration) {
Comment thread
dougqh marked this conversation as resolved.
++hitCount;
if ((tagAndDuration & TOP_LEVEL_TAG) == TOP_LEVEL_TAG) {
tagAndDuration ^= TOP_LEVEL_TAG;
++topLevelCount;
}
if ((tagAndDuration & ERROR_TAG) == ERROR_TAG) {
tagAndDuration ^= ERROR_TAG;
errorLatencies.accept(tagAndDuration);
++errorCount;
} else {
okLatencies.accept(tagAndDuration);
}
duration += tagAndDuration;
return this;
}

public int getErrorCount() {
return errorCount;
}
Expand Down
Loading