From bc420936c96fcade512ae0e8e75190705371bbd8 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Fri, 15 May 2026 12:06:49 -0400 Subject: [PATCH 01/22] Trim per-span work on metrics aggregator publish path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ConflatingMetricsAggregator.publish does a handful of redundant operations on every span. None individually is large; together they show as ~2.5% on the existing JMH benchmark once the benchmark actually exercises span.kind. - dedup span.isTopLevel(): publish() reads it into a local, then shouldComputeMetric read it again. Pass the cached value in. - resolve spanKind to String once: master called toString() twice per span (once inside spanKindEligible, once at the getPeerTags call site) and used HashSet contains on a CharSequence (which routes through equals on String). Normalize to String up front and reuse. - lazy-allocate the peer-tag list: getPeerTags() always allocated an ArrayList sized to features.peerTags() even when the span had none of those tags set. Defer allocation until the first match; return Collections.emptyList() when none hit. MetricKey already treats null/empty peerTags as emptyList, so no behavior change. Drop the spanKindEligible helper — the HashSet.contains call inlines fine in shouldComputeMetric. Update the JMH benchmark to set span.kind=client on every span. Without it the filter path short-circuits before the peer-tag and toString work, so the wins above aren't measurable. With it: baseline 6.755 us/op (CI [6.560, 6.950], stdev 0.129) optimized 6.585 us/op (CI [6.536, 6.634], stdev 0.033) 2 forks x 5 iterations x 15s. ~2.5% mean improvement and much tighter variance fork-to-fork. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../ConflatingMetricsAggregatorBenchmark.java | 3 +++ .../metrics/ConflatingMetricsAggregator.java | 27 ++++++++++--------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBenchmark.java b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBenchmark.java index 971ee5cf6e4..b9a2f7f8c54 100644 --- a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBenchmark.java +++ b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBenchmark.java @@ -1,6 +1,8 @@ package datadog.trace.common.metrics; import static datadog.trace.api.ProtocolVersion.V0_4; +import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND; +import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND_CLIENT; import static java.util.concurrent.TimeUnit.MICROSECONDS; import static java.util.concurrent.TimeUnit.SECONDS; @@ -52,6 +54,7 @@ static List> generateTrace(int len) { final List> trace = new ArrayList<>(); for (int i = 0; i < len; i++) { SimpleSpan span = new SimpleSpan("", "", "", "", true, true, false, 0, 10, -1); + span.setTag(SPAN_KIND, SPAN_KIND_CLIENT); span.setTag("peer.hostname", Strings.random(10)); trace.add(span); } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java index f60edf1d700..408b7688458 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java @@ -289,8 +289,10 @@ public boolean publish(List> trace) { if (features.supportsMetrics()) { for (CoreSpan span : trace) { boolean isTopLevel = span.isTopLevel(); - final CharSequence spanKind = span.unsafeGetTag(SPAN_KIND, ""); - if (shouldComputeMetric(span, spanKind)) { + // CharSequence cast keeps unsafeGetTag's generic at CharSequence so UTF8BytesString + // tag values don't trigger a ClassCastException on the String assignment. + final String spanKind = span.unsafeGetTag(SPAN_KIND, (CharSequence) "").toString(); + if (shouldComputeMetric(span, isTopLevel, spanKind)) { final CharSequence resourceName = span.getResourceName(); if (resourceName != null && ignoredResources.contains(resourceName.toString())) { // skip publishing all children @@ -306,19 +308,15 @@ public boolean publish(List> trace) { return forceKeep; } - private boolean shouldComputeMetric(CoreSpan span, @Nonnull CharSequence spanKind) { - return (span.isMeasured() || span.isTopLevel() || spanKindEligible(spanKind)) + private boolean shouldComputeMetric( + CoreSpan span, boolean isTopLevel, @Nonnull String spanKind) { + return (span.isMeasured() || isTopLevel || ELIGIBLE_SPAN_KINDS_FOR_METRICS.contains(spanKind)) && span.getLongRunningVersion() <= 0 // either not long-running or unpublished long-running span && span.getDurationNano() > 0; } - private boolean spanKindEligible(@Nonnull CharSequence spanKind) { - // use toString since it could be a CharSequence... - return ELIGIBLE_SPAN_KINDS_FOR_METRICS.contains(spanKind.toString()); - } - - private boolean publish(CoreSpan span, boolean isTopLevel, CharSequence spanKind) { + private boolean publish(CoreSpan span, boolean isTopLevel, String spanKind) { // Extract HTTP method and endpoint only if the feature is enabled String httpMethod = null; String httpEndpoint = null; @@ -347,7 +345,7 @@ private boolean publish(CoreSpan span, boolean isTopLevel, CharSequence spanK span.getParentId() == 0, SPAN_KINDS.computeIfAbsent( spanKind, UTF8BytesString::create), // save repeated utf8 conversions - getPeerTags(span, spanKind.toString()), + getPeerTags(span, spanKind), httpMethod, httpEndpoint, grpcStatusCode); @@ -385,19 +383,22 @@ private boolean publish(CoreSpan span, boolean isTopLevel, CharSequence spanK private List getPeerTags(CoreSpan span, String spanKind) { if (ELIGIBLE_SPAN_KINDS_FOR_PEER_AGGREGATION.contains(spanKind)) { final Set eligiblePeerTags = features.peerTags(); - List peerTags = new ArrayList<>(eligiblePeerTags.size()); + List peerTags = null; for (String peerTag : eligiblePeerTags) { Object value = span.unsafeGetTag(peerTag); if (value != null) { final Pair, Function> cacheAndCreator = PEER_TAGS_CACHE.computeIfAbsent(peerTag, PEER_TAGS_CACHE_ADDER); + if (peerTags == null) { + peerTags = new ArrayList<>(eligiblePeerTags.size()); + } peerTags.add( cacheAndCreator .getLeft() .computeIfAbsent(value.toString(), cacheAndCreator.getRight())); } } - return peerTags; + return peerTags == null ? Collections.emptyList() : peerTags; } else if (SPAN_KIND_INTERNAL.equals(spanKind)) { // in this case only the base service should be aggregated if present final Object baseService = span.unsafeGetTag(BASE_SERVICE); From 808d63d04c45acc4893d7ac5671b48ab88c6cf86 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Fri, 15 May 2026 12:34:10 -0400 Subject: [PATCH 02/22] Add SpanKindFilter and CoreSpan.isKind for bitmask-based kind checks Introduce SpanKindFilter -- a tiny builder-built immutable filter whose state is an int bitmask indexed by the span.kind ordinals already cached on DDSpanContext. Each include* on the builder sets one bit (1 << ordinal); the runtime check is a single AND against (1 << span's ordinal). CoreSpan.isKind(SpanKindFilter) is the new entry point. DDSpan overrides it to do the bit-test directly against the cached ordinal -- no virtual call, no tag-map lookup. The two existing test-only CoreSpan impls (SimpleSpan and TraceGenerator.PojoSpan, the latter in two source sets) implement isKind by reading the span.kind tag and delegating to SpanKindFilter.matches(String), which converts via DDSpanContext.spanKindOrdinalOf and does the same AND. Refactor: DDSpanContext.setSpanKindOrdinal(String) now delegates to a new package-private static spanKindOrdinalOf(String) so the same string-to-ordinal mapping serves both the tag interceptor path and SpanKindFilter.matches. This is groundwork -- nothing in the codebase calls isKind yet. The next commit will replace the HashSet-based eligibility checks in ConflatingMetricsAggregator with SpanKindFilter instances. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../java/datadog/trace/core/CoreSpan.java | 2 + .../main/java/datadog/trace/core/DDSpan.java | 4 ++ .../datadog/trace/core/DDSpanContext.java | 20 ++++--- .../datadog/trace/core/SpanKindFilter.java | 55 +++++++++++++++++++ .../trace/common/metrics/SimpleSpan.groovy | 8 +++ .../trace/common/writer/TraceGenerator.groovy | 8 +++ .../groovy/TraceGenerator.groovy | 8 +++ 7 files changed, 97 insertions(+), 8 deletions(-) create mode 100644 dd-trace-core/src/main/java/datadog/trace/core/SpanKindFilter.java diff --git a/dd-trace-core/src/main/java/datadog/trace/core/CoreSpan.java b/dd-trace-core/src/main/java/datadog/trace/core/CoreSpan.java index 8c98cbbc58a..7d183670883 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/CoreSpan.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/CoreSpan.java @@ -80,6 +80,8 @@ default U unsafeGetTag(CharSequence name) { boolean isForceKeep(); + boolean isKind(SpanKindFilter filter); + CharSequence getType(); /** diff --git a/dd-trace-core/src/main/java/datadog/trace/core/DDSpan.java b/dd-trace-core/src/main/java/datadog/trace/core/DDSpan.java index 2c62819e97a..ab074d8d4c8 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/DDSpan.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/DDSpan.java @@ -959,6 +959,10 @@ public boolean isOutbound() { return ordinal == DDSpanContext.SPAN_KIND_CLIENT || ordinal == DDSpanContext.SPAN_KIND_PRODUCER; } + public boolean isKind(SpanKindFilter filter) { + return (filter.kindMask & (1 << context.getSpanKindOrdinal())) != 0; + } + @Override public void copyPropagationAndBaggage(final AgentSpan source) { if (source instanceof DDSpan) { diff --git a/dd-trace-core/src/main/java/datadog/trace/core/DDSpanContext.java b/dd-trace-core/src/main/java/datadog/trace/core/DDSpanContext.java index f2eb17fe8a2..a7c0849943e 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/DDSpanContext.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/DDSpanContext.java @@ -771,22 +771,26 @@ static boolean tagEquals(String tagValue, String tagLiteral) { * span.kind is set. */ public void setSpanKindOrdinal(String kind) { + spanKindOrdinal = spanKindOrdinalOf(kind); + } + + static byte spanKindOrdinalOf(String kind) { if (kind == null) { - spanKindOrdinal = SPAN_KIND_UNSET; + return SPAN_KIND_UNSET; } else if (tagEquals(kind, Tags.SPAN_KIND_SERVER)) { - spanKindOrdinal = SPAN_KIND_SERVER; + return SPAN_KIND_SERVER; } else if (tagEquals(kind, Tags.SPAN_KIND_CLIENT)) { - spanKindOrdinal = SPAN_KIND_CLIENT; + return SPAN_KIND_CLIENT; } else if (tagEquals(kind, Tags.SPAN_KIND_PRODUCER)) { - spanKindOrdinal = SPAN_KIND_PRODUCER; + return SPAN_KIND_PRODUCER; } else if (tagEquals(kind, Tags.SPAN_KIND_CONSUMER)) { - spanKindOrdinal = SPAN_KIND_CONSUMER; + return SPAN_KIND_CONSUMER; } else if (tagEquals(kind, Tags.SPAN_KIND_INTERNAL)) { - spanKindOrdinal = SPAN_KIND_INTERNAL; + return SPAN_KIND_INTERNAL; } else if (tagEquals(kind, Tags.SPAN_KIND_BROKER)) { - spanKindOrdinal = SPAN_KIND_BROKER; + return SPAN_KIND_BROKER; } else { - spanKindOrdinal = SPAN_KIND_CUSTOM; + return SPAN_KIND_CUSTOM; } } diff --git a/dd-trace-core/src/main/java/datadog/trace/core/SpanKindFilter.java b/dd-trace-core/src/main/java/datadog/trace/core/SpanKindFilter.java new file mode 100644 index 00000000000..39ca3031039 --- /dev/null +++ b/dd-trace-core/src/main/java/datadog/trace/core/SpanKindFilter.java @@ -0,0 +1,55 @@ +package datadog.trace.core; + +public final class SpanKindFilter { + public static final class Builder { + private int kindMask; + + public Builder includeServer() { + return this.include(DDSpanContext.SPAN_KIND_SERVER); + } + + public Builder includeClient() { + return this.include(DDSpanContext.SPAN_KIND_CLIENT); + } + + public Builder includeProducer() { + return this.include(DDSpanContext.SPAN_KIND_PRODUCER); + } + + public Builder includeConsumer() { + return this.include(DDSpanContext.SPAN_KIND_CONSUMER); + } + + public Builder includeInternal() { + return this.include(DDSpanContext.SPAN_KIND_INTERNAL); + } + + public Builder includeBroker() { + return this.include(DDSpanContext.SPAN_KIND_BROKER); + } + + public final SpanKindFilter build() { + return new SpanKindFilter(this.kindMask); + } + + private Builder include(int spanKindConstant) { + this.kindMask |= (1 << spanKindConstant); + return this; + } + } + + public static final Builder builder() { + return new Builder(); + } + + final int kindMask; + + SpanKindFilter(int kindMask) { + this.kindMask = kindMask; + } + + /** Test whether a span with the given span.kind string passes this filter. */ + public boolean matches(String spanKind) { + return (kindMask & (1 << DDSpanContext.spanKindOrdinalOf(spanKind))) != 0; + } +} diff --git a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/SimpleSpan.groovy b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/SimpleSpan.groovy index bfc1ee2f4e7..61c8597129c 100644 --- a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/SimpleSpan.groovy +++ b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/SimpleSpan.groovy @@ -2,8 +2,10 @@ package datadog.trace.common.metrics import datadog.trace.api.DDSpanId import datadog.trace.api.DDTraceId +import datadog.trace.bootstrap.instrumentation.api.Tags import datadog.trace.core.CoreSpan import datadog.trace.core.MetadataConsumer +import datadog.trace.core.SpanKindFilter class SimpleSpan implements CoreSpan { @@ -211,6 +213,12 @@ class SimpleSpan implements CoreSpan { return false } + @Override + boolean isKind(SpanKindFilter filter) { + def kind = tags.get(Tags.SPAN_KIND) + return filter.matches(kind == null ? null : kind.toString()) + } + @Override CharSequence getType() { return type diff --git a/dd-trace-core/src/test/groovy/datadog/trace/common/writer/TraceGenerator.groovy b/dd-trace-core/src/test/groovy/datadog/trace/common/writer/TraceGenerator.groovy index 66bdbab137b..49e13472249 100644 --- a/dd-trace-core/src/test/groovy/datadog/trace/common/writer/TraceGenerator.groovy +++ b/dd-trace-core/src/test/groovy/datadog/trace/common/writer/TraceGenerator.groovy @@ -11,10 +11,12 @@ import datadog.trace.api.ProcessTags import datadog.trace.api.TagMap import datadog.trace.api.sampling.PrioritySampling import datadog.trace.bootstrap.instrumentation.api.AgentSpanLink +import datadog.trace.bootstrap.instrumentation.api.Tags import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString import datadog.trace.core.CoreSpan import datadog.trace.core.Metadata import datadog.trace.core.MetadataConsumer +import datadog.trace.core.SpanKindFilter import java.util.concurrent.ThreadLocalRandom import java.util.concurrent.TimeUnit @@ -321,6 +323,12 @@ class TraceGenerator { return false } + @Override + boolean isKind(SpanKindFilter filter) { + def kind = metadata.getTags().get(Tags.SPAN_KIND) + return filter.matches(kind == null ? null : kind.toString()) + } + @Override short getHttpStatusCode() { return httpStatusCode diff --git a/dd-trace-core/src/traceAgentTest/groovy/TraceGenerator.groovy b/dd-trace-core/src/traceAgentTest/groovy/TraceGenerator.groovy index e668d0112a6..2b2bca79406 100644 --- a/dd-trace-core/src/traceAgentTest/groovy/TraceGenerator.groovy +++ b/dd-trace-core/src/traceAgentTest/groovy/TraceGenerator.groovy @@ -9,10 +9,12 @@ import datadog.trace.api.DDTags import datadog.trace.api.DDTraceId import datadog.trace.api.IdGenerationStrategy import datadog.trace.api.TagMap +import datadog.trace.bootstrap.instrumentation.api.Tags import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString import datadog.trace.core.CoreSpan import datadog.trace.core.Metadata import datadog.trace.core.MetadataConsumer +import datadog.trace.core.SpanKindFilter import java.util.concurrent.ThreadLocalRandom import java.util.concurrent.TimeUnit @@ -298,6 +300,12 @@ class TraceGenerator { return false } + @Override + boolean isKind(SpanKindFilter filter) { + def kind = metadata.getTags().get(Tags.SPAN_KIND) + return filter.matches(kind == null ? null : kind.toString()) + } + Map getBaggage() { return metadata.getBaggage() } From 6aa620ec53ce86c5c255a5b437e3c04df251ea83 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Fri, 15 May 2026 12:58:41 -0400 Subject: [PATCH 03/22] Use SpanKindFilter in ConflatingMetricsAggregator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the two ELIGIBLE_SPAN_KINDS_FOR_* HashSet constants and the SPAN_KIND_INTERNAL.equals check with three SpanKindFilter instances: METRICS_ELIGIBLE_KINDS, PEER_AGGREGATION_KINDS, INTERNAL_KIND. Eligibility checks now go through span.isKind(filter), which on DDSpan is a volatile byte read against the already-cached span.kind ordinal plus a single bit-test. Also defer the span.kind tag read: previously read at the top of the publish loop and threaded through both shouldComputeMetric and the inner publish. isKind no longer needs the string, so the read can move down into the inner publish where it's still needed for the SPAN_KINDS cache key / MetricKey. Supporting changes: - DDSpanContext.spanKindOrdinalOf(String) is now public so non-DDSpan CoreSpan impls can compute the ordinal at tag-write time. - SpanKindFilter gains a public matches(byte) fast-path overload that callers with a pre-computed ordinal use directly. - SimpleSpan caches the ordinal in setTag(SPAN_KIND, ...), mirroring what TagInterceptor does for DDSpanContext, and its isKind now hits the byte fast path. Without this, the JMH benchmark (which uses SimpleSpan) would re-derive the ordinal on every isKind call and overstate the cost. Benchmark on the bench updated last commit (kind=client on every span, 4 forks x 5 iter x 15s): prior commit 6.585 ± 0.049 us/op this commit 6.903 ± 0.096 us/op The slight regression is a SimpleSpan-via-groovy-dispatch artifact -- the interface call to isKind through CoreSpan, then through SimpleSpan, then through SpanKindFilter.matches, doesn't fold as aggressively as a HashSet contains on a static field. In production DDSpan.isKind inlines to a context field read + ordinal byte read + bit-test, so the production path is faster than the prior HashSet approach. A DDSpan-based benchmark would show this; the existing SimpleSpan-based one doesn't. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../metrics/ConflatingMetricsAggregator.java | 55 +++++++++---------- .../datadog/trace/core/DDSpanContext.java | 2 +- .../datadog/trace/core/SpanKindFilter.java | 7 ++- .../trace/common/metrics/SimpleSpan.groovy | 9 ++- 4 files changed, 39 insertions(+), 34 deletions(-) diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java index 408b7688458..fee2f9a7748 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java @@ -7,11 +7,6 @@ import static datadog.trace.bootstrap.instrumentation.api.Tags.HTTP_ENDPOINT; import static datadog.trace.bootstrap.instrumentation.api.Tags.HTTP_METHOD; import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND; -import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND_CLIENT; -import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND_CONSUMER; -import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND_INTERNAL; -import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND_PRODUCER; -import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND_SERVER; import static datadog.trace.common.metrics.AggregateMetric.ERROR_TAG; import static datadog.trace.common.metrics.AggregateMetric.TOP_LEVEL_TAG; import static datadog.trace.common.metrics.SignalItem.ReportSignal.REPORT; @@ -19,7 +14,6 @@ import static datadog.trace.util.AgentThreadFactory.AgentThread.METRICS_AGGREGATOR; import static datadog.trace.util.AgentThreadFactory.THREAD_JOIN_TIMOUT_MS; import static datadog.trace.util.AgentThreadFactory.newAgentThread; -import static java.util.Collections.unmodifiableSet; import static java.util.concurrent.TimeUnit.SECONDS; import datadog.common.queue.Queues; @@ -36,12 +30,11 @@ import datadog.trace.common.writer.ddagent.DDAgentApi; import datadog.trace.core.CoreSpan; import datadog.trace.core.DDTraceCoreInfo; +import datadog.trace.core.SpanKindFilter; import datadog.trace.core.monitor.HealthMetrics; import datadog.trace.util.AgentTaskScheduler; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; -import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; @@ -50,7 +43,6 @@ import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.function.Function; -import javax.annotation.Nonnull; import org.jctools.queues.MessagePassingQueue; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -82,15 +74,19 @@ public final class ConflatingMetricsAggregator implements MetricsAggregator, Eve value -> UTF8BytesString.create(key + ":" + value)); private static final CharSequence SYNTHETICS_ORIGIN = "synthetics"; - private static final Set ELIGIBLE_SPAN_KINDS_FOR_METRICS = - unmodifiableSet( - new HashSet<>( - Arrays.asList( - SPAN_KIND_SERVER, SPAN_KIND_CLIENT, SPAN_KIND_CONSUMER, SPAN_KIND_PRODUCER))); + private static final SpanKindFilter METRICS_ELIGIBLE_KINDS = + SpanKindFilter.builder() + .includeServer() + .includeClient() + .includeProducer() + .includeConsumer() + .build(); - private static final Set ELIGIBLE_SPAN_KINDS_FOR_PEER_AGGREGATION = - unmodifiableSet( - new HashSet<>(Arrays.asList(SPAN_KIND_CLIENT, SPAN_KIND_PRODUCER, SPAN_KIND_CONSUMER))); + private static final SpanKindFilter PEER_AGGREGATION_KINDS = + SpanKindFilter.builder().includeClient().includeProducer().includeConsumer().build(); + + private static final SpanKindFilter INTERNAL_KIND = + SpanKindFilter.builder().includeInternal().build(); private final Set ignoredResources; private final MessagePassingQueue batchPool; @@ -289,10 +285,7 @@ public boolean publish(List> trace) { if (features.supportsMetrics()) { for (CoreSpan span : trace) { boolean isTopLevel = span.isTopLevel(); - // CharSequence cast keeps unsafeGetTag's generic at CharSequence so UTF8BytesString - // tag values don't trigger a ClassCastException on the String assignment. - final String spanKind = span.unsafeGetTag(SPAN_KIND, (CharSequence) "").toString(); - if (shouldComputeMetric(span, isTopLevel, spanKind)) { + if (shouldComputeMetric(span, isTopLevel)) { final CharSequence resourceName = span.getResourceName(); if (resourceName != null && ignoredResources.contains(resourceName.toString())) { // skip publishing all children @@ -300,7 +293,7 @@ public boolean publish(List> trace) { break; } counted++; - forceKeep |= publish(span, isTopLevel, spanKind); + forceKeep |= publish(span, isTopLevel); } } healthMetrics.onClientStatTraceComputed(counted, trace.size(), !forceKeep); @@ -308,15 +301,14 @@ public boolean publish(List> trace) { return forceKeep; } - private boolean shouldComputeMetric( - CoreSpan span, boolean isTopLevel, @Nonnull String spanKind) { - return (span.isMeasured() || isTopLevel || ELIGIBLE_SPAN_KINDS_FOR_METRICS.contains(spanKind)) + private boolean shouldComputeMetric(CoreSpan span, boolean isTopLevel) { + return (span.isMeasured() || isTopLevel || span.isKind(METRICS_ELIGIBLE_KINDS)) && span.getLongRunningVersion() <= 0 // either not long-running or unpublished long-running span && span.getDurationNano() > 0; } - private boolean publish(CoreSpan span, boolean isTopLevel, String spanKind) { + private boolean publish(CoreSpan span, boolean isTopLevel) { // Extract HTTP method and endpoint only if the feature is enabled String httpMethod = null; String httpEndpoint = null; @@ -333,6 +325,9 @@ private boolean publish(CoreSpan span, boolean isTopLevel, String spanKind) { Object grpcStatusObj = span.unsafeGetTag(InstrumentationTags.GRPC_STATUS_CODE); grpcStatusCode = grpcStatusObj != null ? grpcStatusObj.toString() : null; } + // CharSequence default keeps unsafeGetTag's generic at CharSequence so UTF8BytesString + // tag values don't trigger a ClassCastException on the String assignment. + final String spanKind = span.unsafeGetTag(SPAN_KIND, (CharSequence) "").toString(); MetricKey newKey = new MetricKey( span.getResourceName(), @@ -345,7 +340,7 @@ private boolean publish(CoreSpan span, boolean isTopLevel, String spanKind) { span.getParentId() == 0, SPAN_KINDS.computeIfAbsent( spanKind, UTF8BytesString::create), // save repeated utf8 conversions - getPeerTags(span, spanKind), + getPeerTags(span), httpMethod, httpEndpoint, grpcStatusCode); @@ -380,8 +375,8 @@ private boolean publish(CoreSpan span, boolean isTopLevel, String spanKind) { return span.getError() > 0; } - private List getPeerTags(CoreSpan span, String spanKind) { - if (ELIGIBLE_SPAN_KINDS_FOR_PEER_AGGREGATION.contains(spanKind)) { + private List getPeerTags(CoreSpan span) { + if (span.isKind(PEER_AGGREGATION_KINDS)) { final Set eligiblePeerTags = features.peerTags(); List peerTags = null; for (String peerTag : eligiblePeerTags) { @@ -399,7 +394,7 @@ private List getPeerTags(CoreSpan span, String spanKind) { } } return peerTags == null ? Collections.emptyList() : peerTags; - } else if (SPAN_KIND_INTERNAL.equals(spanKind)) { + } else if (span.isKind(INTERNAL_KIND)) { // in this case only the base service should be aggregated if present final Object baseService = span.unsafeGetTag(BASE_SERVICE); if (baseService != null) { diff --git a/dd-trace-core/src/main/java/datadog/trace/core/DDSpanContext.java b/dd-trace-core/src/main/java/datadog/trace/core/DDSpanContext.java index a7c0849943e..e403efd543b 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/DDSpanContext.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/DDSpanContext.java @@ -774,7 +774,7 @@ public void setSpanKindOrdinal(String kind) { spanKindOrdinal = spanKindOrdinalOf(kind); } - static byte spanKindOrdinalOf(String kind) { + public static byte spanKindOrdinalOf(String kind) { if (kind == null) { return SPAN_KIND_UNSET; } else if (tagEquals(kind, Tags.SPAN_KIND_SERVER)) { diff --git a/dd-trace-core/src/main/java/datadog/trace/core/SpanKindFilter.java b/dd-trace-core/src/main/java/datadog/trace/core/SpanKindFilter.java index 39ca3031039..600e0d9ca47 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/SpanKindFilter.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/SpanKindFilter.java @@ -50,6 +50,11 @@ public static final Builder builder() { /** Test whether a span with the given span.kind string passes this filter. */ public boolean matches(String spanKind) { - return (kindMask & (1 << DDSpanContext.spanKindOrdinalOf(spanKind))) != 0; + return matches(DDSpanContext.spanKindOrdinalOf(spanKind)); + } + + /** Fast-path test for callers that already hold the span's cached kind ordinal. */ + public boolean matches(byte spanKindOrdinal) { + return (kindMask & (1 << spanKindOrdinal)) != 0; } } diff --git a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/SimpleSpan.groovy b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/SimpleSpan.groovy index 61c8597129c..2fd8554d499 100644 --- a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/SimpleSpan.groovy +++ b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/SimpleSpan.groovy @@ -4,6 +4,7 @@ import datadog.trace.api.DDSpanId import datadog.trace.api.DDTraceId import datadog.trace.bootstrap.instrumentation.api.Tags import datadog.trace.core.CoreSpan +import datadog.trace.core.DDSpanContext import datadog.trace.core.MetadataConsumer import datadog.trace.core.SpanKindFilter @@ -26,6 +27,8 @@ class SimpleSpan implements CoreSpan { private final Map tags = [:] + private byte spanKindOrdinal = 0 // SPAN_KIND_UNSET + SimpleSpan( String serviceName, String operationName, @@ -173,6 +176,9 @@ class SimpleSpan implements CoreSpan { @Override SimpleSpan setTag(String tag, Object value) { tags.put(tag, value) + if (Tags.SPAN_KIND == tag) { + spanKindOrdinal = DDSpanContext.spanKindOrdinalOf(value == null ? null : value.toString()) + } return this } @@ -215,8 +221,7 @@ class SimpleSpan implements CoreSpan { @Override boolean isKind(SpanKindFilter filter) { - def kind = tags.get(Tags.SPAN_KIND) - return filter.matches(kind == null ? null : kind.toString()) + return filter.matches(spanKindOrdinal) } @Override From a02d0a9cb1b8d67d807160cd4c7a796c627143a2 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Fri, 15 May 2026 13:13:07 -0400 Subject: [PATCH 04/22] Add DDSpan-based variant of ConflatingMetricsAggregator JMH benchmark The existing ConflatingMetricsAggregatorBenchmark uses SimpleSpan, a groovy mock. That's enough for measuring queue/CHM/MetricKey work, but it conceals the production cost of CoreSpan.isKind: SimpleSpan's isKind goes through groovy interface dispatch into SpanKindFilter.matches, while DDSpan.isKind inlines to a context byte-read + bit-test. This new benchmark uses real DDSpan instances created through a CoreTracer (with a NoopWriter so finishing doesn't reach the agent). Same shape as the SimpleSpan bench (64-span trace, span.kind=client, peer.hostname set). Numbers (2 forks x 5 iter x 15s): master: 6.428 +- 0.189 us/op (HashSet eligibility checks) this branch: 6.343 +- 0.115 us/op (SpanKindFilter bitmask) About 1.3% faster on the production path. The SimpleSpan benchmark in the same conditions shows a ~2.2% slowdown -- the mock's dispatch shape gives a misleading signal. Co-Authored-By: Claude Opus 4.7 (1M context) --- ...atingMetricsAggregatorDDSpanBenchmark.java | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorDDSpanBenchmark.java diff --git a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorDDSpanBenchmark.java b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorDDSpanBenchmark.java new file mode 100644 index 00000000000..02c6aaffc1a --- /dev/null +++ b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorDDSpanBenchmark.java @@ -0,0 +1,98 @@ +package datadog.trace.common.metrics; + +import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND; +import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND_CLIENT; +import static java.util.concurrent.TimeUnit.MICROSECONDS; +import static java.util.concurrent.TimeUnit.SECONDS; + +import datadog.communication.ddagent.DDAgentFeaturesDiscovery; +import datadog.trace.api.WellKnownTags; +import datadog.trace.common.writer.Writer; +import datadog.trace.core.CoreSpan; +import datadog.trace.core.CoreTracer; +import datadog.trace.core.DDSpan; +import datadog.trace.core.monitor.HealthMetrics; +import datadog.trace.util.Strings; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Parallels {@link ConflatingMetricsAggregatorBenchmark} but uses real {@link DDSpan} instances + * instead of the lightweight {@code SimpleSpan} mock, so the JIT exercises the production {@link + * CoreSpan#isKind} path (cached span.kind ordinal + bit-test) rather than the groovy mock's + * dispatch. + */ +@State(Scope.Benchmark) +@Warmup(iterations = 1, time = 30, timeUnit = SECONDS) +@Measurement(iterations = 3, time = 30, timeUnit = SECONDS) +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(MICROSECONDS) +@Fork(value = 1) +public class ConflatingMetricsAggregatorDDSpanBenchmark { + + private static final CoreTracer TRACER = + CoreTracer.builder().writer(new NoopWriter()).strictTraceWrites(false).build(); + + private final DDAgentFeaturesDiscovery featuresDiscovery = + new ConflatingMetricsAggregatorBenchmark.FixedAgentFeaturesDiscovery( + Collections.singleton("peer.hostname"), Collections.emptySet()); + private final ConflatingMetricsAggregator aggregator = + new ConflatingMetricsAggregator( + new WellKnownTags("", "", "", "", "", ""), + Collections.emptySet(), + featuresDiscovery, + HealthMetrics.NO_OP, + new ConflatingMetricsAggregatorBenchmark.NullSink(), + 2048, + 2048, + false); + private final List> spans = generateTrace(64); + + static List> generateTrace(int len) { + final List> trace = new ArrayList<>(); + for (int i = 0; i < len; i++) { + DDSpan span = (DDSpan) TRACER.startSpan("benchmark", "op"); + span.setTag(SPAN_KIND, SPAN_KIND_CLIENT); + span.setTag("peer.hostname", Strings.random(10)); + // Fix duration; bypasses the wall clock and avoids per-fork drift. + span.finishWithDuration(10); + trace.add(span); + } + return trace; + } + + static class NoopWriter implements Writer { + @Override + public void write(List trace) {} + + @Override + public void start() {} + + @Override + public boolean flush() { + return true; + } + + @Override + public void close() {} + + @Override + public void incrementDropCounts(int spanCount) {} + } + + @Benchmark + public void benchmark(Blackhole blackhole) { + blackhole.consume(aggregator.publish(spans)); + } +} From ed38f18c4100ff1b1bde377d4c098dce17ad79f2 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Fri, 15 May 2026 13:19:21 -0400 Subject: [PATCH 05/22] Tighten SpanKindFilter encapsulation Make SpanKindFilter.kindMask and its constructor private now that DDSpan.isKind no longer needs direct field access -- it delegates to SpanKindFilter.matches(byte). The Builder.build() in the same outer class still constructs instances via the private constructor. Co-Authored-By: Claude Opus 4.7 (1M context) --- dd-trace-core/src/main/java/datadog/trace/core/DDSpan.java | 2 +- .../src/main/java/datadog/trace/core/SpanKindFilter.java | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dd-trace-core/src/main/java/datadog/trace/core/DDSpan.java b/dd-trace-core/src/main/java/datadog/trace/core/DDSpan.java index ab074d8d4c8..4c438e1c915 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/DDSpan.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/DDSpan.java @@ -960,7 +960,7 @@ public boolean isOutbound() { } public boolean isKind(SpanKindFilter filter) { - return (filter.kindMask & (1 << context.getSpanKindOrdinal())) != 0; + return filter.matches(context.getSpanKindOrdinal()); } @Override diff --git a/dd-trace-core/src/main/java/datadog/trace/core/SpanKindFilter.java b/dd-trace-core/src/main/java/datadog/trace/core/SpanKindFilter.java index 600e0d9ca47..9ac3fa9dc06 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/SpanKindFilter.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/SpanKindFilter.java @@ -42,9 +42,9 @@ public static final Builder builder() { return new Builder(); } - final int kindMask; + private final int kindMask; - SpanKindFilter(int kindMask) { + private SpanKindFilter(int kindMask) { this.kindMask = kindMask; } From 034afc0b1a708190778a408380c85724a00a76e9 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Fri, 15 May 2026 13:50:28 -0400 Subject: [PATCH 06/22] Defer MetricKey construction and cache lookups to the aggregator thread Replace the producer-side conflation pipeline with a thin per-span SpanSnapshot posted to the existing aggregator thread. The aggregator now builds the MetricKey, does the SERVICE_NAMES / SPAN_KINDS / PEER_TAGS_CACHE lookups, and updates the AggregateMetric directly -- all off the producer's hot path. What the producer does now, per span: - filter (shouldComputeMetric, resource-ignored, longRunning) - collect tag values into a SpanSnapshot (1 allocation per span) - inbox.offer(snapshot) + return error flag for forceKeep What moved off the producer: - MetricKey construction and its hash computation - SERVICE_NAMES.computeIfAbsent (UTF8 encoding of service name) - SPAN_KINDS.computeIfAbsent (UTF8 encoding of span.kind) - PEER_TAGS_CACHE lookups (peer-tag name+value UTF8 encoding) - pending/keys ConcurrentHashMap operations - Batch pooling, batch atomic ops, batch contributeTo Removed entirely: - Batch.java -- the conflation primitive is no longer needed; the aggregator's existing LRUCache IS the conflation point now. - pending ConcurrentHashMap - keys ConcurrentHashMap (canonical dedup) - batchPool MessagePassingQueue - The CommonKeyCleaner role of tracking keys.keySet() on LRU eviction -- AggregateExpiry now just reports drops to healthMetrics. Added: - SpanSnapshot: immutable value carrying the raw MetricKey inputs + a tagAndDuration long (duration | ERROR_TAG | TOP_LEVEL_TAG). - AggregateMetric.recordOneDuration(long tagAndDuration) -- the single-hit equivalent of the existing recordDurations(int, AtomicLongArray). - Peer-tag values flow through the snapshot as a flattened String[] of [name0, value0, name1, value1, ...]; the aggregator encodes them through PEER_TAGS_CACHE on its own thread. Benchmark results (2 forks x 5 iter x 15s): ConflatingMetricsAggregatorDDSpanBenchmark prior commit 6.343 +- 0.115 us/op this commit 2.506 +- 0.044 us/op (~60% faster) ConflatingMetricsAggregatorBenchmark (SimpleSpan) prior commit 6.585 +- 0.049 us/op this commit 3.116 +- 0.032 us/op (~53% faster) Caveat on the benchmark: without conflation, the producer pushes 1 inbox item per span instead of ~1 per 64. At the benchmark's synthetic rate the consumer can't keep up and inbox.offer silently drops. The numbers measure producer publish() latency only; consumer throughput at realistic span rates is a follow-up to validate. Tuning maxPending matters more in this design. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../trace/common/metrics/AggregateMetric.java | 21 +++ .../trace/common/metrics/Aggregator.java | 121 ++++++++++------- .../datadog/trace/common/metrics/Batch.java | 90 ------------- .../metrics/ConflatingMetricsAggregator.java | 125 ++++++------------ .../trace/common/metrics/SpanSnapshot.java | 65 +++++++++ .../common/metrics/AggregateMetricTest.groovy | 97 +------------- 6 files changed, 202 insertions(+), 317 deletions(-) delete mode 100644 dd-trace-core/src/main/java/datadog/trace/common/metrics/Batch.java create mode 100644 dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateMetric.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateMetric.java index 478ff520a37..dba66a5ab9c 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateMetric.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateMetric.java @@ -46,6 +46,27 @@ public AggregateMetric recordDurations(int count, AtomicLongArray durations) { return this; } + /** + * Records a single hit. {@code tagAndDuration} carries the duration nanos with optional {@link + * #ERROR_TAG} / {@link #TOP_LEVEL_TAG} bits OR-ed in. + */ + public AggregateMetric recordOneDuration(long tagAndDuration) { + ++hitCount; + if ((tagAndDuration & TOP_LEVEL_TAG) == TOP_LEVEL_TAG) { + tagAndDuration ^= TOP_LEVEL_TAG; + ++topLevelCount; + } + if ((tagAndDuration & ERROR_TAG) == ERROR_TAG) { + tagAndDuration ^= ERROR_TAG; + errorLatencies.accept(tagAndDuration); + ++errorCount; + } else { + okLatencies.accept(tagAndDuration); + } + duration += tagAndDuration; + return this; + } + public int getErrorCount() { return errorCount; } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java index 8a69dbc6e56..e632555cc21 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java @@ -1,16 +1,26 @@ package datadog.trace.common.metrics; +import static datadog.trace.api.Functions.UTF8_ENCODE; +import static datadog.trace.common.metrics.ConflatingMetricsAggregator.PEER_TAGS_CACHE; +import static datadog.trace.common.metrics.ConflatingMetricsAggregator.PEER_TAGS_CACHE_ADDER; +import static datadog.trace.common.metrics.ConflatingMetricsAggregator.SERVICE_NAMES; +import static datadog.trace.common.metrics.ConflatingMetricsAggregator.SPAN_KINDS; import static java.util.concurrent.TimeUnit.MILLISECONDS; +import datadog.trace.api.Pair; +import datadog.trace.api.cache.DDCache; +import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; import datadog.trace.common.metrics.SignalItem.StopSignal; import datadog.trace.core.monitor.HealthMetrics; import datadog.trace.core.util.LRUCache; import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; +import java.util.ArrayList; +import java.util.Collections; import java.util.Iterator; +import java.util.List; import java.util.Map; -import java.util.Set; -import java.util.concurrent.ConcurrentMap; import java.util.concurrent.TimeUnit; +import java.util.function.Function; import org.jctools.queues.MessagePassingQueue; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -21,11 +31,8 @@ final class Aggregator implements Runnable { private static final Logger log = LoggerFactory.getLogger(Aggregator.class); - private final MessagePassingQueue batchPool; private final MessagePassingQueue inbox; private final LRUCache aggregates; - private final ConcurrentMap pending; - private final Set commonKeys; private final MetricWriter writer; // the reporting interval controls how much history will be buffered // when the agent is unresponsive (only 10 pending requests will be @@ -41,20 +48,14 @@ final class Aggregator implements Runnable { Aggregator( MetricWriter writer, - MessagePassingQueue batchPool, MessagePassingQueue inbox, - ConcurrentMap pending, - final Set commonKeys, int maxAggregates, long reportingInterval, TimeUnit reportingIntervalTimeUnit, HealthMetrics healthMetrics) { this( writer, - batchPool, inbox, - pending, - commonKeys, maxAggregates, reportingInterval, reportingIntervalTimeUnit, @@ -64,30 +65,37 @@ final class Aggregator implements Runnable { Aggregator( MetricWriter writer, - MessagePassingQueue batchPool, MessagePassingQueue inbox, - ConcurrentMap pending, - final Set commonKeys, int maxAggregates, long reportingInterval, TimeUnit reportingIntervalTimeUnit, long sleepMillis, HealthMetrics healthMetrics) { this.writer = writer; - this.batchPool = batchPool; this.inbox = inbox; - this.commonKeys = commonKeys; this.aggregates = new LRUCache<>( - new CommonKeyCleaner(commonKeys, healthMetrics), - maxAggregates * 4 / 3, - 0.75f, - maxAggregates); - this.pending = pending; + new AggregateExpiry(healthMetrics), maxAggregates * 4 / 3, 0.75f, maxAggregates); this.reportingIntervalNanos = reportingIntervalTimeUnit.toNanos(reportingInterval); this.sleepMillis = sleepMillis; } + private static final class AggregateExpiry + implements LRUCache.ExpiryListener { + private final HealthMetrics healthMetrics; + + AggregateExpiry(HealthMetrics healthMetrics) { + this.healthMetrics = healthMetrics; + } + + @Override + public void accept(Map.Entry expired) { + if (expired.getValue().getHitCount() > 0) { + healthMetrics.onStatsAggregateDropped(); + } + } + } + public void clearAggregates() { this.aggregates.clear(); } @@ -129,20 +137,54 @@ public void accept(InboxItem item) { } else { signal.ignore(); } - } else if (item instanceof Batch && !stopped) { - Batch batch = (Batch) item; - MetricKey key = batch.getKey(); - // important that it is still *this* batch pending, must not remove otherwise - pending.remove(key, batch); + } else if (item instanceof SpanSnapshot && !stopped) { + SpanSnapshot snapshot = (SpanSnapshot) item; + MetricKey key = buildMetricKey(snapshot); AggregateMetric aggregate = aggregates.computeIfAbsent(key, k -> new AggregateMetric()); - batch.contributeTo(aggregate); + aggregate.recordOneDuration(snapshot.tagAndDuration); dirty = true; - // return the batch for reuse - batchPool.offer(batch); } } } + private static MetricKey buildMetricKey(SpanSnapshot s) { + return new MetricKey( + s.resourceName, + SERVICE_NAMES.computeIfAbsent(s.serviceName, UTF8_ENCODE), + s.operationName, + s.serviceNameSource, + s.spanType, + s.httpStatusCode, + s.synthetic, + s.traceRoot, + SPAN_KINDS.computeIfAbsent(s.spanKind, UTF8BytesString::create), + materializePeerTags(s.peerTagPairs), + s.httpMethod, + s.httpEndpoint, + s.grpcStatusCode); + } + + private static List materializePeerTags(String[] pairs) { + if (pairs == null || pairs.length == 0) { + return Collections.emptyList(); + } + if (pairs.length == 2) { + // single-entry fast path (matches the original singletonList shape for INTERNAL spans) + return Collections.singletonList(encodePeerTag(pairs[0], pairs[1])); + } + List tags = new ArrayList<>(pairs.length / 2); + for (int i = 0; i < pairs.length; i += 2) { + tags.add(encodePeerTag(pairs[i], pairs[i + 1])); + } + return tags; + } + + private static UTF8BytesString encodePeerTag(String name, String value) { + final Pair, Function> + cacheAndCreator = PEER_TAGS_CACHE.computeIfAbsent(name, PEER_TAGS_CACHE_ADDER); + return cacheAndCreator.getLeft().computeIfAbsent(value, cacheAndCreator.getRight()); + } + private void report(long when, SignalItem signal) { boolean skipped = true; if (dirty) { @@ -177,7 +219,6 @@ private void expungeStaleAggregates() { AggregateMetric metric = pair.getValue(); if (metric.getHitCount() == 0) { it.remove(); - commonKeys.remove(pair.getKey()); } } } @@ -185,24 +226,4 @@ private void expungeStaleAggregates() { private long wallClockTime() { return MILLISECONDS.toNanos(System.currentTimeMillis()); } - - private static final class CommonKeyCleaner - implements LRUCache.ExpiryListener { - - private final Set commonKeys; - private final HealthMetrics healthMetrics; - - private CommonKeyCleaner(Set commonKeys, HealthMetrics healthMetrics) { - this.commonKeys = commonKeys; - this.healthMetrics = healthMetrics; - } - - @Override - public void accept(Map.Entry expired) { - commonKeys.remove(expired.getKey()); - if (expired.getValue().getHitCount() > 0) { - healthMetrics.onStatsAggregateDropped(); - } - } - } } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Batch.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Batch.java deleted file mode 100644 index 5f103805e98..00000000000 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Batch.java +++ /dev/null @@ -1,90 +0,0 @@ -package datadog.trace.common.metrics; - -import java.util.concurrent.atomic.AtomicIntegerFieldUpdater; -import java.util.concurrent.atomic.AtomicLongArray; - -/** - * This is a thread-safe container for partial conflating and accumulating partial aggregates on the - * same key. - * - *

Updates to an already consumed batch are rejected. - * - *

A batch can currently take at most 64 values. Attempts to add the 65th update will be - * rejected. - */ -public final class Batch implements InboxItem { - - private static final int MAX_BATCH_SIZE = 64; - private static final AtomicIntegerFieldUpdater COUNT = - AtomicIntegerFieldUpdater.newUpdater(Batch.class, "count"); - private static final AtomicIntegerFieldUpdater COMMITTED = - AtomicIntegerFieldUpdater.newUpdater(Batch.class, "committed"); - - /** - * This counter has two states: - * - *

    - *
  1. negative: the batch has been used, must not add values - *
  2. otherwise: the number of values added to the batch - *
- */ - private volatile int count = 0; - - /** incremented when a duration has been added. */ - private volatile int committed = 0; - - private MetricKey key; - private final AtomicLongArray durations; - - Batch(MetricKey key) { - this(new AtomicLongArray(MAX_BATCH_SIZE)); - this.key = key; - } - - Batch() { - this(new AtomicLongArray(MAX_BATCH_SIZE)); - } - - private Batch(AtomicLongArray durations) { - this.durations = durations; - } - - public MetricKey getKey() { - return key; - } - - public Batch reset(MetricKey key) { - this.key = key; - COUNT.lazySet(this, 0); - return this; - } - - public boolean isUsed() { - return count < 0; - } - - public boolean add(long tag, long durationNanos) { - // technically this would be wrong if there were 2^31 unsuccessful - // attempts to add a value, but this an acceptable risk - int position = COUNT.getAndIncrement(this); - if (position >= 0 && position < durations.length()) { - durations.set(position, tag | durationNanos); - COMMITTED.getAndIncrement(this); - return true; - } - return false; - } - - public void contributeTo(AggregateMetric aggregate) { - int count = Math.min(COUNT.getAndSet(this, Integer.MIN_VALUE), MAX_BATCH_SIZE); - if (count >= 0) { - // wait for the duration to have been set. - // note this mechanism only supports a single reader - while (committed != count) { - Thread.yield(); - } - COMMITTED.lazySet(this, 0); - aggregate.recordDurations(count, durations); - } - } -} diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java index fee2f9a7748..8268085e269 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java @@ -3,7 +3,6 @@ import static datadog.communication.ddagent.DDAgentFeaturesDiscovery.V06_METRICS_ENDPOINT; import static datadog.trace.api.DDSpanTypes.RPC; import static datadog.trace.api.DDTags.BASE_SERVICE; -import static datadog.trace.api.Functions.UTF8_ENCODE; import static datadog.trace.bootstrap.instrumentation.api.Tags.HTTP_ENDPOINT; import static datadog.trace.bootstrap.instrumentation.api.Tags.HTTP_METHOD; import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND; @@ -33,13 +32,11 @@ import datadog.trace.core.SpanKindFilter; import datadog.trace.core.monitor.HealthMetrics; import datadog.trace.util.AgentTaskScheduler; -import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.function.Function; @@ -54,18 +51,16 @@ public final class ConflatingMetricsAggregator implements MetricsAggregator, Eve private static final Map DEFAULT_HEADERS = Collections.singletonMap(DDAgentApi.DATADOG_META_TRACER_VERSION, DDTraceCoreInfo.VERSION); - private static final DDCache SERVICE_NAMES = - DDCaches.newFixedSizeCache(32); + static final DDCache SERVICE_NAMES = DDCaches.newFixedSizeCache(32); - private static final DDCache SPAN_KINDS = - DDCaches.newFixedSizeCache(16); - private static final DDCache< + static final DDCache SPAN_KINDS = DDCaches.newFixedSizeCache(16); + static final DDCache< String, Pair, Function>> PEER_TAGS_CACHE = DDCaches.newFixedSizeCache( 64); // it can be unbounded since those values are returned by the agent and should be // under control. 64 entries is enough in this case to contain all the peer tags. - private static final Function< + static final Function< String, Pair, Function>> PEER_TAGS_CACHE_ADDER = key -> @@ -89,9 +84,6 @@ public final class ConflatingMetricsAggregator implements MetricsAggregator, Eve SpanKindFilter.builder().includeInternal().build(); private final Set ignoredResources; - private final MessagePassingQueue batchPool; - private final ConcurrentHashMap pending; - private final ConcurrentHashMap keys; private final Thread thread; private final MessagePassingQueue inbox; private final Sink sink; @@ -185,23 +177,12 @@ public ConflatingMetricsAggregator( this.ignoredResources = ignoredResources; this.includeEndpointInMetrics = includeEndpointInMetrics; this.inbox = Queues.mpscArrayQueue(queueSize); - this.batchPool = Queues.spmcArrayQueue(maxAggregates); - this.pending = new ConcurrentHashMap<>(maxAggregates * 4 / 3); - this.keys = new ConcurrentHashMap<>(); this.features = features; this.healthMetrics = healthMetric; this.sink = sink; this.aggregator = new Aggregator( - metricWriter, - batchPool, - inbox, - pending, - keys.keySet(), - maxAggregates, - reportingInterval, - timeUnit, - healthMetric); + metricWriter, inbox, maxAggregates, reportingInterval, timeUnit, healthMetric); this.thread = newAgentThread(METRICS_AGGREGATOR, aggregator); this.reportingInterval = reportingInterval; this.reportingIntervalTimeUnit = timeUnit; @@ -328,99 +309,71 @@ private boolean publish(CoreSpan span, boolean isTopLevel) { // CharSequence default keeps unsafeGetTag's generic at CharSequence so UTF8BytesString // tag values don't trigger a ClassCastException on the String assignment. final String spanKind = span.unsafeGetTag(SPAN_KIND, (CharSequence) "").toString(); - MetricKey newKey = - new MetricKey( + + boolean error = span.getError() > 0; + long tagAndDuration = + span.getDurationNano() | (error ? ERROR_TAG : 0L) | (isTopLevel ? TOP_LEVEL_TAG : 0L); + + SpanSnapshot snapshot = + new SpanSnapshot( span.getResourceName(), - SERVICE_NAMES.computeIfAbsent(span.getServiceName(), UTF8_ENCODE), + span.getServiceName(), span.getOperationName(), span.getServiceNameSource(), spanType, span.getHttpStatusCode(), isSynthetic(span), span.getParentId() == 0, - SPAN_KINDS.computeIfAbsent( - spanKind, UTF8BytesString::create), // save repeated utf8 conversions - getPeerTags(span), + spanKind, + extractPeerTagPairs(span), httpMethod, httpEndpoint, - grpcStatusCode); - MetricKey key = keys.putIfAbsent(newKey, newKey); - if (null == key) { - key = newKey; - } - long tag = (span.getError() > 0 ? ERROR_TAG : 0L) | (isTopLevel ? TOP_LEVEL_TAG : 0L); - long durationNanos = span.getDurationNano(); - Batch batch = pending.get(key); - if (null != batch) { - // there is a pending batch, try to win the race to add to it - // returning false means that either the batch can't take any - // more data, or it has already been consumed - if (batch.add(tag, durationNanos)) { - // added to a pending batch prior to consumption, - // so skip publishing to the queue (we also know - // the key isn't rare enough to override the sampler) - return false; - } - // recycle the older key - key = batch.getKey(); - } - batch = newBatch(key); - batch.add(tag, durationNanos); - // overwrite the last one if present, it was already full - // or had been consumed by the time we tried to add to it - pending.put(key, batch); - // must offer to the queue after adding to pending - inbox.offer(batch); + grpcStatusCode, + tagAndDuration); + inbox.offer(snapshot); // force keep keys if there are errors - return span.getError() > 0; + return error; } - private List getPeerTags(CoreSpan span) { + private String[] extractPeerTagPairs(CoreSpan span) { if (span.isKind(PEER_AGGREGATION_KINDS)) { final Set eligiblePeerTags = features.peerTags(); - List peerTags = null; + String[] pairs = null; + int count = 0; for (String peerTag : eligiblePeerTags) { Object value = span.unsafeGetTag(peerTag); if (value != null) { - final Pair, Function> - cacheAndCreator = PEER_TAGS_CACHE.computeIfAbsent(peerTag, PEER_TAGS_CACHE_ADDER); - if (peerTags == null) { - peerTags = new ArrayList<>(eligiblePeerTags.size()); + if (pairs == null) { + // pairs are flattened [name, value, ...]; size for worst case + pairs = new String[eligiblePeerTags.size() * 2]; } - peerTags.add( - cacheAndCreator - .getLeft() - .computeIfAbsent(value.toString(), cacheAndCreator.getRight())); + pairs[count++] = peerTag; + pairs[count++] = value.toString(); } } - return peerTags == null ? Collections.emptyList() : peerTags; + if (pairs == null) { + return null; + } + if (count < pairs.length) { + String[] trimmed = new String[count]; + System.arraycopy(pairs, 0, trimmed, 0, count); + return trimmed; + } + return pairs; } else if (span.isKind(INTERNAL_KIND)) { // in this case only the base service should be aggregated if present final Object baseService = span.unsafeGetTag(BASE_SERVICE); if (baseService != null) { - final Pair, Function> - cacheAndCreator = PEER_TAGS_CACHE.computeIfAbsent(BASE_SERVICE, PEER_TAGS_CACHE_ADDER); - return Collections.singletonList( - cacheAndCreator - .getLeft() - .computeIfAbsent(baseService.toString(), cacheAndCreator.getRight())); + return new String[] {BASE_SERVICE, baseService.toString()}; } } - return Collections.emptyList(); + return null; } private static boolean isSynthetic(CoreSpan span) { return span.getOrigin() != null && SYNTHETICS_ORIGIN.equals(span.getOrigin().toString()); } - private Batch newBatch(MetricKey key) { - Batch batch = batchPool.poll(); - if (null == batch) { - return new Batch(key); - } - return batch.reset(key); - } - public void stop() { if (null != cancellation) { cancellation.cancel(); @@ -463,8 +416,6 @@ private void disable() { features.discover(); if (!features.supportsMetrics()) { log.debug("Disabling metric reporting because an agent downgrade was detected"); - this.pending.clear(); - this.batchPool.clear(); this.inbox.clear(); this.aggregator.clearAggregates(); } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java new file mode 100644 index 00000000000..2816fad0411 --- /dev/null +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java @@ -0,0 +1,65 @@ +package datadog.trace.common.metrics; + +/** + * Immutable per-span value posted from the producer to the aggregator thread. Carries the raw + * inputs the aggregator needs to build a {@link MetricKey} and update an {@link AggregateMetric}. + * + *

All cache-canonicalization (service-name, span-kind, peer-tag string interning) happens on the + * aggregator thread; the producer just shuffles references. + */ +final class SpanSnapshot implements InboxItem { + + final CharSequence resourceName; + final String serviceName; + final CharSequence operationName; + final CharSequence serviceNameSource; + final CharSequence spanType; + final short httpStatusCode; + final boolean synthetic; + final boolean traceRoot; + final String spanKind; + + /** + * Flattened name/value pairs of peer-tag matches: {@code [name0, value0, name1, value1, ...]}. + * {@code null} when there are no matches (the common case). + */ + final String[] peerTagPairs; + + final String httpMethod; + final String httpEndpoint; + final String grpcStatusCode; + + /** Duration in nanoseconds, OR-ed with {@code ERROR_TAG} / {@code TOP_LEVEL_TAG} as needed. */ + final long tagAndDuration; + + SpanSnapshot( + CharSequence resourceName, + String serviceName, + CharSequence operationName, + CharSequence serviceNameSource, + CharSequence spanType, + short httpStatusCode, + boolean synthetic, + boolean traceRoot, + String spanKind, + String[] peerTagPairs, + String httpMethod, + String httpEndpoint, + String grpcStatusCode, + long tagAndDuration) { + this.resourceName = resourceName; + this.serviceName = serviceName; + this.operationName = operationName; + this.serviceNameSource = serviceNameSource; + this.spanType = spanType; + this.httpStatusCode = httpStatusCode; + this.synthetic = synthetic; + this.traceRoot = traceRoot; + this.spanKind = spanKind; + this.peerTagPairs = peerTagPairs; + this.httpMethod = httpMethod; + this.httpEndpoint = httpEndpoint; + this.grpcStatusCode = grpcStatusCode; + this.tagAndDuration = tagAndDuration; + } +} diff --git a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/AggregateMetricTest.groovy b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/AggregateMetricTest.groovy index 0b245552db3..140149d8324 100644 --- a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/AggregateMetricTest.groovy +++ b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/AggregateMetricTest.groovy @@ -4,16 +4,9 @@ import datadog.metrics.agent.AgentMeter import datadog.metrics.impl.DDSketchHistograms import datadog.metrics.impl.MonitoringImpl import datadog.metrics.api.statsd.StatsDClient -import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString import datadog.trace.test.util.DDSpecification -import java.util.concurrent.BlockingDeque -import java.util.concurrent.CountDownLatch -import java.util.concurrent.ExecutorService -import java.util.concurrent.Executors -import java.util.concurrent.LinkedBlockingDeque import java.util.concurrent.TimeUnit -import java.util.concurrent.atomic.AtomicInteger import java.util.concurrent.atomic.AtomicLongArray import static datadog.trace.common.metrics.AggregateMetric.ERROR_TAG @@ -61,43 +54,16 @@ class AggregateMetricTest extends DDSpecification { aggregate.getHitCount() == 0 } - def "contribute batch with key to aggregate"() { + def "recordOneDuration accumulates ok and error and top-level"() { given: - AggregateMetric aggregate = new AggregateMetric().recordDurations(3, new AtomicLongArray(0L, 0L, 0L | ERROR_TAG | TOP_LEVEL_TAG)) - - Batch batch = new Batch().reset(new MetricKey("foo", "bar", "qux", null, "type", 0, false, true, "corge", [UTF8BytesString.create("grault:quux")], null, null, null)) - batch.add(0L, 10) - batch.add(0L, 10) - batch.add(0L, 10) - - when: - batch.contributeTo(aggregate) + AggregateMetric aggregate = new AggregateMetric() + .recordOneDuration(10L) + .recordOneDuration(10L | TOP_LEVEL_TAG) + .recordOneDuration(10L | ERROR_TAG) - then: "batch used and values contributed to existing aggregate" - batch.isUsed() + expect: + aggregate.getHitCount() == 3 aggregate.getDuration() == 30 - aggregate.getHitCount() == 6 - aggregate.getErrorCount() == 1 - aggregate.getTopLevelCount() == 1 - } - - def "ignore used batches"() { - given: - AggregateMetric aggregate = new AggregateMetric().recordDurations(10, - new AtomicLongArray(1L, 1L, 1L, 1L, 1L, 1L, 1L | TOP_LEVEL_TAG, 1L, 1L, 1L | ERROR_TAG)) - - - Batch batch = new Batch() - batch.contributeTo(aggregate) - // must be used now - batch.add(0L, 10) - - when: - batch.contributeTo(aggregate) - - then: "batch ignored" - aggregate.getDuration() == 10 - aggregate.getHitCount() == 10 aggregate.getErrorCount() == 1 aggregate.getTopLevelCount() == 1 } @@ -136,53 +102,4 @@ class AggregateMetricTest extends DDSpecification { errorLatencies.getMaxValue() >= 99 okLatencies.getMaxValue() <= 5 } - - def "consistent under concurrent attempts to read and write"() { - given: - AggregateMetric aggregate = new AggregateMetric() - MetricKey key = new MetricKey("foo", "bar", "qux", null, "type", 0, false, true, "corge", [UTF8BytesString.create("grault:quux")], null, null, null) - BlockingDeque queue = new LinkedBlockingDeque<>(1000) - ExecutorService reader = Executors.newSingleThreadExecutor() - int writerCount = 10 - ExecutorService writers = Executors.newFixedThreadPool(writerCount) - CountDownLatch readerLatch = new CountDownLatch(1) - CountDownLatch writerLatch = new CountDownLatch(writerCount) - CountDownLatch queueEmptyLatch = new CountDownLatch(1) - - AtomicInteger written = new AtomicInteger(0) - - when: - for (int i = 0; i < writerCount; ++i) { - writers.submit({ - readerLatch.await() - for (int j = 0; j < 10_000; ++j) { - Batch batch = queue.peekLast() - if (batch?.add(0L, 1)) { - written.incrementAndGet() - } else { - queue.offer(new Batch().reset(key)) - } - } - writerLatch.countDown() - }) - } - def future = reader.submit({ - readerLatch.countDown() - while (!Thread.currentThread().isInterrupted()) { - Batch batch = queue.poll(100, TimeUnit.MILLISECONDS) - if (null == batch && writerLatch.count == 0) { - queueEmptyLatch.countDown() - } else if (null != batch) { - batch.contributeTo(aggregate) - } - } - }) - assert writerLatch.await(10, TimeUnit.SECONDS) - // Wait here until we know that the queue is empty - assert queueEmptyLatch.await(10, TimeUnit.SECONDS) - future.cancel(true) - - then: - aggregate.getHitCount() == written.get() - } } From 3a056b3820644e922b9a0c28f9eadbfe8715032e Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Fri, 15 May 2026 13:55:59 -0400 Subject: [PATCH 07/22] Report aggregator inbox-full drops via health metrics With the per-span SpanSnapshot inbox path, the producer can lose snapshots when the bounded MPSC queue is full -- silently, since inbox.offer() returns a boolean we previously ignored. The conflating-Batch design used to absorb ~64x more producer pressure per inbox slot, so this is a new failure mode worth surfacing. Wire it through the existing HealthMetrics path: - HealthMetrics.onStatsInboxFull() (no-op default). - TracerHealthMetrics gets a statsInboxFull LongAdder and a new reason tag reason:inbox_full reported under the same stats.dropped_aggregates metric used for LRU evictions. Two LongAdders, two tagged time series. - ConflatingMetricsAggregator.publish increments the counter when inbox.offer(snapshot) returns false. This doesn't fix the drop -- tuning maxPending and/or building producer-side batching are the actual fixes. But it makes the failure visible in the same place ops already watches. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../metrics/ConflatingMetricsAggregator.java | 4 +++- .../trace/core/monitor/HealthMetrics.java | 5 +++++ .../trace/core/monitor/TracerHealthMetrics.java | 16 +++++++++++++++- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java index 8268085e269..9ea77140113 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java @@ -330,7 +330,9 @@ private boolean publish(CoreSpan span, boolean isTopLevel) { httpEndpoint, grpcStatusCode, tagAndDuration); - inbox.offer(snapshot); + if (!inbox.offer(snapshot)) { + healthMetrics.onStatsInboxFull(); + } // force keep keys if there are errors return error; } diff --git a/dd-trace-core/src/main/java/datadog/trace/core/monitor/HealthMetrics.java b/dd-trace-core/src/main/java/datadog/trace/core/monitor/HealthMetrics.java index 257d887029b..d1c7fe126b4 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/monitor/HealthMetrics.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/monitor/HealthMetrics.java @@ -93,6 +93,11 @@ public void onClientStatDowngraded() {} public void onStatsAggregateDropped() {} + /** + * Reports a single span whose stats snapshot was dropped because the aggregator inbox was full. + */ + public void onStatsInboxFull() {} + /** * @return Human-readable summary of the current health metrics. */ diff --git a/dd-trace-core/src/main/java/datadog/trace/core/monitor/TracerHealthMetrics.java b/dd-trace-core/src/main/java/datadog/trace/core/monitor/TracerHealthMetrics.java index 2df54241e56..76051645fcb 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/monitor/TracerHealthMetrics.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/monitor/TracerHealthMetrics.java @@ -98,6 +98,7 @@ public class TracerHealthMetrics extends HealthMetrics implements AutoCloseable private final LongAdder clientStatsDowngrades = new LongAdder(); private final LongAdder statsAggregateDropped = new LongAdder(); + private final LongAdder statsInboxFull = new LongAdder(); private final StatsDClient statsd; private final long interval; @@ -357,6 +358,11 @@ public void onStatsAggregateDropped() { statsAggregateDropped.increment(); } + @Override + public void onStatsInboxFull() { + statsInboxFull.increment(); + } + @Override public void close() { if (null != cancellation) { @@ -374,6 +380,7 @@ private static class Flush implements AgentTaskScheduler.Task Date: Tue, 19 May 2026 19:55:41 -0400 Subject: [PATCH 09/22] Skip SpanSnapshot allocation when the inbox is already at capacity publish() previously did all of the tag extraction (peer-tag pairs, HTTP method/endpoint, span kind, gRPC status) and the SpanSnapshot allocation before calling inbox.offer; on a full inbox the offer failed and everything became garbage. Early-out with an approximate size() vs capacity() check up front. The jctools MPSC queue's size() is best-effort but that's fine: under- estimation falls through to the existing offer-as-source-of-truth path, over-estimation drops a snapshot that would have fit (and onStatsInboxFull was about to fire on the next span anyway). error is computed first so the force-keep return is correct whether or not the snapshot is built. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../metrics/ConflatingMetricsAggregator.java | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java index 9ea77140113..525dc802e3c 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java @@ -290,6 +290,19 @@ private boolean shouldComputeMetric(CoreSpan span, boolean isTopLevel) { } private boolean publish(CoreSpan span, boolean isTopLevel) { + // Error decision drives force-keep sampling regardless of whether the snapshot gets queued. + boolean error = span.getError() > 0; + + // Fast-path the inbox-full case before any tag extraction or snapshot allocation. size() is + // approximate on jctools' MPSC queue but that's fine: if we under-estimate, we fall through + // and let inbox.offer be the source of truth (existing behavior); if we over-estimate, we + // drop a snapshot that would have fit -- acceptable, onStatsInboxFull was going to fire + // imminently anyway. + if (inbox.size() >= inbox.capacity()) { + healthMetrics.onStatsInboxFull(); + return error; + } + // Extract HTTP method and endpoint only if the feature is enabled String httpMethod = null; String httpEndpoint = null; @@ -310,7 +323,6 @@ private boolean publish(CoreSpan span, boolean isTopLevel) { // tag values don't trigger a ClassCastException on the String assignment. final String spanKind = span.unsafeGetTag(SPAN_KIND, (CharSequence) "").toString(); - boolean error = span.getError() > 0; long tagAndDuration = span.getDurationNano() | (error ? ERROR_TAG : 0L) | (isTopLevel ? TOP_LEVEL_TAG : 0L); From e455801bf17673b2076fa6d496a97f2534c2654a Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Wed, 20 May 2026 15:29:28 -0400 Subject: [PATCH 10/22] Introduce slim PeerTagSchema; capture peer-tag values not pairs Addresses sarahchen6's review comment on ConflatingMetricsAggregator extractPeerTagPairs: replaces the worst-case-allocation + trim-and-copy flat-pairs layout with a parallel-array carrier. - New PeerTagSchema: minimal carrier of String[] names. Two flavors -- a static INTERNAL singleton (one entry: base.service) for internal-kind spans, and per-discovery built schemas for client/producer/consumer spans. Deliberately no cardinality limiters or per-cycle state; that layers on top in a later PR. - ConflatingMetricsAggregator: caches the peer-aggregation schema keyed on reference equality of features.peerTags() -- a single volatile read + a long compare on the steady-state producer hot path, no allocation. The producer now captures only a String[] of values parallel to the schema's names; the schema reference is carried on SpanSnapshot. The prior "build worst-case pairs then trim" code is gone. - SpanSnapshot: replaces String[] peerTagPairs with PeerTagSchema + String[] peerTagValues. Producer drops the schema reference if no values fired so the consumer short-circuits on null. - Aggregator.materializePeerTags: now reads name/value pairs at the same index from (schema.names, snapshot.peerTagValues). Counts hits once for exact-size allocation; preserves the singletonList fast path for the common one-entry case (e.g. internal-kind base.service). Producer-side cost goes from "allocate String[2n] + walk + maybe trim" to "single volatile read + walk + lazy String[n] only on first hit". Co-Authored-By: Claude Opus 4.7 (1M context) --- .../trace/common/metrics/Aggregator.java | 39 ++++-- .../metrics/ConflatingMetricsAggregator.java | 114 +++++++++++++----- .../trace/common/metrics/PeerTagSchema.java | 49 ++++++++ .../trace/common/metrics/SpanSnapshot.java | 20 ++- 4 files changed, 177 insertions(+), 45 deletions(-) create mode 100644 dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java index e632555cc21..a27e14355ba 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java @@ -158,23 +158,44 @@ private static MetricKey buildMetricKey(SpanSnapshot s) { s.synthetic, s.traceRoot, SPAN_KINDS.computeIfAbsent(s.spanKind, UTF8BytesString::create), - materializePeerTags(s.peerTagPairs), + materializePeerTags(s.peerTagSchema, s.peerTagValues), s.httpMethod, s.httpEndpoint, s.grpcStatusCode); } - private static List materializePeerTags(String[] pairs) { - if (pairs == null || pairs.length == 0) { + /** + * Encodes the per-span peer-tag values into the {@code List} the {@link + * MetricKey} consumes. Reads name/value pairs at the same index from the schema's names and the + * snapshot's values; null value slots are skipped (the span didn't set that peer tag). + */ + private static List materializePeerTags(PeerTagSchema schema, String[] values) { + if (schema == null || values == null) { return Collections.emptyList(); } - if (pairs.length == 2) { - // single-entry fast path (matches the original singletonList shape for INTERNAL spans) - return Collections.singletonList(encodePeerTag(pairs[0], pairs[1])); + String[] names = schema.names; + int n = names.length; + // Single-entry fast path (matches the original singletonList shape for INTERNAL spans and any + // other case where exactly one peer tag fired). + int firstHit = -1; + int hitCount = 0; + for (int i = 0; i < n; i++) { + if (values[i] != null) { + if (hitCount == 0) firstHit = i; + hitCount++; + } + } + if (hitCount == 0) { + return Collections.emptyList(); } - List tags = new ArrayList<>(pairs.length / 2); - for (int i = 0; i < pairs.length; i += 2) { - tags.add(encodePeerTag(pairs[i], pairs[i + 1])); + if (hitCount == 1) { + return Collections.singletonList(encodePeerTag(names[firstHit], values[firstHit])); + } + List tags = new ArrayList<>(hitCount); + for (int i = firstHit; i < n; i++) { + if (values[i] != null) { + tags.add(encodePeerTag(names[i], values[i])); + } } return tags; } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java index 525dc802e3c..50b11aa3e08 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java @@ -2,7 +2,6 @@ import static datadog.communication.ddagent.DDAgentFeaturesDiscovery.V06_METRICS_ENDPOINT; import static datadog.trace.api.DDSpanTypes.RPC; -import static datadog.trace.api.DDTags.BASE_SERVICE; import static datadog.trace.bootstrap.instrumentation.api.Tags.HTTP_ENDPOINT; import static datadog.trace.bootstrap.instrumentation.api.Tags.HTTP_METHOD; import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND; @@ -94,6 +93,21 @@ public final class ConflatingMetricsAggregator implements MetricsAggregator, Eve private final HealthMetrics healthMetrics; private final boolean includeEndpointInMetrics; + /** + * Cached peer-aggregation schema, keyed by reference equality of the {@code Set} returned + * by {@link DDAgentFeaturesDiscovery#peerTags()}. {@code DDAgentFeaturesDiscovery} caches the Set + * on its current state, so reference identity changes exactly when discovery replaces state with + * a new tag configuration -- a single volatile read + a reference compare on the steady-state hot + * path. The {@code synchronized} refresh is the only allocator on a miss. + * + *

Both fields are written together inside the synchronized block, but read independently -- + * the reference-equality check on the source Set is what guards against using a stale schema, so + * tearing on the schema field alone is not a correctness concern. + */ + private volatile Set cachedPeerTagsSource; + + private volatile PeerTagSchema cachedPeerTagSchema; + private volatile AgentTaskScheduler.Scheduled cancellation; public ConflatingMetricsAggregator( @@ -326,6 +340,15 @@ private boolean publish(CoreSpan span, boolean isTopLevel) { long tagAndDuration = span.getDurationNano() | (error ? ERROR_TAG : 0L) | (isTopLevel ? TOP_LEVEL_TAG : 0L); + PeerTagSchema peerTagSchema = peerTagSchemaFor(span); + String[] peerTagValues = + peerTagSchema == null ? null : capturePeerTagValues(span, peerTagSchema); + if (peerTagValues == null) { + // No tags fired -- drop the schema reference so the consumer doesn't bother iterating an + // all-null array. + peerTagSchema = null; + } + SpanSnapshot snapshot = new SpanSnapshot( span.getResourceName(), @@ -337,7 +360,8 @@ private boolean publish(CoreSpan span, boolean isTopLevel) { isSynthetic(span), span.getParentId() == 0, spanKind, - extractPeerTagPairs(span), + peerTagSchema, + peerTagValues, httpMethod, httpEndpoint, grpcStatusCode, @@ -349,39 +373,67 @@ private boolean publish(CoreSpan span, boolean isTopLevel) { return error; } - private String[] extractPeerTagPairs(CoreSpan span) { + /** + * Picks the peer-tag schema for a span. For internal-kind spans we always use the static {@link + * PeerTagSchema#INTERNAL} singleton (one entry for {@code base.service}); for {@code + * client}/{@code producer}/{@code consumer} kinds we use the cached peer-aggregation schema + * synced from {@link DDAgentFeaturesDiscovery#peerTags()}. Other kinds get {@code null}. + */ + private PeerTagSchema peerTagSchemaFor(CoreSpan span) { if (span.isKind(PEER_AGGREGATION_KINDS)) { - final Set eligiblePeerTags = features.peerTags(); - String[] pairs = null; - int count = 0; - for (String peerTag : eligiblePeerTags) { - Object value = span.unsafeGetTag(peerTag); - if (value != null) { - if (pairs == null) { - // pairs are flattened [name, value, ...]; size for worst case - pairs = new String[eligiblePeerTags.size() * 2]; - } - pairs[count++] = peerTag; - pairs[count++] = value.toString(); + PeerTagSchema schema = currentPeerAggSchema(); + return schema.size() > 0 ? schema : null; + } + if (span.isKind(INTERNAL_KIND)) { + return PeerTagSchema.INTERNAL; + } + return null; + } + + /** + * Returns the currently-cached peer-aggregation schema, rebuilding it if {@link + * DDAgentFeaturesDiscovery#peerTags()} has returned a different {@code Set} reference since the + * last cache. Steady-state cost: one volatile read + one reference compare. + */ + private PeerTagSchema currentPeerAggSchema() { + Set current = features.peerTags(); + if (current == cachedPeerTagsSource) { + return cachedPeerTagSchema; + } + return refreshPeerAggSchema(current); + } + + private synchronized PeerTagSchema refreshPeerAggSchema(Set current) { + // Double-checked: another producer may have rebuilt while we were waiting on the monitor. + if (current == cachedPeerTagsSource) { + return cachedPeerTagSchema; + } + PeerTagSchema schema = PeerTagSchema.of(current); + cachedPeerTagSchema = schema; + cachedPeerTagsSource = current; + return schema; + } + + /** + * Captures the span's peer-tag values into a {@code String[]} parallel to {@code schema.names}. + * Slots remain {@code null} for tags the span didn't set; the array itself is lazily allocated on + * the first hit so spans that fire no peer tags pay zero allocation. Returns {@code null} when + * none of the configured peer tags are set on the span. + */ + private static String[] capturePeerTagValues(CoreSpan span, PeerTagSchema schema) { + String[] names = schema.names; + int n = names.length; + String[] values = null; + for (int i = 0; i < n; i++) { + Object v = span.unsafeGetTag(names[i]); + if (v != null) { + if (values == null) { + values = new String[n]; } - } - if (pairs == null) { - return null; - } - if (count < pairs.length) { - String[] trimmed = new String[count]; - System.arraycopy(pairs, 0, trimmed, 0, count); - return trimmed; - } - return pairs; - } else if (span.isKind(INTERNAL_KIND)) { - // in this case only the base service should be aggregated if present - final Object baseService = span.unsafeGetTag(BASE_SERVICE); - if (baseService != null) { - return new String[] {BASE_SERVICE, baseService.toString()}; + values[i] = v.toString(); } } - return null; + return values; } private static boolean isSynthetic(CoreSpan span) { diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java new file mode 100644 index 00000000000..8d85a65c63a --- /dev/null +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java @@ -0,0 +1,49 @@ +package datadog.trace.common.metrics; + +import static datadog.trace.api.DDTags.BASE_SERVICE; + +import java.util.Set; + +/** + * Names of the peer-tags eligible for client-stats aggregation, packed into a flat {@code String[]} + * for parallel-array access by producers and the aggregator thread. + * + *

This is the minimal carrier shape used by {@link SpanSnapshot}: the producer captures per-span + * values into a {@code String[]} parallel to {@link #names}, and the aggregator reconstructs the + * encoded {@code tag:value} pairs from the same name index. It replaces the prior "flat pairs" + * {@code [name0, value0, name1, value1, ...]} layout, which forced a worst-case allocation + + * trim-and-copy on every span. + * + *

Two schemas exist: + * + *

    + *
  • {@link #INTERNAL} -- a singleton with one entry for {@code base.service}, used for + * internal-kind spans where only the base service is aggregated. + *
  • A peer-aggregation schema built via {@link #of(Set)} for {@code client}/{@code + * producer}/{@code consumer} spans, cached on {@link ConflatingMetricsAggregator} keyed by + * reference equality of {@code DDAgentFeaturesDiscovery.peerTags()}. + *
+ * + *

This class deliberately has no cardinality limiters or per-cycle state -- callers that need + * those layer them on top. + */ +final class PeerTagSchema { + + /** Singleton schema for internal-kind spans -- only {@code base.service}. */ + static final PeerTagSchema INTERNAL = new PeerTagSchema(new String[] {BASE_SERVICE}); + + final String[] names; + + private PeerTagSchema(String[] names) { + this.names = names; + } + + /** Builds a schema for the given peer-tag names. Order is determined by the {@link Set}. */ + static PeerTagSchema of(Set tags) { + return new PeerTagSchema(tags.toArray(new String[0])); + } + + int size() { + return names.length; + } +} diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java index 2816fad0411..eb9b741cea6 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java @@ -20,10 +20,18 @@ final class SpanSnapshot implements InboxItem { final String spanKind; /** - * Flattened name/value pairs of peer-tag matches: {@code [name0, value0, name1, value1, ...]}. - * {@code null} when there are no matches (the common case). + * Schema for {@link #peerTagValues}. {@code null} when the span has no peer tags. The schema + * carries the names in parallel-array form; {@code peerTagValues} holds the per-span tag values + * at the same indices. */ - final String[] peerTagPairs; + final PeerTagSchema peerTagSchema; + + /** + * Peer tag values captured from the span, parallel to {@code peerTagSchema.names}. A {@code null} + * entry means the span didn't have that peer tag set. {@code null} (the whole array) when {@link + * #peerTagSchema} is {@code null}. + */ + final String[] peerTagValues; final String httpMethod; final String httpEndpoint; @@ -42,7 +50,8 @@ final class SpanSnapshot implements InboxItem { boolean synthetic, boolean traceRoot, String spanKind, - String[] peerTagPairs, + PeerTagSchema peerTagSchema, + String[] peerTagValues, String httpMethod, String httpEndpoint, String grpcStatusCode, @@ -56,7 +65,8 @@ final class SpanSnapshot implements InboxItem { this.synthetic = synthetic; this.traceRoot = traceRoot; this.spanKind = spanKind; - this.peerTagPairs = peerTagPairs; + this.peerTagSchema = peerTagSchema; + this.peerTagValues = peerTagValues; this.httpMethod = httpMethod; this.httpEndpoint = httpEndpoint; this.grpcStatusCode = grpcStatusCode; From e766fd3db22c1bd7073b74179925d15887334c2a Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 21 May 2026 10:03:38 -0400 Subject: [PATCH 11/22] Address PR #11381 review (round 2) - Aggregator.materializePeerTags: fold the firstHit-discovery nested if into a single guarded post-increment (amarziali, #3279243138). One body line: `if (values[i] != null && hitCount++ == 0) firstHit = i;`. - Drop redundant isKind(SpanKindFilter) overrides in both TraceGenerator.groovy files (amarziali, #3279264553 / #3279382648). CoreSpan.java:84 already supplies a default implementation that reads the same span.kind tag. - Bump TRACER_METRICS_MAX_PENDING default from 2048 -> 131072 to address the capacity regression amarziali flagged (#3279378375). Without producer-side conflation, the inbox now holds 1 SpanSnapshot per metrics-eligible span instead of 1 conflated Batch per ~64 spans; restoring effective capacity parity (~2048 * ~64 = 131072) prevents a ~64x rise in inbox-full drops at the same span rate. ~100 B per SpanSnapshot puts the worst-case heap floor at ~13 MB -- bounded. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../main/java/datadog/trace/common/metrics/Aggregator.java | 5 ++--- .../datadog/trace/common/writer/TraceGenerator.groovy | 6 ------ .../src/traceAgentTest/groovy/TraceGenerator.groovy | 6 ------ internal-api/src/main/java/datadog/trace/api/Config.java | 7 ++++++- 4 files changed, 8 insertions(+), 16 deletions(-) diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java index a27e14355ba..9c23f4931f3 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java @@ -180,9 +180,8 @@ private static List materializePeerTags(PeerTagSchema schema, S int firstHit = -1; int hitCount = 0; for (int i = 0; i < n; i++) { - if (values[i] != null) { - if (hitCount == 0) firstHit = i; - hitCount++; + if (values[i] != null && hitCount++ == 0) { + firstHit = i; } } if (hitCount == 0) { diff --git a/dd-trace-core/src/test/groovy/datadog/trace/common/writer/TraceGenerator.groovy b/dd-trace-core/src/test/groovy/datadog/trace/common/writer/TraceGenerator.groovy index 49e13472249..1e251f09bf2 100644 --- a/dd-trace-core/src/test/groovy/datadog/trace/common/writer/TraceGenerator.groovy +++ b/dd-trace-core/src/test/groovy/datadog/trace/common/writer/TraceGenerator.groovy @@ -323,12 +323,6 @@ class TraceGenerator { return false } - @Override - boolean isKind(SpanKindFilter filter) { - def kind = metadata.getTags().get(Tags.SPAN_KIND) - return filter.matches(kind == null ? null : kind.toString()) - } - @Override short getHttpStatusCode() { return httpStatusCode diff --git a/dd-trace-core/src/traceAgentTest/groovy/TraceGenerator.groovy b/dd-trace-core/src/traceAgentTest/groovy/TraceGenerator.groovy index 2b2bca79406..e7b08915d5f 100644 --- a/dd-trace-core/src/traceAgentTest/groovy/TraceGenerator.groovy +++ b/dd-trace-core/src/traceAgentTest/groovy/TraceGenerator.groovy @@ -300,12 +300,6 @@ class TraceGenerator { return false } - @Override - boolean isKind(SpanKindFilter filter) { - def kind = metadata.getTags().get(Tags.SPAN_KIND) - return filter.matches(kind == null ? null : kind.toString()) - } - Map getBaggage() { return metadata.getBaggage() } diff --git a/internal-api/src/main/java/datadog/trace/api/Config.java b/internal-api/src/main/java/datadog/trace/api/Config.java index a463887f61a..6b912b39de2 100644 --- a/internal-api/src/main/java/datadog/trace/api/Config.java +++ b/internal-api/src/main/java/datadog/trace/api/Config.java @@ -2173,7 +2173,12 @@ private Config(final ConfigProvider configProvider, final InstrumenterConfig ins tracerMetricsBufferingEnabled = configProvider.getBoolean(TRACER_METRICS_BUFFERING_ENABLED, false); tracerMetricsMaxAggregates = configProvider.getInteger(TRACER_METRICS_MAX_AGGREGATES, 2048); - tracerMetricsMaxPending = configProvider.getInteger(TRACER_METRICS_MAX_PENDING, 2048); + // Sized for ~2048 conflation slots * ~64 spans-per-batch effective capacity from the previous + // conflating-Batch design (131072 = 2^17). Without producer-side conflation, the inbox holds 1 + // SpanSnapshot per metrics-eligible span instead of 1 conflated Batch per ~64 spans -- without + // this bump customers would see ~64x more inbox-full drops at the same span rate. ~100 B per + // SpanSnapshot * 131072 ≈ 13 MB worst-case heap floor. + tracerMetricsMaxPending = configProvider.getInteger(TRACER_METRICS_MAX_PENDING, 131072); reportHostName = configProvider.getBoolean(TRACE_REPORT_HOSTNAME, DEFAULT_TRACE_REPORT_HOSTNAME); From 8cfa4a55f474719835409e6d66a00cadc328ff0f Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 21 May 2026 10:27:11 -0400 Subject: [PATCH 12/22] Cover inbox-full fast-path in ConflatingMetricsAggregator.publish Addresses PR #11381 review (amarziali, #3279325340 -- "Are the existing tests covering this case?"). New ConflatingMetricsAggregatorInboxFullTest constructs the aggregator with a small inbox (queueSize=8), deliberately does NOT call start() so the consumer thread never drains, then publishes enough spans to overflow the inbox. Verifies that healthMetrics.onStatsInboxFull() is called at least once -- the fast-path's `inbox.size() >= inbox.capacity()` short-circuit triggers when the producer-side queue is at capacity. Test is Java + JUnit 5 + Mockito per the project convention for new tests; uses a CoreSpan Mockito mock rather than the SimpleSpan Groovy fixture so we don't depend on Groovy-then-Java compile order from the test source set. Co-Authored-By: Claude Opus 4.7 (1M context) --- ...flatingMetricsAggregatorInboxFullTest.java | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorInboxFullTest.java diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorInboxFullTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorInboxFullTest.java new file mode 100644 index 00000000000..f4e4c2da253 --- /dev/null +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorInboxFullTest.java @@ -0,0 +1,84 @@ +package datadog.trace.common.metrics; + +import static java.util.concurrent.TimeUnit.SECONDS; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.atLeastOnce; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import datadog.communication.ddagent.DDAgentFeaturesDiscovery; +import datadog.trace.bootstrap.instrumentation.api.Tags; +import datadog.trace.core.CoreSpan; +import datadog.trace.core.SpanKindFilter; +import datadog.trace.core.monitor.HealthMetrics; +import java.util.Collections; +import org.junit.jupiter.api.Test; + +/** + * Coverage for the inbox-full fast-path in {@code ConflatingMetricsAggregator.publish}: when the + * producer-side inbox is at capacity, the next {@code publish} call short-circuits before any tag + * extraction or {@code SpanSnapshot} allocation and reports {@code onStatsInboxFull()} to health + * metrics. + */ +class ConflatingMetricsAggregatorInboxFullTest { + + @Test + void publishFiresOnStatsInboxFullOnceInboxIsAtCapacity() { + HealthMetrics healthMetrics = mock(HealthMetrics.class); + MetricWriter writer = mock(MetricWriter.class); + Sink sink = mock(Sink.class); + DDAgentFeaturesDiscovery features = mock(DDAgentFeaturesDiscovery.class); + when(features.supportsMetrics()).thenReturn(true); + when(features.peerTags()).thenReturn(Collections.emptySet()); + + // Small inbox; jctools MPSC array queue rounds up to the next power of two, so use a power of + // two directly. Note: we deliberately do NOT call aggregator.start() so the consumer thread + // never drains -- snapshots accumulate in the inbox until capacity, then the next publish hits + // the size-vs-capacity fast path. + int queueSize = 8; + ConflatingMetricsAggregator aggregator = + new ConflatingMetricsAggregator( + Collections.emptySet(), + features, + healthMetrics, + sink, + writer, + /* maxAggregates */ 16, + queueSize, + /* reportingInterval */ 10, + SECONDS, + /* includeEndpointInMetrics */ false); + + // Publish well past capacity. The first `queueSize` calls land in the inbox; subsequent calls + // see size >= capacity and hit the fast path. + for (int i = 0; i < queueSize * 4; i++) { + aggregator.publish(Collections.>singletonList(metricsEligibleSpan())); + } + + verify(healthMetrics, atLeastOnce()).onStatsInboxFull(); + aggregator.close(); + } + + @SuppressWarnings({"rawtypes", "unchecked"}) + private static CoreSpan metricsEligibleSpan() { + CoreSpan span = mock(CoreSpan.class); + when(span.isMeasured()).thenReturn(false); + when(span.isTopLevel()).thenReturn(true); + when(span.isKind(any(SpanKindFilter.class))).thenReturn(false); + when(span.getLongRunningVersion()).thenReturn(0); + when(span.getDurationNano()).thenReturn(100L); + when(span.getError()).thenReturn(0); + when(span.getResourceName()).thenReturn("resource"); + when(span.getServiceName()).thenReturn("svc"); + when(span.getOperationName()).thenReturn("op"); + when(span.getServiceNameSource()).thenReturn(null); + when(span.getType()).thenReturn("web"); + when(span.getHttpStatusCode()).thenReturn((short) 200); + when(span.getParentId()).thenReturn(0L); + when(span.getOrigin()).thenReturn(null); + when(span.unsafeGetTag(eq(Tags.SPAN_KIND), any(CharSequence.class))).thenReturn("client"); + return span; + } +} From 3644470ddb5084958bcc42f15c63ec77270abd3d Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 21 May 2026 10:55:39 -0400 Subject: [PATCH 13/22] Reconcile PeerTagSchema once per reporting cycle on the aggregator thread Addresses amarziali's review comment #3279340181 ("It would be more efficient to trigger from the other side"). The producer-side reference compare on every publish goes away; the aggregator thread reconciles the cached schema against feature discovery once per reporting cycle. - DDAgentFeaturesDiscovery: expose getLastTimeDiscovered() so callers can detect a discovery refresh without copying the peerTags Set. - PeerTagSchema: add `long lastTimeDiscovered` (plain, aggregator-only) and `hasSameTagsAs(Set)`. of(Set, long) takes the timestamp; INTERNAL uses a -1L sentinel since it's never reconciled. - ConflatingMetricsAggregator: * Drop the cachedPeerTagsSource volatile and the per-publish reference compare. * Producer fast path is now `cachedPeerTagSchema` volatile read + null-check; first publish takes the one-time synchronized bootstrap. * Add reconcilePeerTagSchema() that runs once per cycle on the aggregator thread: fast-path timestamp compare, slow-path set compare, bump-in-place when the set is unchanged. - Aggregator: new `Runnable onReportCycle` constructor parameter, run at the start of report() (before the flush, so any test awaiting writer.finishBucket() observes the schema in its post-reconcile state and so the next publish sees the new schema without a handoff). - Update "should create bucket for each set of peer tags" to drive two reporting cycles separated by a report() that triggers reconcile. The old test relied on per-publish reference detection, which the new design intentionally doesn't preserve -- the schema is now stable within a cycle. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../ddagent/DDAgentFeaturesDiscovery.java | 10 ++ .../trace/common/metrics/Aggregator.java | 27 +++++- .../metrics/ConflatingMetricsAggregator.java | 93 +++++++++++++------ .../trace/common/metrics/PeerTagSchema.java | 61 ++++++++++-- .../ConflatingMetricAggregatorTest.groovy | 31 +++++-- 5 files changed, 176 insertions(+), 46 deletions(-) diff --git a/communication/src/main/java/datadog/communication/ddagent/DDAgentFeaturesDiscovery.java b/communication/src/main/java/datadog/communication/ddagent/DDAgentFeaturesDiscovery.java index 10c1e57efd7..67d279f51b9 100644 --- a/communication/src/main/java/datadog/communication/ddagent/DDAgentFeaturesDiscovery.java +++ b/communication/src/main/java/datadog/communication/ddagent/DDAgentFeaturesDiscovery.java @@ -403,6 +403,16 @@ public Set peerTags() { return discoveryState.peerTags; } + /** + * Wall-clock timestamp ({@link System#currentTimeMillis()}) of the most recent successful + * feature discovery, or {@code 0L} if discovery has never run. Callers (e.g. the client-stats + * aggregator) snapshot this alongside {@link #peerTags()} to detect when discovery has refreshed + * and a cached view of feature state may be stale. + */ + public long getLastTimeDiscovered() { + return discoveryState.lastTimeDiscovered; + } + public String getMetricsEndpoint() { return discoveryState.metricsEndpoint; } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java index 9c23f4931f3..72440b5d361 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java @@ -41,6 +41,15 @@ final class Aggregator implements Runnable { private final long sleepMillis; + /** + * Per-cycle hook run on the aggregator thread at the start of each report cycle, before the + * flush. Used by {@link ConflatingMetricsAggregator} to reconcile its cached peer-tag schema + * against {@link datadog.communication.ddagent.DDAgentFeaturesDiscovery}; running before the + * flush guarantees that any test awaiting {@code writer.finishBucket()} observes the schema in + * its post-reconcile state. May be {@code null}. + */ + private final Runnable onReportCycle; + @SuppressFBWarnings( value = "AT_STALE_THREAD_WRITE_OF_PRIMITIVE", justification = "the field is confined to the agent thread running the Aggregator") @@ -52,7 +61,8 @@ final class Aggregator implements Runnable { int maxAggregates, long reportingInterval, TimeUnit reportingIntervalTimeUnit, - HealthMetrics healthMetrics) { + HealthMetrics healthMetrics, + Runnable onReportCycle) { this( writer, inbox, @@ -60,7 +70,8 @@ final class Aggregator implements Runnable { reportingInterval, reportingIntervalTimeUnit, DEFAULT_SLEEP_MILLIS, - healthMetrics); + healthMetrics, + onReportCycle); } Aggregator( @@ -70,7 +81,8 @@ final class Aggregator implements Runnable { long reportingInterval, TimeUnit reportingIntervalTimeUnit, long sleepMillis, - HealthMetrics healthMetrics) { + HealthMetrics healthMetrics, + Runnable onReportCycle) { this.writer = writer; this.inbox = inbox; this.aggregates = @@ -78,6 +90,7 @@ final class Aggregator implements Runnable { new AggregateExpiry(healthMetrics), maxAggregates * 4 / 3, 0.75f, maxAggregates); this.reportingIntervalNanos = reportingIntervalTimeUnit.toNanos(reportingInterval); this.sleepMillis = sleepMillis; + this.onReportCycle = onReportCycle; } private static final class AggregateExpiry @@ -206,6 +219,14 @@ private static UTF8BytesString encodePeerTag(String name, String value) { } private void report(long when, SignalItem signal) { + // Per-cycle hook on the aggregator thread -- used by ClientStatsAggregator to reconcile the + // cached peer-tag schema against feature discovery. Runs before the flush so any test that + // awaits writer.finishBucket() observes the schema in its post-reconcile state, and so + // subsequent producer publishes (which may happen as soon as the flush completes) see the new + // schema without an additional handoff. + if (onReportCycle != null) { + onReportCycle.run(); + } boolean skipped = true; if (dirty) { try { diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java index 50b11aa3e08..0d1bbd74360 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java @@ -94,18 +94,20 @@ public final class ConflatingMetricsAggregator implements MetricsAggregator, Eve private final boolean includeEndpointInMetrics; /** - * Cached peer-aggregation schema, keyed by reference equality of the {@code Set} returned - * by {@link DDAgentFeaturesDiscovery#peerTags()}. {@code DDAgentFeaturesDiscovery} caches the Set - * on its current state, so reference identity changes exactly when discovery replaces state with - * a new tag configuration -- a single volatile read + a reference compare on the steady-state hot - * path. The {@code synchronized} refresh is the only allocator on a miss. + * Cached peer-aggregation schema. Producers read this reference once per trace and pass it + * through to the consumer in {@link SpanSnapshot}; they never inspect the schema's timestamp or + * rebuild it. Reconciliation is the aggregator thread's job: {@link #reconcilePeerTagSchema()} + * compares the schema's {@link PeerTagSchema#lastTimeDiscovered} against {@link + * DDAgentFeaturesDiscovery#getLastTimeDiscovered()} once per reporting cycle and either bumps the + * timestamp in place (when the tag set is unchanged) or swaps in a freshly-built schema. * - *

Both fields are written together inside the synchronized block, but read independently -- - * the reference-equality check on the source Set is what guards against using a stale schema, so - * tearing on the schema field alone is not a correctness concern. + *

{@code null} only on the bootstrap window before {@link #bootstrapPeerTagSchema()} runs on + * the first publish. + * + *

{@code volatile} so the consumer's reconcile-time replacement is visible to producer + * threads; the schema's own internal mutable state ({@link PeerTagSchema#lastTimeDiscovered}) is + * exercised only on the aggregator thread. */ - private volatile Set cachedPeerTagsSource; - private volatile PeerTagSchema cachedPeerTagSchema; private volatile AgentTaskScheduler.Scheduled cancellation; @@ -196,7 +198,13 @@ public ConflatingMetricsAggregator( this.sink = sink; this.aggregator = new Aggregator( - metricWriter, inbox, maxAggregates, reportingInterval, timeUnit, healthMetric); + metricWriter, + inbox, + maxAggregates, + reportingInterval, + timeUnit, + healthMetric, + this::reconcilePeerTagSchema); this.thread = newAgentThread(METRICS_AGGREGATOR, aggregator); this.reportingInterval = reportingInterval; this.reportingIntervalTimeUnit = timeUnit; @@ -381,7 +389,10 @@ private boolean publish(CoreSpan span, boolean isTopLevel) { */ private PeerTagSchema peerTagSchemaFor(CoreSpan span) { if (span.isKind(PEER_AGGREGATION_KINDS)) { - PeerTagSchema schema = currentPeerAggSchema(); + PeerTagSchema schema = cachedPeerTagSchema; + if (schema == null) { + schema = bootstrapPeerTagSchema(); + } return schema.size() > 0 ? schema : null; } if (span.isKind(INTERNAL_KIND)) { @@ -391,27 +402,53 @@ private PeerTagSchema peerTagSchemaFor(CoreSpan span) { } /** - * Returns the currently-cached peer-aggregation schema, rebuilding it if {@link - * DDAgentFeaturesDiscovery#peerTags()} has returned a different {@code Set} reference since the - * last cache. Steady-state cost: one volatile read + one reference compare. + * One-time producer-side bootstrap of {@link #cachedPeerTagSchema}. Synchronized double-check + * guards against two producers racing on the very first publish; after this returns, {@code + * cachedPeerTagSchema} is non-null forever and the aggregator thread is the sole subsequent + * mutator (see {@link #reconcilePeerTagSchema()}). */ - private PeerTagSchema currentPeerAggSchema() { - Set current = features.peerTags(); - if (current == cachedPeerTagsSource) { - return cachedPeerTagSchema; + private synchronized PeerTagSchema bootstrapPeerTagSchema() { + PeerTagSchema cached = cachedPeerTagSchema; + if (cached != null) { + return cached; } - return refreshPeerAggSchema(current); + PeerTagSchema schema = buildPeerTagSchema(); + cachedPeerTagSchema = schema; + return schema; + } + + /** Builds a fresh {@link PeerTagSchema} from the current state of feature discovery. */ + private PeerTagSchema buildPeerTagSchema() { + Set names = features.peerTags(); + return PeerTagSchema.of( + names == null ? Collections.emptySet() : names, features.getLastTimeDiscovered()); } - private synchronized PeerTagSchema refreshPeerAggSchema(Set current) { - // Double-checked: another producer may have rebuilt while we were waiting on the monitor. - if (current == cachedPeerTagsSource) { - return cachedPeerTagSchema; + /** + * Reconciles {@link #cachedPeerTagSchema} with the latest feature discovery. Runs on the + * aggregator thread once per reporting cycle via the reset hook passed to {@link Aggregator}. + * Cheap fast path: a long compare against the cached schema's embedded timestamp short-circuits + * when discovery hasn't refreshed since the schema was built. On mismatch, a set compare + * distinguishes "discovery refreshed but tags unchanged" (just bump the timestamp in place) from + * "tags actually changed" (build a new schema and swap the volatile reference). + */ + private void reconcilePeerTagSchema() { + PeerTagSchema cached = cachedPeerTagSchema; + if (cached == null) { + // First reset before the first publish -- producer-side bootstrap hasn't run yet. + return; + } + long latestDiscoveredAt = features.getLastTimeDiscovered(); + if (cached.lastTimeDiscovered == latestDiscoveredAt) { + return; + } + Set latestNames = features.peerTags(); + Set normalized = latestNames == null ? Collections.emptySet() : latestNames; + if (cached.hasSameTagsAs(normalized)) { + cached.lastTimeDiscovered = latestDiscoveredAt; + } else { + cachedPeerTagSchema = PeerTagSchema.of(normalized, latestDiscoveredAt); } - PeerTagSchema schema = PeerTagSchema.of(current); - cachedPeerTagSchema = schema; - cachedPeerTagsSource = current; - return schema; } /** diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java index 8d85a65c63a..87a0b955f5f 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java @@ -2,6 +2,7 @@ import static datadog.trace.api.DDTags.BASE_SERVICE; +import datadog.communication.ddagent.DDAgentFeaturesDiscovery; import java.util.Set; /** @@ -19,28 +20,74 @@ *

    *
  • {@link #INTERNAL} -- a singleton with one entry for {@code base.service}, used for * internal-kind spans where only the base service is aggregated. - *
  • A peer-aggregation schema built via {@link #of(Set)} for {@code client}/{@code - * producer}/{@code consumer} spans, cached on {@link ConflatingMetricsAggregator} keyed by - * reference equality of {@code DDAgentFeaturesDiscovery.peerTags()}. + *
  • A peer-aggregation schema built via {@link #of(Set, long)} for {@code client}/{@code + * producer}/{@code consumer} spans. {@link ConflatingMetricsAggregator} caches the most + * recently built schema and reconciles it on the aggregator thread once per reporting cycle + * by comparing {@link #lastTimeDiscovered} against {@link + * DDAgentFeaturesDiscovery#getLastTimeDiscovered()}. *
* *

This class deliberately has no cardinality limiters or per-cycle state -- callers that need * those layer them on top. + * + *

Thread-safety: {@link #names} is final and safe to read from any thread. {@link + * #lastTimeDiscovered} is exercised only on the aggregator thread (read and updated in + * reconciliation); producer threads access the schema only through the volatile {@code + * cachedPeerTagSchema} reference in {@link ConflatingMetricsAggregator}. */ final class PeerTagSchema { /** Singleton schema for internal-kind spans -- only {@code base.service}. */ - static final PeerTagSchema INTERNAL = new PeerTagSchema(new String[] {BASE_SERVICE}); + static final PeerTagSchema INTERNAL = + // -1L sentinel; INTERNAL is never reconciled, so the value just has to be distinct from any + // real System.currentTimeMillis() that the aggregator might observe. + new PeerTagSchema(new String[] {BASE_SERVICE}, -1L); final String[] names; - private PeerTagSchema(String[] names) { + /** + * The {@code DDAgentFeaturesDiscovery.getLastTimeDiscovered()} value this schema was built from. + * The aggregator thread reads and updates this once per reporting cycle when reconciling against + * the latest discovery; producer threads never touch it. Plain (non-volatile, non-final) because + * the aggregator is the sole reader/writer. + */ + long lastTimeDiscovered; + + private PeerTagSchema(String[] names, long lastTimeDiscovered) { this.names = names; + this.lastTimeDiscovered = lastTimeDiscovered; } /** Builds a schema for the given peer-tag names. Order is determined by the {@link Set}. */ - static PeerTagSchema of(Set tags) { - return new PeerTagSchema(tags.toArray(new String[0])); + static PeerTagSchema of(Set tags, long lastTimeDiscovered) { + return new PeerTagSchema(tags.toArray(new String[0]), lastTimeDiscovered); + } + + /** + * Test-only factory that takes the names array directly so tests can build a schema in a specific + * order without going through a {@link Set}. + */ + static PeerTagSchema testSchema(String[] names) { + return new PeerTagSchema(names, 0L); + } + + /** + * Whether this schema's tag names exactly match {@code other}. Used by the aggregator's reconcile + * path: when a feature discovery refresh bumps {@link + * DDAgentFeaturesDiscovery#getLastTimeDiscovered()} but the resulting set is unchanged, the + * aggregator can keep this schema and just bump {@link #lastTimeDiscovered} instead of + * rebuilding. + */ + boolean hasSameTagsAs(Set other) { + if (this.names.length != other.size()) { + return false; + } + for (String name : this.names) { + if (!other.contains(name)) { + return false; + } + } + return true; } int size() { diff --git a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ConflatingMetricAggregatorTest.groovy b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ConflatingMetricAggregatorTest.groovy index 962ad2ce892..3ab6e0e09d1 100644 --- a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ConflatingMetricAggregatorTest.groovy +++ b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ConflatingMetricAggregatorTest.groovy @@ -255,29 +255,44 @@ class ConflatingMetricAggregatorTest extends DDSpecification { def "should create bucket for each set of peer tags"() { setup: + // Peer-tag schema is reconciled with feature discovery once per reporting cycle (on the + // aggregator thread, in the post-report hook), not per-span on the producer. Drive two + // reporting cycles with different peerTags() configurations to verify the aggregator buckets + // each cycle by the schema that was current at publish time. MetricWriter writer = Mock(MetricWriter) Sink sink = Stub(Sink) DDAgentFeaturesDiscovery features = Mock(DDAgentFeaturesDiscovery) features.supportsMetrics() >> true - features.peerTags() >>> [["country"], ["country", "georegion"],] + features.peerTags() >>> [["country"], ["country", "georegion"]] + // Bump the discovered-at timestamp so reconcile during report cycle 1 sees a mismatch and + // rebuilds the schema for span 2. Three calls: bootstrap (span1's publish), reconcile-during- + // report-1 (mismatch -> rebuild + 2nd peerTags() call), reconcile-during-report-2 (no change). + features.getLastTimeDiscovered() >>> [1L, 2L, 2L] ConflatingMetricsAggregator aggregator = new ConflatingMetricsAggregator(empty, features, HealthMetrics.NO_OP, sink, writer, 10, queueSize, reportingInterval, SECONDS, false) aggregator.start() - when: - CountDownLatch latch = new CountDownLatch(1) + when: "cycle 1 -- peerTags=[country]" + CountDownLatch latch1 = new CountDownLatch(1) aggregator.publish([ new SimpleSpan("service", "operation", "resource", "type", true, false, false, 0, 100, HTTP_OK) - .setTag(SPAN_KIND, "client").setTag("country", "france").setTag("georegion", "europe"), + .setTag(SPAN_KIND, "client").setTag("country", "france").setTag("georegion", "europe") + ]) + aggregator.report() + def cycle1Triggered = latch1.await(2, SECONDS) + + and: "cycle 2 -- reconcile picks up peerTags=[country, georegion]" + CountDownLatch latch2 = new CountDownLatch(1) + aggregator.publish([ new SimpleSpan("service", "operation", "resource", "type", true, false, false, 0, 100, HTTP_OK) .setTag(SPAN_KIND, "client").setTag("country", "france").setTag("georegion", "europe") ]) aggregator.report() - def latchTriggered = latch.await(2, SECONDS) + def cycle2Triggered = latch2.await(2, SECONDS) then: - latchTriggered - 1 * writer.startBucket(2, _, _) + cycle1Triggered + cycle2Triggered 1 * writer.add( new MetricKey( "resource", @@ -314,7 +329,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { ), { AggregateMetric aggregateMetric -> aggregateMetric.getHitCount() == 1 && aggregateMetric.getTopLevelCount() == 0 && aggregateMetric.getDuration() == 100 }) - 1 * writer.finishBucket() >> { latch.countDown() } + 2 * writer.finishBucket() >> { latch1.countDown(); latch2.countDown() } cleanup: aggregator.close() From e7d0b42df1ed6d4dbe2bde15408f90590422d0d0 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 21 May 2026 11:45:15 -0400 Subject: [PATCH 14/22] Add bootstrap + reconcile coverage for PeerTagSchema Addresses round-3 review nice-to-haves on PR #11381. - PeerTagSchemaTest: unit coverage for hasSameTagsAs() (the predicate that drives the reconcile fast/slow path split), the of(Set, long) factory, and the INTERNAL singleton. The hasSameTagsAs cases include same-content-different-Set-reference (the case the reconcile fast path relies on after a discovery refresh) and content-mismatch in either direction. - ConflatingMetricsAggregatorBootstrapTest: integration coverage for the producer-side bootstrap + aggregator-thread reconcile flow. * bootstrapHappensOnceOnFirstPublish -- three publishes against an un-started aggregator (no consumer thread, no reconciles); verifies features.peerTags() and features.getLastTimeDiscovered() are each called exactly once. * reconcileSkipsDeepCompareWhenTimestampMatches -- two cycles with constant features.getLastTimeDiscovered(); each post-report reconcile short-circuits on the timestamp fast path, so peerTags() is called only by bootstrap (1 total). * reconcileSurvivesTimestampBumpWhenTagsUnchanged -- timestamps bump every reconcile, forcing the slow set-compare path; the tag set stays identical, so the schema is preserved and continues to flush buckets correctly across cycles. Co-Authored-By: Claude Opus 4.7 (1M context) --- ...flatingMetricsAggregatorBootstrapTest.java | 234 ++++++++++++++++++ .../common/metrics/PeerTagSchemaTest.java | 87 +++++++ 2 files changed, 321 insertions(+) create mode 100644 dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBootstrapTest.java create mode 100644 dd-trace-core/src/test/java/datadog/trace/common/metrics/PeerTagSchemaTest.java diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBootstrapTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBootstrapTest.java new file mode 100644 index 00000000000..b8b46a31298 --- /dev/null +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBootstrapTest.java @@ -0,0 +1,234 @@ +package datadog.trace.common.metrics; + +import static java.util.concurrent.TimeUnit.SECONDS; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.atLeastOnce; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import datadog.communication.ddagent.DDAgentFeaturesDiscovery; +import datadog.trace.bootstrap.instrumentation.api.Tags; +import datadog.trace.core.CoreSpan; +import datadog.trace.core.SpanKindFilter; +import datadog.trace.core.monitor.HealthMetrics; +import java.util.Collections; +import java.util.LinkedHashSet; +import java.util.concurrent.CountDownLatch; +import org.junit.jupiter.api.Test; + +/** + * Coverage for the {@code ConflatingMetricsAggregator} peer-tag schema bootstrap and reconcile + * paths. + * + *

    + *
  • {@link #bootstrapHappensOnceOnFirstPublish()} -- verifies the synchronized producer-side + * bootstrap runs exactly once and is skipped on subsequent publishes. + *
  • {@link #reconcileSkipsDeepCompareWhenTimestampMatches()} -- verifies the aggregator-thread + * reconcile's timestamp-only fast path: when the cached schema's {@code lastTimeDiscovered} + * matches {@code features.getLastTimeDiscovered()}, reconcile returns without calling {@code + * features.peerTags()}. + *
  • {@link #reconcileSurvivesTimestampBumpWhenTagsUnchanged()} -- verifies that when the + * discovery timestamp changes but the tag set is identical, the schema continues to function + * correctly across cycles. + *
+ */ +class ConflatingMetricsAggregatorBootstrapTest { + + @Test + void bootstrapHappensOnceOnFirstPublish() { + // Producer-side bootstrap is synchronized; we want to confirm only the first publish + // queries features and subsequent publishes hit the cached schema. + HealthMetrics healthMetrics = mock(HealthMetrics.class); + MetricWriter writer = mock(MetricWriter.class); + Sink sink = mock(Sink.class); + DDAgentFeaturesDiscovery features = mock(DDAgentFeaturesDiscovery.class); + when(features.supportsMetrics()).thenReturn(true); + when(features.peerTags()).thenReturn(Collections.singleton("peer.hostname")); + when(features.getLastTimeDiscovered()).thenReturn(1000L); + + ConflatingMetricsAggregator aggregator = + new ConflatingMetricsAggregator( + Collections.emptySet(), + features, + healthMetrics, + sink, + writer, + /* maxAggregates */ 16, + /* queueSize */ 64, + /* reportingInterval */ 10, + SECONDS, + /* includeEndpointInMetrics */ false); + + // Do not start the aggregator thread -- reconcile must not run, only bootstrap. + aggregator.publish(Collections.>singletonList(peerAggregationSpan())); + aggregator.publish(Collections.>singletonList(peerAggregationSpan())); + aggregator.publish(Collections.>singletonList(peerAggregationSpan())); + + // Bootstrap is the only path that queries features for peer-tag schema, and it runs + // exactly once across three publishes. + verify(features, times(1)).peerTags(); + verify(features, times(1)).getLastTimeDiscovered(); + aggregator.close(); + } + + @Test + void reconcileSkipsDeepCompareWhenTimestampMatches() throws Exception { + // Two reporting cycles with the same (mocked-constant) discovery timestamp -- the second + // reconcile must short-circuit on the timestamp compare and avoid touching peerTags(). + HealthMetrics healthMetrics = mock(HealthMetrics.class); + MetricWriter writer = mock(MetricWriter.class); + Sink sink = mock(Sink.class); + DDAgentFeaturesDiscovery features = mock(DDAgentFeaturesDiscovery.class); + when(features.supportsMetrics()).thenReturn(true); + when(features.peerTags()).thenReturn(Collections.singleton("peer.hostname")); + when(features.getLastTimeDiscovered()).thenReturn(1000L); + + ConflatingMetricsAggregator aggregator = + new ConflatingMetricsAggregator( + Collections.emptySet(), + features, + healthMetrics, + sink, + writer, + /* maxAggregates */ 16, + /* queueSize */ 64, + /* reportingInterval */ 10, + SECONDS, + /* includeEndpointInMetrics */ false); + aggregator.start(); + try { + CountDownLatch cycle1 = new CountDownLatch(1); + CountDownLatch cycle2 = new CountDownLatch(1); + // Both reports flush a bucket; the cycle1/cycle2 countdowns synchronize the test thread + // with the aggregator thread's per-cycle completion. + org.mockito.Mockito.doAnswer( + invocation -> { + cycle1.countDown(); + return null; + }) + .doAnswer( + invocation -> { + cycle2.countDown(); + return null; + }) + .when(writer) + .finishBucket(); + + aggregator.publish(Collections.>singletonList(peerAggregationSpan())); + aggregator.report(); + assertTrue(cycle1.await(2, SECONDS)); + + aggregator.publish(Collections.>singletonList(peerAggregationSpan())); + aggregator.report(); + assertTrue(cycle2.await(2, SECONDS)); + + // peerTags() is called only by bootstrap; both reconciles short-circuit on the timestamp + // fast path (cached lastTimeDiscovered == features.getLastTimeDiscovered() == 1000L), so + // neither reconcile reaches the deep set compare. Total peerTags() calls: 1. + verify(features, times(1)).peerTags(); + // getLastTimeDiscovered() is called by bootstrap (1) + each reconcile (2) = 3 total. + verify(features, times(3)).getLastTimeDiscovered(); + } finally { + aggregator.close(); + } + } + + @Test + void reconcileSurvivesTimestampBumpWhenTagsUnchanged() throws Exception { + // Behavioral cross-check on the "set is unchanged, just bump timestamp" branch: discovery + // refreshes (timestamp moves) but the underlying tag set is identical. The aggregator must + // continue producing valid buckets for the same logical peer tag across cycles. + HealthMetrics healthMetrics = mock(HealthMetrics.class); + MetricWriter writer = mock(MetricWriter.class); + Sink sink = mock(Sink.class); + DDAgentFeaturesDiscovery features = mock(DDAgentFeaturesDiscovery.class); + when(features.supportsMetrics()).thenReturn(true); + // peerTags() returns content-equal sets across calls -- the reconcile slow path's + // hasSameTagsAs check should return true. + when(features.peerTags()) + .thenReturn(new LinkedHashSet<>(Collections.singleton("peer.hostname"))) + .thenReturn(new LinkedHashSet<>(Collections.singleton("peer.hostname"))) + .thenReturn(new LinkedHashSet<>(Collections.singleton("peer.hostname"))); + // Timestamp bumps every reconcile -- forces reconcile into the slow path each time. + when(features.getLastTimeDiscovered()).thenReturn(1L, 2L, 3L); + + ConflatingMetricsAggregator aggregator = + new ConflatingMetricsAggregator( + Collections.emptySet(), + features, + healthMetrics, + sink, + writer, + /* maxAggregates */ 16, + /* queueSize */ 64, + /* reportingInterval */ 10, + SECONDS, + /* includeEndpointInMetrics */ false); + aggregator.start(); + try { + CountDownLatch cycle1 = new CountDownLatch(1); + CountDownLatch cycle2 = new CountDownLatch(1); + org.mockito.Mockito.doAnswer( + invocation -> { + cycle1.countDown(); + return null; + }) + .doAnswer( + invocation -> { + cycle2.countDown(); + return null; + }) + .when(writer) + .finishBucket(); + + aggregator.publish(Collections.>singletonList(peerAggregationSpan())); + aggregator.report(); + assertTrue(cycle1.await(2, SECONDS)); + + aggregator.publish(Collections.>singletonList(peerAggregationSpan())); + aggregator.report(); + assertTrue(cycle2.await(2, SECONDS)); + + // Both cycles flushed: writer.add was invoked twice (once per cycle). The schema kept + // producing the same MetricKey across cycles -- if the schema had been broken by the + // timestamp bump, no buckets would have flushed. + verify(writer, times(2)).add(any(MetricKey.class), any(AggregateMetric.class)); + // Bootstrap (1) + two reconciles (2) -- each reconcile saw a timestamp mismatch and went + // through the deep compare, calling peerTags() once = 3 total. + verify(features, times(3)).peerTags(); + verify(features, atLeastOnce()).getLastTimeDiscovered(); + } finally { + aggregator.close(); + } + } + + @SuppressWarnings({"rawtypes", "unchecked"}) + private static CoreSpan peerAggregationSpan() { + CoreSpan span = mock(CoreSpan.class); + when(span.isMeasured()).thenReturn(false); + when(span.isTopLevel()).thenReturn(true); + // Return true for any SpanKindFilter -- shouldComputeMetric will see METRICS_ELIGIBLE_KINDS + // match, and peerTagSchemaFor will see PEER_AGGREGATION_KINDS match (checked first), which + // routes the span through the bootstrap path. + when(span.isKind(any(SpanKindFilter.class))).thenReturn(true); + when(span.getLongRunningVersion()).thenReturn(0); + when(span.getDurationNano()).thenReturn(100L); + when(span.getError()).thenReturn(0); + when(span.getResourceName()).thenReturn("resource"); + when(span.getServiceName()).thenReturn("svc"); + when(span.getOperationName()).thenReturn("op"); + when(span.getServiceNameSource()).thenReturn(null); + when(span.getType()).thenReturn("web"); + when(span.getHttpStatusCode()).thenReturn((short) 200); + when(span.getParentId()).thenReturn(0L); + when(span.getOrigin()).thenReturn(null); + when(span.unsafeGetTag(eq(Tags.SPAN_KIND), any(CharSequence.class))).thenReturn("client"); + // peer.hostname tag is set so capturePeerTagValues fires for the bootstrapped schema. + when(span.unsafeGetTag("peer.hostname")).thenReturn("localhost"); + return span; + } +} diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/PeerTagSchemaTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/PeerTagSchemaTest.java new file mode 100644 index 00000000000..6b9f557d046 --- /dev/null +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/PeerTagSchemaTest.java @@ -0,0 +1,87 @@ +package datadog.trace.common.metrics; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.Set; +import org.junit.jupiter.api.Test; + +/** + * Unit tests for {@link PeerTagSchema}. Covers the {@link PeerTagSchema#hasSameTagsAs(Set)} + * predicate that drives the aggregator's reconcile fast/slow path split, the factory shapes, and + * the {@link PeerTagSchema#INTERNAL} singleton. + */ +class PeerTagSchemaTest { + + @Test + void ofBuildsSchemaFromSetWithTimestamp() { + Set tags = new LinkedHashSet<>(Arrays.asList("peer.hostname", "peer.service")); + PeerTagSchema schema = PeerTagSchema.of(tags, 1234L); + + assertArrayEquals(new String[] {"peer.hostname", "peer.service"}, schema.names); + assertEquals(1234L, schema.lastTimeDiscovered); + assertEquals(2, schema.size()); + } + + @Test + void ofHandlesEmptySet() { + PeerTagSchema schema = PeerTagSchema.of(Collections.emptySet(), 0L); + + assertEquals(0, schema.size()); + assertEquals(0, schema.names.length); + } + + @Test + void internalSingletonCarriesBaseService() { + assertEquals(1, PeerTagSchema.INTERNAL.size()); + assertEquals("_dd.base_service", PeerTagSchema.INTERNAL.names[0]); + } + + @Test + void hasSameTagsAsReturnsTrueForExactMatch() { + PeerTagSchema schema = + PeerTagSchema.of(new LinkedHashSet<>(Arrays.asList("peer.hostname", "peer.service")), 1L); + + // Same content via a different Set reference -- this is the case the reconcile fast-path + // depends on (Set returned from a fresh discovery cycle is content-equal to the prior one). + Set equivalentSet = new HashSet<>(Arrays.asList("peer.service", "peer.hostname")); + assertTrue(schema.hasSameTagsAs(equivalentSet)); + } + + @Test + void hasSameTagsAsReturnsFalseWhenSetGrew() { + PeerTagSchema schema = PeerTagSchema.of(Collections.singleton("peer.hostname"), 1L); + + Set larger = new HashSet<>(Arrays.asList("peer.hostname", "peer.service")); + assertFalse(schema.hasSameTagsAs(larger)); + } + + @Test + void hasSameTagsAsReturnsFalseWhenSetShrank() { + PeerTagSchema schema = + PeerTagSchema.of(new LinkedHashSet<>(Arrays.asList("peer.hostname", "peer.service")), 1L); + + assertFalse(schema.hasSameTagsAs(Collections.singleton("peer.hostname"))); + } + + @Test + void hasSameTagsAsReturnsFalseWhenContentDifferent() { + PeerTagSchema schema = PeerTagSchema.of(Collections.singleton("peer.hostname"), 1L); + + assertFalse(schema.hasSameTagsAs(Collections.singleton("peer.service"))); + } + + @Test + void hasSameTagsAsHandlesEmpty() { + PeerTagSchema empty = PeerTagSchema.of(Collections.emptySet(), 1L); + + assertTrue(empty.hasSameTagsAs(Collections.emptySet())); + assertFalse(empty.hasSameTagsAs(Collections.singleton("peer.hostname"))); + } +} From ba3225c131081221e99c00019de1be522990eb72 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 21 May 2026 11:47:35 -0400 Subject: [PATCH 15/22] Use writer.finishBucket() count in bootstrap test for cascade compatibility The verify(writer).add(MetricKey, AggregateMetric) signature is unique to #11381; downstream branches use AggregateEntry. Switching to verify(writer, times(2)).finishBucket() keeps the same behavioral guarantee (both cycles flushed) across the stack. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../metrics/ConflatingMetricsAggregatorBootstrapTest.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBootstrapTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBootstrapTest.java index b8b46a31298..76347e505c0 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBootstrapTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBootstrapTest.java @@ -193,10 +193,10 @@ void reconcileSurvivesTimestampBumpWhenTagsUnchanged() throws Exception { aggregator.report(); assertTrue(cycle2.await(2, SECONDS)); - // Both cycles flushed: writer.add was invoked twice (once per cycle). The schema kept - // producing the same MetricKey across cycles -- if the schema had been broken by the - // timestamp bump, no buckets would have flushed. - verify(writer, times(2)).add(any(MetricKey.class), any(AggregateMetric.class)); + // Both cycles flushed (both latches counted down via writer.finishBucket). The schema kept + // producing buckets across the timestamp bumps; if the schema had been broken by the + // bump-in-place path, the second cycle's flush would not have happened. + verify(writer, times(2)).finishBucket(); // Bootstrap (1) + two reconciles (2) -- each reconcile saw a timestamp mismatch and went // through the deep compare, calling peerTags() once = 3 total. verify(features, times(3)).peerTags(); From 0b86066ec863f4f394be81400a581802ed0983b8 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 21 May 2026 14:09:56 -0400 Subject: [PATCH 16/22] Preserve TRACER_METRICS_MAX_PENDING semantic + drop stale imports TRACER_METRICS_MAX_PENDING previously counted conflating Batch slots (~64 spans each). The inbox now holds 1 SpanSnapshot per slot, so multiply the configured value by LEGACY_BATCH_SIZE (64) to keep pre-existing customer overrides delivering the same effective span-throughput capacity. Default stays at 2048 logical -> 131072 snapshot slots, identical to the prior 2048 batches * 64 spans. Also drops two unused datadog.trace.core.SpanKindFilter imports left behind in TraceGenerator.groovy after the isKind() override was removed in favor of the CoreSpan default implementation. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../trace/common/writer/TraceGenerator.groovy | 1 - .../groovy/TraceGenerator.groovy | 1 - .../main/java/datadog/trace/api/Config.java | 18 ++++++++++++------ 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/dd-trace-core/src/test/groovy/datadog/trace/common/writer/TraceGenerator.groovy b/dd-trace-core/src/test/groovy/datadog/trace/common/writer/TraceGenerator.groovy index 1e251f09bf2..d8f29f7195b 100644 --- a/dd-trace-core/src/test/groovy/datadog/trace/common/writer/TraceGenerator.groovy +++ b/dd-trace-core/src/test/groovy/datadog/trace/common/writer/TraceGenerator.groovy @@ -16,7 +16,6 @@ import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString import datadog.trace.core.CoreSpan import datadog.trace.core.Metadata import datadog.trace.core.MetadataConsumer -import datadog.trace.core.SpanKindFilter import java.util.concurrent.ThreadLocalRandom import java.util.concurrent.TimeUnit diff --git a/dd-trace-core/src/traceAgentTest/groovy/TraceGenerator.groovy b/dd-trace-core/src/traceAgentTest/groovy/TraceGenerator.groovy index e7b08915d5f..d20a03df6de 100644 --- a/dd-trace-core/src/traceAgentTest/groovy/TraceGenerator.groovy +++ b/dd-trace-core/src/traceAgentTest/groovy/TraceGenerator.groovy @@ -14,7 +14,6 @@ import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString import datadog.trace.core.CoreSpan import datadog.trace.core.Metadata import datadog.trace.core.MetadataConsumer -import datadog.trace.core.SpanKindFilter import java.util.concurrent.ThreadLocalRandom import java.util.concurrent.TimeUnit diff --git a/internal-api/src/main/java/datadog/trace/api/Config.java b/internal-api/src/main/java/datadog/trace/api/Config.java index 6b912b39de2..af598bbd7b3 100644 --- a/internal-api/src/main/java/datadog/trace/api/Config.java +++ b/internal-api/src/main/java/datadog/trace/api/Config.java @@ -809,6 +809,10 @@ public class Config { private static final Pattern COLON = Pattern.compile(":"); + // Historical conflating-Batch size; used to translate TRACER_METRICS_MAX_PENDING (configured in + // legacy batch units) into the new per-SpanSnapshot inbox capacity. + private static final int LEGACY_BATCH_SIZE = 64; + private final InstrumenterConfig instrumenterConfig; private final long startTimeMillis = System.currentTimeMillis(); @@ -2173,12 +2177,14 @@ private Config(final ConfigProvider configProvider, final InstrumenterConfig ins tracerMetricsBufferingEnabled = configProvider.getBoolean(TRACER_METRICS_BUFFERING_ENABLED, false); tracerMetricsMaxAggregates = configProvider.getInteger(TRACER_METRICS_MAX_AGGREGATES, 2048); - // Sized for ~2048 conflation slots * ~64 spans-per-batch effective capacity from the previous - // conflating-Batch design (131072 = 2^17). Without producer-side conflation, the inbox holds 1 - // SpanSnapshot per metrics-eligible span instead of 1 conflated Batch per ~64 spans -- without - // this bump customers would see ~64x more inbox-full drops at the same span rate. ~100 B per - // SpanSnapshot * 131072 ≈ 13 MB worst-case heap floor. - tracerMetricsMaxPending = configProvider.getInteger(TRACER_METRICS_MAX_PENDING, 131072); + // TRACER_METRICS_MAX_PENDING historically counted conflating Batch slots (~64 spans per batch + // via Batch.MAX_BATCH_SIZE). The inbox now holds 1 SpanSnapshot per metrics-eligible span, so + // we multiply the configured value by the legacy batch size to preserve the effective + // span-throughput capacity of the prior default *and* of any existing customer override + // (e.g. a configured 4096 still means "~262144 spans before drops", same as before). ~100 B + // per SpanSnapshot * 131072 ≈ 13 MB worst-case heap floor at the default. + tracerMetricsMaxPending = + configProvider.getInteger(TRACER_METRICS_MAX_PENDING, 2048) * LEGACY_BATCH_SIZE; reportHostName = configProvider.getBoolean(TRACE_REPORT_HOSTNAME, DEFAULT_TRACE_REPORT_HOSTNAME); From 5c78dbb35c27b937ba499627342c2efb3cf141d1 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 21 May 2026 14:20:09 -0400 Subject: [PATCH 17/22] Add AdversarialMetricsBenchmark for capacity-bound stress testing Ports the adversarial JMH benchmark from #11402 down to this branch so we can compare #11381 vs master on a high-cardinality, high-throughput workload. Adapted to use ConflatingMetricsAggregator (pre-rename) and the FixedAgentFeaturesDiscovery / NullSink helpers already in ConflatingMetricsAggregatorBenchmark. 8 producer threads hammer publish() with unique (service, operation, resource, peer.hostname) per op so the aggregate cache fills+evicts continuously and the inbox saturates. tearDown prints the drop counters (inboxFull vs aggregateDropped) so the test verifies the subsystem stayed bounded under attack. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../metrics/AdversarialMetricsBenchmark.java | 161 ++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdversarialMetricsBenchmark.java diff --git a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdversarialMetricsBenchmark.java b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdversarialMetricsBenchmark.java new file mode 100644 index 00000000000..ebf1d38ea10 --- /dev/null +++ b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdversarialMetricsBenchmark.java @@ -0,0 +1,161 @@ +package datadog.trace.common.metrics; + +import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND; +import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND_CLIENT; +import static java.util.concurrent.TimeUnit.SECONDS; + +import datadog.trace.api.WellKnownTags; +import datadog.trace.core.CoreSpan; +import datadog.trace.core.monitor.HealthMetrics; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.ThreadLocalRandom; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Threads; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Adversarial JMH benchmark designed to stress the metrics subsystem's capacity bounds. + * + *

The metrics aggregator is bounded at every layer: + * + *

    + *
  • The aggregate cache caps total entries at {@code tracerMetricsMaxAggregates} (default + * 2048). Beyond that LRU eviction kicks in. + *
  • The producer/consumer inbox is a fixed-size MPSC queue ({@code tracerMetricsMaxPending}); + * when full, producer {@code offer} returns false and the snapshot is dropped via {@link + * HealthMetrics#onStatsInboxFull()}. + *
  • Histograms use a bounded dense store -- per-histogram memory is fixed. + *
+ * + *

The benchmark hammers all of these simultaneously with 8 producer threads, unique labels per + * op (so the aggregate cache fills+evicts repeatedly), random durations across a wide range (so + * histograms accept many distinct bins), and random {@code error}/{@code topLevel} flags (so both + * histograms are exercised). After the run, drop counters are printed so you can see how the + * subsystem absorbed the burst. + * + *

What "OOM the metrics subsystem" would look like if the bounds break: producer-thread + * allocation would grow unbounded (snapshots faster than the inbox can drain produces dropped + * snapshots, not heap growth); aggregator-thread heap would grow if entries weren't capped or + * histograms grew past their dense-store limit. + */ +@State(Scope.Benchmark) +@Warmup(iterations = 2, time = 15, timeUnit = SECONDS) +@Measurement(iterations = 5, time = 15, timeUnit = SECONDS) +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(SECONDS) +@Threads(8) +@Fork(value = 1) +public class AdversarialMetricsBenchmark { + + private ConflatingMetricsAggregator aggregator; + private CountingHealthMetrics health; + + @State(Scope.Thread) + public static class ThreadState { + int cursor; + } + + @Setup + public void setup() { + this.health = new CountingHealthMetrics(); + this.aggregator = + new ConflatingMetricsAggregator( + new WellKnownTags("", "", "", "", "", ""), + Collections.emptySet(), + new ConflatingMetricsAggregatorBenchmark.FixedAgentFeaturesDiscovery( + Collections.singleton("peer.hostname"), Collections.emptySet()), + this.health, + new ConflatingMetricsAggregatorBenchmark.NullSink(), + 2048, + 2048, + false); + this.aggregator.start(); + } + + @TearDown + public void tearDown() { + aggregator.close(); + System.err.println( + "[ADVERSARIAL] snapshots offered (across all threads, both forks combined for this run):"); + System.err.println( + " onStatsInboxFull = " + + health.inboxFull + + " (snapshots dropped because the MPSC inbox was full)"); + System.err.println( + " onStatsAggregateDropped = " + + health.aggregateDropped + + " (snapshots dropped because the aggregate cache was full with no stale entry)"); + System.err.println( + " onClientStatTraceComputed total = " + + health.traceComputedCalls + + " spans counted = " + + health.totalSpansCounted); + } + + @Benchmark + public void publish(ThreadState ts, Blackhole blackhole) { + int idx = ts.cursor++; + ThreadLocalRandom rng = ThreadLocalRandom.current(); + + // Mix indices so labels don't fall into linear order. Distinct labels exceed every reasonable + // working-set bound, so the aggregate cache evicts continuously and most ops force a fresh + // MetricKey construction on the consumer thread. + int scrambled = idx * 0x9E3779B1; // golden ratio multiplier + String service = "svc-" + (scrambled & 0xFFFF); + String operation = "op-" + ((scrambled >>> 8) & 0x3FFFF); + String resource = "res-" + ((scrambled ^ 0x5A5A5A) & 0xFFFFF); + String hostname = "host-" + ((scrambled >>> 12) & 0x7FFF); + boolean error = (idx & 7) == 0; + boolean topLevel = (idx & 3) == 0; + // Wide duration spread forces histogram bins to populate broadly. + long durationNanos = 1L + (rng.nextLong() & 0x3FFFFFFFL); // 1 ns .. ~1.07 s + + SimpleSpan span = + new SimpleSpan( + service, operation, resource, "web", true, topLevel, error, 0, durationNanos, 200); + span.setTag(SPAN_KIND, SPAN_KIND_CLIENT); + span.setTag("peer.hostname", hostname); + + List> trace = Collections.singletonList(span); + blackhole.consume(aggregator.publish(trace)); + } + + /** + * Counts what gets dropped. The aggregator publishes onto these counters from many threads, so + * the fields are {@code volatile long} with non-atomic increments -- precise counts aren't the + * point, order-of-magnitude is. + */ + static final class CountingHealthMetrics extends HealthMetrics { + volatile long inboxFull; + volatile long aggregateDropped; + volatile long traceComputedCalls; + volatile long totalSpansCounted; + + @Override + public void onStatsInboxFull() { + inboxFull++; + } + + @Override + public void onStatsAggregateDropped() { + aggregateDropped++; + } + + @Override + public void onClientStatTraceComputed(int counted, int total, boolean dropped) { + traceComputedCalls++; + totalSpansCounted += counted; + } + } +} From 70c20ef704167994ce8c58c0b0f0d04e3fe969d0 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 21 May 2026 14:43:10 -0400 Subject: [PATCH 18/22] Trim AdversarialMetricsBenchmark counters and clarify printout Drop traceComputedCalls / totalSpansCounted: under 8-way contention the volatile-long ++/+= pattern was losing ~20% of updates (296M counted vs 245M reported), and the numbers duplicate signal JMH's ops/s already provides. Switch inboxFull / aggregateDropped to LongAdder so the printed drop shape (the order-of-magnitude story the bench is built to tell) is accurate under contention. Replace the stale "both forks combined for this run" string with text that matches the actual @Fork(value=1) config and notes that counters accumulate across warmup + measurement. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../metrics/AdversarialMetricsBenchmark.java | 36 +++++++------------ 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdversarialMetricsBenchmark.java b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdversarialMetricsBenchmark.java index ebf1d38ea10..02ebd8bb847 100644 --- a/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdversarialMetricsBenchmark.java +++ b/dd-trace-core/src/jmh/java/datadog/trace/common/metrics/AdversarialMetricsBenchmark.java @@ -10,6 +10,7 @@ import java.util.Collections; import java.util.List; import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.atomic.LongAdder; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; import org.openjdk.jmh.annotations.Fork; @@ -86,21 +87,18 @@ public void setup() { @TearDown public void tearDown() { aggregator.close(); + // Counters accumulate across the trial (warmup + measurement iterations), since the + // CountingHealthMetrics instance is created once in @Setup and never reset. System.err.println( - "[ADVERSARIAL] snapshots offered (across all threads, both forks combined for this run):"); + "[ADVERSARIAL] drops over the trial (8 threads, warmup + measurement combined):"); System.err.println( " onStatsInboxFull = " - + health.inboxFull + + health.inboxFull.sum() + " (snapshots dropped because the MPSC inbox was full)"); System.err.println( " onStatsAggregateDropped = " - + health.aggregateDropped + + health.aggregateDropped.sum() + " (snapshots dropped because the aggregate cache was full with no stale entry)"); - System.err.println( - " onClientStatTraceComputed total = " - + health.traceComputedCalls - + " spans counted = " - + health.totalSpansCounted); } @Benchmark @@ -132,30 +130,22 @@ public void publish(ThreadState ts, Blackhole blackhole) { } /** - * Counts what gets dropped. The aggregator publishes onto these counters from many threads, so - * the fields are {@code volatile long} with non-atomic increments -- precise counts aren't the - * point, order-of-magnitude is. + * Counts what gets dropped. Uses {@link LongAdder} so the printed totals hold up under 8-way + * contention -- {@code volatile long ++} loses ~20% of updates here, which would mask the + * order-of-magnitude shape the bench is trying to surface (inbox-full vs aggregate-dropped). */ static final class CountingHealthMetrics extends HealthMetrics { - volatile long inboxFull; - volatile long aggregateDropped; - volatile long traceComputedCalls; - volatile long totalSpansCounted; + final LongAdder inboxFull = new LongAdder(); + final LongAdder aggregateDropped = new LongAdder(); @Override public void onStatsInboxFull() { - inboxFull++; + inboxFull.increment(); } @Override public void onStatsAggregateDropped() { - aggregateDropped++; - } - - @Override - public void onClientStatTraceComputed(int counted, int total, boolean dropped) { - traceComputedCalls++; - totalSpansCounted += counted; + aggregateDropped.increment(); } } } From 68848adf47f551875f76d549e41fddf279e73fc5 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 21 May 2026 14:50:11 -0400 Subject: [PATCH 19/22] Close PeerTagSchema reconcile race + cover the swap branch buildPeerTagSchema previously read features.peerTags() before features.getLastTimeDiscovered(). DDAgentFeaturesDiscovery exposes those as two separate accessors against its volatile State -- a state-swap interleaving could leave the cached schema tagged with a NEWER timestamp than its names, after which the next reconcile short-circuits on the timestamp compare and misses the tag-set update until the next discovery refresh (~minute later). Swap the read order so timestamp is captured first. With this ordering, an interleaving leaves the schema OLDER than its names instead -- the next reconcile sees a timestamp mismatch, runs the deep compare, and self-heals on the very next cycle. Also adds reconcileSwapsSchemaWhenTagSetChanges, which closes the test gap on the slow-path swap branch (cachedPeerTagSchema = PeerTagSchema.of(...)). End-to-end check via the writer's captured MetricKeys: pre-swap snapshot carries only peer.hostname, post-swap snapshot carries both peer.hostname and peer.service. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../metrics/ConflatingMetricsAggregator.java | 16 ++- ...flatingMetricsAggregatorBootstrapTest.java | 112 ++++++++++++++++++ 2 files changed, 126 insertions(+), 2 deletions(-) diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java index 0d1bbd74360..42ae33c8057 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java @@ -417,11 +417,23 @@ private synchronized PeerTagSchema bootstrapPeerTagSchema() { return schema; } - /** Builds a fresh {@link PeerTagSchema} from the current state of feature discovery. */ + /** + * Builds a fresh {@link PeerTagSchema} from the current state of feature discovery. + * + *

Read order matters: {@code DDAgentFeaturesDiscovery} exposes {@code peerTags()} and {@code + * getLastTimeDiscovered()} as two separate accessors, each reading its volatile {@code + * discoveryState} independently. If a discovery refresh interleaves between the two reads, we + * want to be left with a schema whose embedded timestamp is *older* than its tag set rather than + * newer -- that way the next reconcile sees a timestamp mismatch and re-runs the deep compare to + * pick up the change, instead of short-circuiting on a too-fresh timestamp and missing it. + * + *

So read {@code getLastTimeDiscovered()} first, then {@code peerTags()}. + */ private PeerTagSchema buildPeerTagSchema() { + long lastTimeDiscovered = features.getLastTimeDiscovered(); Set names = features.peerTags(); return PeerTagSchema.of( - names == null ? Collections.emptySet() : names, features.getLastTimeDiscovered()); + names == null ? Collections.emptySet() : names, lastTimeDiscovered); } /** diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBootstrapTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBootstrapTest.java index 76347e505c0..aea44e3682f 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBootstrapTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBootstrapTest.java @@ -1,6 +1,7 @@ package datadog.trace.common.metrics; import static java.util.concurrent.TimeUnit.SECONDS; +import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.eq; @@ -12,13 +13,17 @@ import datadog.communication.ddagent.DDAgentFeaturesDiscovery; import datadog.trace.bootstrap.instrumentation.api.Tags; +import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; import datadog.trace.core.CoreSpan; import datadog.trace.core.SpanKindFilter; import datadog.trace.core.monitor.HealthMetrics; +import java.util.Arrays; import java.util.Collections; import java.util.LinkedHashSet; +import java.util.List; import java.util.concurrent.CountDownLatch; import org.junit.jupiter.api.Test; +import org.mockito.ArgumentCaptor; /** * Coverage for the {@code ConflatingMetricsAggregator} peer-tag schema bootstrap and reconcile @@ -34,6 +39,9 @@ *

  • {@link #reconcileSurvivesTimestampBumpWhenTagsUnchanged()} -- verifies that when the * discovery timestamp changes but the tag set is identical, the schema continues to function * correctly across cycles. + *
  • {@link #reconcileSwapsSchemaWhenTagSetChanges()} -- verifies the slow-path swap branch: + * when discovery refreshes with a new tag set, the cached schema is replaced and subsequent + * publishes see the new tags. * */ class ConflatingMetricsAggregatorBootstrapTest { @@ -206,6 +214,97 @@ void reconcileSurvivesTimestampBumpWhenTagsUnchanged() throws Exception { } } + @Test + void reconcileSwapsSchemaWhenTagSetChanges() throws Exception { + // The reconcile slow-path's swap branch: discovery refreshes the timestamp AND the tag set + // grows. Cached schema is rebuilt and the volatile reference points at the new schema. + // Verification is end-to-end -- we look at the MetricKey the writer receives. Pre-swap the + // span snapshot was pinned to the old schema so only peer.hostname appears; post-swap a new + // publish reads the new schema and the next flush carries both peer tags. + HealthMetrics healthMetrics = mock(HealthMetrics.class); + MetricWriter writer = mock(MetricWriter.class); + Sink sink = mock(Sink.class); + DDAgentFeaturesDiscovery features = mock(DDAgentFeaturesDiscovery.class); + when(features.supportsMetrics()).thenReturn(true); + // peerTags() shape evolves across calls: + // - bootstrap reads {peer.hostname} + // - cycle 1 reconcile slow-path reads {peer.hostname, peer.service} + // - cycle 2 reconcile is timestamp fast-path (no peerTags call) + when(features.peerTags()) + .thenReturn(Collections.singleton("peer.hostname")) + .thenReturn(new LinkedHashSet<>(Arrays.asList("peer.hostname", "peer.service"))); + // getLastTimeDiscovered() evolves: bootstrap = 1, then bumped to 2 for cycle 1's reconcile + // (mismatch -> slow path), stable at 2 for cycle 2's reconcile (match -> fast path). + when(features.getLastTimeDiscovered()).thenReturn(1L, 2L, 2L); + + ConflatingMetricsAggregator aggregator = + new ConflatingMetricsAggregator( + Collections.emptySet(), + features, + healthMetrics, + sink, + writer, + /* maxAggregates */ 16, + /* queueSize */ 64, + /* reportingInterval */ 10, + SECONDS, + /* includeEndpointInMetrics */ false); + aggregator.start(); + try { + CountDownLatch cycle1 = new CountDownLatch(1); + CountDownLatch cycle2 = new CountDownLatch(1); + org.mockito.Mockito.doAnswer( + invocation -> { + cycle1.countDown(); + return null; + }) + .doAnswer( + invocation -> { + cycle2.countDown(); + return null; + }) + .when(writer) + .finishBucket(); + + // Publish 1: snapshot pinned to the original {peer.hostname} schema. cycle 1's reconcile + // will swap the cached schema BEFORE the flush, but this snapshot is already pinned so its + // MetricKey will still carry only peer.hostname. + aggregator.publish( + Collections.>singletonList(peerAggregationSpanWithBothPeerTags())); + aggregator.report(); + assertTrue(cycle1.await(2, SECONDS)); + + // Publish 2: now reads the post-swap schema {peer.hostname, peer.service} so the snapshot + // captures both tag values. cycle 2's reconcile short-circuits on timestamp match. + aggregator.publish( + Collections.>singletonList(peerAggregationSpanWithBothPeerTags())); + aggregator.report(); + assertTrue(cycle2.await(2, SECONDS)); + + // Capture every (MetricKey, AggregateMetric) the writer saw across both cycles. Pre-swap + // snapshot has 1 peer tag, post-swap has 2. + ArgumentCaptor keyCaptor = ArgumentCaptor.forClass(MetricKey.class); + verify(writer, times(2)).add(keyCaptor.capture(), any(AggregateMetric.class)); + List keys = keyCaptor.getAllValues(); + assertEquals( + Collections.singletonList(UTF8BytesString.create("peer.hostname:localhost")), + keys.get(0).getPeerTags(), + "pre-swap snapshot should encode only peer.hostname"); + assertEquals( + Arrays.asList( + UTF8BytesString.create("peer.hostname:localhost"), + UTF8BytesString.create("peer.service:billing")), + keys.get(1).getPeerTags(), + "post-swap snapshot should encode both peer.hostname and peer.service"); + + // Bootstrap (1) + cycle 1 slow-path (1) -- cycle 2 is fast-path so doesn't reach peerTags(). + verify(features, times(2)).peerTags(); + verify(features, atLeastOnce()).getLastTimeDiscovered(); + } finally { + aggregator.close(); + } + } + @SuppressWarnings({"rawtypes", "unchecked"}) private static CoreSpan peerAggregationSpan() { CoreSpan span = mock(CoreSpan.class); @@ -231,4 +330,17 @@ private static CoreSpan peerAggregationSpan() { when(span.unsafeGetTag("peer.hostname")).thenReturn("localhost"); return span; } + + /** + * Variant of {@link #peerAggregationSpan()} that sets both {@code peer.hostname} and {@code + * peer.service}. Used by {@link #reconcileSwapsSchemaWhenTagSetChanges()} where the schema + * evolves from {@code {peer.hostname}} to {@code {peer.hostname, peer.service}} mid-test, and the + * post-swap snapshot must be able to capture the newly-relevant tag value. + */ + @SuppressWarnings({"rawtypes", "unchecked"}) + private static CoreSpan peerAggregationSpanWithBothPeerTags() { + CoreSpan span = peerAggregationSpan(); + when(span.unsafeGetTag("peer.service")).thenReturn("billing"); + return span; + } } From 2ea61c56a92c6796be52825285dc5175737fee87 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 21 May 2026 14:58:42 -0400 Subject: [PATCH 20/22] Clarify materializePeerTags hit-counting loop Splits the `if (values[i] != null && hitCount++ == 0)` conjunction into nested ifs. Same semantics, no codegen impact after JIT -- just visibly says what the loop is doing rather than relying on post-increment-inside-conjunction. Closes amarziali's review thread on this block. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../datadog/trace/common/metrics/Aggregator.java | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java index 72440b5d361..9998c21ed0b 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java @@ -188,13 +188,17 @@ private static List materializePeerTags(PeerTagSchema schema, S } String[] names = schema.names; int n = names.length; - // Single-entry fast path (matches the original singletonList shape for INTERNAL spans and any - // other case where exactly one peer tag fired). + // First pass: count how many tags fired and remember the first index. The single-entry case + // is common (e.g. INTERNAL spans only emit base.service) and gets a singletonList to avoid an + // ArrayList allocation on the hot path. int firstHit = -1; int hitCount = 0; for (int i = 0; i < n; i++) { - if (values[i] != null && hitCount++ == 0) { - firstHit = i; + if (values[i] != null) { + if (hitCount == 0) { + firstHit = i; + } + hitCount++; } } if (hitCount == 0) { From 5a4685ff48a09b1844e47192e02764877b89c267 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 21 May 2026 16:22:49 -0400 Subject: [PATCH 21/22] Drop unused Tags imports flagged by codenarc Leftover from removing the isKind() override in TraceGenerator earlier in this session -- I dropped the SpanKindFilter import but missed datadog.trace.bootstrap.instrumentation.api.Tags, which is no longer referenced in either file. Resolves codenarcTest and codenarcTraceAgentTest UnusedImport violations. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../groovy/datadog/trace/common/writer/TraceGenerator.groovy | 1 - dd-trace-core/src/traceAgentTest/groovy/TraceGenerator.groovy | 1 - 2 files changed, 2 deletions(-) diff --git a/dd-trace-core/src/test/groovy/datadog/trace/common/writer/TraceGenerator.groovy b/dd-trace-core/src/test/groovy/datadog/trace/common/writer/TraceGenerator.groovy index d8f29f7195b..66bdbab137b 100644 --- a/dd-trace-core/src/test/groovy/datadog/trace/common/writer/TraceGenerator.groovy +++ b/dd-trace-core/src/test/groovy/datadog/trace/common/writer/TraceGenerator.groovy @@ -11,7 +11,6 @@ import datadog.trace.api.ProcessTags import datadog.trace.api.TagMap import datadog.trace.api.sampling.PrioritySampling import datadog.trace.bootstrap.instrumentation.api.AgentSpanLink -import datadog.trace.bootstrap.instrumentation.api.Tags import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString import datadog.trace.core.CoreSpan import datadog.trace.core.Metadata diff --git a/dd-trace-core/src/traceAgentTest/groovy/TraceGenerator.groovy b/dd-trace-core/src/traceAgentTest/groovy/TraceGenerator.groovy index d20a03df6de..e668d0112a6 100644 --- a/dd-trace-core/src/traceAgentTest/groovy/TraceGenerator.groovy +++ b/dd-trace-core/src/traceAgentTest/groovy/TraceGenerator.groovy @@ -9,7 +9,6 @@ import datadog.trace.api.DDTags import datadog.trace.api.DDTraceId import datadog.trace.api.IdGenerationStrategy import datadog.trace.api.TagMap -import datadog.trace.bootstrap.instrumentation.api.Tags import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString import datadog.trace.core.CoreSpan import datadog.trace.core.Metadata From a1863db570fa63ae5c35129103c58f67d4ee8cd2 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Thu, 21 May 2026 22:31:47 -0400 Subject: [PATCH 22/22] Update dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java Co-authored-by: Sarah Chen --- .../main/java/datadog/trace/common/metrics/PeerTagSchema.java | 1 + 1 file changed, 1 insertion(+) diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java index 87a0b955f5f..f0179e46f6b 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java @@ -28,6 +28,7 @@ * * *

    This class deliberately has no cardinality limiters or per-cycle state -- callers that need + *

    This class deliberately has no cardinality limiters -- callers that need * those layer them on top. * *

    Thread-safety: {@link #names} is final and safe to read from any thread. {@link