diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java new file mode 100644 index 00000000000..b493696c52b --- /dev/null +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateEntry.java @@ -0,0 +1,449 @@ +package datadog.trace.common.metrics; + +import static datadog.trace.bootstrap.instrumentation.api.UTF8BytesString.EMPTY; + +import datadog.metrics.api.Histogram; +import datadog.trace.api.Pair; +import datadog.trace.api.cache.DDCache; +import datadog.trace.api.cache.DDCaches; +import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; +import datadog.trace.util.Hashtable; +import datadog.trace.util.LongHashingUtils; +import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Objects; +import java.util.function.Function; +import javax.annotation.Nullable; + +/** + * Hashtable entry for the consumer-side aggregator. Holds the UTF8-encoded label fields that {@link + * SerializingMetricWriter} writes to the wire plus the mutable counter/histogram state for the key. + * + *

{@link #matches(SpanSnapshot)} compares the entry's stored UTF8 forms against the snapshot's + * raw {@code CharSequence}/{@code String}/{@code String[]} fields via content-equality, so {@code + * String} vs {@code UTF8BytesString} mixing on the same logical key collapses into one entry + * instead of splitting. + * + *

The static UTF8 caches that used to live on {@code MetricKey} and {@code + * ConflatingMetricsAggregator} are consolidated here. + * + *

Not thread-safe. Counter and histogram updates are performed by the single aggregator + * thread; producer threads tag durations via {@link #ERROR_TAG} / {@link #TOP_LEVEL_TAG} bits and + * hand them off through the snapshot inbox. + */ +@SuppressFBWarnings( + value = {"AT_NONATOMIC_OPERATIONS_ON_SHARED_VARIABLE", "AT_STALE_THREAD_WRITE_OF_PRIMITIVE"}, + justification = "Explicitly not thread-safe. Accumulates counts and durations.") +final class AggregateEntry extends Hashtable.Entry { + + public static final long ERROR_TAG = 0x8000000000000000L; + public static final long TOP_LEVEL_TAG = 0x4000000000000000L; + + // UTF8 caches consolidated from the previous MetricKey + ConflatingMetricsAggregator split. + private static final DDCache RESOURCE_CACHE = + DDCaches.newFixedSizeCache(32); + private static final DDCache SERVICE_CACHE = + DDCaches.newFixedSizeCache(32); + private static final DDCache OPERATION_CACHE = + DDCaches.newFixedSizeCache(64); + private static final DDCache SERVICE_SOURCE_CACHE = + DDCaches.newFixedSizeCache(16); + private static final DDCache TYPE_CACHE = DDCaches.newFixedSizeCache(8); + private static final DDCache SPAN_KIND_CACHE = + DDCaches.newFixedSizeCache(16); + private static final DDCache HTTP_METHOD_CACHE = + DDCaches.newFixedSizeCache(8); + private static final DDCache HTTP_ENDPOINT_CACHE = + DDCaches.newFixedSizeCache(32); + private static final DDCache GRPC_STATUS_CODE_CACHE = + DDCaches.newFixedSizeCache(32); + + /** + * Outer cache keyed by peer-tag name, with an inner per-name cache keyed by value. The inner + * cache produces the "name:value" encoded form the serializer writes. + */ + private static final DDCache< + String, Pair, Function>> + PEER_TAGS_CACHE = DDCaches.newFixedSizeCache(64); + + private static final Function< + String, Pair, Function>> + PEER_TAGS_CACHE_ADDER = + key -> + Pair.of( + DDCaches.newFixedSizeCache(512), + value -> UTF8BytesString.create(key + ":" + value)); + + private final UTF8BytesString resource; + private final UTF8BytesString service; + private final UTF8BytesString operationName; + @Nullable private final UTF8BytesString serviceSource; + private final UTF8BytesString type; + private final UTF8BytesString spanKind; + @Nullable private final UTF8BytesString httpMethod; + @Nullable private final UTF8BytesString httpEndpoint; + @Nullable private final UTF8BytesString grpcStatusCode; + private final short httpStatusCode; + private final boolean synthetic; + private final boolean traceRoot; + + // Peer tags carried in two forms: parallel String[] arrays mirroring the snapshot's (schema + + // values) shape for matches(), and pre-encoded List ("name:value") for the + // serializer. peerTagNames is the schema's names array (shared by-reference when the schema + // hasn't been replaced); peerTagValues is the per-span String[] parallel to it. + @Nullable private final String[] peerTagNames; + @Nullable private final String[] peerTagValues; + private final List peerTags; + + // Mutable aggregate state -- single-thread (consumer/aggregator) writer. + private final Histogram okLatencies = Histogram.newHistogram(); + private final Histogram errorLatencies = Histogram.newHistogram(); + private int errorCount; + private int hitCount; + private int topLevelCount; + private long duration; + + /** Hot-path constructor for the producer/consumer flow. Builds UTF8 fields via the caches. */ + private AggregateEntry(SpanSnapshot s, long keyHash) { + super(keyHash); + this.resource = canonicalize(RESOURCE_CACHE, s.resourceName); + this.service = canonicalize(SERVICE_CACHE, s.serviceName); + this.operationName = canonicalize(OPERATION_CACHE, s.operationName); + this.serviceSource = + s.serviceNameSource == null + ? null + : canonicalize(SERVICE_SOURCE_CACHE, s.serviceNameSource); + this.type = canonicalize(TYPE_CACHE, s.spanType); + this.spanKind = canonicalize(SPAN_KIND_CACHE, s.spanKind); + this.httpMethod = + s.httpMethod == null + ? null + : HTTP_METHOD_CACHE.computeIfAbsent(s.httpMethod, UTF8BytesString::create); + this.httpEndpoint = + s.httpEndpoint == null + ? null + : HTTP_ENDPOINT_CACHE.computeIfAbsent(s.httpEndpoint, UTF8BytesString::create); + this.grpcStatusCode = + s.grpcStatusCode == null + ? null + : GRPC_STATUS_CODE_CACHE.computeIfAbsent(s.grpcStatusCode, UTF8BytesString::create); + this.httpStatusCode = s.httpStatusCode; + this.synthetic = s.synthetic; + this.traceRoot = s.traceRoot; + this.peerTagNames = s.peerTagSchema == null ? null : s.peerTagSchema.names; + this.peerTagValues = s.peerTagValues; + this.peerTags = materializePeerTags(this.peerTagNames, this.peerTagValues); + } + + /** Construct from a snapshot at consumer-thread miss time. */ + static AggregateEntry forSnapshot(SpanSnapshot s) { + return new AggregateEntry(s, hashOf(s)); + } + + /** + * Records a single hit. {@code tagAndDuration} carries the duration nanos with optional {@link + * #ERROR_TAG} / {@link #TOP_LEVEL_TAG} bits OR-ed in. + */ + AggregateEntry recordOneDuration(long tagAndDuration) { + ++hitCount; + if ((tagAndDuration & TOP_LEVEL_TAG) == TOP_LEVEL_TAG) { + tagAndDuration ^= TOP_LEVEL_TAG; + ++topLevelCount; + } + if ((tagAndDuration & ERROR_TAG) == ERROR_TAG) { + tagAndDuration ^= ERROR_TAG; + errorLatencies.accept(tagAndDuration); + ++errorCount; + } else { + okLatencies.accept(tagAndDuration); + } + duration += tagAndDuration; + return this; + } + + int getErrorCount() { + return errorCount; + } + + int getHitCount() { + return hitCount; + } + + int getTopLevelCount() { + return topLevelCount; + } + + long getDuration() { + return duration; + } + + Histogram getOkLatencies() { + return okLatencies; + } + + Histogram getErrorLatencies() { + return errorLatencies; + } + + @SuppressFBWarnings("AT_NONATOMIC_64BIT_PRIMITIVE") + void clear() { + this.errorCount = 0; + this.hitCount = 0; + this.topLevelCount = 0; + this.duration = 0; + this.okLatencies.clear(); + this.errorLatencies.clear(); + } + + boolean matches(SpanSnapshot s) { + String[] snapshotNames = s.peerTagSchema == null ? null : s.peerTagSchema.names; + return httpStatusCode == s.httpStatusCode + && synthetic == s.synthetic + && traceRoot == s.traceRoot + && contentEquals(resource, s.resourceName) + && stringContentEquals(service, s.serviceName) + && contentEquals(operationName, s.operationName) + && contentEquals(serviceSource, s.serviceNameSource) + && contentEquals(type, s.spanType) + && stringContentEquals(spanKind, s.spanKind) + && Arrays.equals(peerTagNames, snapshotNames) + && Arrays.equals(peerTagValues, s.peerTagValues) + && stringContentEquals(httpMethod, s.httpMethod) + && stringContentEquals(httpEndpoint, s.httpEndpoint) + && stringContentEquals(grpcStatusCode, s.grpcStatusCode); + } + + /** + * Pre-checks {@link #keyHash} against {@code keyHash} before delegating to {@link + * #matches(SpanSnapshot)}. The hash check is cheap and rules out most mismatches without touching + * the field-by-field comparison. + */ + boolean matches(long keyHash, SpanSnapshot s) { + return this.keyHash == keyHash && matches(s); + } + + /** + * Computes the 64-bit lookup hash for a {@link SpanSnapshot}. Chained per-field calls -- no + * varargs / Object[] allocation, no autoboxing on primitive overloads. The constructor's + * super({@code hashOf(s)}) call uses the same function so an entry built from a snapshot hashes + * to the same bucket the snapshot itself looks up. + * + *

Hashes are content-stable across {@code String} / {@code UTF8BytesString}: {@link + * UTF8BytesString#hashCode()} returns the underlying {@code String}'s hash. + */ + static long hashOf(SpanSnapshot s) { + long h = 0; + h = LongHashingUtils.addToHash(h, s.resourceName); + h = LongHashingUtils.addToHash(h, s.serviceName); + h = LongHashingUtils.addToHash(h, s.operationName); + h = LongHashingUtils.addToHash(h, s.serviceNameSource); + h = LongHashingUtils.addToHash(h, s.spanType); + h = LongHashingUtils.addToHash(h, s.httpStatusCode); + h = LongHashingUtils.addToHash(h, s.synthetic); + h = LongHashingUtils.addToHash(h, s.traceRoot); + h = LongHashingUtils.addToHash(h, s.spanKind); + // Always mix in both the schema's content hash and the values' content hash, unconditionally + // (no null-skip). PeerTagSchema overrides hashCode() to be content-based on names; we use + // Arrays.hashCode for the String[] values since the default Object[].hashCode is identity- + // based, not content-based. Null inputs hash to 0 for both, distinct from any real schema's + // hash or any non-empty values array. + h = LongHashingUtils.addToHash(h, s.peerTagSchema); + h = LongHashingUtils.addToHash(h, Arrays.hashCode(s.peerTagValues)); + h = LongHashingUtils.addToHash(h, s.httpMethod); + h = LongHashingUtils.addToHash(h, s.httpEndpoint); + h = LongHashingUtils.addToHash(h, s.grpcStatusCode); + return h; + } + + // Accessors for SerializingMetricWriter. + UTF8BytesString getResource() { + return resource; + } + + UTF8BytesString getService() { + return service; + } + + UTF8BytesString getOperationName() { + return operationName; + } + + @Nullable + UTF8BytesString getServiceSource() { + return serviceSource; + } + + UTF8BytesString getType() { + return type; + } + + UTF8BytesString getSpanKind() { + return spanKind; + } + + @Nullable + UTF8BytesString getHttpMethod() { + return httpMethod; + } + + @Nullable + UTF8BytesString getHttpEndpoint() { + return httpEndpoint; + } + + @Nullable + UTF8BytesString getGrpcStatusCode() { + return grpcStatusCode; + } + + int getHttpStatusCode() { + return httpStatusCode; + } + + boolean isSynthetics() { + return synthetic; + } + + boolean isTraceRoot() { + return traceRoot; + } + + List getPeerTags() { + return peerTags; + } + + /** + * Equality on the 13 label fields (not on the aggregate). Used only by test mock matchers; the + * {@link Hashtable} does its own bucketing via {@link #keyHash} + {@link #matches(SpanSnapshot)} + * and never calls {@code equals}. + * + *

Peer tags are compared via the raw parallel arrays ({@code peerTagNames} and {@code + * peerTagValues}) rather than the pre-encoded {@code peerTags} list, so the equality contract + * stays consistent with {@link #hashCode()} (which goes through {@link #hashOf} -- driven off the + * raw arrays via {@link PeerTagSchema#hashCode} and {@link java.util.Arrays#hashCode}). Comparing + * the encoded list would let two entries with different raw layouts collapse to the same encoded + * form (e.g. tag {@code "b"} at index 1 in schema A vs index 0 in schema B, with matching values) + * and produce {@code equals=true} alongside different {@code hashCode}s -- violating the hashCode + * contract. + */ + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof AggregateEntry)) return false; + AggregateEntry that = (AggregateEntry) o; + return httpStatusCode == that.httpStatusCode + && synthetic == that.synthetic + && traceRoot == that.traceRoot + && Objects.equals(resource, that.resource) + && Objects.equals(service, that.service) + && Objects.equals(operationName, that.operationName) + && Objects.equals(serviceSource, that.serviceSource) + && Objects.equals(type, that.type) + && Objects.equals(spanKind, that.spanKind) + && Arrays.equals(peerTagNames, that.peerTagNames) + && Arrays.equals(peerTagValues, that.peerTagValues) + && Objects.equals(httpMethod, that.httpMethod) + && Objects.equals(httpEndpoint, that.httpEndpoint) + && Objects.equals(grpcStatusCode, that.grpcStatusCode); + } + + @Override + public int hashCode() { + return (int) keyHash; + } + + // ----- helpers ----- + + private static UTF8BytesString canonicalize( + DDCache cache, CharSequence charSeq) { + if (charSeq == null) { + return EMPTY; + } + if (charSeq instanceof UTF8BytesString) { + return (UTF8BytesString) charSeq; + } + return cache.computeIfAbsent(charSeq.toString(), UTF8BytesString::create); + } + + /** + * UTF8 vs raw CharSequence content-equality, no allocation in the common (String) case. + * + *

Treats {@code null} and empty (length 0) as equivalent on either side. This matches the + * canonicalization semantics: {@link #canonicalize} maps a {@code null} input to {@link + * UTF8BytesString#EMPTY}, so an entry built from a snapshot with a null field needs to match a + * subsequent snapshot whose field is still null. {@code intHash(null) == 0 == "".hashCode()}, so + * the hash already agrees with this view. + */ + private static boolean contentEquals(UTF8BytesString a, CharSequence b) { + if (a == null) { + return b == null || b.length() == 0; + } + if (b == null) { + return a.length() == 0; + } + // UTF8BytesString.toString() returns the underlying String -- O(1), no allocation. + String aStr = a.toString(); + if (b instanceof String) { + return aStr.equals(b); + } + if (b instanceof UTF8BytesString) { + return aStr.equals(b.toString()); + } + return aStr.contentEquals(b); + } + + private static boolean stringContentEquals(UTF8BytesString a, String b) { + if (a == null) { + return b == null || b.isEmpty(); + } + if (b == null) { + return a.length() == 0; + } + return a.toString().equals(b); + } + + /** + * Encodes the per-span peer-tag values into the {@code List} the serializer + * consumes. Reads name/value pairs at the same index from the schema's names and the snapshot's + * values; null value slots are skipped (the span didn't set that peer tag). Counts hits once for + * exact-size allocation and preserves the singletonList fast path for the common one-entry case + * (e.g. internal-kind base.service). + */ + private static List materializePeerTags( + @Nullable String[] names, @Nullable String[] values) { + if (names == null || values == null) { + return Collections.emptyList(); + } + int n = names.length; + int firstHit = -1; + int hitCount = 0; + for (int i = 0; i < n; i++) { + if (values[i] != null) { + if (hitCount == 0) firstHit = i; + hitCount++; + } + } + if (hitCount == 0) { + return Collections.emptyList(); + } + if (hitCount == 1) { + return Collections.singletonList(encodePeerTag(names[firstHit], values[firstHit])); + } + List tags = new ArrayList<>(hitCount); + for (int i = firstHit; i < n; i++) { + if (values[i] != null) { + tags.add(encodePeerTag(names[i], values[i])); + } + } + return tags; + } + + private static UTF8BytesString encodePeerTag(String name, String value) { + final Pair, Function> + cacheAndCreator = PEER_TAGS_CACHE.computeIfAbsent(name, PEER_TAGS_CACHE_ADDER); + return cacheAndCreator.getLeft().computeIfAbsent(value, cacheAndCreator.getRight()); + } +} diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateMetric.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateMetric.java deleted file mode 100644 index dba66a5ab9c..00000000000 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateMetric.java +++ /dev/null @@ -1,103 +0,0 @@ -package datadog.trace.common.metrics; - -import datadog.metrics.api.Histogram; -import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; -import java.util.concurrent.atomic.AtomicLongArray; - -/** Not thread-safe. Accumulates counts and durations. */ -@SuppressFBWarnings( - value = {"AT_NONATOMIC_OPERATIONS_ON_SHARED_VARIABLE", "AT_STALE_THREAD_WRITE_OF_PRIMITIVE"}, - justification = "Explicitly not thread-safe. Accumulates counts and durations.") -public final class AggregateMetric { - - static final long ERROR_TAG = 0x8000000000000000L; - static final long TOP_LEVEL_TAG = 0x4000000000000000L; - - private final Histogram okLatencies; - private final Histogram errorLatencies; - private int errorCount; - private int hitCount; - private int topLevelCount; - private long duration; - - public AggregateMetric() { - okLatencies = Histogram.newHistogram(); - errorLatencies = Histogram.newHistogram(); - } - - public AggregateMetric recordDurations(int count, AtomicLongArray durations) { - this.hitCount += count; - for (int i = 0; i < count && i < durations.length(); ++i) { - long duration = durations.getAndSet(i, 0); - if ((duration & TOP_LEVEL_TAG) == TOP_LEVEL_TAG) { - duration ^= TOP_LEVEL_TAG; - ++topLevelCount; - } - if ((duration & ERROR_TAG) == ERROR_TAG) { - // then it's an error - duration ^= ERROR_TAG; - errorLatencies.accept(duration); - ++errorCount; - } else { - okLatencies.accept(duration); - } - this.duration += duration; - } - return this; - } - - /** - * Records a single hit. {@code tagAndDuration} carries the duration nanos with optional {@link - * #ERROR_TAG} / {@link #TOP_LEVEL_TAG} bits OR-ed in. - */ - public AggregateMetric recordOneDuration(long tagAndDuration) { - ++hitCount; - if ((tagAndDuration & TOP_LEVEL_TAG) == TOP_LEVEL_TAG) { - tagAndDuration ^= TOP_LEVEL_TAG; - ++topLevelCount; - } - if ((tagAndDuration & ERROR_TAG) == ERROR_TAG) { - tagAndDuration ^= ERROR_TAG; - errorLatencies.accept(tagAndDuration); - ++errorCount; - } else { - okLatencies.accept(tagAndDuration); - } - duration += tagAndDuration; - return this; - } - - public int getErrorCount() { - return errorCount; - } - - public int getHitCount() { - return hitCount; - } - - public int getTopLevelCount() { - return topLevelCount; - } - - public long getDuration() { - return duration; - } - - public Histogram getOkLatencies() { - return okLatencies; - } - - public Histogram getErrorLatencies() { - return errorLatencies; - } - - @SuppressFBWarnings("AT_NONATOMIC_64BIT_PRIMITIVE") - public void clear() { - this.errorCount = 0; - this.hitCount = 0; - this.topLevelCount = 0; - this.duration = 0; - this.okLatencies.clear(); - this.errorLatencies.clear(); - } -} diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java new file mode 100644 index 00000000000..ffa6924f0ea --- /dev/null +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/AggregateTable.java @@ -0,0 +1,136 @@ +package datadog.trace.common.metrics; + +import datadog.trace.util.Hashtable; +import datadog.trace.util.Hashtable.MutatingTableIterator; +import datadog.trace.util.Hashtable.Support; +import java.util.function.BiConsumer; +import java.util.function.Consumer; + +/** + * Consumer-side {@link AggregateEntry} store, keyed on the raw fields of a {@link SpanSnapshot}. + * + *

Replaces the prior {@code LRUCache}. The win is on the + * steady-state hit path: a snapshot lookup is a 64-bit hash compute + bucket walk + field-wise + * {@code matches}, with no per-snapshot {@link AggregateEntry} allocation and no UTF8 cache + * lookups. The UTF8-encoded forms (formerly held on {@code MetricKey}) and the mutable counters + * (formerly held on {@code AggregateMetric}) both live on the {@link AggregateEntry} now, built + * once per unique key at insert time. + * + *

Not thread-safe. The aggregator thread is the sole writer; {@link #clear()} must be + * routed through the inbox rather than called from arbitrary threads. + */ +final class AggregateTable { + + private final Hashtable.Entry[] buckets; + private final int maxAggregates; + private int size; + + /** + * Bucket index where the last {@link #evictOneStale} successfully removed an entry. The next call + * resumes from this bucket so a fast-evicting workload doesn't repeatedly re-walk the same hot + * entries clustered near bucket 0. Reset to {@code 0} by {@link #clear}. + */ + private int evictCursor; + + AggregateTable(int maxAggregates) { + this.buckets = Support.create(maxAggregates, Support.MAX_RATIO); + this.maxAggregates = maxAggregates; + } + + int size() { + return size; + } + + boolean isEmpty() { + return size == 0; + } + + /** + * Returns the {@link AggregateEntry} to update for {@code snapshot}, lazily creating one on miss. + * Returns {@code null} when the table is at capacity and no stale entry can be evicted -- the + * caller should drop the data point in that case. + */ + AggregateEntry findOrInsert(SpanSnapshot snapshot) { + long keyHash = AggregateEntry.hashOf(snapshot); + for (AggregateEntry candidate = Support.bucket(buckets, keyHash); + candidate != null; + candidate = candidate.next()) { + if (candidate.matches(keyHash, snapshot)) { + return candidate; + } + } + if (size >= maxAggregates && !evictOneStale()) { + return null; + } + AggregateEntry entry = AggregateEntry.forSnapshot(snapshot); + Support.insertHeadEntry(buckets, keyHash, entry); + size++; + return entry; + } + + /** + * Unlinks the first entry whose {@code getHitCount() == 0}, resuming the scan from {@link + * #evictCursor} so back-to-back evictions amortize to O(1) per call. Worst case for a single call + * is still O(N) when nearly every entry is hot, but a sustained eviction stream never re-scans + * the hot prefix more than twice across N evictions. + * + *

The semantic intent: at cap with all entries live, drop the new key (reported via {@code + * onStatsAggregateDropped}) rather than evicting an established one. Cap is sized to the + * steady-state working set, so eviction is rare; this cursor optimization handles the + * pathological "persistently at cap" case. + */ + private boolean evictOneStale() { + // Two passes -- [cursor, length) then [0, cursor) -- using the half-open-range iterator. The + // second pass is naturally empty when cursor==0, so no extra check needed. + return evictOneStaleInRange(evictCursor, buckets.length) + || evictOneStaleInRange(0, evictCursor); + } + + /** Scans {@code [startBucket, endBucket)} for the first stale entry and unlinks it. */ + private boolean evictOneStaleInRange(int startBucket, int endBucket) { + MutatingTableIterator iter = + Support.mutatingTableIterator(buckets, startBucket, endBucket); + while (iter.hasNext()) { + AggregateEntry e = iter.next(); + if (e.getHitCount() == 0) { + int bucket = iter.currentBucket(); + iter.remove(); + size--; + evictCursor = bucket; + return true; + } + } + return false; + } + + void forEach(Consumer consumer) { + Support.forEach(buckets, consumer); + } + + /** + * Context-passing forEach. Useful for callers that want to avoid a capturing-lambda allocation on + * each invocation -- pass a non-capturing {@link BiConsumer} (typically a {@code static final}) + * plus whatever side-band state it needs as {@code context}. + */ + void forEach(T context, BiConsumer consumer) { + Support.forEach(buckets, context, consumer); + } + + /** Removes entries whose {@code getHitCount() == 0}. */ + void expungeStaleAggregates() { + for (MutatingTableIterator iter = Support.mutatingTableIterator(buckets); + iter.hasNext(); ) { + AggregateEntry e = iter.next(); + if (e.getHitCount() == 0) { + iter.remove(); + size--; + } + } + } + + void clear() { + Support.clear(buckets); + size = 0; + evictCursor = 0; + } +} diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java index 9998c21ed0b..d809d452522 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/Aggregator.java @@ -1,26 +1,12 @@ package datadog.trace.common.metrics; -import static datadog.trace.api.Functions.UTF8_ENCODE; -import static datadog.trace.common.metrics.ConflatingMetricsAggregator.PEER_TAGS_CACHE; -import static datadog.trace.common.metrics.ConflatingMetricsAggregator.PEER_TAGS_CACHE_ADDER; -import static datadog.trace.common.metrics.ConflatingMetricsAggregator.SERVICE_NAMES; -import static datadog.trace.common.metrics.ConflatingMetricsAggregator.SPAN_KINDS; import static java.util.concurrent.TimeUnit.MILLISECONDS; -import datadog.trace.api.Pair; -import datadog.trace.api.cache.DDCache; -import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; +import datadog.trace.common.metrics.SignalItem.ClearSignal; import datadog.trace.common.metrics.SignalItem.StopSignal; import datadog.trace.core.monitor.HealthMetrics; -import datadog.trace.core.util.LRUCache; import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; -import java.util.Map; import java.util.concurrent.TimeUnit; -import java.util.function.Function; import org.jctools.queues.MessagePassingQueue; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -32,8 +18,9 @@ final class Aggregator implements Runnable { private static final Logger log = LoggerFactory.getLogger(Aggregator.class); private final MessagePassingQueue inbox; - private final LRUCache aggregates; + private final AggregateTable aggregates; private final MetricWriter writer; + private final HealthMetrics healthMetrics; // the reporting interval controls how much history will be buffered // when the agent is unresponsive (only 10 pending requests will be // buffered by OkHttpSink) @@ -85,34 +72,13 @@ final class Aggregator implements Runnable { Runnable onReportCycle) { this.writer = writer; this.inbox = inbox; - this.aggregates = - new LRUCache<>( - new AggregateExpiry(healthMetrics), maxAggregates * 4 / 3, 0.75f, maxAggregates); + this.aggregates = new AggregateTable(maxAggregates); this.reportingIntervalNanos = reportingIntervalTimeUnit.toNanos(reportingInterval); this.sleepMillis = sleepMillis; + this.healthMetrics = healthMetrics; this.onReportCycle = onReportCycle; } - private static final class AggregateExpiry - implements LRUCache.ExpiryListener { - private final HealthMetrics healthMetrics; - - AggregateExpiry(HealthMetrics healthMetrics) { - this.healthMetrics = healthMetrics; - } - - @Override - public void accept(Map.Entry expired) { - if (expired.getValue().getHitCount() > 0) { - healthMetrics.onStatsAggregateDropped(); - } - } - } - - public void clearAggregates() { - this.aggregates.clear(); - } - @Override public void run() { Thread currentThread = Thread.currentThread(); @@ -139,7 +105,31 @@ private final class Drainer implements MessagePassingQueue.Consumer { @Override public void accept(InboxItem item) { - if (item instanceof SignalItem) { + if (item == ClearSignal.CLEAR) { + // ClearSignal is routed through the inbox (rather than letting the caller mutate + // AggregateTable directly) so the aggregator thread stays the sole writer. AggregateTable + // is not thread-safe; a direct clear() from e.g. the OkHttpSink callback thread would + // race with Drainer.accept on this thread. + // + // We deliberately do NOT call inbox.clear() here. Doing so would erase any queued STOP + // (or REPORT) signals that happen to sit behind CLEAR -- a real concern when a + // downgrade is followed quickly by close(), where the trampled STOP leaves the + // aggregator thread spinning until thread.join times out. features.supportsMetrics() is + // already false by the time CLEAR was offered, so producers have stopped publishing; + // any in-flight snapshots will drain naturally into the just-cleared table, get + // re-aggregated, and flushed on the next report -- where the agent rejects them again, + // triggering another DOWNGRADED -> disable() -> CLEAR cycle. Worst case: one extra + // reporting cycle of wasted work, which we accept for the safety of preserving STOP. + if (!stopped) { + aggregates.clear(); + // Clear dirty too -- without this, the next report() would see dirty=true, run + // expungeStaleAggregates against the (now-empty) table, find isEmpty()=true, and skip + // the flush anyway. Same observable outcome, but resetting here keeps the invariant + // "dirty implies there's data to flush" honest. + dirty = false; + } + ((SignalItem) item).complete(); + } else if (item instanceof SignalItem) { SignalItem signal = (SignalItem) item; if (!stopped) { report(wallClockTime(), signal); @@ -152,74 +142,16 @@ public void accept(InboxItem item) { } } else if (item instanceof SpanSnapshot && !stopped) { SpanSnapshot snapshot = (SpanSnapshot) item; - MetricKey key = buildMetricKey(snapshot); - AggregateMetric aggregate = aggregates.computeIfAbsent(key, k -> new AggregateMetric()); - aggregate.recordOneDuration(snapshot.tagAndDuration); - dirty = true; - } - } - } - - private static MetricKey buildMetricKey(SpanSnapshot s) { - return new MetricKey( - s.resourceName, - SERVICE_NAMES.computeIfAbsent(s.serviceName, UTF8_ENCODE), - s.operationName, - s.serviceNameSource, - s.spanType, - s.httpStatusCode, - s.synthetic, - s.traceRoot, - SPAN_KINDS.computeIfAbsent(s.spanKind, UTF8BytesString::create), - materializePeerTags(s.peerTagSchema, s.peerTagValues), - s.httpMethod, - s.httpEndpoint, - s.grpcStatusCode); - } - - /** - * Encodes the per-span peer-tag values into the {@code List} the {@link - * MetricKey} consumes. Reads name/value pairs at the same index from the schema's names and the - * snapshot's values; null value slots are skipped (the span didn't set that peer tag). - */ - private static List materializePeerTags(PeerTagSchema schema, String[] values) { - if (schema == null || values == null) { - return Collections.emptyList(); - } - String[] names = schema.names; - int n = names.length; - // First pass: count how many tags fired and remember the first index. The single-entry case - // is common (e.g. INTERNAL spans only emit base.service) and gets a singletonList to avoid an - // ArrayList allocation on the hot path. - int firstHit = -1; - int hitCount = 0; - for (int i = 0; i < n; i++) { - if (values[i] != null) { - if (hitCount == 0) { - firstHit = i; + AggregateEntry entry = aggregates.findOrInsert(snapshot); + if (entry != null) { + entry.recordOneDuration(snapshot.tagAndDuration); + dirty = true; + } else { + // table at cap with no stale entry available to evict + healthMetrics.onStatsAggregateDropped(); } - hitCount++; - } - } - if (hitCount == 0) { - return Collections.emptyList(); - } - if (hitCount == 1) { - return Collections.singletonList(encodePeerTag(names[firstHit], values[firstHit])); - } - List tags = new ArrayList<>(hitCount); - for (int i = firstHit; i < n; i++) { - if (values[i] != null) { - tags.add(encodePeerTag(names[i], values[i])); } } - return tags; - } - - private static UTF8BytesString encodePeerTag(String name, String value) { - final Pair, Function> - cacheAndCreator = PEER_TAGS_CACHE.computeIfAbsent(name, PEER_TAGS_CACHE_ADDER); - return cacheAndCreator.getLeft().computeIfAbsent(value, cacheAndCreator.getRight()); } private void report(long when, SignalItem signal) { @@ -234,14 +166,16 @@ private void report(long when, SignalItem signal) { boolean skipped = true; if (dirty) { try { - expungeStaleAggregates(); + aggregates.expungeStaleAggregates(); if (!aggregates.isEmpty()) { skipped = false; writer.startBucket(aggregates.size(), when, reportingIntervalNanos); - for (Map.Entry aggregate : aggregates.entrySet()) { - writer.add(aggregate.getKey(), aggregate.getValue()); - aggregate.getValue().clear(); - } + aggregates.forEach( + writer, + (w, entry) -> { + w.add(entry); + entry.clear(); + }); // note that this may do IO and block writer.finishBucket(); } @@ -257,17 +191,6 @@ private void report(long when, SignalItem signal) { } } - private void expungeStaleAggregates() { - Iterator> it = aggregates.entrySet().iterator(); - while (it.hasNext()) { - Map.Entry pair = it.next(); - AggregateMetric metric = pair.getValue(); - if (metric.getHitCount() == 0) { - it.remove(); - } - } - } - private long wallClockTime() { return MILLISECONDS.toNanos(System.currentTimeMillis()); } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java index dc5d698bcc1..895ee434854 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java @@ -5,8 +5,9 @@ import static datadog.trace.bootstrap.instrumentation.api.Tags.HTTP_ENDPOINT; import static datadog.trace.bootstrap.instrumentation.api.Tags.HTTP_METHOD; import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND; -import static datadog.trace.common.metrics.AggregateMetric.ERROR_TAG; -import static datadog.trace.common.metrics.AggregateMetric.TOP_LEVEL_TAG; +import static datadog.trace.common.metrics.AggregateEntry.ERROR_TAG; +import static datadog.trace.common.metrics.AggregateEntry.TOP_LEVEL_TAG; +import static datadog.trace.common.metrics.SignalItem.ClearSignal.CLEAR; import static datadog.trace.common.metrics.SignalItem.ReportSignal.REPORT; import static datadog.trace.common.metrics.SignalItem.StopSignal.STOP; import static datadog.trace.util.AgentThreadFactory.AgentThread.METRICS_AGGREGATOR; @@ -18,12 +19,8 @@ import datadog.communication.ddagent.DDAgentFeaturesDiscovery; import datadog.communication.ddagent.SharedCommunicationObjects; import datadog.trace.api.Config; -import datadog.trace.api.Pair; import datadog.trace.api.WellKnownTags; -import datadog.trace.api.cache.DDCache; -import datadog.trace.api.cache.DDCaches; import datadog.trace.bootstrap.instrumentation.api.InstrumentationTags; -import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; import datadog.trace.common.metrics.SignalItem.ReportSignal; import datadog.trace.common.writer.ddagent.DDAgentApi; import datadog.trace.core.CoreSpan; @@ -39,7 +36,6 @@ import java.util.concurrent.CompletableFuture; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; -import java.util.function.Function; import org.jctools.queues.MessagePassingQueue; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -51,22 +47,6 @@ public final class ConflatingMetricsAggregator implements MetricsAggregator, Eve private static final Map DEFAULT_HEADERS = Collections.singletonMap(DDAgentApi.DATADOG_META_TRACER_VERSION, DDTraceCoreInfo.VERSION); - static final DDCache SERVICE_NAMES = DDCaches.newFixedSizeCache(32); - - static final DDCache SPAN_KINDS = DDCaches.newFixedSizeCache(16); - static final DDCache< - String, Pair, Function>> - PEER_TAGS_CACHE = - DDCaches.newFixedSizeCache( - 64); // it can be unbounded since those values are returned by the agent and should be - // under control. 64 entries is enough in this case to contain all the peer tags. - static final Function< - String, Pair, Function>> - PEER_TAGS_CACHE_ADDER = - key -> - Pair.of( - DDCaches.newFixedSizeCache(512), - value -> UTF8BytesString.create(key + ":" + value)); private static final CharSequence SYNTHETICS_ORIGIN = "synthetics"; private static final SpanKindFilter METRICS_ELIGIBLE_KINDS = @@ -532,8 +512,17 @@ private void disable() { features.discover(); if (!features.supportsMetrics()) { log.debug("Disabling metric reporting because an agent downgrade was detected"); - this.inbox.clear(); - this.aggregator.clearAggregates(); + // Route the clear through the inbox so the aggregator thread is the only writer. + // AggregateTable is not thread-safe; mutating it directly from this thread would race + // with Drainer.accept on the aggregator thread. + // + // Best-effort single offer rather than the retry-loop pattern in report(). If the inbox is + // full at downgrade time the clear is dropped, but the system self-heals: features.discover() + // already flipped supportsMetrics() false, so producer publish() calls now skip the inbox; + // the aggregator drains existing snapshots and ships them on the next report cycle; the + // sink rejects that payload and fires DOWNGRADED again, which retries disable() against a + // now-empty inbox. Worst case: one extra reporting cycle of stale data. + inbox.offer(CLEAR); } } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/InboxItem.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/InboxItem.java index 7d66cad6a15..a0625be095b 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/InboxItem.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/InboxItem.java @@ -28,4 +28,15 @@ private StopSignal() {} static final class ReportSignal extends SignalItem { static final ReportSignal REPORT = new ReportSignal(); } + + /** + * Posted from arbitrary threads (e.g. the Sink event thread during agent downgrade) so the + * aggregator thread is the one that actually performs the table reset. Keeps {@link + * AggregateTable} and {@code inbox.clear()} single-writer. + */ + static final class ClearSignal extends SignalItem { + static final ClearSignal CLEAR = new ClearSignal(); + + private ClearSignal() {} + } } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricKey.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricKey.java deleted file mode 100644 index 9e2e2098d1f..00000000000 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricKey.java +++ /dev/null @@ -1,178 +0,0 @@ -package datadog.trace.common.metrics; - -import static datadog.trace.bootstrap.instrumentation.api.UTF8BytesString.EMPTY; - -import datadog.trace.api.cache.DDCache; -import datadog.trace.api.cache.DDCaches; -import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; -import datadog.trace.util.HashingUtils; -import java.util.Collections; -import java.util.List; -import java.util.Objects; - -/** The aggregation key for tracked metrics. */ -public final class MetricKey { - static final DDCache RESOURCE_CACHE = DDCaches.newFixedSizeCache(32); - static final DDCache SERVICE_CACHE = DDCaches.newFixedSizeCache(8); - static final DDCache SERVICE_SOURCE_CACHE = - DDCaches.newFixedSizeCache(16); - static final DDCache OPERATION_CACHE = DDCaches.newFixedSizeCache(64); - static final DDCache TYPE_CACHE = DDCaches.newFixedSizeCache(8); - static final DDCache KIND_CACHE = DDCaches.newFixedSizeCache(8); - static final DDCache HTTP_METHOD_CACHE = DDCaches.newFixedSizeCache(8); - static final DDCache HTTP_ENDPOINT_CACHE = - DDCaches.newFixedSizeCache(32); - static final DDCache GRPC_STATUS_CODE_CACHE = - DDCaches.newFixedSizeCache(32); - - private final UTF8BytesString resource; - private final UTF8BytesString service; - private final UTF8BytesString serviceSource; - private final UTF8BytesString operationName; - private final UTF8BytesString type; - private final int httpStatusCode; - private final boolean synthetics; - private final int hash; - private final boolean isTraceRoot; - private final UTF8BytesString spanKind; - private final List peerTags; - private final UTF8BytesString httpMethod; - private final UTF8BytesString httpEndpoint; - private final UTF8BytesString grpcStatusCode; - - public MetricKey( - CharSequence resource, - CharSequence service, - CharSequence operationName, - CharSequence serviceSource, - CharSequence type, - int httpStatusCode, - boolean synthetics, - boolean isTraceRoot, - CharSequence spanKind, - List peerTags, - CharSequence httpMethod, - CharSequence httpEndpoint, - CharSequence grpcStatusCode) { - this.resource = null == resource ? EMPTY : utf8(RESOURCE_CACHE, resource); - this.service = null == service ? EMPTY : utf8(SERVICE_CACHE, service); - this.serviceSource = null == serviceSource ? null : utf8(SERVICE_SOURCE_CACHE, serviceSource); - this.operationName = null == operationName ? EMPTY : utf8(OPERATION_CACHE, operationName); - this.type = null == type ? EMPTY : utf8(TYPE_CACHE, type); - this.httpStatusCode = httpStatusCode; - this.synthetics = synthetics; - this.isTraceRoot = isTraceRoot; - this.spanKind = null == spanKind ? EMPTY : utf8(KIND_CACHE, spanKind); - this.peerTags = peerTags == null ? Collections.emptyList() : peerTags; - this.httpMethod = httpMethod == null ? null : utf8(HTTP_METHOD_CACHE, httpMethod); - this.httpEndpoint = httpEndpoint == null ? null : utf8(HTTP_ENDPOINT_CACHE, httpEndpoint); - this.grpcStatusCode = - grpcStatusCode == null ? null : utf8(GRPC_STATUS_CODE_CACHE, grpcStatusCode); - - int tmpHash = 0; - tmpHash = HashingUtils.addToHash(tmpHash, this.isTraceRoot); - tmpHash = HashingUtils.addToHash(tmpHash, this.spanKind); - tmpHash = HashingUtils.addToHash(tmpHash, this.peerTags); - tmpHash = HashingUtils.addToHash(tmpHash, this.resource); - tmpHash = HashingUtils.addToHash(tmpHash, this.service); - tmpHash = HashingUtils.addToHash(tmpHash, this.operationName); - tmpHash = HashingUtils.addToHash(tmpHash, this.type); - tmpHash = HashingUtils.addToHash(tmpHash, this.httpStatusCode); - tmpHash = HashingUtils.addToHash(tmpHash, this.synthetics); - tmpHash = HashingUtils.addToHash(tmpHash, this.serviceSource); - tmpHash = HashingUtils.addToHash(tmpHash, this.httpEndpoint); - tmpHash = HashingUtils.addToHash(tmpHash, this.httpMethod); - tmpHash = HashingUtils.addToHash(tmpHash, this.grpcStatusCode); - this.hash = tmpHash; - } - - static UTF8BytesString utf8(DDCache cache, CharSequence charSeq) { - if (charSeq instanceof UTF8BytesString) { - return (UTF8BytesString) charSeq; - } else { - return cache.computeIfAbsent(charSeq.toString(), UTF8BytesString::create); - } - } - - public UTF8BytesString getResource() { - return resource; - } - - public UTF8BytesString getService() { - return service; - } - - public UTF8BytesString getServiceSource() { - return serviceSource; - } - - public UTF8BytesString getOperationName() { - return operationName; - } - - public UTF8BytesString getType() { - return type; - } - - public int getHttpStatusCode() { - return httpStatusCode; - } - - public boolean isSynthetics() { - return synthetics; - } - - public boolean isTraceRoot() { - return isTraceRoot; - } - - public UTF8BytesString getSpanKind() { - return spanKind; - } - - public List getPeerTags() { - return peerTags; - } - - public UTF8BytesString getHttpMethod() { - return httpMethod; - } - - public UTF8BytesString getHttpEndpoint() { - return httpEndpoint; - } - - public UTF8BytesString getGrpcStatusCode() { - return grpcStatusCode; - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if ((o instanceof MetricKey)) { - MetricKey metricKey = (MetricKey) o; - return hash == metricKey.hash - && synthetics == metricKey.synthetics - && httpStatusCode == metricKey.httpStatusCode - && resource.equals(metricKey.resource) - && service.equals(metricKey.service) - && operationName.equals(metricKey.operationName) - && type.equals(metricKey.type) - && isTraceRoot == metricKey.isTraceRoot - && spanKind.equals(metricKey.spanKind) - && peerTags.equals(metricKey.peerTags) - && Objects.equals(serviceSource, metricKey.serviceSource) - && Objects.equals(httpMethod, metricKey.httpMethod) - && Objects.equals(httpEndpoint, metricKey.httpEndpoint) - && Objects.equals(grpcStatusCode, metricKey.grpcStatusCode); - } - return false; - } - - @Override - public int hashCode() { - return hash; - } -} diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricWriter.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricWriter.java index fa26ed2e5db..905ba498760 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricWriter.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricWriter.java @@ -3,7 +3,11 @@ public interface MetricWriter { void startBucket(int metricCount, long start, long duration); - void add(MetricKey key, AggregateMetric aggregate); + /** + * Serialize one aggregate. The {@link AggregateEntry} carries both the label fields (resource, + * service, span.kind, peer tags, etc.) and the counters being reported. + */ + void add(AggregateEntry entry); void finishBucket(); diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java index 4821d1b33a4..a37c8c2f2a0 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/PeerTagSchema.java @@ -3,6 +3,7 @@ import static datadog.trace.api.DDTags.BASE_SERVICE; import datadog.communication.ddagent.DDAgentFeaturesDiscovery; +import java.util.Arrays; import java.util.Set; /** @@ -52,6 +53,15 @@ final class PeerTagSchema { */ String state; + /** + * Lazily computed content hash of {@link #names}, used as the bucket-distinguishing contribution + * when {@link AggregateEntry#hashOf} hashes a snapshot's peer-tag schema. Benign race pattern: a + * concurrent first-time read may recompute the value, but {@link Arrays#hashCode(Object[])} on + * the same content array is deterministic so the recomputed value matches. {@code int} writes are + * atomic per JLS. + */ + private int cachedHashCode; + private PeerTagSchema(String[] names, String state) { this.names = names; this.state = state; @@ -91,4 +101,36 @@ boolean hasSameTagsAs(Set other) { int size() { return names.length; } + + /** + * Content-based hash of {@link #names}. Used by {@link AggregateEntry#hashOf} to incorporate the + * schema identity into a snapshot's lookup hash. Distinct schemas with the same names hash to the + * same value so an entry built under one schema instance still matches a snapshot pinned to a + * content-equal replacement (e.g. after reconcile rebuilds the schema). + */ + @Override + public int hashCode() { + int h = cachedHashCode; + if (h == 0) { + h = Arrays.hashCode(names); + cachedHashCode = h; + } + return h; + } + + /** + * Content equality on {@link #names}. {@link #state} is intentionally excluded: it is a + * reconcile-bookkeeping field, not part of the schema's identity. Two schemas built from the same + * tag list at different discovery snapshots represent the same schema. + */ + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof PeerTagSchema)) { + return false; + } + return Arrays.equals(names, ((PeerTagSchema) o).names); + } } diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SerializingMetricWriter.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SerializingMetricWriter.java index 0f84964e9db..7644ebaf044 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SerializingMetricWriter.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SerializingMetricWriter.java @@ -142,12 +142,12 @@ public void startBucket(int metricCount, long start, long duration) { } @Override - public void add(MetricKey key, AggregateMetric aggregate) { + public void add(AggregateEntry entry) { // Calculate dynamic map size based on optional fields - final boolean hasHttpMethod = key.getHttpMethod() != null; - final boolean hasHttpEndpoint = key.getHttpEndpoint() != null; - final boolean hasServiceSource = key.getServiceSource() != null; - final boolean hasGrpcStatusCode = key.getGrpcStatusCode() != null; + final boolean hasHttpMethod = entry.getHttpMethod() != null; + final boolean hasHttpEndpoint = entry.getHttpEndpoint() != null; + final boolean hasServiceSource = entry.getServiceSource() != null; + final boolean hasGrpcStatusCode = entry.getGrpcStatusCode() != null; final int mapSize = 15 + (hasServiceSource ? 1 : 0) @@ -158,31 +158,31 @@ public void add(MetricKey key, AggregateMetric aggregate) { writer.startMap(mapSize); writer.writeUTF8(NAME); - writer.writeUTF8(key.getOperationName()); + writer.writeUTF8(entry.getOperationName()); writer.writeUTF8(SERVICE); - writer.writeUTF8(key.getService()); + writer.writeUTF8(entry.getService()); writer.writeUTF8(RESOURCE); - writer.writeUTF8(key.getResource()); + writer.writeUTF8(entry.getResource()); writer.writeUTF8(TYPE); - writer.writeUTF8(key.getType()); + writer.writeUTF8(entry.getType()); writer.writeUTF8(HTTP_STATUS_CODE); - writer.writeInt(key.getHttpStatusCode()); + writer.writeInt(entry.getHttpStatusCode()); writer.writeUTF8(SYNTHETICS); - writer.writeBoolean(key.isSynthetics()); + writer.writeBoolean(entry.isSynthetics()); writer.writeUTF8(IS_TRACE_ROOT); - writer.writeInt(key.isTraceRoot() ? TRISTATE_TRUE : TRISTATE_FALSE); + writer.writeInt(entry.isTraceRoot() ? TRISTATE_TRUE : TRISTATE_FALSE); writer.writeUTF8(SPAN_KIND); - writer.writeUTF8(key.getSpanKind()); + writer.writeUTF8(entry.getSpanKind()); writer.writeUTF8(PEER_TAGS); - final List peerTags = key.getPeerTags(); + final List peerTags = entry.getPeerTags(); writer.startArray(peerTags.size()); for (UTF8BytesString peerTag : peerTags) { @@ -191,43 +191,43 @@ public void add(MetricKey key, AggregateMetric aggregate) { if (hasServiceSource) { writer.writeUTF8(SERVICE_SOURCE); - writer.writeUTF8(key.getServiceSource()); + writer.writeUTF8(entry.getServiceSource()); } // Only include HTTPMethod if present if (hasHttpMethod) { writer.writeUTF8(HTTP_METHOD); - writer.writeUTF8(key.getHttpMethod()); + writer.writeUTF8(entry.getHttpMethod()); } // Only include HTTPEndpoint if present if (hasHttpEndpoint) { writer.writeUTF8(HTTP_ENDPOINT); - writer.writeUTF8(key.getHttpEndpoint()); + writer.writeUTF8(entry.getHttpEndpoint()); } // Only include GRPCStatusCode if present (rpc-type spans) if (hasGrpcStatusCode) { writer.writeUTF8(GRPC_STATUS_CODE); - writer.writeUTF8(key.getGrpcStatusCode()); + writer.writeUTF8(entry.getGrpcStatusCode()); } writer.writeUTF8(HITS); - writer.writeInt(aggregate.getHitCount()); + writer.writeInt(entry.getHitCount()); writer.writeUTF8(ERRORS); - writer.writeInt(aggregate.getErrorCount()); + writer.writeInt(entry.getErrorCount()); writer.writeUTF8(TOP_LEVEL_HITS); - writer.writeInt(aggregate.getTopLevelCount()); + writer.writeInt(entry.getTopLevelCount()); writer.writeUTF8(DURATION); - writer.writeLong(aggregate.getDuration()); + writer.writeLong(entry.getDuration()); writer.writeUTF8(OK_SUMMARY); - writer.writeBinary(aggregate.getOkLatencies().serialize()); + writer.writeBinary(entry.getOkLatencies().serialize()); writer.writeUTF8(ERROR_SUMMARY); - writer.writeBinary(aggregate.getErrorLatencies().serialize()); + writer.writeBinary(entry.getErrorLatencies().serialize()); } @Override diff --git a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java index eb9b741cea6..152ac42bb55 100644 --- a/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java +++ b/dd-trace-core/src/main/java/datadog/trace/common/metrics/SpanSnapshot.java @@ -2,7 +2,8 @@ /** * Immutable per-span value posted from the producer to the aggregator thread. Carries the raw - * inputs the aggregator needs to build a {@link MetricKey} and update an {@link AggregateMetric}. + * inputs the aggregator needs to look up or build an {@link AggregateEntry} and update its + * counters. * *

All cache-canonicalization (service-name, span-kind, peer-tag string interning) happens on the * aggregator thread; the producer just shuffles references. diff --git a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/AggregateMetricTest.groovy b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/AggregateMetricTest.groovy deleted file mode 100644 index 140149d8324..00000000000 --- a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/AggregateMetricTest.groovy +++ /dev/null @@ -1,105 +0,0 @@ -package datadog.trace.common.metrics - -import datadog.metrics.agent.AgentMeter -import datadog.metrics.impl.DDSketchHistograms -import datadog.metrics.impl.MonitoringImpl -import datadog.metrics.api.statsd.StatsDClient -import datadog.trace.test.util.DDSpecification - -import java.util.concurrent.TimeUnit -import java.util.concurrent.atomic.AtomicLongArray - -import static datadog.trace.common.metrics.AggregateMetric.ERROR_TAG -import static datadog.trace.common.metrics.AggregateMetric.TOP_LEVEL_TAG - -class AggregateMetricTest extends DDSpecification { - - def setupSpec() { - // Initialize AgentMeter with monitoring - this is the standard mechanism used in production - def monitoring = new MonitoringImpl(StatsDClient.NO_OP, 1, TimeUnit.SECONDS) - AgentMeter.registerIfAbsent(StatsDClient.NO_OP, monitoring, DDSketchHistograms.FACTORY) - // Create a timer to trigger DDSketchHistograms loading and Factory registration - // This simulates what happens during CoreTracer initialization (traceWriteTimer) - monitoring.newTimer("test.init") - } - - def "record durations sums up to total"() { - given: - AggregateMetric aggregate = new AggregateMetric() - when: - aggregate.recordDurations(3, new AtomicLongArray(1, 2, 3)) - then: - aggregate.getDuration() == 6 - } - - def "total durations include errors"() { - given: - AggregateMetric aggregate = new AggregateMetric() - when: - aggregate.recordDurations(3, new AtomicLongArray(1, 2, 3)) - then: - aggregate.getDuration() == 6 - } - - def "clear"() { - given: - AggregateMetric aggregate = new AggregateMetric() - .recordDurations(3, new AtomicLongArray(5, ERROR_TAG | 6, TOP_LEVEL_TAG | 7)) - when: - aggregate.clear() - then: - aggregate.getDuration() == 0 - aggregate.getErrorCount() == 0 - aggregate.getTopLevelCount() == 0 - aggregate.getHitCount() == 0 - } - - def "recordOneDuration accumulates ok and error and top-level"() { - given: - AggregateMetric aggregate = new AggregateMetric() - .recordOneDuration(10L) - .recordOneDuration(10L | TOP_LEVEL_TAG) - .recordOneDuration(10L | ERROR_TAG) - - expect: - aggregate.getHitCount() == 3 - aggregate.getDuration() == 30 - aggregate.getErrorCount() == 1 - aggregate.getTopLevelCount() == 1 - } - - def "ignore trailing zeros"() { - given: - AggregateMetric aggregate = new AggregateMetric() - when: - aggregate.recordDurations(3, new AtomicLongArray(1, 2, 3, 0, 0, 0)) - then: - aggregate.getDuration() == 6 - aggregate.getHitCount() == 3 - aggregate.getErrorCount() == 0 - } - - def "hit count includes errors"() { - given: - AggregateMetric aggregate = new AggregateMetric() - when: - aggregate.recordDurations(3, new AtomicLongArray(1, 2, 3 | ERROR_TAG)) - then: - aggregate.getHitCount() == 3 - aggregate.getErrorCount() == 1 - } - - def "ok and error durations tracked separately"() { - given: - AggregateMetric aggregate = new AggregateMetric() - when: - aggregate.recordDurations(10, - new AtomicLongArray(1, 100 | ERROR_TAG, 2, 99 | ERROR_TAG, 3, - 98 | ERROR_TAG, 4, 97 | ERROR_TAG)) - then: - def errorLatencies = aggregate.getErrorLatencies() - def okLatencies = aggregate.getOkLatencies() - errorLatencies.getMaxValue() >= 99 - okLatencies.getMaxValue() <= 5 - } -} diff --git a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ConflatingMetricAggregatorTest.groovy b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ConflatingMetricAggregatorTest.groovy index a95f6bcbdbc..c49b98dc351 100644 --- a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ConflatingMetricAggregatorTest.groovy +++ b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/ConflatingMetricAggregatorTest.groovy @@ -119,7 +119,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { then: latchTriggered 1 * writer.startBucket(1, _, _) - 1 * writer.add(new MetricKey( + 1 * writer.add(AggregateEntries.of( null, "service", "operation", @@ -133,8 +133,8 @@ class ConflatingMetricAggregatorTest extends DDSpecification { null, null, null - ), _) >> { MetricKey key, AggregateMetric value -> - value.getHitCount() == 1 && value.getTopLevelCount() == 1 && value.getDuration() == 100 + )) >> { AggregateEntry e -> + assert e.getHitCount() == 1 && e.getTopLevelCount() == 1 && e.getDuration() == 100 } 1 * writer.finishBucket() >> { latch.countDown() } @@ -165,7 +165,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { then: latchTriggered 1 * writer.startBucket(1, _, _) - 1 * writer.add(new MetricKey( + 1 * writer.add(AggregateEntries.of( "resource", "service", "operation", @@ -179,8 +179,8 @@ class ConflatingMetricAggregatorTest extends DDSpecification { null, null, null - ), _) >> { MetricKey key, AggregateMetric value -> - value.getHitCount() == 1 && value.getTopLevelCount() == 1 && value.getDuration() == 100 + )) >> { AggregateEntry e -> + assert e.getHitCount() == 1 && e.getTopLevelCount() == 1 && e.getDuration() == 100 } 1 * writer.finishBucket() >> { latch.countDown() } @@ -217,7 +217,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { latchTriggered == statsComputed (statsComputed ? 1 : 0) * writer.startBucket(1, _, _) (statsComputed ? 1 : 0) * writer.add( - new MetricKey( + AggregateEntries.of( "resource", "service", "operation", @@ -231,9 +231,9 @@ class ConflatingMetricAggregatorTest extends DDSpecification { httpMethod, httpEndpoint, null - ), { AggregateMetric aggregateMetric -> - aggregateMetric.getHitCount() == 1 && aggregateMetric.getTopLevelCount() == 0 && aggregateMetric.getDuration() == 100 - }) + )) >> { AggregateEntry e -> + assert e.getHitCount() == 1 && e.getTopLevelCount() == 0 && e.getDuration() == 100 + } (statsComputed ? 1 : 0) * writer.finishBucket() >> { latch.countDown() } cleanup: @@ -294,7 +294,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { cycle1Triggered cycle2Triggered 1 * writer.add( - new MetricKey( + AggregateEntries.of( "resource", "service", "operation", @@ -308,11 +308,11 @@ class ConflatingMetricAggregatorTest extends DDSpecification { null, null, null - ), { AggregateMetric aggregateMetric -> - aggregateMetric.getHitCount() == 1 && aggregateMetric.getTopLevelCount() == 0 && aggregateMetric.getDuration() == 100 - }) + )) >> { AggregateEntry e -> + assert e.getHitCount() == 1 && e.getTopLevelCount() == 0 && e.getDuration() == 100 + } 1 * writer.add( - new MetricKey( + AggregateEntries.of( "resource", "service", "operation", @@ -326,9 +326,9 @@ class ConflatingMetricAggregatorTest extends DDSpecification { null, null, null - ), { AggregateMetric aggregateMetric -> - aggregateMetric.getHitCount() == 1 && aggregateMetric.getTopLevelCount() == 0 && aggregateMetric.getDuration() == 100 - }) + )) >> { AggregateEntry e -> + assert e.getHitCount() == 1 && e.getTopLevelCount() == 0 && e.getDuration() == 100 + } 2 * writer.finishBucket() >> { latch1.countDown(); latch2.countDown() } cleanup: @@ -359,7 +359,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { latchTriggered 1 * writer.startBucket(1, _, _) 1 * writer.add( - new MetricKey( + AggregateEntries.of( "resource", "service", "operation", @@ -373,9 +373,9 @@ class ConflatingMetricAggregatorTest extends DDSpecification { null, null, null - ), { AggregateMetric aggregateMetric -> - aggregateMetric.getHitCount() == 1 && aggregateMetric.getTopLevelCount() == 0 && aggregateMetric.getDuration() == 100 - }) + )) >> { AggregateEntry e -> + assert e.getHitCount() == 1 && e.getTopLevelCount() == 0 && e.getDuration() == 100 + } 1 * writer.finishBucket() >> { latch.countDown() } cleanup: @@ -411,7 +411,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { then: latchTriggered 1 * writer.startBucket(1, _, _) - 1 * writer.add(new MetricKey( + 1 * writer.add(AggregateEntries.of( "resource", "service", "operation", @@ -425,9 +425,9 @@ class ConflatingMetricAggregatorTest extends DDSpecification { null, null, null - ), { AggregateMetric value -> - value.getHitCount() == 1 && value.getTopLevelCount() == topLevelCount && value.getDuration() == 100 - }) + )) >> { AggregateEntry e -> + assert e.getHitCount() == 1 && e.getTopLevelCount() == topLevelCount && e.getDuration() == 100 + } 1 * writer.finishBucket() >> { latch.countDown() } cleanup: @@ -470,7 +470,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { latchTriggered 1 * writer.finishBucket() >> { latch.countDown() } 1 * writer.startBucket(2, _, SECONDS.toNanos(reportingInterval)) - 1 * writer.add(new MetricKey( + 1 * writer.add(AggregateEntries.of( "resource", "service", "operation", @@ -484,10 +484,10 @@ class ConflatingMetricAggregatorTest extends DDSpecification { null, null, null - ), { AggregateMetric value -> - value.getHitCount() == count && value.getDuration() == count * duration - }) - 1 * writer.add(new MetricKey( + )) >> { AggregateEntry e -> + assert e.getHitCount() == count && e.getDuration() == count * duration + } + 1 * writer.add(AggregateEntries.of( "resource2", "service2", "operation2", @@ -501,9 +501,9 @@ class ConflatingMetricAggregatorTest extends DDSpecification { null, null, null - ), { AggregateMetric value -> - value.getHitCount() == count && value.getDuration() == count * duration * 2 - }) + )) >> { AggregateEntry e -> + assert e.getHitCount() == count && e.getDuration() == count * duration * 2 + } cleanup: aggregator.close() @@ -541,7 +541,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { then: "should aggregate into single metric" latchTriggered 1 * writer.startBucket(1, _, _) - 1 * writer.add(new MetricKey( + 1 * writer.add(AggregateEntries.of( "resource", "service", "operation", @@ -555,9 +555,9 @@ class ConflatingMetricAggregatorTest extends DDSpecification { "GET", "/api/users/:id", null - ), { AggregateMetric value -> - value.getHitCount() == count && value.getDuration() == count * duration - }) + )) >> { AggregateEntry e -> + assert e.getHitCount() == count && e.getDuration() == count * duration + } 1 * writer.finishBucket() >> { latch.countDown() } when: "publish spans with different endpoints" @@ -582,7 +582,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { then: "should create separate metrics for each endpoint/method combination" latchTriggered2 1 * writer.startBucket(3, _, _) - 1 * writer.add(new MetricKey( + 1 * writer.add(AggregateEntries.of( "resource", "service", "operation", @@ -596,10 +596,10 @@ class ConflatingMetricAggregatorTest extends DDSpecification { "GET", "/api/users/:id", null - ), { AggregateMetric value -> - value.getHitCount() == 1 && value.getDuration() == duration - }) - 1 * writer.add(new MetricKey( + )) >> { AggregateEntry e -> + assert e.getHitCount() == 1 && e.getDuration() == duration + } + 1 * writer.add(AggregateEntries.of( "resource", "service", "operation", @@ -613,10 +613,10 @@ class ConflatingMetricAggregatorTest extends DDSpecification { "GET", "/api/orders/:id", null - ), { AggregateMetric value -> - value.getHitCount() == 1 && value.getDuration() == duration * 2 - }) - 1 * writer.add(new MetricKey( + )) >> { AggregateEntry e -> + assert e.getHitCount() == 1 && e.getDuration() == duration * 2 + } + 1 * writer.add(AggregateEntries.of( "resource", "service", "operation", @@ -630,9 +630,9 @@ class ConflatingMetricAggregatorTest extends DDSpecification { "POST", "/api/users/:id", null - ), { AggregateMetric value -> - value.getHitCount() == 1 && value.getDuration() == duration * 3 - }) + )) >> { AggregateEntry e -> + assert e.getHitCount() == 1 && e.getDuration() == duration * 3 + } 1 * writer.finishBucket() >> { latch2.countDown() } cleanup: @@ -680,7 +680,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { then: "should create 4 separate metrics" latchTriggered 1 * writer.startBucket(4, _, _) - 1 * writer.add(new MetricKey( + 1 * writer.add(AggregateEntries.of( "resource", "service", "operation", @@ -694,10 +694,10 @@ class ConflatingMetricAggregatorTest extends DDSpecification { "GET", "/api/users/:id", null - ), { AggregateMetric value -> - value.getHitCount() == 1 && value.getDuration() == duration - }) - 1 * writer.add(new MetricKey( + )) >> { AggregateEntry e -> + assert e.getHitCount() == 1 && e.getDuration() == duration + } + 1 * writer.add(AggregateEntries.of( "resource", "service", "operation", @@ -711,10 +711,10 @@ class ConflatingMetricAggregatorTest extends DDSpecification { "POST", "/api/users/:id", null - ), { AggregateMetric value -> - value.getHitCount() == 1 && value.getDuration() == duration * 2 - }) - 1 * writer.add(new MetricKey( + )) >> { AggregateEntry e -> + assert e.getHitCount() == 1 && e.getDuration() == duration * 2 + } + 1 * writer.add(AggregateEntries.of( "resource", "service", "operation", @@ -728,10 +728,10 @@ class ConflatingMetricAggregatorTest extends DDSpecification { "GET", "/api/users/:id", null - ), { AggregateMetric value -> - value.getHitCount() == 1 && value.getDuration() == duration * 3 - }) - 1 * writer.add(new MetricKey( + )) >> { AggregateEntry e -> + assert e.getHitCount() == 1 && e.getDuration() == duration * 3 + } + 1 * writer.add(AggregateEntries.of( "resource", "service", "operation", @@ -745,9 +745,9 @@ class ConflatingMetricAggregatorTest extends DDSpecification { "GET", "/api/orders/:id", null - ), { AggregateMetric value -> - value.getHitCount() == 1 && value.getDuration() == duration * 4 - }) + )) >> { AggregateEntry e -> + assert e.getHitCount() == 1 && e.getDuration() == duration * 4 + } 1 * writer.finishBucket() >> { latch.countDown() } cleanup: @@ -784,7 +784,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { then: "should create separate metric keys for spans with and without HTTP tags" latchTriggered 1 * writer.startBucket(2, _, _) - 1 * writer.add(new MetricKey( + 1 * writer.add(AggregateEntries.of( "resource", "service", "operation", @@ -798,10 +798,10 @@ class ConflatingMetricAggregatorTest extends DDSpecification { null, null, null - ), { AggregateMetric value -> - value.getHitCount() == 1 && value.getDuration() == duration - }) - 1 * writer.add(new MetricKey( + )) >> { AggregateEntry e -> + assert e.getHitCount() == 1 && e.getDuration() == duration + } + 1 * writer.add(AggregateEntries.of( "resource", "service", "operation", @@ -815,9 +815,9 @@ class ConflatingMetricAggregatorTest extends DDSpecification { "GET", "/api/users/:id", null - ), { AggregateMetric value -> - value.getHitCount() == 1 && value.getDuration() == duration * 2 - }) + )) >> { AggregateEntry e -> + assert e.getHitCount() == 1 && e.getDuration() == duration * 2 + } 1 * writer.finishBucket() >> { latch.countDown() } cleanup: @@ -852,7 +852,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { then: "should create the different metric keys for spans with and without sources" latchTriggered 1 * writer.startBucket(2, _, _) - 1 * writer.add(new MetricKey( + 1 * writer.add(AggregateEntries.of( "resource", "service", "operation", @@ -866,10 +866,10 @@ class ConflatingMetricAggregatorTest extends DDSpecification { null, null, null - ), { AggregateMetric value -> - value.getHitCount() == 2 && value.getDuration() == 2 * duration - }) - 1 * writer.add(new MetricKey( + )) >> { AggregateEntry e -> + assert e.getHitCount() == 2 && e.getDuration() == 2 * duration + } + 1 * writer.add(AggregateEntries.of( "resource", "service", "operation", @@ -883,16 +883,19 @@ class ConflatingMetricAggregatorTest extends DDSpecification { null, null, null - ), { AggregateMetric value -> - value.getHitCount() == 1 && value.getDuration() == duration - }) + )) >> { AggregateEntry e -> + assert e.getHitCount() == 1 && e.getDuration() == duration + } 1 * writer.finishBucket() >> { latch.countDown() } cleanup: aggregator.close() } - def "test least recently written to aggregate flushed when size limit exceeded"() { + def "new aggregates beyond size limit are dropped when no stale entries can be evicted"() { + // The table only evicts entries with hitCount == 0 to make room. When all entries are live + // (all have been recorded against), an over-cap insert drops the new key rather than evicting + // an established one. This protects the data we've already collected from a burst of new keys. setup: int maxAggregates = 10 MetricWriter writer = Mock(MetricWriter) @@ -916,11 +919,11 @@ class ConflatingMetricAggregatorTest extends DDSpecification { aggregator.report() def latchTriggered = latch.await(2, SECONDS) - then: "the first aggregate should be dropped but the rest reported" + then: "the established service0..service9 are reported; service10 is dropped" latchTriggered 1 * writer.startBucket(10, _, SECONDS.toNanos(reportingInterval)) - for (int i = 1; i < 11; ++i) { - 1 * writer.add(new MetricKey( + for (int i = 0; i < 10; ++i) { + 1 * writer.add(AggregateEntries.of( "resource", "service" + i, "operation", @@ -934,13 +937,13 @@ class ConflatingMetricAggregatorTest extends DDSpecification { null, null, null - ), _) >> { MetricKey key, AggregateMetric value -> - value.getHitCount() == 1 && value.getDuration() == duration + )) >> { AggregateEntry e -> + assert e.getHitCount() == 1 && e.getDuration() == duration } } - 0 * writer.add(new MetricKey( + 0 * writer.add(AggregateEntries.of( "resource", - "service0", + "service10", "operation", null, "type", @@ -952,7 +955,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { null, null, null - ), _) + )) 1 * writer.finishBucket() >> { latch.countDown() } cleanup: @@ -1067,7 +1070,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { latchTriggered 1 * writer.startBucket(5, _, SECONDS.toNanos(reportingInterval)) for (int i = 0; i < 5; ++i) { - 1 * writer.add(new MetricKey( + 1 * writer.add(AggregateEntries.of( "resource", "service" + i, "operation", @@ -1081,9 +1084,9 @@ class ConflatingMetricAggregatorTest extends DDSpecification { null, null, null - ), { AggregateMetric value -> - value.getHitCount() == 1 && value.getDuration() == duration - }) + )) >> { AggregateEntry e -> + assert e.getHitCount() == 1 && e.getDuration() == duration + } } 1 * writer.finishBucket() >> { latch.countDown() } @@ -1102,7 +1105,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { latchTriggered 1 * writer.startBucket(4, _, SECONDS.toNanos(reportingInterval)) for (int i = 1; i < 5; ++i) { - 1 * writer.add(new MetricKey( + 1 * writer.add(AggregateEntries.of( "resource", "service" + i, "operation", @@ -1116,11 +1119,11 @@ class ConflatingMetricAggregatorTest extends DDSpecification { null, null, null - ), { AggregateMetric value -> - value.getHitCount() == 1 && value.getDuration() == duration - }) + )) >> { AggregateEntry e -> + assert e.getHitCount() == 1 && e.getDuration() == duration + } } - 0 * writer.add(new MetricKey( + 0 * writer.add(AggregateEntries.of( "resource", "service0", "operation", @@ -1134,7 +1137,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { null, null, null - ), _) + )) 1 * writer.finishBucket() >> { latch.countDown() } cleanup: @@ -1169,7 +1172,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { latchTriggered 1 * writer.startBucket(5, _, SECONDS.toNanos(reportingInterval)) for (int i = 0; i < 5; ++i) { - 1 * writer.add(new MetricKey( + 1 * writer.add(AggregateEntries.of( "resource", "service" + i, "operation", @@ -1183,9 +1186,9 @@ class ConflatingMetricAggregatorTest extends DDSpecification { null, null, null - ), { AggregateMetric value -> - value.getHitCount() == 1 && value.getDuration() == duration - }) + )) >> { AggregateEntry e -> + assert e.getHitCount() == 1 && e.getDuration() == duration + } } 1 * writer.finishBucket() >> { latch.countDown() } @@ -1195,7 +1198,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { then: "aggregate not updated in cycle is not reported" 0 * writer.finishBucket() 0 * writer.startBucket(_, _, _) - 0 * writer.add(_, _) + 0 * writer.add(_) cleanup: aggregator.close() @@ -1228,7 +1231,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { latchTriggered 1 * writer.startBucket(5, _, SECONDS.toNanos(1)) for (int i = 0; i < 5; ++i) { - 1 * writer.add(new MetricKey( + 1 * writer.add(AggregateEntries.of( "resource", "service" + i, "operation", @@ -1242,9 +1245,9 @@ class ConflatingMetricAggregatorTest extends DDSpecification { null, null, null - ), { AggregateMetric value -> - value.getHitCount() == 1 && value.getDuration() == duration - }) + )) >> { AggregateEntry e -> + assert e.getHitCount() == 1 && e.getDuration() == duration + } } 1 * writer.finishBucket() >> { latch.countDown() } @@ -1395,7 +1398,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { latchTriggered 1 * writer.startBucket(1, _, _) 1 * writer.add( - new MetricKey( + AggregateEntries.of( "resource", "service", "operation", @@ -1409,9 +1412,9 @@ class ConflatingMetricAggregatorTest extends DDSpecification { null, null, null - ), { AggregateMetric aggregateMetric -> - aggregateMetric.getHitCount() == 1 && aggregateMetric.getTopLevelCount() == 1 && aggregateMetric.getDuration() == 100 - }) + )) >> { AggregateEntry e -> + assert e.getHitCount() == 1 && e.getTopLevelCount() == 1 && e.getDuration() == 100 + } 1 * writer.finishBucket() >> { latch.countDown() } cleanup: @@ -1450,7 +1453,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { latchTriggered 1 * writer.startBucket(1, _, _) 1 * writer.add( - new MetricKey( + AggregateEntries.of( "resource", "service", "operation", @@ -1464,9 +1467,9 @@ class ConflatingMetricAggregatorTest extends DDSpecification { null, null, null - ), { AggregateMetric aggregateMetric -> - aggregateMetric.getHitCount() == 3 && aggregateMetric.getTopLevelCount() == 3 && aggregateMetric.getDuration() == 450 - }) + )) >> { AggregateEntry e -> + assert e.getHitCount() == 3 && e.getTopLevelCount() == 3 && e.getDuration() == 450 + } 1 * writer.finishBucket() >> { latch.countDown() } cleanup: @@ -1505,7 +1508,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { latchTriggered 1 * writer.startBucket(3, _, _) 1 * writer.add( - new MetricKey( + AggregateEntries.of( "resource", "service", "operation", @@ -1519,11 +1522,11 @@ class ConflatingMetricAggregatorTest extends DDSpecification { "GET", "/api/users/:id", null - ), { AggregateMetric aggregateMetric -> - aggregateMetric.getHitCount() == 1 && aggregateMetric.getTopLevelCount() == 1 && aggregateMetric.getDuration() == 100 - }) + )) >> { AggregateEntry e -> + assert e.getHitCount() == 1 && e.getTopLevelCount() == 1 && e.getDuration() == 100 + } 1 * writer.add( - new MetricKey( + AggregateEntries.of( "resource", "service", "operation", @@ -1537,11 +1540,11 @@ class ConflatingMetricAggregatorTest extends DDSpecification { "POST", "/api/orders", null - ), { AggregateMetric aggregateMetric -> - aggregateMetric.getHitCount() == 1 && aggregateMetric.getTopLevelCount() == 1 && aggregateMetric.getDuration() == 200 - }) + )) >> { AggregateEntry e -> + assert e.getHitCount() == 1 && e.getTopLevelCount() == 1 && e.getDuration() == 200 + } 1 * writer.add( - new MetricKey( + AggregateEntries.of( "resource", "service", "operation", @@ -1555,9 +1558,9 @@ class ConflatingMetricAggregatorTest extends DDSpecification { null, null, null - ), { AggregateMetric aggregateMetric -> - aggregateMetric.getHitCount() == 1 && aggregateMetric.getTopLevelCount() == 1 && aggregateMetric.getDuration() == 150 - }) + )) >> { AggregateEntry e -> + assert e.getHitCount() == 1 && e.getTopLevelCount() == 1 && e.getDuration() == 150 + } 1 * writer.finishBucket() >> { latch.countDown() } cleanup: @@ -1593,7 +1596,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { then: latchTriggered 1 * writer.startBucket(3, _, _) - 1 * writer.add(new MetricKey( + 1 * writer.add(AggregateEntries.of( "grpc.service/Method", "service", "grpc.server", @@ -1607,8 +1610,8 @@ class ConflatingMetricAggregatorTest extends DDSpecification { null, null, "0" - ), _) - 1 * writer.add(new MetricKey( + )) + 1 * writer.add(AggregateEntries.of( "grpc.service/Method", "service", "grpc.server", @@ -1622,8 +1625,8 @@ class ConflatingMetricAggregatorTest extends DDSpecification { null, null, "5" - ), _) - 1 * writer.add(new MetricKey( + )) + 1 * writer.add(AggregateEntries.of( "GET /api", "service", "http.request", @@ -1637,7 +1640,7 @@ class ConflatingMetricAggregatorTest extends DDSpecification { null, null, null - ), _) + )) 1 * writer.finishBucket() >> { latch.countDown() } cleanup: diff --git a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/SerializingMetricWriterTest.groovy b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/SerializingMetricWriterTest.groovy index 3ff81de9851..03605dc5273 100644 --- a/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/SerializingMetricWriterTest.groovy +++ b/dd-trace-core/src/test/groovy/datadog/trace/common/metrics/SerializingMetricWriterTest.groovy @@ -7,7 +7,6 @@ import static java.util.concurrent.TimeUnit.SECONDS import datadog.metrics.api.Histograms import datadog.metrics.impl.DDSketchHistograms import datadog.trace.api.Config -import datadog.trace.api.Pair import datadog.trace.api.ProcessTags import datadog.trace.api.WellKnownTags import datadog.trace.api.git.CommitInfo @@ -16,7 +15,6 @@ import datadog.trace.api.git.GitInfoProvider import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString import datadog.trace.test.util.DDSpecification import java.nio.ByteBuffer -import java.util.concurrent.atomic.AtomicLongArray import org.msgpack.core.MessagePack import org.msgpack.core.MessageUnpacker @@ -26,6 +24,30 @@ class SerializingMetricWriterTest extends DDSpecification { Histograms.register(DDSketchHistograms.FACTORY) } + /** Build an {@link AggregateEntry} with a pre-recorded duration count. */ + private static AggregateEntry entry( + CharSequence resource, + CharSequence service, + CharSequence operationName, + CharSequence serviceSource, + CharSequence type, + int httpStatusCode, + boolean synthetic, + boolean traceRoot, + CharSequence spanKind, + List peerTags, + CharSequence httpMethod, + CharSequence httpEndpoint, + CharSequence grpcStatusCode, + int hitCount) { + AggregateEntry e = AggregateEntries.of( + resource, service, operationName, serviceSource, type, + httpStatusCode, synthetic, traceRoot, spanKind, peerTags, + httpMethod, httpEndpoint, grpcStatusCode) + hitCount.times { e.recordOneDuration(1L) } + return e + } + def "should produce correct message #iterationIndex with process tags enabled #withProcessTags" () { setup: if (!withProcessTags) { @@ -40,8 +62,8 @@ class SerializingMetricWriterTest extends DDSpecification { when: writer.startBucket(content.size(), startTime, duration) - for (Pair pair : content) { - writer.add(pair.getLeft(), pair.getRight()) + for (AggregateEntry e : content) { + writer.add(e) } writer.finishBucket() @@ -55,88 +77,40 @@ class SerializingMetricWriterTest extends DDSpecification { where: content << [ [ - Pair.of( - new MetricKey( - "resource1", - "service1", - "operation1", - null, - "type", - 0, - false, - false, - "client", + entry( + "resource1", "service1", "operation1", null, "type", 0, + false, false, "client", [ UTF8BytesString.create("country:canada"), UTF8BytesString.create("georegion:amer"), UTF8BytesString.create("peer.service:remote-service") ], - null, - null, - null - ), - new AggregateMetric().recordDurations(10, new AtomicLongArray(1L)) - ), - Pair.of( - new MetricKey( - "resource2", - "service2", - "operation2", - null, - "type2", - 200, - true, - false, - "producer", + null, null, null, + 10), + entry( + "resource2", "service2", "operation2", null, "type2", 200, + true, false, "producer", [ UTF8BytesString.create("country:canada"), UTF8BytesString.create("georegion:amer"), UTF8BytesString.create("peer.service:remote-service") ], - null, - null, - null - ), - new AggregateMetric().recordDurations(9, new AtomicLongArray(1L)) - ), - Pair.of( - new MetricKey( - "GET /api/users/:id", - "web-service", - "http.request", - null, - "web", - 200, - false, - true, - "server", + null, null, null, + 9), + entry( + "GET /api/users/:id", "web-service", "http.request", null, "web", 200, + false, true, "server", [], - "GET", - "/api/users/:id", - null - ), - new AggregateMetric().recordDurations(5, new AtomicLongArray(1L)) - ) + null, null, null, + 5) ], (0..10000).collect({ i -> - Pair.of( - new MetricKey( - "resource" + i, - "service" + i, - "operation" + i, - null, - "type", - 0, - false, - false, - "producer", + entry( + "resource" + i, "service" + i, "operation" + i, null, "type", 0, + false, false, "producer", [UTF8BytesString.create("messaging.destination:dest" + i)], - null, - null, - null - ), - new AggregateMetric().recordDurations(10, new AtomicLongArray(1L)) - ) + null, null, null, + 10) }) ] withProcessTags << [true, false] @@ -148,22 +122,18 @@ class SerializingMetricWriterTest extends DDSpecification { long duration = SECONDS.toNanos(10) WellKnownTags wellKnownTags = new WellKnownTags("runtimeid", "hostname", "env", "service", "version", "language") - // Create keys with different combinations of HTTP fields - def keyWithNoSource = new MetricKey("resource", "service", "operation", null, "type", 200, false, false, "server", [], "GET", "/api/users", null) - def keyWithSource = new MetricKey("resource", "service", "operation", "source", "type", 200, false, false, "server", [], "POST", null, null) + def entryNoSource = entry("resource", "service", "operation", null, "type", 200, false, false, "server", [], "GET", "/api/users", null, 1) + def entryWithSource = entry("resource", "service", "operation", "source", "type", 200, false, false, "server", [], "POST", null, null, 1) - def content = [ - Pair.of(keyWithNoSource, new AggregateMetric().recordDurations(1, new AtomicLongArray(1L))), - Pair.of(keyWithSource, new AggregateMetric().recordDurations(1, new AtomicLongArray(1L))), - ] + def content = [entryNoSource, entryWithSource] ValidatingSink sink = new ValidatingSink(wellKnownTags, startTime, duration, content) SerializingMetricWriter writer = new SerializingMetricWriter(wellKnownTags, sink, 128) when: writer.startBucket(content.size(), startTime, duration) - for (Pair pair : content) { - writer.add(pair.getLeft(), pair.getRight()) + for (AggregateEntry e : content) { + writer.add(e) } writer.finishBucket() @@ -177,34 +147,25 @@ class SerializingMetricWriterTest extends DDSpecification { long duration = SECONDS.toNanos(10) WellKnownTags wellKnownTags = new WellKnownTags("runtimeid", "hostname", "env", "service", "version", "language") - // Create keys with different combinations of HTTP fields - def keyWithBoth = new MetricKey("resource", "service", "operation", null, "type", 200, false, false, "server", [], "GET", "/api/users", null) - def keyWithMethodOnly = new MetricKey("resource", "service", "operation", null, "type", 200, false, false, "server", [], "POST", null,null) - def keyWithEndpointOnly = new MetricKey("resource", "service", "operation", null, "type", 200, false, false, "server", [], null, "/api/orders",null) - def keyWithNeither = new MetricKey("resource", "service", "operation", null, "type", 200, false, false, "client", [], null, null, null) - - def content = [ - Pair.of(keyWithBoth, new AggregateMetric().recordDurations(1, new AtomicLongArray(1L))), - Pair.of(keyWithMethodOnly, new AggregateMetric().recordDurations(1, new AtomicLongArray(1L))), - Pair.of(keyWithEndpointOnly, new AggregateMetric().recordDurations(1, new AtomicLongArray(1L))), - Pair.of(keyWithNeither, new AggregateMetric().recordDurations(1, new AtomicLongArray(1L))) - ] + def entryWithBoth = entry("resource", "service", "operation", null, "type", 200, false, false, "server", [], "GET", "/api/users", null, 1) + def entryWithMethodOnly = entry("resource", "service", "operation", null, "type", 200, false, false, "server", [], "POST", null, null, 1) + def entryWithEndpointOnly = entry("resource", "service", "operation", null, "type", 200, false, false, "server", [], null, "/api/orders", null, 1) + def entryWithNeither = entry("resource", "service", "operation", null, "type", 200, false, false, "client", [], null, null, null, 1) + + def content = [entryWithBoth, entryWithMethodOnly, entryWithEndpointOnly, entryWithNeither] ValidatingSink sink = new ValidatingSink(wellKnownTags, startTime, duration, content) SerializingMetricWriter writer = new SerializingMetricWriter(wellKnownTags, sink, 128) when: writer.startBucket(content.size(), startTime, duration) - for (Pair pair : content) { - writer.add(pair.getLeft(), pair.getRight()) + for (AggregateEntry e : content) { + writer.add(e) } writer.finishBucket() then: sink.validatedInput() - // Test passes if validation in ValidatingSink succeeds - // ValidatingSink verifies that map size matches actual number of fields - // and that HTTPMethod/HTTPEndpoint are only present when non-empty } def "add git sha commit info when sha commit is #shaCommit"() { @@ -216,40 +177,63 @@ class SerializingMetricWriterTest extends DDSpecification { long duration = SECONDS.toNanos(10) WellKnownTags wellKnownTags = new WellKnownTags("runtimeid", "hostname", "env", "service", "version", "language") - // Create keys with different combinations of HTTP fields - def key = new MetricKey("resource", "service", "operation", null, "type", 200, false, false, "server", [], "GET", "/api/users", null) + def e = entry("resource", "service", "operation", null, "type", 200, false, false, "server", [], "GET", "/api/users", null, 1) - def content = [Pair.of(key, new AggregateMetric().recordDurations(1, new AtomicLongArray(1L))),] + def content = [e] ValidatingSink sink = new ValidatingSink(wellKnownTags, startTime, duration, content) SerializingMetricWriter writer = new SerializingMetricWriter(wellKnownTags, sink, 128, gitInfoProvider) when: - writer.startBucket(content.size(), startTime, duration) - for (Pair pair : content) { - writer.add(pair.getLeft(), pair.getRight()) + for (AggregateEntry entryItem : content) { + writer.add(entryItem) } writer.finishBucket() then: - sink.validatedInput() where: shaCommit << [null, "123456"] } + def "GRPCStatusCode field is present in payload for rpc-type spans"() { + setup: + long startTime = MILLISECONDS.toNanos(System.currentTimeMillis()) + long duration = SECONDS.toNanos(10) + WellKnownTags wellKnownTags = new WellKnownTags("runtimeid", "hostname", "env", "service", "version", "language") + + def entryWithGrpc = entry("grpc.service/Method", "grpc-service", "grpc.server", null, "rpc", 0, false, false, "server", [], null, null, "OK", 1) + def entryWithGrpcError = entry("grpc.service/Method", "grpc-service", "grpc.server", null, "rpc", 0, false, false, "client", [], null, null, "NOT_FOUND", 1) + def entryWithoutGrpc = entry("resource", "service", "operation", null, "web", 200, false, false, "server", [], null, null, null, 1) + + def content = [entryWithGrpc, entryWithGrpcError, entryWithoutGrpc] + + ValidatingSink sink = new ValidatingSink(wellKnownTags, startTime, duration, content) + SerializingMetricWriter writer = new SerializingMetricWriter(wellKnownTags, sink, 128) + + when: + writer.startBucket(content.size(), startTime, duration) + for (AggregateEntry e : content) { + writer.add(e) + } + writer.finishBucket() + + then: + sink.validatedInput() + } + static class ValidatingSink implements Sink { private final WellKnownTags wellKnownTags private final long startTimeNanos private final long duration private boolean validated = false - private List> content + private List content ValidatingSink(WellKnownTags wellKnownTags, long startTimeNanos, long duration, - List> content) { + List content) { this.wellKnownTags = wellKnownTags this.startTimeNanos = startTimeNanos this.duration = duration @@ -298,83 +282,81 @@ class SerializingMetricWriterTest extends DDSpecification { assert unpacker.unpackString() == "Stats" int statCount = unpacker.unpackArrayHeader() assert statCount == content.size() - for (Pair pair : content) { - MetricKey key = pair.getLeft() - AggregateMetric value = pair.getRight() + for (AggregateEntry entry : content) { int metricMapSize = unpacker.unpackMapHeader() // Calculate expected map size based on optional fields - boolean hasHttpMethod = key.getHttpMethod() != null - boolean hasHttpEndpoint = key.getHttpEndpoint() != null - boolean hasServiceSource = key.getServiceSource() != null - boolean hasGrpcStatusCode = key.getGrpcStatusCode() != null + boolean hasHttpMethod = entry.getHttpMethod() != null + boolean hasHttpEndpoint = entry.getHttpEndpoint() != null + boolean hasServiceSource = entry.getServiceSource() != null + boolean hasGrpcStatusCode = entry.getGrpcStatusCode() != null int expectedMapSize = 15 + (hasServiceSource ? 1 : 0) + (hasHttpMethod ? 1 : 0) + (hasHttpEndpoint ? 1 : 0) + (hasGrpcStatusCode ? 1 : 0) assert metricMapSize == expectedMapSize int elementCount = 0 assert unpacker.unpackString() == "Name" - assert unpacker.unpackString() == key.getOperationName() as String + assert unpacker.unpackString() == entry.getOperationName() as String ++elementCount assert unpacker.unpackString() == "Service" - assert unpacker.unpackString() == key.getService() as String + assert unpacker.unpackString() == entry.getService() as String ++elementCount assert unpacker.unpackString() == "Resource" - assert unpacker.unpackString() == key.getResource() as String + assert unpacker.unpackString() == entry.getResource() as String ++elementCount assert unpacker.unpackString() == "Type" - assert unpacker.unpackString() == key.getType() as String + assert unpacker.unpackString() == entry.getType() as String ++elementCount assert unpacker.unpackString() == "HTTPStatusCode" - assert unpacker.unpackInt() == key.getHttpStatusCode() + assert unpacker.unpackInt() == entry.getHttpStatusCode() ++elementCount assert unpacker.unpackString() == "Synthetics" - assert unpacker.unpackBoolean() == key.isSynthetics() + assert unpacker.unpackBoolean() == entry.isSynthetics() ++elementCount assert unpacker.unpackString() == "IsTraceRoot" - assert unpacker.unpackInt() == (key.isTraceRoot() ? TriState.TRUE.serialValue : TriState.FALSE.serialValue) + assert unpacker.unpackInt() == (entry.isTraceRoot() ? TriState.TRUE.serialValue : TriState.FALSE.serialValue) ++elementCount assert unpacker.unpackString() == "SpanKind" - assert unpacker.unpackString() == key.getSpanKind() as String + assert unpacker.unpackString() == entry.getSpanKind() as String ++elementCount assert unpacker.unpackString() == "PeerTags" int peerTagsLength = unpacker.unpackArrayHeader() - assert peerTagsLength == key.getPeerTags().size() + assert peerTagsLength == entry.getPeerTags().size() for (int i = 0; i < peerTagsLength; i++) { def unpackedPeerTag = unpacker.unpackString() - assert unpackedPeerTag == key.getPeerTags()[i].toString() + assert unpackedPeerTag == entry.getPeerTags()[i].toString() } ++elementCount // Service source is only present when the service name has been overridden by the tracer if (hasServiceSource) { assert unpacker.unpackString() == "srv_src" - assert unpacker.unpackString() == key.getServiceSource().toString() + assert unpacker.unpackString() == entry.getServiceSource().toString() ++elementCount } // HTTPMethod and HTTPEndpoint are optional - only present if non-null if (hasHttpMethod) { assert unpacker.unpackString() == "HTTPMethod" - assert unpacker.unpackString() == key.getHttpMethod() as String + assert unpacker.unpackString() == entry.getHttpMethod() as String ++elementCount } if (hasHttpEndpoint) { assert unpacker.unpackString() == "HTTPEndpoint" - assert unpacker.unpackString() == key.getHttpEndpoint() as String + assert unpacker.unpackString() == entry.getHttpEndpoint() as String ++elementCount } if (hasGrpcStatusCode) { assert unpacker.unpackString() == "GRPCStatusCode" - assert unpacker.unpackString() == key.getGrpcStatusCode() as String + assert unpacker.unpackString() == entry.getGrpcStatusCode() as String ++elementCount } assert unpacker.unpackString() == "Hits" - assert unpacker.unpackInt() == value.getHitCount() + assert unpacker.unpackInt() == entry.getHitCount() ++elementCount assert unpacker.unpackString() == "Errors" - assert unpacker.unpackInt() == value.getErrorCount() + assert unpacker.unpackInt() == entry.getErrorCount() ++elementCount assert unpacker.unpackString() == "TopLevelHits" - assert unpacker.unpackInt() == value.getTopLevelCount() + assert unpacker.unpackInt() == entry.getTopLevelCount() ++elementCount assert unpacker.unpackString() == "Duration" - assert unpacker.unpackLong() == value.getDuration() + assert unpacker.unpackLong() == entry.getDuration() ++elementCount assert unpacker.unpackString() == "OkSummary" validateSketch(unpacker) @@ -397,99 +379,4 @@ class SerializingMetricWriterTest extends DDSpecification { return validated } } - - def "ServiceSource optional in the payload"() { - setup: - long startTime = MILLISECONDS.toNanos(System.currentTimeMillis()) - long duration = SECONDS.toNanos(10) - WellKnownTags wellKnownTags = new WellKnownTags("runtimeid", "hostname", "env", "service", "version", "language") - - // Create keys with different combinations of HTTP fields - def keyWithNoSource = new MetricKey("resource", "service", "operation", null, "type", 200, false, false, "server", [], "GET", "/api/users", null) - def keyWithSource = new MetricKey("resource", "service", "operation", "source", "type", 200, false, false, "server", [], "POST", null, null) - - def content = [ - Pair.of(keyWithNoSource, new AggregateMetric().recordDurations(1, new AtomicLongArray(1L))), - Pair.of(keyWithSource, new AggregateMetric().recordDurations(1, new AtomicLongArray(1L))), - ] - - ValidatingSink sink = new ValidatingSink(wellKnownTags, startTime, duration, content) - SerializingMetricWriter writer = new SerializingMetricWriter(wellKnownTags, sink, 128) - - when: - writer.startBucket(content.size(), startTime, duration) - for (Pair pair : content) { - writer.add(pair.getLeft(), pair.getRight()) - } - writer.finishBucket() - - then: - sink.validatedInput() - } - - def "GRPCStatusCode field is present in payload for rpc-type spans"() { - setup: - long startTime = MILLISECONDS.toNanos(System.currentTimeMillis()) - long duration = SECONDS.toNanos(10) - WellKnownTags wellKnownTags = new WellKnownTags("runtimeid", "hostname", "env", "service", "version", "language") - - def keyWithGrpc = new MetricKey("grpc.service/Method", "grpc-service", "grpc.server", null, "rpc", 0, false, false, "server", [], null, null, "OK") - def keyWithGrpcError = new MetricKey("grpc.service/Method", "grpc-service", "grpc.server", null, "rpc", 0, false, false, "client", [], null, null, "NOT_FOUND") - def keyWithoutGrpc = new MetricKey("resource", "service", "operation", null, "web", 200, false, false, "server", [], null, null, null) - - def content = [ - Pair.of(keyWithGrpc, new AggregateMetric().recordDurations(1, new AtomicLongArray(1L))), - Pair.of(keyWithGrpcError, new AggregateMetric().recordDurations(1, new AtomicLongArray(1L))), - Pair.of(keyWithoutGrpc, new AggregateMetric().recordDurations(1, new AtomicLongArray(1L))) - ] - - ValidatingSink sink = new ValidatingSink(wellKnownTags, startTime, duration, content) - SerializingMetricWriter writer = new SerializingMetricWriter(wellKnownTags, sink, 128) - - when: - writer.startBucket(content.size(), startTime, duration) - for (Pair pair : content) { - writer.add(pair.getLeft(), pair.getRight()) - } - writer.finishBucket() - - then: - sink.validatedInput() - } - - def "HTTPMethod and HTTPEndpoint fields are optional in payload"() { - setup: - long startTime = MILLISECONDS.toNanos(System.currentTimeMillis()) - long duration = SECONDS.toNanos(10) - WellKnownTags wellKnownTags = new WellKnownTags("runtimeid", "hostname", "env", "service", "version", "language") - - // Create keys with different combinations of HTTP fields - def keyWithBoth = new MetricKey("resource", "service", "operation", null, "type", 200, false, false, "server", [], "GET", "/api/users", null) - def keyWithMethodOnly = new MetricKey("resource", "service", "operation", null, "type", 200, false, false, "server", [], "POST", null, null) - def keyWithEndpointOnly = new MetricKey("resource", "service", "operation", null, "type", 200, false, false, "server", [], null, "/api/orders", null) - def keyWithNeither = new MetricKey("resource", "service", "operation", null, "type", 200, false, false, "client", [], null, null, null) - - def content = [ - Pair.of(keyWithBoth, new AggregateMetric().recordDurations(1, new AtomicLongArray(1L))), - Pair.of(keyWithMethodOnly, new AggregateMetric().recordDurations(1, new AtomicLongArray(1L))), - Pair.of(keyWithEndpointOnly, new AggregateMetric().recordDurations(1, new AtomicLongArray(1L))), - Pair.of(keyWithNeither, new AggregateMetric().recordDurations(1, new AtomicLongArray(1L))) - ] - - ValidatingSink sink = new ValidatingSink(wellKnownTags, startTime, duration, content) - SerializingMetricWriter writer = new SerializingMetricWriter(wellKnownTags, sink, 128) - - when: - writer.startBucket(content.size(), startTime, duration) - for (Pair pair : content) { - writer.add(pair.getLeft(), pair.getRight()) - } - writer.finishBucket() - - then: - sink.validatedInput() - // Test passes if validation in ValidatingSink succeeds - // ValidatingSink verifies that map size matches actual number of fields - // and that HTTPMethod/HTTPEndpoint are only present when non-empty - } } diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntries.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntries.java new file mode 100644 index 00000000000..1208d88402a --- /dev/null +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntries.java @@ -0,0 +1,76 @@ +package datadog.trace.common.metrics; + +import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString; +import java.util.List; +import javax.annotation.Nullable; + +/** + * Test-side factories for {@link AggregateEntry}. Lives in {@code src/test} so the production class + * stays free of test-only API; same {@code datadog.trace.common.metrics} package so this helper can + * reach {@link AggregateEntry#forSnapshot(SpanSnapshot)} and the package-private {@link + * SpanSnapshot} constructor. + */ +public final class AggregateEntries { + private AggregateEntries() {} + + /** + * Builds an {@link AggregateEntry} from the same positional shape the prior {@code new + * MetricKey(...)} took. Accepts a pre-encoded {@code List} of {@code + * "name:value"} peer tags and recovers the parallel-array {@code (names, values)} form by + * splitting on the {@code ':'} delimiter. + * + *

Test-only. The split is at the first {@code ':'}, so peer-tag values + * containing a colon (URLs, IPv6 addresses, {@code service:env} patterns) will be silently + * misparsed and the recovered (name, value) pair will be wrong. Keep test data colon-free in + * peer-tag values, or wire a production-style snapshot through {@link + * AggregateEntry#forSnapshot(SpanSnapshot)} directly instead. + */ + public static AggregateEntry of( + CharSequence resource, + CharSequence service, + CharSequence operationName, + @Nullable CharSequence serviceSource, + CharSequence type, + int httpStatusCode, + boolean synthetic, + boolean traceRoot, + CharSequence spanKind, + @Nullable List peerTags, + @Nullable CharSequence httpMethod, + @Nullable CharSequence httpEndpoint, + @Nullable CharSequence grpcStatusCode) { + PeerTagSchema schema = null; + String[] values = null; + if (peerTags != null && !peerTags.isEmpty()) { + String[] names = new String[peerTags.size()]; + values = new String[peerTags.size()]; + int i = 0; + for (UTF8BytesString t : peerTags) { + String s = t.toString(); + int colon = s.indexOf(':'); + names[i] = colon < 0 ? s : s.substring(0, colon); + values[i] = colon < 0 ? "" : s.substring(colon + 1); + i++; + } + schema = PeerTagSchema.testSchema(names); + } + SpanSnapshot syntheticSnapshot = + new SpanSnapshot( + resource, + service == null ? null : service.toString(), + operationName, + serviceSource, + type, + (short) httpStatusCode, + synthetic, + traceRoot, + spanKind == null ? null : spanKind.toString(), + schema, + values, + httpMethod == null ? null : httpMethod.toString(), + httpEndpoint == null ? null : httpEndpoint.toString(), + grpcStatusCode == null ? null : grpcStatusCode.toString(), + 0L); + return AggregateEntry.forSnapshot(syntheticSnapshot); + } +} diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTest.java new file mode 100644 index 00000000000..42f2a15610e --- /dev/null +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateEntryTest.java @@ -0,0 +1,165 @@ +package datadog.trace.common.metrics; + +import static datadog.trace.common.metrics.AggregateEntry.ERROR_TAG; +import static datadog.trace.common.metrics.AggregateEntry.TOP_LEVEL_TAG; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import datadog.metrics.agent.AgentMeter; +import datadog.metrics.api.statsd.StatsDClient; +import datadog.metrics.impl.DDSketchHistograms; +import datadog.metrics.impl.MonitoringImpl; +import java.util.concurrent.TimeUnit; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +class AggregateEntryTest { + + @BeforeAll + static void initAgentMeter() { + // recordOneDuration -> Histogram.accept needs AgentMeter to be initialized. + MonitoringImpl monitoring = new MonitoringImpl(StatsDClient.NO_OP, 1, TimeUnit.SECONDS); + AgentMeter.registerIfAbsent(StatsDClient.NO_OP, monitoring, DDSketchHistograms.FACTORY); + monitoring.newTimer("test.init"); + } + + @Test + void recordOneDurationSumsToTotal() { + AggregateEntry entry = newEntry(); + entry.recordOneDuration(1L); + entry.recordOneDuration(2L); + entry.recordOneDuration(3L); + assertEquals(6, entry.getDuration()); + } + + @Test + void clearResetsAllCounters() { + AggregateEntry entry = newEntry(); + entry.recordOneDuration(5L); + entry.recordOneDuration(ERROR_TAG | 6L); + entry.recordOneDuration(TOP_LEVEL_TAG | 7L); + entry.clear(); + assertEquals(0, entry.getDuration()); + assertEquals(0, entry.getErrorCount()); + assertEquals(0, entry.getTopLevelCount()); + assertEquals(0, entry.getHitCount()); + } + + @Test + void recordOneDurationAccumulatesOkErrorAndTopLevel() { + AggregateEntry entry = newEntry(); + entry.recordOneDuration(10L); + entry.recordOneDuration(10L | TOP_LEVEL_TAG); + entry.recordOneDuration(10L | ERROR_TAG); + + assertEquals(3, entry.getHitCount()); + assertEquals(30, entry.getDuration()); + assertEquals(1, entry.getErrorCount()); + assertEquals(1, entry.getTopLevelCount()); + } + + @Test + void hitCountIncludesErrors() { + AggregateEntry entry = newEntry(); + entry.recordOneDuration(1L); + entry.recordOneDuration(2L); + entry.recordOneDuration(3L | ERROR_TAG); + assertEquals(3, entry.getHitCount()); + assertEquals(1, entry.getErrorCount()); + } + + @Test + void okAndErrorLatenciesTrackedSeparately() { + AggregateEntry entry = newEntry(); + long[] durations = { + 1L, 100L | ERROR_TAG, 2L, 99L | ERROR_TAG, 3L, 98L | ERROR_TAG, 4L, 97L | ERROR_TAG + }; + for (long d : durations) { + entry.recordOneDuration(d); + } + assertTrue(entry.getErrorLatencies().getMaxValue() >= 99); + assertTrue(entry.getOkLatencies().getMaxValue() <= 5); + } + + @Test + void equalsConsistentWithHashCodeAcrossDifferentSchemaLayouts() { + // Regression: equals() compared the pre-encoded peerTags list, but hashCode (via hashOf) + // mixes in the raw schema names + values arrays. Two entries built from different schema + // layouts could collapse to the same encoded peerTags ("b:x") while their raw arrays differ + // -- equals returned true but hashCodes differed, violating the hashCode contract. Now + // equals compares the raw arrays directly, mirroring matches()/hashOf(). + // + // Build two entries that exercise that exact shape: + // A: schema ["a","b"], values [null,"x"] -> encoded ["b:x"] + // B: schema ["b","c"], values ["x",null] -> encoded ["b:x"] + AggregateEntry a = + AggregateEntry.forSnapshot( + snapshotWithPeerTags(new String[] {"a", "b"}, new String[] {null, "x"})); + AggregateEntry b = + AggregateEntry.forSnapshot( + snapshotWithPeerTags(new String[] {"b", "c"}, new String[] {"x", null})); + + // Sanity: same encoded peer tags, despite different raw layout. + assertEquals(a.getPeerTags(), b.getPeerTags()); + + // Different raw layouts -> entries must not be equal. + assertNotEquals(a, b); + // And different hashCodes (matching the inequality). + assertNotEquals(a.hashCode(), b.hashCode()); + } + + @Test + void equalEntriesHaveEqualHashCodes() { + // Positive case: two entries built from identical snapshots must equal AND share hashCode. + AggregateEntry a = + AggregateEntry.forSnapshot( + snapshotWithPeerTags(new String[] {"a", "b"}, new String[] {null, "x"})); + AggregateEntry b = + AggregateEntry.forSnapshot( + snapshotWithPeerTags(new String[] {"a", "b"}, new String[] {null, "x"})); + + assertEquals(a, b); + assertEquals(a.hashCode(), b.hashCode()); + } + + private static SpanSnapshot snapshotWithPeerTags(String[] names, String[] values) { + return new SpanSnapshot( + "resource", + "svc", + "op", + null, + "type", + (short) 200, + false, + true, + "client", + PeerTagSchema.testSchema(names), + values, + null, + null, + null, + 0L); + } + + private static AggregateEntry newEntry() { + SpanSnapshot snapshot = + new SpanSnapshot( + "resource", + "svc", + "op", + null, + "type", + (short) 200, + false, + true, + "client", + null, + null, + null, + null, + null, + 0L); + return AggregateEntry.forSnapshot(snapshot); + } +} diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java new file mode 100644 index 00000000000..42a5b98db39 --- /dev/null +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/AggregateTableTest.java @@ -0,0 +1,371 @@ +package datadog.trace.common.metrics; + +import static datadog.trace.common.metrics.AggregateEntry.ERROR_TAG; +import static datadog.trace.common.metrics.AggregateEntry.TOP_LEVEL_TAG; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNotSame; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertSame; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import datadog.metrics.agent.AgentMeter; +import datadog.metrics.api.statsd.StatsDClient; +import datadog.metrics.impl.DDSketchHistograms; +import datadog.metrics.impl.MonitoringImpl; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +class AggregateTableTest { + + @BeforeAll + static void initAgentMeter() { + // AggregateEntry.recordOneDuration -> Histogram.accept needs AgentMeter to be initialized. + MonitoringImpl monitoring = new MonitoringImpl(StatsDClient.NO_OP, 1, TimeUnit.SECONDS); + AgentMeter.registerIfAbsent(StatsDClient.NO_OP, monitoring, DDSketchHistograms.FACTORY); + monitoring.newTimer("test.init"); + } + + @Test + void insertOnMissReturnsNewAggregate() { + AggregateTable table = new AggregateTable(8); + SpanSnapshot s = snapshot("svc", "op", "client"); + + AggregateEntry agg = table.findOrInsert(s); + + assertNotNull(agg); + assertEquals(1, table.size()); + assertEquals(0, agg.getHitCount()); + } + + @Test + void hitReturnsSameAggregateInstance() { + AggregateTable table = new AggregateTable(8); + SpanSnapshot s1 = snapshot("svc", "op", "client"); + SpanSnapshot s2 = snapshot("svc", "op", "client"); + + AggregateEntry first = table.findOrInsert(s1); + AggregateEntry second = table.findOrInsert(s2); + + assertSame(first, second); + assertEquals(1, table.size()); + } + + @Test + void differentKindFieldsAreDistinct() { + AggregateTable table = new AggregateTable(8); + + AggregateEntry clientAgg = table.findOrInsert(snapshot("svc", "op", "client")); + AggregateEntry serverAgg = table.findOrInsert(snapshot("svc", "op", "server")); + + assertNotSame(clientAgg, serverAgg); + assertEquals(2, table.size()); + } + + @Test + void peerTagPairsParticipateInIdentity() { + AggregateTable table = new AggregateTable(8); + SpanSnapshot withTags = + builder("svc", "op", "client").peerTags("peer.hostname", "host-a").build(); + SpanSnapshot otherTags = + builder("svc", "op", "client").peerTags("peer.hostname", "host-b").build(); + SpanSnapshot noTags = builder("svc", "op", "client").build(); + + AggregateEntry a = table.findOrInsert(withTags); + AggregateEntry b = table.findOrInsert(otherTags); + AggregateEntry c = table.findOrInsert(noTags); + + assertNotSame(a, b); + assertNotSame(a, c); + assertNotSame(b, c); + assertEquals(3, table.size()); + } + + @Test + void capOverrunEvictsStaleEntry() { + AggregateTable table = new AggregateTable(2); + + AggregateEntry stale = table.findOrInsert(snapshot("svc-a", "op", "client")); + // do not record on stale -> hitCount stays at 0 + + AggregateEntry live = table.findOrInsert(snapshot("svc-b", "op", "client")); + live.recordOneDuration(10L | TOP_LEVEL_TAG); // hitCount=1, not evictable + + // table is full (size=2). Inserting a third should evict the stale one and succeed. + AggregateEntry newcomer = table.findOrInsert(snapshot("svc-c", "op", "client")); + assertNotNull(newcomer); + assertEquals(2, table.size()); + + // re-inserting the stale snapshot should miss now (it was evicted) and produce a fresh entry + AggregateEntry staleAgain = table.findOrInsert(snapshot("svc-a", "op", "client")); + assertNotSame(stale, staleAgain); + } + + @Test + void backToBackEvictionsAllSucceed() { + // Cursor amortization regression: cap the table, fill with stale entries, then force a + // sequence of cap-overrun inserts. Each insert must succeed (evicting one stale entry and + // inserting one new). The cursor field is internal, but if it were ever wedged (e.g. + // pointing past the end of buckets, or not advancing after a successful eviction), some + // later insert would fail to find a stale entry. Drives ~3x the capacity worth of inserts to + // give wrap-around plenty of chances to misbehave. + AggregateTable table = new AggregateTable(8); + for (int i = 0; i < 8; i++) { + table.findOrInsert(snapshot("init-" + i, "op", "client")); + } + for (int i = 0; i < 32; i++) { + AggregateEntry inserted = table.findOrInsert(snapshot("post-" + i, "op", "client")); + assertNotNull( + inserted, "insert #" + i + " should evict a stale entry and succeed (table full)"); + } + assertEquals(8, table.size()); + } + + @Test + void clearResetsCursorForSubsequentEvictions() { + // The cursor must reset to 0 on clear so a re-filled table doesn't start eviction at a + // stale bucket index. Verified indirectly: clear and re-fill, then force an eviction; the + // newcomer must successfully take a slot (which only works if a stale entry was found). + AggregateTable table = new AggregateTable(4); + + // Fill, age, evict once -- cursor lands at some non-zero bucket + for (int i = 0; i < 4; i++) { + table.findOrInsert(snapshot("warm-" + i, "op", "client")); + } + table.findOrInsert(snapshot("evict-trigger", "op", "client")); + + table.clear(); + assertEquals(0, table.size()); + + // Re-fill, age, force eviction -- should still find a stale entry from bucket 0 onward + for (int i = 0; i < 4; i++) { + table.findOrInsert(snapshot("fresh-" + i, "op", "client")); + } + AggregateEntry newcomer = table.findOrInsert(snapshot("post-clear", "op", "client")); + assertNotNull(newcomer, "post-clear cap-overrun insert must succeed via cursor-reset evict"); + } + + @Test + void capOverrunWithNoStaleReturnsNull() { + AggregateTable table = new AggregateTable(2); + + AggregateEntry a = table.findOrInsert(snapshot("svc-a", "op", "client")); + AggregateEntry b = table.findOrInsert(snapshot("svc-b", "op", "client")); + a.recordOneDuration(10L); + b.recordOneDuration(20L); + + AggregateEntry c = table.findOrInsert(snapshot("svc-c", "op", "client")); + assertNull(c); + assertEquals(2, table.size()); + } + + @Test + void expungeStaleAggregatesRemovesZeroHitsOnly() { + AggregateTable table = new AggregateTable(16); + + AggregateEntry live = table.findOrInsert(snapshot("svc-live", "op", "client")); + live.recordOneDuration(10L); + AggregateEntry stale1 = table.findOrInsert(snapshot("svc-stale1", "op", "client")); + AggregateEntry stale2 = table.findOrInsert(snapshot("svc-stale2", "op", "client")); + assertEquals(3, table.size()); + assertEquals(0, stale1.getHitCount()); + assertEquals(0, stale2.getHitCount()); + + table.expungeStaleAggregates(); + + assertEquals(1, table.size()); + // the live entry must still be reachable + assertSame(live, table.findOrInsert(snapshot("svc-live", "op", "client"))); + } + + @Test + void forEachVisitsEveryEntry() { + AggregateTable table = new AggregateTable(8); + table.findOrInsert(snapshot("a", "op", "client")).recordOneDuration(1L); + table.findOrInsert(snapshot("b", "op", "client")).recordOneDuration(2L); + table.findOrInsert(snapshot("c", "op", "client")).recordOneDuration(3L | ERROR_TAG); + + Map visited = new HashMap<>(); + table.forEach(e -> visited.put(e.getService().toString(), e.getDuration())); + + assertEquals(3, visited.size()); + assertEquals(1L, visited.get("a")); + assertEquals(2L, visited.get("b")); + assertEquals(3L, visited.get("c")); + } + + @Test + void clearEmptiesTheTable() { + AggregateTable table = new AggregateTable(8); + table.findOrInsert(snapshot("a", "op", "client")); + table.findOrInsert(snapshot("b", "op", "client")); + assertEquals(2, table.size()); + + table.clear(); + + assertTrue(table.isEmpty()); + assertEquals(0, table.size()); + // and re-insertion works after clear + assertNotNull(table.findOrInsert(snapshot("a", "op", "client"))); + } + + @Test + void encodedLabelsAreBuiltOnInsert() { + AggregateTable table = new AggregateTable(4); + List seen = new ArrayList<>(); + table.findOrInsert(snapshot("svc", "op", "client")); + table.forEach(seen::add); + + assertEquals(1, seen.size()); + AggregateEntry e = seen.get(0); + assertEquals("svc", e.getService().toString()); + assertEquals("op", e.getOperationName().toString()); + assertEquals("client", e.getSpanKind().toString()); + } + + @Test + void nullAndEmptyOptionalFieldsCollapseToOneEntry() { + // Regression: canonicalize() maps null -> EMPTY (or to a cache.computeIfAbsent("") entry for + // ""), but the prior contentEquals impl treated `non-null vs null` as not-equal -- so a second + // snapshot with the same null fields hashed to the same bucket but failed matches(), causing a + // spurious duplicate insert. The fix unifies null and length-zero on both sides of + // contentEquals/stringContentEquals. + AggregateTable table = new AggregateTable(8); + + SpanSnapshot snapNull = nullableSnapshot(null, null, null, null); + SpanSnapshot snapEmpty = nullableSnapshot("", "", "", ""); + + AggregateEntry first = table.findOrInsert(snapNull); + AggregateEntry secondNull = table.findOrInsert(nullableSnapshot(null, null, null, null)); + AggregateEntry forEmpty = table.findOrInsert(snapEmpty); + + assertSame(first, secondNull, "two null-fielded snapshots must hit the same entry"); + assertSame(first, forEmpty, "null- and empty-fielded snapshots must hit the same entry"); + assertEquals(1, table.size()); + } + + @Test + void nullServiceAndSpanKindDoNotNpeAndCollapseWithEmpty() { + // Regression: serviceName and spanKind used to bypass canonicalize() and call + // cache.computeIfAbsent directly, which would NPE on a null input. Production paths never + // pass null for these (DDSpan always supplies a service; producer defaults spanKind to ""), + // but the matches/contentEquals logic already treats null-and-empty as equal, so the + // constructor should be consistent. This pins both null-safety and null-equals-empty + // behavior for the two fields that recently moved through canonicalize(). + AggregateTable table = new AggregateTable(8); + + SpanSnapshot allNulls = nullServiceKindSnapshot(null, null); + SpanSnapshot allEmpty = nullServiceKindSnapshot("", ""); + + AggregateEntry first = table.findOrInsert(allNulls); + AggregateEntry secondNull = table.findOrInsert(nullServiceKindSnapshot(null, null)); + AggregateEntry forEmpty = table.findOrInsert(allEmpty); + + assertSame(first, secondNull, "two null-service/-kind snapshots must hit the same entry"); + assertSame(first, forEmpty, "null- and empty-service/-kind snapshots must hit the same entry"); + assertEquals(1, table.size()); + assertEquals(0, first.getService().length(), "null serviceName should canonicalize to EMPTY"); + assertEquals(0, first.getSpanKind().length(), "null spanKind should canonicalize to EMPTY"); + } + + private static SpanSnapshot nullServiceKindSnapshot(String service, String spanKind) { + return new SpanSnapshot( + "resource", + service, + "op", + null, + "web", + (short) 200, + false, + true, + spanKind, + null, + null, + null, + null, + null, + 0L); + } + + private static SpanSnapshot nullableSnapshot( + String resource, String operation, String type, String serviceNameSource) { + return new SpanSnapshot( + resource, + "svc", + operation, + serviceNameSource, + type, + (short) 200, + false, + true, + "client", + null, + null, + null, + null, + null, + 0L); + } + + // ---------- helpers ---------- + + private static SpanSnapshot snapshot(String service, String operation, String spanKind) { + return builder(service, operation, spanKind).build(); + } + + private static SnapshotBuilder builder(String service, String operation, String spanKind) { + return new SnapshotBuilder(service, operation, spanKind); + } + + private static final class SnapshotBuilder { + private final String service; + private final String operation; + private final String spanKind; + private PeerTagSchema peerTagSchema; + private String[] peerTagValues; + private long tagAndDuration = 0L; + + SnapshotBuilder(String service, String operation, String spanKind) { + this.service = service; + this.operation = operation; + this.spanKind = spanKind; + } + + SnapshotBuilder peerTags(String... namesAndValues) { + int pairCount = namesAndValues.length / 2; + String[] names = new String[pairCount]; + String[] values = new String[pairCount]; + for (int i = 0; i < pairCount; i++) { + names[i] = namesAndValues[2 * i]; + values[i] = namesAndValues[2 * i + 1]; + } + this.peerTagSchema = PeerTagSchema.testSchema(names); + this.peerTagValues = values; + return this; + } + + SpanSnapshot build() { + return new SpanSnapshot( + "resource", + service, + operation, + null, + "web", + (short) 200, + false, + true, + spanKind, + peerTagSchema, + peerTagValues, + null, + null, + null, + tagAndDuration); + } + } +} diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBootstrapTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBootstrapTest.java index ef07e0fbc19..59681d4724e 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBootstrapTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorBootstrapTest.java @@ -217,9 +217,9 @@ void reconcileSurvivesStateChangeWhenTagsUnchanged() throws Exception { void reconcileSwapsSchemaWhenTagSetChanges() throws Exception { // The reconcile slow-path's swap branch: discovery refreshes the state AND the tag set // grows. Cached schema is rebuilt and the volatile reference points at the new schema. - // Verification is end-to-end -- we look at the MetricKey the writer receives. Pre-swap the - // span snapshot was pinned to the old schema so only peer.hostname appears; post-swap a new - // publish reads the new schema and the next flush carries both peer tags. + // Verification is end-to-end -- we look at the AggregateEntry the writer receives. Pre-swap + // the span snapshot was pinned to the old schema so only peer.hostname appears; post-swap a + // new publish reads the new schema and the next flush carries both peer tags. HealthMetrics healthMetrics = mock(HealthMetrics.class); MetricWriter writer = mock(MetricWriter.class); Sink sink = mock(Sink.class); @@ -266,8 +266,8 @@ void reconcileSwapsSchemaWhenTagSetChanges() throws Exception { .finishBucket(); // Publish 1: snapshot pinned to the original {peer.hostname} schema. cycle 1's reconcile - // will swap the cached schema BEFORE the flush, but this snapshot is already pinned so its - // MetricKey will still carry only peer.hostname. + // will swap the cached schema BEFORE the flush, but this snapshot is already pinned so the + // resulting AggregateEntry will still carry only peer.hostname. aggregator.publish( Collections.>singletonList(peerAggregationSpanWithBothPeerTags())); aggregator.report(); @@ -280,20 +280,20 @@ void reconcileSwapsSchemaWhenTagSetChanges() throws Exception { aggregator.report(); assertTrue(cycle2.await(2, SECONDS)); - // Capture every (MetricKey, AggregateMetric) the writer saw across both cycles. Pre-swap - // snapshot has 1 peer tag, post-swap has 2. - ArgumentCaptor keyCaptor = ArgumentCaptor.forClass(MetricKey.class); - verify(writer, times(2)).add(keyCaptor.capture(), any(AggregateMetric.class)); - List keys = keyCaptor.getAllValues(); + // Capture every AggregateEntry the writer saw across both cycles. Pre-swap snapshot has 1 + // peer tag, post-swap has 2. + ArgumentCaptor entryCaptor = ArgumentCaptor.forClass(AggregateEntry.class); + verify(writer, times(2)).add(entryCaptor.capture()); + List entries = entryCaptor.getAllValues(); assertEquals( Collections.singletonList(UTF8BytesString.create("peer.hostname:localhost")), - keys.get(0).getPeerTags(), + entries.get(0).getPeerTags(), "pre-swap snapshot should encode only peer.hostname"); assertEquals( Arrays.asList( UTF8BytesString.create("peer.hostname:localhost"), UTF8BytesString.create("peer.service:billing")), - keys.get(1).getPeerTags(), + entries.get(1).getPeerTags(), "post-swap snapshot should encode both peer.hostname and peer.service"); // Bootstrap (1) + cycle 1 slow-path (1) -- cycle 2 is fast-path so doesn't reach peerTags(). diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorDisableTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorDisableTest.java new file mode 100644 index 00000000000..d95d44917ea --- /dev/null +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/ConflatingMetricsAggregatorDisableTest.java @@ -0,0 +1,236 @@ +package datadog.trace.common.metrics; + +import static java.util.concurrent.TimeUnit.SECONDS; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyInt; +import static org.mockito.ArgumentMatchers.anyLong; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.after; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.reset; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import datadog.communication.ddagent.DDAgentFeaturesDiscovery; +import datadog.trace.bootstrap.instrumentation.api.Tags; +import datadog.trace.core.CoreSpan; +import datadog.trace.core.SpanKindFilter; +import datadog.trace.core.monitor.HealthMetrics; +import java.util.Collections; +import java.util.concurrent.CountDownLatch; +import org.junit.jupiter.api.Test; +import org.mockito.ArgumentCaptor; + +/** + * Coverage for the {@code disable() -> ClearSignal.CLEAR} threading routing introduced in this PR. + * + *

The bundled fix routes the agent-downgrade clear through the inbox so the aggregator thread + * stays the sole writer to {@link AggregateTable} (which is not thread-safe). The behavioral + * contract this test pins: + * + *

+ * + *

The test would fail if {@code disable()} reverted to mutating {@code AggregateTable} directly + * (the pre-fix path) only via races -- not deterministically -- so the assertions here are about + * the observable end-to-end shape rather than thread identity. + */ +class ConflatingMetricsAggregatorDisableTest { + + @Test + void downgradeRoutesClearThroughInboxBeforeNextReport() throws Exception { + HealthMetrics healthMetrics = mock(HealthMetrics.class); + MetricWriter writer = mock(MetricWriter.class); + Sink sink = mock(Sink.class); + DDAgentFeaturesDiscovery features = mock(DDAgentFeaturesDiscovery.class); + when(features.supportsMetrics()).thenReturn(true); + when(features.peerTags()).thenReturn(Collections.emptySet()); + when(features.state()).thenReturn("state-1"); + + ConflatingMetricsAggregator aggregator = + new ConflatingMetricsAggregator( + Collections.emptySet(), + features, + healthMetrics, + sink, + writer, + /* maxAggregates */ 16, + /* queueSize */ 64, + /* reportingInterval */ 10, + SECONDS, + /* includeEndpointInMetrics */ false); + aggregator.start(); + try { + // Baseline: publish a span, run a report, verify the table flushes normally. This gives + // us a clean post-first-report state with the aggregator's reconcile already having fired + // once on the aggregator thread. + CountDownLatch firstFlush = new CountDownLatch(1); + org.mockito.Mockito.doAnswer( + invocation -> { + firstFlush.countDown(); + return null; + }) + .when(writer) + .finishBucket(); + + aggregator.publish(Collections.>singletonList(metricsEligibleSpan())); + aggregator.report(); + assertTrue(firstFlush.await(2, SECONDS)); + + // Reset writer-side mock interactions so the post-disable verify() blocks below only see + // what happens after the downgrade. features mock keeps accumulating call counts -- we use + // those counts as a latch on aggregator-thread reconcile timing. + reset(writer); + + // Flip the discovery state. disable()'s first action is features.discover() followed by a + // features.supportsMetrics() check; returning false here selects the clear path. + when(features.supportsMetrics()).thenReturn(false); + + // Fire DOWNGRADED on the test thread. This is the production scenario where the OkHttpSink + // callback thread triggers onEvent. disable() offers ClearSignal.CLEAR to the inbox but + // does not (and must not) mutate AggregateTable directly here. + aggregator.onEvent(EventListener.EventType.DOWNGRADED, ""); + + // First: verify nothing flushes immediately after disable. We can't pin reconcile-on-the- + // aggregator-thread as a latch here because CLEAR's inbox.clear() drops any REPORT we'd + // queue behind it -- so we just wait a window for any flush attempt to materialize. + verify(writer, after(500).never()).startBucket(anyInt(), anyLong(), anyLong()); + + // Stronger contract: prove the table is actually empty after CLEAR by re-enabling metrics + // and publishing a *marker* span with a distinct resource name. The next report should + // flush exactly one entry -- the marker -- with the original "resource" gone. If disable() + // had failed to clear the table (or had cleared it from the wrong thread and corrupted + // bucket chains), this assertion would catch it. + when(features.supportsMetrics()).thenReturn(true); + CountDownLatch postClearFlush = new CountDownLatch(1); + org.mockito.Mockito.doAnswer( + invocation -> { + postClearFlush.countDown(); + return null; + }) + .when(writer) + .finishBucket(); + aggregator.publish(Collections.>singletonList(markerSpan())); + aggregator.report(); + assertTrue(postClearFlush.await(2, SECONDS)); + + ArgumentCaptor entryCaptor = ArgumentCaptor.forClass(AggregateEntry.class); + verify(writer, times(1)).add(entryCaptor.capture()); + assertEquals( + "marker-resource", + entryCaptor.getValue().getResource().toString(), + "post-CLEAR bucket should contain only the marker -- the original entry was wiped"); + } finally { + aggregator.close(); + } + } + + @Test + void clearDoesNotTrampleQueuedStopSignal() throws Exception { + // Regression: prior CLEAR handler called inbox.clear(), which would erase any STOP signal + // queued behind it. close() then waited out thread.join's timeout because Drainer never saw + // the STOP and `stopped` was never set. Now the CLEAR handler clears only the aggregates + // table; queued signals (STOP, REPORT) survive and get processed normally. + HealthMetrics healthMetrics = mock(HealthMetrics.class); + MetricWriter writer = mock(MetricWriter.class); + Sink sink = mock(Sink.class); + DDAgentFeaturesDiscovery features = mock(DDAgentFeaturesDiscovery.class); + when(features.supportsMetrics()).thenReturn(true); + when(features.peerTags()).thenReturn(Collections.emptySet()); + when(features.state()).thenReturn("state-1"); + + ConflatingMetricsAggregator aggregator = + new ConflatingMetricsAggregator( + Collections.emptySet(), + features, + healthMetrics, + sink, + writer, + /* maxAggregates */ 16, + /* queueSize */ 64, + /* reportingInterval */ 10, + SECONDS, + /* includeEndpointInMetrics */ false); + aggregator.start(); + + // Force at least one snapshot into the inbox so the aggregator has something to drain. + aggregator.publish(Collections.>singletonList(metricsEligibleSpan())); + + // Fire DOWNGRADED on this thread. disable() flips supportsMetrics() to false and offers + // CLEAR. Then immediately call close() which offers STOP. If CLEAR's handler clears the + // inbox, STOP gets trampled and close() hangs until the join timeout. + when(features.supportsMetrics()).thenReturn(false); + aggregator.onEvent(EventListener.EventType.DOWNGRADED, ""); + + // close() is synchronous; bound it ourselves rather than trusting THREAD_JOIN_TIMEOUT_MS. + long deadlineNanos = System.nanoTime() + java.util.concurrent.TimeUnit.SECONDS.toNanos(2); + Thread closer = new Thread(aggregator::close, "test-closer"); + closer.start(); + while (closer.isAlive() && System.nanoTime() < deadlineNanos) { + closer.join(50); + } + assertTrue( + !closer.isAlive(), + "close() must return promptly -- if CLEAR trampled STOP, this hangs out the join timeout"); + } + + @SuppressWarnings({"rawtypes", "unchecked"}) + private static CoreSpan metricsEligibleSpan() { + CoreSpan span = mock(CoreSpan.class); + when(span.isMeasured()).thenReturn(false); + when(span.isTopLevel()).thenReturn(true); + // Return true for any SpanKindFilter so peerTagSchemaFor enters the bootstrap path on the + // first publish. We want that bootstrap to fire (it's what makes features.state() + // observable), even though peerTags() returns emptySet here and the resulting schema has + // size 0. + when(span.isKind(any(SpanKindFilter.class))).thenReturn(true); + when(span.getLongRunningVersion()).thenReturn(0); + when(span.getDurationNano()).thenReturn(100L); + when(span.getError()).thenReturn(0); + when(span.getResourceName()).thenReturn("resource"); + when(span.getServiceName()).thenReturn("svc"); + when(span.getOperationName()).thenReturn("op"); + when(span.getServiceNameSource()).thenReturn(null); + when(span.getType()).thenReturn("web"); + when(span.getHttpStatusCode()).thenReturn((short) 200); + when(span.getParentId()).thenReturn(0L); + when(span.getOrigin()).thenReturn(null); + when(span.unsafeGetTag(eq(Tags.SPAN_KIND), any(CharSequence.class))).thenReturn("client"); + return span; + } + + /** + * Distinct from {@link #metricsEligibleSpan()} via the resource name: post-CLEAR the writer + * should see "marker-resource", proving the original "resource" entry is gone from the table. + */ + @SuppressWarnings({"rawtypes", "unchecked"}) + private static CoreSpan markerSpan() { + CoreSpan span = mock(CoreSpan.class); + when(span.isMeasured()).thenReturn(false); + when(span.isTopLevel()).thenReturn(true); + when(span.isKind(any(SpanKindFilter.class))).thenReturn(true); + when(span.getLongRunningVersion()).thenReturn(0); + when(span.getDurationNano()).thenReturn(100L); + when(span.getError()).thenReturn(0); + when(span.getResourceName()).thenReturn("marker-resource"); + when(span.getServiceName()).thenReturn("svc"); + when(span.getOperationName()).thenReturn("op"); + when(span.getServiceNameSource()).thenReturn(null); + when(span.getType()).thenReturn("web"); + when(span.getHttpStatusCode()).thenReturn((short) 200); + when(span.getParentId()).thenReturn(0L); + when(span.getOrigin()).thenReturn(null); + when(span.unsafeGetTag(eq(Tags.SPAN_KIND), any(CharSequence.class))).thenReturn("client"); + return span; + } +} diff --git a/dd-trace-core/src/test/java/datadog/trace/common/metrics/PeerTagSchemaTest.java b/dd-trace-core/src/test/java/datadog/trace/common/metrics/PeerTagSchemaTest.java index 7d818a2686b..eb43116e76f 100644 --- a/dd-trace-core/src/test/java/datadog/trace/common/metrics/PeerTagSchemaTest.java +++ b/dd-trace-core/src/test/java/datadog/trace/common/metrics/PeerTagSchemaTest.java @@ -3,6 +3,7 @@ import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -90,4 +91,44 @@ void hasSameTagsAsHandlesEmpty() { assertTrue(empty.hasSameTagsAs(Collections.emptySet())); assertFalse(empty.hasSameTagsAs(Collections.singleton("peer.hostname"))); } + + @Test + void equalsIsContentBasedOnNames() { + PeerTagSchema a = PeerTagSchema.testSchema(new String[] {"peer.hostname", "peer.service"}); + PeerTagSchema b = PeerTagSchema.testSchema(new String[] {"peer.hostname", "peer.service"}); + + assertEquals(a, b); + assertEquals(b, a); + assertEquals(a.hashCode(), b.hashCode()); + } + + @Test + void equalsIgnoresState() { + // state is a reconcile-bookkeeping field, not part of schema identity. + PeerTagSchema early = + PeerTagSchema.of(Collections.singleton("peer.hostname"), "state-1"); + PeerTagSchema late = + PeerTagSchema.of(Collections.singleton("peer.hostname"), "state-2"); + + assertEquals(early, late); + assertEquals(early.hashCode(), late.hashCode()); + } + + @Test + void equalsDistinguishesByOrder() { + // names is positional -- the array index pairs with SpanSnapshot.peerTagValues. Schemas with + // the same tags in different positions are NOT interchangeable. + PeerTagSchema ab = PeerTagSchema.testSchema(new String[] {"a", "b"}); + PeerTagSchema ba = PeerTagSchema.testSchema(new String[] {"b", "a"}); + + assertNotEquals(ab, ba); + } + + @Test + void equalsHandlesNullAndOtherTypes() { + PeerTagSchema schema = PeerTagSchema.testSchema(new String[] {"peer.hostname"}); + + assertNotEquals(schema, null); + assertNotEquals(schema, "peer.hostname"); + } } diff --git a/dd-trace-core/src/traceAgentTest/groovy/MetricsIntegrationTest.groovy b/dd-trace-core/src/traceAgentTest/groovy/MetricsIntegrationTest.groovy index 2972ffa2c18..3cc703603e1 100644 --- a/dd-trace-core/src/traceAgentTest/groovy/MetricsIntegrationTest.groovy +++ b/dd-trace-core/src/traceAgentTest/groovy/MetricsIntegrationTest.groovy @@ -8,14 +8,14 @@ import datadog.metrics.impl.DDSketchHistograms import datadog.trace.api.Config import datadog.trace.api.WellKnownTags import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString -import datadog.trace.common.metrics.AggregateMetric +import datadog.trace.common.metrics.AggregateEntry import datadog.trace.common.metrics.EventListener -import datadog.trace.common.metrics.MetricKey import datadog.trace.common.metrics.OkHttpSink +import datadog.trace.common.metrics.PeerTagSchema import datadog.trace.common.metrics.SerializingMetricWriter +import datadog.trace.common.metrics.SpanSnapshot import java.util.concurrent.CopyOnWriteArrayList import java.util.concurrent.CountDownLatch -import java.util.concurrent.atomic.AtomicLongArray import okhttp3.HttpUrl class MetricsIntegrationTest extends AbstractTraceAgentTest { @@ -39,14 +39,20 @@ class MetricsIntegrationTest extends AbstractTraceAgentTest { sink ) writer.startBucket(2, System.nanoTime(), SECONDS.toNanos(10)) - writer.add( - new MetricKey("resource1", "service1", "operation1", null, "sql", 0, false, true, "xyzzy", [UTF8BytesString.create("grault:quux")], null, null, null), - new AggregateMetric().recordDurations(5, new AtomicLongArray(2, 1, 2, 250, 4, 5)) - ) - writer.add( - new MetricKey("resource2", "service2", "operation2", null, "web", 200, false, true, "xyzzy", [UTF8BytesString.create("grault:quux")], null, null, null), - new AggregateMetric().recordDurations(10, new AtomicLongArray(1, 1, 200, 2, 3, 4, 5, 6, 7, 8, 9)) - ) + // Build entries via SpanSnapshot directly: the test factory lives in src/test/java but this + // is the separate traceAgentTest source set, so we can't see it. Both entries use one peer + // tag (grault:quux) -> schema names=["grault"], values=["quux"]. + PeerTagSchema schema = PeerTagSchema.testSchema(["grault"] as String[]) + def entry1 = AggregateEntry.forSnapshot(new SpanSnapshot( + "resource1", "service1", "operation1", null, "sql", (short) 0, + false, true, "xyzzy", schema, ["quux"] as String[], null, null, null, 0L)) + [2, 1, 2, 250, 4].each { entry1.recordOneDuration(it as long) } + writer.add(entry1) + def entry2 = AggregateEntry.forSnapshot(new SpanSnapshot( + "resource2", "service2", "operation2", null, "web", (short) 200, + false, true, "xyzzy", schema, ["quux"] as String[], null, null, null, 0L)) + [1, 1, 200, 2, 3, 4, 5, 6, 7, 8].each { entry2.recordOneDuration(it as long) } + writer.add(entry2) writer.finishBucket() then: diff --git a/internal-api/src/main/java/datadog/trace/util/Hashtable.java b/internal-api/src/main/java/datadog/trace/util/Hashtable.java index 8f40e4609bc..ff3202c1f33 100644 --- a/internal-api/src/main/java/datadog/trace/util/Hashtable.java +++ b/internal-api/src/main/java/datadog/trace/util/Hashtable.java @@ -482,7 +482,24 @@ MutatingBucketIterator mutatingBucketIterator( */ public static final MutatingTableIterator mutatingTableIterator(Hashtable.Entry[] buckets) { - return new MutatingTableIterator(buckets); + return new MutatingTableIterator(buckets, 0, buckets.length); + } + + /** + * Variant of {@link #mutatingTableIterator(Hashtable.Entry[])} that walks only the half-open + * bucket range {@code [startBucket, endBucket)}. Useful for resumable sweeps -- e.g. cursor- + * based eviction in {@code AggregateTable} -- where one call drives {@code [cursor, length)} + * and a wrap-around call drives {@code [0, cursor)}. The iterator does not wrap around + * within a single instance; callers compose two iterators when wrap-around is desired. An empty + * range ({@code startBucket == endBucket}) produces an immediately exhausted iterator. + * + * @param startBucket inclusive lower bound; must be in {@code [0, buckets.length]}. + * @param endBucket exclusive upper bound; must be in {@code [startBucket, buckets.length]}. + */ + public static final + MutatingTableIterator mutatingTableIterator( + Hashtable.Entry[] buckets, int startBucket, int endBucket) { + return new MutatingTableIterator(buckets, startBucket, endBucket); } public static final int bucketIndex(Object[] buckets, long keyHash) { @@ -752,6 +769,9 @@ public static final class MutatingTableIterator implements Iterator { private final Hashtable.Entry[] buckets; + /** Exclusive upper bound for bucket indices visited by this iterator. */ + private final int endBucket; + /** * Index of the bucket holding {@link #nextEntry} (or holding {@link #curEntry} after remove). */ @@ -782,9 +802,34 @@ public static final class MutatingTableIterator */ private Hashtable.Entry curEntry; - MutatingTableIterator(Hashtable.Entry[] buckets) { + MutatingTableIterator(Hashtable.Entry[] buckets, int startBucket, int endBucket) { this.buckets = buckets; - seekFromBucket(0); + if (startBucket < 0 || startBucket > buckets.length) { + throw new IndexOutOfBoundsException( + "startBucket " + startBucket + " out of range [0, " + buckets.length + "]"); + } + if (endBucket < startBucket || endBucket > buckets.length) { + throw new IndexOutOfBoundsException( + "endBucket " + + endBucket + + " out of range [" + + startBucket + + ", " + + buckets.length + + "]"); + } + this.endBucket = endBucket; + seekFromBucket(startBucket); + } + + /** + * Bucket index of the entry last returned by {@link #next()}, or {@code -1} if {@code next} has + * not yet been called or the most recent call was {@link #remove()}. Useful for callers driving + * a cursor — e.g. resumable eviction sweeps that want to remember where the last successful + * removal landed. + */ + public int currentBucket() { + return this.curBucketIndex; } @Override @@ -841,12 +886,12 @@ public void remove() { } /** - * Advance {@code nextBucketIndex} / {@code nextEntry} to the first non-empty bucket >= {@code - * from}. + * Advance {@code nextBucketIndex} / {@code nextEntry} to the first non-empty bucket {@code >= + * from} within {@code [0, endBucket)}. */ private void seekFromBucket(int from) { Hashtable.Entry[] thisBuckets = this.buckets; - for (int i = from; i < thisBuckets.length; i++) { + for (int i = from; i < this.endBucket; i++) { Hashtable.Entry head = thisBuckets[i]; if (head != null) { this.nextBucketIndex = i; diff --git a/internal-api/src/test/java/datadog/trace/util/HashtableTest.java b/internal-api/src/test/java/datadog/trace/util/HashtableTest.java index 2992279be6d..953453ca3aa 100644 --- a/internal-api/src/test/java/datadog/trace/util/HashtableTest.java +++ b/internal-api/src/test/java/datadog/trace/util/HashtableTest.java @@ -349,5 +349,64 @@ void removeTwiceWithoutInterveningNextThrows() { it.remove(); assertThrows(IllegalStateException.class, it::remove); } + + @Test + void halfOpenRangeOmitsBucketsOutsideTheRange() { + // CollidingKey lets us pin entries to specific buckets via controlled hashCode. 16-slot + // table -> bucketIndex = hash & 15. Place entries in buckets 0, 5, and 10; iterate + // [5, 10) -- should see only bucket 5. + Hashtable.D1 table = new Hashtable.D1<>(16); + table.insert(new CollidingKeyEntry(new CollidingKey("b0", 0), 1)); + table.insert(new CollidingKeyEntry(new CollidingKey("b5", 5), 2)); + table.insert(new CollidingKeyEntry(new CollidingKey("b10", 10), 3)); + + Set seen = new HashSet<>(); + for (MutatingTableIterator it = + Support.mutatingTableIterator(table.buckets, 5, 10); + it.hasNext(); ) { + seen.add(it.next().key.label); + } + assertEquals(1, seen.size()); + assertTrue(seen.contains("b5")); + } + + @Test + void emptyHalfOpenRangeIsExhausted() { + // start == end -> immediately-exhausted iterator. Important: this is the wrap-around + // pass [0, cursor) when cursor == 0 in resumable sweeps. + Hashtable.D1 table = new Hashtable.D1<>(8); + table.insert(new StringIntEntry("a", 1)); + MutatingTableIterator it = Support.mutatingTableIterator(table.buckets, 0, 0); + assertFalse(it.hasNext()); + } + + @Test + void rangeBoundsOutOfOrderThrows() { + Hashtable.D1 table = new Hashtable.D1<>(8); + assertThrows( + IndexOutOfBoundsException.class, + () -> Support.mutatingTableIterator(table.buckets, -1, 4)); + assertThrows( + IndexOutOfBoundsException.class, + () -> Support.mutatingTableIterator(table.buckets, 4, 2)); // end < start + assertThrows( + IndexOutOfBoundsException.class, + () -> + Support.mutatingTableIterator( + table.buckets, 0, table.buckets.length + 1)); // end > len + } + + @Test + void currentBucketReportsLandingIndex() { + // Pin one entry to a known bucket and check currentBucket() after next() reports that + // bucket. Before any next() (or after remove()), currentBucket() returns -1. + Hashtable.D1 table = new Hashtable.D1<>(16); + table.insert(new CollidingKeyEntry(new CollidingKey("b3", 3), 1)); + + MutatingTableIterator it = Support.mutatingTableIterator(table.buckets); + assertEquals(-1, it.currentBucket(), "before any next() currentBucket should be -1"); + it.next(); + assertEquals(3, it.currentBucket(), "currentBucket should report the entry's bucket"); + } } }