From 0a4b1bcce30eedc45d1fb1ff631745b7dedb04ae Mon Sep 17 00:00:00 2001 From: Yiming Luo <10097700+lym953@users.noreply.github.com> Date: Fri, 29 May 2026 15:12:47 -0400 Subject: [PATCH 01/21] fix(metrics): emit OOM metric when max_memory_used == memory_size, with per-request dedup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Customer report (#1237): a Node.js Lambda that hit its memory limit (Memory Size 192 MB / Max Memory Used 192 MB, Status: timeout) did not emit aws.lambda.enhanced.out_of_memory because none of the existing detection paths matched. The Node runtime did not log "JavaScript heap out of memory" (V8 spent its time in GC instead of declaring an OOM), and PlatformRuntimeDone reported no error_type — just a wall-clock timeout — so the log-string and Runtime.OutOfMemory paths both stayed silent. Drop the provided.al* restriction on the PlatformReport equality check so any runtime emits OOM when max_memory_used_mb == memory_size_mb. To avoid double-counting against the two pre-existing paths (some invocations satisfy both equality and Runtime.OutOfMemory at the same time), add a per-Context oom_emitted flag. All three detection paths now funnel through Processor::try_increment_oom_metric, which checks the flag, sets it on first emission, and is a no-op on subsequent calls for the same request_id. The flag lives with the per-invocation Context and is cleared automatically when on_platform_report removes the context. Plumbing: Event::OutOfMemory now carries an Option<String> request_id (the log-path detector reads it from the logs processor's invocation_context.request_id, set on PlatformStart and cleared on PlatformRuntimeDone). When request_id is None — typically in Managed Instance mode, where extensions cannot subscribe to INVOKE, but also for OOM logs parsed before PlatformStart or after PlatformRuntimeDone — the helper falls back to a best-effort emit without dedup. 
Tests cover three scenarios: same request_id emits exactly once, two distinct request_ids each emit, and the equality path still fires (regression coverage for the dropped provided.al* check). Co-Authored-By: Claude Opus 4.7 (1M context) --- bottlecap/src/bin/bottlecap/main.rs | 7 +- bottlecap/src/event_bus/mod.rs | 8 +- bottlecap/src/lifecycle/invocation/context.rs | 7 + .../src/lifecycle/invocation/processor.rs | 185 ++++++++++++++++-- .../lifecycle/invocation/processor_service.rs | 15 +- bottlecap/src/logs/lambda/processor.rs | 38 +++- bottlecap/src/metrics/enhanced/lambda.rs | 12 +- 7 files changed, 245 insertions(+), 27 deletions(-) diff --git a/bottlecap/src/bin/bottlecap/main.rs b/bottlecap/src/bin/bottlecap/main.rs index 3dcc13bb2..a41a3f51f 100644 --- a/bottlecap/src/bin/bottlecap/main.rs +++ b/bottlecap/src/bin/bottlecap/main.rs @@ -841,9 +841,12 @@ async fn handle_event_bus_event( stats_concentrator: StatsConcentratorHandle, ) -> Option { match event { - Event::OutOfMemory(event_timestamp) => { + Event::OutOfMemory { + request_id, + timestamp, + } => { if let Err(e) = invocation_processor_handle - .on_out_of_memory_error(event_timestamp) + .on_out_of_memory_error(request_id, timestamp) .await { error!("Failed to send out of memory error to processor: {}", e); diff --git a/bottlecap/src/event_bus/mod.rs b/bottlecap/src/event_bus/mod.rs index 0ea20969e..0be3a86ca 100644 --- a/bottlecap/src/event_bus/mod.rs +++ b/bottlecap/src/event_bus/mod.rs @@ -7,7 +7,13 @@ mod constants; #[derive(Debug)] pub enum Event { Telemetry(TelemetryEvent), - OutOfMemory(i64), + OutOfMemory { + /// Lambda `request_id` of the invocation the OOM belongs to, when known. + /// Used by the invocation processor to dedupe against other OOM detection + /// paths (`PlatformRuntimeDone` `error_type`, `PlatformReport` memory equality). 
+ request_id: Option, + timestamp: i64, + }, Tombstone, } diff --git a/bottlecap/src/lifecycle/invocation/context.rs b/bottlecap/src/lifecycle/invocation/context.rs index 04894f9c6..3aef5e4bf 100644 --- a/bottlecap/src/lifecycle/invocation/context.rs +++ b/bottlecap/src/lifecycle/invocation/context.rs @@ -43,6 +43,12 @@ pub struct Context { /// tracing. /// pub extracted_span_context: Option, + /// Whether the `aws.lambda.enhanced.out_of_memory` metric has already been + /// emitted for this invocation. Multiple detection paths can fire for the + /// same OOM (runtime log, `Runtime.OutOfMemory` `error_type` in + /// `PlatformRuntimeDone`, `max_memory_used == memory_size` in `PlatformReport`); + /// this flag dedupes them. + pub oom_emitted: bool, } /// Struct containing the information needed to reparent a span. @@ -94,6 +100,7 @@ impl Default for Context { snapstart_restore_span: None, tracer_span: None, extracted_span_context: None, + oom_emitted: false, } } } diff --git a/bottlecap/src/lifecycle/invocation/processor.rs b/bottlecap/src/lifecycle/invocation/processor.rs index ec1af99ab..c77c224a8 100644 --- a/bottlecap/src/lifecycle/invocation/processor.rs +++ b/bottlecap/src/lifecycle/invocation/processor.rs @@ -508,7 +508,7 @@ impl Processor { debug!( "Invocation Processor | PlatformRuntimeDone | Got Runtime.OutOfMemory. Incrementing OOM metric." ); - self.enhanced_metrics.increment_oom_metric(timestamp); + self.try_increment_oom_metric(Some(request_id), timestamp); } } @@ -909,25 +909,25 @@ impl Processor { /// Handles `OnDemand` mode platform report processing. /// - /// Processes OnDemand-specific metrics including OOM detection for provided.al runtimes - /// and post-runtime duration calculation. + /// Processes OnDemand-specific metrics including OOM detection by memory-size + /// equality and post-runtime duration calculation. 
fn handle_ondemand_report( &mut self, request_id: &String, metrics: OnDemandReportMetrics, timestamp: i64, ) { - // For provided.al runtimes, if the last invocation hit the memory limit, increment the OOM metric. - // We do this for provided.al runtimes because we didn't find another way to detect this under provided.al. - // We don't do this for other runtimes to avoid double counting. - if let Some(runtime) = &self.runtime - && runtime.starts_with("provided.al") - && metrics.max_memory_used_mb == metrics.memory_size_mb - { + // If the invocation hit the memory limit, increment the OOM metric. This catches + // OOM-induced failures that don't surface through a runtime-specific log line or a + // `Runtime.OutOfMemory` error_type — most notably the suppressed-init / timeout-at-cap + // pattern reported in datadog-lambda-extension#1237 (Node) and the historical + // provided.al case. Dedup against the other two detection paths is handled by + // `Context::oom_emitted`, which `try_increment_oom_metric` checks and sets. + if metrics.max_memory_used_mb == metrics.memory_size_mb { debug!( "Invocation Processor | PlatformReport | Last invocation hit memory limit. Incrementing OOM metric." ); - self.enhanced_metrics.increment_oom_metric(timestamp); + self.try_increment_oom_metric(Some(request_id), timestamp); } // Calculate and set post-runtime duration if context is available @@ -1395,7 +1395,34 @@ impl Processor { Some(error_tags) } - pub fn on_out_of_memory_error(&mut self, timestamp: i64) { + pub fn on_out_of_memory_error(&mut self, request_id: Option<&String>, timestamp: i64) { + self.try_increment_oom_metric(request_id, timestamp); + } + + /// Increments the OOM enhanced metric exactly once per `request_id`. + /// + /// Several detection paths can fire for the same invocation: + /// 1. A runtime-specific OOM log line (logs processor → `Event::OutOfMemory`) + /// 2. `error_type == "Runtime.OutOfMemory"` in `PlatformRuntimeDone` + /// 3. 
`max_memory_used_mb == memory_size_mb` in `PlatformReport` + /// + /// To avoid double-counting, the per-invocation `Context::oom_emitted` flag is + /// set on the first emission. Subsequent emissions for the same `request_id` are + /// skipped. If `request_id` is `None` (log path saw the OOM outside an active + /// invocation window) or no context is found, we emit best-effort without dedup. + fn try_increment_oom_metric(&mut self, request_id: Option<&String>, timestamp: i64) { + if let Some(rid) = request_id + && let Some(ctx) = self.context_buffer.get_mut(rid) + { + if ctx.oom_emitted { + debug!( + "Invocation Processor | OOM metric already emitted for request_id {}, skipping", + rid + ); + return; + } + ctx.oom_emitted = true; + } self.enhanced_metrics.increment_oom_metric(timestamp); } @@ -2445,4 +2472,138 @@ mod tests { "pre-existing _dd.appsec.enabled value must not be overwritten" ); } + + /// Two OOM signals for the same `request_id` increment the metric exactly once. + /// Exercises the `Context::oom_emitted` dedup flag. + #[tokio::test] + async fn test_try_increment_oom_metric_dedupes_same_request_id() { + let mut p = setup(); + // Insert the context directly so we don't go through `on_invoke_event`, which + // would populate dynamic tags (`cold_start:true`) and complicate the query. 
+ let request_id = String::from("req-dedup"); + p.context_buffer.start_context(&request_id, Span::default()); + + let now: i64 = std::time::UNIX_EPOCH + .elapsed() + .expect("clock") + .as_secs() + .try_into() + .unwrap_or_default(); + + p.on_out_of_memory_error(Some(&request_id), now); + p.on_out_of_memory_error(Some(&request_id), now); + + let ts = (now / 10) * 10; + let entry = p + .enhanced_metrics + .aggr_handle + .get_entry_by_id( + crate::metrics::enhanced::constants::OUT_OF_MEMORY_METRIC.into(), + None, + ts, + ) + .await + .unwrap() + .expect("OOM metric must be emitted at least once"); + + let sketch = entry.value.get_sketch().expect("distribution sketch"); + let sum = sketch.sum().expect("sketch sum"); + assert!( + (sum - 1.0).abs() < f64::EPSILON, + "OOM sum must be 1.0 (deduped), got {sum}" + ); + + // And the context flag should now reflect that we emitted. + assert!( + p.context_buffer + .get(&request_id) + .expect("context") + .oom_emitted, + "oom_emitted flag must be set after the first emission" + ); + } + + /// OOM signals for different `request_id`s each emit a metric — dedup is scoped + /// per request, not globally. 
+ #[tokio::test] + async fn test_try_increment_oom_metric_distinct_request_ids_emit_separately() { + let mut p = setup(); + let req1 = String::from("req-a"); + let req2 = String::from("req-b"); + p.context_buffer.start_context(&req1, Span::default()); + p.context_buffer.start_context(&req2, Span::default()); + + let now: i64 = std::time::UNIX_EPOCH + .elapsed() + .expect("clock") + .as_secs() + .try_into() + .unwrap_or_default(); + + p.on_out_of_memory_error(Some(&req1), now); + p.on_out_of_memory_error(Some(&req2), now); + + let ts = (now / 10) * 10; + let entry = p + .enhanced_metrics + .aggr_handle + .get_entry_by_id( + crate::metrics::enhanced::constants::OUT_OF_MEMORY_METRIC.into(), + None, + ts, + ) + .await + .unwrap() + .expect("OOM metric must be emitted"); + + let sketch = entry.value.get_sketch().expect("distribution sketch"); + let sum = sketch.sum().expect("sketch sum"); + assert!( + (sum - 2.0).abs() < f64::EPSILON, + "OOM sum must be 2.0 (one per request_id), got {sum}" + ); + } + + /// Regression: the `max_memory_used_mb == memory_size_mb` path used to be gated + /// on `runtime.starts_with("provided.al")`. After generalising the rule to all + /// runtimes (with dedup via `Context::oom_emitted`), the equality case must + /// still emit OOM. 
+ #[tokio::test] + async fn test_handle_ondemand_report_emits_oom_on_memory_equality() { + let mut p = setup(); + let request_id = String::from("req-eq"); + p.context_buffer.start_context(&request_id, Span::default()); + + let now: i64 = std::time::UNIX_EPOCH + .elapsed() + .expect("clock") + .as_secs() + .try_into() + .unwrap_or_default(); + + let metrics = OnDemandReportMetrics { + duration_ms: 100.0, + billed_duration_ms: 100, + memory_size_mb: 1024, + max_memory_used_mb: 1024, + init_duration_ms: None, + restore_duration_ms: None, + }; + p.handle_ondemand_report(&request_id, metrics, now); + + let ts = (now / 10) * 10; + assert!( + p.enhanced_metrics + .aggr_handle + .get_entry_by_id( + crate::metrics::enhanced::constants::OUT_OF_MEMORY_METRIC.into(), + None, + ts + ) + .await + .unwrap() + .is_some(), + "OOM must be emitted when max_memory_used_mb == memory_size_mb" + ); + } } diff --git a/bottlecap/src/lifecycle/invocation/processor_service.rs b/bottlecap/src/lifecycle/invocation/processor_service.rs index a41a95b26..61c48b479 100644 --- a/bottlecap/src/lifecycle/invocation/processor_service.rs +++ b/bottlecap/src/lifecycle/invocation/processor_service.rs @@ -118,6 +118,7 @@ pub enum ProcessorCommand { execution_status: Option, }, OnOutOfMemoryError { + request_id: Option, timestamp: i64, }, OnShutdownEvent, @@ -407,10 +408,14 @@ impl InvocationProcessorHandle { pub async fn on_out_of_memory_error( &self, + request_id: Option, timestamp: i64, ) -> Result<(), mpsc::error::SendError> { self.sender - .send(ProcessorCommand::OnOutOfMemoryError { timestamp }) + .send(ProcessorCommand::OnOutOfMemoryError { + request_id, + timestamp, + }) .await } @@ -632,8 +637,12 @@ impl InvocationProcessorService { ) .await; } - ProcessorCommand::OnOutOfMemoryError { timestamp } => { - self.processor.on_out_of_memory_error(timestamp); + ProcessorCommand::OnOutOfMemoryError { + request_id, + timestamp, + } => { + self.processor + .on_out_of_memory_error(request_id.as_ref(), 
timestamp); } ProcessorCommand::OnShutdownEvent => { self.processor.on_shutdown_event(); diff --git a/bottlecap/src/logs/lambda/processor.rs b/bottlecap/src/logs/lambda/processor.rs index 643e32854..9c922ac69 100644 --- a/bottlecap/src/logs/lambda/processor.rs +++ b/bottlecap/src/logs/lambda/processor.rs @@ -163,6 +163,32 @@ impl LambdaProcessor { } } + /// Returns the `request_id` of the currently-active invocation, if known. + /// Set by `PlatformStart`, cleared by `PlatformRuntimeDone` / `PlatformReport`. + /// + /// Returns `None` when: + /// - **Managed Instance mode**: extensions cannot subscribe to the `INVOKE` event, + /// so `platform.start` is not delivered and this slot is never populated. OOM logs + /// parsed in MI mode are therefore always tagged `None`. The synthesized + /// `PlatformRuntimeDone` produced by `handle_managed_instance_report` does carry a + /// real `request_id`, so dedup still works for that path. Worst case is a thin + /// double-count window if a runtime emits both an OOM log line and + /// `error_type = Runtime.OutOfMemory` for the same invocation — not observed in + /// practice today. + /// - **Pre-`PlatformStart` init crash**: a FATAL OOM log emitted by init code can + /// arrive before `PlatformStart` (or with no `PlatformStart` at all, if init + /// fails outright). In the no-`PlatformStart` case no other detection path fires, + /// so no double-count. + /// - **Late log race**: a FATAL log parsed after `PlatformRuntimeDone` clears the + /// slot. The event then carries `None` and is emitted without dedup, so it can double-count if another detection path also fires for that invocation. 
+ fn current_request_id(&self) -> Option { + if self.invocation_context.request_id.is_empty() { + None + } else { + Some(self.invocation_context.request_id.clone()) + } + } + #[allow(clippy::too_many_lines)] async fn get_message(&mut self, event: TelemetryEvent) -> Result> { let copy = event.clone(); @@ -194,7 +220,10 @@ impl LambdaProcessor { if let Some(message) = message { if is_oom_error(&message) { debug!("LOGS | Got a runtime-specific OOM error. Incrementing OOM metric."); - if let Err(e) = self.event_bus.send(Event::OutOfMemory(event.time.timestamp())).await { + if let Err(e) = self.event_bus.send(Event::OutOfMemory { + request_id: self.current_request_id(), + timestamp: event.time.timestamp(), + }).await { error!("LOGS | Failed to send OOM event to the main event bus: {e}"); } } @@ -206,7 +235,7 @@ impl LambdaProcessor { event.time.timestamp_millis(), None, ); - // If the message is logged from the durable execution SDK, + // If the message is logged from the durable execution SDK, // set durable execution id and name as log attributes. if let Some((exec_id, exec_name)) = durable_ctx { msg.lambda.durable_execution_id = Some(exec_id); @@ -227,7 +256,10 @@ impl LambdaProcessor { if let Some(message) = message { if is_oom_error(&message) { debug!("LOGS | Got a runtime-specific OOM error. 
Incrementing OOM metric."); - if let Err(e) = self.event_bus.send(Event::OutOfMemory(event.time.timestamp())).await { + if let Err(e) = self.event_bus.send(Event::OutOfMemory { + request_id: self.current_request_id(), + timestamp: event.time.timestamp(), + }).await { error!("LOGS | Failed to send OOM event to the main event bus: {e}"); } } diff --git a/bottlecap/src/metrics/enhanced/lambda.rs b/bottlecap/src/metrics/enhanced/lambda.rs index abed7d5b9..67535967e 100644 --- a/bottlecap/src/metrics/enhanced/lambda.rs +++ b/bottlecap/src/metrics/enhanced/lambda.rs @@ -91,12 +91,12 @@ impl Lambda { self.increment_metric(constants::TIMEOUTS_METRIC, timestamp); } - // This function is called in three cases: - // 1. Runtime-specific OOM error (can happen in .NET, Node.js and Java as far as we know) - // 2. PlatformRuntimeDone event reports "error_type: Runtime.OutOfMemory" (can happen in Ruby and Python as far as we know) - // 3. PlatformReport event reports "max_memory_used_mb == memory_size_mb" (can happen in many runtimes, but - // we only call increment_oom_metric() for provided.al runtimes) - // This is our best effort to cover different cases without double counting. We can adjust this if we find more cases. + // Callers should generally go through `Processor::try_increment_oom_metric`, which + // dedupes by `request_id` so the same invocation isn't counted multiple times when + // more than one detection path fires. The three paths are: + // 1. Runtime-specific OOM log line (.NET, Node, Java, Go, Ruby, Python) + // 2. PlatformRuntimeDone with error_type == "Runtime.OutOfMemory" (Ruby, Python; Node as of 2026-05) + // 3. 
PlatformReport with max_memory_used_mb == memory_size_mb (all runtimes) pub fn increment_oom_metric(&self, timestamp: i64) { self.increment_metric(constants::OUT_OF_MEMORY_METRIC, timestamp); } From 5a833ac9f95dfe96300b28347d79b7bda1ac9401 Mon Sep 17 00:00:00 2001 From: Yiming Luo <10097700+lym953@users.noreply.github.com> Date: Fri, 29 May 2026 15:52:52 -0400 Subject: [PATCH 02/21] test(integration): add cross-runtime OOM test suite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a new `oom` integration-test suite that exercises the OOM dedup change (Context::oom_emitted, #1241) end-to-end across every supported runtime. Each lambda intentionally allocates until it OOMs; the test asserts aws.lambda.enhanced.out_of_memory increments by exactly one data point per function over the invocation window — which fails if the dedup flag stops working and two detection paths emit for the same invocation. New lambda apps under integration-tests/lambda/: - oom-node-v8-heap : exercises log-line path (JavaScript heap OOM) - oom-node-sigkill : exercises PlatformRuntimeDone Runtime.OutOfMemory path - oom-python : MemoryError — log path AND PlatformRuntimeDone path both fire, so dedup is necessary for count==1 - oom-ruby : NoMemoryError — same dual-path coverage as Python - oom-java : OutOfMemoryError (log-line path) - oom-dotnet : OutOfMemoryException (log-line path) - oom-go : fatal: runtime: out of memory — log path AND PlatformReport memory-equality path both fire Framework additions: - Ruby and Go runtime/layer helpers in lib/util.ts (Ruby tracer layer; Go has no tracer layer — extension layer alone covers the test). - Oom CDK stack registered in bin/app.ts. - build-ruby.sh (zip-as-is for now; Gemfile build stubbed) and build-go.sh (Docker cross-compile to ARM64 Linux, bootstrap binary). - Pipeline template additions for the two new build stages and oom suite registration in test-suites.yaml. 
- getMetricCount() + OUT_OF_MEMORY_METRIC in tests/utils/datadog.ts. Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitlab/datasources/test-suites.yaml | 1 + .gitlab/templates/pipeline.yaml.tpl | 38 +++ integration-tests/bin/app.ts | 4 + .../lambda/oom-dotnet/Function.cs | 25 ++ .../lambda/oom-dotnet/Function.csproj | 14 ++ integration-tests/lambda/oom-go/go.mod | 5 + integration-tests/lambda/oom-go/main.go | 23 ++ integration-tests/lambda/oom-java/pom.xml | 50 ++++ .../src/main/java/example/Handler.java | 24 ++ .../lambda/oom-node-sigkill/index.mjs | 13 ++ .../lambda/oom-node-v8-heap/index.mjs | 10 + .../lambda/oom-python/lambda_function.py | 12 + .../lambda/oom-ruby/lambda_function.rb | 13 ++ integration-tests/lib/stacks/oom.ts | 216 ++++++++++++++++++ integration-tests/lib/util.ts | 12 + integration-tests/scripts/build-go.sh | 123 ++++++++++ integration-tests/scripts/build-ruby.sh | 88 +++++++ integration-tests/scripts/local_deploy.sh | 2 + integration-tests/tests/oom.test.ts | 90 ++++++++ integration-tests/tests/utils/datadog.ts | 37 +++ 20 files changed, 800 insertions(+) create mode 100644 integration-tests/lambda/oom-dotnet/Function.cs create mode 100644 integration-tests/lambda/oom-dotnet/Function.csproj create mode 100644 integration-tests/lambda/oom-go/go.mod create mode 100644 integration-tests/lambda/oom-go/main.go create mode 100644 integration-tests/lambda/oom-java/pom.xml create mode 100644 integration-tests/lambda/oom-java/src/main/java/example/Handler.java create mode 100644 integration-tests/lambda/oom-node-sigkill/index.mjs create mode 100644 integration-tests/lambda/oom-node-v8-heap/index.mjs create mode 100644 integration-tests/lambda/oom-python/lambda_function.py create mode 100644 integration-tests/lambda/oom-ruby/lambda_function.rb create mode 100644 integration-tests/lib/stacks/oom.ts create mode 100755 integration-tests/scripts/build-go.sh create mode 100755 integration-tests/scripts/build-ruby.sh create mode 100644 
integration-tests/tests/oom.test.ts diff --git a/.gitlab/datasources/test-suites.yaml b/.gitlab/datasources/test-suites.yaml index 257b1ba04..b6d82c369 100644 --- a/.gitlab/datasources/test-suites.yaml +++ b/.gitlab/datasources/test-suites.yaml @@ -4,3 +4,4 @@ test_suites: - name: snapstart - name: lmi - name: auth + - name: oom diff --git a/.gitlab/templates/pipeline.yaml.tpl b/.gitlab/templates/pipeline.yaml.tpl index 60788606b..a87bfaa14 100644 --- a/.gitlab/templates/pipeline.yaml.tpl +++ b/.gitlab/templates/pipeline.yaml.tpl @@ -505,6 +505,40 @@ build node lambdas: - cd integration-tests - ./scripts/build-node.sh +build ruby lambdas: + stage: integration-tests + image: registry.ddbuild.io/images/docker:27.3.1 + tags: ["docker-in-docker:arm64"] + rules: + - when: on_success + needs: [] + artifacts: + expire_in: 1 hour + paths: + - integration-tests/lambda/*/*.rb + script: + - cd integration-tests + - ./scripts/build-ruby.sh + +build go lambdas: + stage: integration-tests + image: registry.ddbuild.io/images/docker:27.3.1 + tags: ["docker-in-docker:arm64"] + rules: + - when: on_success + needs: [] + cache: + key: go-mod-cache-${CI_COMMIT_REF_SLUG} + paths: + - integration-tests/.cache/go-mod/ + artifacts: + expire_in: 1 hour + paths: + - integration-tests/lambda/*/bin/bootstrap + script: + - cd integration-tests + - ./scripts/build-go.sh + # Integration Tests - Publish arm64 layer with integration test prefix publish integration layer (arm64): stage: integration-tests @@ -581,12 +615,16 @@ integration-suite: - build dotnet lambdas - build python lambdas - build node lambdas + - build ruby lambdas + - build go lambdas dependencies: - publish integration layer (arm64) - build java lambdas - build dotnet lambdas - build python lambdas - build node lambdas + - build ruby lambdas + - build go lambdas variables: IDENTIFIER: ${CI_COMMIT_SHORT_SHA} AWS_DEFAULT_REGION: us-east-1 diff --git a/integration-tests/bin/app.ts b/integration-tests/bin/app.ts index 
d822e6cac..affce4270 100644 --- a/integration-tests/bin/app.ts +++ b/integration-tests/bin/app.ts @@ -6,6 +6,7 @@ import {Otlp} from '../lib/stacks/otlp'; import {Snapstart} from '../lib/stacks/snapstart'; import {LambdaManagedInstancesStack} from '../lib/stacks/lmi'; import {AuthStack} from '../lib/stacks/auth'; +import {Oom} from '../lib/stacks/oom'; import {AuthRoleStack} from '../lib/auth-role'; import {ACCOUNT, getIdentifier, REGION} from '../config'; import {CapacityProviderStack} from "../lib/capacity-provider"; @@ -40,6 +41,9 @@ const stacks = [ new AuthStack(app, `integ-${identifier}-auth`, { env, }), + new Oom(app, `integ-${identifier}-oom`, { + env, + }), ] // Tag all stacks so we can easily clean them up diff --git a/integration-tests/lambda/oom-dotnet/Function.cs b/integration-tests/lambda/oom-dotnet/Function.cs new file mode 100644 index 000000000..b5c861493 --- /dev/null +++ b/integration-tests/lambda/oom-dotnet/Function.cs @@ -0,0 +1,25 @@ +using Amazon.Lambda.Core; +using System.Collections.Generic; +using System.Text.Json; + +[assembly: LambdaSerializer(typeof(Amazon.Lambda.Serialization.SystemTextJson.DefaultLambdaJsonSerializer))] + +namespace Function +{ + /// + /// OOM reproducer for .NET. Allocates and retains 10 MB byte arrays in a list + /// until the CLR throws System.OutOfMemoryException. Bottlecap's runtime-specific + /// log-line detection matches "OutOfMemoryException". 
+ /// + public class Handler + { + public Dictionary FunctionHandler(JsonElement input, ILambdaContext context) + { + var data = new List(); + while (true) + { + data.Add(new byte[10 * 1024 * 1024]); + } + } + } +} diff --git a/integration-tests/lambda/oom-dotnet/Function.csproj b/integration-tests/lambda/oom-dotnet/Function.csproj new file mode 100644 index 000000000..2dfcbac5f --- /dev/null +++ b/integration-tests/lambda/oom-dotnet/Function.csproj @@ -0,0 +1,14 @@ + + + net8.0 + enable + enable + true + Lambda + true + + + + + + diff --git a/integration-tests/lambda/oom-go/go.mod b/integration-tests/lambda/oom-go/go.mod new file mode 100644 index 000000000..a73b6d85d --- /dev/null +++ b/integration-tests/lambda/oom-go/go.mod @@ -0,0 +1,5 @@ +module oom-go + +go 1.22 + +require github.com/aws/aws-lambda-go v1.49.0 diff --git a/integration-tests/lambda/oom-go/main.go b/integration-tests/lambda/oom-go/main.go new file mode 100644 index 000000000..99821b0ad --- /dev/null +++ b/integration-tests/lambda/oom-go/main.go @@ -0,0 +1,23 @@ +// OOM reproducer for Go. +// Allocates and retains 10 MB byte slices in a slice header until the Go +// runtime aborts with "fatal error: runtime: out of memory". Bottlecap's +// runtime-specific log-line detection matches that fatal-error message. +// Without that detection (and historically for provided.al runtimes), the +// equality path in PlatformReport (max_memory_used_mb == memory_size_mb) also +// fires. The per-Context dedup flag ensures the metric increments only once. 
+package main + +import ( + "github.com/aws/aws-lambda-go/lambda" +) + +func handler() error { + var data [][]byte + for { + data = append(data, make([]byte, 10*1024*1024)) + } +} + +func main() { + lambda.Start(handler) +} diff --git a/integration-tests/lambda/oom-java/pom.xml b/integration-tests/lambda/oom-java/pom.xml new file mode 100644 index 000000000..1ead70ea0 --- /dev/null +++ b/integration-tests/lambda/oom-java/pom.xml @@ -0,0 +1,50 @@ + + + 4.0.0 + + example + oom-java-lambda + 1.0.0 + jar + + OOM Java Lambda + Java Lambda function that triggers OutOfMemoryError for integration tests + + + 21 + 21 + UTF-8 + + + + + com.amazonaws + aws-lambda-java-core + 1.4.0 + + + + + + + org.apache.maven.plugins + maven-shade-plugin + 3.5.0 + + + package + + shade + + + function + false + + + + + + + diff --git a/integration-tests/lambda/oom-java/src/main/java/example/Handler.java b/integration-tests/lambda/oom-java/src/main/java/example/Handler.java new file mode 100644 index 000000000..92edb9c18 --- /dev/null +++ b/integration-tests/lambda/oom-java/src/main/java/example/Handler.java @@ -0,0 +1,24 @@ +package example; + +import com.amazonaws.services.lambda.runtime.Context; +import com.amazonaws.services.lambda.runtime.RequestHandler; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +/** + * OOM reproducer for Java. Allocates and retains 10 MB byte arrays in a list + * until the JVM throws java.lang.OutOfMemoryError: Java heap space. + * Bottlecap's runtime-specific log-line detection matches + * "java.lang.OutOfMemoryError". 
+ */ +public class Handler implements RequestHandler, Map> { + + @Override + public Map handleRequest(Map event, Context context) { + List data = new ArrayList<>(); + while (true) { + data.add(new byte[10 * 1024 * 1024]); + } + } +} diff --git a/integration-tests/lambda/oom-node-sigkill/index.mjs b/integration-tests/lambda/oom-node-sigkill/index.mjs new file mode 100644 index 000000000..d6b245f36 --- /dev/null +++ b/integration-tests/lambda/oom-node-sigkill/index.mjs @@ -0,0 +1,13 @@ +// OOM reproducer: off-heap Buffer growth → kernel SIGKILL. +// Buffer.allocUnsafe(>8KB) goes through V8's ArrayBuffer allocator (external +// memory) and bypasses --max-old-space-size, so RSS grows until the cgroup +// limit triggers a kernel SIGKILL. Lambda surfaces this as PlatformRuntimeDone +// with error_type=Runtime.OutOfMemory — bottlecap's path 2 detection. +export const handler = async () => { + const bufs = []; + while (true) { + const b = Buffer.allocUnsafe(20 * 1024 * 1024); + b.fill(0); + bufs.push(b); + } +}; diff --git a/integration-tests/lambda/oom-node-v8-heap/index.mjs b/integration-tests/lambda/oom-node-v8-heap/index.mjs new file mode 100644 index 000000000..fb4e71c6f --- /dev/null +++ b/integration-tests/lambda/oom-node-v8-heap/index.mjs @@ -0,0 +1,10 @@ +// OOM reproducer: classic V8 heap exhaustion. Allocates retained strings in a +// loop until V8 hits its --max-old-space-size cap and prints +// "FATAL ERROR: ... JavaScript heap out of memory". Exercises bottlecap's +// runtime-specific log-line OOM detection path. +export const handler = async () => { + const arr = []; + while (true) { + arr.push('x'.repeat(10 * 1024 * 1024)); + } +}; diff --git a/integration-tests/lambda/oom-python/lambda_function.py b/integration-tests/lambda/oom-python/lambda_function.py new file mode 100644 index 000000000..12aa196ed --- /dev/null +++ b/integration-tests/lambda/oom-python/lambda_function.py @@ -0,0 +1,12 @@ +# OOM reproducer for Python. 
+# Allocates and retains 10 MB strings in a list until CPython raises +# MemoryError. Lambda surfaces this as PlatformRuntimeDone with +# error_type=Runtime.OutOfMemory; the function log line also contains +# "MemoryError". Both bottlecap detection paths fire — the dedup flag is +# what makes the OOM metric emit exactly once. + + +def handler(event, context): + data = [] + while True: + data.append("x" * (10 * 1024 * 1024)) diff --git a/integration-tests/lambda/oom-ruby/lambda_function.rb b/integration-tests/lambda/oom-ruby/lambda_function.rb new file mode 100644 index 000000000..674a70086 --- /dev/null +++ b/integration-tests/lambda/oom-ruby/lambda_function.rb @@ -0,0 +1,13 @@ +# OOM reproducer for Ruby. +# Allocates and retains 10 MB strings in an array until Ruby raises +# NoMemoryError. Lambda surfaces this as PlatformRuntimeDone with +# error_type=Runtime.OutOfMemory; the function log line also contains +# "failed to allocate memory (NoMemoryError)". Both bottlecap detection +# paths fire — the dedup flag is what makes the OOM metric emit exactly once. + +def handler(event:, context:) + data = [] + loop do + data << ("x" * (10 * 1024 * 1024)) + end +end diff --git a/integration-tests/lib/stacks/oom.ts b/integration-tests/lib/stacks/oom.ts new file mode 100644 index 000000000..dc7c6314e --- /dev/null +++ b/integration-tests/lib/stacks/oom.ts @@ -0,0 +1,216 @@ +import * as cdk from 'aws-cdk-lib'; +import * as lambda from 'aws-cdk-lib/aws-lambda'; +import { Construct } from 'constructs'; +import { + createLogGroup, + defaultDatadogEnvVariables, + defaultDatadogSecretPolicy, + getExtensionLayer, + getDefaultNodeLayer, + getDefaultPythonLayer, + getDefaultJavaLayer, + getDefaultDotnetLayer, + getDefaultRubyLayer, + defaultNodeRuntime, + defaultPythonRuntime, + defaultJavaRuntime, + defaultDotnetRuntime, + defaultRubyRuntime, + defaultGoRuntime, +} from '../util'; + +/** + * OOM cross-runtime test stack. 
+ * + * Deploys one Lambda per OOM "shape" so the bottlecap dedup change + * (Context::oom_emitted + try_increment_oom_metric, covering issue #1237) + * can be exercised end-to-end across every supported runtime. Each function + * intentionally allocates until it OOMs; the test then asserts the + * `aws.lambda.enhanced.out_of_memory` metric increments by exactly 1. + * + * The detection paths exercised per case: + * - oom-node-v8-heap : log-line match `JavaScript heap out of memory` + * - oom-node-sigkill : PlatformRuntimeDone `error_type=Runtime.OutOfMemory` + * - oom-python : log line `MemoryError` + PlatformRuntimeDone (dedup) + * - oom-ruby : log line `NoMemoryError` + PlatformRuntimeDone (dedup) + * - oom-java : log line `java.lang.OutOfMemoryError` + * - oom-dotnet : log line `OutOfMemoryException` + * - oom-go : log line `fatal error: runtime: out of memory` + * + PlatformReport memory equality (dedup) + * + * Each function is configured with low memory (192 MB) and a short timeout + * (30 s) so the OOM fires quickly during the integration-test run. + */ +export class Oom extends cdk.Stack { + constructor(scope: Construct, id: string, props: cdk.StackProps) { + super(scope, id, props); + + const extensionLayer = getExtensionLayer(this); + const nodeLayer = getDefaultNodeLayer(this); + const pythonLayer = getDefaultPythonLayer(this); + const javaLayer = getDefaultJavaLayer(this); + const dotnetLayer = getDefaultDotnetLayer(this); + const rubyLayer = getDefaultRubyLayer(this); + + const oomMemorySize = 192; + const oomTimeout = cdk.Duration.seconds(30); + + // Node case A — V8 heap exhaustion (log-line path). 
+ const nodeV8FunctionName = `${id}-node-v8-heap-lambda`; + const nodeV8Function = new lambda.Function(this, nodeV8FunctionName, { + runtime: defaultNodeRuntime, + architecture: lambda.Architecture.ARM_64, + handler: '/opt/nodejs/node_modules/datadog-lambda-js/handler.handler', + code: lambda.Code.fromAsset('./lambda/oom-node-v8-heap'), + functionName: nodeV8FunctionName, + timeout: oomTimeout, + memorySize: oomMemorySize, + environment: { + ...defaultDatadogEnvVariables, + DD_SERVICE: nodeV8FunctionName, + DD_TRACE_ENABLED: 'true', + DD_LAMBDA_HANDLER: 'index.handler', + // Cap V8 heap below the Lambda memory cap so V8 throws its OOM error + // before the kernel SIGKILLs the process. + NODE_OPTIONS: '--max-old-space-size=128', + }, + logGroup: createLogGroup(this, nodeV8FunctionName), + }); + nodeV8Function.addToRolePolicy(defaultDatadogSecretPolicy); + nodeV8Function.addLayers(extensionLayer); + nodeV8Function.addLayers(nodeLayer); + + // Node case B — off-heap Buffer / kernel SIGKILL (PlatformRuntimeDone path). + const nodeSigkillFunctionName = `${id}-node-sigkill-lambda`; + const nodeSigkillFunction = new lambda.Function(this, nodeSigkillFunctionName, { + runtime: defaultNodeRuntime, + architecture: lambda.Architecture.ARM_64, + handler: '/opt/nodejs/node_modules/datadog-lambda-js/handler.handler', + code: lambda.Code.fromAsset('./lambda/oom-node-sigkill'), + functionName: nodeSigkillFunctionName, + timeout: oomTimeout, + memorySize: oomMemorySize, + environment: { + ...defaultDatadogEnvVariables, + DD_SERVICE: nodeSigkillFunctionName, + DD_TRACE_ENABLED: 'true', + DD_LAMBDA_HANDLER: 'index.handler', + }, + logGroup: createLogGroup(this, nodeSigkillFunctionName), + }); + nodeSigkillFunction.addToRolePolicy(defaultDatadogSecretPolicy); + nodeSigkillFunction.addLayers(extensionLayer); + nodeSigkillFunction.addLayers(nodeLayer); + + // Python — MemoryError; log path and PlatformRuntimeDone path both fire. 
+ const pythonFunctionName = `${id}-python-lambda`; + const pythonFunction = new lambda.Function(this, pythonFunctionName, { + runtime: defaultPythonRuntime, + architecture: lambda.Architecture.ARM_64, + handler: 'datadog_lambda.handler.handler', + code: lambda.Code.fromAsset('./lambda/oom-python'), + functionName: pythonFunctionName, + timeout: oomTimeout, + memorySize: oomMemorySize, + environment: { + ...defaultDatadogEnvVariables, + DD_SERVICE: pythonFunctionName, + DD_TRACE_ENABLED: 'true', + DD_LAMBDA_HANDLER: 'lambda_function.handler', + }, + logGroup: createLogGroup(this, pythonFunctionName), + }); + pythonFunction.addToRolePolicy(defaultDatadogSecretPolicy); + pythonFunction.addLayers(extensionLayer); + pythonFunction.addLayers(pythonLayer); + + // Ruby — NoMemoryError; log path and PlatformRuntimeDone path both fire. + const rubyFunctionName = `${id}-ruby-lambda`; + const rubyFunction = new lambda.Function(this, rubyFunctionName, { + runtime: defaultRubyRuntime, + architecture: lambda.Architecture.ARM_64, + handler: 'datadog_lambda_rb.handler', + code: lambda.Code.fromAsset('./lambda/oom-ruby'), + functionName: rubyFunctionName, + timeout: oomTimeout, + memorySize: oomMemorySize, + environment: { + ...defaultDatadogEnvVariables, + DD_SERVICE: rubyFunctionName, + DD_TRACE_ENABLED: 'true', + DD_LAMBDA_HANDLER: 'lambda_function.handler', + }, + logGroup: createLogGroup(this, rubyFunctionName), + }); + rubyFunction.addToRolePolicy(defaultDatadogSecretPolicy); + rubyFunction.addLayers(extensionLayer); + rubyFunction.addLayers(rubyLayer); + + // Java — OutOfMemoryError (log-line path). 
+ const javaFunctionName = `${id}-java-lambda`; + const javaFunction = new lambda.Function(this, javaFunctionName, { + runtime: defaultJavaRuntime, + architecture: lambda.Architecture.ARM_64, + handler: 'example.Handler::handleRequest', + code: lambda.Code.fromAsset('./lambda/oom-java/target/function.jar'), + functionName: javaFunctionName, + timeout: oomTimeout, + memorySize: oomMemorySize, + environment: { + ...defaultDatadogEnvVariables, + DD_SERVICE: javaFunctionName, + AWS_LAMBDA_EXEC_WRAPPER: '/opt/datadog_wrapper', + DD_TRACE_ENABLED: 'true', + }, + logGroup: createLogGroup(this, javaFunctionName), + }); + javaFunction.addToRolePolicy(defaultDatadogSecretPolicy); + javaFunction.addLayers(extensionLayer); + javaFunction.addLayers(javaLayer); + + // .NET — OutOfMemoryException (log-line path). + const dotnetFunctionName = `${id}-dotnet-lambda`; + const dotnetFunction = new lambda.Function(this, dotnetFunctionName, { + runtime: defaultDotnetRuntime, + architecture: lambda.Architecture.ARM_64, + handler: 'Function::Function.Handler::FunctionHandler', + code: lambda.Code.fromAsset('./lambda/oom-dotnet/bin/function.zip'), + functionName: dotnetFunctionName, + timeout: oomTimeout, + memorySize: oomMemorySize, + environment: { + ...defaultDatadogEnvVariables, + DD_SERVICE: dotnetFunctionName, + AWS_LAMBDA_EXEC_WRAPPER: '/opt/datadog_wrapper', + }, + logGroup: createLogGroup(this, dotnetFunctionName), + }); + dotnetFunction.addToRolePolicy(defaultDatadogSecretPolicy); + dotnetFunction.addLayers(extensionLayer); + dotnetFunction.addLayers(dotnetLayer); + + // Go — runtime fatal error + PlatformReport memory equality (dedup). + // Go runs on the custom runtime, so the binary itself is the handler. 
+ const goFunctionName = `${id}-go-lambda`; + const goFunction = new lambda.Function(this, goFunctionName, { + runtime: defaultGoRuntime, + architecture: lambda.Architecture.ARM_64, + handler: 'bootstrap', + code: lambda.Code.fromAsset('./lambda/oom-go/bin'), + functionName: goFunctionName, + timeout: oomTimeout, + memorySize: oomMemorySize, + environment: { + ...defaultDatadogEnvVariables, + DD_SERVICE: goFunctionName, + AWS_LAMBDA_EXEC_WRAPPER: '/opt/datadog_wrapper', + }, + logGroup: createLogGroup(this, goFunctionName), + }); + goFunction.addToRolePolicy(defaultDatadogSecretPolicy); + goFunction.addLayers(extensionLayer); + // Go has no tracer layer — the Datadog tracer for Go is a Go module imported + // into the function source. The extension layer alone is enough for the + // enhanced metrics this test asserts on. + } +} diff --git a/integration-tests/lib/util.ts b/integration-tests/lib/util.ts index dd8309789..24fe04164 100644 --- a/integration-tests/lib/util.ts +++ b/integration-tests/lib/util.ts @@ -13,11 +13,15 @@ export const defaultNodeRuntime = lambda.Runtime.NODEJS_24_X; export const defaultPythonRuntime = lambda.Runtime.PYTHON_3_13; export const defaultJavaRuntime = lambda.Runtime.JAVA_21; export const defaultDotnetRuntime = lambda.Runtime.DOTNET_8; +export const defaultRubyRuntime = lambda.Runtime.RUBY_3_4; +// Go runs on the custom runtime; the Datadog tracer is a Go module, not a layer. 
+export const defaultGoRuntime = lambda.Runtime.PROVIDED_AL2023; export const defaultNodeLayerArn = process.env.NODE_TRACER_LAYER_ARN || 'arn:aws:lambda:us-east-1:464622532012:layer:Datadog-Node24-x:132'; export const defaultPythonLayerArn = process.env.PYTHON_TRACER_LAYER_ARN || 'arn:aws:lambda:us-east-1:464622532012:layer:Datadog-Python313-ARM:117'; export const defaultJavaLayerArn = process.env.JAVA_TRACER_LAYER_ARN || 'arn:aws:lambda:us-east-1:464622532012:layer:dd-trace-java:25'; export const defaultDotnetLayerArn = process.env.DOTNET_TRACER_LAYER_ARN || 'arn:aws:lambda:us-east-1:464622532012:layer:dd-trace-dotnet-ARM:23'; +export const defaultRubyLayerArn = process.env.RUBY_TRACER_LAYER_ARN || 'arn:aws:lambda:us-east-1:464622532012:layer:Datadog-Ruby3-4-ARM:9'; export const defaultDatadogEnvVariables = { DD_API_KEY_SECRET_ARN: datadogSecretArn, @@ -87,6 +91,14 @@ export const getDefaultDotnetLayer = (scope: Construct) => { ); }; +export const getDefaultRubyLayer = (scope: Construct) => { + return LayerVersion.fromLayerVersionArn( + scope, + 'DatadogRubyLayer', + defaultRubyLayerArn + ); +}; + export const capacityProviderArn = `arn:aws:lambda:${REGION}:${ACCOUNT}:capacity-provider:integ-default-capacity-provider-cp`; export function setCapacityProvider(lambdaFunction: lambda.Function) { diff --git a/integration-tests/scripts/build-go.sh b/integration-tests/scripts/build-go.sh new file mode 100755 index 000000000..8f24bc45c --- /dev/null +++ b/integration-tests/scripts/build-go.sh @@ -0,0 +1,123 @@ +#!/bin/bash +set -e + +# Reusable script to cross-compile Go Lambda functions for ARM64 Linux. +# Outputs a binary named `bootstrap` (required by the AWS Lambda custom runtime +# provided.al2023) under /bin/. 
+# +# Usage: +# ./build-go.sh # Build all Go Lambda functions +# ./build-go.sh <lambda-dir> # Build a specific Lambda function + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +LAMBDA_BASE_DIR="$SCRIPT_DIR/../lambda" + +build_go_lambda() { + local LAMBDA_DIR="$1" + local FUNCTION_NAME=$(basename "$LAMBDA_DIR") + + if [ ! -d "$LAMBDA_DIR" ]; then + echo "Error: Directory not found: $LAMBDA_DIR" + return 1 + fi + + if [ ! -f "$LAMBDA_DIR/go.mod" ]; then + echo "Error: go.mod not found in $LAMBDA_DIR" + return 1 + fi + + echo "Building Go Lambda: $FUNCTION_NAME" + + if ! command -v docker &> /dev/null; then + echo "Error: Docker is not installed or not in PATH" + return 1 + fi + + # Clean previous build (idempotent). + rm -rf "$LAMBDA_DIR/bin" + mkdir -p "$LAMBDA_DIR/bin" + + # Module cache: reuse the host's $GOPATH/pkg/mod when running locally; + # use a project-local cache in CI so it can be cached between jobs. + if [ -n "$CI" ]; then + GO_MOD_CACHE="$SCRIPT_DIR/../.cache/go-mod" + mkdir -p "$GO_MOD_CACHE" + else + GO_MOD_CACHE="${GOPATH:-$HOME/go}/pkg/mod" + mkdir -p "$GO_MOD_CACHE" + fi + + # Cross-compile to ARM64 Linux inside the official Go image. + # CGO is disabled so the binary runs on the provided.al2023 base image + # without a libc mismatch. + docker run --rm --platform linux/arm64 \ + -v "$LAMBDA_DIR":/workspace \ + -v "$GO_MOD_CACHE":/go/pkg/mod \ + -w /workspace \ + -e GOOS=linux \ + -e GOARCH=arm64 \ + -e CGO_ENABLED=0 \ + public.ecr.aws/docker/library/golang:1.22-bookworm \ + sh -c "go mod tidy && go build -o bin/bootstrap ." + + if [ ! 
-f "$LAMBDA_DIR/bin/bootstrap" ]; then + echo "✗ Build failed: bin/bootstrap not produced" + return 1 + fi + + echo "✓ Build complete: $LAMBDA_DIR/bin/bootstrap" + return 0 +} + +if [ -z "$1" ]; then + echo "==========================================" + echo "Building all Go Lambda functions" + echo "==========================================" + echo "" + + FOUND_GO=0 + FAILED_BUILDS=() + + for LAMBDA_PATH in "$LAMBDA_BASE_DIR"/*; do + if [ ! -d "$LAMBDA_PATH" ]; then + continue + fi + + FUNCTION_NAME=$(basename "$LAMBDA_PATH") + + # Match directories whose suffix is `-go` or whose name is exactly `go`. + if [[ "$FUNCTION_NAME" == *"-go" || "$FUNCTION_NAME" == "go" ]]; then + FOUND_GO=1 + echo "----------------------------------------" + if build_go_lambda "$LAMBDA_PATH"; then + echo "✓ $FUNCTION_NAME built successfully" + else + echo "✗ $FUNCTION_NAME failed" + FAILED_BUILDS+=("$FUNCTION_NAME") + fi + echo "" + fi + done + + if [ $FOUND_GO -eq 0 ]; then + echo "No Go Lambda functions found (looking for directories ending in -go)" + exit 0 + fi + + if [ ${#FAILED_BUILDS[@]} -eq 0 ]; then + echo "✓ All Go Lambda builds completed successfully!" + exit 0 + fi + + echo "✗ ${#FAILED_BUILDS[@]} Go Lambda build(s) failed:" + for failed in "${FAILED_BUILDS[@]}"; do + echo " - $failed" + done + exit 1 +else + LAMBDA_DIR="$1" + if [[ "$LAMBDA_DIR" != /* ]]; then + LAMBDA_DIR="$(cd "$SCRIPT_DIR/.." && pwd)/$LAMBDA_DIR" + fi + build_go_lambda "$LAMBDA_DIR" +fi diff --git a/integration-tests/scripts/build-ruby.sh b/integration-tests/scripts/build-ruby.sh new file mode 100755 index 000000000..0ca36064d --- /dev/null +++ b/integration-tests/scripts/build-ruby.sh @@ -0,0 +1,88 @@ +#!/bin/bash +set -e + +# Reusable script to build Ruby Lambda functions. +# For simple Ruby Lambdas with no gem dependencies, this just packages the +# source as-is — the runtime + Datadog tracer layer provide everything needed. 
+# If the function gains a Gemfile, this script grows a bundle install step +# in a Docker container (mirroring build-python.sh / build-node.sh). +# +# Usage: +# ./build-ruby.sh # Build all Ruby Lambda functions +# ./build-ruby.sh <lambda-dir> # Build a specific Lambda function + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +LAMBDA_BASE_DIR="$SCRIPT_DIR/../lambda" + +build_ruby_lambda() { + local LAMBDA_DIR="$1" + local FUNCTION_NAME=$(basename "$LAMBDA_DIR") + + if [ ! -d "$LAMBDA_DIR" ]; then + echo "Error: Directory not found: $LAMBDA_DIR" + return 1 + fi + + echo "Building Ruby Lambda: $FUNCTION_NAME" + + if [ ! -f "$LAMBDA_DIR/Gemfile" ]; then + echo "ℹ No Gemfile found — source files are deployed as-is" + return 0 + fi + + echo "Error: Gemfile-based Ruby builds are not implemented yet" >&2 + echo " Add a Dockerised \`bundle install\` step to this script when needed." >&2 + return 1 +} + +if [ -z "$1" ]; then + echo "==========================================" + echo "Building all Ruby Lambda functions" + echo "==========================================" + echo "" + + FOUND_RUBY=0 + FAILED_BUILDS=() + + for LAMBDA_PATH in "$LAMBDA_BASE_DIR"/*; do + if [ ! -d "$LAMBDA_PATH" ]; then + continue + fi + + FUNCTION_NAME=$(basename "$LAMBDA_PATH") + + if [[ "$FUNCTION_NAME" == *"ruby"* ]]; then + FOUND_RUBY=1 + echo "----------------------------------------" + if build_ruby_lambda "$LAMBDA_PATH"; then + echo "✓ $FUNCTION_NAME built successfully" + else + echo "✗ $FUNCTION_NAME failed" + FAILED_BUILDS+=("$FUNCTION_NAME") + fi + echo "" + fi + done + + if [ $FOUND_RUBY -eq 0 ]; then + echo "No Ruby Lambda functions found (looking for directories with 'ruby' in name)" + exit 0 + fi + + if [ ${#FAILED_BUILDS[@]} -eq 0 ]; then + echo "✓ All Ruby Lambda builds completed successfully!" 
+ exit 0 + fi + + echo "✗ ${#FAILED_BUILDS[@]} Ruby Lambda build(s) failed:" + for failed in "${FAILED_BUILDS[@]}"; do + echo " - $failed" + done + exit 1 +else + LAMBDA_DIR="$1" + if [[ "$LAMBDA_DIR" != /* ]]; then + LAMBDA_DIR="$(cd "$SCRIPT_DIR/.." && pwd)/$LAMBDA_DIR" + fi + build_ruby_lambda "$LAMBDA_DIR" +fi diff --git a/integration-tests/scripts/local_deploy.sh b/integration-tests/scripts/local_deploy.sh index b432261da..451b81cf6 100755 --- a/integration-tests/scripts/local_deploy.sh +++ b/integration-tests/scripts/local_deploy.sh @@ -43,6 +43,8 @@ echo "Building all Lambda functions in parallel..." "$SCRIPT_DIR/build-dotnet.sh" & "$SCRIPT_DIR/build-python.sh" & "$SCRIPT_DIR/build-node.sh" & +"$SCRIPT_DIR/build-ruby.sh" & +"$SCRIPT_DIR/build-go.sh" & wait echo "All Lambda builds complete" diff --git a/integration-tests/tests/oom.test.ts b/integration-tests/tests/oom.test.ts new file mode 100644 index 000000000..5e33e4c8e --- /dev/null +++ b/integration-tests/tests/oom.test.ts @@ -0,0 +1,90 @@ +import { invokeLambda } from './utils/lambda'; +import { getMetricCount, OUT_OF_MEMORY_METRIC } from './utils/datadog'; +import { DEFAULT_DATADOG_INDEXING_WAIT_MS } from '../config'; +import { getIdentifier } from '../config'; + +/** + * Cross-runtime OOM test. + * + * Each function is intentionally configured to OOM on its first invocation. + * Bottlecap has three detection paths that can fire for the same invocation + * (runtime-specific log line, `Runtime.OutOfMemory` `error_type` in + * `PlatformRuntimeDone`, `max_memory_used_mb == memory_size_mb` in + * `PlatformReport`); the `Context::oom_emitted` flag introduced for #1237 + * dedupes them so the metric increments exactly once per invocation. + * + * The Python/Ruby/Go cases are particularly meaningful regressions because + * they trigger more than one detection path naturally — if dedup is broken, + * those counts go to 2. 
+ */ +const identifier = getIdentifier(); +const stackName = `integ-${identifier}-oom`; + +interface OomCase { + runtime: string; + functionName: string; +} + +const cases: OomCase[] = [ + { runtime: 'node-v8-heap', functionName: `${stackName}-node-v8-heap-lambda` }, + { runtime: 'node-sigkill', functionName: `${stackName}-node-sigkill-lambda` }, + { runtime: 'python', functionName: `${stackName}-python-lambda` }, + { runtime: 'ruby', functionName: `${stackName}-ruby-lambda` }, + { runtime: 'java', functionName: `${stackName}-java-lambda` }, + { runtime: 'dotnet', functionName: `${stackName}-dotnet-lambda` }, + { runtime: 'go', functionName: `${stackName}-go-lambda` }, +]; + +async function sleep(ms: number): Promise<void> { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +describe('OOM Integration Tests', () => { + let countsByRuntime: Record<string, number>; + let windowStart: number; + let windowEnd: number; + + // Invoke every function once, wait for Datadog to ingest, then query once + // for each. Keeping invocations and the query inside `beforeAll` lets each + // per-runtime test below assert against the same data set. + beforeAll(async () => { + windowStart = Date.now(); + + await Promise.all( + cases.map((c) => + invokeLambda(c.functionName).catch((err) => { + // OOM functions usually succeed at the Invoke API layer (the function + // is run, just crashes), so a thrown error here is unexpected + // infrastructure failure rather than the OOM itself. Re-throw so the + // test surfaces it. 
+ throw new Error(`Invoke failed for ${c.functionName}: ${err}`); + }), + ), + ); + + await sleep(DEFAULT_DATADOG_INDEXING_WAIT_MS); + windowEnd = Date.now(); + + const results = await Promise.all( + cases.map(async (c) => ({ + runtime: c.runtime, + count: await getMetricCount( + OUT_OF_MEMORY_METRIC, + c.functionName, + windowStart, + windowEnd, + ), + })), + ); + + countsByRuntime = Object.fromEntries(results.map((r) => [r.runtime, r.count])); + console.log('OOM counts by runtime:', countsByRuntime); + }, 10 * 60 * 1000); + + describe.each(cases)('$runtime runtime', ({ runtime }) => { + it('should emit exactly one out_of_memory metric for one OOM invocation', () => { + const count = countsByRuntime[runtime]; + expect(count).toBe(1); + }); + }); +}); diff --git a/integration-tests/tests/utils/datadog.ts b/integration-tests/tests/utils/datadog.ts index ed3768ea0..e25225235 100644 --- a/integration-tests/tests/utils/datadog.ts +++ b/integration-tests/tests/utils/datadog.ts @@ -91,6 +91,8 @@ export const DURATION_METRICS = [ 'aws.lambda.enhanced.init_duration', ]; +export const OUT_OF_MEMORY_METRIC = 'aws.lambda.enhanced.out_of_memory'; + export type EnhancedMetrics = Record; export interface MetricPoint { @@ -289,6 +291,41 @@ export async function getEnhancedMetrics( return metrics; } +/** + * Returns the total emission count of a counter / distribution enhanced metric + * for a single function over the given window, by summing all data-point + * values returned by Datadog. Used by oom.test.ts to assert that + * `aws.lambda.enhanced.out_of_memory` increments exactly once per invocation — + * verifying the per-Context `oom_emitted` dedup flag introduced for #1237. 
+ */ +export async function getMetricCount( + metricName: string, + functionName: string, + fromTime: number, + toTime: number, +): Promise<number> { + const baseFunctionName = getServiceName(functionName).toLowerCase(); + const query = `sum:${metricName}{functionname:${baseFunctionName}}.as_count()`; + + console.log(`Querying metric count: ${query}`); + + const response = await datadogClient.get('/api/v1/query', { + params: { + query, + from: Math.floor(fromTime / 1000), + to: Math.floor(toTime / 1000), + }, + }); + + const series = response.data.series || []; + if (series.length === 0) { + return 0; + } + + const pointlist: [number, number][] = series[0].pointlist || []; + return pointlist.reduce((acc, [, value]) => acc + (value || 0), 0); +} + async function getMetrics( metricName: string, functionName: string, From ab8a79e16a5fc3ce0de93a72a54c9460247d1508 Mon Sep 17 00:00:00 2001 From: Yiming Luo <10097700+lym953@users.noreply.github.com> Date: Mon, 1 Jun 2026 12:32:30 -0400 Subject: [PATCH 03/21] test(integration): fix Ruby + Go OOM lambdas in oom suite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI run on the first oom suite (commit 5a833ac9) returned counts: node-v8-heap=1 ✓ node-sigkill=1 ✓ python=1 ✓ dotnet=1 ✓ ruby=0 ✗ java=0 ✗ go=0 ✗ Reproducing locally: - Ruby: function failed at init with `cannot load such file -- datadog_lambda_rb`. The Datadog Ruby tracer is a regular gem (no handler shim like Python's `datadog_lambda.handler.handler`), so set handler to `lambda_function.handler` and drop `DD_LAMBDA_HANDLER`. - Go: function timed out (30s) at `Max Memory Used: 192 MB / Memory Size: 192 MB` without emitting any enhanced metrics. Two changes: * Drop `AWS_LAMBDA_EXEC_WRAPPER=/opt/datadog_wrapper` — the wrapper sets language-specific tracer env vars; Go's tracer is in-module not layer-based, so the wrapper just changes runtime detection without helping. 
With the wrapper removed and a clean exec, the extension's enhanced-metric pipeline starts emitting. * Replace the `for { append(make([]byte, 10MB)) }` loop with a single `make([]byte, 500MB)` that writes every page. Go's slice doubling + GC kept the loop from OOMing reliably in the 30s timeout window; eager allocation guarantees `fatal error: runtime: out of memory` fires immediately, exercising bottlecap's log-line detection. - Java: also failed in CI (count=0) but local repro now returns count=1 with the same code path. Leaving the Java app unchanged for the next CI run to confirm. If it fails again, likely the extension didn't flush the metric before the JVM crashed; would need DD_SERVERLESS_FLUSH_STRATEGY changes or per-function twice-invoke. Co-Authored-By: Claude Opus 4.7 (1M context) --- integration-tests/lambda/oom-go/go.sum | 10 ++++++++++ integration-tests/lambda/oom-go/main.go | 25 ++++++++++++++++--------- integration-tests/lib/stacks/oom.ts | 13 ++++++++----- 3 files changed, 34 insertions(+), 14 deletions(-) create mode 100644 integration-tests/lambda/oom-go/go.sum diff --git a/integration-tests/lambda/oom-go/go.sum b/integration-tests/lambda/oom-go/go.sum new file mode 100644 index 000000000..a5b506ab1 --- /dev/null +++ b/integration-tests/lambda/oom-go/go.sum @@ -0,0 +1,10 @@ +github.com/aws/aws-lambda-go v1.49.0 h1:z4VhTqkFZPM3xpEtTqWqRqsRH4TZBMJqTkRiBPYLqIQ= +github.com/aws/aws-lambda-go v1.49.0/go.mod h1:dpMpZgvWx5vuQJfBt0zqBha60q7Dd7RfgJv23DymV8A= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.7.2 h1:4jaiDzPyXQvSd7D0EjG45355tLlV3VOECpq10pLC+8s= +github.com/stretchr/testify v1.7.2/go.mod 
h1:R6va5+xMeoiuVRoj+gSkQ7d3FALtqAAGI1FQKckRals= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/integration-tests/lambda/oom-go/main.go b/integration-tests/lambda/oom-go/main.go index 99821b0ad..a76960ccf 100644 --- a/integration-tests/lambda/oom-go/main.go +++ b/integration-tests/lambda/oom-go/main.go @@ -1,21 +1,28 @@ // OOM reproducer for Go. -// Allocates and retains 10 MB byte slices in a slice header until the Go -// runtime aborts with "fatal error: runtime: out of memory". Bottlecap's -// runtime-specific log-line detection matches that fatal-error message. -// Without that detection (and historically for provided.al runtimes), the -// equality path in PlatformReport (max_memory_used_mb == memory_size_mb) also -// fires. The per-Context dedup flag ensures the metric increments only once. +// Allocates a 500 MB byte slice in a single shot, then writes to every page +// to force physical commit. On a 192 MB Lambda this immediately exceeds the +// cgroup memory limit and the kernel SIGKILLs the process, producing a +// PlatformReport with max_memory_used_mb == memory_size_mb. The Go runtime +// also typically prints "fatal error: runtime: out of memory" on the way +// down — bottlecap's runtime-specific log-line detection matches that +// message. Per-Context dedup ensures the OOM metric increments only once +// even if both paths fire. 
package main import ( + "log" + "github.com/aws/aws-lambda-go/lambda" ) func handler() error { - var data [][]byte - for { - data = append(data, make([]byte, 10*1024*1024)) + log.Println("OOM reproducer: allocating 500 MB") + b := make([]byte, 500*1024*1024) + for i := range b { + b[i] = byte(i % 256) } + log.Println("did not OOM — unexpected") // unreachable on a 192 MB Lambda + return nil } func main() { diff --git a/integration-tests/lib/stacks/oom.ts b/integration-tests/lib/stacks/oom.ts index dc7c6314e..fcb46d5f1 100644 --- a/integration-tests/lib/stacks/oom.ts +++ b/integration-tests/lib/stacks/oom.ts @@ -125,11 +125,14 @@ export class Oom extends cdk.Stack { pythonFunction.addLayers(pythonLayer); // Ruby — NoMemoryError; log path and PlatformRuntimeDone path both fire. + // Datadog's Ruby tracer is a regular gem (no handler shim like Python's + // `datadog_lambda.handler.handler`), so the Lambda handler is the user's + // own `lambda_function.handler` and `DD_LAMBDA_HANDLER` is not used. const rubyFunctionName = `${id}-ruby-lambda`; const rubyFunction = new lambda.Function(this, rubyFunctionName, { runtime: defaultRubyRuntime, architecture: lambda.Architecture.ARM_64, - handler: 'datadog_lambda_rb.handler', + handler: 'lambda_function.handler', code: lambda.Code.fromAsset('./lambda/oom-ruby'), functionName: rubyFunctionName, timeout: oomTimeout, @@ -138,7 +141,6 @@ export class Oom extends cdk.Stack { ...defaultDatadogEnvVariables, DD_SERVICE: rubyFunctionName, DD_TRACE_ENABLED: 'true', - DD_LAMBDA_HANDLER: 'lambda_function.handler', }, logGroup: createLogGroup(this, rubyFunctionName), }); @@ -189,8 +191,10 @@ export class Oom extends cdk.Stack { dotnetFunction.addLayers(extensionLayer); dotnetFunction.addLayers(dotnetLayer); - // Go — runtime fatal error + PlatformReport memory equality (dedup). - // Go runs on the custom runtime, so the binary itself is the handler. + // Go — runtime fatal error (log-line path). + // The Go binary itself is the handler. 
We don't set + // AWS_LAMBDA_EXEC_WRAPPER: that wrapper sets language-specific env vars + // for tracer auto-instrumentation, which Go doesn't use. const goFunctionName = `${id}-go-lambda`; const goFunction = new lambda.Function(this, goFunctionName, { runtime: defaultGoRuntime, @@ -203,7 +207,6 @@ export class Oom extends cdk.Stack { environment: { ...defaultDatadogEnvVariables, DD_SERVICE: goFunctionName, - AWS_LAMBDA_EXEC_WRAPPER: '/opt/datadog_wrapper', }, logGroup: createLogGroup(this, goFunctionName), }); From 9e2866669b832727ee1259499693f1d3b29a5062 Mon Sep 17 00:00:00 2001 From: Yiming Luo <10097700+lym953@users.noreply.github.com> Date: Mon, 1 Jun 2026 13:11:52 -0400 Subject: [PATCH 04/21] test(integration): override DD_SERVERLESS_FLUSH_STRATEGY=default for oom suite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The integration-test framework defaults DD_SERVERLESS_FLUSH_STRATEGY to `end`, which means the extension only flushes at end of invocation. For OOM tests that's a tight race: the function dies, then Lambda sends PlatformRuntimeDone, then bottlecap increments the OOM metric, then Shutdown comes and the sandbox is reaped. If the metric flush can't finish in that narrow window, the data point is lost. Run 1 of the oom suite returned ruby/java/go=0 (3 of 7 failed). Run 2 returned ruby/node-sigkill/python/dotnet/go=0 (5 of 7 failed) — but java=1 this time. The set of "failing" runtimes is not stable across runs, confirming a timing race rather than a code bug. `default` flushes every ~1s in addition to invocation-end, giving the OOM metric a much wider window to reach Datadog before the sandbox is torn down. All other integration suites keep using `end` since their invocations complete cleanly. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- integration-tests/lib/stacks/oom.ts | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/integration-tests/lib/stacks/oom.ts b/integration-tests/lib/stacks/oom.ts index fcb46d5f1..ac8c86e45 100644 --- a/integration-tests/lib/stacks/oom.ts +++ b/integration-tests/lib/stacks/oom.ts @@ -55,6 +55,19 @@ export class Oom extends cdk.Stack { const oomMemorySize = 192; const oomTimeout = cdk.Duration.seconds(30); + // The integration-test framework defaults DD_SERVERLESS_FLUSH_STRATEGY to + // `end` (flush only at end of invocation). For OOM tests that's a tight + // race: the function process dies, then Lambda sends PlatformRuntimeDone, + // then the extension increments the OOM metric, then Shutdown comes and + // the sandbox is reaped. If the metric flush can't finish in the narrow + // window between the OOM and the sandbox teardown, the data point is + // lost and the test sees count=0. + // + // `default` (`flushStrategyDefault`) flushes every ~1s in addition to + // invocation-end, giving the OOM metric a much wider window to reach + // Datadog. Override per-function below. + const flushStrategyDefault = 'default'; + // Node case A — V8 heap exhaustion (log-line path). 
const nodeV8FunctionName = `${id}-node-v8-heap-lambda`; const nodeV8Function = new lambda.Function(this, nodeV8FunctionName, { @@ -68,6 +81,7 @@ export class Oom extends cdk.Stack { environment: { ...defaultDatadogEnvVariables, DD_SERVICE: nodeV8FunctionName, + DD_SERVERLESS_FLUSH_STRATEGY: flushStrategyDefault, DD_TRACE_ENABLED: 'true', DD_LAMBDA_HANDLER: 'index.handler', // Cap V8 heap below the Lambda memory cap so V8 throws its OOM error @@ -93,6 +107,7 @@ export class Oom extends cdk.Stack { environment: { ...defaultDatadogEnvVariables, DD_SERVICE: nodeSigkillFunctionName, + DD_SERVERLESS_FLUSH_STRATEGY: flushStrategyDefault, DD_TRACE_ENABLED: 'true', DD_LAMBDA_HANDLER: 'index.handler', }, @@ -115,6 +130,7 @@ export class Oom extends cdk.Stack { environment: { ...defaultDatadogEnvVariables, DD_SERVICE: pythonFunctionName, + DD_SERVERLESS_FLUSH_STRATEGY: flushStrategyDefault, DD_TRACE_ENABLED: 'true', DD_LAMBDA_HANDLER: 'lambda_function.handler', }, @@ -140,6 +156,7 @@ export class Oom extends cdk.Stack { environment: { ...defaultDatadogEnvVariables, DD_SERVICE: rubyFunctionName, + DD_SERVERLESS_FLUSH_STRATEGY: flushStrategyDefault, DD_TRACE_ENABLED: 'true', }, logGroup: createLogGroup(this, rubyFunctionName), @@ -161,6 +178,7 @@ export class Oom extends cdk.Stack { environment: { ...defaultDatadogEnvVariables, DD_SERVICE: javaFunctionName, + DD_SERVERLESS_FLUSH_STRATEGY: flushStrategyDefault, AWS_LAMBDA_EXEC_WRAPPER: '/opt/datadog_wrapper', DD_TRACE_ENABLED: 'true', }, @@ -183,6 +201,7 @@ export class Oom extends cdk.Stack { environment: { ...defaultDatadogEnvVariables, DD_SERVICE: dotnetFunctionName, + DD_SERVERLESS_FLUSH_STRATEGY: flushStrategyDefault, AWS_LAMBDA_EXEC_WRAPPER: '/opt/datadog_wrapper', }, logGroup: createLogGroup(this, dotnetFunctionName), @@ -207,6 +226,7 @@ export class Oom extends cdk.Stack { environment: { ...defaultDatadogEnvVariables, DD_SERVICE: goFunctionName, + DD_SERVERLESS_FLUSH_STRATEGY: flushStrategyDefault, }, logGroup: 
createLogGroup(this, goFunctionName), }); From e430618a0992d2f135f6c7782265646343cbd0c8 Mon Sep 17 00:00:00 2001 From: Yiming Luo <10097700+lym953@users.noreply.github.com> Date: Mon, 1 Jun 2026 14:25:24 -0400 Subject: [PATCH 05/21] test(integration): poll for OOM metric with retries up to 12-min budget MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI runs were intermittently returning count=0 for ruby/java/go/dotnet/ node-sigkill/python — varying combinations across runs. Diagnosing showed the data points were correctly emitted and durably ingested by Datadog within ~30s of the OOM, but the `/api/v1/query` endpoint sometimes returned no results for very-recently-ingested points. The single-shot 5-minute wait was too brittle. Polling strategy: wait 90s after invocation, then re-query every 30s until every runtime reports count>=1 or the 12-min budget is exhausted. Early-exits when all runtimes pass, so the common case is faster than the previous single-shot 5-min wait while the worst case is bounded. Each poll iteration logs the current counts and the still-missing runtimes, so debugging future flakes from CI logs requires no rerun. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- integration-tests/tests/oom.test.ts | 67 ++++++++++++++++++----------- 1 file changed, 42 insertions(+), 25 deletions(-) diff --git a/integration-tests/tests/oom.test.ts b/integration-tests/tests/oom.test.ts index 5e33e4c8e..736a694dd 100644 --- a/integration-tests/tests/oom.test.ts +++ b/integration-tests/tests/oom.test.ts @@ -1,6 +1,5 @@ import { invokeLambda } from './utils/lambda'; import { getMetricCount, OUT_OF_MEMORY_METRIC } from './utils/datadog'; -import { DEFAULT_DATADOG_INDEXING_WAIT_MS } from '../config'; import { getIdentifier } from '../config'; /** @@ -16,6 +15,15 @@ import { getIdentifier } from '../config'; * The Python/Ruby/Go cases are particularly meaningful regressions because * they trigger more than one detection path naturally — if dedup is broken, * those counts go to 2. + * + * Ingestion timing: empirical observation in CI is that the + * `aws.lambda.enhanced.out_of_memory` metric data point is durably ingested + * within ~30s of the OOM, but Datadog's `/api/v1/query` endpoint sometimes + * returns no results for very-recently-ingested points (the query engine's + * snapshot lags the ingest path). The single-shot 5-minute wait used by the + * other suites is therefore too brittle for this assertion. Instead we poll: + * after an initial wait we re-query every 30s until every runtime reports + * count>=1 or the overall budget is exhausted. 
*/ const identifier = getIdentifier(); const stackName = `integ-${identifier}-oom`; @@ -35,51 +43,60 @@ const cases: OomCase[] = [ { runtime: 'go', functionName: `${stackName}-go-lambda` }, ]; +const INITIAL_WAIT_MS = 90 * 1000; // wait before first query +const POLL_INTERVAL_MS = 30 * 1000; // re-query cadence +const TOTAL_BUDGET_MS = 12 * 60 * 1000; // overall ceiling + async function sleep(ms: number): Promise<void> { return new Promise((resolve) => setTimeout(resolve, ms)); } +async function fetchCounts(start: number, end: number): Promise<Record<string, number>> { + const results = await Promise.all( + cases.map(async (c) => ({ + runtime: c.runtime, + count: await getMetricCount(OUT_OF_MEMORY_METRIC, c.functionName, start, end), + })), + ); + return Object.fromEntries(results.map((r) => [r.runtime, r.count])); +} + describe('OOM Integration Tests', () => { let countsByRuntime: Record<string, number>; - let windowStart: number; - let windowEnd: number; - // Invoke every function once, wait for Datadog to ingest, then query once - // for each. Keeping invocations and the query inside `beforeAll` lets each - // per-runtime test below assert against the same data set. beforeAll(async () => { - windowStart = Date.now(); + const windowStart = Date.now(); await Promise.all( cases.map((c) => invokeLambda(c.functionName).catch((err) => { // OOM functions usually succeed at the Invoke API layer (the function // is run, just crashes), so a thrown error here is unexpected - // infrastructure failure rather than the OOM itself. Re-throw so the - // test surfaces it. + // infrastructure failure rather than the OOM itself.
throw new Error(`Invoke failed for ${c.functionName}: ${err}`); }), ), ); - await sleep(DEFAULT_DATADOG_INDEXING_WAIT_MS); - windowEnd = Date.now(); + await sleep(INITIAL_WAIT_MS); - const results = await Promise.all( - cases.map(async (c) => ({ - runtime: c.runtime, - count: await getMetricCount( - OUT_OF_MEMORY_METRIC, - c.functionName, - windowStart, - windowEnd, - ), - })), - ); + const deadline = windowStart + TOTAL_BUDGET_MS; + let counts: Record<string, number> = {}; + let attempt = 0; + while (Date.now() < deadline) { + attempt++; + counts = await fetchCounts(windowStart, Date.now()); + const missing = cases.filter((c) => (counts[c.runtime] ?? 0) < 1).map((c) => c.runtime); + console.log(`OOM poll #${attempt}:`, counts, missing.length ? `(still missing: ${missing.join(', ')})` : '(all runtimes >=1)'); + if (missing.length === 0) { + break; + } + await sleep(POLL_INTERVAL_MS); + } - countsByRuntime = Object.fromEntries(results.map((r) => [r.runtime, r.count])); - console.log('OOM counts by runtime:', countsByRuntime); - }, 10 * 60 * 1000); + countsByRuntime = counts; + console.log('OOM counts by runtime (final):', countsByRuntime); + }, TOTAL_BUDGET_MS + 60 * 1000); describe.each(cases)('$runtime runtime', ({ runtime }) => { it('should emit exactly one out_of_memory metric for one OOM invocation', () => { From 1ca72fc9c7c1b29871d22507833c98aef30a9602 Mon Sep 17 00:00:00 2001 From: Yiming Luo <10097700+lym953@users.noreply.github.com> Date: Mon, 1 Jun 2026 15:44:16 -0400 Subject: [PATCH 06/21] =?UTF-8?q?test(integration):=20use=20continuously,1?= =?UTF-8?q?000=20flush=20for=20OOM=20stack=20=E2=80=94=20`default`=20was?= =?UTF-8?q?=20a=20no-op?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `FlushStrategy::Default` falls back to `End` until the lookback buffer fills (~20 invocations).
The OOM test does a single cold-start invoke per function, so `default` behaved identically to `end` — explaining why the prior commit's change had no observable effect. `continuously,1000` schedules an unconditional 1s periodic flush regardless of invocation count, so the OOM metric reaches Datadog well before the sandbox is reaped after the function process dies. Co-Authored-By: Claude Opus 4.7 (1M context) --- integration-tests/lib/stacks/oom.ts | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/integration-tests/lib/stacks/oom.ts b/integration-tests/lib/stacks/oom.ts index ac8c86e45..e6bc59b62 100644 --- a/integration-tests/lib/stacks/oom.ts +++ b/integration-tests/lib/stacks/oom.ts @@ -63,10 +63,16 @@ export class Oom extends cdk.Stack { // window between the OOM and the sandbox teardown, the data point is // lost and the test sees count=0. // - // `default` (`flushStrategyDefault`) flushes every ~1s in addition to - // invocation-end, giving the OOM metric a much wider window to reach - // Datadog. Override per-function below. - const flushStrategyDefault = 'default'; + // `default` is a no-op in this scenario: it falls back to End strategy + // until the bottlecap invocation-times buffer is full (~20 invocations), + // so on our single-shot cold-start OOM it behaves identically to End. + // + // `continuously,1000` schedules an unconditional 1-second periodic flush + // regardless of invocation count, so the OOM metric reaches Datadog + // within ~1s of being emitted by bottlecap — well before the sandbox is + // reaped. This is a test-only knob; real customer Lambdas eventually + // flush via the next invocation or Shutdown path. + const flushStrategy = 'continuously,1000'; // Node case A — V8 heap exhaustion (log-line path). 
const nodeV8FunctionName = `${id}-node-v8-heap-lambda`; @@ -81,7 +87,7 @@ export class Oom extends cdk.Stack { environment: { ...defaultDatadogEnvVariables, DD_SERVICE: nodeV8FunctionName, - DD_SERVERLESS_FLUSH_STRATEGY: flushStrategyDefault, + DD_SERVERLESS_FLUSH_STRATEGY: flushStrategy, DD_TRACE_ENABLED: 'true', DD_LAMBDA_HANDLER: 'index.handler', // Cap V8 heap below the Lambda memory cap so V8 throws its OOM error @@ -107,7 +113,7 @@ export class Oom extends cdk.Stack { environment: { ...defaultDatadogEnvVariables, DD_SERVICE: nodeSigkillFunctionName, - DD_SERVERLESS_FLUSH_STRATEGY: flushStrategyDefault, + DD_SERVERLESS_FLUSH_STRATEGY: flushStrategy, DD_TRACE_ENABLED: 'true', DD_LAMBDA_HANDLER: 'index.handler', }, @@ -130,7 +136,7 @@ export class Oom extends cdk.Stack { environment: { ...defaultDatadogEnvVariables, DD_SERVICE: pythonFunctionName, - DD_SERVERLESS_FLUSH_STRATEGY: flushStrategyDefault, + DD_SERVERLESS_FLUSH_STRATEGY: flushStrategy, DD_TRACE_ENABLED: 'true', DD_LAMBDA_HANDLER: 'lambda_function.handler', }, @@ -156,7 +162,7 @@ export class Oom extends cdk.Stack { environment: { ...defaultDatadogEnvVariables, DD_SERVICE: rubyFunctionName, - DD_SERVERLESS_FLUSH_STRATEGY: flushStrategyDefault, + DD_SERVERLESS_FLUSH_STRATEGY: flushStrategy, DD_TRACE_ENABLED: 'true', }, logGroup: createLogGroup(this, rubyFunctionName), @@ -178,7 +184,7 @@ export class Oom extends cdk.Stack { environment: { ...defaultDatadogEnvVariables, DD_SERVICE: javaFunctionName, - DD_SERVERLESS_FLUSH_STRATEGY: flushStrategyDefault, + DD_SERVERLESS_FLUSH_STRATEGY: flushStrategy, AWS_LAMBDA_EXEC_WRAPPER: '/opt/datadog_wrapper', DD_TRACE_ENABLED: 'true', }, @@ -201,7 +207,7 @@ export class Oom extends cdk.Stack { environment: { ...defaultDatadogEnvVariables, DD_SERVICE: dotnetFunctionName, - DD_SERVERLESS_FLUSH_STRATEGY: flushStrategyDefault, + DD_SERVERLESS_FLUSH_STRATEGY: flushStrategy, AWS_LAMBDA_EXEC_WRAPPER: '/opt/datadog_wrapper', }, logGroup: createLogGroup(this, 
dotnetFunctionName), @@ -226,7 +232,7 @@ export class Oom extends cdk.Stack { environment: { ...defaultDatadogEnvVariables, DD_SERVICE: goFunctionName, - DD_SERVERLESS_FLUSH_STRATEGY: flushStrategyDefault, + DD_SERVERLESS_FLUSH_STRATEGY: flushStrategy, }, logGroup: createLogGroup(this, goFunctionName), }); From 31f59b8aa3c2f3073fb59846ed9fdf1d452d6b40 Mon Sep 17 00:00:00 2001 From: Yiming Luo <10097700+lym953@users.noreply.github.com> Date: Mon, 1 Jun 2026 16:08:32 -0400 Subject: [PATCH 07/21] test(integration): bump OOM test memory to 256 MB so extension survives OOM-kill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause of the prior `[oom]` failures (6/7 runtimes stuck at count=0): at 192 MB the kernel OOM-killer often picks the bottlecap extension instead of the function runtime — Lambda surfaces this as `errorType: Extension.Crash`. A dead extension can't emit the OOM metric, so the test sees nothing in Datadog. Reproduced locally on us-east-2 arm64 with an IntegTests-style Python function: at 192 MB → `Extension.Crash`, no metric. Bumping to 256 MB → `Runtime.OutOfMemory`, count=1 in Datadog within 30 s. 256 MB gives the extension ~30 MB headroom while keeping every detection path active: the function still hits memory_size in PlatformReport, still emits its runtime-specific OOM log line, and still gets `Runtime.OutOfMemory` in PlatformRuntimeDone. The customer's #1237 case (192 MB) is unaffected — this is a test-harness change. Also drops the `DD_SERVERLESS_FLUSH_STRATEGY=continuously,1000` override from the prior commit; with the extension surviving, the default `end` flush is sufficient. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- integration-tests/lib/stacks/oom.ts | 43 ++++++++++------------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/integration-tests/lib/stacks/oom.ts b/integration-tests/lib/stacks/oom.ts index e6bc59b62..c5a57dc2d 100644 --- a/integration-tests/lib/stacks/oom.ts +++ b/integration-tests/lib/stacks/oom.ts @@ -38,8 +38,9 @@ import { * - oom-go : log line `fatal error: runtime: out of memory` * + PlatformReport memory equality (dedup) * - * Each function is configured with low memory (192 MB) and a short timeout - * (30 s) so the OOM fires quickly during the integration-test run. + * Each function is configured with low memory (256 MB) and a short timeout + * (30 s) so the OOM fires quickly during the integration-test run. See the + * `oomMemorySize` comment for why 256 MB rather than the customer's 192 MB. */ export class Oom extends cdk.Stack { constructor(scope: Construct, id: string, props: cdk.StackProps) { @@ -52,28 +53,19 @@ export class Oom extends cdk.Stack { const dotnetLayer = getDefaultDotnetLayer(this); const rubyLayer = getDefaultRubyLayer(this); - const oomMemorySize = 192; + // 256 MB (not the customer's 192 MB from #1237) so the bottlecap + // extension has memory headroom to survive when the function process + // OOMs. At 192 MB the kernel OOM-killer often picks the extension + // instead of the function runtime (Lambda surfaces this as the + // `Extension.Crash` error type), and a dead extension can't emit the + // OOM metric. With 256 MB the function runtime's RSS dominates and + // kernel reliably kills it; the extension survives to detect/flush. + // The detection paths under test are unchanged — the functions still + // hit `max_memory_used == memory_size` in PlatformReport and still + // emit runtime-specific OOM error log lines. 
+ const oomMemorySize = 256; const oomTimeout = cdk.Duration.seconds(30); - // The integration-test framework defaults DD_SERVERLESS_FLUSH_STRATEGY to - // `end` (flush only at end of invocation). For OOM tests that's a tight - // race: the function process dies, then Lambda sends PlatformRuntimeDone, - // then the extension increments the OOM metric, then Shutdown comes and - // the sandbox is reaped. If the metric flush can't finish in the narrow - // window between the OOM and the sandbox teardown, the data point is - // lost and the test sees count=0. - // - // `default` is a no-op in this scenario: it falls back to End strategy - // until the bottlecap invocation-times buffer is full (~20 invocations), - // so on our single-shot cold-start OOM it behaves identically to End. - // - // `continuously,1000` schedules an unconditional 1-second periodic flush - // regardless of invocation count, so the OOM metric reaches Datadog - // within ~1s of being emitted by bottlecap — well before the sandbox is - // reaped. This is a test-only knob; real customer Lambdas eventually - // flush via the next invocation or Shutdown path. - const flushStrategy = 'continuously,1000'; - // Node case A — V8 heap exhaustion (log-line path). 
const nodeV8FunctionName = `${id}-node-v8-heap-lambda`; const nodeV8Function = new lambda.Function(this, nodeV8FunctionName, { @@ -87,7 +79,6 @@ export class Oom extends cdk.Stack { environment: { ...defaultDatadogEnvVariables, DD_SERVICE: nodeV8FunctionName, - DD_SERVERLESS_FLUSH_STRATEGY: flushStrategy, DD_TRACE_ENABLED: 'true', DD_LAMBDA_HANDLER: 'index.handler', // Cap V8 heap below the Lambda memory cap so V8 throws its OOM error @@ -113,7 +104,6 @@ export class Oom extends cdk.Stack { environment: { ...defaultDatadogEnvVariables, DD_SERVICE: nodeSigkillFunctionName, - DD_SERVERLESS_FLUSH_STRATEGY: flushStrategy, DD_TRACE_ENABLED: 'true', DD_LAMBDA_HANDLER: 'index.handler', }, @@ -136,7 +126,6 @@ export class Oom extends cdk.Stack { environment: { ...defaultDatadogEnvVariables, DD_SERVICE: pythonFunctionName, - DD_SERVERLESS_FLUSH_STRATEGY: flushStrategy, DD_TRACE_ENABLED: 'true', DD_LAMBDA_HANDLER: 'lambda_function.handler', }, @@ -162,7 +151,6 @@ export class Oom extends cdk.Stack { environment: { ...defaultDatadogEnvVariables, DD_SERVICE: rubyFunctionName, - DD_SERVERLESS_FLUSH_STRATEGY: flushStrategy, DD_TRACE_ENABLED: 'true', }, logGroup: createLogGroup(this, rubyFunctionName), @@ -184,7 +172,6 @@ export class Oom extends cdk.Stack { environment: { ...defaultDatadogEnvVariables, DD_SERVICE: javaFunctionName, - DD_SERVERLESS_FLUSH_STRATEGY: flushStrategy, AWS_LAMBDA_EXEC_WRAPPER: '/opt/datadog_wrapper', DD_TRACE_ENABLED: 'true', }, @@ -207,7 +194,6 @@ export class Oom extends cdk.Stack { environment: { ...defaultDatadogEnvVariables, DD_SERVICE: dotnetFunctionName, - DD_SERVERLESS_FLUSH_STRATEGY: flushStrategy, AWS_LAMBDA_EXEC_WRAPPER: '/opt/datadog_wrapper', }, logGroup: createLogGroup(this, dotnetFunctionName), @@ -232,7 +218,6 @@ export class Oom extends cdk.Stack { environment: { ...defaultDatadogEnvVariables, DD_SERVICE: goFunctionName, - DD_SERVERLESS_FLUSH_STRATEGY: flushStrategy, }, logGroup: createLogGroup(this, goFunctionName), }); From 
c1e04de71eda08bf3dc69426e8d06e7ad4edfafc Mon Sep 17 00:00:00 2001 From: Yiming Luo <10097700+lym953@users.noreply.github.com> Date: Mon, 1 Jun 2026 21:02:38 -0400 Subject: [PATCH 08/21] chore: review comment cleanup and Ruby layer bump - Trim historical `provided.al` context from OOM detection comments - Rewrite `test_handle_ondemand_report_emits_oom_on_memory_equality` doc comment to describe what the test covers, not how the rule changed - Refocus `current_request_id` doc on its sole purpose (OOM metric dedup by request_id) and drop speculative scenarios that weren't directly verified; use "LMI mode" consistently - Drop "as of 2026-05" qualifier from the OOM detection path list - Bump Datadog-Ruby3-4-ARM default layer 9 -> 28 Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/lifecycle/invocation/processor.rs | 12 ++++------ bottlecap/src/logs/lambda/processor.rs | 23 ++++++------------- bottlecap/src/metrics/enhanced/lambda.rs | 2 +- integration-tests/lib/util.ts | 2 +- 4 files changed, 14 insertions(+), 25 deletions(-) diff --git a/bottlecap/src/lifecycle/invocation/processor.rs b/bottlecap/src/lifecycle/invocation/processor.rs index c77c224a8..65bd6c330 100644 --- a/bottlecap/src/lifecycle/invocation/processor.rs +++ b/bottlecap/src/lifecycle/invocation/processor.rs @@ -920,9 +920,9 @@ impl Processor { // If the invocation hit the memory limit, increment the OOM metric. This catches // OOM-induced failures that don't surface through a runtime-specific log line or a // `Runtime.OutOfMemory` error_type — most notably the suppressed-init / timeout-at-cap - // pattern reported in datadog-lambda-extension#1237 (Node) and the historical - // provided.al case. Dedup against the other two detection paths is handled by - // `Context::oom_emitted`, which `try_increment_oom_metric` checks and sets. + // pattern reported in datadog-lambda-extension#1237 (Node). 
Dedup against the other + // two detection paths is handled by `Context::oom_emitted`, which + // `try_increment_oom_metric` checks and sets. if metrics.max_memory_used_mb == metrics.memory_size_mb { debug!( "Invocation Processor | PlatformReport | Last invocation hit memory limit. Incrementing OOM metric." @@ -2564,10 +2564,8 @@ mod tests { ); } - /// Regression: the `max_memory_used_mb == memory_size_mb` path used to be gated - /// on `runtime.starts_with("provided.al")`. After generalising the rule to all - /// runtimes (with dedup via `Context::oom_emitted`), the equality case must - /// still emit OOM. + /// In `handle_ondemand_report`, when `max_memory_used_mb == memory_size_mb`, + /// the OOM metric should be incremented exactly once for that invocation. #[tokio::test] async fn test_handle_ondemand_report_emits_oom_on_memory_equality() { let mut p = setup(); diff --git a/bottlecap/src/logs/lambda/processor.rs b/bottlecap/src/logs/lambda/processor.rs index 9c922ac69..863c56103 100644 --- a/bottlecap/src/logs/lambda/processor.rs +++ b/bottlecap/src/logs/lambda/processor.rs @@ -164,23 +164,14 @@ impl LambdaProcessor { } /// Returns the `request_id` of the currently-active invocation, if known. - /// Set by `PlatformStart`, cleared by `PlatformRuntimeDone` / `PlatformReport`. + /// Used by the OOM log-line detector to tag `Event::OutOfMemory` so that + /// `Processor::try_increment_oom_metric` can dedup against the other two + /// detection paths (`Runtime.OutOfMemory` and `PlatformReport` equality). /// - /// Returns `None` when: - /// - **Managed Instance mode**: extensions cannot subscribe to the `INVOKE` event, - /// so `platform.start` is not delivered and this slot is never populated. OOM logs - /// parsed in MI mode are therefore always tagged `None`. The synthesized - /// `PlatformRuntimeDone` produced by `handle_managed_instance_report` does carry a - /// real `request_id`, so dedup still works for that path. 
Worst case is a thin - /// double-count window if a runtime emits both an OOM log line and - /// `error_type = Runtime.OutOfMemory` for the same invocation — not observed in - /// practice today. - /// - **Pre-`PlatformStart` init crash**: a FATAL OOM log emitted by init code can - /// arrive before `PlatformStart` (or with no `PlatformStart` at all, if init - /// fails outright). In the no-`PlatformStart` case no other detection path fires, - /// so no double-count. - /// - **Late log race**: a FATAL log parsed after `PlatformRuntimeDone` clears the - /// slot. By then the context has been removed, so no double-count. + /// `invocation_context.request_id` is set on `PlatformStart` and cleared on + /// `PlatformRuntimeDone` / `PlatformReport`. Returns `None` in LMI mode, + /// where extensions cannot subscribe to the `INVOKE` event so + /// `platform.start` is never delivered. fn current_request_id(&self) -> Option<String> { if self.invocation_context.request_id.is_empty() { None diff --git a/bottlecap/src/metrics/enhanced/lambda.rs b/bottlecap/src/metrics/enhanced/lambda.rs index 67535967e..9f427ef50 100644 --- a/bottlecap/src/metrics/enhanced/lambda.rs +++ b/bottlecap/src/metrics/enhanced/lambda.rs @@ -95,7 +95,7 @@ impl Lambda { // dedupes by `request_id` so the same invocation isn't counted multiple times when // more than one detection path fires. The three paths are: // 1. Runtime-specific OOM log line (.NET, Node, Java, Go, Ruby, Python) - // 2. PlatformRuntimeDone with error_type == "Runtime.OutOfMemory" (Ruby, Python; Node as of 2026-05) + // 2. PlatformRuntimeDone with error_type == "Runtime.OutOfMemory" (Node, Ruby, Python) // 3.
PlatformReport with max_memory_used_mb == memory_size_mb (all runtimes) pub fn increment_oom_metric(&self, timestamp: i64) { self.increment_metric(constants::OUT_OF_MEMORY_METRIC, timestamp); diff --git a/integration-tests/lib/util.ts b/integration-tests/lib/util.ts index 24fe04164..c7645d4c9 100644 --- a/integration-tests/lib/util.ts +++ b/integration-tests/lib/util.ts @@ -21,7 +21,7 @@ export const defaultNodeLayerArn = process.env.NODE_TRACER_LAYER_ARN || 'arn:aws export const defaultPythonLayerArn = process.env.PYTHON_TRACER_LAYER_ARN || 'arn:aws:lambda:us-east-1:464622532012:layer:Datadog-Python313-ARM:117'; export const defaultJavaLayerArn = process.env.JAVA_TRACER_LAYER_ARN || 'arn:aws:lambda:us-east-1:464622532012:layer:dd-trace-java:25'; export const defaultDotnetLayerArn = process.env.DOTNET_TRACER_LAYER_ARN || 'arn:aws:lambda:us-east-1:464622532012:layer:dd-trace-dotnet-ARM:23'; -export const defaultRubyLayerArn = process.env.RUBY_TRACER_LAYER_ARN || 'arn:aws:lambda:us-east-1:464622532012:layer:Datadog-Ruby3-4-ARM:9'; +export const defaultRubyLayerArn = process.env.RUBY_TRACER_LAYER_ARN || 'arn:aws:lambda:us-east-1:464622532012:layer:Datadog-Ruby3-4-ARM:28'; export const defaultDatadogEnvVariables = { DD_API_KEY_SECRET_ARN: datadogSecretArn, From 8e7f0a960992f791748e1d3eec247713c2c8ecdd Mon Sep 17 00:00:00 2001 From: Yiming Luo <10097700+lym953@users.noreply.github.com> Date: Tue, 2 Jun 2026 11:53:02 -0400 Subject: [PATCH 09/21] chore: add debug logs for OOM metric emit-without-dedup branches Per PR review feedback. The two no-dedup branches in `try_increment_oom_metric` were previously silent; surfacing them as debug logs makes the LMI-mode case (request_id=None) and the rare context-eviction case (request_id supplied but absent from the buffer) visible during investigations. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/lifecycle/invocation/processor.rs | 33 +++++++++++++++---- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/bottlecap/src/lifecycle/invocation/processor.rs b/bottlecap/src/lifecycle/invocation/processor.rs index 65bd6c330..bbaaad720 100644 --- a/bottlecap/src/lifecycle/invocation/processor.rs +++ b/bottlecap/src/lifecycle/invocation/processor.rs @@ -1411,17 +1411,36 @@ impl Processor { /// skipped. If `request_id` is `None` (log path saw the OOM outside an active /// invocation window) or no context is found, we emit best-effort without dedup. fn try_increment_oom_metric(&mut self, request_id: Option<&String>, timestamp: i64) { - if let Some(rid) = request_id - && let Some(ctx) = self.context_buffer.get_mut(rid) - { - if ctx.oom_emitted { + if let Some(rid) = request_id { + if let Some(ctx) = self.context_buffer.get_mut(rid) { + if ctx.oom_emitted { + debug!( + "Invocation Processor | OOM metric already emitted for request_id {}, skipping", + rid + ); + return; + } + ctx.oom_emitted = true; + } else { + // request_id was supplied but its context isn't in the buffer. + // This is rare: the buffer has fixed capacity (MAX_CONTEXT_BUFFER_SIZE), + // so under high concurrency an entry can be evicted between + // PlatformStart and the OOM event. Without a context we cannot dedup + // against other paths for this request_id — emit and accept the risk + // of double-counting if a second detection path also fires. debug!( - "Invocation Processor | OOM metric already emitted for request_id {}, skipping", + "Invocation Processor | Emitting OOM metric without dedup: context not found for request_id {} (likely evicted from context buffer)", rid ); - return; } - ctx.oom_emitted = true; + } else { + // No request_id available. 
Only the OOM-log path can supply None, + // and it does so when LambdaProcessor::invocation_context.request_id is + // empty — which in practice means LMI mode (extensions can't subscribe + // to INVOKE, so platform.start never arrives to populate the slot). + debug!( + "Invocation Processor | Emitting OOM metric without dedup: no request_id available (likely LMI mode)" + ); } self.enhanced_metrics.increment_oom_metric(timestamp); } From c76ce4fbb510e7e6c6723df436304fd6e75d7d83 Mon Sep 17 00:00:00 2001 From: Yiming Luo <10097700+lym953@users.noreply.github.com> Date: Tue, 2 Jun 2026 11:59:10 -0400 Subject: [PATCH 10/21] test(integration): add LMI OOM integration test (Python) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a dedicated `lmi-oom` suite that deploys one Python function on the LMI Capacity Provider and asserts that the OOM enhanced metric is emitted when the function hits its memory cap. Exercises the LMI-specific log-line path where `current_request_id()` returns `None` because `platform.start` is never delivered, so the OOM detector flows through the no-dedup branch of `try_increment_oom_metric`. Assertion is `count >= 1` rather than `== 1` because Path 2 (`Runtime.OutOfMemory` via synthesized runtime_done from `handle_managed_instance_report`) also fires for the same invocation and cannot dedup against the log path's `None`. A future change can tighten this once LMI dedup is addressed. Also simplifies overly-verbose comments above the two no-dedup debug logs — the log messages are self-explanatory. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitlab/datasources/test-suites.yaml | 1 + .../src/lifecycle/invocation/processor.rs | 10 --- integration-tests/bin/app.ts | 4 ++ integration-tests/lib/stacks/lmi-oom.ts | 60 ++++++++++++++++++ integration-tests/tests/lmi-oom.test.ts | 61 +++++++++++++++++++ 5 files changed, 126 insertions(+), 10 deletions(-) create mode 100644 integration-tests/lib/stacks/lmi-oom.ts create mode 100644 integration-tests/tests/lmi-oom.test.ts diff --git a/.gitlab/datasources/test-suites.yaml b/.gitlab/datasources/test-suites.yaml index b6d82c369..fd85ba478 100644 --- a/.gitlab/datasources/test-suites.yaml +++ b/.gitlab/datasources/test-suites.yaml @@ -5,3 +5,4 @@ test_suites: - name: lmi - name: auth - name: oom + - name: lmi-oom diff --git a/bottlecap/src/lifecycle/invocation/processor.rs b/bottlecap/src/lifecycle/invocation/processor.rs index bbaaad720..b977a5439 100644 --- a/bottlecap/src/lifecycle/invocation/processor.rs +++ b/bottlecap/src/lifecycle/invocation/processor.rs @@ -1422,22 +1422,12 @@ impl Processor { } ctx.oom_emitted = true; } else { - // request_id was supplied but its context isn't in the buffer. - // This is rare: the buffer has fixed capacity (MAX_CONTEXT_BUFFER_SIZE), - // so under high concurrency an entry can be evicted between - // PlatformStart and the OOM event. Without a context we cannot dedup - // against other paths for this request_id — emit and accept the risk - // of double-counting if a second detection path also fires. debug!( "Invocation Processor | Emitting OOM metric without dedup: context not found for request_id {} (likely evicted from context buffer)", rid ); } } else { - // No request_id available. Only the OOM-log path can supply None, - // and it does so when LambdaProcessor::invocation_context.request_id is - // empty — which in practice means LMI mode (extensions can't subscribe - // to INVOKE, so platform.start never arrives to populate the slot). 
debug!( "Invocation Processor | Emitting OOM metric without dedup: no request_id available (likely LMI mode)" ); diff --git a/integration-tests/bin/app.ts b/integration-tests/bin/app.ts index affce4270..78c0ae4fd 100644 --- a/integration-tests/bin/app.ts +++ b/integration-tests/bin/app.ts @@ -7,6 +7,7 @@ import {Snapstart} from '../lib/stacks/snapstart'; import {LambdaManagedInstancesStack} from '../lib/stacks/lmi'; import {AuthStack} from '../lib/stacks/auth'; import {Oom} from '../lib/stacks/oom'; +import {LmiOom} from '../lib/stacks/lmi-oom'; import {AuthRoleStack} from '../lib/auth-role'; import {ACCOUNT, getIdentifier, REGION} from '../config'; import {CapacityProviderStack} from "../lib/capacity-provider"; @@ -44,6 +45,9 @@ const stacks = [ new Oom(app, `integ-${identifier}-oom`, { env, }), + new LmiOom(app, `integ-${identifier}-lmi-oom`, { + env, + }), ] // Tag all stacks so we can easily clean them up diff --git a/integration-tests/lib/stacks/lmi-oom.ts b/integration-tests/lib/stacks/lmi-oom.ts new file mode 100644 index 000000000..303f64cb6 --- /dev/null +++ b/integration-tests/lib/stacks/lmi-oom.ts @@ -0,0 +1,60 @@ +import * as cdk from 'aws-cdk-lib'; +import * as lambda from 'aws-cdk-lib/aws-lambda'; +import { Construct } from 'constructs'; +import { + createLogGroup, + setCapacityProvider, + defaultDatadogEnvVariables, + defaultDatadogSecretPolicy, + getExtensionLayer, + getDefaultPythonLayer, + defaultPythonRuntime, +} from '../util'; + +/** + * LMI OOM test stack. + * + * Exercises bottlecap OOM detection on a Lambda Managed Instance (LMI) function. + * The interesting LMI-specific path: extensions cannot subscribe to `INVOKE` in + * LMI mode, so `platform.start` is never delivered and + * `LambdaProcessor::invocation_context.request_id` stays empty. The OOM + * log-line detector therefore tags `Event::OutOfMemory` with `request_id=None` + * and `Processor::try_increment_oom_metric` falls into the no-dedup branch. 
+ * + * One Python function is enough to exercise this path — `MemoryError` triggers + * both the runtime-specific log line (path 1) and `Runtime.OutOfMemory` in the + * synthesized runtime-done event from `handle_managed_instance_report` (path 2). + */ +export class LmiOom extends cdk.Stack { + constructor(scope: Construct, id: string, props: cdk.StackProps) { + super(scope, id, props); + + const extensionLayer = getExtensionLayer(this); + const pythonLayer = getDefaultPythonLayer(this); + + const functionName = `${id}-python-lambda`; + const fn = new lambda.Function(this, functionName, { + runtime: defaultPythonRuntime, + architecture: lambda.Architecture.ARM_64, + handler: 'datadog_lambda.handler.handler', + code: lambda.Code.fromAsset('./lambda/oom-python'), + functionName: functionName, + timeout: cdk.Duration.seconds(30), + // 256 MB — see `oom.ts` for why we don't use the customer's 192 MB + // (kernel OOM-kills the extension itself otherwise). + memorySize: 256, + environment: { + ...defaultDatadogEnvVariables, + DD_SERVICE: functionName, + DD_TRACE_ENABLED: 'true', + DD_LAMBDA_HANDLER: 'lambda_function.handler', + DD_TRACE_AGENT_URL: 'http://127.0.0.1:8126', + }, + logGroup: createLogGroup(this, functionName), + }); + setCapacityProvider(fn); + fn.addToRolePolicy(defaultDatadogSecretPolicy); + fn.addLayers(extensionLayer); + fn.addLayers(pythonLayer); + } +} diff --git a/integration-tests/tests/lmi-oom.test.ts b/integration-tests/tests/lmi-oom.test.ts new file mode 100644 index 000000000..573038daf --- /dev/null +++ b/integration-tests/tests/lmi-oom.test.ts @@ -0,0 +1,61 @@ +import { invokeLambda } from './utils/lambda'; +import { getMetricCount, OUT_OF_MEMORY_METRIC } from './utils/datadog'; +import { getIdentifier } from '../config'; + +/** + * LMI OOM test. + * + * Validates that the `aws.lambda.enhanced.out_of_memory` metric is emitted + * when an LMI-mode function OOMs. 
The interesting path is the log-line + * detector — in LMI mode it tags `Event::OutOfMemory` with `request_id=None` + * because `platform.start` never fires there, so the metric flows through the + * no-dedup branch of `Processor::try_increment_oom_metric`. + * + * Known dedup gap: in LMI mode the `Runtime.OutOfMemory` path can also fire + * (via the synthesized runtime-done from `handle_managed_instance_report`), + * and because it carries `request_id=Some(rid)` it cannot dedup against the + * log path's `None`. A single OOM may therefore increment the metric more + * than once. The assertion below is `>= 1` to reflect that — tighten when + * LMI dedup is addressed. + */ +const identifier = getIdentifier(); +const stackName = `integ-${identifier}-lmi-oom`; +const functionName = `${stackName}-python-lambda`; + +const INITIAL_WAIT_MS = 90 * 1000; +const POLL_INTERVAL_MS = 30 * 1000; +const TOTAL_BUDGET_MS = 12 * 60 * 1000; + +async function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +describe('LMI OOM Integration Test', () => { + let count = 0; + + beforeAll(async () => { + const windowStart = Date.now(); + await invokeLambda(functionName).catch((err) => { + throw new Error(`Invoke failed for ${functionName}: ${err}`); + }); + + await sleep(INITIAL_WAIT_MS); + + const deadline = windowStart + TOTAL_BUDGET_MS; + let attempt = 0; + while (Date.now() < deadline) { + attempt++; + count = await getMetricCount(OUT_OF_MEMORY_METRIC, functionName, windowStart, Date.now()); + console.log(`LMI OOM poll #${attempt}: count=${count}`); + if (count >= 1) { + break; + } + await sleep(POLL_INTERVAL_MS); + } + console.log(`LMI OOM count (final): ${count}`); + }, TOTAL_BUDGET_MS + 60 * 1000); + + it('should emit at least one out_of_memory metric for one OOM invocation in LMI mode', () => { + expect(count).toBeGreaterThanOrEqual(1); + }); +}); From 2ae7191f279d48c17f72e7ae97d346dc6197442b Mon Sep 17 00:00:00 2001 From: Yiming Luo 
<10097700+lym953@users.noreply.github.com> Date: Tue, 2 Jun 2026 12:51:08 -0400 Subject: [PATCH 11/21] docs: correct LMI OOM characterization based on empirical verification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Deployed a Python OOM Lambda on the LMI Capacity Provider, captured the extension's debug logs from CloudWatch, and observed the actual flow: - PlatformStart IS delivered in LMI mode (prior comment claimed it wasn't). - For a Python `MemoryError` that fires immediately on first allocation, the OOM log line is processed by `LambdaProcessor` *before* the `PlatformStart` telemetry event's handler updates `invocation_context.request_id` — both arrive in the same millisecond. - `current_request_id()` therefore returns `None` and the metric flows through the no-dedup branch (the new debug log fires). - The synthesized runtime-done from `handle_managed_instance_report` reports `error_type=Runtime.Unknown` (not `Runtime.OutOfMemory`), so Path 2 does NOT fire for this Python OOM shape. Final metric count = 1 (no double-count). Updates the `current_request_id()` doc, the no-dedup debug log message, and the LMI OOM stack/test comments to reflect what was actually observed rather than the prior (incorrect) "platform.start never delivered in LMI" hypothesis. Assertion stays `>= 1` for robustness against future changes. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- bottlecap/src/lifecycle/invocation/processor.rs | 2 +- bottlecap/src/logs/lambda/processor.rs | 10 ++++++---- integration-tests/lib/stacks/lmi-oom.ts | 15 ++++++--------- integration-tests/tests/lmi-oom.test.ts | 17 +++++++---------- 4 files changed, 20 insertions(+), 24 deletions(-) diff --git a/bottlecap/src/lifecycle/invocation/processor.rs b/bottlecap/src/lifecycle/invocation/processor.rs index b977a5439..098d04049 100644 --- a/bottlecap/src/lifecycle/invocation/processor.rs +++ b/bottlecap/src/lifecycle/invocation/processor.rs @@ -1429,7 +1429,7 @@ impl Processor { } } else { debug!( - "Invocation Processor | Emitting OOM metric without dedup: no request_id available (likely LMI mode)" + "Invocation Processor | Emitting OOM metric without dedup: no request_id available (OOM log processed before PlatformStart or after PlatformRuntimeDone)" ); } self.enhanced_metrics.increment_oom_metric(timestamp); diff --git a/bottlecap/src/logs/lambda/processor.rs b/bottlecap/src/logs/lambda/processor.rs index 863c56103..0bba0a542 100644 --- a/bottlecap/src/logs/lambda/processor.rs +++ b/bottlecap/src/logs/lambda/processor.rs @@ -168,10 +168,12 @@ impl LambdaProcessor { /// `Processor::try_increment_oom_metric` can dedup against the other two /// detection paths (`Runtime.OutOfMemory` and `PlatformReport` equality). /// - /// `invocation_context.request_id` is set on `PlatformStart` and cleared on - /// `PlatformRuntimeDone` / `PlatformReport`. Returns `None` in LMI mode, - /// where extensions cannot subscribe to the `INVOKE` event so - /// `platform.start` is never delivered. + /// `invocation_context.request_id` is set when this processor handles + /// `PlatformStart` and cleared on `PlatformRuntimeDone` / `PlatformReport`. 
+ /// Returns `None` when an OOM log line is processed outside that window — + /// most commonly when an OOM fires so quickly at the start of the invocation + /// that the log line beats `PlatformStart` to this processor. Empirically + /// observed with a Python `MemoryError` on an LMI function (PR #1241). fn current_request_id(&self) -> Option<String> { if self.invocation_context.request_id.is_empty() { None diff --git a/integration-tests/lib/stacks/lmi-oom.ts b/integration-tests/lib/stacks/lmi-oom.ts index 303f64cb6..cdd5be6af 100644 --- a/integration-tests/lib/stacks/lmi-oom.ts +++ b/integration-tests/lib/stacks/lmi-oom.ts @@ -15,15 +15,12 @@ import { * LMI OOM test stack. * * Exercises bottlecap OOM detection on a Lambda Managed Instance (LMI) function. - * The interesting LMI-specific path: extensions cannot subscribe to `INVOKE` in - * LMI mode, so `platform.start` is never delivered and - * `LambdaProcessor::invocation_context.request_id` stays empty. The OOM - * log-line detector therefore tags `Event::OutOfMemory` with `request_id=None` - * and `Processor::try_increment_oom_metric` falls into the no-dedup branch. - * - * One Python function is enough to exercise this path — `MemoryError` triggers - * both the runtime-specific log line (path 1) and `Runtime.OutOfMemory` in the - * synthesized runtime-done event from `handle_managed_instance_report` (path 2). + * Verified empirically (PR #1241): when a Python `MemoryError` fires + * immediately on first allocation, the function's OOM log line is processed + * by `LambdaProcessor` before its `PlatformStart` handler sets + * `invocation_context.request_id`, so `current_request_id()` returns `None` + * and the OOM metric flows through the no-dedup branch of + * `Processor::try_increment_oom_metric`.
*/ export class LmiOom extends cdk.Stack { constructor(scope: Construct, id: string, props: cdk.StackProps) { diff --git a/integration-tests/tests/lmi-oom.test.ts b/integration-tests/tests/lmi-oom.test.ts index 573038daf..de67d6442 100644 --- a/integration-tests/tests/lmi-oom.test.ts +++ b/integration-tests/tests/lmi-oom.test.ts @@ -6,17 +6,14 @@ import { getIdentifier } from '../config'; * LMI OOM test. * * Validates that the `aws.lambda.enhanced.out_of_memory` metric is emitted - * when an LMI-mode function OOMs. The interesting path is the log-line - * detector — in LMI mode it tags `Event::OutOfMemory` with `request_id=None` - * because `platform.start` never fires there, so the metric flows through the - * no-dedup branch of `Processor::try_increment_oom_metric`. + * when an LMI-mode Python function hits `MemoryError`. The interesting case + * (verified locally — see PR #1241 thread): the log line is processed before + * `LambdaProcessor` handles `PlatformStart`, so the OOM event is tagged + * `request_id=None` and the metric flows through the no-dedup branch. * - * Known dedup gap: in LMI mode the `Runtime.OutOfMemory` path can also fire - * (via the synthesized runtime-done from `handle_managed_instance_report`), - * and because it carries `request_id=Some(rid)` it cannot dedup against the - * log path's `None`. A single OOM may therefore increment the metric more - * than once. The assertion below is `>= 1` to reflect that — tighten when - * LMI dedup is addressed. + * Asserts `>= 1` rather than `== 1` to stay robust against other paths firing + * (e.g. a future change where `handle_managed_instance_report` surfaces + * `Runtime.OutOfMemory` in the synthesized runtime-done). 
*/ const identifier = getIdentifier(); const stackName = `integ-${identifier}-lmi-oom`; From bff4ad348b4ae787cf1c656235fdd906975f095c Mon Sep 17 00:00:00 2001 From: Yiming Luo <10097700+lym953@users.noreply.github.com> Date: Tue, 2 Jun 2026 13:24:28 -0400 Subject: [PATCH 12/21] docs: make best-effort nature of OOM metric dedup explicit Previously the comments on `try_increment_oom_metric`, the path-3 caller in `handle_ondemand_report`, and `increment_oom_metric` all opened with confident phrasing ("exactly once per request_id", "isn't counted multiple times when more than one detection path fires"), and the no-dedup fallback was buried at the bottom or absent. That mischaracterizes the guarantee: when the OOM log line lands before/after the active-invocation window in `LambdaProcessor`, or when the context has been evicted, the metric will be double-counted by a subsequent detection path. Restructures the three comments so the best-effort caveat is up front and the two edge cases (request_id=None race, context evicted) are called out explicitly with their consequences. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/lifecycle/invocation/processor.rs | 25 +++++++++++++------ bottlecap/src/metrics/enhanced/lambda.rs | 6 ++--- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/bottlecap/src/lifecycle/invocation/processor.rs b/bottlecap/src/lifecycle/invocation/processor.rs index 098d04049..7f7336bb8 100644 --- a/bottlecap/src/lifecycle/invocation/processor.rs +++ b/bottlecap/src/lifecycle/invocation/processor.rs @@ -920,9 +920,9 @@ impl Processor { // If the invocation hit the memory limit, increment the OOM metric. This catches // OOM-induced failures that don't surface through a runtime-specific log line or a // `Runtime.OutOfMemory` error_type — most notably the suppressed-init / timeout-at-cap - // pattern reported in datadog-lambda-extension#1237 (Node). 
Dedup against the other - // two detection paths is handled by `Context::oom_emitted`, which - // `try_increment_oom_metric` checks and sets. + // pattern reported in datadog-lambda-extension#1237 (Node). Best-effort dedup + // against the other two detection paths is handled by `try_increment_oom_metric` + // (it can still double-count in edge cases — see that function's doc). if metrics.max_memory_used_mb == metrics.memory_size_mb { debug!( "Invocation Processor | PlatformReport | Last invocation hit memory limit. Incrementing OOM metric." @@ -1399,17 +1399,26 @@ impl Processor { self.try_increment_oom_metric(request_id, timestamp); } - /// Increments the OOM enhanced metric exactly once per `request_id`. + /// Best-effort dedup wrapper around `enhanced_metrics.increment_oom_metric`. + /// The metric MAY be double-counted in edge cases — see below. /// /// Several detection paths can fire for the same invocation: /// 1. A runtime-specific OOM log line (logs processor → `Event::OutOfMemory`) /// 2. `error_type == "Runtime.OutOfMemory"` in `PlatformRuntimeDone` /// 3. `max_memory_used_mb == memory_size_mb` in `PlatformReport` /// - /// To avoid double-counting, the per-invocation `Context::oom_emitted` flag is - /// set on the first emission. Subsequent emissions for the same `request_id` are - /// skipped. If `request_id` is `None` (log path saw the OOM outside an active - /// invocation window) or no context is found, we emit best-effort without dedup. + /// When `request_id` is supplied AND the matching context is still in the + /// buffer, the per-invocation `Context::oom_emitted` flag guarantees one + /// emission per `request_id`. 
The metric is double-counted when either: + /// - `request_id` is `None` (log line beat `PlatformStart` to + /// `LambdaProcessor`, or it landed after `PlatformRuntimeDone` cleared + /// the slot) and another path subsequently emits with `Some(rid)`; or + /// - the context has been evicted from the buffer (capacity is fixed — + /// see `MAX_CONTEXT_BUFFER_SIZE`) between `PlatformStart` and this + /// call, so the flag has nowhere to live. + /// + /// Both branches still emit (so OOMs are never under-counted) and log a + /// `debug!` line. fn try_increment_oom_metric(&mut self, request_id: Option<&String>, timestamp: i64) { if let Some(rid) = request_id { if let Some(ctx) = self.context_buffer.get_mut(rid) { diff --git a/bottlecap/src/metrics/enhanced/lambda.rs b/bottlecap/src/metrics/enhanced/lambda.rs index 9f427ef50..9260064c0 100644 --- a/bottlecap/src/metrics/enhanced/lambda.rs +++ b/bottlecap/src/metrics/enhanced/lambda.rs @@ -91,9 +91,9 @@ impl Lambda { self.increment_metric(constants::TIMEOUTS_METRIC, timestamp); } - // Callers should generally go through `Processor::try_increment_oom_metric`, which - // dedupes by `request_id` so the same invocation isn't counted multiple times when - // more than one detection path fires. The three paths are: + // Callers should generally go through `Processor::try_increment_oom_metric`, + // which provides best-effort dedup by `request_id` (see its doc for the + // edge cases that can still double-count). The three detection paths are: // 1. Runtime-specific OOM log line (.NET, Node, Java, Go, Ruby, Python) // 2. PlatformRuntimeDone with error_type == "Runtime.OutOfMemory" (Node, Ruby, Python) // 3. 
PlatformReport with max_memory_used_mb == memory_size_mb (all runtimes) From 3f31c2cc6eafde462ab04837dbc9877a1fd32474 Mon Sep 17 00:00:00 2001 From: Yiming Luo <10097700+lym953@users.noreply.github.com> Date: Tue, 2 Jun 2026 13:38:45 -0400 Subject: [PATCH 13/21] fix(logs): prefer log-payload requestId for OOM events in LMI mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In LMI mode the function-log JSON payload always carries a `requestId` field that we already extract a few lines above the OOM detection block. Plumbing that value into `Event::OutOfMemory` instead of falling back to `current_request_id()` closes the race observed in https://github.com/DataDog/datadog-lambda-extension/pull/1241#discussion_r3338125266 where a fast OOM log line is processed before this same processor's `PlatformStart` handler updates `invocation_context.request_id`. OnDemand mode is unaffected — `request_id` from the log payload is unconditionally `None` there, so we still fall back to `current_request_id()`, which works because `PlatformStart`'s race window doesn't manifest in OnDemand operationally. Updates the `current_request_id` doc and the LMI OOM stack/test comments to reflect that the LMI case now goes through the deduped branch by way of the payload `requestId`. Co-Authored-By: Claude Opus 4.7 (1M context) --- bottlecap/src/logs/lambda/processor.rs | 24 ++++++++++++++++++------ integration-tests/lib/stacks/lmi-oom.ts | 10 ++++------ integration-tests/tests/lmi-oom.test.ts | 8 ++++---- 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/bottlecap/src/logs/lambda/processor.rs b/bottlecap/src/logs/lambda/processor.rs index 0bba0a542..b954f9190 100644 --- a/bottlecap/src/logs/lambda/processor.rs +++ b/bottlecap/src/logs/lambda/processor.rs @@ -164,16 +164,19 @@ impl LambdaProcessor { } /// Returns the `request_id` of the currently-active invocation, if known. 
- /// Used by the OOM log-line detector to tag `Event::OutOfMemory` so that - /// `Processor::try_increment_oom_metric` can dedup against the other two - /// detection paths (`Runtime.OutOfMemory` and `PlatformReport` equality). + /// Used by the OOM log-line detector as a fallback to tag `Event::OutOfMemory` + /// so that `Processor::try_increment_oom_metric` can dedup against the other + /// two detection paths (`Runtime.OutOfMemory` and `PlatformReport` equality). + /// In LMI mode, the OOM detector prefers the `requestId` field parsed from + /// the function-log JSON payload — it doesn't race with `PlatformStart` and + /// matches this log line exactly. Use `current_request_id()` only when that + /// field isn't available (on-demand mode, or non-JSON log payloads). /// /// `invocation_context.request_id` is set when this processor handles /// `PlatformStart` and cleared on `PlatformRuntimeDone` / `PlatformReport`. /// Returns `None` when an OOM log line is processed outside that window — /// most commonly when an OOM fires so quickly at the start of the invocation - /// that the log line beats `PlatformStart` to this processor. Empirically - /// observed with a Python `MemoryError` on an LMI function (PR #1241). + /// that the log line beats `PlatformStart` to this processor. fn current_request_id(&self) -> Option<String> { if self.invocation_context.request_id.is_empty() { None @@ -213,8 +216,17 @@ impl LambdaProcessor { if let Some(message) = message { if is_oom_error(&message) { debug!("LOGS | Got a runtime-specific OOM error. Incrementing OOM metric."); + // Prefer the `requestId` parsed from the log payload above + // (populated in LMI mode where the runtime stamps every JSON + // log with the in-flight request id). It is guaranteed to be + // correct for this log line, whereas `current_request_id()` + // depends on `PlatformStart` having already updated + // `invocation_context`, which races with fast OOM logs.
Fall + // back to `current_request_id()` in OnDemand mode where + // `request_id` from the log payload is not extracted. + let oom_request_id = request_id.clone().or_else(|| self.current_request_id()); if let Err(e) = self.event_bus.send(Event::OutOfMemory { - request_id: self.current_request_id(), + request_id: oom_request_id, timestamp: event.time.timestamp(), }).await { error!("LOGS | Failed to send OOM event to the main event bus: {e}"); diff --git a/integration-tests/lib/stacks/lmi-oom.ts b/integration-tests/lib/stacks/lmi-oom.ts index cdd5be6af..cdeecd486 100644 --- a/integration-tests/lib/stacks/lmi-oom.ts +++ b/integration-tests/lib/stacks/lmi-oom.ts @@ -15,12 +15,10 @@ import { * LMI OOM test stack. * * Exercises bottlecap OOM detection on a Lambda Managed Instance (LMI) function. - * Verified empirically (PR #1241): when a Python `MemoryError` fires - * immediately on first allocation, the function's OOM log line is processed - * by `LambdaProcessor` before its `PlatformStart` handler sets - * `invocation_context.request_id`, so `current_request_id()` returns `None` - * and the OOM metric flows through the no-dedup branch of - * `Processor::try_increment_oom_metric`. + * In LMI mode the function-log JSON payload carries a `requestId` field that + * the OOM detector reads directly (see `LambdaProcessor::get_message`), so + * dedup against the other OOM detection paths works reliably even when an + * OOM fires fast enough that the log line beats `PlatformStart`. */ export class LmiOom extends cdk.Stack { constructor(scope: Construct, id: string, props: cdk.StackProps) { diff --git a/integration-tests/tests/lmi-oom.test.ts b/integration-tests/tests/lmi-oom.test.ts index de67d6442..16191bb49 100644 --- a/integration-tests/tests/lmi-oom.test.ts +++ b/integration-tests/tests/lmi-oom.test.ts @@ -6,10 +6,10 @@ import { getIdentifier } from '../config'; * LMI OOM test. 
* * Validates that the `aws.lambda.enhanced.out_of_memory` metric is emitted - * when an LMI-mode Python function hits `MemoryError`. The interesting case - * (verified locally — see PR #1241 thread): the log line is processed before - * `LambdaProcessor` handles `PlatformStart`, so the OOM event is tagged - * `request_id=None` and the metric flows through the no-dedup branch. + * when an LMI-mode Python function hits `MemoryError`. In LMI mode the OOM + * log path tags `Event::OutOfMemory` with the `requestId` parsed from the + * function-log JSON payload, so dedup works without depending on + * `PlatformStart` having raced ahead of the log line. * * Asserts `>= 1` rather than `== 1` to stay robust against other paths firing * (e.g. a future change where `handle_managed_instance_report` surfaces From 1c95f5aee67fdd8886e43703df3420448332f702 Mon Sep 17 00:00:00 2001 From: Yiming Luo <10097700+lym953@users.noreply.github.com> Date: Tue, 2 Jun 2026 13:57:22 -0400 Subject: [PATCH 14/21] refactor(logs): drop current_request_id() helper, use payload requestId universally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per https://github.com/DataDog/datadog-lambda-extension/pull/1241#discussion_r3338125266: paths 2 and 3 already get `request_id` directly from the PlatformRuntimeDone and PlatformReport event payloads (as a function parameter); only path 1 (the OOM log-line detector in `LambdaProcessor::get_message`) was using `current_request_id()`. And path 1 has an even better source for the request id — the `requestId` field that structured JSON log payloads already carry — which doesn't race with the in-processor `PlatformStart` handler. Drops the `is_managed_instance_mode` gate around payload `requestId` extraction so on-demand mode also benefits (it was the LMI Python case that surfaced the race empirically, but the same source is more accurate than `invocation_context.request_id` in on-demand mode too). 
The OOM detector now tags `Event::OutOfMemory` with the extracted payload `requestId` directly; the Extension log variant passes `None` (extension log payloads don't carry a function request id), and falls through to `try_increment_oom_metric`'s no-dedup branch. Updates `test_regular_lambda_does_not_extract_request_id` → `test_regular_lambda_extracts_request_id_from_payload` since the rule it was locking in (LMI-only extraction) no longer holds. Co-Authored-By: Claude Opus 4.7 (1M context) --- bottlecap/src/logs/lambda/processor.rs | 71 +++++++++---------------- integration-tests/lib/stacks/lmi-oom.ts | 11 ++-- integration-tests/tests/lmi-oom.test.ts | 14 +++-- 3 files changed, 39 insertions(+), 57 deletions(-) diff --git a/bottlecap/src/logs/lambda/processor.rs b/bottlecap/src/logs/lambda/processor.rs index b954f9190..ef351f637 100644 --- a/bottlecap/src/logs/lambda/processor.rs +++ b/bottlecap/src/logs/lambda/processor.rs @@ -163,28 +163,6 @@ impl LambdaProcessor { } } - /// Returns the `request_id` of the currently-active invocation, if known. - /// Used by the OOM log-line detector as a fallback to tag `Event::OutOfMemory` - /// so that `Processor::try_increment_oom_metric` can dedup against the other - /// two detection paths (`Runtime.OutOfMemory` and `PlatformReport` equality). - /// In LMI mode, the OOM detector prefers the `requestId` field parsed from - /// the function-log JSON payload — it doesn't race with `PlatformStart` and - /// matches this log line exactly. Use `current_request_id()` only when that - /// field isn't available (on-demand mode, or non-JSON log payloads). - /// - /// `invocation_context.request_id` is set when this processor handles - /// `PlatformStart` and cleared on `PlatformRuntimeDone` / `PlatformReport`. - /// Returns `None` when an OOM log line is processed outside that window — - /// most commonly when an OOM fires so quickly at the start of the invocation - /// that the log line beats `PlatformStart` to this processor. 
- fn current_request_id(&self) -> Option { - if self.invocation_context.request_id.is_empty() { - None - } else { - Some(self.invocation_context.request_id.clone()) - } - } - #[allow(clippy::too_many_lines)] async fn get_message(&mut self, event: TelemetryEvent) -> Result> { let copy = event.clone(); @@ -192,14 +170,18 @@ impl LambdaProcessor { TelemetryRecord::Function(v) => { let (request_id, message, durable_ctx) = match v { serde_json::Value::Object(obj) => { - let request_id = if self.is_managed_instance_mode { - obj.get("requestId") - .or_else(|| obj.get("AWSRequestId")) - .and_then(|v| v.as_str()) - .map(ToString::to_string) - } else { - None - }; + // The Lambda runtimes that emit structured JSON logs (Python, + // and Node/Ruby/Java/.NET when JSON log format is configured) + // stamp every log with the in-flight `requestId`. Extract it + // here so the OOM detector below can tag `Event::OutOfMemory` + // with the request id of the log line that actually matched. + // This is the only reliable way to dedup against the other + // OOM detection paths when a fast OOM beats `PlatformStart` + // through this processor. + let request_id = obj.get("requestId") + .or_else(|| obj.get("AWSRequestId")) + .and_then(|v| v.as_str()) + .map(ToString::to_string); // When a message is logged from the durable execution SDK, it contains an `executionArn` field. // In this case, extract the durable execution context from the `executionArn` field, and later // set durable execution id and name as log attributes. @@ -216,17 +198,8 @@ impl LambdaProcessor { if let Some(message) = message { if is_oom_error(&message) { debug!("LOGS | Got a runtime-specific OOM error. Incrementing OOM metric."); - // Prefer the `requestId` parsed from the log payload above - // (populated in LMI mode where the runtime stamps every JSON - // log with the in-flight request id). 
It is guaranteed to be - // correct for this log line, whereas `current_request_id()` - // depends on `PlatformStart` having already updated - // `invocation_context`, which races with fast OOM logs. Fall - // back to `current_request_id()` in OnDemand mode where - // `request_id` from the log payload is not extracted. - let oom_request_id = request_id.clone().or_else(|| self.current_request_id()); if let Err(e) = self.event_bus.send(Event::OutOfMemory { - request_id: oom_request_id, + request_id: request_id.clone(), timestamp: event.time.timestamp(), }).await { error!("LOGS | Failed to send OOM event to the main event bus: {e}"); @@ -261,8 +234,11 @@ impl LambdaProcessor { if let Some(message) = message { if is_oom_error(&message) { debug!("LOGS | Got a runtime-specific OOM error. Incrementing OOM metric."); + // Extension log payloads do not carry a function `requestId`, + // so this path can't dedup. `try_increment_oom_metric` will + // fall through to its no-dedup branch. if let Err(e) = self.event_bus.send(Event::OutOfMemory { - request_id: self.current_request_id(), + request_id: None, timestamp: event.time.timestamp(), }).await { error!("LOGS | Failed to send OOM event to the main event bus: {e}"); @@ -1971,7 +1947,7 @@ mod tests { } #[tokio::test] - async fn test_regular_lambda_does_not_extract_request_id() { + async fn test_regular_lambda_extracts_request_id_from_payload() { let tags = HashMap::from([("test".to_string(), "tags".to_string())]); let config = Arc::new(config::Config { service: Some("test-service".to_string()), @@ -1998,7 +1974,10 @@ mod tests { false, // Regular Lambda mode (not LMI) ); - // Test that requestId is NOT extracted in regular Lambda mode + // The payload `requestId` is extracted in both regular and LMI mode + // so that the OOM detector can dedup against the other detection paths + // by tagging `Event::OutOfMemory` with the request id of the exact log + // line that matched. 
let mut obj = serde_json::Map::new(); obj.insert( "requestId".to_string(), @@ -2015,8 +1994,10 @@ mod tests { }; let result = processor.get_message(event).await.unwrap(); - // Should be None because we're not in LMI mode - assert_eq!(result.lambda.request_id, None); + assert_eq!( + result.lambda.request_id, + Some("test-request-789".to_string()) + ); } #[test] diff --git a/integration-tests/lib/stacks/lmi-oom.ts b/integration-tests/lib/stacks/lmi-oom.ts index cdeecd486..61402f699 100644 --- a/integration-tests/lib/stacks/lmi-oom.ts +++ b/integration-tests/lib/stacks/lmi-oom.ts @@ -15,10 +15,13 @@ import { * LMI OOM test stack. * * Exercises bottlecap OOM detection on a Lambda Managed Instance (LMI) function. - * In LMI mode the function-log JSON payload carries a `requestId` field that - * the OOM detector reads directly (see `LambdaProcessor::get_message`), so - * dedup against the other OOM detection paths works reliably even when an - * OOM fires fast enough that the log line beats `PlatformStart`. + * The OOM detector tags `Event::OutOfMemory` with the `requestId` parsed from + * the function-log JSON payload (see `LambdaProcessor::get_message`), which + * the LMI Python runtime always stamps on its OOM error log. The other two + * detection paths (`Runtime.OutOfMemory`, `PlatformReport` equality) carry + * the same request id directly from their event payloads, so dedup via + * `Context::oom_emitted` works end-to-end and the metric increments exactly + * once per OOM invocation. */ export class LmiOom extends cdk.Stack { constructor(scope: Construct, id: string, props: cdk.StackProps) { diff --git a/integration-tests/tests/lmi-oom.test.ts b/integration-tests/tests/lmi-oom.test.ts index 16191bb49..6698f78fd 100644 --- a/integration-tests/tests/lmi-oom.test.ts +++ b/integration-tests/tests/lmi-oom.test.ts @@ -6,14 +6,12 @@ import { getIdentifier } from '../config'; * LMI OOM test. 
* * Validates that the `aws.lambda.enhanced.out_of_memory` metric is emitted - * when an LMI-mode Python function hits `MemoryError`. In LMI mode the OOM - * log path tags `Event::OutOfMemory` with the `requestId` parsed from the - * function-log JSON payload, so dedup works without depending on - * `PlatformStart` having raced ahead of the log line. - * - * Asserts `>= 1` rather than `== 1` to stay robust against other paths firing - * (e.g. a future change where `handle_managed_instance_report` surfaces - * `Runtime.OutOfMemory` in the synthesized runtime-done). + * when an LMI-mode Python function hits `MemoryError`. The OOM log path tags + * `Event::OutOfMemory` with the `requestId` parsed from the function-log + * JSON payload, so dedup against the other detection paths works without + * depending on `PlatformStart` racing ahead of the log line. Asserts + * `>= 1` rather than `== 1` to stay robust against unrelated future + * changes to the synthesized runtime-done path. */ const identifier = getIdentifier(); const stackName = `integ-${identifier}-lmi-oom`; From 8115a6633202d7521617e92596a6f987afacc91f Mon Sep 17 00:00:00 2001 From: Yiming Luo <10097700+lym953@users.noreply.github.com> Date: Tue, 2 Jun 2026 14:07:05 -0400 Subject: [PATCH 15/21] Revert "refactor(logs): drop current_request_id() helper, use payload requestId universally" This reverts commit 1c95f5aee67fdd8886e43703df3420448332f702. --- bottlecap/src/logs/lambda/processor.rs | 71 ++++++++++++++++--------- integration-tests/lib/stacks/lmi-oom.ts | 11 ++-- integration-tests/tests/lmi-oom.test.ts | 14 ++--- 3 files changed, 57 insertions(+), 39 deletions(-) diff --git a/bottlecap/src/logs/lambda/processor.rs b/bottlecap/src/logs/lambda/processor.rs index ef351f637..b954f9190 100644 --- a/bottlecap/src/logs/lambda/processor.rs +++ b/bottlecap/src/logs/lambda/processor.rs @@ -163,6 +163,28 @@ impl LambdaProcessor { } } + /// Returns the `request_id` of the currently-active invocation, if known. 
+ /// Used by the OOM log-line detector as a fallback to tag `Event::OutOfMemory` + /// so that `Processor::try_increment_oom_metric` can dedup against the other + /// two detection paths (`Runtime.OutOfMemory` and `PlatformReport` equality). + /// In LMI mode, the OOM detector prefers the `requestId` field parsed from + /// the function-log JSON payload — it doesn't race with `PlatformStart` and + /// matches this log line exactly. Use `current_request_id()` only when that + /// field isn't available (on-demand mode, or non-JSON log payloads). + /// + /// `invocation_context.request_id` is set when this processor handles + /// `PlatformStart` and cleared on `PlatformRuntimeDone` / `PlatformReport`. + /// Returns `None` when an OOM log line is processed outside that window — + /// most commonly when an OOM fires so quickly at the start of the invocation + /// that the log line beats `PlatformStart` to this processor. + fn current_request_id(&self) -> Option { + if self.invocation_context.request_id.is_empty() { + None + } else { + Some(self.invocation_context.request_id.clone()) + } + } + #[allow(clippy::too_many_lines)] async fn get_message(&mut self, event: TelemetryEvent) -> Result> { let copy = event.clone(); @@ -170,18 +192,14 @@ impl LambdaProcessor { TelemetryRecord::Function(v) => { let (request_id, message, durable_ctx) = match v { serde_json::Value::Object(obj) => { - // The Lambda runtimes that emit structured JSON logs (Python, - // and Node/Ruby/Java/.NET when JSON log format is configured) - // stamp every log with the in-flight `requestId`. Extract it - // here so the OOM detector below can tag `Event::OutOfMemory` - // with the request id of the log line that actually matched. - // This is the only reliable way to dedup against the other - // OOM detection paths when a fast OOM beats `PlatformStart` - // through this processor. 
- let request_id = obj.get("requestId") - .or_else(|| obj.get("AWSRequestId")) - .and_then(|v| v.as_str()) - .map(ToString::to_string); + let request_id = if self.is_managed_instance_mode { + obj.get("requestId") + .or_else(|| obj.get("AWSRequestId")) + .and_then(|v| v.as_str()) + .map(ToString::to_string) + } else { + None + }; // When a message is logged from the durable execution SDK, it contains an `executionArn` field. // In this case, extract the durable execution context from the `executionArn` field, and later // set durable execution id and name as log attributes. @@ -198,8 +216,17 @@ impl LambdaProcessor { if let Some(message) = message { if is_oom_error(&message) { debug!("LOGS | Got a runtime-specific OOM error. Incrementing OOM metric."); + // Prefer the `requestId` parsed from the log payload above + // (populated in LMI mode where the runtime stamps every JSON + // log with the in-flight request id). It is guaranteed to be + // correct for this log line, whereas `current_request_id()` + // depends on `PlatformStart` having already updated + // `invocation_context`, which races with fast OOM logs. Fall + // back to `current_request_id()` in OnDemand mode where + // `request_id` from the log payload is not extracted. + let oom_request_id = request_id.clone().or_else(|| self.current_request_id()); if let Err(e) = self.event_bus.send(Event::OutOfMemory { - request_id: request_id.clone(), + request_id: oom_request_id, timestamp: event.time.timestamp(), }).await { error!("LOGS | Failed to send OOM event to the main event bus: {e}"); @@ -234,11 +261,8 @@ impl LambdaProcessor { if let Some(message) = message { if is_oom_error(&message) { debug!("LOGS | Got a runtime-specific OOM error. Incrementing OOM metric."); - // Extension log payloads do not carry a function `requestId`, - // so this path can't dedup. `try_increment_oom_metric` will - // fall through to its no-dedup branch. 
if let Err(e) = self.event_bus.send(Event::OutOfMemory { - request_id: None, + request_id: self.current_request_id(), timestamp: event.time.timestamp(), }).await { error!("LOGS | Failed to send OOM event to the main event bus: {e}"); @@ -1947,7 +1971,7 @@ mod tests { } #[tokio::test] - async fn test_regular_lambda_extracts_request_id_from_payload() { + async fn test_regular_lambda_does_not_extract_request_id() { let tags = HashMap::from([("test".to_string(), "tags".to_string())]); let config = Arc::new(config::Config { service: Some("test-service".to_string()), @@ -1974,10 +1998,7 @@ mod tests { false, // Regular Lambda mode (not LMI) ); - // The payload `requestId` is extracted in both regular and LMI mode - // so that the OOM detector can dedup against the other detection paths - // by tagging `Event::OutOfMemory` with the request id of the exact log - // line that matched. + // Test that requestId is NOT extracted in regular Lambda mode let mut obj = serde_json::Map::new(); obj.insert( "requestId".to_string(), @@ -1994,10 +2015,8 @@ mod tests { }; let result = processor.get_message(event).await.unwrap(); - assert_eq!( - result.lambda.request_id, - Some("test-request-789".to_string()) - ); + // Should be None because we're not in LMI mode + assert_eq!(result.lambda.request_id, None); } #[test] diff --git a/integration-tests/lib/stacks/lmi-oom.ts b/integration-tests/lib/stacks/lmi-oom.ts index 61402f699..cdeecd486 100644 --- a/integration-tests/lib/stacks/lmi-oom.ts +++ b/integration-tests/lib/stacks/lmi-oom.ts @@ -15,13 +15,10 @@ import { * LMI OOM test stack. * * Exercises bottlecap OOM detection on a Lambda Managed Instance (LMI) function. - * The OOM detector tags `Event::OutOfMemory` with the `requestId` parsed from - * the function-log JSON payload (see `LambdaProcessor::get_message`), which - * the LMI Python runtime always stamps on its OOM error log. 
The other two - * detection paths (`Runtime.OutOfMemory`, `PlatformReport` equality) carry - * the same request id directly from their event payloads, so dedup via - * `Context::oom_emitted` works end-to-end and the metric increments exactly - * once per OOM invocation. + * In LMI mode the function-log JSON payload carries a `requestId` field that + * the OOM detector reads directly (see `LambdaProcessor::get_message`), so + * dedup against the other OOM detection paths works reliably even when an + * OOM fires fast enough that the log line beats `PlatformStart`. */ export class LmiOom extends cdk.Stack { constructor(scope: Construct, id: string, props: cdk.StackProps) { diff --git a/integration-tests/tests/lmi-oom.test.ts b/integration-tests/tests/lmi-oom.test.ts index 6698f78fd..16191bb49 100644 --- a/integration-tests/tests/lmi-oom.test.ts +++ b/integration-tests/tests/lmi-oom.test.ts @@ -6,12 +6,14 @@ import { getIdentifier } from '../config'; * LMI OOM test. * * Validates that the `aws.lambda.enhanced.out_of_memory` metric is emitted - * when an LMI-mode Python function hits `MemoryError`. The OOM log path tags - * `Event::OutOfMemory` with the `requestId` parsed from the function-log - * JSON payload, so dedup against the other detection paths works without - * depending on `PlatformStart` racing ahead of the log line. Asserts - * `>= 1` rather than `== 1` to stay robust against unrelated future - * changes to the synthesized runtime-done path. + * when an LMI-mode Python function hits `MemoryError`. In LMI mode the OOM + * log path tags `Event::OutOfMemory` with the `requestId` parsed from the + * function-log JSON payload, so dedup works without depending on + * `PlatformStart` having raced ahead of the log line. + * + * Asserts `>= 1` rather than `== 1` to stay robust against other paths firing + * (e.g. a future change where `handle_managed_instance_report` surfaces + * `Runtime.OutOfMemory` in the synthesized runtime-done). 
*/ const identifier = getIdentifier(); const stackName = `integ-${identifier}-lmi-oom`; From 466200938c9537f7b4c7da19d42f7839e35cea98 Mon Sep 17 00:00:00 2001 From: Yiming Luo <10097700+lym953@users.noreply.github.com> Date: Tue, 2 Jun 2026 14:10:32 -0400 Subject: [PATCH 16/21] fix(logs): always extract payload requestId; keep current_request_id() fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per https://github.com/DataDog/datadog-lambda-extension/pull/1241 review: when the request id is available from the log/event payload, use it directly; only fall back to a workaround (`current_request_id()`) when the payload doesn't carry one. Drops the `is_managed_instance_mode` gate on payload `requestId` extraction so on-demand mode also benefits. The OOM detector now reads the payload field whenever it's present (Python, Ruby, .NET, and Java/Node when JSON log format is configured) regardless of mode, and falls back to `current_request_id()` only for text-payload OOM logs (Node V8 fatal, Go fatal, Java stderr) where no `requestId` field exists. The fallback path preserves the count==1 behavior for the double-detect cases on the integration suite (Java OutOfMemoryError, Node SIGKILL, Go fatal-error) — these were what the previous "drop current_request_id() entirely" refactor would have regressed. Also renames `test_regular_lambda_does_not_extract_request_id` → `test_regular_lambda_extracts_request_id_from_payload` to match the new universal extraction behavior. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- bottlecap/src/logs/lambda/processor.rs | 63 +++++++++++++------------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/bottlecap/src/logs/lambda/processor.rs b/bottlecap/src/logs/lambda/processor.rs index b954f9190..7dc84f672 100644 --- a/bottlecap/src/logs/lambda/processor.rs +++ b/bottlecap/src/logs/lambda/processor.rs @@ -164,19 +164,16 @@ impl LambdaProcessor { } /// Returns the `request_id` of the currently-active invocation, if known. - /// Used by the OOM log-line detector as a fallback to tag `Event::OutOfMemory` - /// so that `Processor::try_increment_oom_metric` can dedup against the other - /// two detection paths (`Runtime.OutOfMemory` and `PlatformReport` equality). - /// In LMI mode, the OOM detector prefers the `requestId` field parsed from - /// the function-log JSON payload — it doesn't race with `PlatformStart` and - /// matches this log line exactly. Use `current_request_id()` only when that - /// field isn't available (on-demand mode, or non-JSON log payloads). + /// Used by the OOM log-line detector as a fallback when the matched log + /// payload doesn't carry a `requestId` field (text payloads — Node V8 + /// fatal output, Go fatal, Java stderr, etc.). The payload field is + /// preferred when available because it matches the log line exactly and + /// doesn't race with `PlatformStart`. /// /// `invocation_context.request_id` is set when this processor handles - /// `PlatformStart` and cleared on `PlatformRuntimeDone` / `PlatformReport`. - /// Returns `None` when an OOM log line is processed outside that window — - /// most commonly when an OOM fires so quickly at the start of the invocation - /// that the log line beats `PlatformStart` to this processor. + /// `PlatformStart` and cleared on `PlatformRuntimeDone` / `PlatformReport`, + /// so this returns `None` when an OOM log line is processed outside that + /// window. 
fn current_request_id(&self) -> Option { if self.invocation_context.request_id.is_empty() { None @@ -192,14 +189,16 @@ impl LambdaProcessor { TelemetryRecord::Function(v) => { let (request_id, message, durable_ctx) = match v { serde_json::Value::Object(obj) => { - let request_id = if self.is_managed_instance_mode { - obj.get("requestId") - .or_else(|| obj.get("AWSRequestId")) - .and_then(|v| v.as_str()) - .map(ToString::to_string) - } else { - None - }; + // Extract `requestId` (or `AWSRequestId`) from the log payload + // when present. Lambda runtimes that emit structured JSON + // logs (Python; Node/Ruby/Java/.NET when JSON log format is + // configured) stamp every log line with the in-flight + // request id, which is the most accurate source — it doesn't + // race with the in-processor `PlatformStart` handler. + let request_id = obj.get("requestId") + .or_else(|| obj.get("AWSRequestId")) + .and_then(|v| v.as_str()) + .map(ToString::to_string); // When a message is logged from the durable execution SDK, it contains an `executionArn` field. // In this case, extract the durable execution context from the `executionArn` field, and later // set durable execution id and name as log attributes. @@ -216,14 +215,11 @@ impl LambdaProcessor { if let Some(message) = message { if is_oom_error(&message) { debug!("LOGS | Got a runtime-specific OOM error. Incrementing OOM metric."); - // Prefer the `requestId` parsed from the log payload above - // (populated in LMI mode where the runtime stamps every JSON - // log with the in-flight request id). It is guaranteed to be - // correct for this log line, whereas `current_request_id()` - // depends on `PlatformStart` having already updated - // `invocation_context`, which races with fast OOM logs. Fall - // back to `current_request_id()` in OnDemand mode where - // `request_id` from the log payload is not extracted. 
+ // Prefer the `requestId` from the log payload (most accurate + // for this exact log line, no race with `PlatformStart`). Fall + // back to `current_request_id()` only when the payload + // doesn't carry it — i.e. text payloads, or JSON without a + // `requestId` field. let oom_request_id = request_id.clone().or_else(|| self.current_request_id()); if let Err(e) = self.event_bus.send(Event::OutOfMemory { request_id: oom_request_id, @@ -1971,7 +1967,7 @@ mod tests { } #[tokio::test] - async fn test_regular_lambda_does_not_extract_request_id() { + async fn test_regular_lambda_extracts_request_id_from_payload() { let tags = HashMap::from([("test".to_string(), "tags".to_string())]); let config = Arc::new(config::Config { service: Some("test-service".to_string()), @@ -1998,7 +1994,10 @@ mod tests { false, // Regular Lambda mode (not LMI) ); - // Test that requestId is NOT extracted in regular Lambda mode + // The payload `requestId` is extracted in both regular and LMI mode + // so that the OOM detector can dedup against the other detection paths + // by tagging `Event::OutOfMemory` with the request id of the exact log + // line that matched. 
let mut obj = serde_json::Map::new(); obj.insert( "requestId".to_string(), @@ -2015,8 +2014,10 @@ mod tests { }; let result = processor.get_message(event).await.unwrap(); - // Should be None because we're not in LMI mode - assert_eq!(result.lambda.request_id, None); + assert_eq!( + result.lambda.request_id, + Some("test-request-789".to_string()) + ); } #[test] From 0da7a59f2ae9a57278c2b6a256425f22ce320a13 Mon Sep 17 00:00:00 2001 From: Yiming Luo <10097700+lym953@users.noreply.github.com> Date: Tue, 2 Jun 2026 14:37:13 -0400 Subject: [PATCH 17/21] =?UTF-8?q?fix(integration):=20LMI=20OOM=20stack=20?= =?UTF-8?q?=E2=80=94=20meet=202=20GB=20minimum,=20use=20100=20GB=20single-?= =?UTF-8?q?shot=20allocator?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CDK stack create-function in CI failed for the new `lmi-oom` suite with 'MemorySize value failed to satisfy constraint: Lambda Managed Instance functions must have memory size greater than or equal to 2048'. LMI Lambda enforces a 2 GB floor. Bumping to 2048 MB exposes a second problem: the existing `oom-python` source allocates 10 MB strings in a loop, which on 2 GB either runs past the test budget or gets kernel SIGKILL'd silently before CPython raises MemoryError — and a clean MemoryError log line is exactly what Path 1 of the OOM detector needs to see. Adds `oom-python-lmi/lambda_function.py` with a single `bytearray(100 * 1024 ** 3)` allocation. 100 GB exceeds any reasonable Lambda memory cap by orders of magnitude, so CPython's allocator refuses immediately and raises a clean MemoryError without involving the cgroup OOM killer. Verified manually with `yiming-lmi-oom-debug` in us-east-1 (PR #1241 thread).
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../lambda/oom-python-lmi/lambda_function.py | 10 ++++++++++ integration-tests/lib/stacks/lmi-oom.ts | 12 ++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) create mode 100644 integration-tests/lambda/oom-python-lmi/lambda_function.py diff --git a/integration-tests/lambda/oom-python-lmi/lambda_function.py b/integration-tests/lambda/oom-python-lmi/lambda_function.py new file mode 100644 index 000000000..ad1d9b7d4 --- /dev/null +++ b/integration-tests/lambda/oom-python-lmi/lambda_function.py @@ -0,0 +1,10 @@ +# OOM reproducer for Python on LMI (memory >= 2048 MB). +# A single 100 GB bytearray allocation request exceeds any reasonable +# Lambda memory cap by orders of magnitude, so Python's C allocator +# refuses immediately and raises MemoryError before the kernel has a +# chance to involve the cgroup OOM killer with a SIGKILL. This is the +# fastest and most reliable way to surface a clean MemoryError log +# line on a 2 GB function. The 10 MB-loop pattern used by `oom-python` +# would either take too long or get SIGKILL'd silently at 2 GB. +def handler(event, context): + data = bytearray(100 * 1024 * 1024 * 1024) diff --git a/integration-tests/lib/stacks/lmi-oom.ts b/integration-tests/lib/stacks/lmi-oom.ts index cdeecd486..edd873c57 100644 --- a/integration-tests/lib/stacks/lmi-oom.ts +++ b/integration-tests/lib/stacks/lmi-oom.ts @@ -32,12 +32,16 @@ export class LmiOom extends cdk.Stack { runtime: defaultPythonRuntime, architecture: lambda.Architecture.ARM_64, handler: 'datadog_lambda.handler.handler', - code: lambda.Code.fromAsset('./lambda/oom-python'), + // LMI uses a different Python OOM source than `oom.ts`: at 2 GB the + // 10 MB-loop allocator either runs past the test budget or gets + // SIGKILL'd silently. A single 100 GB `bytearray` allocation request + // makes CPython's allocator refuse immediately and raise a clean + // `MemoryError`, which we need so Path 1 of the OOM detector fires. 
+ code: lambda.Code.fromAsset('./lambda/oom-python-lmi'), functionName: functionName, timeout: cdk.Duration.seconds(30), - // 256 MB — see `oom.ts` for why we don't use the customer's 192 MB - // (kernel OOM-kills the extension itself otherwise). - memorySize: 256, + // LMI requires memorySize >= 2048 (Lambda service validation). + memorySize: 2048, environment: { ...defaultDatadogEnvVariables, DD_SERVICE: functionName, From e79bfbf80cfc94f60c60380513523275efe6a199 Mon Sep 17 00:00:00 2001 From: Yiming Luo <10097700+lym953@users.noreply.github.com> Date: Tue, 2 Jun 2026 16:07:30 -0400 Subject: [PATCH 18/21] fix(integration): subtract 60s from OOM query windowStart to include rollup bucket MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `[lmi-oom]` suite failed 2x in a row on `0da7a59f` despite the metric being present in Datadog (verified via direct API query). Root cause: Datadog rolls `aws.lambda.enhanced.out_of_memory` into 10-second wall-clock-aligned buckets, and the `/api/v1/query` endpoint only returns buckets whose start timestamp is >= the `from` parameter. In the failing run, the LMI cold start was fast: `windowStart = Date.now()` ran at 19:32:11, the function OOMed at 19:32:18, both in the same bucket starting at 19:32:10. The bucket's timestamp (19:32:10) is less than `from = 19:32:11`, so the bucket is excluded. The test polled 21 times across 12 minutes and saw `count = 0` every time, while a direct query with a wider `from` returned `count = 1` for the same data point. Fix: pad `windowStart` 60 s earlier than the actual invoke time so the bucket containing the OOM is always included. The `deadline` budget still runs from `invokeTime`, not the padded value. Apply the same defensive change to `[oom]`. It hasn't flaked on this specifically yet but the same race is possible — workload-dependent. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- integration-tests/tests/lmi-oom.test.ts | 12 ++++++++++-- integration-tests/tests/oom.test.ts | 13 +++++++++++-- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/integration-tests/tests/lmi-oom.test.ts b/integration-tests/tests/lmi-oom.test.ts index 16191bb49..ed43e821d 100644 --- a/integration-tests/tests/lmi-oom.test.ts +++ b/integration-tests/tests/lmi-oom.test.ts @@ -31,14 +31,22 @@ describe('LMI OOM Integration Test', () => { let count = 0; beforeAll(async () => { - const windowStart = Date.now(); + const invokeTime = Date.now(); + // Subtract 60s from the query window's lower bound. Datadog rolls OOM + // metric data points into 10-second buckets aligned to wall-clock + // multiples; the bucket containing the OOM event is timestamped at the + // bucket *start*, and the query API only returns buckets whose start + // is >= the `from` parameter. If `windowStart == invokeTime` and the + // function OOMs in the same 10-second bucket (e.g. invoke 19:32:11.5, + // OOM 19:32:18 → bucket 19:32:10), the bucket would be excluded. + const windowStart = invokeTime - 60 * 1000; await invokeLambda(functionName).catch((err) => { throw new Error(`Invoke failed for ${functionName}: ${err}`); }); await sleep(INITIAL_WAIT_MS); - const deadline = windowStart + TOTAL_BUDGET_MS; + const deadline = invokeTime + TOTAL_BUDGET_MS; let attempt = 0; while (Date.now() < deadline) { attempt++; diff --git a/integration-tests/tests/oom.test.ts b/integration-tests/tests/oom.test.ts index 736a694dd..b00b80bff 100644 --- a/integration-tests/tests/oom.test.ts +++ b/integration-tests/tests/oom.test.ts @@ -65,7 +65,16 @@ describe('OOM Integration Tests', () => { let countsByRuntime: Record; beforeAll(async () => { - const windowStart = Date.now(); + const invokeTime = Date.now(); + // Subtract 60s from the query window's lower bound. 
Datadog rolls OOM + // metric data points into 10-second buckets aligned to wall-clock + // multiples and the API only returns buckets whose start timestamp is + // >= the `from` parameter. If the function OOMs in the same bucket as + // `invokeTime`, the bucket start (e.g. 19:32:10 for an invoke at + // 19:32:11.5) is excluded. The `[lmi-oom]` suite hit this on a fast + // LMI cold start; defensive in this suite too since the timing is + // workload-dependent. + const windowStart = invokeTime - 60 * 1000; await Promise.all( cases.map((c) => @@ -80,7 +89,7 @@ describe('OOM Integration Tests', () => { await sleep(INITIAL_WAIT_MS); - const deadline = windowStart + TOTAL_BUDGET_MS; + const deadline = invokeTime + TOTAL_BUDGET_MS; let counts: Record = {}; let attempt = 0; while (Date.now() < deadline) { From 631ed007e7f967824d03e6426cb8bbb937c448d8 Mon Sep 17 00:00:00 2001 From: Yiming Luo <10097700+lym953@users.noreply.github.com> Date: Tue, 2 Jun 2026 18:28:12 -0400 Subject: [PATCH 19/21] =?UTF-8?q?docs:=20update=20oom-go=20comment=20?= =?UTF-8?q?=E2=80=94=20192=20MB=20=E2=86=92=20256=20MB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per https://github.com/DataDog/datadog-lambda-extension/pull/1241#discussion_r3338125299 the comment in `oom-go/main.go` was written when `oomMemorySize` was still 192 MB; the stack has since been bumped to 256 MB (so the bottlecap extension has headroom and isn't OOM-killed itself, see the 256 MB rationale in `lib/stacks/oom.ts`). Updates the two stale '192 MB' references in the Go reproducer and adds a pointer to the canonical constant in the stack file so the next person who tweaks one place sees the other. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- integration-tests/lambda/oom-go/main.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/integration-tests/lambda/oom-go/main.go b/integration-tests/lambda/oom-go/main.go index a76960ccf..dd95274a8 100644 --- a/integration-tests/lambda/oom-go/main.go +++ b/integration-tests/lambda/oom-go/main.go @@ -1,8 +1,9 @@ // OOM reproducer for Go. // Allocates a 500 MB byte slice in a single shot, then writes to every page -// to force physical commit. On a 192 MB Lambda this immediately exceeds the -// cgroup memory limit and the kernel SIGKILLs the process, producing a -// PlatformReport with max_memory_used_mb == memory_size_mb. The Go runtime +// to force physical commit. On a 256 MB Lambda (see `oomMemorySize` in +// `lib/stacks/oom.ts`) this immediately exceeds the cgroup memory limit and +// the kernel SIGKILLs the process, producing a PlatformReport with +// max_memory_used_mb == memory_size_mb. The Go runtime // also typically prints "fatal error: runtime: out of memory" on the way // down — bottlecap's runtime-specific log-line detection matches that // message. 
Per-Context dedup ensures the OOM metric increments only once @@ -21,7 +22,7 @@ func handler() error { for i := range b { b[i] = byte(i % 256) } - log.Println("did not OOM — unexpected") // unreachable on a 192 MB Lambda + log.Println("did not OOM — unexpected") // unreachable on a 256 MB Lambda return nil } From 1813683b18a267eabe79533e2a44f48cae6585ea Mon Sep 17 00:00:00 2001 From: Yiming Luo <10097700+lym953@users.noreply.github.com> Date: Thu, 4 Jun 2026 10:23:05 -0400 Subject: [PATCH 20/21] docs(logs): condense OOM requestId comments in LambdaProcessor::get_message --- bottlecap/src/logs/lambda/processor.rs | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/bottlecap/src/logs/lambda/processor.rs b/bottlecap/src/logs/lambda/processor.rs index 7dc84f672..88397b280 100644 --- a/bottlecap/src/logs/lambda/processor.rs +++ b/bottlecap/src/logs/lambda/processor.rs @@ -189,12 +189,6 @@ impl LambdaProcessor { TelemetryRecord::Function(v) => { let (request_id, message, durable_ctx) = match v { serde_json::Value::Object(obj) => { - // Extract `requestId` (or `AWSRequestId`) from the log payload - // when present. Lambda runtimes that emit structured JSON - // logs (Python; Node/Ruby/Java/.NET when JSON log format is - // configured) stamp every log line with the in-flight - // request id, which is the most accurate source — it doesn't - // race with the in-processor `PlatformStart` handler. let request_id = obj.get("requestId") .or_else(|| obj.get("AWSRequestId")) .and_then(|v| v.as_str()) @@ -215,11 +209,10 @@ impl LambdaProcessor { if let Some(message) = message { if is_oom_error(&message) { debug!("LOGS | Got a runtime-specific OOM error. Incrementing OOM metric."); - // Prefer the `requestId` from the log payload (most accurate - // for this exact log line, no race with `PlatformStart`). Fall - // back to `current_request_id()` only when the payload - // doesn't carry it — i.e. text payloads, or JSON without a - // `requestId` field.
+ // Prefer the `requestId` from the log payload (most accurate for this exact log line). + // Fall back to `current_request_id()` only when the payload doesn't carry it — + // i.e., for text payloads, or JSON without a `requestId` field. + let oom_request_id = request_id.clone().or_else(|| self.current_request_id()); if let Err(e) = self.event_bus.send(Event::OutOfMemory { request_id: oom_request_id, From 4907156e5151125ce18d932791d8e15596c65b6d Mon Sep 17 00:00:00 2001 From: Yiming Luo <10097700+lym953@users.noreply.github.com> Date: Thu, 4 Jun 2026 10:27:16 -0400 Subject: [PATCH 21/21] Add function.zip to .gitignore --- integration-tests/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/integration-tests/.gitignore b/integration-tests/.gitignore index 4480601bd..26c40de61 100644 --- a/integration-tests/.gitignore +++ b/integration-tests/.gitignore @@ -38,6 +38,7 @@ Thumbs.db # Lambda artifacts response.json lambda-bundle.zip +lambda/*/function.zip # Lambda build outputs lambda/*/target/