diff --git a/.gitlab/datasources/test-suites.yaml b/.gitlab/datasources/test-suites.yaml index 257b1ba04..fd85ba478 100644 --- a/.gitlab/datasources/test-suites.yaml +++ b/.gitlab/datasources/test-suites.yaml @@ -4,3 +4,5 @@ test_suites: - name: snapstart - name: lmi - name: auth + - name: oom + - name: lmi-oom diff --git a/.gitlab/templates/pipeline.yaml.tpl b/.gitlab/templates/pipeline.yaml.tpl index 60788606b..a87bfaa14 100644 --- a/.gitlab/templates/pipeline.yaml.tpl +++ b/.gitlab/templates/pipeline.yaml.tpl @@ -505,6 +505,40 @@ build node lambdas: - cd integration-tests - ./scripts/build-node.sh +build ruby lambdas: + stage: integration-tests + image: registry.ddbuild.io/images/docker:27.3.1 + tags: ["docker-in-docker:arm64"] + rules: + - when: on_success + needs: [] + artifacts: + expire_in: 1 hour + paths: + - integration-tests/lambda/*/*.rb + script: + - cd integration-tests + - ./scripts/build-ruby.sh + +build go lambdas: + stage: integration-tests + image: registry.ddbuild.io/images/docker:27.3.1 + tags: ["docker-in-docker:arm64"] + rules: + - when: on_success + needs: [] + cache: + key: go-mod-cache-${CI_COMMIT_REF_SLUG} + paths: + - integration-tests/.cache/go-mod/ + artifacts: + expire_in: 1 hour + paths: + - integration-tests/lambda/*/bin/bootstrap + script: + - cd integration-tests + - ./scripts/build-go.sh + # Integration Tests - Publish arm64 layer with integration test prefix publish integration layer (arm64): stage: integration-tests @@ -581,12 +615,16 @@ integration-suite: - build dotnet lambdas - build python lambdas - build node lambdas + - build ruby lambdas + - build go lambdas dependencies: - publish integration layer (arm64) - build java lambdas - build dotnet lambdas - build python lambdas - build node lambdas + - build ruby lambdas + - build go lambdas variables: IDENTIFIER: ${CI_COMMIT_SHORT_SHA} AWS_DEFAULT_REGION: us-east-1 diff --git a/bottlecap/src/bin/bottlecap/main.rs b/bottlecap/src/bin/bottlecap/main.rs index 
f7e7d9380..acfbf444c 100644 --- a/bottlecap/src/bin/bottlecap/main.rs +++ b/bottlecap/src/bin/bottlecap/main.rs @@ -841,9 +841,12 @@ async fn handle_event_bus_event( stats_concentrator: StatsConcentratorHandle, ) -> Option { match event { - Event::OutOfMemory(event_timestamp) => { + Event::OutOfMemory { + request_id, + timestamp, + } => { if let Err(e) = invocation_processor_handle - .on_out_of_memory_error(event_timestamp) + .on_out_of_memory_error(request_id, timestamp) .await { error!("Failed to send out of memory error to processor: {}", e); diff --git a/bottlecap/src/event_bus/mod.rs b/bottlecap/src/event_bus/mod.rs index 0ea20969e..0be3a86ca 100644 --- a/bottlecap/src/event_bus/mod.rs +++ b/bottlecap/src/event_bus/mod.rs @@ -7,7 +7,13 @@ mod constants; #[derive(Debug)] pub enum Event { Telemetry(TelemetryEvent), - OutOfMemory(i64), + OutOfMemory { + /// Lambda `request_id` of the invocation the OOM belongs to, when known. + /// Used by the invocation processor to dedupe against other OOM detection + /// paths (`PlatformRuntimeDone` `error_type`, `PlatformReport` memory equality). + request_id: Option, + timestamp: i64, + }, Tombstone, } diff --git a/bottlecap/src/lifecycle/invocation/context.rs b/bottlecap/src/lifecycle/invocation/context.rs index 04894f9c6..3aef5e4bf 100644 --- a/bottlecap/src/lifecycle/invocation/context.rs +++ b/bottlecap/src/lifecycle/invocation/context.rs @@ -43,6 +43,12 @@ pub struct Context { /// tracing. /// pub extracted_span_context: Option, + /// Whether the `aws.lambda.enhanced.out_of_memory` metric has already been + /// emitted for this invocation. Multiple detection paths can fire for the + /// same OOM (runtime log, `Runtime.OutOfMemory` `error_type` in + /// `PlatformRuntimeDone`, `max_memory_used == memory_size` in `PlatformReport`); + /// this flag dedupes them. + pub oom_emitted: bool, } /// Struct containing the information needed to reparent a span. 
@@ -94,6 +100,7 @@ impl Default for Context { snapstart_restore_span: None, tracer_span: None, extracted_span_context: None, + oom_emitted: false, } } } diff --git a/bottlecap/src/lifecycle/invocation/processor.rs b/bottlecap/src/lifecycle/invocation/processor.rs index ec1af99ab..7f7336bb8 100644 --- a/bottlecap/src/lifecycle/invocation/processor.rs +++ b/bottlecap/src/lifecycle/invocation/processor.rs @@ -508,7 +508,7 @@ impl Processor { debug!( "Invocation Processor | PlatformRuntimeDone | Got Runtime.OutOfMemory. Incrementing OOM metric." ); - self.enhanced_metrics.increment_oom_metric(timestamp); + self.try_increment_oom_metric(Some(request_id), timestamp); } } @@ -909,25 +909,25 @@ impl Processor { /// Handles `OnDemand` mode platform report processing. /// - /// Processes OnDemand-specific metrics including OOM detection for provided.al runtimes - /// and post-runtime duration calculation. + /// Processes OnDemand-specific metrics including OOM detection by memory-size + /// equality and post-runtime duration calculation. fn handle_ondemand_report( &mut self, request_id: &String, metrics: OnDemandReportMetrics, timestamp: i64, ) { - // For provided.al runtimes, if the last invocation hit the memory limit, increment the OOM metric. - // We do this for provided.al runtimes because we didn't find another way to detect this under provided.al. - // We don't do this for other runtimes to avoid double counting. - if let Some(runtime) = &self.runtime - && runtime.starts_with("provided.al") - && metrics.max_memory_used_mb == metrics.memory_size_mb - { + // If the invocation hit the memory limit, increment the OOM metric. This catches + // OOM-induced failures that don't surface through a runtime-specific log line or a + // `Runtime.OutOfMemory` error_type — most notably the suppressed-init / timeout-at-cap + // pattern reported in datadog-lambda-extension#1237 (Node). 
Best-effort dedup + // against the other two detection paths is handled by `try_increment_oom_metric` + // (it can still double-count in edge cases — see that function's doc). + if metrics.max_memory_used_mb == metrics.memory_size_mb { debug!( "Invocation Processor | PlatformReport | Last invocation hit memory limit. Incrementing OOM metric." ); - self.enhanced_metrics.increment_oom_metric(timestamp); + self.try_increment_oom_metric(Some(request_id), timestamp); } // Calculate and set post-runtime duration if context is available @@ -1395,7 +1395,52 @@ impl Processor { Some(error_tags) } - pub fn on_out_of_memory_error(&mut self, timestamp: i64) { + pub fn on_out_of_memory_error(&mut self, request_id: Option<&String>, timestamp: i64) { + self.try_increment_oom_metric(request_id, timestamp); + } + + /// Best-effort dedup wrapper around `enhanced_metrics.increment_oom_metric`. + /// The metric MAY be double-counted in edge cases — see below. + /// + /// Several detection paths can fire for the same invocation: + /// 1. A runtime-specific OOM log line (logs processor → `Event::OutOfMemory`) + /// 2. `error_type == "Runtime.OutOfMemory"` in `PlatformRuntimeDone` + /// 3. `max_memory_used_mb == memory_size_mb` in `PlatformReport` + /// + /// When `request_id` is supplied AND the matching context is still in the + /// buffer, the per-invocation `Context::oom_emitted` flag guarantees one + /// emission per `request_id`. The metric is double-counted when either: + /// - `request_id` is `None` (log line beat `PlatformStart` to + /// `LambdaProcessor`, or it landed after `PlatformRuntimeDone` cleared + /// the slot) and another path subsequently emits with `Some(rid)`; or + /// - the context has been evicted from the buffer (capacity is fixed — + /// see `MAX_CONTEXT_BUFFER_SIZE`) between `PlatformStart` and this + /// call, so the flag has nowhere to live. + /// + /// Both branches still emit (so OOMs are never under-counted) and log a + /// `debug!` line. 
+ fn try_increment_oom_metric(&mut self, request_id: Option<&String>, timestamp: i64) { + if let Some(rid) = request_id { + if let Some(ctx) = self.context_buffer.get_mut(rid) { + if ctx.oom_emitted { + debug!( + "Invocation Processor | OOM metric already emitted for request_id {}, skipping", + rid + ); + return; + } + ctx.oom_emitted = true; + } else { + debug!( + "Invocation Processor | Emitting OOM metric without dedup: context not found for request_id {} (likely evicted from context buffer)", + rid + ); + } + } else { + debug!( + "Invocation Processor | Emitting OOM metric without dedup: no request_id available (OOM log processed before PlatformStart or after PlatformRuntimeDone)" + ); + } self.enhanced_metrics.increment_oom_metric(timestamp); } @@ -2445,4 +2490,136 @@ mod tests { "pre-existing _dd.appsec.enabled value must not be overwritten" ); } + + /// Two OOM signals for the same `request_id` increment the metric exactly once. + /// Exercises the `Context::oom_emitted` dedup flag. + #[tokio::test] + async fn test_try_increment_oom_metric_dedupes_same_request_id() { + let mut p = setup(); + // Insert the context directly so we don't go through `on_invoke_event`, which + // would populate dynamic tags (`cold_start:true`) and complicate the query. 
+ let request_id = String::from("req-dedup"); + p.context_buffer.start_context(&request_id, Span::default()); + + let now: i64 = std::time::UNIX_EPOCH + .elapsed() + .expect("clock") + .as_secs() + .try_into() + .unwrap_or_default(); + + p.on_out_of_memory_error(Some(&request_id), now); + p.on_out_of_memory_error(Some(&request_id), now); + + let ts = (now / 10) * 10; + let entry = p + .enhanced_metrics + .aggr_handle + .get_entry_by_id( + crate::metrics::enhanced::constants::OUT_OF_MEMORY_METRIC.into(), + None, + ts, + ) + .await + .unwrap() + .expect("OOM metric must be emitted at least once"); + + let sketch = entry.value.get_sketch().expect("distribution sketch"); + let sum = sketch.sum().expect("sketch sum"); + assert!( + (sum - 1.0).abs() < f64::EPSILON, + "OOM sum must be 1.0 (deduped), got {sum}" + ); + + // And the context flag should now reflect that we emitted. + assert!( + p.context_buffer + .get(&request_id) + .expect("context") + .oom_emitted, + "oom_emitted flag must be set after the first emission" + ); + } + + /// OOM signals for different `request_id`s each emit a metric — dedup is scoped + /// per request, not globally. 
+ #[tokio::test] + async fn test_try_increment_oom_metric_distinct_request_ids_emit_separately() { + let mut p = setup(); + let req1 = String::from("req-a"); + let req2 = String::from("req-b"); + p.context_buffer.start_context(&req1, Span::default()); + p.context_buffer.start_context(&req2, Span::default()); + + let now: i64 = std::time::UNIX_EPOCH + .elapsed() + .expect("clock") + .as_secs() + .try_into() + .unwrap_or_default(); + + p.on_out_of_memory_error(Some(&req1), now); + p.on_out_of_memory_error(Some(&req2), now); + + let ts = (now / 10) * 10; + let entry = p + .enhanced_metrics + .aggr_handle + .get_entry_by_id( + crate::metrics::enhanced::constants::OUT_OF_MEMORY_METRIC.into(), + None, + ts, + ) + .await + .unwrap() + .expect("OOM metric must be emitted"); + + let sketch = entry.value.get_sketch().expect("distribution sketch"); + let sum = sketch.sum().expect("sketch sum"); + assert!( + (sum - 2.0).abs() < f64::EPSILON, + "OOM sum must be 2.0 (one per request_id), got {sum}" + ); + } + + /// In `handle_ondemand_report`, when `max_memory_used_mb == memory_size_mb`, + /// the OOM metric is emitted for that invocation (presence only; the count is not asserted). 
+ #[tokio::test] + async fn test_handle_ondemand_report_emits_oom_on_memory_equality() { + let mut p = setup(); + let request_id = String::from("req-eq"); + p.context_buffer.start_context(&request_id, Span::default()); + + let now: i64 = std::time::UNIX_EPOCH + .elapsed() + .expect("clock") + .as_secs() + .try_into() + .unwrap_or_default(); + + let metrics = OnDemandReportMetrics { + duration_ms: 100.0, + billed_duration_ms: 100, + memory_size_mb: 1024, + max_memory_used_mb: 1024, + init_duration_ms: None, + restore_duration_ms: None, + }; + p.handle_ondemand_report(&request_id, metrics, now); + + let ts = (now / 10) * 10; + assert!( + p.enhanced_metrics + .aggr_handle + .get_entry_by_id( + crate::metrics::enhanced::constants::OUT_OF_MEMORY_METRIC.into(), + None, + ts + ) + .await + .unwrap() + .is_some(), + "OOM must be emitted when max_memory_used_mb == memory_size_mb" + ); + } } diff --git a/bottlecap/src/lifecycle/invocation/processor_service.rs b/bottlecap/src/lifecycle/invocation/processor_service.rs index a41a95b26..61c48b479 100644 --- a/bottlecap/src/lifecycle/invocation/processor_service.rs +++ b/bottlecap/src/lifecycle/invocation/processor_service.rs @@ -118,6 +118,7 @@ pub enum ProcessorCommand { execution_status: Option, }, OnOutOfMemoryError { + request_id: Option, timestamp: i64, }, OnShutdownEvent, @@ -407,10 +408,14 @@ impl InvocationProcessorHandle { pub async fn on_out_of_memory_error( &self, + request_id: Option, timestamp: i64, ) -> Result<(), mpsc::error::SendError> { self.sender - .send(ProcessorCommand::OnOutOfMemoryError { timestamp }) + .send(ProcessorCommand::OnOutOfMemoryError { + request_id, + timestamp, + }) .await } @@ -632,8 +637,12 @@ impl InvocationProcessorService { ) .await; } - ProcessorCommand::OnOutOfMemoryError { timestamp } => { - self.processor.on_out_of_memory_error(timestamp); + ProcessorCommand::OnOutOfMemoryError { + request_id, + timestamp, + } => { + self.processor + .on_out_of_memory_error(request_id.as_ref(), 
timestamp); } ProcessorCommand::OnShutdownEvent => { self.processor.on_shutdown_event(); diff --git a/bottlecap/src/logs/lambda/processor.rs b/bottlecap/src/logs/lambda/processor.rs index 643e32854..88397b280 100644 --- a/bottlecap/src/logs/lambda/processor.rs +++ b/bottlecap/src/logs/lambda/processor.rs @@ -163,6 +163,25 @@ impl LambdaProcessor { } } + /// Returns the `request_id` of the currently-active invocation, if known. + /// Used by the OOM log-line detector as a fallback when the matched log + /// payload doesn't carry a `requestId` field (text payloads — Node V8 + /// fatal output, Go fatal, Java stderr, etc.). The payload field is + /// preferred when available because it matches the log line exactly and + /// doesn't race with `PlatformStart`. + /// + /// `invocation_context.request_id` is set when this processor handles + /// `PlatformStart` and cleared on `PlatformRuntimeDone` / `PlatformReport`, + /// so this returns `None` when an OOM log line is processed outside that + /// window. + fn current_request_id(&self) -> Option { + if self.invocation_context.request_id.is_empty() { + None + } else { + Some(self.invocation_context.request_id.clone()) + } + } + #[allow(clippy::too_many_lines)] async fn get_message(&mut self, event: TelemetryEvent) -> Result> { let copy = event.clone(); @@ -170,14 +189,10 @@ impl LambdaProcessor { TelemetryRecord::Function(v) => { let (request_id, message, durable_ctx) = match v { serde_json::Value::Object(obj) => { - let request_id = if self.is_managed_instance_mode { - obj.get("requestId") - .or_else(|| obj.get("AWSRequestId")) - .and_then(|v| v.as_str()) - .map(ToString::to_string) - } else { - None - }; + let request_id = obj.get("requestId") + .or_else(|| obj.get("AWSRequestId")) + .and_then(|v| v.as_str()) + .map(ToString::to_string); // When a message is logged from the durable execution SDK, it contains an `executionArn` field. 
// In this case, extract the durable execution context from the `executionArn` field, and later // set durable execution id and name as log attributes. @@ -194,7 +209,15 @@ impl LambdaProcessor { if let Some(message) = message { if is_oom_error(&message) { debug!("LOGS | Got a runtime-specific OOM error. Incrementing OOM metric."); - if let Err(e) = self.event_bus.send(Event::OutOfMemory(event.time.timestamp())).await { + // Prefer the `requestId` from the log payload (most accurate for this exact log line). + // Fall back to `current_request_id()` only when the payload doesn't carry it — + // i.e., for text payloads, or JSON without a `requestId` field. + + let oom_request_id = request_id.clone().or_else(|| self.current_request_id()); + if let Err(e) = self.event_bus.send(Event::OutOfMemory { + request_id: oom_request_id, + timestamp: event.time.timestamp(), + }).await { error!("LOGS | Failed to send OOM event to the main event bus: {e}"); } } @@ -206,7 +229,7 @@ impl LambdaProcessor { event.time.timestamp_millis(), None, ); - // If the message is logged from the durable execution SDK, + // If the message is logged from the durable execution SDK, // set durable execution id and name as log attributes. if let Some((exec_id, exec_name)) = durable_ctx { msg.lambda.durable_execution_id = Some(exec_id); @@ -227,7 +250,10 @@ impl LambdaProcessor { if let Some(message) = message { if is_oom_error(&message) { debug!("LOGS | Got a runtime-specific OOM error. 
Incrementing OOM metric."); - if let Err(e) = self.event_bus.send(Event::OutOfMemory(event.time.timestamp())).await { + if let Err(e) = self.event_bus.send(Event::OutOfMemory { + request_id: self.current_request_id(), + timestamp: event.time.timestamp(), + }).await { error!("LOGS | Failed to send OOM event to the main event bus: {e}"); } } @@ -1934,7 +1960,7 @@ mod tests { } #[tokio::test] - async fn test_regular_lambda_does_not_extract_request_id() { + async fn test_regular_lambda_extracts_request_id_from_payload() { let tags = HashMap::from([("test".to_string(), "tags".to_string())]); let config = Arc::new(config::Config { service: Some("test-service".to_string()), @@ -1961,7 +1987,10 @@ mod tests { false, // Regular Lambda mode (not LMI) ); - // Test that requestId is NOT extracted in regular Lambda mode + // The payload `requestId` is extracted in both regular and LMI mode + // so that the OOM detector can dedup against the other detection paths + // by tagging `Event::OutOfMemory` with the request id of the exact log + // line that matched. let mut obj = serde_json::Map::new(); obj.insert( "requestId".to_string(), @@ -1978,8 +2007,10 @@ mod tests { }; let result = processor.get_message(event).await.unwrap(); - // Should be None because we're not in LMI mode - assert_eq!(result.lambda.request_id, None); + assert_eq!( + result.lambda.request_id, + Some("test-request-789".to_string()) + ); } #[test] diff --git a/bottlecap/src/metrics/enhanced/lambda.rs b/bottlecap/src/metrics/enhanced/lambda.rs index abed7d5b9..9260064c0 100644 --- a/bottlecap/src/metrics/enhanced/lambda.rs +++ b/bottlecap/src/metrics/enhanced/lambda.rs @@ -91,12 +91,12 @@ impl Lambda { self.increment_metric(constants::TIMEOUTS_METRIC, timestamp); } - // This function is called in three cases: - // 1. Runtime-specific OOM error (can happen in .NET, Node.js and Java as far as we know) - // 2. 
PlatformRuntimeDone event reports "error_type: Runtime.OutOfMemory" (can happen in Ruby and Python as far as we know) - // 3. PlatformReport event reports "max_memory_used_mb == memory_size_mb" (can happen in many runtimes, but - // we only call increment_oom_metric() for provided.al runtimes) - // This is our best effort to cover different cases without double counting. We can adjust this if we find more cases. + // Callers should generally go through `Processor::try_increment_oom_metric`, + // which provides best-effort dedup by `request_id` (see its doc for the + // edge cases that can still double-count). The three detection paths are: + // 1. Runtime-specific OOM log line (.NET, Node, Java, Go, Ruby, Python) + // 2. PlatformRuntimeDone with error_type == "Runtime.OutOfMemory" (Node, Ruby, Python) + // 3. PlatformReport with max_memory_used_mb == memory_size_mb (all runtimes) pub fn increment_oom_metric(&self, timestamp: i64) { self.increment_metric(constants::OUT_OF_MEMORY_METRIC, timestamp); } diff --git a/integration-tests/.gitignore b/integration-tests/.gitignore index 4480601bd..26c40de61 100644 --- a/integration-tests/.gitignore +++ b/integration-tests/.gitignore @@ -38,6 +38,7 @@ Thumbs.db # Lambda artifacts response.json lambda-bundle.zip +lambda/*/function.zip # Lambda build outputs lambda/*/target/ diff --git a/integration-tests/bin/app.ts b/integration-tests/bin/app.ts index 1006b47d9..433c3a1d6 100644 --- a/integration-tests/bin/app.ts +++ b/integration-tests/bin/app.ts @@ -6,6 +6,8 @@ import {Otlp} from '../lib/stacks/otlp'; import {Snapstart} from '../lib/stacks/snapstart'; import {LambdaManagedInstancesStack} from '../lib/stacks/lmi'; import {AuthStack} from '../lib/stacks/auth'; +import {Oom} from '../lib/stacks/oom'; +import {LmiOom} from '../lib/stacks/lmi-oom'; import {CustomMetrics} from '../lib/stacks/custom-metrics'; import {AuthRoleStack} from '../lib/auth-role'; import {ACCOUNT, getIdentifier, REGION} from '../config'; @@ -41,6 +43,12 @@ 
const stacks = [ new AuthStack(app, `integ-${identifier}-auth`, { env, }), + new Oom(app, `integ-${identifier}-oom`, { + env, + }), + new LmiOom(app, `integ-${identifier}-lmi-oom`, { + env, + }), new CustomMetrics(app, `integ-${identifier}-custom-metrics`, { env, }), diff --git a/integration-tests/lambda/oom-dotnet/Function.cs b/integration-tests/lambda/oom-dotnet/Function.cs new file mode 100644 index 000000000..b5c861493 --- /dev/null +++ b/integration-tests/lambda/oom-dotnet/Function.cs @@ -0,0 +1,25 @@ +using Amazon.Lambda.Core; +using System.Collections.Generic; +using System.Text.Json; + +[assembly: LambdaSerializer(typeof(Amazon.Lambda.Serialization.SystemTextJson.DefaultLambdaJsonSerializer))] + +namespace Function +{ + /// + /// OOM reproducer for .NET. Allocates and retains 10 MB byte arrays in a list + /// until the CLR throws System.OutOfMemoryException. Bottlecap's runtime-specific + /// log-line detection matches "OutOfMemoryException". + /// + public class Handler + { + public Dictionary FunctionHandler(JsonElement input, ILambdaContext context) + { + var data = new List(); + while (true) + { + data.Add(new byte[10 * 1024 * 1024]); + } + } + } +} diff --git a/integration-tests/lambda/oom-dotnet/Function.csproj b/integration-tests/lambda/oom-dotnet/Function.csproj new file mode 100644 index 000000000..2dfcbac5f --- /dev/null +++ b/integration-tests/lambda/oom-dotnet/Function.csproj @@ -0,0 +1,14 @@ + + + net8.0 + enable + enable + true + Lambda + true + + + + + + diff --git a/integration-tests/lambda/oom-go/go.mod b/integration-tests/lambda/oom-go/go.mod new file mode 100644 index 000000000..a73b6d85d --- /dev/null +++ b/integration-tests/lambda/oom-go/go.mod @@ -0,0 +1,5 @@ +module oom-go + +go 1.22 + +require github.com/aws/aws-lambda-go v1.49.0 diff --git a/integration-tests/lambda/oom-go/go.sum b/integration-tests/lambda/oom-go/go.sum new file mode 100644 index 000000000..a5b506ab1 --- /dev/null +++ b/integration-tests/lambda/oom-go/go.sum @@ -0,0 
+1,10 @@ +github.com/aws/aws-lambda-go v1.49.0 h1:z4VhTqkFZPM3xpEtTqWqRqsRH4TZBMJqTkRiBPYLqIQ= +github.com/aws/aws-lambda-go v1.49.0/go.mod h1:dpMpZgvWx5vuQJfBt0zqBha60q7Dd7RfgJv23DymV8A= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.7.2 h1:4jaiDzPyXQvSd7D0EjG45355tLlV3VOECpq10pLC+8s= +github.com/stretchr/testify v1.7.2/go.mod h1:R6va5+xMeoiuVRoj+gSkQ7d3FALtqAAGI1FQKckRals= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/integration-tests/lambda/oom-go/main.go b/integration-tests/lambda/oom-go/main.go new file mode 100644 index 000000000..dd95274a8 --- /dev/null +++ b/integration-tests/lambda/oom-go/main.go @@ -0,0 +1,31 @@ +// OOM reproducer for Go. +// Allocates a 500 MB byte slice in a single shot, then writes to every page +// to force physical commit. On a 256 MB Lambda (see `oomMemorySize` in +// `lib/stacks/oom.ts`) this immediately exceeds the cgroup memory limit and +// the kernel SIGKILLs the process, producing a PlatformReport with +// max_memory_used_mb == memory_size_mb. The Go runtime +// also typically prints "fatal error: runtime: out of memory" on the way +// down — bottlecap's runtime-specific log-line detection matches that +// message. Per-Context dedup ensures the OOM metric increments only once +// even if both paths fire. 
+package main + +import ( + "log" + + "github.com/aws/aws-lambda-go/lambda" +) + +func handler() error { + log.Println("OOM reproducer: allocating 500 MB") + b := make([]byte, 500*1024*1024) + for i := range b { + b[i] = byte(i % 256) + } + log.Println("did not OOM — unexpected") // unreachable on a 256 MB Lambda + return nil +} + +func main() { + lambda.Start(handler) +} diff --git a/integration-tests/lambda/oom-java/pom.xml b/integration-tests/lambda/oom-java/pom.xml new file mode 100644 index 000000000..1ead70ea0 --- /dev/null +++ b/integration-tests/lambda/oom-java/pom.xml @@ -0,0 +1,50 @@ + + + 4.0.0 + + example + oom-java-lambda + 1.0.0 + jar + + OOM Java Lambda + Java Lambda function that triggers OutOfMemoryError for integration tests + + + 21 + 21 + UTF-8 + + + + + com.amazonaws + aws-lambda-java-core + 1.4.0 + + + + + + + org.apache.maven.plugins + maven-shade-plugin + 3.5.0 + + + package + + shade + + + function + false + + + + + + + diff --git a/integration-tests/lambda/oom-java/src/main/java/example/Handler.java b/integration-tests/lambda/oom-java/src/main/java/example/Handler.java new file mode 100644 index 000000000..92edb9c18 --- /dev/null +++ b/integration-tests/lambda/oom-java/src/main/java/example/Handler.java @@ -0,0 +1,24 @@ +package example; + +import com.amazonaws.services.lambda.runtime.Context; +import com.amazonaws.services.lambda.runtime.RequestHandler; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +/** + * OOM reproducer for Java. Allocates and retains 10 MB byte arrays in a list + * until the JVM throws java.lang.OutOfMemoryError: Java heap space. + * Bottlecap's runtime-specific log-line detection matches + * "java.lang.OutOfMemoryError". 
+ */ +public class Handler implements RequestHandler, Map> { + + @Override + public Map handleRequest(Map event, Context context) { + List data = new ArrayList<>(); + while (true) { + data.add(new byte[10 * 1024 * 1024]); + } + } +} diff --git a/integration-tests/lambda/oom-node-sigkill/index.mjs b/integration-tests/lambda/oom-node-sigkill/index.mjs new file mode 100644 index 000000000..d6b245f36 --- /dev/null +++ b/integration-tests/lambda/oom-node-sigkill/index.mjs @@ -0,0 +1,13 @@ +// OOM reproducer: off-heap Buffer growth → kernel SIGKILL. +// Buffer.allocUnsafe(>8KB) goes through V8's ArrayBuffer allocator (external +// memory) and bypasses --max-old-space-size, so RSS grows until the cgroup +// limit triggers a kernel SIGKILL. Lambda surfaces this as PlatformRuntimeDone +// with error_type=Runtime.OutOfMemory — bottlecap's path 2 detection. +export const handler = async () => { + const bufs = []; + while (true) { + const b = Buffer.allocUnsafe(20 * 1024 * 1024); + b.fill(0); + bufs.push(b); + } +}; diff --git a/integration-tests/lambda/oom-node-v8-heap/index.mjs b/integration-tests/lambda/oom-node-v8-heap/index.mjs new file mode 100644 index 000000000..fb4e71c6f --- /dev/null +++ b/integration-tests/lambda/oom-node-v8-heap/index.mjs @@ -0,0 +1,10 @@ +// OOM reproducer: classic V8 heap exhaustion. Allocates retained strings in a +// loop until V8 hits its --max-old-space-size cap and prints +// "FATAL ERROR: ... JavaScript heap out of memory". Exercises bottlecap's +// runtime-specific log-line OOM detection path. 
+export const handler = async () => { + const arr = []; + while (true) { + arr.push('x'.repeat(10 * 1024 * 1024)); + } +}; diff --git a/integration-tests/lambda/oom-python-lmi/lambda_function.py b/integration-tests/lambda/oom-python-lmi/lambda_function.py new file mode 100644 index 000000000..ad1d9b7d4 --- /dev/null +++ b/integration-tests/lambda/oom-python-lmi/lambda_function.py @@ -0,0 +1,10 @@ +# OOM reproducer for Python on LMI (memory >= 2048 MB). +# A single 100 GB bytearray allocation request exceeds any reasonable +# Lambda memory cap by orders of magnitude, so Python's C allocator +# refuses immediately and raises MemoryError before the kernel has a +# chance to involve the cgroup OOM killer with a SIGKILL. This is the +# fastest and most reliable way to surface a clean MemoryError log +# line on a 2 GB function. The 10 MB-loop pattern used by `oom-python` +# would either take too long or get SIGKILL'd silently at 2 GB. +def handler(event, context): + data = bytearray(100 * 1024 * 1024 * 1024) diff --git a/integration-tests/lambda/oom-python/lambda_function.py b/integration-tests/lambda/oom-python/lambda_function.py new file mode 100644 index 000000000..12aa196ed --- /dev/null +++ b/integration-tests/lambda/oom-python/lambda_function.py @@ -0,0 +1,12 @@ +# OOM reproducer for Python. +# Allocates and retains 10 MB strings in a list until CPython raises +# MemoryError. Lambda surfaces this as PlatformRuntimeDone with +# error_type=Runtime.OutOfMemory; the function log line also contains +# "MemoryError". Both bottlecap detection paths fire — the dedup flag is +# what makes the OOM metric emit exactly once. 
+ + +def handler(event, context): + data = [] + while True: + data.append("x" * (10 * 1024 * 1024)) diff --git a/integration-tests/lambda/oom-ruby/lambda_function.rb b/integration-tests/lambda/oom-ruby/lambda_function.rb new file mode 100644 index 000000000..674a70086 --- /dev/null +++ b/integration-tests/lambda/oom-ruby/lambda_function.rb @@ -0,0 +1,13 @@ +# OOM reproducer for Ruby. +# Allocates and retains 10 MB strings in an array until Ruby raises +# NoMemoryError. Lambda surfaces this as PlatformRuntimeDone with +# error_type=Runtime.OutOfMemory; the function log line also contains +# "failed to allocate memory (NoMemoryError)". Both bottlecap detection +# paths fire — the dedup flag is what makes the OOM metric emit exactly once. + +def handler(event:, context:) + data = [] + loop do + data << ("x" * (10 * 1024 * 1024)) + end +end diff --git a/integration-tests/lib/stacks/lmi-oom.ts b/integration-tests/lib/stacks/lmi-oom.ts new file mode 100644 index 000000000..edd873c57 --- /dev/null +++ b/integration-tests/lib/stacks/lmi-oom.ts @@ -0,0 +1,59 @@ +import * as cdk from 'aws-cdk-lib'; +import * as lambda from 'aws-cdk-lib/aws-lambda'; +import { Construct } from 'constructs'; +import { + createLogGroup, + setCapacityProvider, + defaultDatadogEnvVariables, + defaultDatadogSecretPolicy, + getExtensionLayer, + getDefaultPythonLayer, + defaultPythonRuntime, +} from '../util'; + +/** + * LMI OOM test stack. + * + * Exercises bottlecap OOM detection on a Lambda Managed Instance (LMI) function. + * In LMI mode the function-log JSON payload carries a `requestId` field that + * the OOM detector reads directly (see `LambdaProcessor::get_message`), so + * dedup against the other OOM detection paths works reliably even when an + * OOM fires fast enough that the log line beats `PlatformStart`. 
+ */
+export class LmiOom extends cdk.Stack {
+  constructor(scope: Construct, id: string, props: cdk.StackProps) {
+    super(scope, id, props);
+
+    const extensionLayer = getExtensionLayer(this);
+    const pythonLayer = getDefaultPythonLayer(this);
+
+    const functionName = `${id}-python-lambda`;
+    const fn = new lambda.Function(this, functionName, {
+      runtime: defaultPythonRuntime,
+      architecture: lambda.Architecture.ARM_64,
+      handler: 'datadog_lambda.handler.handler',
+      // LMI uses a different Python OOM source than `oom.ts`: at 2 GB the
+      // 10 MB-loop allocator either runs past the test budget or gets
+      // SIGKILL'd silently. A single 100 GB `bytearray` allocation request
+      // makes CPython's allocator refuse immediately and raise a clean
+      // `MemoryError`, which we need so Path 1 (the runtime log-line match) of the OOM detector fires.
+      code: lambda.Code.fromAsset('./lambda/oom-python-lmi'),
+      functionName: functionName,
+      timeout: cdk.Duration.seconds(30),
+      // LMI requires memorySize >= 2048 (Lambda service validation).
+      memorySize: 2048,
+      environment: {
+        ...defaultDatadogEnvVariables,
+        DD_SERVICE: functionName,
+        DD_TRACE_ENABLED: 'true',
+        DD_LAMBDA_HANDLER: 'lambda_function.handler', // the user handler wrapped by `datadog_lambda.handler.handler` above
+        DD_TRACE_AGENT_URL: 'http://127.0.0.1:8126', // NOTE(review): the non-LMI stack in oom.ts does not set this — confirm why LMI needs it
+      },
+      logGroup: createLogGroup(this, functionName),
+    });
+    setCapacityProvider(fn); // helper from ../util; presumably what places the function on the LMI capacity provider — confirm
+    fn.addToRolePolicy(defaultDatadogSecretPolicy);
+    fn.addLayers(extensionLayer);
+    fn.addLayers(pythonLayer);
+  }
+}
diff --git a/integration-tests/lib/stacks/oom.ts b/integration-tests/lib/stacks/oom.ts
new file mode 100644
index 000000000..c5a57dc2d
--- /dev/null
+++ b/integration-tests/lib/stacks/oom.ts
@@ -0,0 +1,230 @@
+import * as cdk from 'aws-cdk-lib';
+import * as lambda from 'aws-cdk-lib/aws-lambda';
+import { Construct } from 'constructs';
+import {
+  createLogGroup,
+  defaultDatadogEnvVariables,
+  defaultDatadogSecretPolicy,
+  getExtensionLayer,
+  getDefaultNodeLayer,
+  getDefaultPythonLayer,
+  getDefaultJavaLayer,
+  getDefaultDotnetLayer,
+  getDefaultRubyLayer,
+  defaultNodeRuntime,
+  defaultPythonRuntime,
+  defaultJavaRuntime,
+  defaultDotnetRuntime,
+  defaultRubyRuntime,
+  defaultGoRuntime,
+} from '../util';
+
+/**
+ * OOM cross-runtime test stack.
+ *
+ * Deploys one Lambda per OOM "shape" so the bottlecap dedup change
+ * (Context::oom_emitted + try_increment_oom_metric, covering issue #1237)
+ * can be exercised end-to-end across every supported runtime. Each function
+ * intentionally allocates until it OOMs; the test then asserts the
+ * `aws.lambda.enhanced.out_of_memory` metric increments by exactly 1.
+ *
+ * The detection paths exercised per case:
+ *   - oom-node-v8-heap : log-line match `JavaScript heap out of memory`
+ *   - oom-node-sigkill : PlatformRuntimeDone `error_type=Runtime.OutOfMemory`
+ *   - oom-python       : log line `MemoryError` + PlatformRuntimeDone (dedup)
+ *   - oom-ruby         : log line `NoMemoryError` + PlatformRuntimeDone (dedup)
+ *   - oom-java         : log line `java.lang.OutOfMemoryError`
+ *   - oom-dotnet       : log line `OutOfMemoryException`
+ *   - oom-go           : log line `fatal error: runtime: out of memory`
+ *                        + PlatformReport memory equality (dedup)
+ *
+ * Each function is configured with low memory (256 MB) and a short timeout
+ * (30 s) so the OOM fires quickly during the integration-test run. See the
+ * `oomMemorySize` comment for why 256 MB rather than the customer's 192 MB.
+ */
+export class Oom extends cdk.Stack {
+  constructor(scope: Construct, id: string, props: cdk.StackProps) {
+    super(scope, id, props);
+
+    const extensionLayer = getExtensionLayer(this);
+    const nodeLayer = getDefaultNodeLayer(this);
+    const pythonLayer = getDefaultPythonLayer(this);
+    const javaLayer = getDefaultJavaLayer(this);
+    const dotnetLayer = getDefaultDotnetLayer(this);
+    const rubyLayer = getDefaultRubyLayer(this);
+
+    // 256 MB (not the customer's 192 MB from #1237) so the bottlecap
+    // extension has memory headroom to survive when the function process
+    // OOMs. At 192 MB the kernel OOM-killer often picks the extension
+    // instead of the function runtime (Lambda surfaces this as the
+    // `Extension.Crash` error type), and a dead extension can't emit the
+    // OOM metric. With 256 MB the function runtime's RSS dominates and
+    // the kernel reliably kills it; the extension survives to detect/flush.
+    // The detection paths under test are unchanged — the functions still
+    // hit `max_memory_used == memory_size` in PlatformReport and still
+    // emit runtime-specific OOM error log lines.
+    const oomMemorySize = 256;
+    const oomTimeout = cdk.Duration.seconds(30);
+
+    // Node case A — V8 heap exhaustion (log-line path).
+    const nodeV8FunctionName = `${id}-node-v8-heap-lambda`;
+    const nodeV8Function = new lambda.Function(this, nodeV8FunctionName, {
+      runtime: defaultNodeRuntime,
+      architecture: lambda.Architecture.ARM_64,
+      handler: '/opt/nodejs/node_modules/datadog-lambda-js/handler.handler',
+      code: lambda.Code.fromAsset('./lambda/oom-node-v8-heap'),
+      functionName: nodeV8FunctionName,
+      timeout: oomTimeout,
+      memorySize: oomMemorySize,
+      environment: {
+        ...defaultDatadogEnvVariables,
+        DD_SERVICE: nodeV8FunctionName,
+        DD_TRACE_ENABLED: 'true',
+        DD_LAMBDA_HANDLER: 'index.handler',
+        // Cap V8 heap below the Lambda memory cap so V8 throws its OOM error
+        // before the kernel SIGKILLs the process.
+        NODE_OPTIONS: '--max-old-space-size=128',
+      },
+      logGroup: createLogGroup(this, nodeV8FunctionName),
+    });
+    nodeV8Function.addToRolePolicy(defaultDatadogSecretPolicy);
+    nodeV8Function.addLayers(extensionLayer);
+    nodeV8Function.addLayers(nodeLayer);
+
+    // Node case B — off-heap Buffer / kernel SIGKILL (PlatformRuntimeDone path).
+    const nodeSigkillFunctionName = `${id}-node-sigkill-lambda`;
+    const nodeSigkillFunction = new lambda.Function(this, nodeSigkillFunctionName, {
+      runtime: defaultNodeRuntime,
+      architecture: lambda.Architecture.ARM_64,
+      handler: '/opt/nodejs/node_modules/datadog-lambda-js/handler.handler',
+      code: lambda.Code.fromAsset('./lambda/oom-node-sigkill'),
+      functionName: nodeSigkillFunctionName,
+      timeout: oomTimeout,
+      memorySize: oomMemorySize,
+      environment: {
+        ...defaultDatadogEnvVariables,
+        DD_SERVICE: nodeSigkillFunctionName,
+        DD_TRACE_ENABLED: 'true',
+        DD_LAMBDA_HANDLER: 'index.handler',
+      },
+      logGroup: createLogGroup(this, nodeSigkillFunctionName),
+    });
+    nodeSigkillFunction.addToRolePolicy(defaultDatadogSecretPolicy);
+    nodeSigkillFunction.addLayers(extensionLayer);
+    nodeSigkillFunction.addLayers(nodeLayer);
+
+    // Python — MemoryError; log path and PlatformRuntimeDone path both fire.
+    const pythonFunctionName = `${id}-python-lambda`;
+    const pythonFunction = new lambda.Function(this, pythonFunctionName, {
+      runtime: defaultPythonRuntime,
+      architecture: lambda.Architecture.ARM_64,
+      handler: 'datadog_lambda.handler.handler',
+      code: lambda.Code.fromAsset('./lambda/oom-python'),
+      functionName: pythonFunctionName,
+      timeout: oomTimeout,
+      memorySize: oomMemorySize,
+      environment: {
+        ...defaultDatadogEnvVariables,
+        DD_SERVICE: pythonFunctionName,
+        DD_TRACE_ENABLED: 'true',
+        DD_LAMBDA_HANDLER: 'lambda_function.handler',
+      },
+      logGroup: createLogGroup(this, pythonFunctionName),
+    });
+    pythonFunction.addToRolePolicy(defaultDatadogSecretPolicy);
+    pythonFunction.addLayers(extensionLayer);
+    pythonFunction.addLayers(pythonLayer);
+
+    // Ruby — NoMemoryError; log path and PlatformRuntimeDone path both fire.
+    // Datadog's Ruby tracer is a regular gem (no handler shim like Python's
+    // `datadog_lambda.handler.handler`), so the Lambda handler is the user's
+    // own `lambda_function.handler` and `DD_LAMBDA_HANDLER` is not used.
+    const rubyFunctionName = `${id}-ruby-lambda`;
+    const rubyFunction = new lambda.Function(this, rubyFunctionName, {
+      runtime: defaultRubyRuntime,
+      architecture: lambda.Architecture.ARM_64,
+      handler: 'lambda_function.handler',
+      code: lambda.Code.fromAsset('./lambda/oom-ruby'),
+      functionName: rubyFunctionName,
+      timeout: oomTimeout,
+      memorySize: oomMemorySize,
+      environment: {
+        ...defaultDatadogEnvVariables,
+        DD_SERVICE: rubyFunctionName,
+        DD_TRACE_ENABLED: 'true',
+      },
+      logGroup: createLogGroup(this, rubyFunctionName),
+    });
+    rubyFunction.addToRolePolicy(defaultDatadogSecretPolicy);
+    rubyFunction.addLayers(extensionLayer);
+    rubyFunction.addLayers(rubyLayer);
+
+    // Java — OutOfMemoryError (log-line path).
+    const javaFunctionName = `${id}-java-lambda`;
+    const javaFunction = new lambda.Function(this, javaFunctionName, {
+      runtime: defaultJavaRuntime,
+      architecture: lambda.Architecture.ARM_64,
+      handler: 'example.Handler::handleRequest',
+      code: lambda.Code.fromAsset('./lambda/oom-java/target/function.jar'),
+      functionName: javaFunctionName,
+      timeout: oomTimeout,
+      memorySize: oomMemorySize,
+      environment: {
+        ...defaultDatadogEnvVariables,
+        DD_SERVICE: javaFunctionName,
+        AWS_LAMBDA_EXEC_WRAPPER: '/opt/datadog_wrapper',
+        DD_TRACE_ENABLED: 'true',
+      },
+      logGroup: createLogGroup(this, javaFunctionName),
+    });
+    javaFunction.addToRolePolicy(defaultDatadogSecretPolicy);
+    javaFunction.addLayers(extensionLayer);
+    javaFunction.addLayers(javaLayer);
+
+    // .NET — OutOfMemoryException (log-line path).
+    const dotnetFunctionName = `${id}-dotnet-lambda`;
+    const dotnetFunction = new lambda.Function(this, dotnetFunctionName, {
+      runtime: defaultDotnetRuntime,
+      architecture: lambda.Architecture.ARM_64,
+      handler: 'Function::Function.Handler::FunctionHandler',
+      code: lambda.Code.fromAsset('./lambda/oom-dotnet/bin/function.zip'),
+      functionName: dotnetFunctionName,
+      timeout: oomTimeout,
+      memorySize: oomMemorySize,
+      environment: {
+        ...defaultDatadogEnvVariables,
+        DD_SERVICE: dotnetFunctionName,
+        AWS_LAMBDA_EXEC_WRAPPER: '/opt/datadog_wrapper',
+      },
+      logGroup: createLogGroup(this, dotnetFunctionName),
+    });
+    dotnetFunction.addToRolePolicy(defaultDatadogSecretPolicy);
+    dotnetFunction.addLayers(extensionLayer);
+    dotnetFunction.addLayers(dotnetLayer);
+
+    // Go — runtime fatal error (log-line path; PlatformReport memory equality also fires).
+    // The Go binary itself is the handler. We don't set
+    // AWS_LAMBDA_EXEC_WRAPPER: that wrapper sets language-specific env vars
+    // for tracer auto-instrumentation, which Go doesn't use.
+    const goFunctionName = `${id}-go-lambda`;
+    const goFunction = new lambda.Function(this, goFunctionName, {
+      runtime: defaultGoRuntime,
+      architecture: lambda.Architecture.ARM_64,
+      handler: 'bootstrap',
+      code: lambda.Code.fromAsset('./lambda/oom-go/bin'),
+      functionName: goFunctionName,
+      timeout: oomTimeout,
+      memorySize: oomMemorySize,
+      environment: {
+        ...defaultDatadogEnvVariables,
+        DD_SERVICE: goFunctionName,
+      },
+      logGroup: createLogGroup(this, goFunctionName),
+    });
+    goFunction.addToRolePolicy(defaultDatadogSecretPolicy);
+    goFunction.addLayers(extensionLayer);
+    // Go has no tracer layer — the Datadog tracer for Go is a Go module imported
+    // into the function source. The extension layer alone is enough for the
+    // enhanced metrics this test asserts on.
+  }
+}
diff --git a/integration-tests/lib/util.ts b/integration-tests/lib/util.ts
index dd8309789..c7645d4c9 100644
--- a/integration-tests/lib/util.ts
+++ b/integration-tests/lib/util.ts
@@ -13,11 +13,15 @@ export const defaultNodeRuntime = lambda.Runtime.NODEJS_24_X;
 export const defaultPythonRuntime = lambda.Runtime.PYTHON_3_13;
 export const defaultJavaRuntime = lambda.Runtime.JAVA_21;
 export const defaultDotnetRuntime = lambda.Runtime.DOTNET_8;
+export const defaultRubyRuntime = lambda.Runtime.RUBY_3_4;
+// Go runs on the custom runtime; the Datadog tracer is a Go module, not a layer.
+export const defaultGoRuntime = lambda.Runtime.PROVIDED_AL2023;
 
 export const defaultNodeLayerArn = process.env.NODE_TRACER_LAYER_ARN || 'arn:aws:lambda:us-east-1:464622532012:layer:Datadog-Node24-x:132';
 export const defaultPythonLayerArn = process.env.PYTHON_TRACER_LAYER_ARN || 'arn:aws:lambda:us-east-1:464622532012:layer:Datadog-Python313-ARM:117';
 export const defaultJavaLayerArn = process.env.JAVA_TRACER_LAYER_ARN || 'arn:aws:lambda:us-east-1:464622532012:layer:dd-trace-java:25';
 export const defaultDotnetLayerArn = process.env.DOTNET_TRACER_LAYER_ARN || 'arn:aws:lambda:us-east-1:464622532012:layer:dd-trace-dotnet-ARM:23';
+export const defaultRubyLayerArn = process.env.RUBY_TRACER_LAYER_ARN || 'arn:aws:lambda:us-east-1:464622532012:layer:Datadog-Ruby3-4-ARM:28';
 
 export const defaultDatadogEnvVariables = {
   DD_API_KEY_SECRET_ARN: datadogSecretArn,
@@ -87,6 +91,14 @@ export const getDefaultDotnetLayer = (scope: Construct) => {
   );
 };
 
+export const getDefaultRubyLayer = (scope: Construct) => {
+  return LayerVersion.fromLayerVersionArn(
+    scope,
+    'DatadogRubyLayer',
+    defaultRubyLayerArn
+  );
+};
+
 export const capacityProviderArn = `arn:aws:lambda:${REGION}:${ACCOUNT}:capacity-provider:integ-default-capacity-provider-cp`;
 
 export function setCapacityProvider(lambdaFunction: lambda.Function) {
diff --git a/integration-tests/scripts/build-go.sh
b/integration-tests/scripts/build-go.sh
new file mode 100755
index 000000000..8f24bc45c
--- /dev/null
+++ b/integration-tests/scripts/build-go.sh
@@ -0,0 +1,126 @@
+#!/bin/bash
+set -e
+
+# Reusable script to cross-compile Go Lambda functions for ARM64 Linux.
+# Outputs a binary named `bootstrap` (required by the AWS Lambda custom runtime
+# provided.al2023) under <lambda-dir>/bin/.
+#
+# Usage:
+#   ./build-go.sh               # Build all Go Lambda functions
+#   ./build-go.sh <lambda-dir>  # Build a specific Lambda function
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+LAMBDA_BASE_DIR="$SCRIPT_DIR/../lambda"
+
+# Builds one Go Lambda directory into <dir>/bin/bootstrap using Docker.
+# Arguments: $1 - absolute path of the Lambda directory (must contain go.mod).
+# Returns 1 when a precondition fails or no binary is produced, 0 otherwise.
+build_go_lambda() {
+    # Declared separately from the assignment so `local` cannot mask the exit
+    # status of the command substitution; GO_MOD_CACHE is kept function-local.
+    local LAMBDA_DIR="$1" FUNCTION_NAME GO_MOD_CACHE
+    FUNCTION_NAME=$(basename "$LAMBDA_DIR")
+
+    if [ ! -d "$LAMBDA_DIR" ]; then
+        echo "Error: Directory not found: $LAMBDA_DIR" >&2
+        return 1
+    fi
+
+    if [ ! -f "$LAMBDA_DIR/go.mod" ]; then
+        echo "Error: go.mod not found in $LAMBDA_DIR" >&2
+        return 1
+    fi
+
+    echo "Building Go Lambda: $FUNCTION_NAME"
+
+    if ! command -v docker &> /dev/null; then
+        echo "Error: Docker is not installed or not in PATH" >&2
+        return 1
+    fi
+
+    # Clean previous build (idempotent).
+    rm -rf "$LAMBDA_DIR/bin"
+    mkdir -p "$LAMBDA_DIR/bin"
+
+    # Module cache: reuse the host's $GOPATH/pkg/mod when running locally;
+    # use a project-local cache in CI so it can be cached between jobs.
+    if [ -n "$CI" ]; then
+        GO_MOD_CACHE="$SCRIPT_DIR/../.cache/go-mod"
+    else
+        GO_MOD_CACHE="${GOPATH:-$HOME/go}/pkg/mod"
+    fi
+    mkdir -p "$GO_MOD_CACHE"
+
+    # Cross-compile to ARM64 Linux inside the official Go image.
+    # CGO is disabled so the binary runs on the provided.al2023 base image
+    # without a libc mismatch.
+    docker run --rm --platform linux/arm64 \
+        -v "$LAMBDA_DIR":/workspace \
+        -v "$GO_MOD_CACHE":/go/pkg/mod \
+        -w /workspace \
+        -e GOOS=linux \
+        -e GOARCH=arm64 \
+        -e CGO_ENABLED=0 \
+        public.ecr.aws/docker/library/golang:1.22-bookworm \
+        sh -c "go mod tidy && go build -o bin/bootstrap ."
+
+    # bin/ was wiped above, so the binary existing proves this run produced it.
+    if [ ! -f "$LAMBDA_DIR/bin/bootstrap" ]; then
+        echo "✗ Build failed: bin/bootstrap not produced" >&2
+        return 1
+    fi
+
+    echo "✓ Build complete: $LAMBDA_DIR/bin/bootstrap"
+    return 0
+}
+
+if [ -z "$1" ]; then
+    echo "=========================================="
+    echo "Building all Go Lambda functions"
+    echo "=========================================="
+    echo ""
+
+    FOUND_GO=0
+    FAILED_BUILDS=()
+
+    for LAMBDA_PATH in "$LAMBDA_BASE_DIR"/*; do
+        if [ ! -d "$LAMBDA_PATH" ]; then
+            continue
+        fi
+
+        FUNCTION_NAME=$(basename "$LAMBDA_PATH")
+
+        # Match directories whose suffix is `-go` or whose name is exactly `go`.
+        if [[ "$FUNCTION_NAME" == *"-go" || "$FUNCTION_NAME" == "go" ]]; then
+            FOUND_GO=1
+            echo "----------------------------------------"
+            if build_go_lambda "$LAMBDA_PATH"; then
+                echo "✓ $FUNCTION_NAME built successfully"
+            else
+                echo "✗ $FUNCTION_NAME failed"
+                FAILED_BUILDS+=("$FUNCTION_NAME")
+            fi
+            echo ""
+        fi
+    done
+
+    if [ $FOUND_GO -eq 0 ]; then
+        echo "No Go Lambda functions found (looking for directories ending in -go)"
+        exit 0
+    fi
+
+    if [ ${#FAILED_BUILDS[@]} -eq 0 ]; then
+        echo "✓ All Go Lambda builds completed successfully!"
+        exit 0
+    fi
+
+    echo "✗ ${#FAILED_BUILDS[@]} Go Lambda build(s) failed:"
+    for failed in "${FAILED_BUILDS[@]}"; do
+        echo " - $failed"
+    done
+    exit 1
+else
+    LAMBDA_DIR="$1"
+    if [[ "$LAMBDA_DIR" != /* ]]; then
+        LAMBDA_DIR="$(cd "$SCRIPT_DIR/.." && pwd)/$LAMBDA_DIR"
+    fi
+    build_go_lambda "$LAMBDA_DIR"
+fi
diff --git a/integration-tests/scripts/build-ruby.sh b/integration-tests/scripts/build-ruby.sh
new file mode 100755
index 000000000..0ca36064d
--- /dev/null
+++ b/integration-tests/scripts/build-ruby.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+set -e
+
+# Reusable script to build Ruby Lambda functions.
+# For simple Ruby Lambdas with no gem dependencies, this just packages the
+# source as-is — the runtime + Datadog tracer layer provide everything needed.
+# If the function gains a Gemfile, this script grows a bundle install step
+# in a Docker container (mirroring build-python.sh / build-node.sh).
+#
+# Usage:
+#   ./build-ruby.sh               # Build all Ruby Lambda functions
+#   ./build-ruby.sh <lambda-dir>  # Build a specific Lambda function
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+LAMBDA_BASE_DIR="$SCRIPT_DIR/../lambda"
+
+build_ruby_lambda() {
+    local LAMBDA_DIR="$1" FUNCTION_NAME  # declared apart from the assignment so `local` cannot mask its exit status
+    FUNCTION_NAME=$(basename "$LAMBDA_DIR")
+
+    if [ ! -d "$LAMBDA_DIR" ]; then
+        echo "Error: Directory not found: $LAMBDA_DIR" >&2
+        return 1
+    fi
+
+    echo "Building Ruby Lambda: $FUNCTION_NAME"
+
+    if [ ! -f "$LAMBDA_DIR/Gemfile" ]; then
+        echo "ℹ No Gemfile found — source files are deployed as-is"
+        return 0
+    fi
+
+    echo "Error: Gemfile-based Ruby builds are not implemented yet" >&2
+    echo " Add a Dockerised \`bundle install\` step to this script when needed." >&2
+    return 1
+}
+
+if [ -z "$1" ]; then
+    echo "=========================================="
+    echo "Building all Ruby Lambda functions"
+    echo "=========================================="
+    echo ""
+
+    FOUND_RUBY=0
+    FAILED_BUILDS=()
+
+    for LAMBDA_PATH in "$LAMBDA_BASE_DIR"/*; do
+        if [ ! -d "$LAMBDA_PATH" ]; then
+            continue
+        fi
+
+        FUNCTION_NAME=$(basename "$LAMBDA_PATH")
+
+        if [[ "$FUNCTION_NAME" == *"ruby"* ]]; then
+            FOUND_RUBY=1
+            echo "----------------------------------------"
+            if build_ruby_lambda "$LAMBDA_PATH"; then
+                echo "✓ $FUNCTION_NAME built successfully"
+            else
+                echo "✗ $FUNCTION_NAME failed"
+                FAILED_BUILDS+=("$FUNCTION_NAME")
+            fi
+            echo ""
+        fi
+    done
+
+    if [ $FOUND_RUBY -eq 0 ]; then
+        echo "No Ruby Lambda functions found (looking for directories with 'ruby' in name)"
+        exit 0
+    fi
+
+    if [ ${#FAILED_BUILDS[@]} -eq 0 ]; then
+        echo "✓ All Ruby Lambda builds completed successfully!"
+        exit 0
+    fi
+
+    echo "✗ ${#FAILED_BUILDS[@]} Ruby Lambda build(s) failed:"
+    for failed in "${FAILED_BUILDS[@]}"; do
+        echo " - $failed"
+    done
+    exit 1
+else
+    LAMBDA_DIR="$1"
+    if [[ "$LAMBDA_DIR" != /* ]]; then
+        LAMBDA_DIR="$(cd "$SCRIPT_DIR/.." && pwd)/$LAMBDA_DIR"
+    fi
+    build_ruby_lambda "$LAMBDA_DIR"
+fi
diff --git a/integration-tests/scripts/local_deploy.sh b/integration-tests/scripts/local_deploy.sh
index b432261da..451b81cf6 100755
--- a/integration-tests/scripts/local_deploy.sh
+++ b/integration-tests/scripts/local_deploy.sh
@@ -43,6 +43,8 @@ echo "Building all Lambda functions in parallel..."
 "$SCRIPT_DIR/build-dotnet.sh" &
 "$SCRIPT_DIR/build-python.sh" &
 "$SCRIPT_DIR/build-node.sh" &
+"$SCRIPT_DIR/build-ruby.sh" &
+"$SCRIPT_DIR/build-go.sh" &
 wait
 
 echo "All Lambda builds complete"
diff --git a/integration-tests/tests/lmi-oom.test.ts b/integration-tests/tests/lmi-oom.test.ts
new file mode 100644
index 000000000..ed43e821d
--- /dev/null
+++ b/integration-tests/tests/lmi-oom.test.ts
@@ -0,0 +1,66 @@
+import { invokeLambda } from './utils/lambda';
+import { getMetricCount, OUT_OF_MEMORY_METRIC } from './utils/datadog';
+import { getIdentifier } from '../config';
+
+/**
+ * LMI OOM test.
+ *
+ * Validates that the `aws.lambda.enhanced.out_of_memory` metric is emitted
+ * when an LMI-mode Python function hits `MemoryError`. In LMI mode the OOM
+ * log path tags `Event::OutOfMemory` with the `requestId` parsed from the
+ * function-log JSON payload, so dedup works without depending on
+ * `PlatformStart` having raced ahead of the log line.
+ *
+ * Asserts `>= 1` rather than `== 1` to stay robust against other paths firing
+ * (e.g. a future change where `handle_managed_instance_report` surfaces
+ * `Runtime.OutOfMemory` in the synthesized runtime-done).
+ */
+const identifier = getIdentifier();
+const stackName = `integ-${identifier}-lmi-oom`;
+const functionName = `${stackName}-python-lambda`;
+
+const INITIAL_WAIT_MS = 90 * 1000; // wait before first query
+const POLL_INTERVAL_MS = 30 * 1000; // re-query cadence
+const TOTAL_BUDGET_MS = 12 * 60 * 1000; // overall ceiling, measured from the invoke
+
+async function sleep(ms: number): Promise<void> { // resolves after `ms` milliseconds
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+describe('LMI OOM Integration Test', () => {
+  let count = 0;
+
+  beforeAll(async () => {
+    const invokeTime = Date.now();
+    // Subtract 60s from the query window's lower bound. Datadog rolls OOM
+    // metric data points into 10-second buckets aligned to wall-clock
+    // multiples; the bucket containing the OOM event is timestamped at the
+    // bucket *start*, and the query API only returns buckets whose start
+    // is >= the `from` parameter. If `windowStart == invokeTime` and the
+    // function OOMs in the same 10-second bucket (e.g. invoke 19:32:11.5,
+    // OOM 19:32:18 → bucket 19:32:10), the bucket would be excluded.
+    const windowStart = invokeTime - 60 * 1000;
+    await invokeLambda(functionName).catch((err) => {
+      throw new Error(`Invoke failed for ${functionName}: ${err}`);
+    });
+
+    await sleep(INITIAL_WAIT_MS);
+
+    const deadline = invokeTime + TOTAL_BUDGET_MS;
+    let attempt = 0;
+    while (Date.now() < deadline) {
+      attempt++;
+      count = await getMetricCount(OUT_OF_MEMORY_METRIC, functionName, windowStart, Date.now());
+      console.log(`LMI OOM poll #${attempt}: count=${count}`);
+      if (count >= 1) {
+        break;
+      }
+      await sleep(POLL_INTERVAL_MS);
+    }
+    console.log(`LMI OOM count (final): ${count}`);
+  }, TOTAL_BUDGET_MS + 60 * 1000); // 60s of slack over the polling budget for the last poll + sleep
+
+  it('should emit at least one out_of_memory metric for one OOM invocation in LMI mode', () => {
+    expect(count).toBeGreaterThanOrEqual(1);
+  });
+});
diff --git a/integration-tests/tests/oom.test.ts b/integration-tests/tests/oom.test.ts
new file mode 100644
index 000000000..b00b80bff
--- /dev/null
+++ b/integration-tests/tests/oom.test.ts
@@ -0,0 +1,116 @@
+import {
invokeLambda } from './utils/lambda'; +import { getMetricCount, OUT_OF_MEMORY_METRIC } from './utils/datadog'; +import { getIdentifier } from '../config'; + +/** + * Cross-runtime OOM test. + * + * Each function is intentionally configured to OOM on its first invocation. + * Bottlecap has three detection paths that can fire for the same invocation + * (runtime-specific log line, `Runtime.OutOfMemory` `error_type` in + * `PlatformRuntimeDone`, `max_memory_used_mb == memory_size_mb` in + * `PlatformReport`); the `Context::oom_emitted` flag introduced for #1237 + * dedupes them so the metric increments exactly once per invocation. + * + * The Python/Ruby/Go cases are particularly meaningful regressions because + * they trigger more than one detection path naturally — if dedup is broken, + * those counts go to 2. + * + * Ingestion timing: empirical observation in CI is that the + * `aws.lambda.enhanced.out_of_memory` metric data point is durably ingested + * within ~30s of the OOM, but Datadog's `/api/v1/query` endpoint sometimes + * returns no results for very-recently-ingested points (the query engine's + * snapshot lags the ingest path). The single-shot 5-minute wait used by the + * other suites is therefore too brittle for this assertion. Instead we poll: + * after an initial wait we re-query every 30s until every runtime reports + * count>=1 or the overall budget is exhausted. 
+ */ +const identifier = getIdentifier(); +const stackName = `integ-${identifier}-oom`; + +interface OomCase { + runtime: string; + functionName: string; +} + +const cases: OomCase[] = [ + { runtime: 'node-v8-heap', functionName: `${stackName}-node-v8-heap-lambda` }, + { runtime: 'node-sigkill', functionName: `${stackName}-node-sigkill-lambda` }, + { runtime: 'python', functionName: `${stackName}-python-lambda` }, + { runtime: 'ruby', functionName: `${stackName}-ruby-lambda` }, + { runtime: 'java', functionName: `${stackName}-java-lambda` }, + { runtime: 'dotnet', functionName: `${stackName}-dotnet-lambda` }, + { runtime: 'go', functionName: `${stackName}-go-lambda` }, +]; + +const INITIAL_WAIT_MS = 90 * 1000; // wait before first query +const POLL_INTERVAL_MS = 30 * 1000; // re-query cadence +const TOTAL_BUDGET_MS = 12 * 60 * 1000; // overall ceiling + +async function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +async function fetchCounts(start: number, end: number): Promise> { + const results = await Promise.all( + cases.map(async (c) => ({ + runtime: c.runtime, + count: await getMetricCount(OUT_OF_MEMORY_METRIC, c.functionName, start, end), + })), + ); + return Object.fromEntries(results.map((r) => [r.runtime, r.count])); +} + +describe('OOM Integration Tests', () => { + let countsByRuntime: Record; + + beforeAll(async () => { + const invokeTime = Date.now(); + // Subtract 60s from the query window's lower bound. Datadog rolls OOM + // metric data points into 10-second buckets aligned to wall-clock + // multiples and the API only returns buckets whose start timestamp is + // >= the `from` parameter. If the function OOMs in the same bucket as + // `invokeTime`, the bucket start (e.g. 19:32:10 for an invoke at + // 19:32:11.5) is excluded. The `[lmi-oom]` suite hit this on a fast + // LMI cold start; defensive in this suite too since the timing is + // workload-dependent. 
+ const windowStart = invokeTime - 60 * 1000; + + await Promise.all( + cases.map((c) => + invokeLambda(c.functionName).catch((err) => { + // OOM functions usually succeed at the Invoke API layer (the function + // is run, just crashes), so a thrown error here is unexpected + // infrastructure failure rather than the OOM itself. + throw new Error(`Invoke failed for ${c.functionName}: ${err}`); + }), + ), + ); + + await sleep(INITIAL_WAIT_MS); + + const deadline = invokeTime + TOTAL_BUDGET_MS; + let counts: Record = {}; + let attempt = 0; + while (Date.now() < deadline) { + attempt++; + counts = await fetchCounts(windowStart, Date.now()); + const missing = cases.filter((c) => (counts[c.runtime] ?? 0) < 1).map((c) => c.runtime); + console.log(`OOM poll #${attempt}:`, counts, missing.length ? `(still missing: ${missing.join(', ')})` : '(all runtimes >=1)'); + if (missing.length === 0) { + break; + } + await sleep(POLL_INTERVAL_MS); + } + + countsByRuntime = counts; + console.log('OOM counts by runtime (final):', countsByRuntime); + }, TOTAL_BUDGET_MS + 60 * 1000); + + describe.each(cases)('$runtime runtime', ({ runtime }) => { + it('should emit exactly one out_of_memory metric for one OOM invocation', () => { + const count = countsByRuntime[runtime]; + expect(count).toBe(1); + }); + }); +}); diff --git a/integration-tests/tests/utils/datadog.ts b/integration-tests/tests/utils/datadog.ts index f5f98bdf5..3e69dab78 100644 --- a/integration-tests/tests/utils/datadog.ts +++ b/integration-tests/tests/utils/datadog.ts @@ -91,6 +91,8 @@ export const DURATION_METRICS = [ 'aws.lambda.enhanced.init_duration', ]; +export const OUT_OF_MEMORY_METRIC = 'aws.lambda.enhanced.out_of_memory'; + export type EnhancedMetrics = Record; export interface MetricPoint { @@ -289,6 +291,41 @@ export async function getEnhancedMetrics( return metrics; } +/** + * Returns the total emission count of a counter / distribution enhanced metric + * for a single function over the given window, by summing 
all data-point
+ * values returned by Datadog. Used by oom.test.ts to assert that
+ * `aws.lambda.enhanced.out_of_memory` increments exactly once per invocation —
+ * verifying the per-Context `oom_emitted` dedup flag introduced for #1237.
+ */
+export async function getMetricCount(
+  metricName: string,
+  functionName: string,
+  fromTime: number, // epoch milliseconds; converted to seconds for the API below
+  toTime: number, // epoch milliseconds; converted to seconds for the API below
+): Promise<number> {
+  const baseFunctionName = getServiceName(functionName).toLowerCase();
+  const query = `sum:${metricName}{functionname:${baseFunctionName}}.as_count()`;
+
+  console.log(`Querying metric count: ${query}`);
+
+  const response = await datadogClient.get('/api/v1/query', {
+    params: {
+      query,
+      from: Math.floor(fromTime / 1000),
+      to: Math.floor(toTime / 1000),
+    },
+  });
+
+  const series = response.data.series || [];
+  if (series.length === 0) {
+    return 0; // no series returned for the window: report a count of zero
+  }
+
+  const pointlist: [number, number][] = series[0].pointlist || []; // only the first series is read; the query has no `by` grouping
+  return pointlist.reduce((acc, [, value]) => acc + (value || 0), 0); // falsy point values contribute 0
+}
+
 async function getMetrics(
   metricName: string,
   functionName: string,