From ea57bff05a5d8e480a7d21cd67f4479422a93305 Mon Sep 17 00:00:00 2001 From: Alexandre Rulleau Date: Mon, 11 May 2026 13:01:58 +0200 Subject: [PATCH 01/13] ci: add --show-slow 10000 to valgrind run-tests.php invocation (PHP >= 7.2) Lists tests slower than 10s with their durations at the end of the run. The --show-slow flag only exists in PHP 7.2+, so it's gated by a runtime PHP_VERSION_ID check to avoid breaking PHP 7.0/7.1 jobs. Used to verify the hypothesis that recent CI failures are caused by tests exceeding 1000s under valgrind (triggering the number_format E_WARNING in run-tests.php's record() method). --- Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 1d017c7308b..963228d77f4 100644 --- a/Makefile +++ b/Makefile @@ -211,7 +211,8 @@ test_extension_ci: $(SO_FILE) $(TEST_FILES) $(TEST_STUB_FILES) $(BUILD_DIR)/run- \ export TEST_PHP_JUNIT=$(JUNIT_RESULTS_DIR)/valgrind-extension-test.xml; \ export TEST_PHP_OUTPUT=$(JUNIT_RESULTS_DIR)/valgrind-run-tests.out; \ - DD_SPAWN_WORKER_STABLE_TRAMPOLINE=1 $(ALL_TEST_ENV_OVERRIDE) $(RUN_TESTS_CMD) -d extension=$(SO_FILE) -m -s $$TEST_PHP_OUTPUT $(BUILD_DIR)/$(TESTS) && ! grep -e '^LEAKED TEST SUMMARY' $$TEST_PHP_OUTPUT; \ + SHOW_SLOW=$$(php -r 'echo PHP_VERSION_ID >= 70200 ? "--show-slow 10000" : "";'); \ + DD_SPAWN_WORKER_STABLE_TRAMPOLINE=1 $(ALL_TEST_ENV_OVERRIDE) $(RUN_TESTS_CMD) $$SHOW_SLOW -d extension=$(SO_FILE) -m -s $$TEST_PHP_OUTPUT $(BUILD_DIR)/$(TESTS) && ! grep -e '^LEAKED TEST SUMMARY' $$TEST_PHP_OUTPUT; \ ) build_tea: TEA_BUILD_TESTS=ON From f4edb28610d406a162f6b0f6785f2cb86b8ac6e8 Mon Sep 17 00:00:00 2001 From: Alexandre Rulleau Date: Mon, 11 May 2026 15:00:33 +0200 Subject: [PATCH 02/13] fix(ci): patch run-tests.php getTimer() to avoid E_WARNING on long valgrind tests PHP's run-tests.php getTimer() returns number_format($elapsed, 4) which produces a comma-formatted string (e.g. "1,500.0000") when a test takes >=1000 seconds. PHP 8.0+ raises E_WARNING when that string is used in += arithmetic inside record(), causing the test runner to abort with exit code 2. Patch getTimer() at build time so it returns a float (via round()) instead. --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 963228d77f4..2e2c9cbb5aa 100644 --- a/Makefile +++ b/Makefile @@ -112,6 +112,7 @@ $(BUILD_DIR)/configure: $(M4_FILES) $(BUILD_DIR)/ddtrace.sym $(BUILD_DIR)/VERSIO $(BUILD_DIR)/run-tests.php: $(if $(ASSUME_COMPILED),, $(BUILD_DIR)/configure) $(if $(ASSUME_COMPILED), cp $(shell dirname $(shell realpath $(shell which phpize)))/../lib/php/build/run-tests.php $(BUILD_DIR)/run-tests.php) sed -i 's/\bdl(/(bool)(/' $(BUILD_DIR)/run-tests.php # this dl() stuff in run-tests.php is for --EXTENSIONS-- sections, which we don't use; just strip it away (see https://github.com/php/php-src/issues/15367) + sed -i 's/return number_format($$this->rootSuite/return round($$this->rootSuite/' $(BUILD_DIR)/run-tests.php # number_format returns a comma-formatted string for elapsed >= 1000s (e.g. "1,500.0000"), which is non-numeric and triggers E_WARNING in PHP 8.0+ when used in += arithmetic inside record() # ensure list of rust files is up to date $(BUILD_DIR)/.rust_files_list: $(RUST_FILES) From b371f6cd93a1226034d4793e17b6312c678203bc Mon Sep 17 00:00:00 2001 From: Alexandre Rulleau Date: Mon, 11 May 2026 17:27:50 +0200 Subject: [PATCH 03/13] ci: wait for request-replayer readiness before running tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CI .base_test before_script runs wait-for-service-ready.sh, but WAIT_FOR only listed test-agent — request-replayer was never gated on, even though it is in the agent_httpbin_service() service block used by test_extension_ci, ASAN test_c, and ASAN test_c with multiple observers. Add a request-replayer case to detect_service_type (probing /replay, a read-only endpoint that returns valid JSON when php -S is up), and add request-replayer:80 to WAIT_FOR for the three affected job blocks. PHP Language Tests is unchanged: it only declares test-agent in services. --- .gitlab/generate-tracer.php | 6 +++--- .gitlab/wait-for-service-ready.sh | 8 ++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/.gitlab/generate-tracer.php b/.gitlab/generate-tracer.php index 42510ff8ce5..ad0b8df75d3 100644 --- a/.gitlab/generate-tracer.php +++ b/.gitlab/generate-tracer.php @@ -232,7 +232,7 @@ function before_script_steps($with_docker_auth = false) { artifacts: true retry: 2 variables: - WAIT_FOR: test-agent:9126 + WAIT_FOR: test-agent:9126 request-replayer:80 KUBERNETES_CPU_REQUEST: 6 KUBERNETES_CPU_LIMIT: 6 KUBERNETES_MEMORY_REQUEST: 4Gi @@ -305,7 +305,7 @@ function before_script_steps($with_docker_auth = false) { ARCH: "amd64" artifacts: true variables: - WAIT_FOR: test-agent:9126 + WAIT_FOR: test-agent:9126 request-replayer:80 KUBERNETES_CPU_REQUEST: 12 MAX_TEST_PARALLELISM: 4 PHP_MAJOR_MINOR: "" @@ -360,7 +360,7 @@ function before_script_steps($with_docker_auth = false) { ARCH: "amd64" artifacts: true variables: - WAIT_FOR: test-agent:9126 + WAIT_FOR: test-agent:9126 request-replayer:80 KUBERNETES_CPU_REQUEST: 12 MAX_TEST_PARALLELISM: 4 PHP_MAJOR_MINOR: "" diff --git a/.gitlab/wait-for-service-ready.sh b/.gitlab/wait-for-service-ready.sh index 45ee96399d4..0f22493bfd5 100755 --- a/.gitlab/wait-for-service-ready.sh +++ b/.gitlab/wait-for-service-ready.sh @@ -6,6 +6,7 @@ detect_service_type() { local host=${1} case ${host} in test-agent) echo "test-agent" ;; + request-replayer) echo "request-replayer" ;; mysql-integration) echo "mysql" ;; elasticsearch*) echo "elasticsearch" ;; zookeeper*) echo "zookeeper" ;; @@ -38,6 +39,13 @@ wait_for_single_service() { return 0 fi ;; + request-replayer) + # /replay is read-only and always returns valid JSON when the PHP server is up + if curl -sf "http://${HOST}:${PORT}/replay" > /dev/null 2>&1; then + echo "request-replayer is ready" + return 0 + fi + ;; mysql) if mysqladmin ping -h"${HOST}" --silent 2>/dev/null; then echo "MySQL is ready" From 674b774d8a4da315e2aa269e965899062ab344eb Mon Sep 17 00:00:00 2001 From: Alexandre Rulleau Date: Mon, 11 May 2026 17:32:52 +0200 Subject: [PATCH 04/13] ci(fix): accept any HTTP response from request-replayer health probe curl -sf fails on 4xx, which would falsely report not-ready if /replay returns 4xx when no dump exists yet (the natural startup state). Drop -f so any HTTP response proves php -S is up and executing index.php; connection failure remains the only signal of an unhealthy service. --- .gitlab/wait-for-service-ready.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitlab/wait-for-service-ready.sh b/.gitlab/wait-for-service-ready.sh index 0f22493bfd5..2359920632c 100755 --- a/.gitlab/wait-for-service-ready.sh +++ b/.gitlab/wait-for-service-ready.sh @@ -40,8 +40,9 @@ wait_for_single_service() { fi ;; request-replayer) - # /replay is read-only and always returns valid JSON when the PHP server is up - if curl -sf "http://${HOST}:${PORT}/replay" > /dev/null 2>&1; then + # Any HTTP response (2xx/3xx/4xx/5xx) proves php -S is up and executing index.php. + # /replay may return 4xx when no data has been dumped yet, so do not use -f. + if curl -s -o /dev/null "http://${HOST}:${PORT}/replay"; then echo "request-replayer is ready" return 0 fi From ef06ce73347b6865c385a52aea204f44d80596dd Mon Sep 17 00:00:00 2001 From: Alexandre Rulleau Date: Tue, 12 May 2026 13:51:05 +0200 Subject: [PATCH 05/13] ci: raise --show-slow threshold from 10s to 30s Filters the SLOW TEST SUMMARY to tests that meaningfully approach the per-test 300s timeout, trimming noise from the long tail. --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2e2c9cbb5aa..a3758cf9217 100644 --- a/Makefile +++ b/Makefile @@ -212,7 +212,7 @@ test_extension_ci: $(SO_FILE) $(TEST_FILES) $(TEST_STUB_FILES) $(BUILD_DIR)/run- \ export TEST_PHP_JUNIT=$(JUNIT_RESULTS_DIR)/valgrind-extension-test.xml; \ export TEST_PHP_OUTPUT=$(JUNIT_RESULTS_DIR)/valgrind-run-tests.out; \ - SHOW_SLOW=$$(php -r 'echo PHP_VERSION_ID >= 70200 ? "--show-slow 10000" : "";'); \ + SHOW_SLOW=$$(php -r 'echo PHP_VERSION_ID >= 70200 ? "--show-slow 30000" : "";'); \ DD_SPAWN_WORKER_STABLE_TRAMPOLINE=1 $(ALL_TEST_ENV_OVERRIDE) $(RUN_TESTS_CMD) $$SHOW_SLOW -d extension=$(SO_FILE) -m -s $$TEST_PHP_OUTPUT $(BUILD_DIR)/$(TESTS) && ! grep -e '^LEAKED TEST SUMMARY' $$TEST_PHP_OUTPUT; \ ) From dced1f49fb7e3d86b0e44fdd7d27d8b736a09e3a Mon Sep 17 00:00:00 2001 From: Alexandre Rulleau Date: Fri, 15 May 2026 12:20:00 +0200 Subject: [PATCH 06/13] test(telemetry): accept integrations bundled in app-started payload On PHP 8.4 the test sandbox integration registers early enough that the worker bundles it into the app-started payload's `integrations` field (see libdd-telemetry build_app_started), so no separate app-integrations-change event is emitted and the existing assertion loop times out. Accept both delivery paths: the dedicated app-integrations-change event, and the integrations array embedded in app-started. The expected output shape is identical because Rust Integration struct field order matches. --- tests/ext/telemetry/integration.phpt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/ext/telemetry/integration.phpt b/tests/ext/telemetry/integration.phpt index 8e74be22933..b5ee5aea929 100644 --- a/tests/ext/telemetry/integration.phpt +++ b/tests/ext/telemetry/integration.phpt @@ -60,6 +60,13 @@ namespace var_dump($json["payload"]); break 3; } + // The integrations may also be bundled into the app-started payload + // depending on registration timing (see libdd-telemetry build_app_started). + if ($json["request_type"] == "app-started" + && !empty($json["payload"]["integrations"])) { + var_dump(["integrations" => $json["payload"]["integrations"]]); + break 3; + } } } } From 1c5e65c9f1507c9d526fdcde9427ae5bce558507 Mon Sep 17 00:00:00 2001 From: Alexandre Rulleau Date: Fri, 15 May 2026 12:19:09 +0200 Subject: [PATCH 07/13] fix(test): poll for dynamic config in dynamic_config_update.phpt to fix flaky PHP 8.0 valgrind The test guarded a sleep(20) with `if (ini_get(...) != 0.5)`, but the wrapped sleep() blocks SIGVTALRM during the call. Under valgrind on PHP 8.0, the heavier signal traffic can cut the sleep short before the sidecar fetches the config from the request-replayer and propagates it to INI globals, leaving every dynamic value at its default. Replace the single sleep() with a 100ms polling loop bounded at 20s total. Each usleep() iteration goes through the same SIGVTALRM-blocking wrapper, which calls ddtrace_check_for_new_config_now() on unblock under valgrind, giving the test repeated, well-defined opportunities to apply the config while still finishing early on the happy path. --- tests/ext/remote_config/dynamic_config_update.phpt | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/ext/remote_config/dynamic_config_update.phpt b/tests/ext/remote_config/dynamic_config_update.phpt index 48be3ca6ad0..ed7902f5dbb 100644 --- a/tests/ext/remote_config/dynamic_config_update.phpt +++ b/tests/ext/remote_config/dynamic_config_update.phpt @@ -50,8 +50,14 @@ put_dynamic_config_file([ // submit span data \DDTrace\start_span(); -if (ini_get("datadog.trace.sample_rate") != 0.5) { - sleep(20); // signal interrupts interrupt the sleep(). +// Poll until the dynamic config is applied. The wrapped sleep() blocks +// SIGVTALRM during the call, so a single sleep(20) can be cut short by +// unrelated signals (notably under valgrind, where signal traffic is heavier). +// Looping ensures we keep waiting up to ~20s total for the sidecar to fetch +// from the request-replayer and propagate the config to INI globals. +$deadline = microtime(true) + 20; +while (ini_get("datadog.trace.sample_rate") != 0.5 && microtime(true) < $deadline) { + usleep(100000); } var_dump(ini_get("datadog.trace.sample_rate")); From d26b1623c8ffb6f2d7b50f9fa98f801ab34a6024 Mon Sep 17 00:00:00 2001 From: Alexandre Rulleau Date: Fri, 15 May 2026 12:31:53 +0200 Subject: [PATCH 08/13] fix(test): use await_agent_info to deflake dd_trace_agent_env.phpt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous warmup was a no-op: $start = microtime(true); ... usleep(floor(microtime(true) - $start) * 100000); floor() of a sub-second float yields 0, so usleep(0) did nothing. Also, the sleep happened *after* the span had already been sampled with an empty env, so even a real delay wouldn't have helped. Replace it with dd_trace_internal_fn('await_agent_info') called before opening the span — same pattern that client_side_stats*.phpt already uses to gate on the sidecar having read /info from the agent. --- tests/ext/request-replayer/dd_trace_agent_env.phpt | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/tests/ext/request-replayer/dd_trace_agent_env.phpt b/tests/ext/request-replayer/dd_trace_agent_env.phpt index 50de644e70a..036d09f8f30 100644 --- a/tests/ext/request-replayer/dd_trace_agent_env.phpt +++ b/tests/ext/request-replayer/dd_trace_agent_env.phpt @@ -30,19 +30,10 @@ datadog.trace.agent_test_session_token=dd_trace_agent_env --FILE-- waitForDataAndReplay(); -usleep(floor(microtime(true) - $start) * 100000); - \DDTrace\close_span(); var_dump($span->env); From 57d4a861d01bb62e2032b3dd0324f0b813100d83 Mon Sep 17 00:00:00 2001 From: Alexandre Rulleau Date: Mon, 1 Jun 2026 16:22:56 +0200 Subject: [PATCH 09/13] test(replayer): instrument polling helpers with HTTP-status diagnostic on timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a waitFor* helper exhausts maxIteration, dump to stdout: - The session token in use - The HTTP status code returned by the replayer for that session - The HTTP status + body length when the same endpoint is queried without the session token This closes the visibility gap on flaky CI failures where the test output currently shows only a generic 'HTTP request failed!' warning followed by 'process timed out'. With the diagnostic, the next failing run will tell us whether the replayer returned 4xx/5xx, whether the expected data was present under a different session bucket, or whether the connection failed at the transport layer. Also suppresses the per-retry file_get_contents warning (the @-prefix) so flaky polling no longer pollutes test stdout with N warning lines during retries — only the final structured diagnostic appears on genuine timeouts. No-op on passing tests (the diagnostic path runs only just before throw). --- tests/ext/includes/request_replayer.inc | 48 +++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/tests/ext/includes/request_replayer.inc b/tests/ext/includes/request_replayer.inc index 764bb9d98de..b08e11426e2 100644 --- a/tests/ext/includes/request_replayer.inc +++ b/tests/ext/includes/request_replayer.inc @@ -57,6 +57,7 @@ class RequestReplayer $i = 0; do { if ($i++ == $this->maxIteration) { + $this->diagnosticDump('/replay', 'waitForRequest', $i - 1); throw new Exception("wait for replay timeout"); } usleep($this->flushInterval); @@ -77,6 +78,7 @@ class RequestReplayer $i = 0; do { if ($i++ == $this->maxIteration) { + $this->diagnosticDump('/replay-rc-requests', 'waitForRcRequest', $i - 1); throw new Exception("wait for replay timeout"); } usleep($this->flushInterval); @@ -97,6 +99,7 @@ class RequestReplayer $i = 0; do { if ($i++ == $this->maxIteration) { + $this->diagnosticDump('/replay', 'waitForDataAndReplay', $i - 1); throw new Exception("wait for replay timeout"); } usleep($this->flushInterval); @@ -117,7 +120,7 @@ class RequestReplayer public function replayAllRequests() { - return json_decode(file_get_contents($this->endpoint . '/replay', false, stream_context_create([ + return json_decode(@file_get_contents($this->endpoint . '/replay', false, stream_context_create([ "http" => [ "header" => "X-Datadog-Test-Session-Token: " . ini_get("datadog.trace.agent_test_session_token"), ], @@ -126,7 +129,7 @@ class RequestReplayer public function replayAllStats() { - return json_decode(file_get_contents($this->endpoint . '/replay-stats', false, stream_context_create([ + return json_decode(@file_get_contents($this->endpoint . '/replay-stats', false, stream_context_create([ "http" => [ "header" => "X-Datadog-Test-Session-Token: " . ini_get("datadog.trace.agent_test_session_token"), ], @@ -138,6 +141,7 @@ class RequestReplayer $i = 0; do { if ($i++ == $this->maxIteration) { + $this->diagnosticDump('/replay-stats', 'waitForStats', $i - 1); throw new Exception("wait for stats timeout"); } usleep($this->flushInterval); @@ -154,13 +158,51 @@ class RequestReplayer public function replayAllRcRequests() { - return json_decode(file_get_contents($this->endpoint . '/replay-rc-requests', false, stream_context_create([ + return json_decode(@file_get_contents($this->endpoint . '/replay-rc-requests', false, stream_context_create([ "http" => [ "header" => "X-Datadog-Test-Session-Token: " . ini_get("datadog.trace.agent_test_session_token"), ], ])), true); } + /** + * Parse the HTTP status code from the magic $http_response_header array. + * Returns null if the array is empty (e.g. connection refused / timeout / DNS fail). + */ + private static function parseStatus($headers) + { + if (is_array($headers) && !empty($headers) && preg_match('#HTTP/\S+\s+(\d+)#', $headers[0], $m)) { + return (int)$m[1]; + } + return null; + } + + /** + * Emit a diagnostic dump to stdout when a wait-loop times out. Captures the + * HTTP status code returned by the replayer for the test's session token, then + * cross-checks against the same endpoint without the token, so we can tell if + * the data we were waiting for landed in a different session bucket. + */ + private function diagnosticDump($path, $caller, $attempts) + { + $token = ini_get("datadog.trace.agent_test_session_token"); + $url = $this->endpoint . $path; + + $bodyToken = @file_get_contents($url, false, stream_context_create([ + "http" => ["header" => "X-Datadog-Test-Session-Token: $token"], + ])); + $statusToken = self::parseStatus($http_response_header ?? null); + $lenToken = is_string($bodyToken) ? strlen($bodyToken) : -1; + + $bodyNoToken = @file_get_contents($url, false, stream_context_create([])); + $statusNoToken = self::parseStatus($http_response_header ?? null); + $lenNoToken = is_string($bodyNoToken) ? strlen($bodyNoToken) : -1; + + echo "[RR-DIAG] $caller timeout after $attempts attempts; token=" . var_export($token, true) . "\n"; + echo "[RR-DIAG] $path with token: status=" . var_export($statusToken, true) . " body_len=$lenToken\n"; + echo "[RR-DIAG] $path without token: status=" . var_export($statusNoToken, true) . " body_len=$lenNoToken\n"; + } + public function clearDumpedData() { file_get_contents($this->endpoint . '/clear-dumped-data', false, stream_context_create([ From 2f7181f4b4d00f7710ae91df0ce52f540c6ce0ca Mon Sep 17 00:00:00 2001 From: Alexandre Rulleau Date: Mon, 1 Jun 2026 17:18:19 +0200 Subject: [PATCH 10/13] test(replayer): cap wait loops at 60s wall-clock so diagnostic fires before run-tests external kill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous instrumentation only emitted the diagnostic when the iteration count reached 500. Under valgrind each iteration is 10-20x slower than nominal, so the iteration-cap path may take 250s+ wall-clock — past run-tests.php's per-test external kill timer. The diagnostic never fires because the process is killed first. Add an explicit wall-clock budget (60s default) that aborts the wait and dumps the diagnostic well before the external kill. Pass elapsed seconds through to the diagnostic so we can see how much wall-time was actually spent waiting. --- tests/ext/includes/request_replayer.inc | 38 ++++++++++++++++++------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/tests/ext/includes/request_replayer.inc b/tests/ext/includes/request_replayer.inc index b08e11426e2..695afea5634 100644 --- a/tests/ext/includes/request_replayer.inc +++ b/tests/ext/includes/request_replayer.inc @@ -32,6 +32,14 @@ class RequestReplayer */ public $maxIteration; + /** + * @var float Wall-clock seconds before a wait loop gives up and emits diagnostic. + * Iteration-count alone is unreliable under valgrind where each iteration takes + * 10-20x longer than nominal; without a wall-clock cap the process is killed by + * run-tests.php's external per-test timeout before our diagnostic can fire. + */ + public $maxWallTimeSeconds; + public function __construct() { $this->endpoint = sprintf( @@ -45,6 +53,7 @@ class RequestReplayer : 50000; $this->maxIteration = 500; + $this->maxWallTimeSeconds = 60.0; } public function waitForFlush() @@ -55,9 +64,11 @@ class RequestReplayer public function waitForRequest($matcher) { $i = 0; + $start = microtime(true); do { - if ($i++ == $this->maxIteration) { - $this->diagnosticDump('/replay', 'waitForRequest', $i - 1); + $elapsed = microtime(true) - $start; + if ($i++ == $this->maxIteration || $elapsed > $this->maxWallTimeSeconds) { + $this->diagnosticDump('/replay', 'waitForRequest', $i - 1, $elapsed); throw new Exception("wait for replay timeout"); } usleep($this->flushInterval); @@ -76,9 +87,11 @@ class RequestReplayer public function waitForRcRequest($matcher) { $i = 0; + $start = microtime(true); do { - if ($i++ == $this->maxIteration) { - $this->diagnosticDump('/replay-rc-requests', 'waitForRcRequest', $i - 1); + $elapsed = microtime(true) - $start; + if ($i++ == $this->maxIteration || $elapsed > $this->maxWallTimeSeconds) { + $this->diagnosticDump('/replay-rc-requests', 'waitForRcRequest', $i - 1, $elapsed); throw new Exception("wait for replay timeout"); } usleep($this->flushInterval); @@ -97,9 +110,11 @@ class RequestReplayer public function waitForDataAndReplay($ignoreTelemetry = true) { $i = 0; + $start = microtime(true); do { - if ($i++ == $this->maxIteration) { - $this->diagnosticDump('/replay', 'waitForDataAndReplay', $i - 1); + $elapsed = microtime(true) - $start; + if ($i++ == $this->maxIteration || $elapsed > $this->maxWallTimeSeconds) { + $this->diagnosticDump('/replay', 'waitForDataAndReplay', $i - 1, $elapsed); throw new Exception("wait for replay timeout"); } usleep($this->flushInterval); @@ -139,9 +154,11 @@ class RequestReplayer public function waitForStats($matcher = null) { $i = 0; + $start = microtime(true); do { - if ($i++ == $this->maxIteration) { - $this->diagnosticDump('/replay-stats', 'waitForStats', $i - 1); + $elapsed = microtime(true) - $start; + if ($i++ == $this->maxIteration || $elapsed > $this->maxWallTimeSeconds) { + $this->diagnosticDump('/replay-stats', 'waitForStats', $i - 1, $elapsed); throw new Exception("wait for stats timeout"); } usleep($this->flushInterval); @@ -183,7 +200,7 @@ class RequestReplayer * cross-checks against the same endpoint without the token, so we can tell if * the data we were waiting for landed in a different session bucket. */ - private function diagnosticDump($path, $caller, $attempts) + private function diagnosticDump($path, $caller, $attempts, $elapsed = null) { $token = ini_get("datadog.trace.agent_test_session_token"); $url = $this->endpoint . $path; @@ -198,7 +215,8 @@ class RequestReplayer $statusNoToken = self::parseStatus($http_response_header ?? null); $lenNoToken = is_string($bodyNoToken) ? strlen($bodyNoToken) : -1; - echo "[RR-DIAG] $caller timeout after $attempts attempts; token=" . var_export($token, true) . "\n"; + $elapsedStr = $elapsed !== null ? sprintf(" elapsed=%.2fs", $elapsed) : ""; + echo "[RR-DIAG] $caller timeout after $attempts attempts$elapsedStr; token=" . var_export($token, true) . "\n"; echo "[RR-DIAG] $path with token: status=" . var_export($statusToken, true) . " body_len=$lenToken\n"; echo "[RR-DIAG] $path without token: status=" . var_export($statusNoToken, true) . " body_len=$lenNoToken\n"; } From 52cc25ac2e5b20bc9b9a0d9b0817c104b6e4c8f6 Mon Sep 17 00:00:00 2001 From: Alexandre Rulleau Date: Tue, 2 Jun 2026 12:18:37 +0200 Subject: [PATCH 11/13] test(replayer): enrich diagnostic with URI breakdown + per-call fetch_ms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The first diagnostic that fired on this PR revealed two open questions it couldn't answer: 1. Of the bytes the replayer returns for the test session, which URIs are present? (If the response is all telemetry, the test filter strips it and replayRequest() returns empty even though /replay returned 200 with a non-empty body.) 2. Are individual /replay calls slow, or is the slowness elsewhere in the loop? Extend the diagnostic to: - JSON-decode each /replay response and emit a URI=count summary, so we can tell whether the test's expected payload kind (e.g. /v0.4/traces) actually arrived. - Time each of the diagnostic's own /replay calls (fetch_ms) so we can compare against the wall-time the wait loop spent — if the diagnostic's own calls are fast (say <100ms) but the loop took 30s per attempt, the bottleneck is not the HTTP call. --- tests/ext/includes/request_replayer.inc | 39 +++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/tests/ext/includes/request_replayer.inc b/tests/ext/includes/request_replayer.inc index 695afea5634..90c6be71504 100644 --- a/tests/ext/includes/request_replayer.inc +++ b/tests/ext/includes/request_replayer.inc @@ -200,25 +200,60 @@ class RequestReplayer * cross-checks against the same endpoint without the token, so we can tell if * the data we were waiting for landed in a different session bucket. */ + /** + * Summarise a JSON-decoded /replay response by URI to spot data that's there + * but filtered out by the test's logic (e.g. telemetry vs trace). + */ + private static function summariseUris($body) + { + if (!is_string($body) || $body === '') { + return '(empty body)'; + } + $decoded = json_decode($body, true); + if (!is_array($decoded)) { + return '(non-array body)'; + } + if (count($decoded) === 0) { + return '(empty array)'; + } + $counts = []; + foreach ($decoded as $entry) { + $uri = is_array($entry) && isset($entry['uri']) ? $entry['uri'] : '(no uri)'; + $counts[$uri] = ($counts[$uri] ?? 0) + 1; + } + ksort($counts); + $parts = []; + foreach ($counts as $u => $c) { + $parts[] = "$u=$c"; + } + return implode(', ', $parts); + } + private function diagnosticDump($path, $caller, $attempts, $elapsed = null) { $token = ini_get("datadog.trace.agent_test_session_token"); $url = $this->endpoint . $path; + $t0 = microtime(true); $bodyToken = @file_get_contents($url, false, stream_context_create([ "http" => ["header" => "X-Datadog-Test-Session-Token: $token"], ])); + $tTokenMs = (microtime(true) - $t0) * 1000; $statusToken = self::parseStatus($http_response_header ?? null); $lenToken = is_string($bodyToken) ? strlen($bodyToken) : -1; + $urisToken = self::summariseUris($bodyToken); + $t1 = microtime(true); $bodyNoToken = @file_get_contents($url, false, stream_context_create([])); + $tNoTokenMs = (microtime(true) - $t1) * 1000; $statusNoToken = self::parseStatus($http_response_header ?? null); $lenNoToken = is_string($bodyNoToken) ? strlen($bodyNoToken) : -1; $elapsedStr = $elapsed !== null ? sprintf(" elapsed=%.2fs", $elapsed) : ""; echo "[RR-DIAG] $caller timeout after $attempts attempts$elapsedStr; token=" . var_export($token, true) . "\n"; - echo "[RR-DIAG] $path with token: status=" . var_export($statusToken, true) . " body_len=$lenToken\n"; - echo "[RR-DIAG] $path without token: status=" . var_export($statusNoToken, true) . " body_len=$lenNoToken\n"; + echo "[RR-DIAG] $path with token: status=" . var_export($statusToken, true) . " body_len=$lenToken fetch_ms=" . round($tTokenMs, 1) . "\n"; + echo "[RR-DIAG] $path with token uris: $urisToken\n"; + echo "[RR-DIAG] $path without token: status=" . var_export($statusNoToken, true) . " body_len=$lenNoToken fetch_ms=" . round($tNoTokenMs, 1) . "\n"; } public function clearDumpedData() From 488d8d80ddfce2a951bfbef9f23fd20292f7eacd Mon Sep 17 00:00:00 2001 From: Alexandre Rulleau Date: Tue, 2 Jun 2026 14:53:12 +0200 Subject: [PATCH 12/13] test(probe,filters): emit diagnostic when probe install + trace filters fail Two probes used to diagnose the remaining valgrind flake classes: 1. await_probe_installation (live_debugger.inc): add a 30s wall-clock cap alongside the 15000-iteration cap so this fires before run-tests external timeout under valgrind-stretched iterations. On incomplete install (loop exited without seeing enough hook IDs), dump get_loaded_remote_configs to show which RC configs ARE loaded on the extension side, plus elapsed wall time and remaining probe count. No-op for tests where the probe installs in time. 2. client_side_stats_trace_filters.phpt: before the 'in traces:' / 'in stats:' dumps, snapshot dd_trace_internal_fn('get_agent_info') to show whether the filter_tags / filter_tags_regex / ignore_resources rules from the PUT /set-agent-info in SKIPIF actually propagated to the extension. If they didn't, the trace filter regression has its root cause upstream of the filter-application code. Switches EXPECT -> EXPECTF so the agent_info JSON is matched with a %s wildcard regardless of contents (passing runs still pass). --- tests/ext/live-debugger/live_debugger.inc | 20 +++++++++++++++++++ .../client_side_stats_trace_filters.phpt | 19 +++++++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/tests/ext/live-debugger/live_debugger.inc b/tests/ext/live-debugger/live_debugger.inc index ee336d2d15d..c8ff901b63d 100644 --- a/tests/ext/live-debugger/live_debugger.inc +++ b/tests/ext/live-debugger/live_debugger.inc @@ -125,8 +125,16 @@ function await_probe_installation($trigger, $num = 1) { $last_id = \DDtrace\install_hook("_dummy", function () {}); $ret = $trigger(); + $expected = $num; + $start = microtime(true); + // Wall-clock cap alongside iteration cap so this fires before run-tests.php's + // external per-test timeout under valgrind-stretched iterations. + $maxWallSeconds = 30.0; for ($i = 0; $i < 15000; ++$i) { + if (microtime(true) - $start > $maxWallSeconds) { + break; + } usleep(1000); $id = \DDtrace\install_hook("_dummy", function () {}); @@ -139,5 +147,17 @@ function await_probe_installation($trigger, $num = 1) { $last_id = $id; } + // Probe install incomplete after the budget. Emit a diagnostic dump showing + // which RC configs (if any) are loaded extension-side and how long we waited. + $elapsed = microtime(true) - $start; + $loaded = null; + try { + $loaded = dd_trace_internal_fn('get_loaded_remote_configs'); + } catch (\Throwable $e) { + $loaded = "(get_loaded_remote_configs threw: " . $e->getMessage() . ")"; + } + echo "[PROBE-DIAG] await_probe_installation incomplete: iters=$i elapsed=" . round($elapsed, 2) . "s expected_num=$expected remaining_num=$num\n"; + echo "[PROBE-DIAG] loaded_remote_configs=" . (is_array($loaded) ? json_encode($loaded) : var_export($loaded, true)) . "\n"; + return $ret; } \ No newline at end of file diff --git a/tests/ext/request-replayer/client_side_stats_trace_filters.phpt b/tests/ext/request-replayer/client_side_stats_trace_filters.phpt index 25f2b3b1261..5ec1fbf5f19 100644 --- a/tests/ext/request-replayer/client_side_stats_trace_filters.phpt +++ b/tests/ext/request-replayer/client_side_stats_trace_filters.phpt @@ -157,6 +157,22 @@ foreach ($secondFlushTraces as $req) { } } } +// Diagnostic dump: snapshot the agent_info as seen extension-side. If the filter +// rules aren't present here, the sidecar never propagated them to the extension +// (regardless of what the agent_info /info endpoint returned). +try { + $ai = dd_trace_internal_fn('get_agent_info'); + $aiSnap = is_array($ai) ? [ + 'filter_tags' => $ai['filter_tags'] ?? null, + 'filter_tags_regex' => $ai['filter_tags_regex'] ?? null, + 'ignore_resources' => $ai['ignore_resources'] ?? null, + 'client_drop_p0s' => $ai['client_drop_p0s'] ?? null, + ] : $ai; + echo "[FILTER-DIAG] agent_info_view: " . json_encode($aiSnap) . "\n"; +} catch (\Throwable $e) { + echo "[FILTER-DIAG] agent_info_view: (threw: " . $e->getMessage() . ")\n"; +} + ksort($namesInTraces); foreach (array_keys($namesInTraces) as $n) { echo "in traces: $n\n"; @@ -200,6 +216,7 @@ if (empty($ops)) { } } ?> ---EXPECT-- +--EXPECTF-- +[FILTER-DIAG] agent_info_view: %s in traces: op.pass in stats: op.pass From 16889e93dc4bf97eea3e806483c28ecf68ac4030 Mon Sep 17 00:00:00 2001 From: Alexandre Rulleau Date: Tue, 2 Jun 2026 16:34:16 +0200 Subject: [PATCH 13/13] test(replayer,probe): remove diagnostic stdout emissions to stop polluting EXPECT output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PROBE-DIAG/RR-DIAG/FILTER-DIAG dumps were added to investigate flaky valgrind failures. They served their purpose (identified replayer session-token isolation leaks and probe-install regressions) but a false-positive in the await_probe iteration heuristic flipped 2 otherwise-passing tests to failing (debugger_metric_probe on 7.4, debugger_log_probe_process_tags on 8.1). PHPT merges stderr into stdout, so routing diagnostics to stderr would not help. Keep the wall-clock cap added in 2f7181f4b — that prevents run-tests.php's external per-test kill from firing before our throw on valgrind-stretched runs, which was the real fix. --- tests/ext/includes/request_replayer.inc | 90 +------------------ tests/ext/live-debugger/live_debugger.inc | 13 --- .../client_side_stats_trace_filters.phpt | 19 +--- 3 files changed, 5 insertions(+), 117 deletions(-) diff --git a/tests/ext/includes/request_replayer.inc b/tests/ext/includes/request_replayer.inc index 90c6be71504..0b0e46ce2e2 100644 --- a/tests/ext/includes/request_replayer.inc +++ b/tests/ext/includes/request_replayer.inc @@ -66,9 +66,7 @@ class RequestReplayer $i = 0; $start = microtime(true); do { - $elapsed = microtime(true) - $start; - if ($i++ == $this->maxIteration || $elapsed > $this->maxWallTimeSeconds) { - $this->diagnosticDump('/replay', 'waitForRequest', $i - 1, $elapsed); + if ($i++ == $this->maxIteration || microtime(true) - $start > $this->maxWallTimeSeconds) { throw new Exception("wait for replay timeout"); } usleep($this->flushInterval); @@ -89,9 +87,7 @@ class RequestReplayer $i = 0; $start = microtime(true); do { - $elapsed = microtime(true) - $start; - if ($i++ == $this->maxIteration || $elapsed > $this->maxWallTimeSeconds) { - $this->diagnosticDump('/replay-rc-requests', 'waitForRcRequest', $i - 1, $elapsed); + if ($i++ == $this->maxIteration || microtime(true) - $start > $this->maxWallTimeSeconds) { throw new Exception("wait for replay timeout"); } usleep($this->flushInterval); @@ -112,9 +108,7 @@ class RequestReplayer $i = 0; $start = microtime(true); do { - $elapsed = microtime(true) - $start; - if ($i++ == $this->maxIteration || $elapsed > $this->maxWallTimeSeconds) { - $this->diagnosticDump('/replay', 'waitForDataAndReplay', $i - 1, $elapsed); + if ($i++ == $this->maxIteration || microtime(true) - $start > $this->maxWallTimeSeconds) { throw new Exception("wait for replay timeout"); } usleep($this->flushInterval); @@ -156,9 +150,7 @@ class RequestReplayer $i = 0; $start = microtime(true); do { - $elapsed = microtime(true) - $start; - if ($i++ == $this->maxIteration || $elapsed > $this->maxWallTimeSeconds) { - $this->diagnosticDump('/replay-stats', 'waitForStats', $i - 1, $elapsed); + if ($i++ == $this->maxIteration || microtime(true) - $start > $this->maxWallTimeSeconds) { throw new Exception("wait for stats timeout"); } usleep($this->flushInterval); @@ -182,80 +174,6 @@ class RequestReplayer ])), true); } - /** - * Parse the HTTP status code from the magic $http_response_header array. - * Returns null if the array is empty (e.g. connection refused / timeout / DNS fail). - */ - private static function parseStatus($headers) - { - if (is_array($headers) && !empty($headers) && preg_match('#HTTP/\S+\s+(\d+)#', $headers[0], $m)) { - return (int)$m[1]; - } - return null; - } - - /** - * Emit a diagnostic dump to stdout when a wait-loop times out. Captures the - * HTTP status code returned by the replayer for the test's session token, then - * cross-checks against the same endpoint without the token, so we can tell if - * the data we were waiting for landed in a different session bucket. - */ - /** - * Summarise a JSON-decoded /replay response by URI to spot data that's there - * but filtered out by the test's logic (e.g. telemetry vs trace). - */ - private static function summariseUris($body) - { - if (!is_string($body) || $body === '') { - return '(empty body)'; - } - $decoded = json_decode($body, true); - if (!is_array($decoded)) { - return '(non-array body)'; - } - if (count($decoded) === 0) { - return '(empty array)'; - } - $counts = []; - foreach ($decoded as $entry) { - $uri = is_array($entry) && isset($entry['uri']) ? $entry['uri'] : '(no uri)'; - $counts[$uri] = ($counts[$uri] ?? 0) + 1; - } - ksort($counts); - $parts = []; - foreach ($counts as $u => $c) { - $parts[] = "$u=$c"; - } - return implode(', ', $parts); - } - - private function diagnosticDump($path, $caller, $attempts, $elapsed = null) - { - $token = ini_get("datadog.trace.agent_test_session_token"); - $url = $this->endpoint . $path; - - $t0 = microtime(true); - $bodyToken = @file_get_contents($url, false, stream_context_create([ - "http" => ["header" => "X-Datadog-Test-Session-Token: $token"], - ])); - $tTokenMs = (microtime(true) - $t0) * 1000; - $statusToken = self::parseStatus($http_response_header ?? null); - $lenToken = is_string($bodyToken) ? strlen($bodyToken) : -1; - $urisToken = self::summariseUris($bodyToken); - - $t1 = microtime(true); - $bodyNoToken = @file_get_contents($url, false, stream_context_create([])); - $tNoTokenMs = (microtime(true) - $t1) * 1000; - $statusNoToken = self::parseStatus($http_response_header ?? null); - $lenNoToken = is_string($bodyNoToken) ? strlen($bodyNoToken) : -1; - - $elapsedStr = $elapsed !== null ? sprintf(" elapsed=%.2fs", $elapsed) : ""; - echo "[RR-DIAG] $caller timeout after $attempts attempts$elapsedStr; token=" . var_export($token, true) . "\n"; - echo "[RR-DIAG] $path with token: status=" . var_export($statusToken, true) . " body_len=$lenToken fetch_ms=" . round($tTokenMs, 1) . "\n"; - echo "[RR-DIAG] $path with token uris: $urisToken\n"; - echo "[RR-DIAG] $path without token: status=" . var_export($statusNoToken, true) . " body_len=$lenNoToken fetch_ms=" . round($tNoTokenMs, 1) . "\n"; - } - public function clearDumpedData() { file_get_contents($this->endpoint . '/clear-dumped-data', false, stream_context_create([ diff --git a/tests/ext/live-debugger/live_debugger.inc b/tests/ext/live-debugger/live_debugger.inc index c8ff901b63d..14c306434aa 100644 --- a/tests/ext/live-debugger/live_debugger.inc +++ b/tests/ext/live-debugger/live_debugger.inc @@ -125,7 +125,6 @@ function await_probe_installation($trigger, $num = 1) { $last_id = \DDtrace\install_hook("_dummy", function () {}); $ret = $trigger(); - $expected = $num; $start = microtime(true); // Wall-clock cap alongside iteration cap so this fires before run-tests.php's // external per-test timeout under valgrind-stretched iterations. @@ -147,17 +146,5 @@ function await_probe_installation($trigger, $num = 1) { $last_id = $id; } - // Probe install incomplete after the budget. Emit a diagnostic dump showing - // which RC configs (if any) are loaded extension-side and how long we waited. - $elapsed = microtime(true) - $start; - $loaded = null; - try { - $loaded = dd_trace_internal_fn('get_loaded_remote_configs'); - } catch (\Throwable $e) { - $loaded = "(get_loaded_remote_configs threw: " . $e->getMessage() . ")"; - } - echo "[PROBE-DIAG] await_probe_installation incomplete: iters=$i elapsed=" . round($elapsed, 2) . "s expected_num=$expected remaining_num=$num\n"; - echo "[PROBE-DIAG] loaded_remote_configs=" . (is_array($loaded) ? json_encode($loaded) : var_export($loaded, true)) . "\n"; - return $ret; } \ No newline at end of file diff --git a/tests/ext/request-replayer/client_side_stats_trace_filters.phpt b/tests/ext/request-replayer/client_side_stats_trace_filters.phpt index 5ec1fbf5f19..25f2b3b1261 100644 --- a/tests/ext/request-replayer/client_side_stats_trace_filters.phpt +++ b/tests/ext/request-replayer/client_side_stats_trace_filters.phpt @@ -157,22 +157,6 @@ foreach ($secondFlushTraces as $req) { } } } -// Diagnostic dump: snapshot the agent_info as seen extension-side. If the filter -// rules aren't present here, the sidecar never propagated them to the extension -// (regardless of what the agent_info /info endpoint returned). -try { - $ai = dd_trace_internal_fn('get_agent_info'); - $aiSnap = is_array($ai) ? [ - 'filter_tags' => $ai['filter_tags'] ?? null, - 'filter_tags_regex' => $ai['filter_tags_regex'] ?? null, - 'ignore_resources' => $ai['ignore_resources'] ?? null, - 'client_drop_p0s' => $ai['client_drop_p0s'] ?? null, - ] : $ai; - echo "[FILTER-DIAG] agent_info_view: " . json_encode($aiSnap) . "\n"; -} catch (\Throwable $e) { - echo "[FILTER-DIAG] agent_info_view: (threw: " . $e->getMessage() . ")\n"; -} - ksort($namesInTraces); foreach (array_keys($namesInTraces) as $n) { echo "in traces: $n\n"; @@ -216,7 +200,6 @@ if (empty($ops)) { } } ?> ---EXPECTF-- -[FILTER-DIAG] agent_info_view: %s +--EXPECT-- in traces: op.pass in stats: op.pass