Skip to content

Commit 83e15cc

Browse files
committed
Increase and stabilize test timeouts and waits
Raise various test timeouts and replace fragile polling with explicit waits to reduce CI flakiness.

Changes include:
- backend: increase integration/misc test timeouts (user consume test 10s→20s, runtime metrics 5s→10s) to accommodate shared-server startup delays under CI.
- link/kalam-client: widen AUTO_SERVER_READY_TIMEOUT (15s→30s) and AUTO_SERVER_HTTP_TIMEOUT (10s→15s), and add AUTO_SERVER_BLOCKING_TIMEOUT (60s), which replaces the hard-coded 20s blocking waits for server startup and token waits.
- proxied tests/helpers: increase RECONNECT_WAIT_TIMEOUT (15s→30s), connection_timeout (10s→15s), and the subscribe/auth handshake budgets (5s→10s) to tolerate slower CI runs; raise the overall test timeout for subscribe_during_reconnect (15s→45s).
- double_outage test: replace manual polling loops with explicit wait helpers (wait_for_active_connections / wait_for_reconnect) and add an assert to ensure the client begins reconnecting before the second outage.

These changes aim to make the tests more robust against intermittent delays in CI and to reduce spurious failures.
1 parent 1472c22 commit 83e15cc

6 files changed

Lines changed: 32 additions & 27 deletions

File tree

backend/tests/integration_tests/topic_pubsub.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -320,7 +320,7 @@ async fn test_drop_topic() {
320320

321321
/// Test that user role is forbidden from consuming topics
322322
#[tokio::test]
323-
#[ntest::timeout(10000)]
323+
#[ntest::timeout(20000)]
324324
async fn test_consume_user_role_forbidden() {
325325
let server = TestServer::new_shared().await;
326326

backend/tests/misc/system/test_runtime_metrics.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use kalam_client::models::ResponseStatus;
33
use serial_test::serial;
44

55
#[tokio::test]
6-
#[ntest::timeout(5000)] // healthy shared-server startup + query now lands around 3.1s
6+
#[ntest::timeout(10000)] // local runs are fast, but shared-server startup can spike under CI load
77
#[serial(memory_metrics)]
88
async fn test_system_stats_expose_memory_breakdown_and_allocator_metrics() {
99
let server = TestServer::new_shared().await;

link/kalam-client/tests/common/mod.rs

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,13 @@ struct AutoTestServer {
2929
_running: RunningTestHttpServer,
3030
}
3131

32-
const AUTO_SERVER_READY_TIMEOUT: Duration = Duration::from_secs(15);
32+
// Full-suite CI can delay isolated bootstrap and login well past 20s even when
33+
// the server eventually becomes healthy, so keep the blocking bridge wider than
34+
// the HTTP polling window used inside the helper itself.
35+
const AUTO_SERVER_READY_TIMEOUT: Duration = Duration::from_secs(30);
3336
const AUTO_SERVER_RETRY_INTERVAL: Duration = Duration::from_millis(100);
34-
const AUTO_SERVER_HTTP_TIMEOUT: Duration = Duration::from_secs(10);
37+
const AUTO_SERVER_HTTP_TIMEOUT: Duration = Duration::from_secs(15);
38+
const AUTO_SERVER_BLOCKING_TIMEOUT: Duration = Duration::from_secs(60);
3539

3640
fn should_auto_start_test_server() -> bool {
3741
if std::env::var("KALAMDB_SERVER_URL").is_ok() {
@@ -74,7 +78,7 @@ fn ensure_auto_test_server(
7478
let _ = tx.send(result);
7579
});
7680

77-
match rx.recv_timeout(Duration::from_secs(20)) {
81+
match rx.recv_timeout(AUTO_SERVER_BLOCKING_TIMEOUT) {
7882
Ok(result) => result,
7983
Err(err) => Err(format!("Timed out starting test server: {}", err)),
8084
}
@@ -307,7 +311,7 @@ fn root_access_token_blocking_for_base_url(
307311
let _ = tx.send(result);
308312
});
309313

310-
match rx.recv_timeout(Duration::from_secs(20)) {
314+
match rx.recv_timeout(AUTO_SERVER_BLOCKING_TIMEOUT) {
311315
Ok(Ok(token)) => return Ok(token),
312316
Ok(Err(err)) => return Err(err.into()),
313317
Err(err) => return Err(err.to_string().into()),

link/kalam-client/tests/proxied/double_outage.rs

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -91,12 +91,12 @@ async fn test_proxy_server_down_while_reconnecting() {
9191
// Briefly resume the proxy so the client starts its reconnect attempt,
9292
// then kill it again immediately.
9393
proxy.simulate_server_up();
94-
for _ in 0..20 {
95-
if connect_count.load(Ordering::SeqCst) >= 2 || proxy.active_count().await >= 1 {
96-
break;
97-
}
98-
sleep(Duration::from_millis(100)).await;
99-
}
94+
assert!(
95+
proxy
96+
.wait_for_active_connections(1, Duration::from_secs(10))
97+
.await,
98+
"client should begin reconnecting before the second outage"
99+
);
100100

101101
// ── Second outage while reconnecting ────────────────────────────────
102102
let dc2 = disconnect_count.load(Ordering::SeqCst);
@@ -126,13 +126,13 @@ async fn test_proxy_server_down_while_reconnecting() {
126126
let expected_connects = connect_count.load(Ordering::SeqCst) + 1;
127127
proxy.simulate_server_up();
128128

129-
for _ in 0..100 {
130-
if connect_count.load(Ordering::SeqCst) >= expected_connects && client.is_connected().await
131-
{
132-
break;
133-
}
134-
sleep(Duration::from_millis(100)).await;
135-
}
129+
wait_for_reconnect(
130+
&client,
131+
&connect_count,
132+
expected_connects,
133+
"double outage final recovery",
134+
)
135+
.await;
136136
assert!(client.is_connected().await, "client should recover after double outage");
137137

138138
let mut resumed_ids = Vec::<String>::new();

link/kalam-client/tests/proxied/helpers.rs

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ use std::time::Duration;
1414
use tokio::time::{sleep, Instant};
1515

1616
pub const TEST_TIMEOUT: Duration = Duration::from_secs(10);
17-
pub const RECONNECT_WAIT_TIMEOUT: Duration = Duration::from_secs(15);
17+
pub const RECONNECT_WAIT_TIMEOUT: Duration = Duration::from_secs(30);
1818

1919
fn reconnect_test_timeouts() -> KalamLinkTimeouts {
2020
KalamLinkTimeouts {
@@ -23,14 +23,15 @@ fn reconnect_test_timeouts() -> KalamLinkTimeouts {
2323
// suite) do not time out before the in-process isolated server finishes
2424
// the WebSocket handshake. When the TCP proxy is paused, connections
2525
// fail immediately regardless of this value.
26-
connection_timeout: Duration::from_secs(10),
26+
connection_timeout: Duration::from_secs(15),
2727
receive_timeout: Duration::from_secs(5),
2828
send_timeout: Duration::from_secs(2),
29-
// Reconnect involves auth + resubscribe handshakes. Keep these high
30-
// enough for a loaded debug build, but do not inflate the overall test
31-
// wall-clock timeout; the outer test timeout remains the guardrail.
32-
subscribe_timeout: Duration::from_secs(5),
33-
auth_timeout: Duration::from_secs(5),
29+
// Reconnect involves auth + resubscribe handshakes. CI full-suite runs
30+
// regularly push the isolated server beyond 5s here, so keep the
31+
// handshake budget above that while the outer test timeout remains the
32+
// main guardrail.
33+
subscribe_timeout: Duration::from_secs(10),
34+
auth_timeout: Duration::from_secs(10),
3435
initial_data_timeout: Duration::from_secs(30),
3536
idle_timeout: Duration::ZERO,
3637
keepalive_interval: Duration::from_secs(1),

link/kalam-client/tests/proxied/subscribe_during_reconnect.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ use tokio::time::{sleep, timeout};
99
/// outage. The new subscription must eventually be established and deliver its
1010
/// data once the connection stabilises.
1111
#[tokio::test]
12-
#[ntest::timeout(15000)]
12+
#[ntest::timeout(45000)]
1313
async fn test_subscribe_during_reconnect_eventually_delivers() {
1414
let result = timeout(Duration::from_secs(60), async {
1515
let writer = match create_test_client() {

0 commit comments

Comments
 (0)