From dc7c8fda299e16903a38d70da25725872bd95a48 Mon Sep 17 00:00:00 2001
From: Tin Dang <tindang.ht97@gmail.com>
Date: Mon, 18 May 2026 09:06:07 +0700
Subject: [PATCH 1/3] fix(persistence): gate multi-part AOF recovery to
 runtime-monoio
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Under runtime-tokio, BGREWRITEAOF writes a single-file appendonly.aof
with an RDB preamble (legacy v2 format) — it never advances the AOF
manifest or produces moon.aof.<seq>.base.rdb files. But main.rs ran
the multi-part AOF block unconditionally, which:

  1. On first boot under tokio, called AofManifest::initialize() to
     create an empty moon.aof.manifest.
  2. On the next boot, the loader saw the manifest, wiped the freshly
     v2-loaded databases (db.clear()), then tried replay_multi_part
     which found no base.rdb and silently produced 0 entries.

Net effect: every SET written under tokio was lost on restart, even
after BGREWRITEAOF. This is an ACID-04 / ACID-11 violation that
test_txn_commit_wal_crash_recovery has been catching as a hang +
unexpected None on GET. Reproducer (any non-TXN write also fails):

  ./moon --port 16399 --shards 1 --dir /tmp/t --appendonly yes &
  redis-cli -p 16399 SET k v
  redis-cli -p 16399 BGREWRITEAOF
  kill -9 $!
  ./moon --port 16399 --shards 1 --dir /tmp/t --appendonly yes &
  redis-cli -p 16399 GET k   # → (nil) before fix, "v" after fix

Fix: wrap the entire multi-part AOF replay/initialize block in
`#[cfg(feature = "runtime-monoio")]`. Under tokio, v2 single-file
recovery owns the AOF path (`appendonly.aof` with RDB preamble). The
monoio path is unchanged.

Add a runtime-tokio-only warn that surfaces when an operator has
switched from monoio: a moon.aof.manifest exists on disk but is now
ignored, with guidance to switch back to monoio to load it. This
keeps data unreachable rather than silently overwritten.

Out of scope: monoio-side first-boot path where manifest exists with
empty base.rdb but non-empty incr produces "AOF base RDB missing ...
incr is N bytes; refusing to replay incr against empty state". That
is a separate pre-existing bug surfaced by the same reproducer under
monoio and will be fixed in a follow-up.

author: Tin Dang
---
 src/main.rs | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
diff --git a/src/main.rs b/src/main.rs
index 8ec268fb..5791c5bf 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -431,6 +431,14 @@ fn main() -> anyhow::Result<()> {
     //
     // A corrupt manifest is FATAL: overwriting it silently destroys the reference
     // to the real base RDB and loses all persisted data.
+    //
+    // Gated to runtime-monoio: BGREWRITEAOF under runtime-tokio writes a
+    // single-file appendonly.aof with RDB preamble (legacy v2 format) and
+    // never advances the manifest. Engaging this block under tokio creates an
+    // empty manifest at first boot, then on the next boot wipes v2-loaded
+    // state because the multi-part replay finds no base RDB. This caused the
+    // tokio TXN replay regression that surfaced via test_txn_commit_wal_crash_recovery.
+    #[cfg(feature = "runtime-monoio")]
     if config.appendonly == "yes"
         && let Some(ref dir) = persistence_dir
     {
@@ -523,6 +531,26 @@ fn main() -> anyhow::Result<()> {
         }
     }
 
+    // Under tokio, multi-part AOF is not supported. If a manifest exists, the
+    // operator likely switched from monoio — warn so they don't think their
+    // data is silently corrupted. v2 recovery (single-file appendonly.aof)
+    // remains active.
+    #[cfg(not(feature = "runtime-monoio"))]
+    if config.appendonly == "yes"
+        && let Some(ref dir) = persistence_dir
+    {
+        let manifest_path =
+            std::path::PathBuf::from(dir).join("appendonlydir/moon.aof.manifest");
+        if manifest_path.exists() {
+            tracing::warn!(
+                "multi-part AOF manifest found at {} but runtime is tokio; ignoring. \
+                 Switch to monoio (cargo run --no-default-features --features runtime-monoio,jemalloc) \
+                 to load multi-part AOF data.",
+                manifest_path.display()
+            );
+        }
+    }
+
     // Extract databases from all shards and wrap in ShardDatabases
     let all_dbs: Vec<Vec<moon::storage::Database>> = shards
         .iter_mut()

From b0196d19c14123c6559a620b74d535c17e48310c Mon Sep 17 00:00:00 2001
From: Tin Dang <tindang.ht97@gmail.com>
Date: Mon, 18 May 2026 09:08:49 +0700
Subject: [PATCH 2/3] test(txn_kv_wiring): make AOF poll runtime-agnostic +
 bound handshake
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

test_txn_commit_wal_crash_recovery was failing on every CI run because
its post-BGREWRITEAOF wait polled for `appendonlydir/*.base.rdb`, the
multi-part-AOF artifact only produced by runtime-monoio. CI runs under
`--no-default-features --features runtime-tokio,jemalloc`, where
BGREWRITEAOF writes a single-file `appendonly.aof` with RDB preamble
and never produces a base.rdb. Result: the test always hit the
10-second timeout asserting "BGREWRITEAOF did not create a base RDB
file" before reaching the real recovery assertion.

The companion `get_connection()` calls were also unbounded — when the
shard accept loop lagged the TCP bind, the redis-rs handshake could
block indefinitely (the macOS "Connection reset by peer" failure mode
that the previous push tried to deflake by churning wait_for_server).

Two minimal changes:

1. Poll for either `appendonlydir/moon.aof.<seq>.base.rdb` (monoio) OR
   a non-empty `appendonly.aof` (tokio). Bump the deadline from 10s to
   15s for CI runner variance. Update the assertion message to name
   both candidate paths so a future failure points at the right file.

2. Replace `client.get_connection()` with
   `client.get_connection_with_timeout(10s)` at both phase-1 and
   phase-2 connect sites. The handshake now fails fast instead of
   hanging the CI runner for 15 minutes.

Companion to the runtime-monoio gate on multi-part AOF in main.rs
(previous commit): together those two changes restore test_txn_commit_wal_crash_recovery
to green on tokio, which is the CI configuration.

author: Tin Dang
---
 src/main.rs            |  3 +--
 tests/txn_kv_wiring.rs | 39 +++++++++++++++++++++++++++++----------
 2 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 5791c5bf..18c9990f 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -539,8 +539,7 @@ fn main() -> anyhow::Result<()> {
     if config.appendonly == "yes"
         && let Some(ref dir) = persistence_dir
     {
-        let manifest_path =
-            std::path::PathBuf::from(dir).join("appendonlydir/moon.aof.manifest");
+        let manifest_path = std::path::PathBuf::from(dir).join("appendonlydir/moon.aof.manifest");
         if manifest_path.exists() {
             tracing::warn!(
                 "multi-part AOF manifest found at {} but runtime is tokio; ignoring. \
diff --git a/tests/txn_kv_wiring.rs b/tests/txn_kv_wiring.rs
index e26574e4..07c2e749 100644
--- a/tests/txn_kv_wiring.rs
+++ b/tests/txn_kv_wiring.rs
@@ -944,7 +944,13 @@ async fn test_txn_commit_wal_crash_recovery() {
 
     // Connect with sync redis client to avoid holding an async runtime across process boundaries.
     let client1 = redis::Client::open(format!("redis://127.0.0.1:{port1}")).unwrap();
-    let mut sync_conn1 = client1.get_connection().expect("connect to phase-1 server");
+    // Bound the RESP handshake: wait_for_server only proves TCP bind. The
+    // shard accept loop can lag the bind by a small window, and a bare
+    // get_connection() blocks indefinitely on macOS CI when the first
+    // RESP exchange races the bind ("Connection reset by peer").
+    let mut sync_conn1 = client1
+        .get_connection_with_timeout(std::time::Duration::from_secs(10))
+        .expect("connect to phase-1 server");
 
     // Non-TXN baseline (verifies plain AOF replay too)
     let _: String = redis::cmd("SET")
@@ -980,13 +986,20 @@ async fn test_txn_commit_wal_crash_recovery() {
         "BGREWRITEAOF should start background rewrite: {bgrw}"
     );
 
-    // Wait for BGREWRITEAOF to complete by polling for the base RDB file.
-    // The file is named moon.aof.<seq>.base.rdb inside the appendonlydir.
+    // Wait for BGREWRITEAOF to produce a non-empty AOF artifact.
+    //
+    // Under runtime-monoio multi-part AOF, this is
+    // `appendonlydir/moon.aof.<seq>.base.rdb`.
+    // Under runtime-tokio single-file AOF, this is `appendonly.aof`
+    // (the rewrite re-writes it with an RDB preamble in-place).
+    //
+    // Poll for either so the test stays runtime-agnostic.
     let aof_dir = tmp_dir.join("appendonlydir");
-    let base_rdb_exists = {
-        let deadline = std::time::Instant::now() + std::time::Duration::from_secs(10);
+    let single_file_aof = tmp_dir.join("appendonly.aof");
+    let aof_artifact_ready = {
+        let deadline = std::time::Instant::now() + std::time::Duration::from_secs(15);
         loop {
-            let found = std::fs::read_dir(&aof_dir)
+            let base_rdb = std::fs::read_dir(&aof_dir)
                 .ok()
                 .and_then(|mut d| {
                     d.find(|e| {
@@ -998,7 +1011,10 @@ async fn test_txn_commit_wal_crash_recovery() {
                     })
                 })
                 .is_some();
-            if found {
+            let single = std::fs::metadata(&single_file_aof)
+                .map(|m| m.len() > 0)
+                .unwrap_or(false);
+            if base_rdb || single {
                 break true;
             }
             if std::time::Instant::now() >= deadline {
@@ -1008,8 +1024,9 @@ async fn test_txn_commit_wal_crash_recovery() {
         }
     };
     assert!(
-        base_rdb_exists,
-        "BGREWRITEAOF did not create a base RDB file within 10s in {aof_dir:?}"
+        aof_artifact_ready,
+        "BGREWRITEAOF did not produce an AOF artifact within 15s \
+         (checked multi-part {aof_dir:?} and single-file {single_file_aof:?})"
     );
 
     // Kill server 1.
@@ -1041,7 +1058,9 @@ async fn test_txn_commit_wal_crash_recovery() {
     );
 
     let client2 = redis::Client::open(format!("redis://127.0.0.1:{port2}")).unwrap();
-    let mut sync_conn2 = client2.get_connection().expect("connect to phase-2 server");
+    let mut sync_conn2 = client2
+        .get_connection_with_timeout(std::time::Duration::from_secs(10))
+        .expect("connect to phase-2 server");
 
     // Verify TXN committed value survived server restart via WAL replay.
     let recovered: Option<String> = redis::cmd("GET")

From 4425632c308f3189c9affa5c7e37458135d98447 Mon Sep 17 00:00:00 2001
From: Tin Dang <tindang.ht97@gmail.com>
Date: Mon, 18 May 2026 14:51:45 +0700
Subject: [PATCH 3/3] test(txn_kv_wiring): retry post-bind RESP handshake +
 CHANGELOG entry
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`test_txn_commit_wal_crash_recovery` was failing on both Linux Check
(EAGAIN, os error 11) and macOS Check (Connection reset by peer, os error
54) at the redis-rs handshake. `wait_for_server` only proves TCP bind;
the shard accept loop and RESP handler can lag the bind by a small
window, during which the first RESP exchange races and fails.

Replace the single `get_connection_with_timeout(10s)` attempt at both
phase-1 and phase-2 connect sites with a `connect_redis_with_retry`
helper that polls 1s attempts with 200ms backoff for up to 15s, then
panics with the last captured error.

Also add the PR #96 CHANGELOG entry under `[0.2.0-alpha] — Unreleased`
to satisfy the Lint gate's `CHANGELOG.md not updated` check.

author: Tin Dang
---
 CHANGELOG.md           | 14 ++++++++++++++
 tests/txn_kv_wiring.rs | 40 ++++++++++++++++++++++++++++++----------
 2 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3297276a..98c25e16 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -61,6 +61,20 @@ See `docs/guides/cdc.md` for consumer integration.
   graph/temporal/workspace/MQ WAL replay, SO_REUSEPORT, NUMA pinning, and
   cancel-driven graceful shutdown.
 
+### Fixed — PR #96 test deflake + tokio AOF replay
+
+- `main.rs` AOF recovery: gated the multi-part AOF manifest replay block to
+  `#[cfg(feature = "runtime-monoio")]`. Under tokio, the legacy single-file
+  `appendonly.aof` is loaded via the v2 recovery chain; the multi-part loader
+  no longer creates an empty manifest at first boot that wiped v2-loaded
+  state on the next restart (every tokio SET was lost on restart).
+- `tests/txn_kv_wiring.rs`: made `test_txn_commit_wal_crash_recovery`
+  runtime-agnostic by polling for either the monoio multi-part `.base.rdb`
+  artifact or the tokio single-file `appendonly.aof`. Added
+  `connect_redis_with_retry` helper to bound and retry the post-bind RESP
+  handshake (was racing the shard accept loop, surfacing as EAGAIN on Linux
+  and ECONNRESET on macOS CI).
+
 ### Fixed — PR #95 review hardening
 
 - `main.rs` `malloc_conf` symbol: replaced the union-based unsafe pun with
diff --git a/tests/txn_kv_wiring.rs b/tests/txn_kv_wiring.rs
index 07c2e749..2de881c8 100644
--- a/tests/txn_kv_wiring.rs
+++ b/tests/txn_kv_wiring.rs
@@ -878,6 +878,34 @@ fn find_moon_binary() -> Option<std::path::PathBuf> {
     None
 }
 
+/// Open a redis-rs sync connection, retrying through the post-bind handshake race.
+///
+/// `wait_for_server` only proves TCP bind. The shard accept loop and RESP handler
+/// can lag the bind by a small window, during which the first RESP exchange may
+/// fail with EAGAIN (Linux) or ECONNRESET (macOS). Retry with short backoff
+/// instead of letting a single fast attempt panic.
+fn connect_redis_with_retry(client: &redis::Client, label: &str) -> redis::Connection {
+    let deadline = std::time::Instant::now() + std::time::Duration::from_secs(15);
+    #[allow(unused_assignments)]
+    let mut last_err: Option<redis::RedisError> = None;
+    loop {
+        match client.get_connection_with_timeout(std::time::Duration::from_secs(1)) {
+            Ok(conn) => return conn,
+            Err(e) => last_err = Some(e),
+        }
+        if std::time::Instant::now() >= deadline {
+            panic!(
+                "connect to {label} server failed after 15s: {}",
+                last_err
+                    .as_ref()
+                    .map(|e| e.to_string())
+                    .unwrap_or_else(|| "no error captured".into())
+            );
+        }
+        std::thread::sleep(std::time::Duration::from_millis(200));
+    }
+}
+
 /// Bind a free OS port, drop the listener, and return the port number.
 fn free_port() -> u16 {
     // SAFETY: bind + drop releases the port; the server will re-bind it.
@@ -944,13 +972,7 @@ async fn test_txn_commit_wal_crash_recovery() {
 
     // Connect with sync redis client to avoid holding an async runtime across process boundaries.
     let client1 = redis::Client::open(format!("redis://127.0.0.1:{port1}")).unwrap();
-    // Bound the RESP handshake: wait_for_server only proves TCP bind. The
-    // shard accept loop can lag the bind by a small window, and a bare
-    // get_connection() blocks indefinitely on macOS CI when the first
-    // RESP exchange races the bind ("Connection reset by peer").
-    let mut sync_conn1 = client1
-        .get_connection_with_timeout(std::time::Duration::from_secs(10))
-        .expect("connect to phase-1 server");
+    let mut sync_conn1 = connect_redis_with_retry(&client1, "phase-1");
 
     // Non-TXN baseline (verifies plain AOF replay too)
     let _: String = redis::cmd("SET")
@@ -1058,9 +1080,7 @@ async fn test_txn_commit_wal_crash_recovery() {
     );
 
     let client2 = redis::Client::open(format!("redis://127.0.0.1:{port2}")).unwrap();
-    let mut sync_conn2 = client2
-        .get_connection_with_timeout(std::time::Duration::from_secs(10))
-        .expect("connect to phase-2 server");
+    let mut sync_conn2 = connect_redis_with_retry(&client2, "phase-2");
 
     // Verify TXN committed value survived server restart via WAL replay.
     let recovered: Option<String> = redis::cmd("GET")