From 45b279fdc0d599417123a6e40e62b39202503b11 Mon Sep 17 00:00:00 2001 From: Laxman Ch Date: Mon, 4 May 2026 21:17:22 +0530 Subject: [PATCH 1/3] Skip synchronous snapshot during leader election to unblock quorum formation On large ensembles (15M+ znodes), the synchronous snapshot in ZooKeeperServer.loadData() takes 34-43s on production hardware, blocking quorum formation and causing repeated election failures when it exceeds initLimit (incident-10033: 4m47s outage). Implementation: - ZooKeeperServer.loadData(boolean skipSnapshot): new overload that conditionally skips the startup snapshot. The no-arg loadData() delegates to loadData(false) for backward compatibility. - Leader.lead(): passes QuorumPeer.isSkipLeaderStartupSnapshot() to loadData(), controlled by system property. - QuorumPeer: new config property zookeeper.leaderElection.skipStartupSnapshot (default: false). Safety: skipping is safe because killSession() is idempotent on recovery, follower sync uses the in-memory DataTree (not disk), and SyncRequestProcessor takes periodic snapshots after quorum. This matches the approach in ZOOKEEPER-1558 (branch-3.4, 2013). Tests: - ZooKeeperServerTest: loadData(true) skips snapshot, loadData(false) takes snapshot, no-arg loadData() delegates to loadData(false). - QuorumPeerTest: skipLeaderStartupSnapshot defaults to false, getter/setter works correctly. - Zab1_0Test: leader with skipLeaderStartupSnapshot=true does not rewrite snapshot during lead(); leader with it disabled does. All 82 existing + new tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../zookeeper/server/ZooKeeperServer.java | 31 ++++- .../zookeeper/server/quorum/Leader.java | 2 +- .../zookeeper/server/quorum/QuorumPeer.java | 20 +++ .../zookeeper/server/ZooKeeperServerTest.java | 108 +++++++++++++++ .../server/quorum/QuorumPeerTest.java | 16 +++ .../zookeeper/server/quorum/Zab1_0Test.java | 123 ++++++++++++++++++ 6 files changed, 297 insertions(+), 3 deletions(-) diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/ZooKeeperServer.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/ZooKeeperServer.java index c0963695899..f842a745c6f 100644 --- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/ZooKeeperServer.java +++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/ZooKeeperServer.java @@ -492,6 +492,28 @@ public void setZKDatabase(ZKDatabase zkDb) { * Restore sessions and data */ public void loadData() throws IOException, InterruptedException { + loadData(false); + } + + /** + * Restore sessions and data, optionally skipping the startup snapshot. + * + * During leader election, the synchronous snapshot in loadData() blocks + * quorum formation — on large ensembles (15M+ znodes) it takes 34-43s, + * which can exceed initLimit and cause repeated election failures. + * Skipping the snapshot is safe because: + * - Dead session cleanup (killSession) is idempotent on recovery + * - Follower sync uses the in-memory DataTree, not the disk snapshot + * - SyncRequestProcessor takes periodic snapshots after quorum forms + * - This matches the approach in ZOOKEEPER-1558 (branch-3.4, 2013) + * + * @param skipSnapshot if true, skip the startup snapshot. The periodic + * snapshot mechanism in SyncRequestProcessor will persist state + * after quorum is established and transactions begin flowing. + * @see ZOOKEEPER-1558 + * @see ZOOKEEPER-4766 + */ + public void loadData(boolean skipSnapshot) throws IOException, InterruptedException { /* * When a new leader starts executing Leader#lead, it * invokes this method. The database, however, has been @@ -529,8 +551,13 @@ public void loadData() throws IOException, InterruptedException { killSession(session, zkDb.getDataTreeLastProcessedZxid()); } - // Make a clean snapshot - takeSnapshot(); + if (skipSnapshot) { + LOG.info("Skipping startup snapshot (periodic snapshot will persist state). " + + "Dead sessions cleaned: {}", deadSessions.size()); + } else { + // Make a clean snapshot + takeSnapshot(); + } } public void takeSnapshot() { diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Leader.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Leader.java index e90df8f67a1..7677b99dc63 100644 --- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Leader.java +++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Leader.java @@ -587,7 +587,7 @@ void lead() throws IOException, InterruptedException { try { self.setZabState(QuorumPeer.ZabState.DISCOVERY); self.tick.set(0); - zk.loadData(); + zk.loadData(self.isSkipLeaderStartupSnapshot()); leaderStateSummary = new StateSummary(self.getCurrentEpoch(), zk.getLastProcessedZxid()); diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java index 17705ef7b75..0a5c5bb395c 100644 --- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java +++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java @@ -720,6 +720,18 @@ public synchronized void setCurrentVote(Vote v) { */ protected volatile int initLimit; + /** + * Whether to skip the synchronous snapshot during leader startup. + * On large ensembles (15M+ znodes), the snapshot in loadData() takes + * 34-43s, blocking quorum formation and causing repeated election + * failures when it exceeds initLimit. Skipping it is safe — see + * ZOOKEEPER-1558 and ZOOKEEPER-4766. + */ + public static final String SKIP_LEADER_STARTUP_SNAPSHOT = + "zookeeper.leaderElection.skipStartupSnapshot"; + private boolean skipLeaderStartupSnapshot = + Boolean.getBoolean(SKIP_LEADER_STARTUP_SNAPSHOT); + /** * The number of ticks that can pass between sending a request and getting * an acknowledgment @@ -1819,6 +1831,14 @@ public void setInitLimit(int initLimit) { this.initLimit = initLimit; } + public boolean isSkipLeaderStartupSnapshot() { + return skipLeaderStartupSnapshot; + } + + public void setSkipLeaderStartupSnapshot(boolean skip) { + this.skipLeaderStartupSnapshot = skip; + } + /** * Get the current tick */ diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/ZooKeeperServerTest.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/ZooKeeperServerTest.java index 81469a8f858..64b98a55140 100644 --- a/zookeeper-server/src/test/java/org/apache/zookeeper/server/ZooKeeperServerTest.java +++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/ZooKeeperServerTest.java @@ -26,6 +26,7 @@ import java.util.List; import org.apache.zookeeper.ZKTestCase; import org.apache.zookeeper.server.persistence.FileTxnLog; +import org.apache.zookeeper.server.persistence.FileTxnSnapLog; import org.apache.zookeeper.server.persistence.SnapStream; import org.apache.zookeeper.server.persistence.Util; import org.apache.zookeeper.test.ClientBase; @@ -135,4 +136,111 @@ public void testInvalidSnapshot() { } } + /** + * Helper: create a ZooKeeperServer with an initialized database. + * This simulates the state during leader election where the DB + * is already loaded (QuorumPeer.start() loads it before election). + */ + private ZooKeeperServer createServerWithInitializedDb(File tmpDir) throws Exception { + FileTxnSnapLog snapLog = new FileTxnSnapLog(tmpDir, tmpDir); + ZKDatabase zkDb = new ZKDatabase(snapLog); + // Initialize DB (as QuorumPeer.start() would do before election) + zkDb.loadDataBase(); + ZooKeeperServer zks = new ZooKeeperServer(snapLog, 2000, null); + zks.setZKDatabase(zkDb); + return zks; + } + + private static int countSnapshotFiles(File dir) { + if (dir == null || !dir.exists()) { + return 0; + } + File[] snaps = dir.listFiles((d, name) -> name.startsWith("snapshot.")); + return snaps == null ? 0 : snaps.length; + } + + /** + * Test that loadData(false) takes a snapshot (default behavior). + * We detect this by checking that the snapshot file's modification + * time is updated (the file name stays the same since zxid is unchanged). + */ + @Test + public void testLoadDataTakesSnapshotByDefault() throws Exception { + File tmpDir = ClientBase.createTmpDir(); + try { + ZooKeeperServer zks = createServerWithInitializedDb(tmpDir); + File snapDir = new File(tmpDir, "version-2"); + long lastModBefore = getLatestSnapshotModTime(snapDir); + // Ensure filesystem timestamp granularity doesn't hide the write + Thread.sleep(50); + + zks.loadData(false); + + long lastModAfter = getLatestSnapshotModTime(snapDir); + assertTrue("Snapshot file should be updated when skipSnapshot=false", + lastModAfter > lastModBefore); + } finally { + ClientBase.recursiveDelete(tmpDir); + } + } + + /** + * Test that loadData(true) skips the snapshot — file is NOT rewritten. + */ + @Test + public void testLoadDataSkipsSnapshotWhenRequested() throws Exception { + File tmpDir = ClientBase.createTmpDir(); + try { + ZooKeeperServer zks = createServerWithInitializedDb(tmpDir); + File snapDir = new File(tmpDir, "version-2"); + long lastModBefore = getLatestSnapshotModTime(snapDir); + Thread.sleep(50); + + zks.loadData(true); + + long lastModAfter = getLatestSnapshotModTime(snapDir); + assertEquals("Snapshot file should NOT be updated when skipSnapshot=true", + lastModBefore, lastModAfter); + } finally { + ClientBase.recursiveDelete(tmpDir); + } + } + + /** + * Test that loadData() no-arg delegates to loadData(false) — takes snapshot. + */ + @Test + public void testLoadDataNoArgDelegatesToDefault() throws Exception { + File tmpDir = ClientBase.createTmpDir(); + try { + ZooKeeperServer zks = createServerWithInitializedDb(tmpDir); + File snapDir = new File(tmpDir, "version-2"); + long lastModBefore = getLatestSnapshotModTime(snapDir); + Thread.sleep(50); + + zks.loadData(); + + long lastModAfter = getLatestSnapshotModTime(snapDir); + assertTrue("No-arg loadData() should take snapshot (backward compatible)", + lastModAfter > lastModBefore); + } finally { + ClientBase.recursiveDelete(tmpDir); + } + } + + private static long getLatestSnapshotModTime(File dir) { + if (dir == null || !dir.exists()) { + return 0; + } + File[] snaps = dir.listFiles((d, name) -> name.startsWith("snapshot.")); + if (snaps == null || snaps.length == 0) { + return 0; + } + long latest = 0; + for (File f : snaps) { + latest = Math.max(latest, f.lastModified()); + } + return latest; + } + } diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/QuorumPeerTest.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/QuorumPeerTest.java index 03cb6f06a83..42d3cf8e7c0 100644 --- a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/QuorumPeerTest.java +++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/QuorumPeerTest.java @@ -107,4 +107,20 @@ public void testIsNotLeaderBecauseNoVote() throws Exception { assertFalse(peer.isLeader(localPeerId)); } + @Test + public void testSkipLeaderStartupSnapshotDefaultsToFalse() throws Exception { + QuorumPeer peer = new QuorumPeer(); + assertFalse("skipLeaderStartupSnapshot should default to false", + peer.isSkipLeaderStartupSnapshot()); + } + + @Test + public void testSkipLeaderStartupSnapshotSetterGetter() throws Exception { + QuorumPeer peer = new QuorumPeer(); + peer.setSkipLeaderStartupSnapshot(true); + assertTrue(peer.isSkipLeaderStartupSnapshot()); + peer.setSkipLeaderStartupSnapshot(false); + assertFalse(peer.isSkipLeaderStartupSnapshot()); + } + } diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/Zab1_0Test.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/Zab1_0Test.java index b8630ae1319..602595c88fb 100644 --- a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/Zab1_0Test.java +++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/Zab1_0Test.java @@ -822,6 +822,129 @@ private void proposeNewSession(QuorumPacket qp, long zxid, long sessionId) throw }); } + /** + * Test that leader election skips snapshot in loadData() when + * skipLeaderStartupSnapshot is enabled. This verifies the fix for + * ZOOKEEPER-4766: leader election time should not scale with tree size. + */ + @Test + public void testLeaderSkipsSnapshotWhenConfigured() throws Exception { + Socket[] pair = getSocketPair(); + Socket leaderSocket = pair[0]; + Socket followerSocket = pair[1]; + File tmpDir = File.createTempFile("test", "dir", testData); + tmpDir.delete(); + tmpDir.mkdir(); + LeadThread leadThread = null; + Leader leader = null; + try { + QuorumPeer peer = createQuorumPeer(tmpDir); + // Enable skip-snapshot + peer.setSkipLeaderStartupSnapshot(true); + + leader = createLeader(tmpDir, peer); + peer.leader = leader; + + // Pre-initialize the database (as QuorumPeer.start() does before election). + // This ensures zkDb.isInitialized()==true so loadData() won't call + // loadDataBase() which itself creates an initial snapshot. + leader.zk.getZKDatabase().loadDataBase(); + + File snapDir = new File(tmpDir, "version-2"); + long lastModBefore = getLatestSnapshotModTime(snapDir); + Thread.sleep(50); + + leadThread = new LeadThread(leader); + leadThread.start(); + + while (leader.cnxAcceptor == null || !leader.cnxAcceptor.isAlive()) { + Thread.sleep(20); + } + + // The leader is now past loadData(). With skipSnapshot=true, + // the snapshot file should NOT have been rewritten. + long lastModAfter = getLatestSnapshotModTime(snapDir); + assertEquals("Snapshot should NOT be rewritten when skipLeaderStartupSnapshot=true", + lastModBefore, lastModAfter); + } finally { + if (leader != null) { + leader.shutdown("end of test"); + } + if (leadThread != null) { + leadThread.interrupt(); + leadThread.join(); + } + TestUtils.deleteFileRecursively(tmpDir); + } + } + + /** + * Test that leader election DOES take snapshot when + * skipLeaderStartupSnapshot is disabled (default behavior). + */ + @Test + public void testLeaderTakesSnapshotByDefault() throws Exception { + Socket[] pair = getSocketPair(); + Socket leaderSocket = pair[0]; + Socket followerSocket = pair[1]; + File tmpDir = File.createTempFile("test", "dir", testData); + tmpDir.delete(); + tmpDir.mkdir(); + LeadThread leadThread = null; + Leader leader = null; + try { + QuorumPeer peer = createQuorumPeer(tmpDir); + // Explicitly disable (default) + peer.setSkipLeaderStartupSnapshot(false); + + leader = createLeader(tmpDir, peer); + peer.leader = leader; + + // Pre-initialize the database (as QuorumPeer.start() does before election) + leader.zk.getZKDatabase().loadDataBase(); + + File snapDir = new File(tmpDir, "version-2"); + long lastModBefore = getLatestSnapshotModTime(snapDir); + Thread.sleep(50); + + leadThread = new LeadThread(leader); + leadThread.start(); + + while (leader.cnxAcceptor == null || !leader.cnxAcceptor.isAlive()) { + Thread.sleep(20); + } + + // With skipSnapshot=false (default), the snapshot should have been rewritten + long lastModAfter = getLatestSnapshotModTime(snapDir); + assertTrue("Snapshot should be rewritten when skipLeaderStartupSnapshot=false", + lastModAfter > lastModBefore); + } finally { + if (leader != null) { + leader.shutdown("end of test"); + } + if (leadThread != null) { + leadThread.interrupt(); + leadThread.join(); + } + TestUtils.deleteFileRecursively(tmpDir); + } + } + + private static long getLatestSnapshotModTime(File dir) { + if (dir == null || !dir.exists()) { + return 0; + } + File[] snaps = dir.listFiles((d, name) -> name.startsWith("snapshot.")); + if (snaps == null || snaps.length == 0) { + return 0; + } + long latest = 0; + for (File f : snaps) { + latest = Math.max(latest, f.lastModified()); + } + return latest; + } + @Test public void testNormalRun() throws Exception { testLeaderConversation(new LeaderConversation() { From 8aa68d9ad77fe9fe1b18e1a7a13791152e7a6dd2 Mon Sep 17 00:00:00 2001 From: Laxman Ch Date: Thu, 28 May 2026 13:03:23 +0530 Subject: [PATCH 2/3] Address review comments on skip-leader-startup-snapshot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Production changes: - Leader.lead() (Leader.java:590): honor shouldForceWriteInitialSnapshotAfterLeaderElection() by AND'ing the skip flag with the negation of the safety predicate. Mirrors the follower DIFF-sync gate added by ZOOKEEPER-2678 (Learner.java:553-556). Without this, a fresh ensemble post-upgrade — where trustEmptySnapshot=true and getLastSnapshotInfo()==null — would skip the very snapshot the existing safety mechanism was designed to write. - QuorumPeer.skipLeaderStartupSnapshot: add 'volatile'. The setter is JMX-reachable; without volatile, a flip from a non-leader thread is not guaranteed visible to the leader thread reading via isSkipLeaderStartupSnapshot() at the start of lead(). Matches the convention of adjacent volatile fields initLimit, syncLimit, and connectToLearnerMasterLimit. - ZooKeeperServer.loadData(true) log line: include lastProcessedZxid (hex). The Skipping startup snapshot INFO line is the rollout-monitoring signal; the zxid enables correlation to the leader election epoch in operational triage. Test changes: - Zab1_0Test: remove unused getSocketPair() allocation in testLeaderSkipsSnapshotWhenConfigured / testLeaderTakesSnapshotByDefault (each opened a ServerSocket + 2 Sockets that were never used or closed, leaking FDs in CI). Extract waitForCnxAcceptor() helper with a 10s deadline so a leader-thread death in loadData() fails fast with a clear message instead of hanging until the JUnit suite timeout. - All tests using lastModified() mtime comparison: bump Thread.sleep(50) to Thread.sleep(1100) to cover HFS+ 1s mtime granularity on macOS dev hosts (APFS/ext4 have finer granularity and are unaffected). - ZooKeeperServerTest.testLoadDataTakesSnapshotWhenForceWriteRequested: new test that documents the Leader.java:590 composition collapsing to loadData(false) when the upgrade-safety gate fires, ensuring a snapshot is taken regardless of the skip flag. - Zab1_0Test.testFollowerPathUnaffectedBySkipFlag: new regression guard re-running testNormalFollowerRunWithDiff with the skip flag enabled globally. The follower path must not branch on this flag; if a future refactor leaks it into Learner / Follower, this test diverges and flags the regression. - QuorumPeerTest.testSkipLeaderStartupSnapshotDefaultsToFalse: clear the system property in a try/finally so the default-value assertion is not affected by Surefire forked-JVM property pollution or test ordering. All targeted tests (ZooKeeperServerTest, QuorumPeerTest, Zab1_0Test) pass. Regression set (DIFFSyncConsistencyTest, LearnerHandlerTest, LearnerTest, SnapshotDigestTest, InvalidSnapshotTest, QuorumPeerMainTest, ObserverMasterTest) also passes — 93 tests total, 0 failures. Checkstyle and SpotBugs clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../zookeeper/server/ZooKeeperServer.java | 4 +- .../zookeeper/server/quorum/Leader.java | 3 +- .../zookeeper/server/quorum/QuorumPeer.java | 2 +- .../zookeeper/server/ZooKeeperServerTest.java | 38 +++++++++++-- .../server/quorum/QuorumPeerTest.java | 17 +++++- .../zookeeper/server/quorum/Zab1_0Test.java | 57 ++++++++++++++----- 6 files changed, 97 insertions(+), 24 deletions(-) diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/ZooKeeperServer.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/ZooKeeperServer.java index f842a745c6f..2b2a5e5f569 100644 --- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/ZooKeeperServer.java +++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/ZooKeeperServer.java @@ -553,7 +553,9 @@ public void loadData(boolean skipSnapshot) throws IOException, InterruptedExcept if (skipSnapshot) { LOG.info("Skipping startup snapshot (periodic snapshot will persist state). " - + "Dead sessions cleaned: {}", deadSessions.size()); + + "lastProcessedZxid: 0x{}, dead sessions cleaned: {}", + Long.toHexString(zkDb.getDataTreeLastProcessedZxid()), + deadSessions.size()); } else { // Make a clean snapshot takeSnapshot(); diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Leader.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Leader.java index 7677b99dc63..83829629da1 100644 --- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Leader.java +++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Leader.java @@ -587,7 +587,8 @@ void lead() throws IOException, InterruptedException { try { self.setZabState(QuorumPeer.ZabState.DISCOVERY); self.tick.set(0); - zk.loadData(self.isSkipLeaderStartupSnapshot()); + zk.loadData(self.isSkipLeaderStartupSnapshot() + && !zk.shouldForceWriteInitialSnapshotAfterLeaderElection()); leaderStateSummary = new StateSummary(self.getCurrentEpoch(), zk.getLastProcessedZxid()); diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java index 0a5c5bb395c..2f7a22a6ef3 100644 --- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java +++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java @@ -729,7 +729,7 @@ public synchronized void setCurrentVote(Vote v) { */ public static final String SKIP_LEADER_STARTUP_SNAPSHOT = "zookeeper.leaderElection.skipStartupSnapshot"; - private boolean skipLeaderStartupSnapshot = + private volatile boolean skipLeaderStartupSnapshot = Boolean.getBoolean(SKIP_LEADER_STARTUP_SNAPSHOT); /** diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/ZooKeeperServerTest.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/ZooKeeperServerTest.java index 64b98a55140..2d836acdacf 100644 --- a/zookeeper-server/src/test/java/org/apache/zookeeper/server/ZooKeeperServerTest.java +++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/ZooKeeperServerTest.java @@ -171,8 +171,8 @@ public void testLoadDataTakesSnapshotByDefault() throws Exception { ZooKeeperServer zks = createServerWithInitializedDb(tmpDir); File snapDir = new File(tmpDir, "version-2"); long lastModBefore = getLatestSnapshotModTime(snapDir); - // Ensure filesystem timestamp granularity doesn't hide the write - Thread.sleep(50); + // 1100ms covers HFS+ 1s mtime granularity on macOS dev hosts (APFS/ext4 have finer) + Thread.sleep(1100); zks.loadData(false); @@ -194,7 +194,7 @@ public void testLoadDataSkipsSnapshotWhenRequested() throws Exception { ZooKeeperServer zks = createServerWithInitializedDb(tmpDir); File snapDir = new File(tmpDir, "version-2"); long lastModBefore = getLatestSnapshotModTime(snapDir); - Thread.sleep(50); + Thread.sleep(1100); zks.loadData(true); @@ -216,7 +216,7 @@ public void testLoadDataNoArgDelegatesToDefault() throws Exception { ZooKeeperServer zks = createServerWithInitializedDb(tmpDir); File snapDir = new File(tmpDir, "version-2"); long lastModBefore = getLatestSnapshotModTime(snapDir); - Thread.sleep(50); + Thread.sleep(1100); zks.loadData(); @@ -228,6 +228,36 @@ public void testLoadDataNoArgDelegatesToDefault() throws Exception { } } + /** + * Verifies the loadData() call resolves to the snapshot-taking path + * when the upgrade-safety gate fires. Leader.lead() composes + * zk.loadData(skipFlag && !zk.shouldForceWriteInitialSnapshotAfterLeaderElection()) + * so when shouldForceWriteInitialSnapshotAfterLeaderElection() returns true + * (fresh ensemble post-upgrade per ZOOKEEPER-2678), the composition collapses + * to loadData(false) and a snapshot is written regardless of the skip flag. + * This test exercises that resolved call and verifies a snapshot is taken. + */ + @Test + public void testLoadDataTakesSnapshotWhenForceWriteRequested() throws Exception { + File tmpDir = ClientBase.createTmpDir(); + try { + ZooKeeperServer zks = createServerWithInitializedDb(tmpDir); + File snapDir = new File(tmpDir, "version-2"); + long lastModBefore = getLatestSnapshotModTime(snapDir); + Thread.sleep(1100); + + // Equivalent to the Leader.lead() composition when shouldForceWrite=true: + // loadData(skipFlag=true && !shouldForceWrite=true) == loadData(false) + zks.loadData(false); + + long lastModAfter = getLatestSnapshotModTime(snapDir); + assertTrue("Snapshot must be taken when the force-write safety gate fires", + lastModAfter > lastModBefore); + } finally { + ClientBase.recursiveDelete(tmpDir); + } + } + private static long getLatestSnapshotModTime(File dir) { if (dir == null || !dir.exists()) { return 0; diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/QuorumPeerTest.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/QuorumPeerTest.java index 42d3cf8e7c0..b28e56985af 100644 --- a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/QuorumPeerTest.java +++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/QuorumPeerTest.java @@ -109,9 +109,20 @@ public void testIsNotLeaderBecauseNoVote() throws Exception { @Test public void testSkipLeaderStartupSnapshotDefaultsToFalse() throws Exception { - QuorumPeer peer = new QuorumPeer(); - assertFalse("skipLeaderStartupSnapshot should default to false", - peer.isSkipLeaderStartupSnapshot()); + // Guard against test-ordering pollution: another test or a Surefire forked-JVM + // default could leave the property set. Clear it for this default-value assertion + // and restore on exit. + String prior = System.getProperty(QuorumPeer.SKIP_LEADER_STARTUP_SNAPSHOT); + System.clearProperty(QuorumPeer.SKIP_LEADER_STARTUP_SNAPSHOT); + try { + QuorumPeer peer = new QuorumPeer(); + assertFalse("skipLeaderStartupSnapshot should default to false", + peer.isSkipLeaderStartupSnapshot()); + } finally { + if (prior != null) { + System.setProperty(QuorumPeer.SKIP_LEADER_STARTUP_SNAPSHOT, prior); + } + } } @Test diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/Zab1_0Test.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/Zab1_0Test.java index 602595c88fb..07e9b6d15de 100644 --- a/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/Zab1_0Test.java +++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/Zab1_0Test.java @@ -829,9 +829,6 @@ private void proposeNewSession(QuorumPacket qp, long zxid, long sessionId) throw */ @Test public void testLeaderSkipsSnapshotWhenConfigured() throws Exception { - Socket[] pair = getSocketPair(); - Socket leaderSocket = pair[0]; - Socket followerSocket = pair[1]; File tmpDir = File.createTempFile("test", "dir", testData); tmpDir.delete(); tmpDir.mkdir(); @@ -852,14 +849,13 @@ public void testLeaderSkipsSnapshotWhenConfigured() throws Exception { File snapDir = new File(tmpDir, "version-2"); long lastModBefore = getLatestSnapshotModTime(snapDir); - Thread.sleep(50); + // 1100ms covers HFS+ 1s mtime granularity on macOS dev hosts + Thread.sleep(1100); leadThread = new LeadThread(leader); leadThread.start(); - while (leader.cnxAcceptor == null || !leader.cnxAcceptor.isAlive()) { - Thread.sleep(20); - } + waitForCnxAcceptor(leader); // The leader is now past loadData(). With skipSnapshot=true, // the snapshot file should NOT have been rewritten. @@ -884,9 +880,6 @@ public void testLeaderSkipsSnapshotWhenConfigured() throws Exception { */ @Test public void testLeaderTakesSnapshotByDefault() throws Exception { - Socket[] pair = getSocketPair(); - Socket leaderSocket = pair[0]; - Socket followerSocket = pair[1]; File tmpDir = File.createTempFile("test", "dir", testData); tmpDir.delete(); tmpDir.mkdir(); @@ -905,14 +898,12 @@ public void testLeaderTakesSnapshotByDefault() throws Exception { File snapDir = new File(tmpDir, "version-2"); long lastModBefore = getLatestSnapshotModTime(snapDir); - Thread.sleep(50); + Thread.sleep(1100); leadThread = new LeadThread(leader); leadThread.start(); - while (leader.cnxAcceptor == null || !leader.cnxAcceptor.isAlive()) { - Thread.sleep(20); - } + waitForCnxAcceptor(leader); // With skipSnapshot=false (default), the snapshot should have been rewritten long lastModAfter = getLatestSnapshotModTime(snapDir); @@ -945,6 +936,44 @@ private static long getLatestSnapshotModTime(File dir) { return latest; } + /** + * Poll for cnxAcceptor liveness with a deadline. Without the bound, a leader + * thread that dies in loadData() (e.g., snapshot IOException) would hang the + * test until JUnit times out the whole suite — slow + opaque failure mode. + */ + private static void waitForCnxAcceptor(Leader leader) throws InterruptedException { + long deadline = System.currentTimeMillis() + 10_000; + while (leader.cnxAcceptor == null || !leader.cnxAcceptor.isAlive()) { + if (System.currentTimeMillis() > deadline) { + fail("Leader did not start cnxAcceptor within 10s; loadData() may have failed"); + } + Thread.sleep(20); + } + } + + /** + * Regression guard for the follower path: the skipLeaderStartupSnapshot flag is + * leader-path only (read at Leader.java:590). The follower codepath + * (Learner.syncWithLeader) must not branch on it. Re-runs the standard follower + * DIFF-sync conversation with the flag globally enabled; if a future refactor + * leaks the flag into Learner / Follower, this test will diverge from + * testNormalFollowerRunWithDiff and flag the regression. + */ + @Test + public void testFollowerPathUnaffectedBySkipFlag() throws Exception { + String prior = System.getProperty(QuorumPeer.SKIP_LEADER_STARTUP_SNAPSHOT); + System.setProperty(QuorumPeer.SKIP_LEADER_STARTUP_SNAPSHOT, "true"); + try { + testNormalFollowerRunWithDiff(); + } finally { + if (prior == null) { + System.clearProperty(QuorumPeer.SKIP_LEADER_STARTUP_SNAPSHOT); + } else { + System.setProperty(QuorumPeer.SKIP_LEADER_STARTUP_SNAPSHOT, prior); + } + } + } + @Test public void testNormalRun() throws Exception { testLeaderConversation(new LeaderConversation() { From c7569c2377f735c4a630f96117c6f0584ae69798 Mon Sep 17 00:00:00 2001 From: Laxman Ch Date: Thu, 28 May 2026 14:58:24 +0530 Subject: [PATCH 3/3] Revert shouldForceWriteInitialSnapshotAfterLeaderElection() guard Earlier commit added a guard to Leader.lead() that AND'd the skip flag with the negation of shouldForceWriteInitialSnapshotAfterLeaderElection(). The intent was to mirror the symmetric Learner DIFF-sync gate (ZOOKEEPER-2678) so a fresh-ensemble post-upgrade leader would not skip the safety-mandated initial snapshot. The guard reproduces the original incident-10033 pattern in the exact scenario it was meant to handle: 1. Operator brings up a huge ensemble (e.g. 15M znodes) for disaster recovery or fresh bootstrap, with -Dzookeeper.snapshot.trust.empty=true and no snapshot on disk (only txn log). 2. QuorumPeer.start() -> loadDataBase() -> restore() replays the txn log into the in-memory DataTree. zkDb is initialized. 3. Election runs; some node becomes leader. 4. Leader.lead() -> shouldForceWriteInitialSnapshotAfterLeaderElection() returns true (trustEmptySnapshot && getLastSnapshotInfo() == null). 5. Guard composition collapses to loadData(false) -> takeSnapshot() -> 34-43s pause on 15M znodes. 6. initLimit (40s) exceeded -> followers timeout -> election fails -> incident-10033 pattern reproduced. The Learner DIFF-sync gate is a genuine correctness requirement because follower DIFF sync does not otherwise write a snapshot, and ZOOKEEPER-3781 documented an upgrade case where divergence resulted. The leader's startup snapshot, by contrast, has never been a correctness requirement: fpj called it "convenient" in ZOOKEEPER-1558 and removed it from 3.4. Recovery via txn log replay plus the next periodic snapshot preserves correctness. The skip flag is opt-in via cfg2; operators bringing up a huge ensemble with trust.empty=true who want bounded recovery rather than fast election can disable the flag. Drop the corresponding test ZooKeeperServerTest.testLoadDataTakesSnapshot WhenForceWriteRequested - it was documenting the composition we are no longer doing. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../zookeeper/server/quorum/Leader.java | 3 +- .../zookeeper/server/ZooKeeperServerTest.java | 30 ------------------- 2 files changed, 1 insertion(+), 32 deletions(-) diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Leader.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Leader.java index 83829629da1..7677b99dc63 100644 --- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Leader.java +++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Leader.java @@ -587,8 +587,7 @@ void lead() throws IOException, InterruptedException { try { self.setZabState(QuorumPeer.ZabState.DISCOVERY); self.tick.set(0); - zk.loadData(self.isSkipLeaderStartupSnapshot() - && !zk.shouldForceWriteInitialSnapshotAfterLeaderElection()); + zk.loadData(self.isSkipLeaderStartupSnapshot()); leaderStateSummary = new StateSummary(self.getCurrentEpoch(), zk.getLastProcessedZxid()); diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/ZooKeeperServerTest.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/ZooKeeperServerTest.java index 2d836acdacf..5956841fcfd 100644 --- a/zookeeper-server/src/test/java/org/apache/zookeeper/server/ZooKeeperServerTest.java +++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/ZooKeeperServerTest.java @@ -228,36 +228,6 @@ public void testLoadDataNoArgDelegatesToDefault() throws Exception { } } - /** - * Verifies the loadData() call resolves to the snapshot-taking path - * when the upgrade-safety gate fires. Leader.lead() composes - * zk.loadData(skipFlag && !zk.shouldForceWriteInitialSnapshotAfterLeaderElection()) - * so when shouldForceWriteInitialSnapshotAfterLeaderElection() returns true - * (fresh ensemble post-upgrade per ZOOKEEPER-2678), the composition collapses - * to loadData(false) and a snapshot is written regardless of the skip flag. - * This test exercises that resolved call and verifies a snapshot is taken. - */ - @Test - public void testLoadDataTakesSnapshotWhenForceWriteRequested() throws Exception { - File tmpDir = ClientBase.createTmpDir(); - try { - ZooKeeperServer zks = createServerWithInitializedDb(tmpDir); - File snapDir = new File(tmpDir, "version-2"); - long lastModBefore = getLatestSnapshotModTime(snapDir); - Thread.sleep(1100); - - // Equivalent to the Leader.lead() composition when shouldForceWrite=true: - // loadData(skipFlag=true && !shouldForceWrite=true) == loadData(false) - zks.loadData(false); - - long lastModAfter = getLatestSnapshotModTime(snapDir); - assertTrue("Snapshot must be taken when the force-write safety gate fires", - lastModAfter > lastModBefore); - } finally { - ClientBase.recursiveDelete(tmpDir); - } - } - private static long getLatestSnapshotModTime(File dir) { if (dir == null || !dir.exists()) { return 0;